[X86] Simplify code. NFC.

[oota-llvm.git] / lib / Target / X86 / X86ISelLowering.cpp
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index 0347a5174449cc61271d03bb8cd17f1243b4a045..4e07f5208b947e083ca4d286ee54b97db00720ff 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -107,7 +107,7 @@ static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
    // If the input is a buildvector just emit a smaller one.
    if (Vec.getOpcode() == ISD::BUILD_VECTOR)
      return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
-                       makeArrayRef(Vec->op_begin()+NormalizedIdxVal,
+                       makeArrayRef(Vec->op_begin() + NormalizedIdxVal,
                                      ElemsPerChunk));
  
    SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
@@ -115,13 +115,13 @@ static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
                                 VecIdx);
  
    return Result;
-
  }
+
  /// Generate a DAG to grab 128-bits from a vector > 128 bits.  This
  /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
  /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
  /// instructions or a simple subregister reference. Idx is an index in the
-/// 128 bits we want.  It need not be aligned to a 128-bit bounday.  That makes
+/// 128 bits we want.  It need not be aligned to a 128-bit boundary.  That makes
  /// lowering EXTRACT_VECTOR_ELT operations easier.
  static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
                                     SelectionDAG &DAG, SDLoc dl) {
@@ -158,25 +158,23 @@ static SDValue InsertSubVector(SDValue Result, SDValue Vec,
                                 * ElemsPerChunk);
  
    SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
-  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec,
-                     VecIdx);
+  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
  }
+
  /// Generate a DAG to put 128-bits into a vector > 128 bits.  This
  /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
  /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
  /// simple superregister reference.  Idx is an index in the 128 bits
-/// we want.  It need not be aligned to a 128-bit bounday.  That makes
+/// we want.  It need not be aligned to a 128-bit boundary.  That makes
  /// lowering INSERT_VECTOR_ELT operations easier.
-static SDValue Insert128BitVector(SDValue Result, SDValue Vec,
-                                  unsigned IdxVal, SelectionDAG &DAG,
-                                  SDLoc dl) {
+static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
+                                  SelectionDAG &DAG,SDLoc dl) {
    assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
    return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
  }
  
-static SDValue Insert256BitVector(SDValue Result, SDValue Vec,
-                                  unsigned IdxVal, SelectionDAG &DAG,
-                                  SDLoc dl) {
+static SDValue Insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
+                                  SelectionDAG &DAG, SDLoc dl) {
    assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
    return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
  }
@@ -230,13 +228,13 @@ void X86TargetLowering::resetOperationActions() {
    // Set up the TargetLowering object.
    static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };
  
-  // X86 is weird, it always uses i8 for shift amounts and setcc results.
+  // X86 is weird. It always uses i8 for shift amounts and setcc results.
    setBooleanContents(ZeroOrOneBooleanContent);
    // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
    setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
  
-  // For 64-bit since we have so many registers use the ILP scheduler, for
-  // 32-bit code use the register pressure specific scheduling.
+  // For 64-bit, since we have so many registers, use the ILP scheduler.
+  // For 32-bit, use the register pressure specific scheduling.
    // For Atom, always use ILP scheduling.
    if (Subtarget->isAtom())
      setSchedulingPreference(Sched::ILP);
@@ -248,9 +246,9 @@ void X86TargetLowering::resetOperationActions() {
        TM.getSubtarget<X86Subtarget>().getRegisterInfo();
    setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
  
-  // Bypass expensive divides on Atom when compiling with O2
+  // Bypass expensive divides on Atom when compiling with O2.
    if (TM.getOptLevel() >= CodeGenOpt::Default) {
-    if (Subtarget->hasSlowDivide32()) 
+    if (Subtarget->hasSlowDivide32())
        addBypassSlowDiv(32, 8);
      if (Subtarget->hasSlowDivide64() && Subtarget->is64Bit())
        addBypassSlowDiv(64, 16);
@@ -1531,7 +1529,7 @@ void X86TargetLowering::resetOperationActions() {
      for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
        MVT VT = (MVT::SimpleValueType)i;
  
-      // Do not attempt to promote non-256-bit vectors
+      // Do not attempt to promote non-512-bit vectors.
        if (!VT.is512BitVector())
          continue;
  
@@ -1557,11 +1555,11 @@ void X86TargetLowering::resetOperationActions() {
  
        const unsigned EltSize = VT.getVectorElementType().getSizeInBits();
  
-      // Do not attempt to promote non-256-bit vectors
+      // Do not attempt to promote non-512-bit vectors.
        if (!VT.is512BitVector())
          continue;
  
-      if ( EltSize < 32) {
+      if (EltSize < 32) {
          setOperationAction(ISD::BUILD_VECTOR,        VT, Custom);
          setOperationAction(ISD::VSELECT,             VT, Legal);
        }
@@ -1622,9 +1620,8 @@ void X86TargetLowering::resetOperationActions() {
      setLibcallName(RTLIB::SINCOS_F32, "sincosf");
      setLibcallName(RTLIB::SINCOS_F64, "sincos");
      if (Subtarget->isTargetDarwin()) {
-      // For MacOSX, we don't want to the normal expansion of a libcall to
-      // sincos. We want to issue a libcall to __sincos_stret to avoid memory
-      // traffic.
+      // For MacOSX, we don't want the normal expansion of a libcall to sincos.
+      // We want to issue a libcall to __sincos_stret to avoid memory traffic.
        setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
        setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
      }
@@ -2009,7 +2006,7 @@ X86TargetLowering::LowerReturn(SDValue Chain,
      SDValue ValToCopy = OutVals[i];
      EVT ValVT = ValToCopy.getValueType();
  
-    // Promote values to the appropriate types
+    // Promote values to the appropriate types.
      if (VA.getLocInfo() == CCValAssign::SExt)
        ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
      else if (VA.getLocInfo() == CCValAssign::ZExt)
@@ -2020,7 +2017,7 @@ X86TargetLowering::LowerReturn(SDValue Chain,
        ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy);
  
      assert(VA.getLocInfo() != CCValAssign::FPExt &&
-           "Unexpected FP-extend for return value.");  
+           "Unexpected FP-extend for return value.");
  
      // If this is x86-64, and we disabled SSE, we can't return FP values,
      // or SSE or MMX vectors.
@@ -3509,7 +3506,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
        // In PIC we need an extra register to formulate the address computation
        // for the callee.
        unsigned MaxInRegs =
-       (DAG.getTarget().getRelocationModel() == Reloc::PIC_) ? 2 : 3;
+        (DAG.getTarget().getRelocationModel() == Reloc::PIC_) ? 2 : 3;
  
        for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
          CCValAssign &VA = ArgLocs[i];
@@ -5832,7 +5829,7 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
    for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
      if (Zeroable[i])
        continue;
-    
+
      SDValue Current = Op->getOperand(i);
      SDValue SrcVector = Current->getOperand(0);
      if (!V1.getNode())
@@ -6357,8 +6354,7 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
        AllContants = false;
        NonConstIdx = idx;
        NumNonConsts++;
-    }
-    else {
+    } else {
        NumConsts++;
        if (cast<ConstantSDNode>(In)->getZExtValue())
        Immediate |= (1ULL << idx);
@@ -6381,7 +6377,7 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
                                           MVT::getIntegerVT(VT.getSizeInBits()));
        DstVec = DAG.getNode(ISD::BITCAST, dl, VT, VecAsImm);
      }
-    else 
+    else
        DstVec = DAG.getUNDEF(VT);
      return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
                         Op.getOperand(NonConstIdx),
@@ -6404,7 +6400,7 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
  
  /// \brief Return true if \p N implements a horizontal binop and return the
  /// operands for the horizontal binop into V0 and V1.
-/// 
+///
  /// This is a helper function of PerformBUILD_VECTORCombine.
  /// This function checks that the build_vector \p N in input implements a
  /// horizontal operation. Parameter \p Opcode defines the kind of horizontal
@@ -6425,7 +6421,7 @@ static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
    assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
    assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
           "Invalid Vector in input!");
-  
+
    bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
    bool CanFold = true;
    unsigned ExpectedVExtractIdx = BaseIdx;
@@ -6494,13 +6490,13 @@ static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
  }
  
  /// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
-/// a concat_vector. 
+/// a concat_vector.
  ///
  /// This is a helper function of PerformBUILD_VECTORCombine.
  /// This function expects two 256-bit vectors called V0 and V1.
  /// At first, each vector is split into two separate 128-bit vectors.
  /// Then, the resulting 128-bit vectors are used to implement two
-/// horizontal binary operations. 
+/// horizontal binary operations.
  ///
  /// The kind of horizontal binary operation is defined by \p X86Opcode.
  ///
@@ -6694,18 +6690,18 @@ static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG,
      // Try to match an SSE3 float HADD/HSUB.
      if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
        return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
-    
+
      if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
        return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
    } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget->hasSSSE3()) {
      // Try to match an SSSE3 integer HADD/HSUB.
      if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
        return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
-    
+
      if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
        return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
    }
-  
+
    if (!Subtarget->hasAVX())
      return SDValue();
  
@@ -6756,7 +6752,7 @@ static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG,
        // Do this only if the target has AVX2.
        if (Subtarget->hasAVX2())
          return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
- 
+
        // Do not try to expand this build_vector into a pair of horizontal
        // add/sub if we can emit a pair of scalar add/sub.
        if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
@@ -7510,9 +7506,9 @@ static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(SDLoc DL, MVT VT,
  /// does not check for the profitability of lowering either as PALIGNR or
  /// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
  /// This matches shuffle vectors that look like:
-/// 
+///
  ///   v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
-/// 
+///
  /// Essentially it concatenates V1 and V2, shifts right by some number of
  /// elements, and takes the low elements as the result. Note that while this is
  /// specified as a *right shift* because x86 is little-endian, it is a *left
@@ -8217,6 +8213,11 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                      getV4X86ShuffleImm8ForMask(WidenedMask, DAG)));
    }
  
+  // Try to use byte shift instructions.
+  if (SDValue Shift = lowerVectorShuffleAsByteShift(
+          DL, MVT::v2i64, V1, V2, Mask, DAG))
+    return Shift;
+
    // If we have a single input from V2 insert that into V1 if we can do so
    // cheaply.
    if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
@@ -8243,11 +8244,6 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                                    Subtarget, DAG))
        return Blend;
  
-  // Try to use byte shift instructions.
-  if (SDValue Shift = lowerVectorShuffleAsByteShift(
-          DL, MVT::v2i64, V1, V2, Mask, DAG))
-    return Shift;
-
    // Try to use byte rotation instructions.
    // Its more profitable for pre-SSSE3 to use shuffles/unpacks.
    if (Subtarget->hasSSSE3())
@@ -8508,6 +8504,11 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                         getV4X86ShuffleImm8ForMask(Mask, DAG));
    }
  
+  // Try to use byte shift instructions.
+  if (SDValue Shift = lowerVectorShuffleAsByteShift(
+          DL, MVT::v4i32, V1, V2, Mask, DAG))
+    return Shift;
+
    // There are special ways we can lower some single-element blends.
    if (NumV2Elements == 1)
      if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4i32, DL, V1, V2,
@@ -8525,11 +8526,6 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                                    Subtarget, DAG))
        return Blend;
  
-  // Try to use byte shift instructions.
-  if (SDValue Shift = lowerVectorShuffleAsByteShift(
-          DL, MVT::v4i32, V1, V2, Mask, DAG))
-    return Shift;
-
    // Try to use byte rotation instructions.
    // Its more profitable for pre-SSSE3 to use shuffles/unpacks.
    if (Subtarget->hasSSSE3())
@@ -8593,17 +8589,17 @@ static SDValue lowerV8I16SingleInputVectorShuffle(
                                                          Mask, Subtarget, DAG))
      return Broadcast;
  
+  // Try to use byte shift instructions.
+  if (SDValue Shift = lowerVectorShuffleAsByteShift(
+          DL, MVT::v8i16, V, V, Mask, DAG))
+    return Shift;
+
    // Use dedicated unpack instructions for masks that match their pattern.
    if (isShuffleEquivalent(Mask, 0, 0, 1, 1, 2, 2, 3, 3))
      return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V, V);
    if (isShuffleEquivalent(Mask, 4, 4, 5, 5, 6, 6, 7, 7))
      return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V, V);
  
-  // Try to use byte shift instructions.
-  if (SDValue Shift = lowerVectorShuffleAsByteShift(
-          DL, MVT::v8i16, V, V, Mask, DAG))
-    return Shift;
-
    // Try to use byte rotation instructions.
    if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
            DL, MVT::v8i16, V, V, Mask, Subtarget, DAG))
@@ -9210,6 +9206,11 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
    assert(NumV1Inputs > 0 && "All single-input shuffles should be canonicalized "
                              "to be V1-input shuffles.");
  
+  // Try to use byte shift instructions.
+  if (SDValue Shift = lowerVectorShuffleAsByteShift(
+          DL, MVT::v8i16, V1, V2, Mask, DAG))
+    return Shift;
+
    // There are special ways we can lower some single-element blends.
    if (NumV2Inputs == 1)
      if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v8i16, DL, V1, V2,
@@ -9227,11 +9228,6 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                                    Subtarget, DAG))
        return Blend;
  
-  // Try to use byte shift instructions.
-  if (SDValue Shift = lowerVectorShuffleAsByteShift(
-          DL, MVT::v8i16, V1, V2, Mask, DAG))
-    return Shift;
-
    // Try to use byte rotation instructions.
    if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
            DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
@@ -12802,7 +12798,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
  
  /// Insert one bit to mask vector, like v16i1 or v8i1.
  /// AVX-512 feature.
-SDValue 
+SDValue
  X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
    SDLoc dl(Op);
    SDValue Vec = Op.getOperand(0);
@@ -12815,7 +12811,7 @@ X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
      // insert element and then truncate the result.
      MVT ExtVecVT = (VecVT == MVT::v8i1 ?  MVT::v8i64 : MVT::v16i32);
      MVT ExtEltVT = (VecVT == MVT::v8i1 ?  MVT::i64 : MVT::i32);
-    SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT, 
+    SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
        DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec),
        DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx);
      return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
@@ -13563,7 +13559,7 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
      }
      return SDValue();
    }
-  
+
    assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
           "Unknown SINT_TO_FP to lower!");
  
@@ -14210,7 +14206,7 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
        In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
        InVT = ExtVT;
      }
-    
+
      SDValue Cst = DAG.getTargetConstant(1, InVT.getVectorElementType());
      const Constant *C = (dyn_cast<ConstantSDNode>(Cst))->getConstantIntValue();
      SDValue CP = DAG.getConstantPool(C, getPointerTy());
@@ -14404,7 +14400,7 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
      EltVT = VT.getVectorElementType();
      NumElts = VT.getVectorNumElements();
    }
-  
+
    unsigned EltBits = EltVT.getSizeInBits();
    LLVMContext *Context = DAG.getContext();
    // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
@@ -14431,7 +14427,7 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
      return DAG.getNode(ISD::BITCAST, dl, VT,
                         DAG.getNode(BitOp, dl, VecVT, Operand, MaskCasted));
    }
-  
+
    // If not vector, then scalar.
    unsigned BitOp = IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
    SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
@@ -14888,12 +14884,12 @@ SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
       if (Op0.getValueType() == MVT::i1)
         llvm_unreachable("Unexpected comparison operation for MVT::i1 operands");
    }
- 
+
    if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
         Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
-    // Do the comparison at i32 if it's smaller, besides the Atom case. 
-    // This avoids subregister aliasing issues. Keep the smaller reference 
-    // if we're optimizing for size, however, as that'll allow better folding 
+    // Do the comparison at i32 if it's smaller, besides the Atom case.
+    // This avoids subregister aliasing issues. Keep the smaller reference
+    // if we're optimizing for size, however, as that'll allow better folding
      // of memory operations.
      if (Op0.getValueType() != MVT::i32 && Op0.getValueType() != MVT::i64 &&
          !DAG.getMachineFunction().getFunction()->getAttributes().hasAttribute(
@@ -14951,7 +14947,7 @@ SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op,
      return SDValue();
  
    EVT VT = Op.getValueType();
-  
+
    // SSE1 has rsqrtss and rsqrtps.
    // TODO: Add support for AVX512 (v16f32).
    // It is likely not profitable to do this for f64 because a double-precision
@@ -14979,9 +14975,9 @@ SDValue X86TargetLowering::getRecipEstimate(SDValue Op,
    // significant digits in the divisor.
    if (!Subtarget->useReciprocalEst())
      return SDValue();
-  
+
    EVT VT = Op.getValueType();
-  
+
    // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
    // TODO: Add support for AVX512 (v16f32).
    // It is likely not profitable to do this for f64 because a double-precision
@@ -15800,11 +15796,11 @@ static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, const X86Subtarget *Subtarget
  
         ((Subtarget->hasDQI() && Subtarget->hasVLX() &&
          VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() >= 32)) ||
-    
+
         ((Subtarget->hasDQI() && VT.is512BitVector() &&
          VTElt.getSizeInBits() >= 32))))
      return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
-    
+
    unsigned int NumElts = VT.getVectorNumElements();
  
    if (NumElts != 8 && NumElts != 16)
@@ -16799,6 +16795,23 @@ static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
      return DAG.getNode(ISD::VSELECT, dl, VT, VMask, Op, PreservedSrc);
  }
  
+static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
+                                    SDValue PreservedSrc,
+                                    const X86Subtarget *Subtarget,
+                                    SelectionDAG &DAG) {
+    if (isAllOnes(Mask))
+      return Op;
+
+    EVT VT = Op.getValueType();
+    SDLoc dl(Op);
+    // The mask should be of type MVT::i1
+    SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask);
+
+    if (PreservedSrc.getOpcode() == ISD::UNDEF)
+      PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
+    return DAG.getNode(X86ISD::SELECT, dl, VT, IMask, Op, PreservedSrc);
+}
+
  static unsigned getOpcodeForFMAIntrinsic(unsigned IntNo) {
      switch (IntNo) {
      default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
@@ -16872,7 +16885,21 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
                                                RoundingMode),
                                    Mask, Src0, Subtarget, DAG);
      }
-                                              
+    case INTR_TYPE_SCALAR_MASK_RM: {
+      SDValue Src1 = Op.getOperand(1);
+      SDValue Src2 = Op.getOperand(2);
+      SDValue Src0 = Op.getOperand(3);
+      SDValue Mask = Op.getOperand(4);
+      SDValue RoundingMode = Op.getOperand(5);
+      return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
+                                              RoundingMode),
+                                  Mask, Src0, Subtarget, DAG);
+    }
+    case INTR_TYPE_2OP_MASK: {
+      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Op.getOperand(1),
+                                              Op.getOperand(2)),
+                                  Op.getOperand(4), Op.getOperand(3), Subtarget, DAG);
+    }
      case CMP_MASK:
      case CMP_MASK_CC: {
        // Comparison intrinsics with masks.
@@ -16924,7 +16951,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
      case VSHIFT_MASK:
        return getVectorMaskingNode(getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
                                                        Op.getOperand(1), Op.getOperand(2), DAG),
-                                  Op.getOperand(4), Op.getOperand(3), Subtarget, DAG);;
+                                  Op.getOperand(4), Op.getOperand(3), Subtarget, DAG);
      default:
        break;
      }
@@ -17476,7 +17503,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
    switch(IntrData->Type) {
    default:
      llvm_unreachable("Unknown Intrinsic Type");
-    break;    
+    break;
    case RDSEED:
    case RDRAND: {
      // Emit the node with the right value type.
@@ -18420,22 +18447,12 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
      SDValue BaseShAmt;
      EVT EltVT = VT.getVectorElementType();
  
-    if (Amt.getOpcode() == ISD::BUILD_VECTOR) {
-      unsigned NumElts = VT.getVectorNumElements();
-      unsigned i, j;
-      for (i = 0; i != NumElts; ++i) {
-        if (Amt.getOperand(i).getOpcode() == ISD::UNDEF)
-          continue;
-        break;
-      }
-      for (j = i; j != NumElts; ++j) {
-        SDValue Arg = Amt.getOperand(j);
-        if (Arg.getOpcode() == ISD::UNDEF) continue;
-        if (Arg != Amt.getOperand(i))
-          break;
-      }
-      if (i != NumElts && j == NumElts)
-        BaseShAmt = Amt.getOperand(i);
+    if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) {
+      // Check if this build_vector node is doing a splat.
+      // If so, then set BaseShAmt equal to the splat value.
+      BaseShAmt = BV->getSplatValue();
+      if (BaseShAmt && BaseShAmt.getOpcode() == ISD::UNDEF)
+        BaseShAmt = SDValue();
      } else {
        if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
          Amt = Amt.getOperand(0);
@@ -18586,7 +18603,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
    // If possible, lower this packed shift into a vector multiply instead of
    // expanding it into a sequence of scalar shifts.
    // Do this only if the vector shift count is a constant build_vector.
-  if (Op.getOpcode() == ISD::SHL && 
+  if (Op.getOpcode() == ISD::SHL &&
        (VT == MVT::v8i16 || VT == MVT::v4i32 ||
         (Subtarget->hasInt256() && VT == MVT::v16i16)) &&
        ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
@@ -18678,15 +18695,15 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
            CanBeSimplified = Amt2 == Amt->getOperand(j);
        }
      }
-    
+
      if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
          isa<ConstantSDNode>(Amt2)) {
        // Replace this node with two shifts followed by a MOVSS/MOVSD.
        EVT CastVT = MVT::v4i32;
-      SDValue Splat1 = 
+      SDValue Splat1 =
          DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), VT);
        SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
-      SDValue Splat2 = 
+      SDValue Splat2 =
          DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), VT);
        SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
        if (TargetOpcode == X86ISD::MOVSD)
@@ -19980,6 +19997,7 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
    return (SVT.getVectorNumElements() == 2 ||
            ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
            isMOVLMask(M, SVT) ||
+          isCommutedMOVLMask(M, SVT) ||
            isMOVHLPSMask(M, SVT) ||
            isSHUFPMask(M, SVT) ||
            isSHUFPMask(M, SVT, /* Commuted */ true) ||
@@ -20910,6 +20928,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
    //  v = phi(main, restore)
    //
    // restoreMBB:
+  //  if the base pointer is being used, reload it from the frame
    //  v_restore = 1
  
    MachineBasicBlock *thisMBB = MBB;
@@ -20993,6 +21012,18 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
      .addReg(restoreDstReg).addMBB(restoreMBB);
  
    // restoreMBB:
+  if (RegInfo->hasBasePointer(*MF)) {
+    const X86Subtarget &STI = MF->getTarget().getSubtarget<X86Subtarget>();
+    const bool Uses64BitFramePtr = STI.isTarget64BitLP64() || STI.isTargetNaCl64();
+    X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
+    X86FI->setRestoreBasePointer(MF);
+    unsigned FramePtr = RegInfo->getFrameRegister(*MF);
+    unsigned BasePtr = RegInfo->getBaseRegister();
+    unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
+    addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
+                 FramePtr, true, X86FI->getRestoreBasePointerOffset())
+      .setMIFlag(MachineInstr::FrameSetup);
+  }
    BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
    BuildMI(restoreMBB, DL, TII->get(X86::JMP_4)).addMBB(sinkMBB);
    restoreMBB->addSuccessor(sinkMBB);
@@ -21066,7 +21097,7 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
  
  // Replace 213-type (isel default) FMA3 instructions with 231-type for
  // accumulator loops. Writing back to the accumulator allows the coalescer
-// to remove extra copies in the loop.   
+// to remove extra copies in the loop.
  MachineBasicBlock *
  X86TargetLowering::emitFMA3Instr(MachineInstr *MI,
                                   MachineBasicBlock *MBB) const {
@@ -21328,6 +21359,11 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
    case X86::EH_SjLj_LongJmp64:
      return emitEHSjLjLongJmp(MI, BB);
  
+  case TargetOpcode::STATEPOINT:
+    // As an implementation detail, STATEPOINT shares the STACKMAP format at
+    // this point in the process.  We diverge later.
+    return emitPatchPoint(MI, BB);
+
    case TargetOpcode::STACKMAP:
    case TargetOpcode::PATCHPOINT:
      return emitPatchPoint(MI, BB);
@@ -22347,7 +22383,7 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
      EVT SVT = BC0.getValueType();
      unsigned Opcode = BC0.getOpcode();
      unsigned NumElts = VT.getVectorNumElements();
-    
+
      if (BC0.hasOneUse() && SVT.isVector() &&
          SVT.getVectorNumElements() * 2 == NumElts &&
          TLI.isOperationLegal(Opcode, VT)) {
@@ -22693,7 +22729,7 @@ matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS,
  }
  
  static SDValue
-TransformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG,
+transformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG,
                                        const X86Subtarget *Subtarget) {
    SDLoc dl(N);
    SDValue Cond = N->getOperand(0);
@@ -22706,18 +22742,6 @@ TransformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG,
        Cond = CondSrc->getOperand(0);
    }
  
-  MVT VT = N->getSimpleValueType(0);
-  MVT EltVT = VT.getVectorElementType();
-  unsigned NumElems = VT.getVectorNumElements();
-  // There is no blend with immediate in AVX-512.
-  if (VT.is512BitVector())
-    return SDValue();
-
-  if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
-    return SDValue();
-  if (!Subtarget->hasInt256() && VT == MVT::v16i16)
-    return SDValue();
-
    if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
      return SDValue();
  
@@ -22731,6 +22755,8 @@ TransformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG,
    if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
      return SDValue();
  
+  MVT VT = N->getSimpleValueType(0);
+  unsigned NumElems = VT.getVectorNumElements();
    SmallVector<int, 8> ShuffleMask(NumElems, -1);
    for (unsigned i = 0; i < NumElems; ++i) {
      // Be sure we emit undef where we can.
@@ -22740,6 +22766,9 @@ TransformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG,
        ShuffleMask[i] = i + NumElems * ((MaskValue >> i) & 1);
    }
  
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (!TLI.isShuffleMaskLegal(ShuffleMask, VT))
+    return SDValue();
    return DAG.getVectorShuffle(VT, dl, LHS, RHS, &ShuffleMask[0]);
  }
  
@@ -23179,81 +23208,6 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
      }
    }
  
-  // Try to fold this VSELECT into a MOVSS/MOVSD
-  if (N->getOpcode() == ISD::VSELECT &&
-      Cond.getOpcode() == ISD::BUILD_VECTOR && !DCI.isBeforeLegalize()) {
-    if (VT == MVT::v4i32 || VT == MVT::v4f32 ||
-        (Subtarget->hasSSE2() && (VT == MVT::v2i64 || VT == MVT::v2f64))) {
-      bool CanFold = false;
-      unsigned NumElems = Cond.getNumOperands();
-      SDValue A = LHS;
-      SDValue B = RHS;
-      
-      if (isZero(Cond.getOperand(0))) {
-        CanFold = true;
-
-        // fold (vselect <0,-1,-1,-1>, A, B) -> (movss A, B)
-        // fold (vselect <0,-1> -> (movsd A, B)
-        for (unsigned i = 1, e = NumElems; i != e && CanFold; ++i)
-          CanFold = isAllOnes(Cond.getOperand(i));
-      } else if (isAllOnes(Cond.getOperand(0))) {
-        CanFold = true;
-        std::swap(A, B);
-
-        // fold (vselect <-1,0,0,0>, A, B) -> (movss B, A)
-        // fold (vselect <-1,0> -> (movsd B, A)
-        for (unsigned i = 1, e = NumElems; i != e && CanFold; ++i)
-          CanFold = isZero(Cond.getOperand(i));
-      }
-
-      if (CanFold) {
-        if (VT == MVT::v4i32 || VT == MVT::v4f32)
-          return getTargetShuffleNode(X86ISD::MOVSS, DL, VT, A, B, DAG);
-        return getTargetShuffleNode(X86ISD::MOVSD, DL, VT, A, B, DAG);
-      }
-
-      if (Subtarget->hasSSE2() && (VT == MVT::v4i32 || VT == MVT::v4f32)) {
-        // fold (v4i32: vselect <0,0,-1,-1>, A, B) ->
-        //      (v4i32 (bitcast (movsd (v2i64 (bitcast A)),
-        //                             (v2i64 (bitcast B)))))
-        //
-        // fold (v4f32: vselect <0,0,-1,-1>, A, B) ->
-        //      (v4f32 (bitcast (movsd (v2f64 (bitcast A)),
-        //                             (v2f64 (bitcast B)))))
-        //
-        // fold (v4i32: vselect <-1,-1,0,0>, A, B) ->
-        //      (v4i32 (bitcast (movsd (v2i64 (bitcast B)),
-        //                             (v2i64 (bitcast A)))))
-        //
-        // fold (v4f32: vselect <-1,-1,0,0>, A, B) ->
-        //      (v4f32 (bitcast (movsd (v2f64 (bitcast B)),
-        //                             (v2f64 (bitcast A)))))
-
-        CanFold = (isZero(Cond.getOperand(0)) &&
-                   isZero(Cond.getOperand(1)) &&
-                   isAllOnes(Cond.getOperand(2)) &&
-                   isAllOnes(Cond.getOperand(3)));
-
-        if (!CanFold && isAllOnes(Cond.getOperand(0)) &&
-            isAllOnes(Cond.getOperand(1)) &&
-            isZero(Cond.getOperand(2)) &&
-            isZero(Cond.getOperand(3))) {
-          CanFold = true;
-          std::swap(LHS, RHS);
-        }
-
-        if (CanFold) {
-          EVT NVT = (VT == MVT::v4i32) ? MVT::v2i64 : MVT::v2f64;
-          SDValue NewA = DAG.getNode(ISD::BITCAST, DL, NVT, LHS);
-          SDValue NewB = DAG.getNode(ISD::BITCAST, DL, NVT, RHS);
-          SDValue Select = getTargetShuffleNode(X86ISD::MOVSD, DL, NVT, NewA,
-                                                NewB, DAG);
-          return DAG.getNode(ISD::BITCAST, DL, VT, Select);
-        }
-      }
-    }
-  }
-
    // If we know that this node is legal then we know that it is going to be
    // matched by one of the SSE/AVX BLEND instructions. These instructions only
    // depend on the highest bit in each word. Try to use SimplifyDemandedBits
@@ -23338,7 +23292,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
    if ((N->getOpcode() == ISD::VSELECT ||
         N->getOpcode() == X86ISD::SHRUNKBLEND) &&
        !DCI.isBeforeLegalize()) {
-    SDValue Shuffle = TransformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget);
+    SDValue Shuffle = transformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget);
      if (Shuffle.getNode())
        return Shuffle;
    }
@@ -23695,7 +23649,7 @@ static SDValue PerformINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG,
      // fold (blend A, B, allOnes) -> B
      if (ISD::isBuildVectorAllOnes(Mask.getNode()))
        return Op1;
-    
+
      // Simplify the case where the mask is a constant i32 value.
      if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Mask)) {
        if (C->isNullValue())
@@ -26363,7 +26317,7 @@ int X86TargetLowering::getScalingFactorCost(const AddrMode &AM,
    // "load" ports instead of the dedicated "store" port.
    // E.g., on Haswell:
    // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
-  // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.   
+  // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
    if (isLegalAddressingMode(AM, Ty))
      // Scale represents reg2 * scale, thus account for 1
      // as soon as we use a second register.