Masked Load / Store Intrinsics - the CodeGen part.

[oota-llvm.git] / lib / Target / X86 / X86ISelLowering.cpp
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index f05b6c61ca0981927fd3f5fb69ba1d9e7950e989..cdb691248ef1bf155d666697608fdeef90c715c0 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -107,7 +107,7 @@ static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
    // If the input is a buildvector just emit a smaller one.
    if (Vec.getOpcode() == ISD::BUILD_VECTOR)
      return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
-                       makeArrayRef(Vec->op_begin()+NormalizedIdxVal,
+                       makeArrayRef(Vec->op_begin() + NormalizedIdxVal,
                                      ElemsPerChunk));
  
    SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
@@ -115,13 +115,13 @@ static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
                                 VecIdx);
  
    return Result;
-
  }
+
  /// Generate a DAG to grab 128-bits from a vector > 128 bits.  This
  /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
  /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
  /// instructions or a simple subregister reference. Idx is an index in the
-/// 128 bits we want.  It need not be aligned to a 128-bit bounday.  That makes
+/// 128 bits we want.  It need not be aligned to a 128-bit boundary.  That makes
  /// lowering EXTRACT_VECTOR_ELT operations easier.
  static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
                                     SelectionDAG &DAG, SDLoc dl) {
@@ -158,25 +158,23 @@ static SDValue InsertSubVector(SDValue Result, SDValue Vec,
                                 * ElemsPerChunk);
  
    SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
-  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec,
-                     VecIdx);
+  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
  }
+
  /// Generate a DAG to put 128-bits into a vector > 128 bits.  This
  /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
  /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
  /// simple superregister reference.  Idx is an index in the 128 bits
-/// we want.  It need not be aligned to a 128-bit bounday.  That makes
+/// we want.  It need not be aligned to a 128-bit boundary.  That makes
  /// lowering INSERT_VECTOR_ELT operations easier.
-static SDValue Insert128BitVector(SDValue Result, SDValue Vec,
-                                  unsigned IdxVal, SelectionDAG &DAG,
-                                  SDLoc dl) {
+static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
+                                  SelectionDAG &DAG,SDLoc dl) {
    assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
    return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
  }
  
-static SDValue Insert256BitVector(SDValue Result, SDValue Vec,
-                                  unsigned IdxVal, SelectionDAG &DAG,
-                                  SDLoc dl) {
+static SDValue Insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
+                                  SelectionDAG &DAG, SDLoc dl) {
    assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
    return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
  }
@@ -230,13 +228,13 @@ void X86TargetLowering::resetOperationActions() {
    // Set up the TargetLowering object.
    static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };
  
-  // X86 is weird, it always uses i8 for shift amounts and setcc results.
+  // X86 is weird. It always uses i8 for shift amounts and setcc results.
    setBooleanContents(ZeroOrOneBooleanContent);
    // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
    setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
  
-  // For 64-bit since we have so many registers use the ILP scheduler, for
-  // 32-bit code use the register pressure specific scheduling.
+  // For 64-bit, since we have so many registers, use the ILP scheduler.
+  // For 32-bit, use the register pressure specific scheduling.
    // For Atom, always use ILP scheduling.
    if (Subtarget->isAtom())
      setSchedulingPreference(Sched::ILP);
@@ -248,10 +246,11 @@ void X86TargetLowering::resetOperationActions() {
        TM.getSubtarget<X86Subtarget>().getRegisterInfo();
    setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
  
-  // Bypass expensive divides on Atom when compiling with O2
-  if (Subtarget->hasSlowDivide() && TM.getOptLevel() >= CodeGenOpt::Default) {
-    addBypassSlowDiv(32, 8);
-    if (Subtarget->is64Bit())
+  // Bypass expensive divides on Atom when compiling with O2.
+  if (TM.getOptLevel() >= CodeGenOpt::Default) {
+    if (Subtarget->hasSlowDivide32())
+      addBypassSlowDiv(32, 8);
+    if (Subtarget->hasSlowDivide64() && Subtarget->is64Bit())
        addBypassSlowDiv(64, 16);
    }
  
@@ -1320,13 +1319,21 @@ void X86TargetLowering::resetOperationActions() {
  
        // Extract subvector is special because the value type
        // (result) is 128-bit but the source is 256-bit wide.
-      if (VT.is128BitVector())
+      if (VT.is128BitVector()) {
+        if (VT.getScalarSizeInBits() >= 32) {
+          setOperationAction(ISD::MLOAD,  VT, Custom);
+          setOperationAction(ISD::MSTORE, VT, Custom);
+        }
          setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
-
+      }
        // Do not attempt to custom lower other non-256-bit vectors
        if (!VT.is256BitVector())
          continue;
  
+      if (VT.getScalarSizeInBits() >= 32) {
+        setOperationAction(ISD::MLOAD,  VT, Legal);
+        setOperationAction(ISD::MSTORE, VT, Legal);
+      }
        setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
        setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
        setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
@@ -1493,9 +1500,13 @@ void X86TargetLowering::resetOperationActions() {
        unsigned EltSize = VT.getVectorElementType().getSizeInBits();
        // Extract subvector is special because the value type
        // (result) is 256/128-bit but the source is 512-bit wide.
-      if (VT.is128BitVector() || VT.is256BitVector())
+      if (VT.is128BitVector() || VT.is256BitVector()) {
          setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
-
+        if ( EltSize >= 32) {
+          setOperationAction(ISD::MLOAD,   VT, Legal);
+          setOperationAction(ISD::MSTORE,  VT, Legal);
+        }
+      }
        if (VT.getVectorElementType() == MVT::i1)
          setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
  
@@ -1511,12 +1522,14 @@ void X86TargetLowering::resetOperationActions() {
          setOperationAction(ISD::EXTRACT_VECTOR_ELT,  VT, Custom);
          setOperationAction(ISD::SCALAR_TO_VECTOR,    VT, Custom);
          setOperationAction(ISD::INSERT_SUBVECTOR,    VT, Custom);
+        setOperationAction(ISD::MLOAD,               VT, Legal);
+        setOperationAction(ISD::MSTORE,              VT, Legal);
        }
      }
      for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
        MVT VT = (MVT::SimpleValueType)i;
  
-      // Do not attempt to promote non-256-bit vectors
+      // Do not attempt to promote non-512-bit vectors.
        if (!VT.is512BitVector())
          continue;
  
@@ -1542,11 +1555,11 @@ void X86TargetLowering::resetOperationActions() {
  
        const unsigned EltSize = VT.getVectorElementType().getSizeInBits();
  
-      // Do not attempt to promote non-256-bit vectors
+      // Do not attempt to promote non-512-bit vectors.
        if (!VT.is512BitVector())
          continue;
  
-      if ( EltSize < 32) {
+      if (EltSize < 32) {
          setOperationAction(ISD::BUILD_VECTOR,        VT, Custom);
          setOperationAction(ISD::VSELECT,             VT, Legal);
        }
@@ -1607,9 +1620,8 @@ void X86TargetLowering::resetOperationActions() {
      setLibcallName(RTLIB::SINCOS_F32, "sincosf");
      setLibcallName(RTLIB::SINCOS_F64, "sincos");
      if (Subtarget->isTargetDarwin()) {
-      // For MacOSX, we don't want to the normal expansion of a libcall to
-      // sincos. We want to issue a libcall to __sincos_stret to avoid memory
-      // traffic.
+      // For MacOSX, we don't want the normal expansion of a libcall to sincos.
+      // We want to issue a libcall to __sincos_stret to avoid memory traffic.
        setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
        setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
      }
@@ -1994,7 +2006,7 @@ X86TargetLowering::LowerReturn(SDValue Chain,
      SDValue ValToCopy = OutVals[i];
      EVT ValVT = ValToCopy.getValueType();
  
-    // Promote values to the appropriate types
+    // Promote values to the appropriate types.
      if (VA.getLocInfo() == CCValAssign::SExt)
        ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
      else if (VA.getLocInfo() == CCValAssign::ZExt)
@@ -2005,7 +2017,7 @@ X86TargetLowering::LowerReturn(SDValue Chain,
        ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy);
  
      assert(VA.getLocInfo() != CCValAssign::FPExt &&
-           "Unexpected FP-extend for return value.");  
+           "Unexpected FP-extend for return value.");
  
      // If this is x86-64, and we disabled SSE, we can't return FP values,
      // or SSE or MMX vectors.
@@ -3494,7 +3506,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
        // In PIC we need an extra register to formulate the address computation
        // for the callee.
        unsigned MaxInRegs =
-       (DAG.getTarget().getRelocationModel() == Reloc::PIC_) ? 2 : 3;
+        (DAG.getTarget().getRelocationModel() == Reloc::PIC_) ? 2 : 3;
  
        for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
          CCValAssign &VA = ArgLocs[i];
@@ -5756,7 +5768,8 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
    // We only know how to deal with build_vector nodes where elements are either
    // zeroable or extract_vector_elt with constant index.
    SDValue FirstNonZero;
-  for (int i=0; i < 4; ++i) {
+  unsigned FirstNonZeroIdx;
+  for (unsigned i=0; i < 4; ++i) {
      if (Zeroable[i])
        continue;
      SDValue Elt = Op->getOperand(i);
@@ -5767,8 +5780,10 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
      MVT VT = Elt.getOperand(0).getSimpleValueType();
      if (!VT.is128BitVector())
        return SDValue();
-    if (!FirstNonZero.getNode())
+    if (!FirstNonZero.getNode()) {
        FirstNonZero = Elt;
+      FirstNonZeroIdx = i;
+    }
    }
  
    assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
@@ -5807,14 +5822,14 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
      return SDValue();
  
    SDValue V2 = Elt.getOperand(0);
-  if (Elt == FirstNonZero)
+  if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
      V1 = SDValue();
  
    bool CanFold = true;
    for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
      if (Zeroable[i])
        continue;
-    
+
      SDValue Current = Op->getOperand(i);
      SDValue SrcVector = Current->getOperand(0);
      if (!V1.getNode())
@@ -6339,8 +6354,7 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
        AllContants = false;
        NonConstIdx = idx;
        NumNonConsts++;
-    }
-    else {
+    } else {
        NumConsts++;
        if (cast<ConstantSDNode>(In)->getZExtValue())
        Immediate |= (1ULL << idx);
@@ -6363,7 +6377,7 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
                                           MVT::getIntegerVT(VT.getSizeInBits()));
        DstVec = DAG.getNode(ISD::BITCAST, dl, VT, VecAsImm);
      }
-    else 
+    else
        DstVec = DAG.getUNDEF(VT);
      return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
                         Op.getOperand(NonConstIdx),
@@ -6386,7 +6400,7 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
  
  /// \brief Return true if \p N implements a horizontal binop and return the
  /// operands for the horizontal binop into V0 and V1.
-/// 
+///
  /// This is a helper function of PerformBUILD_VECTORCombine.
  /// This function checks that the build_vector \p N in input implements a
  /// horizontal operation. Parameter \p Opcode defines the kind of horizontal
@@ -6407,7 +6421,7 @@ static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
    assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
    assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
           "Invalid Vector in input!");
-  
+
    bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
    bool CanFold = true;
    unsigned ExpectedVExtractIdx = BaseIdx;
@@ -6476,13 +6490,13 @@ static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
  }
  
  /// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
-/// a concat_vector. 
+/// a concat_vector.
  ///
  /// This is a helper function of PerformBUILD_VECTORCombine.
  /// This function expects two 256-bit vectors called V0 and V1.
  /// At first, each vector is split into two separate 128-bit vectors.
  /// Then, the resulting 128-bit vectors are used to implement two
-/// horizontal binary operations. 
+/// horizontal binary operations.
  ///
  /// The kind of horizontal binary operation is defined by \p X86Opcode.
  ///
@@ -6676,18 +6690,18 @@ static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG,
      // Try to match an SSE3 float HADD/HSUB.
      if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
        return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
-    
+
      if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
        return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
    } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget->hasSSSE3()) {
      // Try to match an SSSE3 integer HADD/HSUB.
      if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
        return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
-    
+
      if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
        return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
    }
-  
+
    if (!Subtarget->hasAVX())
      return SDValue();
  
@@ -6738,7 +6752,7 @@ static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG,
        // Do this only if the target has AVX2.
        if (Subtarget->hasAVX2())
          return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
- 
+
        // Do not try to expand this build_vector into a pair of horizontal
        // add/sub if we can emit a pair of scalar add/sub.
        if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
@@ -7492,9 +7506,9 @@ static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(SDLoc DL, MVT VT,
  /// does not check for the profitability of lowering either as PALIGNR or
  /// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
  /// This matches shuffle vectors that look like:
-/// 
+///
  ///   v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
-/// 
+///
  /// Essentially it concatenates V1 and V2, shifts right by some number of
  /// elements, and takes the low elements as the result. Note that while this is
  /// specified as a *right shift* because x86 is little-endian, it is a *left
@@ -8199,6 +8213,11 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                      getV4X86ShuffleImm8ForMask(WidenedMask, DAG)));
    }
  
+  // Try to use byte shift instructions.
+  if (SDValue Shift = lowerVectorShuffleAsByteShift(
+          DL, MVT::v2i64, V1, V2, Mask, DAG))
+    return Shift;
+
    // If we have a single input from V2 insert that into V1 if we can do so
    // cheaply.
    if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
@@ -8225,11 +8244,6 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                                    Subtarget, DAG))
        return Blend;
  
-  // Try to use byte shift instructions.
-  if (SDValue Shift = lowerVectorShuffleAsByteShift(
-          DL, MVT::v2i64, V1, V2, Mask, DAG))
-    return Shift;
-
    // Try to use byte rotation instructions.
    // Its more profitable for pre-SSSE3 to use shuffles/unpacks.
    if (Subtarget->hasSSSE3())
@@ -8490,6 +8504,11 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                         getV4X86ShuffleImm8ForMask(Mask, DAG));
    }
  
+  // Try to use byte shift instructions.
+  if (SDValue Shift = lowerVectorShuffleAsByteShift(
+          DL, MVT::v4i32, V1, V2, Mask, DAG))
+    return Shift;
+
    // There are special ways we can lower some single-element blends.
    if (NumV2Elements == 1)
      if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4i32, DL, V1, V2,
@@ -8507,11 +8526,6 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                                    Subtarget, DAG))
        return Blend;
  
-  // Try to use byte shift instructions.
-  if (SDValue Shift = lowerVectorShuffleAsByteShift(
-          DL, MVT::v4i32, V1, V2, Mask, DAG))
-    return Shift;
-
    // Try to use byte rotation instructions.
    // Its more profitable for pre-SSSE3 to use shuffles/unpacks.
    if (Subtarget->hasSSSE3())
@@ -8575,17 +8589,17 @@ static SDValue lowerV8I16SingleInputVectorShuffle(
                                                          Mask, Subtarget, DAG))
      return Broadcast;
  
+  // Try to use byte shift instructions.
+  if (SDValue Shift = lowerVectorShuffleAsByteShift(
+          DL, MVT::v8i16, V, V, Mask, DAG))
+    return Shift;
+
    // Use dedicated unpack instructions for masks that match their pattern.
    if (isShuffleEquivalent(Mask, 0, 0, 1, 1, 2, 2, 3, 3))
      return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V, V);
    if (isShuffleEquivalent(Mask, 4, 4, 5, 5, 6, 6, 7, 7))
      return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V, V);
  
-  // Try to use byte shift instructions.
-  if (SDValue Shift = lowerVectorShuffleAsByteShift(
-          DL, MVT::v8i16, V, V, Mask, DAG))
-    return Shift;
-
    // Try to use byte rotation instructions.
    if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
            DL, MVT::v8i16, V, V, Mask, Subtarget, DAG))
@@ -9192,6 +9206,11 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
    assert(NumV1Inputs > 0 && "All single-input shuffles should be canonicalized "
                              "to be V1-input shuffles.");
  
+  // Try to use byte shift instructions.
+  if (SDValue Shift = lowerVectorShuffleAsByteShift(
+          DL, MVT::v8i16, V1, V2, Mask, DAG))
+    return Shift;
+
    // There are special ways we can lower some single-element blends.
    if (NumV2Inputs == 1)
      if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v8i16, DL, V1, V2,
@@ -9209,11 +9228,6 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                                    Subtarget, DAG))
        return Blend;
  
-  // Try to use byte shift instructions.
-  if (SDValue Shift = lowerVectorShuffleAsByteShift(
-          DL, MVT::v8i16, V1, V2, Mask, DAG))
-    return Shift;
-
    // Try to use byte rotation instructions.
    if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
            DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
@@ -9983,6 +9997,104 @@ static SDValue lowerV2X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1,
                       DAG.getConstant(PermMask, MVT::i8));
  }
  
+/// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then
+/// shuffling each lane.
+///
+/// This only succeeds (otherwise an empty SDValue is returned) when fixing the
+/// 128-bit lanes leaves a single-input, non-lane-crossing shuffle with the same
+/// mask repeated in each 128-bit lane. This handles many cases where we can
+/// blend away the lane crosses early and then use simpler in-lane shuffles.
+///
+/// FIXME: It might be worthwhile at some point to support this without
+/// requiring the 128-bit lane-relative shuffles to be repeating, but currently
+/// in x86 only floating point has interesting non-repeating shuffles, and even
+/// those are still *marginally* more expensive.
+static SDValue lowerVectorShuffleByMerging128BitLanes(
+    SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+    const X86Subtarget *Subtarget, SelectionDAG &DAG) { // Subtarget is unused.
+  assert(!isSingleInputShuffleMask(Mask) &&
+         "This is only useful with multiple inputs.");
+
+  int Size = Mask.size();
+  int LaneSize = 128 / VT.getScalarSizeInBits(); // Elements per 128-bit lane.
+  int NumLanes = Size / LaneSize;
+  assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.");
+
+  // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also
+  // check whether the in-128-bit lane shuffles share a repeating pattern.
+  SmallVector<int, 4> Lanes; // Source lane picked for each result lane.
+  Lanes.resize(NumLanes, -1);
+  SmallVector<int, 4> InLaneMask; // Per-slot mask shared by all lanes.
+  InLaneMask.resize(LaneSize, -1);
+  for (int i = 0; i < Size; ++i) {
+    if (Mask[i] < 0)
+      continue;
+
+    int j = i / LaneSize; // Result lane containing element i.
+
+    if (Lanes[j] < 0) {
+      // First entry we've seen for this lane.
+      Lanes[j] = Mask[i] / LaneSize;
+    } else if (Lanes[j] != Mask[i] / LaneSize) {
+      // This doesn't match the lane selected previously!
+      return SDValue();
+    }
+
+    // Check that within each lane we have a consistent shuffle mask.
+    int k = i % LaneSize; // Slot of element i within its lane.
+    if (InLaneMask[k] < 0) {
+      InLaneMask[k] = Mask[i] % LaneSize;
+    } else if (InLaneMask[k] != Mask[i] % LaneSize) {
+      // This doesn't fit a repeating in-lane mask.
+      return SDValue();
+    }
+  }
+
+  // First shuffle the lanes into place, treating each lane as two 64-bit elts.
+  MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
+                                VT.getSizeInBits() / 64);
+  SmallVector<int, 8> LaneMask;
+  LaneMask.resize(NumLanes * 2, -1);
+  for (int i = 0; i < NumLanes; ++i)
+    if (Lanes[i] >= 0) {
+      LaneMask[2 * i + 0] = 2*Lanes[i] + 0;
+      LaneMask[2 * i + 1] = 2*Lanes[i] + 1;
+    }
+
+  V1 = DAG.getNode(ISD::BITCAST, DL, LaneVT, V1);
+  V2 = DAG.getNode(ISD::BITCAST, DL, LaneVT, V2);
+  SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);
+
+  // Cast it back to the type we actually want.
+  LaneShuffle = DAG.getNode(ISD::BITCAST, DL, VT, LaneShuffle);
+
+  // Now do an in-lane shuffle: keep i's lane, take Mask[i]'s in-lane offset.
+  SmallVector<int, 8> NewMask;
+  NewMask.resize(Size, -1);
+  for (int i = 0; i < Size; ++i)
+    if (Mask[i] >= 0)
+      NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize;
+  assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&
+         "Must not introduce lane crosses at this point!");
+
+  return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
+}
+
+/// \brief Check whether one shuffle input (0 or 1) needs no permutation.
+///
+/// Yields true when every non-negative mask entry that selects \p Input
+/// already names the slot it occupies, i.e. that input is blended in place.
+static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
+  assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
+  const int NumElts = Mask.size();
+  for (int Slot = 0; Slot != NumElts; ++Slot) {
+    const int M = Mask[Slot];
+    if (M >= 0 && M / NumElts == Input && M % NumElts != Slot)
+      return false; // An element of Input sits in the wrong slot.
+  }
+  return true;
+}
+
  /// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
  ///
  /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
@@ -10067,6 +10179,16 @@ static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                         DAG.getConstant(SHUFPDMask, MVT::i8));
    }
  
+  // Try to simplify this by merging 128-bit lanes to enable a lane-based
+  // shuffle. However, if we have AVX2 and either input is already in place,
+  // we will be able to shuffle the other input, even across lanes, in a single
+  // instruction, so skip this pattern.
+  if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
+                                 isShuffleMaskInputInPlace(1, Mask))))
+    if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
+            DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
+      return Result;
+
    // If we have AVX2 then we always want to lower with a blend because an v4 we
    // can fully permute the elements.
    if (Subtarget->hasAVX2())
@@ -10137,6 +10259,16 @@ static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
      return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
                         getV4X86ShuffleImm8ForMask(Mask, DAG));
  
+  // Try to simplify this by merging 128-bit lanes to enable a lane-based
+  // shuffle. However, if we have AVX2 and either input is already in place,
+  // we will be able to shuffle the other input, even across lanes, in a single
+  // instruction, so skip this pattern.
+  if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
+                                 isShuffleMaskInputInPlace(1, Mask))))
+    if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
+            DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
+      return Result;
+
    // Otherwise fall back on generic blend lowering.
    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
                                                      Mask, DAG);
@@ -10214,6 +10346,12 @@ static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                                     DAG);
    }
  
+  // Try to simplify this by merging 128-bit lanes to enable a lane-based
+  // shuffle.
+  if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
+          DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
+    return Result;
+
    // If we have AVX2 then we always want to lower with a blend because at v8 we
    // can fully permute the elements.
    if (Subtarget->hasAVX2())
@@ -10277,6 +10415,12 @@ static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
          DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask), V1);
    }
  
+  // Try to simplify this by merging 128-bit lanes to enable a lane-based
+  // shuffle.
+  if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
+          DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
+    return Result;
+
    // Otherwise fall back on generic blend lowering.
    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
                                                      Mask, DAG);
@@ -10302,12 +10446,6 @@ static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                                          Mask, Subtarget, DAG))
      return Broadcast;
  
-  // There are no generalized cross-lane shuffle operations available on i16
-  // element types.
-  if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
-    return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
-                                                   Mask, DAG);
-
    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
                                                  Subtarget, DAG))
      return Blend;
@@ -10327,6 +10465,12 @@ static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
      return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i16, V1, V2);
  
    if (isSingleInputShuffleMask(Mask)) {
+    // There are no generalized cross-lane shuffle operations available on i16
+    // element types.
+    if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
+      return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
+                                                     Mask, DAG);
+
      SDValue PSHUFBMask[32];
      for (int i = 0; i < 16; ++i) {
        if (Mask[i] == -1) {
@@ -10347,6 +10491,12 @@ static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
              DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask)));
    }
  
+  // Try to simplify this by merging 128-bit lanes to enable a lane-based
+  // shuffle.
+  if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
+          DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
+    return Result;
+
    // Otherwise fall back on generic lowering.
    return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG);
  }
@@ -10371,12 +10521,6 @@ static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                                          Mask, Subtarget, DAG))
      return Broadcast;
  
-  // There are no generalized cross-lane shuffle operations available on i8
-  // element types.
-  if (is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
-    return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2,
-                                                   Mask, DAG);
-
    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
                                                  Subtarget, DAG))
      return Blend;
@@ -10400,6 +10544,12 @@ static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
      return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v32i8, V1, V2);
  
    if (isSingleInputShuffleMask(Mask)) {
+    // There are no generalized cross-lane shuffle operations available on i8
+    // element types.
+    if (is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
+      return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2,
+                                                     Mask, DAG);
+
      SDValue PSHUFBMask[32];
      for (int i = 0; i < 32; ++i)
        PSHUFBMask[i] =
@@ -10412,6 +10562,12 @@ static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
          DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask));
    }
  
+  // Try to simplify this by merging 128-bit lanes to enable a lane-based
+  // shuffle.
+  if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
+          DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
+    return Result;
+
    // Otherwise fall back on generic lowering.
    return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG);
  }
@@ -10690,7 +10846,8 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
    // When the number of V1 and V2 elements are the same, try to minimize the
    // number of uses of V2 in the low half of the vector. When that is tied,
    // ensure that the sum of indices for V1 is equal to or lower than the sum
-  // indices for V2.
+  // indices for V2. When those are equal, try to ensure that V1 fills no more
+  // odd-numbered result slots than V2 does.
    if (NumV1Elements == NumV2Elements) {
      int LowV1Elements = 0, LowV2Elements = 0;
      for (int M : SVOp->getMask().slice(0, NumElements / 2))
@@ -10707,8 +10864,18 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
            SumV2Indices += i;
          else if (SVOp->getMask()[i] >= 0)
            SumV1Indices += i;
-      if (SumV2Indices < SumV1Indices)
+      if (SumV2Indices < SumV1Indices) {
          return DAG.getCommutedVectorShuffle(*SVOp);
+      } else if (SumV2Indices == SumV1Indices) {
+        int NumV1OddIndices = 0, NumV2OddIndices = 0;
+        for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i)
+          if (SVOp->getMask()[i] >= NumElements)
+            NumV2OddIndices += i % 2;
+          else if (SVOp->getMask()[i] >= 0)
+            NumV1OddIndices += i % 2;
+        if (NumV2OddIndices < NumV1OddIndices)
+          return DAG.getCommutedVectorShuffle(*SVOp);
+      }
      }
    }
  
@@ -12631,7 +12798,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
  
  /// Insert one bit to mask vector, like v16i1 or v8i1.
  /// AVX-512 feature.
-SDValue 
+SDValue
  X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
    SDLoc dl(Op);
    SDValue Vec = Op.getOperand(0);
@@ -12644,7 +12811,7 @@ X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
      // insert element and then truncate the result.
      MVT ExtVecVT = (VecVT == MVT::v8i1 ?  MVT::v8i64 : MVT::v16i32);
      MVT ExtEltVT = (VecVT == MVT::v8i1 ?  MVT::i64 : MVT::i32);
-    SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT, 
+    SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
        DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec),
        DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx);
      return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
@@ -13392,7 +13559,7 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
      }
      return SDValue();
    }
-  
+
    assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
           "Unknown SINT_TO_FP to lower!");
  
@@ -14039,7 +14206,7 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
        In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
        InVT = ExtVT;
      }
-    
+
      SDValue Cst = DAG.getTargetConstant(1, InVT.getVectorElementType());
      const Constant *C = (dyn_cast<ConstantSDNode>(Cst))->getConstantIntValue();
      SDValue CP = DAG.getConstantPool(C, getPointerTy());
@@ -14233,7 +14400,7 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
      EltVT = VT.getVectorElementType();
      NumElts = VT.getVectorNumElements();
    }
-  
+
    unsigned EltBits = EltVT.getSizeInBits();
    LLVMContext *Context = DAG.getContext();
    // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
@@ -14260,7 +14427,7 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
      return DAG.getNode(ISD::BITCAST, dl, VT,
                         DAG.getNode(BitOp, dl, VecVT, Operand, MaskCasted));
    }
-  
+
    // If not vector, then scalar.
    unsigned BitOp = IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
    SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
@@ -14717,12 +14884,12 @@ SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
       if (Op0.getValueType() == MVT::i1)
         llvm_unreachable("Unexpected comparison operation for MVT::i1 operands");
    }
- 
+
    if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
         Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
-    // Do the comparison at i32 if it's smaller, besides the Atom case. 
-    // This avoids subregister aliasing issues. Keep the smaller reference 
-    // if we're optimizing for size, however, as that'll allow better folding 
+    // Do the comparison at i32 if it's smaller, besides the Atom case.
+    // This avoids subregister aliasing issues. Keep the smaller reference
+    // if we're optimizing for size, however, as that'll allow better folding
      // of memory operations.
      if (Op0.getValueType() != MVT::i32 && Op0.getValueType() != MVT::i64 &&
          !DAG.getMachineFunction().getFunction()->getAttributes().hasAttribute(
@@ -14780,7 +14947,7 @@ SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op,
      return SDValue();
  
    EVT VT = Op.getValueType();
-  
+
    // SSE1 has rsqrtss and rsqrtps.
    // TODO: Add support for AVX512 (v16f32).
    // It is likely not profitable to do this for f64 because a double-precision
@@ -14808,9 +14975,9 @@ SDValue X86TargetLowering::getRecipEstimate(SDValue Op,
    // significant digits in the divisor.
    if (!Subtarget->useReciprocalEst())
      return SDValue();
-  
+
    EVT VT = Op.getValueType();
-  
+
    // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
    // TODO: Add support for AVX512 (v16f32).
    // It is likely not profitable to do this for f64 because a double-precision
@@ -15629,11 +15796,11 @@ static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, const X86Subtarget *Subtarget
  
         ((Subtarget->hasDQI() && Subtarget->hasVLX() &&
          VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() >= 32)) ||
-    
+
         ((Subtarget->hasDQI() && VT.is512BitVector() &&
          VTElt.getSizeInBits() >= 32))))
      return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
-    
+
    unsigned int NumElts = VT.getVectorNumElements();
  
    if (NumElts != 8 && NumElts != 16)
@@ -16628,6 +16795,23 @@ static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
      return DAG.getNode(ISD::VSELECT, dl, VT, VMask, Op, PreservedSrc);
  }
  
+static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
+                                    SDValue PreservedSrc,
+                                    const X86Subtarget *Subtarget,
+                                    SelectionDAG &DAG) { // Wraps Op in an X86ISD::SELECT keyed on Mask.
+    if (isAllOnes(Mask)) // All-ones mask: return Op itself, no select node is built.
+      return Op;
+
+    EVT VT = Op.getValueType();
+    SDLoc dl(Op);
+    // The mask should be of type MVT::i1, so truncate it before using it as the select condition.
+    SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask);
+
+    if (PreservedSrc.getOpcode() == ISD::UNDEF) // PreservedSrc is undef.
+      PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl); // Substitute getZeroVector's result of type VT.
+    return DAG.getNode(X86ISD::SELECT, dl, VT, IMask, Op, PreservedSrc); // Operand order: condition, Op, PreservedSrc.
+}
+
  static unsigned getOpcodeForFMAIntrinsic(unsigned IntNo) {
      switch (IntNo) {
      default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
@@ -16701,7 +16885,21 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
                                                RoundingMode),
                                    Mask, Src0, Subtarget, DAG);
      }
-                                              
+    case INTR_TYPE_SCALAR_MASK_RM: {
+      SDValue Src1 = Op.getOperand(1);
+      SDValue Src2 = Op.getOperand(2);
+      SDValue Src0 = Op.getOperand(3);
+      SDValue Mask = Op.getOperand(4);
+      SDValue RoundingMode = Op.getOperand(5);
+      return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
+                                              RoundingMode),
+                                  Mask, Src0, Subtarget, DAG);
+    }
+    case INTR_TYPE_2OP_MASK: {
+      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Op.getOperand(1),
+                                              Op.getOperand(2)),
+                                  Op.getOperand(4), Op.getOperand(3), Subtarget, DAG);
+    }
      case CMP_MASK:
      case CMP_MASK_CC: {
        // Comparison intrinsics with masks.
@@ -16753,7 +16951,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
      case VSHIFT_MASK:
        return getVectorMaskingNode(getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
                                                        Op.getOperand(1), Op.getOperand(2), DAG),
-                                  Op.getOperand(4), Op.getOperand(3), Subtarget, DAG);;
+                                  Op.getOperand(4), Op.getOperand(3), Subtarget, DAG);
      default:
        break;
      }
@@ -17305,7 +17503,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
    switch(IntrData->Type) {
    default:
      llvm_unreachable("Unknown Intrinsic Type");
-    break;    
+    break;
    case RDSEED:
    case RDRAND: {
      // Emit the node with the right value type.
@@ -18415,7 +18613,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
    // If possible, lower this packed shift into a vector multiply instead of
    // expanding it into a sequence of scalar shifts.
    // Do this only if the vector shift count is a constant build_vector.
-  if (Op.getOpcode() == ISD::SHL && 
+  if (Op.getOpcode() == ISD::SHL &&
        (VT == MVT::v8i16 || VT == MVT::v4i32 ||
         (Subtarget->hasInt256() && VT == MVT::v16i16)) &&
        ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
@@ -18507,15 +18705,15 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
            CanBeSimplified = Amt2 == Amt->getOperand(j);
        }
      }
-    
+
      if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
          isa<ConstantSDNode>(Amt2)) {
        // Replace this node with two shifts followed by a MOVSS/MOVSD.
        EVT CastVT = MVT::v4i32;
-      SDValue Splat1 = 
+      SDValue Splat1 =
          DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), VT);
        SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
-      SDValue Splat2 = 
+      SDValue Splat2 =
          DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), VT);
        SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
        if (TargetOpcode == X86ISD::MOVSD)
@@ -19809,6 +20007,7 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
    return (SVT.getVectorNumElements() == 2 ||
            ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
            isMOVLMask(M, SVT) ||
+          isCommutedMOVLMask(M, SVT) ||
            isMOVHLPSMask(M, SVT) ||
            isSHUFPMask(M, SVT) ||
            isSHUFPMask(M, SVT, /* Commuted */ true) ||
@@ -20739,6 +20938,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
    //  v = phi(main, restore)
    //
    // restoreMBB:
+  //  if base pointer being used, load it from frame
    //  v_restore = 1
  
    MachineBasicBlock *thisMBB = MBB;
@@ -20822,6 +21022,18 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
      .addReg(restoreDstReg).addMBB(restoreMBB);
  
    // restoreMBB:
+  if (RegInfo->hasBasePointer(*MF)) {
+    const X86Subtarget &STI = MF->getTarget().getSubtarget<X86Subtarget>();
+    const bool Uses64BitFramePtr = STI.isTarget64BitLP64() || STI.isTargetNaCl64();
+    X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
+    X86FI->setRestoreBasePointer(MF);
+    unsigned FramePtr = RegInfo->getFrameRegister(*MF);
+    unsigned BasePtr = RegInfo->getBaseRegister();
+    unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
+    addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
+                 FramePtr, true, X86FI->getRestoreBasePointerOffset())
+      .setMIFlag(MachineInstr::FrameSetup);
+  }
    BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
    BuildMI(restoreMBB, DL, TII->get(X86::JMP_4)).addMBB(sinkMBB);
    restoreMBB->addSuccessor(sinkMBB);
@@ -20895,7 +21107,7 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
  
  // Replace 213-type (isel default) FMA3 instructions with 231-type for
  // accumulator loops. Writing back to the accumulator allows the coalescer
-// to remove extra copies in the loop.   
+// to remove extra copies in the loop.
  MachineBasicBlock *
  X86TargetLowering::emitFMA3Instr(MachineInstr *MI,
                                   MachineBasicBlock *MBB) const {
@@ -21157,6 +21369,11 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
    case X86::EH_SjLj_LongJmp64:
      return emitEHSjLjLongJmp(MI, BB);
  
+  case TargetOpcode::STATEPOINT:
+    // As an implementation detail, STATEPOINT shares the STACKMAP format at
+    // this point in the process.  We diverge later.
+    return emitPatchPoint(MI, BB);
+
    case TargetOpcode::STACKMAP:
    case TargetOpcode::PATCHPOINT:
      return emitPatchPoint(MI, BB);
@@ -22176,7 +22393,7 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
      EVT SVT = BC0.getValueType();
      unsigned Opcode = BC0.getOpcode();
      unsigned NumElts = VT.getVectorNumElements();
-    
+
      if (BC0.hasOneUse() && SVT.isVector() &&
          SVT.getVectorNumElements() * 2 == NumElts &&
          TLI.isOperationLegal(Opcode, VT)) {
@@ -22522,7 +22739,7 @@ matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS,
  }
  
  static SDValue
-TransformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG,
+transformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG,
                                        const X86Subtarget *Subtarget) {
    SDLoc dl(N);
    SDValue Cond = N->getOperand(0);
@@ -22535,18 +22752,6 @@ TransformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG,
        Cond = CondSrc->getOperand(0);
    }
  
-  MVT VT = N->getSimpleValueType(0);
-  MVT EltVT = VT.getVectorElementType();
-  unsigned NumElems = VT.getVectorNumElements();
-  // There is no blend with immediate in AVX-512.
-  if (VT.is512BitVector())
-    return SDValue();
-
-  if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
-    return SDValue();
-  if (!Subtarget->hasInt256() && VT == MVT::v16i16)
-    return SDValue();
-
    if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
      return SDValue();
  
@@ -22560,6 +22765,8 @@ TransformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG,
    if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
      return SDValue();
  
+  MVT VT = N->getSimpleValueType(0);
+  unsigned NumElems = VT.getVectorNumElements();
    SmallVector<int, 8> ShuffleMask(NumElems, -1);
    for (unsigned i = 0; i < NumElems; ++i) {
      // Be sure we emit undef where we can.
@@ -22569,6 +22776,9 @@ TransformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG,
        ShuffleMask[i] = i + NumElems * ((MaskValue >> i) & 1);
    }
  
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (!TLI.isShuffleMaskLegal(ShuffleMask, VT))
+    return SDValue();
    return DAG.getVectorShuffle(VT, dl, LHS, RHS, &ShuffleMask[0]);
  }
  
@@ -23008,81 +23218,6 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
      }
    }
  
-  // Try to fold this VSELECT into a MOVSS/MOVSD
-  if (N->getOpcode() == ISD::VSELECT &&
-      Cond.getOpcode() == ISD::BUILD_VECTOR && !DCI.isBeforeLegalize()) {
-    if (VT == MVT::v4i32 || VT == MVT::v4f32 ||
-        (Subtarget->hasSSE2() && (VT == MVT::v2i64 || VT == MVT::v2f64))) {
-      bool CanFold = false;
-      unsigned NumElems = Cond.getNumOperands();
-      SDValue A = LHS;
-      SDValue B = RHS;
-      
-      if (isZero(Cond.getOperand(0))) {
-        CanFold = true;
-
-        // fold (vselect <0,-1,-1,-1>, A, B) -> (movss A, B)
-        // fold (vselect <0,-1> -> (movsd A, B)
-        for (unsigned i = 1, e = NumElems; i != e && CanFold; ++i)
-          CanFold = isAllOnes(Cond.getOperand(i));
-      } else if (isAllOnes(Cond.getOperand(0))) {
-        CanFold = true;
-        std::swap(A, B);
-
-        // fold (vselect <-1,0,0,0>, A, B) -> (movss B, A)
-        // fold (vselect <-1,0> -> (movsd B, A)
-        for (unsigned i = 1, e = NumElems; i != e && CanFold; ++i)
-          CanFold = isZero(Cond.getOperand(i));
-      }
-
-      if (CanFold) {
-        if (VT == MVT::v4i32 || VT == MVT::v4f32)
-          return getTargetShuffleNode(X86ISD::MOVSS, DL, VT, A, B, DAG);
-        return getTargetShuffleNode(X86ISD::MOVSD, DL, VT, A, B, DAG);
-      }
-
-      if (Subtarget->hasSSE2() && (VT == MVT::v4i32 || VT == MVT::v4f32)) {
-        // fold (v4i32: vselect <0,0,-1,-1>, A, B) ->
-        //      (v4i32 (bitcast (movsd (v2i64 (bitcast A)),
-        //                             (v2i64 (bitcast B)))))
-        //
-        // fold (v4f32: vselect <0,0,-1,-1>, A, B) ->
-        //      (v4f32 (bitcast (movsd (v2f64 (bitcast A)),
-        //                             (v2f64 (bitcast B)))))
-        //
-        // fold (v4i32: vselect <-1,-1,0,0>, A, B) ->
-        //      (v4i32 (bitcast (movsd (v2i64 (bitcast B)),
-        //                             (v2i64 (bitcast A)))))
-        //
-        // fold (v4f32: vselect <-1,-1,0,0>, A, B) ->
-        //      (v4f32 (bitcast (movsd (v2f64 (bitcast B)),
-        //                             (v2f64 (bitcast A)))))
-
-        CanFold = (isZero(Cond.getOperand(0)) &&
-                   isZero(Cond.getOperand(1)) &&
-                   isAllOnes(Cond.getOperand(2)) &&
-                   isAllOnes(Cond.getOperand(3)));
-
-        if (!CanFold && isAllOnes(Cond.getOperand(0)) &&
-            isAllOnes(Cond.getOperand(1)) &&
-            isZero(Cond.getOperand(2)) &&
-            isZero(Cond.getOperand(3))) {
-          CanFold = true;
-          std::swap(LHS, RHS);
-        }
-
-        if (CanFold) {
-          EVT NVT = (VT == MVT::v4i32) ? MVT::v2i64 : MVT::v2f64;
-          SDValue NewA = DAG.getNode(ISD::BITCAST, DL, NVT, LHS);
-          SDValue NewB = DAG.getNode(ISD::BITCAST, DL, NVT, RHS);
-          SDValue Select = getTargetShuffleNode(X86ISD::MOVSD, DL, NVT, NewA,
-                                                NewB, DAG);
-          return DAG.getNode(ISD::BITCAST, DL, VT, Select);
-        }
-      }
-    }
-  }
-
    // If we know that this node is legal then we know that it is going to be
    // matched by one of the SSE/AVX BLEND instructions. These instructions only
    // depend on the highest bit in each word. Try to use SimplifyDemandedBits
@@ -23167,7 +23302,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
    if ((N->getOpcode() == ISD::VSELECT ||
         N->getOpcode() == X86ISD::SHRUNKBLEND) &&
        !DCI.isBeforeLegalize()) {
-    SDValue Shuffle = TransformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget);
+    SDValue Shuffle = transformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget);
      if (Shuffle.getNode())
        return Shuffle;
    }
@@ -23524,7 +23659,7 @@ static SDValue PerformINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG,
      // fold (blend A, B, allOnes) -> B
      if (ISD::isBuildVectorAllOnes(Mask.getNode()))
        return Op1;
-    
+
      // Simplify the case where the mask is a constant i32 value.
      if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Mask)) {
        if (C->isNullValue())
@@ -24233,11 +24368,12 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
    SDLoc dl(Ld);
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  
-  // On Sandybridge unaligned 256bit loads are inefficient.
+  // For chips with slow 32-byte unaligned loads, break the 32-byte operation
+  // into two 16-byte operations.
    ISD::LoadExtType Ext = Ld->getExtensionType();
    unsigned Alignment = Ld->getAlignment();
    bool IsAligned = Alignment == 0 || Alignment >= MemVT.getSizeInBits()/8;
-  if (RegVT.is256BitVector() && !Subtarget->hasInt256() &&
+  if (RegVT.is256BitVector() && Subtarget->isUnalignedMem32Slow() &&
        !DCI.isBeforeLegalizeOps() && !IsAligned && Ext == ISD::NON_EXTLOAD) {
      unsigned NumElems = RegVT.getVectorNumElements();
      if (NumElems < 2)
@@ -24280,13 +24416,11 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
    SDValue StoredVal = St->getOperand(1);
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  
-  // If we are saving a concatenation of two XMM registers, perform two stores.
-  // On Sandy Bridge, 256-bit memory operations are executed by two
-  // 128-bit ports. However, on Haswell it is better to issue a single 256-bit
-  // memory  operation.
+  // If we are saving a concatenation of two XMM registers and 32-byte stores
+  // are slow, such as on Sandy Bridge, perform two 16-byte stores.
    unsigned Alignment = St->getAlignment();
    bool IsAligned = Alignment == 0 || Alignment >= VT.getSizeInBits()/8;
-  if (VT.is256BitVector() && !Subtarget->hasInt256() &&
+  if (VT.is256BitVector() && Subtarget->isUnalignedMem32Slow() &&
        StVT == VT && !IsAligned) {
      unsigned NumElems = VT.getVectorNumElements();
      if (NumElems < 2)
@@ -26193,7 +26327,7 @@ int X86TargetLowering::getScalingFactorCost(const AddrMode &AM,
    // "load" ports instead of the dedicated "store" port.
    // E.g., on Haswell:
    // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
-  // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.   
+  // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
    if (isLegalAddressingMode(AM, Ty))
      // Scale represents reg2 * scale, thus account for 1
      // as soon as we use a second register.