Rename the x86 isTargetMacho to isTargetMachO for uniformity.

[oota-llvm.git] / lib / Target / X86 / X86ISelLowering.cpp
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index 4ec7fa15198a3fd7bf940248c3fa452a564bb530..7070e8c5d51fbe0c85a966d002f14429fded7654 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -107,7 +107,7 @@ static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
    // If the input is a buildvector just emit a smaller one.
    if (Vec.getOpcode() == ISD::BUILD_VECTOR)
      return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
-                       makeArrayRef(Vec->op_begin()+NormalizedIdxVal,
+                       makeArrayRef(Vec->op_begin() + NormalizedIdxVal,
                                      ElemsPerChunk));
  
    SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
@@ -115,13 +115,13 @@ static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
                                 VecIdx);
  
    return Result;
-
  }
+
  /// Generate a DAG to grab 128-bits from a vector > 128 bits.  This
  /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
  /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
  /// instructions or a simple subregister reference. Idx is an index in the
-/// 128 bits we want.  It need not be aligned to a 128-bit bounday.  That makes
+/// 128 bits we want.  It need not be aligned to a 128-bit boundary.  That makes
  /// lowering EXTRACT_VECTOR_ELT operations easier.
  static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
                                     SelectionDAG &DAG, SDLoc dl) {
@@ -158,25 +158,23 @@ static SDValue InsertSubVector(SDValue Result, SDValue Vec,
                                 * ElemsPerChunk);
  
    SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
-  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec,
-                     VecIdx);
+  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
  }
+
  /// Generate a DAG to put 128-bits into a vector > 128 bits.  This
  /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
  /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
  /// simple superregister reference.  Idx is an index in the 128 bits
-/// we want.  It need not be aligned to a 128-bit bounday.  That makes
+/// we want.  It need not be aligned to a 128-bit boundary.  That makes
  /// lowering INSERT_VECTOR_ELT operations easier.
-static SDValue Insert128BitVector(SDValue Result, SDValue Vec,
-                                  unsigned IdxVal, SelectionDAG &DAG,
-                                  SDLoc dl) {
+static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
+                                  SelectionDAG &DAG,SDLoc dl) {
    assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
    return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
  }
  
-static SDValue Insert256BitVector(SDValue Result, SDValue Vec,
-                                  unsigned IdxVal, SelectionDAG &DAG,
-                                  SDLoc dl) {
+static SDValue Insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
+                                  SelectionDAG &DAG, SDLoc dl) {
    assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
    return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
  }
@@ -230,13 +228,13 @@ void X86TargetLowering::resetOperationActions() {
    // Set up the TargetLowering object.
    static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };
  
-  // X86 is weird, it always uses i8 for shift amounts and setcc results.
+  // X86 is weird. It always uses i8 for shift amounts and setcc results.
    setBooleanContents(ZeroOrOneBooleanContent);
    // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
    setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
  
-  // For 64-bit since we have so many registers use the ILP scheduler, for
-  // 32-bit code use the register pressure specific scheduling.
+  // For 64-bit, since we have so many registers, use the ILP scheduler.
+  // For 32-bit, use the register pressure specific scheduling.
    // For Atom, always use ILP scheduling.
    if (Subtarget->isAtom())
      setSchedulingPreference(Sched::ILP);
@@ -248,9 +246,9 @@ void X86TargetLowering::resetOperationActions() {
        TM.getSubtarget<X86Subtarget>().getRegisterInfo();
    setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
  
-  // Bypass expensive divides on Atom when compiling with O2
+  // Bypass expensive divides on Atom when compiling with O2.
    if (TM.getOptLevel() >= CodeGenOpt::Default) {
-    if (Subtarget->hasSlowDivide32()) 
+    if (Subtarget->hasSlowDivide32())
        addBypassSlowDiv(32, 8);
      if (Subtarget->hasSlowDivide64() && Subtarget->is64Bit())
        addBypassSlowDiv(64, 16);
@@ -1321,13 +1319,21 @@ void X86TargetLowering::resetOperationActions() {
  
        // Extract subvector is special because the value type
        // (result) is 128-bit but the source is 256-bit wide.
-      if (VT.is128BitVector())
+      if (VT.is128BitVector()) {
+        if (VT.getScalarSizeInBits() >= 32) {
+          setOperationAction(ISD::MLOAD,  VT, Custom);
+          setOperationAction(ISD::MSTORE, VT, Custom);
+        }
          setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
-
+      }
        // Do not attempt to custom lower other non-256-bit vectors
        if (!VT.is256BitVector())
          continue;
  
+      if (VT.getScalarSizeInBits() >= 32) {
+        setOperationAction(ISD::MLOAD,  VT, Legal);
+        setOperationAction(ISD::MSTORE, VT, Legal);
+      }
        setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
        setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
        setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
@@ -1494,9 +1500,13 @@ void X86TargetLowering::resetOperationActions() {
        unsigned EltSize = VT.getVectorElementType().getSizeInBits();
        // Extract subvector is special because the value type
        // (result) is 256/128-bit but the source is 512-bit wide.
-      if (VT.is128BitVector() || VT.is256BitVector())
+      if (VT.is128BitVector() || VT.is256BitVector()) {
          setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
-
+        if ( EltSize >= 32) {
+          setOperationAction(ISD::MLOAD,   VT, Legal);
+          setOperationAction(ISD::MSTORE,  VT, Legal);
+        }
+      }
        if (VT.getVectorElementType() == MVT::i1)
          setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
  
@@ -1512,12 +1522,14 @@ void X86TargetLowering::resetOperationActions() {
          setOperationAction(ISD::EXTRACT_VECTOR_ELT,  VT, Custom);
          setOperationAction(ISD::SCALAR_TO_VECTOR,    VT, Custom);
          setOperationAction(ISD::INSERT_SUBVECTOR,    VT, Custom);
+        setOperationAction(ISD::MLOAD,               VT, Legal);
+        setOperationAction(ISD::MSTORE,              VT, Legal);
        }
      }
      for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
        MVT VT = (MVT::SimpleValueType)i;
  
-      // Do not attempt to promote non-256-bit vectors
+      // Do not attempt to promote non-512-bit vectors.
        if (!VT.is512BitVector())
          continue;
  
@@ -1543,11 +1555,11 @@ void X86TargetLowering::resetOperationActions() {
  
        const unsigned EltSize = VT.getVectorElementType().getSizeInBits();
  
-      // Do not attempt to promote non-256-bit vectors
+      // Do not attempt to promote non-512-bit vectors.
        if (!VT.is512BitVector())
          continue;
  
-      if ( EltSize < 32) {
+      if (EltSize < 32) {
          setOperationAction(ISD::BUILD_VECTOR,        VT, Custom);
          setOperationAction(ISD::VSELECT,             VT, Legal);
        }
@@ -1608,9 +1620,8 @@ void X86TargetLowering::resetOperationActions() {
      setLibcallName(RTLIB::SINCOS_F32, "sincosf");
      setLibcallName(RTLIB::SINCOS_F64, "sincos");
      if (Subtarget->isTargetDarwin()) {
-      // For MacOSX, we don't want to the normal expansion of a libcall to
-      // sincos. We want to issue a libcall to __sincos_stret to avoid memory
-      // traffic.
+      // For MacOSX, we don't want the normal expansion of a libcall to sincos.
+      // We want to issue a libcall to __sincos_stret to avoid memory traffic.
        setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
        setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
      }
@@ -1677,8 +1688,7 @@ void X86TargetLowering::resetOperationActions() {
  
  // This has so far only been implemented for 64-bit MachO.
  bool X86TargetLowering::useLoadStackGuardNode() const {
-  return Subtarget->getTargetTriple().getObjectFormat() == Triple::MachO &&
-         Subtarget->is64Bit();
+  return Subtarget->isTargetMachO() && Subtarget->is64Bit();
  }
  
  TargetLoweringBase::LegalizeTypeAction
@@ -1995,7 +2005,7 @@ X86TargetLowering::LowerReturn(SDValue Chain,
      SDValue ValToCopy = OutVals[i];
      EVT ValVT = ValToCopy.getValueType();
  
-    // Promote values to the appropriate types
+    // Promote values to the appropriate types.
      if (VA.getLocInfo() == CCValAssign::SExt)
        ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
      else if (VA.getLocInfo() == CCValAssign::ZExt)
@@ -2006,7 +2016,7 @@ X86TargetLowering::LowerReturn(SDValue Chain,
        ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy);
  
      assert(VA.getLocInfo() != CCValAssign::FPExt &&
-           "Unexpected FP-extend for return value.");  
+           "Unexpected FP-extend for return value.");
  
      // If this is x86-64, and we disabled SSE, we can't return FP values,
      // or SSE or MMX vectors.
@@ -3495,7 +3505,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
        // In PIC we need an extra register to formulate the address computation
        // for the callee.
        unsigned MaxInRegs =
-       (DAG.getTarget().getRelocationModel() == Reloc::PIC_) ? 2 : 3;
+        (DAG.getTarget().getRelocationModel() == Reloc::PIC_) ? 2 : 3;
  
        for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
          CCValAssign &VA = ArgLocs[i];
@@ -3662,7 +3672,7 @@ bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
    // For kernel code model we know that all objects reside in the negative half
    // of 32bits address space. We may not accept negative offsets, since they may
    // be just off and we may accept pretty large positive ones.
-  if (M == CodeModel::Kernel && Offset > 0)
+  if (M == CodeModel::Kernel && Offset >= 0)
      return true;
  
    return false;
@@ -5757,7 +5767,8 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
    // We only know how to deal with build_vector nodes where elements are either
    // zeroable or extract_vector_elt with constant index.
    SDValue FirstNonZero;
-  for (int i=0; i < 4; ++i) {
+  unsigned FirstNonZeroIdx;
+  for (unsigned i=0; i < 4; ++i) {
      if (Zeroable[i])
        continue;
      SDValue Elt = Op->getOperand(i);
@@ -5768,8 +5779,10 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
      MVT VT = Elt.getOperand(0).getSimpleValueType();
      if (!VT.is128BitVector())
        return SDValue();
-    if (!FirstNonZero.getNode())
+    if (!FirstNonZero.getNode()) {
        FirstNonZero = Elt;
+      FirstNonZeroIdx = i;
+    }
    }
  
    assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
@@ -5808,14 +5821,14 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
      return SDValue();
  
    SDValue V2 = Elt.getOperand(0);
-  if (Elt == FirstNonZero)
+  if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
      V1 = SDValue();
  
    bool CanFold = true;
    for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
      if (Zeroable[i])
        continue;
-    
+
      SDValue Current = Op->getOperand(i);
      SDValue SrcVector = Current->getOperand(0);
      if (!V1.getNode())
@@ -6340,8 +6353,7 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
        AllContants = false;
        NonConstIdx = idx;
        NumNonConsts++;
-    }
-    else {
+    } else {
        NumConsts++;
        if (cast<ConstantSDNode>(In)->getZExtValue())
        Immediate |= (1ULL << idx);
@@ -6364,7 +6376,7 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
                                           MVT::getIntegerVT(VT.getSizeInBits()));
        DstVec = DAG.getNode(ISD::BITCAST, dl, VT, VecAsImm);
      }
-    else 
+    else
        DstVec = DAG.getUNDEF(VT);
      return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
                         Op.getOperand(NonConstIdx),
@@ -6387,7 +6399,7 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
  
  /// \brief Return true if \p N implements a horizontal binop and return the
  /// operands for the horizontal binop into V0 and V1.
-/// 
+///
  /// This is a helper function of PerformBUILD_VECTORCombine.
  /// This function checks that the build_vector \p N in input implements a
  /// horizontal operation. Parameter \p Opcode defines the kind of horizontal
@@ -6408,7 +6420,7 @@ static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
    assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
    assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
           "Invalid Vector in input!");
-  
+
    bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
    bool CanFold = true;
    unsigned ExpectedVExtractIdx = BaseIdx;
@@ -6477,13 +6489,13 @@ static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
  }
  
  /// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
-/// a concat_vector. 
+/// a concat_vector.
  ///
  /// This is a helper function of PerformBUILD_VECTORCombine.
  /// This function expects two 256-bit vectors called V0 and V1.
  /// At first, each vector is split into two separate 128-bit vectors.
  /// Then, the resulting 128-bit vectors are used to implement two
-/// horizontal binary operations. 
+/// horizontal binary operations.
  ///
  /// The kind of horizontal binary operation is defined by \p X86Opcode.
  ///
@@ -6677,18 +6689,18 @@ static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG,
      // Try to match an SSE3 float HADD/HSUB.
      if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
        return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
-    
+
      if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
        return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
    } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget->hasSSSE3()) {
      // Try to match an SSSE3 integer HADD/HSUB.
      if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
        return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
-    
+
      if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
        return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
    }
-  
+
    if (!Subtarget->hasAVX())
      return SDValue();
  
@@ -6739,7 +6751,7 @@ static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG,
        // Do this only if the target has AVX2.
        if (Subtarget->hasAVX2())
          return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
- 
+
        // Do not try to expand this build_vector into a pair of horizontal
        // add/sub if we can emit a pair of scalar add/sub.
        if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
@@ -7493,9 +7505,9 @@ static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(SDLoc DL, MVT VT,
  /// does not check for the profitability of lowering either as PALIGNR or
  /// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
  /// This matches shuffle vectors that look like:
-/// 
+///
  ///   v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
-/// 
+///
  /// Essentially it concatenates V1 and V2, shifts right by some number of
  /// elements, and takes the low elements as the result. Note that while this is
  /// specified as a *right shift* because x86 is little-endian, it is a *left
@@ -8200,6 +8212,11 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                      getV4X86ShuffleImm8ForMask(WidenedMask, DAG)));
    }
  
+  // Try to use byte shift instructions.
+  if (SDValue Shift = lowerVectorShuffleAsByteShift(
+          DL, MVT::v2i64, V1, V2, Mask, DAG))
+    return Shift;
+
    // If we have a single input from V2 insert that into V1 if we can do so
    // cheaply.
    if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
@@ -8226,11 +8243,6 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                                    Subtarget, DAG))
        return Blend;
  
-  // Try to use byte shift instructions.
-  if (SDValue Shift = lowerVectorShuffleAsByteShift(
-          DL, MVT::v2i64, V1, V2, Mask, DAG))
-    return Shift;
-
    // Try to use byte rotation instructions.
    // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
    if (Subtarget->hasSSSE3())
@@ -8491,6 +8503,11 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                         getV4X86ShuffleImm8ForMask(Mask, DAG));
    }
  
+  // Try to use byte shift instructions.
+  if (SDValue Shift = lowerVectorShuffleAsByteShift(
+          DL, MVT::v4i32, V1, V2, Mask, DAG))
+    return Shift;
+
    // There are special ways we can lower some single-element blends.
    if (NumV2Elements == 1)
      if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4i32, DL, V1, V2,
@@ -8508,11 +8525,6 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                                    Subtarget, DAG))
        return Blend;
  
-  // Try to use byte shift instructions.
-  if (SDValue Shift = lowerVectorShuffleAsByteShift(
-          DL, MVT::v4i32, V1, V2, Mask, DAG))
-    return Shift;
-
    // Try to use byte rotation instructions.
    // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
    if (Subtarget->hasSSSE3())
@@ -8576,17 +8588,17 @@ static SDValue lowerV8I16SingleInputVectorShuffle(
                                                          Mask, Subtarget, DAG))
      return Broadcast;
  
+  // Try to use byte shift instructions.
+  if (SDValue Shift = lowerVectorShuffleAsByteShift(
+          DL, MVT::v8i16, V, V, Mask, DAG))
+    return Shift;
+
    // Use dedicated unpack instructions for masks that match their pattern.
    if (isShuffleEquivalent(Mask, 0, 0, 1, 1, 2, 2, 3, 3))
      return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V, V);
    if (isShuffleEquivalent(Mask, 4, 4, 5, 5, 6, 6, 7, 7))
      return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V, V);
  
-  // Try to use byte shift instructions.
-  if (SDValue Shift = lowerVectorShuffleAsByteShift(
-          DL, MVT::v8i16, V, V, Mask, DAG))
-    return Shift;
-
    // Try to use byte rotation instructions.
    if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
            DL, MVT::v8i16, V, V, Mask, Subtarget, DAG))
@@ -9193,6 +9205,11 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
    assert(NumV1Inputs > 0 && "All single-input shuffles should be canonicalized "
                              "to be V1-input shuffles.");
  
+  // Try to use byte shift instructions.
+  if (SDValue Shift = lowerVectorShuffleAsByteShift(
+          DL, MVT::v8i16, V1, V2, Mask, DAG))
+    return Shift;
+
    // There are special ways we can lower some single-element blends.
    if (NumV2Inputs == 1)
      if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v8i16, DL, V1, V2,
@@ -9210,11 +9227,6 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                                    Subtarget, DAG))
        return Blend;
  
-  // Try to use byte shift instructions.
-  if (SDValue Shift = lowerVectorShuffleAsByteShift(
-          DL, MVT::v8i16, V1, V2, Mask, DAG))
-    return Shift;
-
    // Try to use byte rotation instructions.
    if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
            DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
@@ -10833,7 +10845,8 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
    // When the number of V1 and V2 elements are the same, try to minimize the
    // number of uses of V2 in the low half of the vector. When that is tied,
    // ensure that the sum of indices for V1 is equal to or lower than the sum of
-  // indices for V2.
+  // indices for V2. When those are equal, try to ensure that the number of odd
+  // indices for V1 is lower than the number of odd indices for V2.
    if (NumV1Elements == NumV2Elements) {
      int LowV1Elements = 0, LowV2Elements = 0;
      for (int M : SVOp->getMask().slice(0, NumElements / 2))
@@ -10850,8 +10863,18 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
            SumV2Indices += i;
          else if (SVOp->getMask()[i] >= 0)
            SumV1Indices += i;
-      if (SumV2Indices < SumV1Indices)
+      if (SumV2Indices < SumV1Indices) {
          return DAG.getCommutedVectorShuffle(*SVOp);
+      } else if (SumV2Indices == SumV1Indices) {
+        int NumV1OddIndices = 0, NumV2OddIndices = 0;
+        for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i)
+          if (SVOp->getMask()[i] >= NumElements)
+            NumV2OddIndices += i % 2;
+          else if (SVOp->getMask()[i] >= 0)
+            NumV1OddIndices += i % 2;
+        if (NumV2OddIndices < NumV1OddIndices)
+          return DAG.getCommutedVectorShuffle(*SVOp);
+      }
      }
    }
  
@@ -12774,7 +12797,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
  
  /// Insert one bit to mask vector, like v16i1 or v8i1.
  /// AVX-512 feature.
-SDValue 
+SDValue
  X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
    SDLoc dl(Op);
    SDValue Vec = Op.getOperand(0);
@@ -12787,7 +12810,7 @@ X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
      // insert element and then truncate the result.
      MVT ExtVecVT = (VecVT == MVT::v8i1 ?  MVT::v8i64 : MVT::v16i32);
      MVT ExtEltVT = (VecVT == MVT::v8i1 ?  MVT::i64 : MVT::i32);
-    SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT, 
+    SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
        DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec),
        DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx);
      return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
@@ -13535,7 +13558,7 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
      }
      return SDValue();
    }
-  
+
    assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
           "Unknown SINT_TO_FP to lower!");
  
@@ -14182,7 +14205,7 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
        In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
        InVT = ExtVT;
      }
-    
+
      SDValue Cst = DAG.getTargetConstant(1, InVT.getVectorElementType());
      const Constant *C = (dyn_cast<ConstantSDNode>(Cst))->getConstantIntValue();
      SDValue CP = DAG.getConstantPool(C, getPointerTy());
@@ -14376,7 +14399,7 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
      EltVT = VT.getVectorElementType();
      NumElts = VT.getVectorNumElements();
    }
-  
+
    unsigned EltBits = EltVT.getSizeInBits();
    LLVMContext *Context = DAG.getContext();
    // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
@@ -14403,7 +14426,7 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
      return DAG.getNode(ISD::BITCAST, dl, VT,
                         DAG.getNode(BitOp, dl, VecVT, Operand, MaskCasted));
    }
-  
+
    // If not vector, then scalar.
    unsigned BitOp = IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
    SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
@@ -14453,17 +14476,6 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
                                false, false, false, 16);
    SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1);
  
-  // Shift sign bit right or left if the two operands have different types.
-  if (SrcVT.bitsGT(VT)) {
-    // Op0 is MVT::f32, Op1 is MVT::f64.
-    SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit);
-    SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit,
-                          DAG.getConstant(32, MVT::i32));
-    SignBit = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, SignBit);
-    SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit,
-                          DAG.getIntPtrConstant(0));
-  }
-
    // Clear first operand sign bit.
    CV.clear();
    if (VT == MVT::f64) {
@@ -14860,12 +14872,12 @@ SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
       if (Op0.getValueType() == MVT::i1)
         llvm_unreachable("Unexpected comparison operation for MVT::i1 operands");
    }
- 
+
    if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
         Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
-    // Do the comparison at i32 if it's smaller, besides the Atom case. 
-    // This avoids subregister aliasing issues. Keep the smaller reference 
-    // if we're optimizing for size, however, as that'll allow better folding 
+    // Do the comparison at i32 if it's smaller, besides the Atom case.
+    // This avoids subregister aliasing issues. Keep the smaller reference
+    // if we're optimizing for size, however, as that'll allow better folding
      // of memory operations.
      if (Op0.getValueType() != MVT::i32 && Op0.getValueType() != MVT::i64 &&
          !DAG.getMachineFunction().getFunction()->getAttributes().hasAttribute(
@@ -14923,7 +14935,7 @@ SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op,
      return SDValue();
  
    EVT VT = Op.getValueType();
-  
+
    // SSE1 has rsqrtss and rsqrtps.
    // TODO: Add support for AVX512 (v16f32).
    // It is likely not profitable to do this for f64 because a double-precision
@@ -14951,9 +14963,9 @@ SDValue X86TargetLowering::getRecipEstimate(SDValue Op,
    // significant digits in the divisor.
    if (!Subtarget->useReciprocalEst())
      return SDValue();
-  
+
    EVT VT = Op.getValueType();
-  
+
    // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
    // TODO: Add support for AVX512 (v16f32).
    // It is likely not profitable to do this for f64 because a double-precision
@@ -15772,11 +15784,11 @@ static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, const X86Subtarget *Subtarget
  
         ((Subtarget->hasDQI() && Subtarget->hasVLX() &&
          VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() >= 32)) ||
-    
+
         ((Subtarget->hasDQI() && VT.is512BitVector() &&
          VTElt.getSizeInBits() >= 32))))
      return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
-    
+
    unsigned int NumElts = VT.getVectorNumElements();
  
    if (NumElts != 8 && NumElts != 16)
@@ -16375,7 +16387,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
                                             SelectionDAG &DAG) const {
    MachineFunction &MF = DAG.getMachineFunction();
    bool SplitStack = MF.shouldSplitStack();
-  bool Lower = (Subtarget->isOSWindows() && !Subtarget->isTargetMacho()) ||
+  bool Lower = (Subtarget->isOSWindows() && !Subtarget->isTargetMachO()) ||
                 SplitStack;
    SDLoc dl(Op);
  
@@ -16771,6 +16783,23 @@ static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
      return DAG.getNode(ISD::VSELECT, dl, VT, VMask, Op, PreservedSrc);
  }
  
+static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
+                                    SDValue PreservedSrc,
+                                    const X86Subtarget *Subtarget,
+                                    SelectionDAG &DAG) {
+    if (isAllOnes(Mask)) // All-ones mask: Op is returned as-is, no select.
+      return Op;
+
+    EVT VT = Op.getValueType();
+    SDLoc dl(Op);
+    // The select condition is built as MVT::i1, so truncate the incoming mask.
+    SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask);
+
+    if (PreservedSrc.getOpcode() == ISD::UNDEF) // Undef pass-through value.
+      PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
+    return DAG.getNode(X86ISD::SELECT, dl, VT, IMask, Op, PreservedSrc);
+}
+
  static unsigned getOpcodeForFMAIntrinsic(unsigned IntNo) {
      switch (IntNo) {
      default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
@@ -16844,7 +16873,21 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
                                                RoundingMode),
                                    Mask, Src0, Subtarget, DAG);
      }
-                                              
+    case INTR_TYPE_SCALAR_MASK_RM: {
+      SDValue Src1 = Op.getOperand(1);
+      SDValue Src2 = Op.getOperand(2);
+      SDValue Src0 = Op.getOperand(3);
+      SDValue Mask = Op.getOperand(4);
+      SDValue RoundingMode = Op.getOperand(5);
+      return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
+                                              RoundingMode),
+                                  Mask, Src0, Subtarget, DAG);
+    }
+    case INTR_TYPE_2OP_MASK: {
+      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Op.getOperand(1),
+                                              Op.getOperand(2)),
+                                  Op.getOperand(4), Op.getOperand(3), Subtarget, DAG);
+    }
      case CMP_MASK:
      case CMP_MASK_CC: {
        // Comparison intrinsics with masks.
@@ -16896,7 +16939,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget
      case VSHIFT_MASK:
        return getVectorMaskingNode(getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
                                                        Op.getOperand(1), Op.getOperand(2), DAG),
-                                  Op.getOperand(4), Op.getOperand(3), Subtarget, DAG);;
+                                  Op.getOperand(4), Op.getOperand(3), Subtarget, DAG);
      default:
        break;
      }
@@ -17448,7 +17491,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
    switch(IntrData->Type) {
    default:
      llvm_unreachable("Unknown Intrinsic Type");
-    break;    
+    break;
    case RDSEED:
    case RDRAND: {
      // Emit the node with the right value type.
@@ -18392,22 +18435,12 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
      SDValue BaseShAmt;
      EVT EltVT = VT.getVectorElementType();
  
-    if (Amt.getOpcode() == ISD::BUILD_VECTOR) {
-      unsigned NumElts = VT.getVectorNumElements();
-      unsigned i, j;
-      for (i = 0; i != NumElts; ++i) {
-        if (Amt.getOperand(i).getOpcode() == ISD::UNDEF)
-          continue;
-        break;
-      }
-      for (j = i; j != NumElts; ++j) {
-        SDValue Arg = Amt.getOperand(j);
-        if (Arg.getOpcode() == ISD::UNDEF) continue;
-        if (Arg != Amt.getOperand(i))
-          break;
-      }
-      if (i != NumElts && j == NumElts)
-        BaseShAmt = Amt.getOperand(i);
+    if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) {
+      // Check if this build_vector node is doing a splat.
+      // If so, then set BaseShAmt equal to the splat value.
+      BaseShAmt = BV->getSplatValue();
+      if (BaseShAmt && BaseShAmt.getOpcode() == ISD::UNDEF)
+        BaseShAmt = SDValue();
      } else {
        if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
          Amt = Amt.getOperand(0);
@@ -18558,7 +18591,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
    // If possible, lower this packed shift into a vector multiply instead of
    // expanding it into a sequence of scalar shifts.
    // Do this only if the vector shift count is a constant build_vector.
-  if (Op.getOpcode() == ISD::SHL && 
+  if (Op.getOpcode() == ISD::SHL &&
        (VT == MVT::v8i16 || VT == MVT::v4i32 ||
         (Subtarget->hasInt256() && VT == MVT::v16i16)) &&
        ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
@@ -18650,15 +18683,15 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
            CanBeSimplified = Amt2 == Amt->getOperand(j);
        }
      }
-    
+
      if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
          isa<ConstantSDNode>(Amt2)) {
        // Replace this node with two shifts followed by a MOVSS/MOVSD.
        EVT CastVT = MVT::v4i32;
-      SDValue Splat1 = 
+      SDValue Splat1 =
          DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), VT);
        SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
-      SDValue Splat2 = 
+      SDValue Splat2 =
          DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), VT);
        SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
        if (TargetOpcode == X86ISD::MOVSD)
@@ -19952,6 +19985,7 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
    return (SVT.getVectorNumElements() == 2 ||
            ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
            isMOVLMask(M, SVT) ||
+          isCommutedMOVLMask(M, SVT) ||
            isMOVHLPSMask(M, SVT) ||
            isSHUFPMask(M, SVT) ||
            isSHUFPMask(M, SVT, /* Commuted */ true) ||
@@ -20727,7 +20761,7 @@ X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI,
    const TargetInstrInfo *TII = BB->getParent()->getSubtarget().getInstrInfo();
    DebugLoc DL = MI->getDebugLoc();
  
-  assert(!Subtarget->isTargetMacho());
+  assert(!Subtarget->isTargetMachO());
  
    // The lowering is pretty easy: we're just emitting the call to _alloca.  The
    // non-trivial part is impdef of ESP.
@@ -20882,6 +20916,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
    //  v = phi(main, restore)
    //
    // restoreMBB:
+  //  if a base pointer is being used, load it from the frame
    //  v_restore = 1
  
    MachineBasicBlock *thisMBB = MBB;
@@ -20965,6 +21000,18 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
      .addReg(restoreDstReg).addMBB(restoreMBB);
  
    // restoreMBB:
+  if (RegInfo->hasBasePointer(*MF)) {
+    const X86Subtarget &STI = MF->getTarget().getSubtarget<X86Subtarget>();
+    const bool Uses64BitFramePtr = STI.isTarget64BitLP64() || STI.isTargetNaCl64();
+    X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
+    X86FI->setRestoreBasePointer(MF);
+    unsigned FramePtr = RegInfo->getFrameRegister(*MF);
+    unsigned BasePtr = RegInfo->getBaseRegister();
+    unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
+    addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
+                 FramePtr, true, X86FI->getRestoreBasePointerOffset())
+      .setMIFlag(MachineInstr::FrameSetup);
+  }
    BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
    BuildMI(restoreMBB, DL, TII->get(X86::JMP_4)).addMBB(sinkMBB);
    restoreMBB->addSuccessor(sinkMBB);
@@ -21038,7 +21085,7 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
  
  // Replace 213-type (isel default) FMA3 instructions with 231-type for
  // accumulator loops. Writing back to the accumulator allows the coalescer
-// to remove extra copies in the loop.   
+// to remove extra copies in the loop.
  MachineBasicBlock *
  X86TargetLowering::emitFMA3Instr(MachineInstr *MI,
                                   MachineBasicBlock *MBB) const {
@@ -21300,6 +21347,11 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
    case X86::EH_SjLj_LongJmp64:
      return emitEHSjLjLongJmp(MI, BB);
  
+  case TargetOpcode::STATEPOINT:
+    // As an implementation detail, STATEPOINT shares the STACKMAP format at
+    // this point in the process.  We diverge later.
+    return emitPatchPoint(MI, BB);
+
    case TargetOpcode::STACKMAP:
    case TargetOpcode::PATCHPOINT:
      return emitPatchPoint(MI, BB);
@@ -22319,7 +22371,7 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
      EVT SVT = BC0.getValueType();
      unsigned Opcode = BC0.getOpcode();
      unsigned NumElts = VT.getVectorNumElements();
-    
+
      if (BC0.hasOneUse() && SVT.isVector() &&
          SVT.getVectorNumElements() * 2 == NumElts &&
          TLI.isOperationLegal(Opcode, VT)) {
@@ -22494,7 +22546,9 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
  
  /// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
  /// generation and convert it from being a bunch of shuffles and extracts
-/// to a simple store and scalar loads to extract the elements.
+/// into a somewhat faster sequence. For i686, the best sequence is apparently
+/// storing the value and loading scalars back, while for x64 we should
+/// use 64-bit extracts and shifts.
  static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
                                           TargetLowering::DAGCombinerInfo &DCI) {
    SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI);
@@ -22553,36 +22607,61 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
      return SDValue();
  
    // Ok, we've now decided to do the transformation.
+  // If 64-bit shifts are legal, use the extract-shift sequence,
+  // otherwise bounce the vector off the cache.
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  SDValue Vals[4];
    SDLoc dl(InputVector);
+  
+  if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) {
+    SDValue Cst = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, InputVector);
+    EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy();
+    SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
+      DAG.getConstant(0, VecIdxTy));
+    SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
+      DAG.getConstant(1, VecIdxTy));
+
+    SDValue ShAmt = DAG.getConstant(32, 
+      DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64));
+    Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf);
+    Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
+      DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt));
+    Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf);
+    Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
+      DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt));
+  } else {
+    // Store the value to a temporary stack slot.
+    SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
+    SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
+      MachinePointerInfo(), false, false, 0);
  
-  // Store the value to a temporary stack slot.
-  SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
-  SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
-                            MachinePointerInfo(), false, false, 0);
+    EVT ElementType = InputVector.getValueType().getVectorElementType();
+    unsigned EltSize = ElementType.getSizeInBits() / 8;
  
-  // Replace each use (extract) with a load of the appropriate element.
-  for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
-       UE = Uses.end(); UI != UE; ++UI) {
-    SDNode *Extract = *UI;
+    // Replace each use (extract) with a load of the appropriate element.
+    for (unsigned i = 0; i < 4; ++i) {
+      uint64_t Offset = EltSize * i;
+      SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy());
  
-    // Compute the element's address.
-    SDValue Idx = Extract->getOperand(1);
-    unsigned EltSize =
-        InputVector.getValueType().getVectorElementType().getSizeInBits()/8;
-    uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue();
-    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-    SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy());
+      SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(),
+                                       StackPtr, OffsetVal);
  
-    SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(),
-                                     StackPtr, OffsetVal);
+      // Load the scalar.
+      Vals[i] = DAG.getLoad(ElementType, dl, Ch,
+                            ScalarAddr, MachinePointerInfo(),
+                            false, false, false, 0);
+
+    }
+  }
  
-    // Load the scalar.
-    SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch,
-                                     ScalarAddr, MachinePointerInfo(),
-                                     false, false, false, 0);
+  // Replace the extracts
+  for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
+    UE = Uses.end(); UI != UE; ++UI) {
+    SDNode *Extract = *UI;
  
-    // Replace the exact with the load.
-    DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), LoadScalar);
+    SDValue Idx = Extract->getOperand(1);
+    uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+    DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]);
    }
  
    // The replacement was made in place; don't return anything.
@@ -22665,7 +22744,7 @@ matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS,
  }
  
  static SDValue
-TransformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG,
+transformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG,
                                        const X86Subtarget *Subtarget) {
    SDLoc dl(N);
    SDValue Cond = N->getOperand(0);
@@ -22678,18 +22757,6 @@ TransformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG,
        Cond = CondSrc->getOperand(0);
    }
  
-  MVT VT = N->getSimpleValueType(0);
-  MVT EltVT = VT.getVectorElementType();
-  unsigned NumElems = VT.getVectorNumElements();
-  // There is no blend with immediate in AVX-512.
-  if (VT.is512BitVector())
-    return SDValue();
-
-  if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
-    return SDValue();
-  if (!Subtarget->hasInt256() && VT == MVT::v16i16)
-    return SDValue();
-
    if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
      return SDValue();
  
@@ -22703,6 +22770,8 @@ TransformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG,
    if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
      return SDValue();
  
+  MVT VT = N->getSimpleValueType(0);
+  unsigned NumElems = VT.getVectorNumElements();
    SmallVector<int, 8> ShuffleMask(NumElems, -1);
    for (unsigned i = 0; i < NumElems; ++i) {
      // Be sure we emit undef where we can.
@@ -22712,6 +22781,9 @@ TransformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG,
        ShuffleMask[i] = i + NumElems * ((MaskValue >> i) & 1);
    }
  
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (!TLI.isShuffleMaskLegal(ShuffleMask, VT))
+    return SDValue();
    return DAG.getVectorShuffle(VT, dl, LHS, RHS, &ShuffleMask[0]);
  }
  
@@ -23151,81 +23223,6 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
      }
    }
  
-  // Try to fold this VSELECT into a MOVSS/MOVSD
-  if (N->getOpcode() == ISD::VSELECT &&
-      Cond.getOpcode() == ISD::BUILD_VECTOR && !DCI.isBeforeLegalize()) {
-    if (VT == MVT::v4i32 || VT == MVT::v4f32 ||
-        (Subtarget->hasSSE2() && (VT == MVT::v2i64 || VT == MVT::v2f64))) {
-      bool CanFold = false;
-      unsigned NumElems = Cond.getNumOperands();
-      SDValue A = LHS;
-      SDValue B = RHS;
-      
-      if (isZero(Cond.getOperand(0))) {
-        CanFold = true;
-
-        // fold (vselect <0,-1,-1,-1>, A, B) -> (movss A, B)
-        // fold (vselect <0,-1> -> (movsd A, B)
-        for (unsigned i = 1, e = NumElems; i != e && CanFold; ++i)
-          CanFold = isAllOnes(Cond.getOperand(i));
-      } else if (isAllOnes(Cond.getOperand(0))) {
-        CanFold = true;
-        std::swap(A, B);
-
-        // fold (vselect <-1,0,0,0>, A, B) -> (movss B, A)
-        // fold (vselect <-1,0> -> (movsd B, A)
-        for (unsigned i = 1, e = NumElems; i != e && CanFold; ++i)
-          CanFold = isZero(Cond.getOperand(i));
-      }
-
-      if (CanFold) {
-        if (VT == MVT::v4i32 || VT == MVT::v4f32)
-          return getTargetShuffleNode(X86ISD::MOVSS, DL, VT, A, B, DAG);
-        return getTargetShuffleNode(X86ISD::MOVSD, DL, VT, A, B, DAG);
-      }
-
-      if (Subtarget->hasSSE2() && (VT == MVT::v4i32 || VT == MVT::v4f32)) {
-        // fold (v4i32: vselect <0,0,-1,-1>, A, B) ->
-        //      (v4i32 (bitcast (movsd (v2i64 (bitcast A)),
-        //                             (v2i64 (bitcast B)))))
-        //
-        // fold (v4f32: vselect <0,0,-1,-1>, A, B) ->
-        //      (v4f32 (bitcast (movsd (v2f64 (bitcast A)),
-        //                             (v2f64 (bitcast B)))))
-        //
-        // fold (v4i32: vselect <-1,-1,0,0>, A, B) ->
-        //      (v4i32 (bitcast (movsd (v2i64 (bitcast B)),
-        //                             (v2i64 (bitcast A)))))
-        //
-        // fold (v4f32: vselect <-1,-1,0,0>, A, B) ->
-        //      (v4f32 (bitcast (movsd (v2f64 (bitcast B)),
-        //                             (v2f64 (bitcast A)))))
-
-        CanFold = (isZero(Cond.getOperand(0)) &&
-                   isZero(Cond.getOperand(1)) &&
-                   isAllOnes(Cond.getOperand(2)) &&
-                   isAllOnes(Cond.getOperand(3)));
-
-        if (!CanFold && isAllOnes(Cond.getOperand(0)) &&
-            isAllOnes(Cond.getOperand(1)) &&
-            isZero(Cond.getOperand(2)) &&
-            isZero(Cond.getOperand(3))) {
-          CanFold = true;
-          std::swap(LHS, RHS);
-        }
-
-        if (CanFold) {
-          EVT NVT = (VT == MVT::v4i32) ? MVT::v2i64 : MVT::v2f64;
-          SDValue NewA = DAG.getNode(ISD::BITCAST, DL, NVT, LHS);
-          SDValue NewB = DAG.getNode(ISD::BITCAST, DL, NVT, RHS);
-          SDValue Select = getTargetShuffleNode(X86ISD::MOVSD, DL, NVT, NewA,
-                                                NewB, DAG);
-          return DAG.getNode(ISD::BITCAST, DL, VT, Select);
-        }
-      }
-    }
-  }
-
    // If we know that this node is legal then we know that it is going to be
    // matched by one of the SSE/AVX BLEND instructions. These instructions only
    // depend on the highest bit in each word. Try to use SimplifyDemandedBits
@@ -23310,7 +23307,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
    if ((N->getOpcode() == ISD::VSELECT ||
         N->getOpcode() == X86ISD::SHRUNKBLEND) &&
        !DCI.isBeforeLegalize()) {
-    SDValue Shuffle = TransformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget);
+    SDValue Shuffle = transformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget);
      if (Shuffle.getNode())
        return Shuffle;
    }
@@ -23667,7 +23664,7 @@ static SDValue PerformINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG,
      // fold (blend A, B, allOnes) -> B
      if (ISD::isBuildVectorAllOnes(Mask.getNode()))
        return Op1;
-    
+
      // Simplify the case where the mask is a constant i32 value.
      if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Mask)) {
        if (C->isNullValue())
@@ -24376,11 +24373,12 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
    SDLoc dl(Ld);
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  
-  // On Sandybridge unaligned 256bit loads are inefficient.
+  // For chips with slow 32-byte unaligned loads, break the 32-byte operation
+  // into two 16-byte operations.
    ISD::LoadExtType Ext = Ld->getExtensionType();
    unsigned Alignment = Ld->getAlignment();
    bool IsAligned = Alignment == 0 || Alignment >= MemVT.getSizeInBits()/8;
-  if (RegVT.is256BitVector() && !Subtarget->hasInt256() &&
+  if (RegVT.is256BitVector() && Subtarget->isUnalignedMem32Slow() &&
        !DCI.isBeforeLegalizeOps() && !IsAligned && Ext == ISD::NON_EXTLOAD) {
      unsigned NumElems = RegVT.getVectorNumElements();
      if (NumElems < 2)
@@ -24423,13 +24421,11 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
    SDValue StoredVal = St->getOperand(1);
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  
-  // If we are saving a concatenation of two XMM registers, perform two stores.
-  // On Sandy Bridge, 256-bit memory operations are executed by two
-  // 128-bit ports. However, on Haswell it is better to issue a single 256-bit
-  // memory  operation.
+  // If we are saving a concatenation of two XMM registers and 32-byte stores
+  // are slow, such as on Sandy Bridge, perform two 16-byte stores.
    unsigned Alignment = St->getAlignment();
    bool IsAligned = Alignment == 0 || Alignment >= VT.getSizeInBits()/8;
-  if (VT.is256BitVector() && !Subtarget->hasInt256() &&
+  if (VT.is256BitVector() && Subtarget->isUnalignedMem32Slow() &&
        StVT == VT && !IsAligned) {
      unsigned NumElems = VT.getVectorNumElements();
      if (NumElems < 2)
@@ -26336,7 +26332,7 @@ int X86TargetLowering::getScalingFactorCost(const AddrMode &AM,
    // "load" ports instead of the dedicated "store" port.
    // E.g., on Haswell:
    // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
-  // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.   
+  // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
    if (isLegalAddressingMode(AM, Ty))
      // Scale represents reg2 * scale, thus account for 1
      // as soon as we use a second register.