Fix a bug in the AVX 256-bit shuffle code in cases where the splat element is on...

[oota-llvm.git] / lib / Target / X86 / X86ISelLowering.cpp
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index 2d9dbd29c0892b579310dbd35ecca7d88206a5dd..c643cefb6c0b691bd811da71b9a9ad43c6a35877 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -44,7 +44,6 @@
  #include "llvm/ADT/Statistic.h"
  #include "llvm/ADT/StringExtras.h"
  #include "llvm/ADT/VariadicFunction.h"
-#include "llvm/ADT/VectorExtras.h"
  #include "llvm/Support/CallSite.h"
  #include "llvm/Support/Debug.h"
  #include "llvm/Support/Dwarf.h"
@@ -169,8 +168,8 @@ static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
  X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
    : TargetLowering(TM, createTLOF(TM)) {
    Subtarget = &TM.getSubtarget<X86Subtarget>();
-  X86ScalarSSEf64 = Subtarget->hasXMMInt();
-  X86ScalarSSEf32 = Subtarget->hasXMM();
+  X86ScalarSSEf64 = Subtarget->hasSSE2();
+  X86ScalarSSEf32 = Subtarget->hasSSE1();
    X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;
  
    RegInfo = TM.getRegisterInfo();
@@ -256,7 +255,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
  
    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Promote);
-    setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Expand);
+    setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
    } else if (!TM.Options.UseSoftFloat) {
      // We have an algorithm for SSE2->double, and we turn this into a
      // 64-bit FILD followed by conditional FADD for other targets.
@@ -481,7 +480,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
      setOperationAction(ISD::SRL_PARTS     , MVT::i64  , Custom);
    }
  
-  if (Subtarget->hasXMM())
+  if (Subtarget->hasSSE1())
      setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);
  
    setOperationAction(ISD::MEMBARRIER    , MVT::Other, Custom);
@@ -815,7 +814,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
    setOperationAction(ISD::BITCAST,            MVT::v2i32, Expand);
    setOperationAction(ISD::BITCAST,            MVT::v1i64, Expand);
  
-  if (!TM.Options.UseSoftFloat && Subtarget->hasXMM()) {
+  if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) {
      addRegisterClass(MVT::v4f32, X86::VR128RegisterClass);
  
      setOperationAction(ISD::FADD,               MVT::v4f32, Legal);
@@ -832,7 +831,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
      setOperationAction(ISD::SETCC,              MVT::v4f32, Custom);
    }
  
-  if (!TM.Options.UseSoftFloat && Subtarget->hasXMMInt()) {
+  if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) {
      addRegisterClass(MVT::v2f64, X86::VR128RegisterClass);
  
      // FIXME: Unfortunately -soft-float and -no-implicit-float means XMM
@@ -938,7 +937,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
      setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
    }
  
-  if (Subtarget->hasSSE41orAVX()) {
+  if (Subtarget->hasSSE41()) {
      setOperationAction(ISD::FFLOOR,             MVT::f32,   Legal);
      setOperationAction(ISD::FCEIL,              MVT::f32,   Legal);
      setOperationAction(ISD::FTRUNC,             MVT::f32,   Legal);
@@ -981,7 +980,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
      }
    }
  
-  if (Subtarget->hasXMMInt()) {
+  if (Subtarget->hasSSE2()) {
      setOperationAction(ISD::SRL,               MVT::v8i16, Custom);
      setOperationAction(ISD::SRL,               MVT::v16i8, Custom);
  
@@ -1010,7 +1009,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
      }
    }
  
-  if (Subtarget->hasSSE42orAVX())
+  if (Subtarget->hasSSE42())
      setOperationAction(ISD::SETCC,             MVT::v2i64, Custom);
  
    if (!TM.Options.UseSoftFloat && Subtarget->hasAVX()) {
@@ -1294,7 +1293,7 @@ unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
    }
  
    unsigned Align = 4;
-  if (Subtarget->hasXMM())
+  if (Subtarget->hasSSE1())
      getMaxByValAlign(Ty, Align);
    return Align;
  }
@@ -1331,14 +1330,14 @@ X86TargetLowering::getOptimalMemOpType(uint64_t Size,
        if (Subtarget->hasAVX() &&
            Subtarget->getStackAlignment() >= 32)
          return MVT::v8f32;
-      if (Subtarget->hasXMMInt())
+      if (Subtarget->hasSSE2())
          return MVT::v4i32;
-      if (Subtarget->hasXMM())
+      if (Subtarget->hasSSE1())
          return MVT::v4f32;
      } else if (!MemcpyStrSrc && Size >= 8 &&
                 !Subtarget->is64Bit() &&
                 Subtarget->getStackAlignment() >= 8 &&
-               Subtarget->hasXMMInt()) {
+               Subtarget->hasSSE2()) {
        // Do not use f64 to lower memcpy if source is string constant. It's
        // better to use i32 to avoid the loads.
        return MVT::f64;
@@ -1503,14 +1502,14 @@ X86TargetLowering::LowerReturn(SDValue Chain,
      // or SSE or MMX vectors.
      if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
           VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
-          (Subtarget->is64Bit() && !Subtarget->hasXMM())) {
+          (Subtarget->is64Bit() && !Subtarget->hasSSE1())) {
        report_fatal_error("SSE register return with SSE disabled");
      }
      // Likewise we can't return F64 values with SSE1 only.  gcc does so, but
      // llvm-gcc has never done it right and no one has noticed, so this
      // should be OK for now.
      if (ValVT == MVT::f64 &&
-        (Subtarget->is64Bit() && !Subtarget->hasXMMInt()))
+        (Subtarget->is64Bit() && !Subtarget->hasSSE2()))
        report_fatal_error("SSE2 register return with SSE2 disabled");
  
      // Returns in ST0/ST1 are handled specially: these are pushed as operands to
@@ -1536,7 +1535,7 @@ X86TargetLowering::LowerReturn(SDValue Chain,
                                    ValToCopy);
            // If we don't have SSE2 available, convert to v4f32 so the generated
            // register is legal.
-          if (!Subtarget->hasXMMInt())
+          if (!Subtarget->hasSSE2())
              ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy);
          }
        }
@@ -1636,7 +1635,7 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
  
      // If this is x86-64, and we disabled SSE, we can't return FP values
      if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
-        ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasXMM())) {
+        ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
        report_fatal_error("SSE register return with SSE disabled");
      }
  
@@ -1950,13 +1949,13 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
                                                         TotalNumIntRegs);
  
        bool NoImplicitFloatOps = Fn->hasFnAttr(Attribute::NoImplicitFloat);
-      assert(!(NumXMMRegs && !Subtarget->hasXMM()) &&
+      assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
               "SSE register cannot be used when SSE is disabled!");
        assert(!(NumXMMRegs && MF.getTarget().Options.UseSoftFloat &&
                 NoImplicitFloatOps) &&
               "SSE register cannot be used when SSE is disabled!");
        if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps ||
-          !Subtarget->hasXMM())
+          !Subtarget->hasSSE1())
          // Kernel mode asks for SSE to be disabled, so don't push them
          // on the stack.
          TotalNumXMMRegs = 0;
@@ -2319,7 +2318,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
        X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
      };
      unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
-    assert((Subtarget->hasXMM() || !NumXMMRegs)
+    assert((Subtarget->hasSSE1() || !NumXMMRegs)
             && "SSE registers cannot be used when SSE is disabled");
  
      Chain = DAG.getCopyToReg(Chain, dl, X86::AL,
@@ -3194,12 +3193,11 @@ static bool isPSHUFHWMask(const SmallVectorImpl<int> &Mask, EVT VT) {
      return false;
  
    // Lower quadword copied in order or undef.
-  for (int i = 0; i != 4; ++i)
-    if (Mask[i] >= 0 && Mask[i] != i)
-      return false;
+  if (!isSequentialOrUndefInRange(Mask, 0, 4, 0))
+    return false;
  
    // Upper quadword shuffled.
-  for (int i = 4; i != 8; ++i)
+  for (unsigned i = 4; i != 8; ++i)
      if (Mask[i] >= 0 && (Mask[i] < 4 || Mask[i] > 7))
        return false;
  
@@ -3219,12 +3217,11 @@ static bool isPSHUFLWMask(const SmallVectorImpl<int> &Mask, EVT VT) {
      return false;
  
    // Upper quadword copied in order.
-  for (int i = 4; i != 8; ++i)
-    if (Mask[i] >= 0 && Mask[i] != i)
-      return false;
+  if (!isSequentialOrUndefInRange(Mask, 4, 4, 4))
+    return false;
  
    // Lower quadword shuffled.
-  for (int i = 0; i != 4; ++i)
+  for (unsigned i = 0; i != 4; ++i)
      if (Mask[i] >= 4)
        return false;
  
@@ -3240,13 +3237,13 @@ bool X86::isPSHUFLWMask(ShuffleVectorSDNode *N) {
  /// isPALIGNRMask - Return true if the node specifies a shuffle of elements that
  /// is suitable for input to PALIGNR.
  static bool isPALIGNRMask(const SmallVectorImpl<int> &Mask, EVT VT,
-                          bool hasSSSE3OrAVX) {
+                          bool hasSSSE3) {
    int i, e = VT.getVectorNumElements();
    if (VT.getSizeInBits() != 128)
      return false;
  
    // Do not handle v2i64 / v2f64 shuffles with palignr.
-  if (e < 4 || !hasSSSE3OrAVX)
+  if (e < 4 || !hasSSSE3)
      return false;
  
    for (i = 0; i != e; ++i)
@@ -3321,7 +3318,7 @@ static bool isVSHUFPYMask(const SmallVectorImpl<int> &Mask, EVT VT,
          // VPERMILPS works with masks.
          if (NumElems == 4 || l == 0 || Mask[i+QuarterStart] < 0)
            continue;
-        if (!isUndefOrEqual(Idx, Mask[i+QuarterStart]+HalfSize))
+        if (!isUndefOrEqual(Idx, Mask[i+QuarterStart]+LaneStart))
            return false;
        }
      }
@@ -3332,18 +3329,17 @@ static bool isVSHUFPYMask(const SmallVectorImpl<int> &Mask, EVT VT,
  
  /// getShuffleVSHUFPYImmediate - Return the appropriate immediate to shuffle
  /// the specified VECTOR_MASK mask with VSHUFPSY/VSHUFPDY instructions.
-static unsigned getShuffleVSHUFPYImmediate(SDNode *N) {
-  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+static unsigned getShuffleVSHUFPYImmediate(ShuffleVectorSDNode *SVOp) {
    EVT VT = SVOp->getValueType(0);
-  int NumElems = VT.getVectorNumElements();
+  unsigned NumElems = VT.getVectorNumElements();
  
    assert(VT.getSizeInBits() == 256 && "Only supports 256-bit types");
    assert((NumElems == 4 || NumElems == 8) && "Only supports v4 and v8 types");
  
-  int HalfSize = NumElems/2;
+  unsigned HalfSize = NumElems/2;
    unsigned Mul = (NumElems == 8) ? 2 : 1;
    unsigned Mask = 0;
-  for (int i = 0; i != NumElems; ++i) {
+  for (unsigned i = 0; i != NumElems; ++i) {
      int Elt = SVOp->getMaskElt(i);
      if (Elt < 0)
        continue;
@@ -3672,12 +3668,12 @@ static bool isMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT) {
    if (VT.getSizeInBits() == 256)
      return false;
  
-  int NumElts = VT.getVectorNumElements();
+  unsigned NumElts = VT.getVectorNumElements();
  
    if (!isUndefOrEqual(Mask[0], NumElts))
      return false;
  
-  for (int i = 1; i < NumElts; ++i)
+  for (unsigned i = 1; i != NumElts; ++i)
      if (!isUndefOrEqual(Mask[i], i))
        return false;
  
@@ -3704,11 +3700,11 @@ static bool isVPERM2X128Mask(const SmallVectorImpl<int> &Mask, EVT VT,
    // The shuffle result is divided into half A and half B. In total the two
    // sources have 4 halves, namely: C, D, E, F. The final values of A and
    // B must come from C, D, E or F.
-  int HalfSize = VT.getVectorNumElements()/2;
+  unsigned HalfSize = VT.getVectorNumElements()/2;
    bool MatchA = false, MatchB = false;
  
    // Check if A comes from one of C, D, E, F.
-  for (int Half = 0; Half < 4; ++Half) {
+  for (unsigned Half = 0; Half != 4; ++Half) {
      if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) {
        MatchA = true;
        break;
@@ -3716,7 +3712,7 @@ static bool isVPERM2X128Mask(const SmallVectorImpl<int> &Mask, EVT VT,
    }
  
    // Check if B comes from one of C, D, E, F.
-  for (int Half = 0; Half < 4; ++Half) {
+  for (unsigned Half = 0; Half != 4; ++Half) {
      if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) {
        MatchB = true;
        break;
@@ -3731,16 +3727,16 @@ static bool isVPERM2X128Mask(const SmallVectorImpl<int> &Mask, EVT VT,
  static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
    EVT VT = SVOp->getValueType(0);
  
-  int HalfSize = VT.getVectorNumElements()/2;
+  unsigned HalfSize = VT.getVectorNumElements()/2;
  
-  int FstHalf = 0, SndHalf = 0;
-  for (int i = 0; i < HalfSize; ++i) {
+  unsigned FstHalf = 0, SndHalf = 0;
+  for (unsigned i = 0; i < HalfSize; ++i) {
      if (SVOp->getMaskElt(i) > 0) {
        FstHalf = SVOp->getMaskElt(i)/HalfSize;
        break;
      }
    }
-  for (int i = HalfSize; i < HalfSize*2; ++i) {
+  for (unsigned i = HalfSize; i < HalfSize*2; ++i) {
      if (SVOp->getMaskElt(i) > 0) {
        SndHalf = SVOp->getMaskElt(i)/HalfSize;
        break;
@@ -3759,20 +3755,19 @@ static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
  /// with the same restriction that lanes can't be crossed.
  static bool isVPERMILPMask(const SmallVectorImpl<int> &Mask, EVT VT,
                             bool HasAVX) {
-  int NumElts = VT.getVectorNumElements();
-  int NumLanes = VT.getSizeInBits()/128;
-
    if (!HasAVX)
      return false;
  
+  unsigned NumElts = VT.getVectorNumElements();
    // Only match 256-bit with 32/64-bit types
    if (VT.getSizeInBits() != 256 || (NumElts != 4 && NumElts != 8))
      return false;
  
-  int LaneSize = NumElts/NumLanes;
-  for (int l = 0; l != NumLanes; ++l) {
-    int LaneStart = l*LaneSize;
-    for (int i = 0; i != LaneSize; ++i) {
+  unsigned NumLanes = VT.getSizeInBits()/128;
+  unsigned LaneSize = NumElts/NumLanes;
+  for (unsigned l = 0; l != NumLanes; ++l) {
+    unsigned LaneStart = l*LaneSize;
+    for (unsigned i = 0; i != LaneSize; ++i) {
        if (!isUndefOrInRange(Mask[i+LaneStart], LaneStart, LaneStart+LaneSize))
          return false;
        if (NumElts == 4 || l == 0)
@@ -3780,7 +3775,7 @@ static bool isVPERMILPMask(const SmallVectorImpl<int> &Mask, EVT VT,
        // VPERMILPS handling
        if (Mask[i] < 0)
          continue;
-      if (!isUndefOrEqual(Mask[i+LaneStart], Mask[i]+LaneSize))
+      if (!isUndefOrEqual(Mask[i+LaneStart], Mask[i]+LaneStart))
          return false;
      }
    }
@@ -3793,9 +3788,9 @@ static bool isVPERMILPMask(const SmallVectorImpl<int> &Mask, EVT VT,
  static unsigned getShuffleVPERMILPImmediate(ShuffleVectorSDNode *SVOp) {
    EVT VT = SVOp->getValueType(0);
  
-  int NumElts = VT.getVectorNumElements();
-  int NumLanes = VT.getSizeInBits()/128;
-  int LaneSize = NumElts/NumLanes;
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned NumLanes = VT.getSizeInBits()/128;
+  unsigned LaneSize = NumElts/NumLanes;
  
    // Although the mask is equal for both lanes do it twice to get the cases
    // where a mask will match because the same mask element is undef on the
@@ -3803,7 +3798,7 @@ static unsigned getShuffleVPERMILPImmediate(ShuffleVectorSDNode *SVOp) {
    // such as: shuffle <u, 0, 1, 2, 4, 4, 5, 6>, which is completely valid.
    unsigned Shift = (LaneSize == 4) ? 2 : 1;
    unsigned Mask = 0;
-  for (int i = 0; i != NumElts; ++i) {
+  for (unsigned i = 0; i != NumElts; ++i) {
      int MaskElt = SVOp->getMaskElt(i);
      if (MaskElt < 0)
        continue;
@@ -3822,14 +3817,14 @@ static unsigned getShuffleVPERMILPImmediate(ShuffleVectorSDNode *SVOp) {
  /// element of vector 2 and the other elements to come from vector 1 in order.
  static bool isCommutedMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT,
                                 bool V2IsSplat = false, bool V2IsUndef = false) {
-  int NumOps = VT.getVectorNumElements();
+  unsigned NumOps = VT.getVectorNumElements();
    if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
      return false;
  
    if (!isUndefOrEqual(Mask[0], 0))
      return false;
  
-  for (int i = 1; i < NumOps; ++i)
+  for (unsigned i = 1; i != NumOps; ++i)
      if (!(isUndefOrEqual(Mask[i], i+NumOps) ||
            (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) ||
            (V2IsSplat && isUndefOrEqual(Mask[i], NumOps))))
@@ -3850,7 +3845,7 @@ static bool isCommutedMOVL(ShuffleVectorSDNode *N, bool V2IsSplat = false,
  /// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7>
  bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N,
                           const X86Subtarget *Subtarget) {
-  if (!Subtarget->hasSSE3orAVX())
+  if (!Subtarget->hasSSE3())
      return false;
  
    // The second vector must be undef
@@ -3878,7 +3873,7 @@ bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N,
  /// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6>
  bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N,
                           const X86Subtarget *Subtarget) {
-  if (!Subtarget->hasSSE3orAVX())
+  if (!Subtarget->hasSSE3())
      return false;
  
    // The second vector must be undef
@@ -3893,7 +3888,7 @@ bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N,
      return false;
  
    // "i" is the value the indexed mask element must have
-  for (unsigned i = 0; i < NumElems; i += 2)
+  for (unsigned i = 0; i != NumElems; i += 2)
      if (!isUndefOrEqual(N->getMaskElt(i), i) ||
          !isUndefOrEqual(N->getMaskElt(i+1), i))
        return false;
@@ -3906,15 +3901,15 @@ bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N,
  /// version of MOVDDUP.
  static bool isMOVDDUPYMask(const SmallVectorImpl<int> &Mask, EVT VT,
                             bool HasAVX) {
-  int NumElts = VT.getVectorNumElements();
+  unsigned NumElts = VT.getVectorNumElements();
  
    if (!HasAVX || VT.getSizeInBits() != 256 || NumElts != 4)
      return false;
  
-  for (int i = 0; i != NumElts/2; ++i)
+  for (unsigned i = 0; i != NumElts/2; ++i)
      if (!isUndefOrEqual(Mask[i], 0))
        return false;
-  for (int i = NumElts/2; i != NumElts; ++i)
+  for (unsigned i = NumElts/2; i != NumElts; ++i)
      if (!isUndefOrEqual(Mask[i], NumElts/2))
        return false;
    return true;
@@ -3929,11 +3924,11 @@ bool X86::isMOVDDUPMask(ShuffleVectorSDNode *N) {
    if (VT.getSizeInBits() != 128)
      return false;
  
-  int e = VT.getVectorNumElements() / 2;
-  for (int i = 0; i < e; ++i)
+  unsigned e = VT.getVectorNumElements() / 2;
+  for (unsigned i = 0; i != e; ++i)
      if (!isUndefOrEqual(N->getMaskElt(i), i))
        return false;
-  for (int i = 0; i < e; ++i)
+  for (unsigned i = 0; i != e; ++i)
      if (!isUndefOrEqual(N->getMaskElt(e+i), i))
        return false;
    return true;
@@ -3981,14 +3976,14 @@ bool X86::isVINSERTF128Index(SDNode *N) {
  /// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions.
  unsigned X86::getShuffleSHUFImmediate(SDNode *N) {
    ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
-  int NumOperands = SVOp->getValueType(0).getVectorNumElements();
+  unsigned NumOperands = SVOp->getValueType(0).getVectorNumElements();
  
    unsigned Shift = (NumOperands == 4) ? 2 : 1;
    unsigned Mask = 0;
-  for (int i = 0; i < NumOperands; ++i) {
+  for (unsigned i = 0; i != NumOperands; ++i) {
      int Val = SVOp->getMaskElt(NumOperands-i-1);
      if (Val < 0) Val = 0;
-    if (Val >= NumOperands) Val -= NumOperands;
+    if (Val >= (int)NumOperands) Val -= NumOperands;
      Mask |= Val;
      if (i != NumOperands - 1)
        Mask <<= Shift;
@@ -4239,7 +4234,7 @@ static bool isZeroShuffle(ShuffleVectorSDNode *N) {
  
  /// getZeroVector - Returns a vector of specified type with all zero elements.
  ///
-static SDValue getZeroVector(EVT VT, bool HasXMMInt, SelectionDAG &DAG,
+static SDValue getZeroVector(EVT VT, bool HasSSE2, SelectionDAG &DAG,
                               DebugLoc dl) {
    assert(VT.isVector() && "Expected a vector type");
  
@@ -4247,7 +4242,7 @@ static SDValue getZeroVector(EVT VT, bool HasXMMInt, SelectionDAG &DAG,
    // to their dest type. This ensures they get CSE'd.
    SDValue Vec;
    if (VT.getSizeInBits() == 128) {  // SSE
-    if (HasXMMInt) {  // SSE2
+    if (HasSSE2) {  // SSE2
        SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
        Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
      } else { // SSE1
@@ -4418,7 +4413,7 @@ static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
    // Extract the 128-bit part containing the splat element and update
    // the splat element index when it refers to the higher register.
    if (Size == 256) {
-    unsigned Idx = (EltNo > NumElems/2) ? NumElems/2 : 0;
+    unsigned Idx = (EltNo >= NumElems/2) ? NumElems/2 : 0;
      V1 = Extract128BitVector(V1, DAG.getConstant(Idx, MVT::i32), DAG, dl);
      if (Idx > 0)
        EltNo -= NumElems/2;
@@ -4450,11 +4445,11 @@ static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
  /// element of V2 is swizzled into the zero/undef vector, landing at element
  /// Idx.  This produces a shuffle mask like 4,1,2,3 (idx=0) or  0,1,2,4 (idx=3).
  static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
-                                           bool isZero, bool HasXMMInt,
+                                           bool isZero, bool HasSSE2,
                                             SelectionDAG &DAG) {
    EVT VT = V2.getValueType();
    SDValue V1 = isZero
-    ? getZeroVector(VT, HasXMMInt, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT);
+    ? getZeroVector(VT, HasSSE2, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT);
    unsigned NumElems = VT.getVectorNumElements();
    SmallVector<int, 16> MaskVec;
    for (unsigned i = 0; i != NumElems; ++i)
@@ -4968,7 +4963,10 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
  /// a scalar load.
  /// The scalar load node is returned when a pattern is found,
  /// or SDValue() otherwise.
-static SDValue isVectorBroadcast(SDValue &Op, bool hasAVX2) {
+static SDValue isVectorBroadcast(SDValue &Op, const X86Subtarget *Subtarget) {
+  if (!Subtarget->hasAVX())
+    return SDValue();
+
    EVT VT = Op.getValueType();
    SDValue V = Op;
  
@@ -5027,18 +5025,6 @@ static SDValue isVectorBroadcast(SDValue &Op, bool hasAVX2) {
    bool Is128 = VT.getSizeInBits() == 128;
    unsigned ScalarSize = Ld.getValueType().getSizeInBits();
  
-  if (hasAVX2) {
-    // VBroadcast to YMM
-    if (Is256 && (ScalarSize == 8  || ScalarSize == 16 ||
-                  ScalarSize == 32 || ScalarSize == 64 ))
-      return Ld;
-
-    // VBroadcast to XMM
-    if (Is128 && (ScalarSize ==  8 || ScalarSize == 32 ||
-                  ScalarSize == 16 || ScalarSize == 64 ))
-      return Ld;
-  }
-
    // VBroadcast to YMM
    if (Is256 && (ScalarSize == 32 || ScalarSize == 64))
      return Ld;
@@ -5047,6 +5033,17 @@ static SDValue isVectorBroadcast(SDValue &Op, bool hasAVX2) {
    if (Is128 && (ScalarSize == 32))
      return Ld;
  
+  // The integer check is needed for the 64-bit into 128-bit so it doesn't match
+  // double since there is no vbroadcastsd xmm
+  if (Subtarget->hasAVX2() && Ld.getValueType().isInteger()) {
+    // VBroadcast to YMM
+    if (Is256 && (ScalarSize == 8 || ScalarSize == 16))
+      return Ld;
+
+    // VBroadcast to XMM
+    if (Is128 && (ScalarSize ==  8 || ScalarSize == 16 || ScalarSize == 64))
+      return Ld;
+  }
  
    // Unsupported broadcast.
    return SDValue();
@@ -5068,7 +5065,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
          Op.getValueType() == MVT::v8i32)
        return Op;
  
-    return getZeroVector(Op.getValueType(), Subtarget->hasXMMInt(), DAG, dl);
+    return getZeroVector(Op.getValueType(), Subtarget->hasSSE2(), DAG, dl);
    }
  
    // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
@@ -5082,9 +5079,9 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
      return getOnesVector(Op.getValueType(), Subtarget->hasAVX2(), DAG, dl);
    }
  
-  SDValue LD = isVectorBroadcast(Op, Subtarget->hasAVX2());
-  if (Subtarget->hasAVX() && LD.getNode())
-      return DAG.getNode(X86ISD::VBROADCAST, dl, VT, LD);
+  SDValue LD = isVectorBroadcast(Op, Subtarget);
+  if (LD.getNode())
+    return DAG.getNode(X86ISD::VBROADCAST, dl, VT, LD);
  
    unsigned EVTBits = ExtVT.getSizeInBits();
  
@@ -5136,7 +5133,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
          Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
          Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
          Item = getShuffleVectorZeroOrUndef(Item, 0, true,
-                                           Subtarget->hasXMMInt(), DAG);
+                                           Subtarget->hasSSE2(), DAG);
  
          // Now we have our 32-bit value zero extended in the low element of
          // a vector.  If Idx != 0, swizzle it into place.
@@ -5164,17 +5161,15 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
        if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
            (ExtVT == MVT::i64 && Subtarget->is64Bit())) {
          if (VT.getSizeInBits() == 256) {
-          EVT VT128 = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems / 2);
-          Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Item);
            SDValue ZeroVec = getZeroVector(VT, true, DAG, dl);
-          return Insert128BitVector(ZeroVec, Item, DAG.getConstant(0, MVT::i32),
-                              DAG, dl);
+          return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
+                             Item, DAG.getIntPtrConstant(0));
          }
          assert(VT.getSizeInBits() == 128 && "Expected an SSE value type!");
          Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
          // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
          return getShuffleVectorZeroOrUndef(Item, 0, true,
-                                           Subtarget->hasXMMInt(), DAG);
+                                           Subtarget->hasSSE2(), DAG);
        }
  
        if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
@@ -5187,7 +5182,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
          } else {
            assert(VT.getSizeInBits() == 128 && "Expected an SSE value type!");
            Item = getShuffleVectorZeroOrUndef(Item, 0, true,
-                                             Subtarget->hasXMMInt(), DAG);
+                                             Subtarget->hasSSE2(), DAG);
          }
          return DAG.getNode(ISD::BITCAST, dl, VT, Item);
        }
@@ -5217,7 +5212,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
  
        // Turn it into a shuffle of zero and zero-extended scalar to vector.
        Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0,
-                                         Subtarget->hasXMMInt(), DAG);
+                                         Subtarget->hasSSE2(), DAG);
        SmallVector<int, 8> MaskVec;
        for (unsigned i = 0; i < NumElems; i++)
          MaskVec.push_back(i == Idx ? 0 : 1);
@@ -5274,7 +5269,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
        SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
                                   Op.getOperand(Idx));
        return getShuffleVectorZeroOrUndef(V2, Idx, true,
-                                         Subtarget->hasXMMInt(), DAG);
+                                         Subtarget->hasSSE2(), DAG);
      }
      return SDValue();
    }
@@ -5299,7 +5294,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
      for (unsigned i = 0; i < 4; ++i) {
        bool isZero = !(NonZeros & (1 << i));
        if (isZero)
-        V[i] = getZeroVector(VT, Subtarget->hasXMMInt(), DAG, dl);
+        V[i] = getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl);
        else
          V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
      }
@@ -5343,7 +5338,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
        return LD;
  
      // For SSE 4.1, use insertps to put the high elements into the low element.
-    if (getSubtarget()->hasSSE41orAVX()) {
+    if (getSubtarget()->hasSSE41()) {
        SDValue Result;
        if (Op.getOperand(0).getOpcode() != ISD::UNDEF)
          Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
@@ -5514,7 +5509,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op,
    // quads, disable the next transformation since it does not help SSSE3.
    bool V1Used = InputQuads[0] || InputQuads[1];
    bool V2Used = InputQuads[2] || InputQuads[3];
-  if (Subtarget->hasSSSE3orAVX()) {
+  if (Subtarget->hasSSSE3()) {
      if (InputQuads.count() == 2 && V1Used && V2Used) {
        BestLoQuad = InputQuads.find_first();
        BestHiQuad = InputQuads.find_next(BestLoQuad);
@@ -5587,7 +5582,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op,
    // If we have SSSE3, and all words of the result are from 1 input vector,
    // case 2 is generated, otherwise case 3 is generated.  If no SSSE3
    // is present, fall back to case 4.
-  if (Subtarget->hasSSSE3orAVX()) {
+  if (Subtarget->hasSSSE3()) {
      SmallVector<SDValue,16> pshufbMask;
  
      // If we have elements from both input vectors, set the high bit of the
@@ -5655,7 +5650,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op,
      NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
                                  &MaskV[0]);
  
-    if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3orAVX())
+    if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3())
        NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16,
                                 NewV.getOperand(0),
                                 X86::getShufflePSHUFLWImmediate(NewV.getNode()),
@@ -5683,7 +5678,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op,
      NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
                                  &MaskV[0]);
  
-    if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3orAVX())
+    if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3())
        NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16,
                                NewV.getOperand(0),
                                X86::getShufflePSHUFHWImmediate(NewV.getNode()),
@@ -5749,7 +5744,7 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
    }
  
    // If SSSE3, use 1 pshufb instruction per vector with elements in the result.
-  if (TLI.getSubtarget()->hasSSSE3orAVX()) {
+  if (TLI.getSubtarget()->hasSSSE3()) {
      SmallVector<SDValue,16> pshufbMask;
  
      // If all result elements are from one input vector, then only translate
@@ -6268,31 +6263,27 @@ bool CanXFormVExtractWithShuffleIntoLoad(SDValue V, SelectionDAG &DAG,
    if (V.getOpcode() == ISD::BITCAST)
      V = V.getOperand(0);
  
-  if (ISD::isNormalLoad(V.getNode())) {
-    // Is the original load suitable?
-    LoadSDNode *LN0 = cast<LoadSDNode>(V);
+  if (!ISD::isNormalLoad(V.getNode()))
+    return false;
  
-    // FIXME: avoid the multi-use bug that is preventing lots of
-    // of foldings to be detected, this is still wrong of course, but
-    // give the temporary desired behavior, and if it happens that
-    // the load has real more uses, during isel it will not fold, and
-    // will generate poor code.
-    if (!LN0 || LN0->isVolatile()) // || !LN0->hasOneUse()
-      return false;
+  // Is the original load suitable?
+  LoadSDNode *LN0 = cast<LoadSDNode>(V);
  
-    if (!HasShuffleIntoBitcast)
-      return true;
+  if (!LN0 || !LN0->hasNUsesOfValue(1,0) || LN0->isVolatile())
+    return false;
  
-    // If there's a bitcast before the shuffle, check if the load type and
-    // alignment is valid.
-    unsigned Align = LN0->getAlignment();
-    unsigned NewAlign =
-      TLI.getTargetData()->getABITypeAlignment(
-                                    VT.getTypeForEVT(*DAG.getContext()));
+  if (!HasShuffleIntoBitcast)
+    return true;
  
-    if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VT))
-      return false;
-  }
+  // If there's a bitcast before the shuffle, check if the load type and
+  // alignment is valid.
+  unsigned Align = LN0->getAlignment();
+  unsigned NewAlign =
+    TLI.getTargetData()->getABITypeAlignment(
+                                  VT.getTypeForEVT(*DAG.getContext()));
+
+  if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VT))
+    return false;
  
    return true;
  }
@@ -6310,14 +6301,14 @@ SDValue getMOVDDup(SDValue &Op, DebugLoc &dl, SDValue V1, SelectionDAG &DAG) {
  
  static
  SDValue getMOVLowToHigh(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG,
-                        bool HasXMMInt) {
+                        bool HasSSE2) {
    SDValue V1 = Op.getOperand(0);
    SDValue V2 = Op.getOperand(1);
    EVT VT = Op.getValueType();
  
    assert(VT != MVT::v2i64 && "unsupported shuffle type");
  
-  if (HasXMMInt && VT == MVT::v2f64)
+  if (HasSSE2 && VT == MVT::v2f64)
      return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG);
  
    // v4f32 or v4i32: canonizalized to v4f32 (which is legal for SSE1)
@@ -6344,7 +6335,7 @@ SDValue getMOVHighToLow(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG) {
  }
  
  static
-SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasXMMInt) {
+SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
    SDValue V1 = Op.getOperand(0);
    SDValue V2 = Op.getOperand(1);
    EVT VT = Op.getValueType();
@@ -6370,7 +6361,7 @@ SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasXMMInt) {
  
    ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
    if (CanFoldLoad) {
-    if (HasXMMInt && NumElems == 2)
+    if (HasSSE2 && NumElems == 2)
        return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG);
  
      if (NumElems == 4)
@@ -6385,7 +6376,7 @@ SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasXMMInt) {
    // this is horrible, but will stay like this until we move all shuffle
    // matching to x86 specific nodes. Note that for the 1st condition all
    // types are matched with movsd.
-  if (HasXMMInt) {
+  if (HasSSE2) {
      // FIXME: isMOVLMask should be checked and matched before getMOVLP,
      // as to remove this logic from here, as much as possible
      if (NumElems == 2 || !X86::isMOVLMask(SVOp))
@@ -6411,7 +6402,7 @@ SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG,
    SDValue V2 = Op.getOperand(1);
  
    if (isZeroShuffle(SVOp))
-    return getZeroVector(VT, Subtarget->hasXMMInt(), DAG, dl);
+    return getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl);
  
    // Handle splat operations
    if (SVOp->isSplat()) {
@@ -6425,8 +6416,8 @@ SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG,
        return Op;
  
      // Use vbroadcast whenever the splat comes from a foldable load
-    SDValue LD = isVectorBroadcast(Op, Subtarget->hasAVX2());
-    if (Subtarget->hasAVX() && LD.getNode())
+    SDValue LD = isVectorBroadcast(Op, Subtarget);
+    if (LD.getNode())
        return DAG.getNode(X86ISD::VBROADCAST, dl, VT, LD);
  
      // Handle splats by matching through known shuffle masks
@@ -6445,7 +6436,7 @@ SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG,
      if (NewOp.getNode())
        return DAG.getNode(ISD::BITCAST, dl, VT, NewOp);
    } else if ((VT == MVT::v4i32 ||
-             (VT == MVT::v4f32 && Subtarget->hasXMMInt()))) {
+             (VT == MVT::v4f32 && Subtarget->hasSSE2()))) {
      // FIXME: Figure out a cleaner way to do this.
      // Try to make use of movq to zero out the top part.
      if (ISD::isBuildVectorAllZeros(V2.getNode())) {
@@ -6476,7 +6467,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
    bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
    bool V1IsSplat = false;
    bool V2IsSplat = false;
-  bool HasXMMInt = Subtarget->hasXMMInt();
+  bool HasSSE2 = Subtarget->hasSSE2();
    bool HasAVX    = Subtarget->hasAVX();
    bool HasAVX2   = Subtarget->hasAVX2();
    MachineFunction &MF = DAG.getMachineFunction();
@@ -6514,7 +6505,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
    if (OptForSize && X86::isUNPCKH_v_undef_Mask(SVOp, HasAVX2))
      return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
  
-  if (X86::isMOVDDUPMask(SVOp) && Subtarget->hasSSE3orAVX() &&
+  if (X86::isMOVDDUPMask(SVOp) && Subtarget->hasSSE3() &&
        V2IsUndef && RelaxedMayFoldVectorLoad(V1))
      return getMOVDDup(Op, dl, V1, DAG);
  
@@ -6522,7 +6513,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
      return getMOVHighToLow(Op, dl, DAG);
  
    // Use to match splats
-  if (HasXMMInt && X86::isUNPCKHMask(SVOp, HasAVX2) && V2IsUndef &&
+  if (HasSSE2 && X86::isUNPCKHMask(SVOp, HasAVX2) && V2IsUndef &&
        (VT == MVT::v2f64 || VT == MVT::v2i64))
      return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
  
@@ -6535,7 +6526,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
  
      unsigned TargetMask = X86::getShuffleSHUFImmediate(SVOp);
  
-    if (HasXMMInt && (VT == MVT::v4f32 || VT == MVT::v4i32))
+    if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32))
        return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG);
  
      return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V1,
@@ -6546,7 +6537,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
    bool isLeft = false;
    unsigned ShAmt = 0;
    SDValue ShVal;
-  bool isShift = HasXMMInt && isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
+  bool isShift = HasSSE2 && isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
    if (isShift && ShVal.hasOneUse()) {
      // If the shifted value has multiple uses, it may be cheaper to use
      // v_set0 + movlhps or movhlps, etc.
@@ -6559,7 +6550,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
      if (ISD::isBuildVectorAllZeros(V1.getNode()))
        return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl);
      if (!X86::isMOVLPMask(SVOp)) {
-      if (HasXMMInt && (VT == MVT::v2i64 || VT == MVT::v2f64))
+      if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64))
          return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
  
        if (VT == MVT::v4i32 || VT == MVT::v4f32)
@@ -6569,7 +6560,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
  
    // FIXME: fold these into legal mask.
    if (X86::isMOVLHPSMask(SVOp) && !X86::isUNPCKLMask(SVOp, HasAVX2))
-    return getMOVLowToHigh(Op, dl, DAG, HasXMMInt);
+    return getMOVLowToHigh(Op, dl, DAG, HasSSE2);
  
    if (X86::isMOVHLPSMask(SVOp))
      return getMOVHighToLow(Op, dl, DAG);
@@ -6581,7 +6572,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
      return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG);
  
    if (X86::isMOVLPMask(SVOp))
-    return getMOVLP(Op, dl, DAG, HasXMMInt);
+    return getMOVLP(Op, dl, DAG, HasSSE2);
  
    if (ShouldXformToMOVHLPS(SVOp) ||
        ShouldXformToMOVLP(V1.getNode(), V2.getNode(), SVOp))
@@ -6666,7 +6657,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
    // inlined here right now to enable us to directly emit target specific
    // nodes, and remove one by one until they don't return Op anymore.
  
-  if (isPALIGNRMask(M, VT, Subtarget->hasSSSE3orAVX()))
+  if (isPALIGNRMask(M, VT, Subtarget->hasSSSE3()))
      return getTargetShuffleNode(X86ISD::PALIGN, dl, VT, V1, V2,
                                  getShufflePALIGNRImmediate(SVOp),
                                  DAG);
@@ -6838,7 +6829,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
  
    assert(Vec.getValueSizeInBits() <= 128 && "Unexpected vector length");
  
-  if (Subtarget->hasSSE41orAVX()) {
+  if (Subtarget->hasSSE41()) {
      SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG);
      if (Res.getNode())
        return Res;
@@ -6980,7 +6971,7 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
      return Insert128BitVector(N0, V, Ins128Idx, DAG, dl);
    }
  
-  if (Subtarget->hasSSE41orAVX())
+  if (Subtarget->hasSSE41())
      return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG);
  
    if (EltVT == MVT::i8)
@@ -7446,9 +7437,9 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
  }
  
  
-/// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values and
-/// take a 2 x i32 value to shift plus a shift amount.
-SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const {
+/// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values
+/// and take a 2 x i32 value to shift plus a shift amount.
+SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const{
    assert(Op.getNumOperands() == 3 && "Not a double-shift!");
    EVT VT = Op.getValueType();
    unsigned VTBits = VT.getSizeInBits();
@@ -7589,38 +7580,17 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
  // LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion.
  SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
                                                 SelectionDAG &DAG) const {
-  // This algorithm is not obvious. Here it is in C code, more or less:
+  // This algorithm is not obvious. Here is what we're trying to output:
    /*
-    double uint64_to_double( uint32_t hi, uint32_t lo ) {
-      static const __m128i exp = { 0x4330000045300000ULL, 0 };
-      static const __m128d bias = { 0x1.0p84, 0x1.0p52 };
-
-      // Copy ints to xmm registers.
-      __m128i xh = _mm_cvtsi32_si128( hi );
-      __m128i xl = _mm_cvtsi32_si128( lo );
-
-      // Combine into low half of a single xmm register.
-      __m128i x = _mm_unpacklo_epi32( xh, xl );
-      __m128d d;
-      double sd;
-
-      // Merge in appropriate exponents to give the integer bits the right
-      // magnitude.
-      x = _mm_unpacklo_epi32( x, exp );
-
-      // Subtract away the biases to deal with the IEEE-754 double precision
-      // implicit 1.
-      d = _mm_sub_pd( (__m128d) x, bias );
-
-      // All conversions up to here are exact. The correctly rounded result is
-      // calculated using the current rounding mode using the following
-      // horizontal add.
-      d = _mm_add_sd( d, _mm_unpackhi_pd( d, d ) );
-      _mm_store_sd( &sd, d );   // Because we are returning doubles in XMM, this
-                                // store doesn't really need to be here (except
-                                // maybe to zero the other double)
-      return sd;
-    }
+     movq       %rax,  %xmm0
+     punpckldq  (c0),  %xmm0  // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
+     subpd      (c1),  %xmm0  // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
+     #ifdef __SSE3__
+       haddpd   %xmm0, %xmm0          
+     #else
+       pshufd   $0x4e, %xmm0, %xmm1 
+       addpd    %xmm1, %xmm0
+     #endif
    */
  
    DebugLoc dl = Op.getDebugLoc();
@@ -7628,46 +7598,51 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
  
    // Build some magic constants.
    SmallVector<Constant*,4> CV0;
-  CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x45300000)));
    CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x43300000)));
+  CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x45300000)));
    CV0.push_back(ConstantInt::get(*Context, APInt(32, 0)));
    CV0.push_back(ConstantInt::get(*Context, APInt(32, 0)));
    Constant *C0 = ConstantVector::get(CV0);
    SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16);
  
    SmallVector<Constant*,2> CV1;
-  CV1.push_back(
-    ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL))));
    CV1.push_back(
      ConstantFP::get(*Context, APFloat(APInt(64, 0x4330000000000000ULL))));
+  CV1.push_back(
+    ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL))));
    Constant *C1 = ConstantVector::get(CV1);
    SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16);
  
-  SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
-                            DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
-                                        Op.getOperand(0),
-                                        DAG.getIntPtrConstant(1)));
-  SDValue XR2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
-                            DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
-                                        Op.getOperand(0),
-                                        DAG.getIntPtrConstant(0)));
-  SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, XR1, XR2);
+  // Load the 64-bit value into an XMM register.
+  SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
+                            Op.getOperand(0));
    SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
                                MachinePointerInfo::getConstantPool(),
                                false, false, false, 16);
-  SDValue Unpck2 = getUnpackl(DAG, dl, MVT::v4i32, Unpck1, CLod0);
-  SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck2);
+  SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32,
+                              DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, XR1),
+                              CLod0);
+
    SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
                                MachinePointerInfo::getConstantPool(),
                                false, false, false, 16);
+  SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck1);
    SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
+  SDValue Result;
  
-  // Add the halves; easiest way is to swap them into another reg first.
-  int ShufMask[2] = { 1, -1 };
-  SDValue Shuf = DAG.getVectorShuffle(MVT::v2f64, dl, Sub,
-                                      DAG.getUNDEF(MVT::v2f64), ShufMask);
-  SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuf, Sub);
-  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Add,
+  if (Subtarget->hasSSE3()) {
+    // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
+    Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
+  } else {
+    SDValue S2F = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Sub);
+    SDValue Shuffle = getTargetShuffleNode(X86ISD::PSHUFD, dl, MVT::v4i32,
+                                           S2F, 0x4E, DAG);
+    Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
+                         DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Shuffle),
+                         Sub);
+  }
+
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
                       DAG.getIntPtrConstant(0));
  }
  
@@ -7684,7 +7659,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
                               Op.getOperand(0));
  
    // Zero out the upper parts of the register.
-  Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget->hasXMMInt(),
+  Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget->hasSSE2(),
                                       DAG);
  
    Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
@@ -7737,6 +7712,9 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
      return LowerUINT_TO_FP_i64(Op, DAG);
    else if (SrcVT == MVT::i32 && X86ScalarSSEf64)
      return LowerUINT_TO_FP_i32(Op, DAG);
+  else if (Subtarget->is64Bit() &&
+           SrcVT == MVT::i64 && DstVT == MVT::f32)
+    return SDValue();
  
    // Make a 64-bit buffer, and use it to build an FILD.
    SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
@@ -7756,7 +7734,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
  
    assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
    SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
-                                StackSlot, MachinePointerInfo(),
+                               StackSlot, MachinePointerInfo(),
                                 false, false, 0);
    // For i64 source, we need to add the appropriate power of 2 if the input
    // was negative.  This is the same as the optimization in
@@ -8474,9 +8452,9 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const {
  
    // Check that the operation in question is available (most are plain SSE2,
    // but PCMPGTQ and PCMPEQQ have different requirements).
-  if (Opc == X86ISD::PCMPGTQ && !Subtarget->hasSSE42orAVX())
+  if (Opc == X86ISD::PCMPGTQ && !Subtarget->hasSSE42())
      return SDValue();
-  if (Opc == X86ISD::PCMPEQQ && !Subtarget->hasSSE41orAVX())
+  if (Opc == X86ISD::PCMPEQQ && !Subtarget->hasSSE41())
      return SDValue();
  
    // Since SSE has no unsigned integer comparisons, we need to flip  the sign
@@ -9141,7 +9119,7 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
      assert(!getTargetMachine().Options.UseSoftFloat &&
             !(DAG.getMachineFunction()
                  .getFunction()->hasFnAttr(Attribute::NoImplicitFloat)) &&
-           Subtarget->hasXMM());
+           Subtarget->hasSSE1());
    }
  
    // Insert VAARG_64 node into the DAG
@@ -10044,7 +10022,7 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
    SDValue Amt = Op.getOperand(1);
    LLVMContext *Context = DAG.getContext();
  
-  if (!Subtarget->hasXMMInt())
+  if (!Subtarget->hasSSE2())
      return SDValue();
  
    // Optimize shl/srl/sra with constant shift amount.
@@ -10122,7 +10100,7 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
        if (VT == MVT::v16i8 && Op.getOpcode() == ISD::SRA) {
          if (ShiftAmt == 7) {
            // R s>> 7  ===  R s< 0
-          SDValue Zeros = getZeroVector(VT, true /* HasXMMInt */, DAG, dl);
+          SDValue Zeros = getZeroVector(VT, true /* HasSSE2 */, DAG, dl);
            return DAG.getNode(X86ISD::PCMPGTB, dl, VT, Zeros, R);
          }
  
@@ -10164,7 +10142,7 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
          if (Op.getOpcode() == ISD::SRA) {
            if (ShiftAmt == 7) {
              // R s>> 7  ===  R s< 0
-            SDValue Zeros = getZeroVector(VT, true /* HasXMMInt */, DAG, dl);
+            SDValue Zeros = getZeroVector(VT, true /* HasSSE2 */, DAG, dl);
              return DAG.getNode(X86ISD::PCMPGTB, dl, VT, Zeros, R);
            }
  
@@ -10379,7 +10357,7 @@ SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
    EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
    EVT VT = Op.getValueType();
  
-  if (Subtarget->hasXMMInt() && VT.isVector()) {
+  if (Subtarget->hasSSE2() && VT.isVector()) {
      unsigned BitsDiff = VT.getScalarType().getSizeInBits() -
                          ExtraVT.getScalarType().getSizeInBits();
      SDValue ShAmt = DAG.getConstant(BitsDiff, MVT::i32);
@@ -10453,7 +10431,7 @@ SDValue X86TargetLowering::LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const{
  
    // Go ahead and emit the fence on x86-64 even if we asked for no-sse2.
    // There isn't any reason to disable it if the target processor supports it.
-  if (!Subtarget->hasXMMInt() && !Subtarget->is64Bit()) {
+  if (!Subtarget->hasSSE2() && !Subtarget->is64Bit()) {
      SDValue Chain = Op.getOperand(0);
      SDValue Zero = DAG.getConstant(0, MVT::i32);
      SDValue Ops[] = {
@@ -10507,7 +10485,7 @@ SDValue X86TargetLowering::LowerATOMIC_FENCE(SDValue Op,
      // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for
      // no-sse2). There isn't any reason to disable it if the target processor
      // supports it.
-    if (Subtarget->hasXMMInt() || Subtarget->is64Bit())
+    if (Subtarget->hasSSE2() || Subtarget->is64Bit())
        return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
  
      SDValue Chain = Op.getOperand(0);
@@ -10587,7 +10565,7 @@ SDValue X86TargetLowering::LowerBITCAST(SDValue Op,
                                              SelectionDAG &DAG) const {
    EVT SrcVT = Op.getOperand(0).getValueType();
    EVT DstVT = Op.getValueType();
-  assert(Subtarget->is64Bit() && !Subtarget->hasXMMInt() &&
+  assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() &&
           Subtarget->hasMMX() && "Unexpected custom BITCAST");
    assert((DstVT == MVT::i64 ||
            (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
@@ -11147,7 +11125,7 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
            isPSHUFDMask(M, VT) ||
            isPSHUFHWMask(M, VT) ||
            isPSHUFLWMask(M, VT) ||
-          isPALIGNRMask(M, VT, Subtarget->hasSSSE3orAVX()) ||
+          isPALIGNRMask(M, VT, Subtarget->hasSSSE3()) ||
            isUNPCKLMask(M, VT, Subtarget->hasAVX2()) ||
            isUNPCKHMask(M, VT, Subtarget->hasAVX2()) ||
            isUNPCKL_v_undef_Mask(M, VT, Subtarget->hasAVX2()) ||
@@ -11556,7 +11534,7 @@ X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr,
  MachineBasicBlock *
  X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB,
                              unsigned numArgs, bool memArg) const {
-  assert(Subtarget->hasSSE42orAVX() &&
+  assert(Subtarget->hasSSE42() &&
           "Target must have SSE4.2 or AVX features enabled");
  
    DebugLoc dl = MI->getDebugLoc();
@@ -12088,7 +12066,7 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB,
    BuildMI(BB, DL, TII->get(Is64Bit ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
      .addReg(tmpSPVReg).addReg(sizeVReg);
    BuildMI(BB, DL, TII->get(Is64Bit ? X86::CMP64mr:X86::CMP32mr))
-    .addReg(0).addImm(0).addReg(0).addImm(TlsOffset).addReg(TlsReg)
+    .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
      .addReg(SPLimitVReg);
    BuildMI(BB, DL, TII->get(X86::JG_4)).addMBB(mallocMBB);
  
@@ -12739,9 +12717,23 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
            !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems))
          return SDValue();
  
+    // If V1 is coming from a vector load then just fold to a VZEXT_LOAD.
+    if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(V1.getOperand(0))) {
+      SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other);
+      SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() };
+      SDValue ResNode =
+        DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2,
+                                Ld->getMemoryVT(),
+                                Ld->getPointerInfo(),
+                                Ld->getAlignment(),
+                                false/*isVolatile*/, true/*ReadMem*/,
+                                false/*WriteMem*/);
+      return DAG.getNode(ISD::BITCAST, dl, VT, ResNode);
+    } 
+
      // Emit a zeroed vector and insert the desired subvector on its
      // first half.
-    SDValue Zeros = getZeroVector(VT, true /* HasXMMInt */, DAG, dl);
+    SDValue Zeros = getZeroVector(VT, true /* HasSSE2 */, DAG, dl);
      SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0),
                           DAG.getConstant(0, MVT::i32), DAG, dl);
      return DCI.CombineTo(N, InsV);
@@ -12902,7 +12894,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
    // ignored in unsafe-math mode).
    if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
        VT != MVT::f80 && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
-      (Subtarget->hasXMMInt() ||
+      (Subtarget->hasSSE2() ||
         (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) {
      ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
  
@@ -13133,6 +13125,37 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
        }
    }
  
+  // Canonicalize max and min:
+  // (x > y) ? x : y -> (x >= y) ? x : y
+  // (x < y) ? x : y -> (x <= y) ? x : y
+  // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
+  // the need for an extra compare
+  // against zero. e.g.
+  // (x - y) > 0 ? (x - y) : 0 -> (x - y) >= 0 ? (x - y) : 0
+  // subl   %esi, %edi
+  // testl  %edi, %edi
+  // movl   $0, %eax
+  // cmovgl %edi, %eax
+  // =>
+  // xorl   %eax, %eax
+  // subl   %esi, %edi
+  // cmovsl %eax, %edi
+  if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
+      DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
+      DAG.isEqualTo(RHS, Cond.getOperand(1))) {
+    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+    switch (CC) {
+    default: break;
+    case ISD::SETLT:
+    case ISD::SETGT: {
+      ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
+      Cond = DAG.getSetCC(Cond.getDebugLoc(), Cond.getValueType(),
+                          Cond.getOperand(0), Cond.getOperand(1), NewCC);
+      return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS);
+    }
+    }
+  }
+
    return SDValue();
  }
  
@@ -13377,7 +13400,7 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
    // all elements are shifted by the same amount.  We can't do this in legalize
    // because the a constant vector is typically transformed to a constant pool
    // so we have no knowledge of the shift amount.
-  if (!Subtarget->hasXMMInt())
+  if (!Subtarget->hasSSE2())
      return SDValue();
  
    if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
@@ -13527,7 +13550,7 @@ static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG,
  
    // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
    // we're requiring SSE2 for both.
-  if (Subtarget->hasXMMInt() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
+  if (Subtarget->hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
      SDValue N0 = N->getOperand(0);
      SDValue N1 = N->getOperand(1);
      SDValue CMP0 = N0->getOperand(1);
@@ -13718,14 +13741,14 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
  
    // look for psign/blend
    if (VT == MVT::v2i64 || VT == MVT::v4i64) {
-    if (!Subtarget->hasSSSE3orAVX() ||
+    if (!Subtarget->hasSSSE3() ||
          (VT == MVT::v4i64 && !Subtarget->hasAVX2()))
        return SDValue();
  
      // Canonicalize pandn to RHS
      if (N0.getOpcode() == X86ISD::ANDNP)
        std::swap(N0, N1);
-    // or (and (m, x), (pandn m, y))
+    // or (and (m, y), (pandn m, x))
      if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::ANDNP) {
        SDValue Mask = N1.getOperand(0);
        SDValue X    = N1.getOperand(1);
@@ -13788,7 +13811,7 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
          return DAG.getNode(ISD::BITCAST, DL, VT, Sign);
        }
        // PBLENDVB only available on SSE 4.1
-      if (!Subtarget->hasSSE41orAVX())
+      if (!Subtarget->hasSSE41())
          return SDValue();
  
        EVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
@@ -13952,7 +13975,8 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
  
      // Bitcast the loaded value to a vector of the original element type, in
      // the size of the target vector type.
-    SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, ScalarInVector);
+    SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT,
+                                    ScalarInVector);
      unsigned SizeRatio = RegSz/MemSz;
  
      // Redistribute the loaded elements into the different locations.
@@ -14095,7 +14119,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
    const Function *F = DAG.getMachineFunction().getFunction();
    bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat);
    bool F64IsLegal = !DAG.getTarget().Options.UseSoftFloat && !NoImplicitFloatOps
-                     && Subtarget->hasXMMInt();
+                     && Subtarget->hasSSE2();
    if ((VT.isVector() ||
         (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) &&
        isa<LoadSDNode>(St->getValue()) &&
@@ -14326,7 +14350,7 @@ static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG,
    SDValue RHS = N->getOperand(1);
  
    // Try to synthesize horizontal adds from adds of shuffles.
-  if (((Subtarget->hasSSE3orAVX() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
+  if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
         (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
        isHorizontalBinOp(LHS, RHS, true))
      return DAG.getNode(X86ISD::FHADD, N->getDebugLoc(), VT, LHS, RHS);
@@ -14341,7 +14365,7 @@ static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG,
    SDValue RHS = N->getOperand(1);
  
    // Try to synthesize horizontal subs from subs of shuffles.
-  if (((Subtarget->hasSSE3orAVX() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
+  if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
         (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
        isHorizontalBinOp(LHS, RHS, false))
      return DAG.getNode(X86ISD::FHSUB, N->getDebugLoc(), VT, LHS, RHS);
@@ -14546,7 +14570,7 @@ static SDValue PerformAddCombine(SDNode *N, SelectionDAG &DAG,
    SDValue Op1 = N->getOperand(1);
  
    // Try to synthesize horizontal adds from adds of shuffles.
-  if (((Subtarget->hasSSSE3orAVX() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
+  if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
         (Subtarget->hasAVX2() && (VT == MVT::v16i16 || MVT::v8i32))) &&
        isHorizontalBinOp(Op0, Op1, true))
      return DAG.getNode(X86ISD::HADD, N->getDebugLoc(), VT, Op0, Op1);
@@ -14579,7 +14603,7 @@ static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG,
  
    // Try to synthesize horizontal adds from adds of shuffles.
    EVT VT = N->getValueType(0);
-  if (((Subtarget->hasSSSE3orAVX() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
+  if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
         (Subtarget->hasAVX2() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
        isHorizontalBinOp(Op0, Op1, true))
      return DAG.getNode(X86ISD::HSUB, N->getDebugLoc(), VT, Op0, Op1);
@@ -14933,7 +14957,8 @@ TargetLowering::ConstraintWeight
        break;
    case 'x':
    case 'Y':
-    if ((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasXMM())
+    if (((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasSSE1()) ||
+        ((type->getPrimitiveSizeInBits() == 256) && Subtarget->hasAVX()))
        weight = CW_Register;
      break;
    case 'I':
@@ -15003,9 +15028,9 @@ LowerXConstraint(EVT ConstraintVT) const {
    // FP X constraints get lowered to SSE1/2 registers if available, otherwise
    // 'f' like normal targets.
    if (ConstraintVT.isFloatingPoint()) {
-    if (Subtarget->hasXMMInt())
+    if (Subtarget->hasSSE2())
        return "Y";
-    if (Subtarget->hasXMM())
+    if (Subtarget->hasSSE1())
        return "x";
    }
  
@@ -15211,10 +15236,10 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
        if (!Subtarget->hasMMX()) break;
        return std::make_pair(0U, X86::VR64RegisterClass);
      case 'Y':   // SSE_REGS if SSE2 allowed
-      if (!Subtarget->hasXMMInt()) break;
+      if (!Subtarget->hasSSE2()) break;
        // FALL THROUGH.
-    case 'x':   // SSE_REGS if SSE1 allowed
-      if (!Subtarget->hasXMM()) break;
+    case 'x':   // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
+      if (!Subtarget->hasSSE1()) break;
  
        switch (VT.getSimpleVT().SimpleTy) {
        default: break;
@@ -15233,6 +15258,15 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
        case MVT::v4f32:
        case MVT::v2f64:
          return std::make_pair(0U, X86::VR128RegisterClass);
+      // AVX types.
+      case MVT::v32i8:
+      case MVT::v16i16:
+      case MVT::v8i32:
+      case MVT::v4i64:
+      case MVT::v8f32:
+      case MVT::v4f64:
+        return std::make_pair(0U, X86::VR256RegisterClass);
+        
        }
        break;
      }