Improve the X86 cost model for loads and stores.

[oota-llvm.git] / lib / Target / X86 / X86ISelLowering.cpp
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index d276353cca6a9e91bfdc572186bb933bbdaae9e6..6b650726b622c158bf09dcc4c561298c88201f7a 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -14,20 +14,16 @@
  
  #define DEBUG_TYPE "x86-isel"
  #include "X86ISelLowering.h"
+#include "Utils/X86ShuffleDecode.h"
  #include "X86.h"
  #include "X86InstrBuilder.h"
  #include "X86TargetMachine.h"
  #include "X86TargetObjectFile.h"
-#include "Utils/X86ShuffleDecode.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/VariadicFunction.h"
  #include "llvm/CallingConv.h"
-#include "llvm/Constants.h"
-#include "llvm/DerivedTypes.h"
-#include "llvm/GlobalAlias.h"
-#include "llvm/GlobalVariable.h"
-#include "llvm/Function.h"
-#include "llvm/Instructions.h"
-#include "llvm/Intrinsics.h"
-#include "llvm/LLVMContext.h"
  #include "llvm/CodeGen/IntrinsicLowering.h"
  #include "llvm/CodeGen/MachineFrameInfo.h"
  #include "llvm/CodeGen/MachineFunction.h"
@@ -35,14 +31,18 @@
  #include "llvm/CodeGen/MachineJumpTableInfo.h"
  #include "llvm/CodeGen/MachineModuleInfo.h"
  #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/GlobalAlias.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/Instructions.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/LLVMContext.h"
  #include "llvm/MC/MCAsmInfo.h"
  #include "llvm/MC/MCContext.h"
  #include "llvm/MC/MCExpr.h"
  #include "llvm/MC/MCSymbol.h"
-#include "llvm/ADT/SmallSet.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/ADT/VariadicFunction.h"
  #include "llvm/Support/CallSite.h"
  #include "llvm/Support/Debug.h"
  #include "llvm/Support/ErrorHandling.h"
@@ -978,7 +978,15 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
      setOperationAction(ISD::FNEARBYINT,         MVT::f64,   Legal);
  
      setOperationAction(ISD::FFLOOR,             MVT::v4f32, Legal);
+    setOperationAction(ISD::FCEIL,              MVT::v4f32, Legal);
+    setOperationAction(ISD::FTRUNC,             MVT::v4f32, Legal);
+    setOperationAction(ISD::FRINT,              MVT::v4f32, Legal);
+    setOperationAction(ISD::FNEARBYINT,         MVT::v4f32, Legal);
      setOperationAction(ISD::FFLOOR,             MVT::v2f64, Legal);
+    setOperationAction(ISD::FCEIL,              MVT::v2f64, Legal);
+    setOperationAction(ISD::FTRUNC,             MVT::v2f64, Legal);
+    setOperationAction(ISD::FRINT,              MVT::v2f64, Legal);
+    setOperationAction(ISD::FNEARBYINT,         MVT::v2f64, Legal);
  
      // FIXME: Do we need to handle scalar-to-vector here?
      setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
@@ -1021,7 +1029,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
      setOperationAction(ISD::SRA,               MVT::v8i16, Custom);
      setOperationAction(ISD::SRA,               MVT::v16i8, Custom);
  
-    if (Subtarget->hasAVX2()) {
+    if (Subtarget->hasInt256()) {
        setOperationAction(ISD::SRL,             MVT::v2i64, Legal);
        setOperationAction(ISD::SRL,             MVT::v4i32, Legal);
  
@@ -1040,7 +1048,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
      }
    }
  
-  if (!TM.Options.UseSoftFloat && Subtarget->hasAVX()) {
+  if (!TM.Options.UseSoftFloat && Subtarget->hasFp256()) {
      addRegisterClass(MVT::v32i8,  &X86::VR256RegClass);
      addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
      addRegisterClass(MVT::v8i32,  &X86::VR256RegClass);
@@ -1058,6 +1066,10 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
      setOperationAction(ISD::FDIV,               MVT::v8f32, Legal);
      setOperationAction(ISD::FSQRT,              MVT::v8f32, Legal);
      setOperationAction(ISD::FFLOOR,             MVT::v8f32, Legal);
+    setOperationAction(ISD::FCEIL,              MVT::v8f32, Legal);
+    setOperationAction(ISD::FTRUNC,             MVT::v8f32, Legal);
+    setOperationAction(ISD::FRINT,              MVT::v8f32, Legal);
+    setOperationAction(ISD::FNEARBYINT,         MVT::v8f32, Legal);
      setOperationAction(ISD::FNEG,               MVT::v8f32, Custom);
      setOperationAction(ISD::FABS,               MVT::v8f32, Custom);
  
@@ -1067,6 +1079,10 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
      setOperationAction(ISD::FDIV,               MVT::v4f64, Legal);
      setOperationAction(ISD::FSQRT,              MVT::v4f64, Legal);
      setOperationAction(ISD::FFLOOR,             MVT::v4f64, Legal);
+    setOperationAction(ISD::FCEIL,              MVT::v4f64, Legal);
+    setOperationAction(ISD::FTRUNC,             MVT::v4f64, Legal);
+    setOperationAction(ISD::FRINT,              MVT::v4f64, Legal);
+    setOperationAction(ISD::FNEARBYINT,         MVT::v4f64, Legal);
      setOperationAction(ISD::FNEG,               MVT::v4f64, Custom);
      setOperationAction(ISD::FABS,               MVT::v4f64, Custom);
  
@@ -1108,15 +1124,15 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
      setOperationAction(ISD::VSELECT,           MVT::v8f32, Legal);
  
      if (Subtarget->hasFMA() || Subtarget->hasFMA4()) {
-      setOperationAction(ISD::FMA,             MVT::v8f32, Custom);
-      setOperationAction(ISD::FMA,             MVT::v4f64, Custom);
-      setOperationAction(ISD::FMA,             MVT::v4f32, Custom);
-      setOperationAction(ISD::FMA,             MVT::v2f64, Custom);
-      setOperationAction(ISD::FMA,             MVT::f32, Custom);
-      setOperationAction(ISD::FMA,             MVT::f64, Custom);
+      setOperationAction(ISD::FMA,             MVT::v8f32, Legal);
+      setOperationAction(ISD::FMA,             MVT::v4f64, Legal);
+      setOperationAction(ISD::FMA,             MVT::v4f32, Legal);
+      setOperationAction(ISD::FMA,             MVT::v2f64, Legal);
+      setOperationAction(ISD::FMA,             MVT::f32, Legal);
+      setOperationAction(ISD::FMA,             MVT::f64, Legal);
      }
  
-    if (Subtarget->hasAVX2()) {
+    if (Subtarget->hasInt256()) {
        setOperationAction(ISD::ADD,             MVT::v4i64, Legal);
        setOperationAction(ISD::ADD,             MVT::v8i32, Legal);
        setOperationAction(ISD::ADD,             MVT::v16i16, Legal);
@@ -1353,30 +1369,29 @@ unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
  /// lowering. If DstAlign is zero that means it's safe to destination
  /// alignment can satisfy any constraint. Similarly if SrcAlign is zero it
  /// means there isn't a need to check it against alignment requirement,
-/// probably because the source does not need to be loaded. If
-/// 'IsZeroVal' is true, that means it's safe to return a
-/// non-scalar-integer type, e.g. empty string source, constant, or loaded
-/// from memory. 'MemcpyStrSrc' indicates whether the memcpy source is
-/// constant so it does not need to be loaded.
+/// probably because the source does not need to be loaded. If 'IsMemset' is
+/// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
+/// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
+/// source is constant so it does not need to be loaded.
  /// It returns EVT::Other if the type should be determined using generic
  /// target-independent logic.
  EVT
  X86TargetLowering::getOptimalMemOpType(uint64_t Size,
                                         unsigned DstAlign, unsigned SrcAlign,
-                                       bool IsZeroVal,
+                                       bool IsMemset, bool ZeroMemset,
                                         bool MemcpyStrSrc,
                                         MachineFunction &MF) const {
    const Function *F = MF.getFunction();
-  if (IsZeroVal &&
-      !F->getFnAttributes().hasAttribute(Attributes::NoImplicitFloat)) {
+  if ((!IsMemset || ZeroMemset) &&
+      !F->getFnAttributes().hasAttribute(Attribute::NoImplicitFloat)) {
      if (Size >= 16 &&
          (Subtarget->isUnalignedMemAccessFast() ||
           ((DstAlign == 0 || DstAlign >= 16) &&
            (SrcAlign == 0 || SrcAlign >= 16)))) {
        if (Size >= 32) {
-        if (Subtarget->hasAVX2())
+        if (Subtarget->hasInt256())
            return MVT::v8i32;
-        if (Subtarget->hasAVX())
+        if (Subtarget->hasFp256())
            return MVT::v8f32;
        }
        if (Subtarget->hasSSE2())
@@ -1396,6 +1411,21 @@ X86TargetLowering::getOptimalMemOpType(uint64_t Size,
    return MVT::i32;
  }
  
+bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
+  if (VT == MVT::f32)
+    return X86ScalarSSEf32;
+  else if (VT == MVT::f64)
+    return X86ScalarSSEf64;
+  return true;
+}
+
+bool
+X86TargetLowering::allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const {
+  if (Fast)
+    *Fast = Subtarget->isUnalignedMemAccessFast();
+  return true;
+}
+
  /// getJumpTableEncoding - Return the entry encoding for a jump table in the
  /// current function.  The returned value is a member of the
  /// MachineJumpTableInfo::JTEntryKind enum.
@@ -1449,10 +1479,10 @@ getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
  
  // FIXME: Why this routine is here? Move to RegInfo!
  std::pair<const TargetRegisterClass*, uint8_t>
-X86TargetLowering::findRepresentativeClass(EVT VT) const{
+X86TargetLowering::findRepresentativeClass(MVT VT) const{
    const TargetRegisterClass *RRC = 0;
    uint8_t Cost = 1;
-  switch (VT.getSimpleVT().SimpleTy) {
+  switch (VT.SimpleTy) {
    default:
      return TargetLowering::findRepresentativeClass(VT);
    case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
@@ -1666,8 +1696,8 @@ bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
    return true;
  }
  
-EVT
-X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT,
+MVT
+X86TargetLowering::getTypeForExtArgOrReturn(MVT VT,
                                              ISD::NodeType ExtendKind) const {
    MVT ReturnMVT;
    // TODO: Is this also valid on 32-bit?
@@ -1676,7 +1706,7 @@ X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT,
    else
      ReturnMVT = MVT::i32;
  
-  EVT MinVT = getRegisterType(Context, ReturnMVT);
+  MVT MinVT = getRegisterType(ReturnMVT);
    return VT.bitsLT(MinVT) ? MinVT : VT;
  }
  
@@ -1806,7 +1836,8 @@ CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
  /// IsTailCallConvention - Return true if the calling convention is one that
  /// supports tail call optimization.
  static bool IsTailCallConvention(CallingConv::ID CC) {
-  return (CC == CallingConv::Fast || CC == CallingConv::GHC);
+  return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
+          CC == CallingConv::HiPE);
  }
  
  bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
@@ -1893,7 +1924,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
    bool IsWin64 = Subtarget->isTargetWin64();
  
    assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
-         "Var args not supported with calling convention fastcc or ghc");
+         "Var args not supported with calling convention fastcc, ghc or hipe");
  
    // Assign locations to all of the incoming arguments.
    SmallVector<CCValAssign, 16> ArgLocs;
@@ -2035,7 +2066,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
                                                         TotalNumIntRegs);
  
        bool NoImplicitFloatOps = Fn->getFnAttributes().
-        hasAttribute(Attributes::NoImplicitFloat);
+        hasAttribute(Attribute::NoImplicitFloat);
        assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
               "SSE register cannot be used when SSE is disabled!");
        assert(!(NumXMMRegs && MF.getTarget().Options.UseSoftFloat &&
@@ -2238,7 +2269,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
    }
  
    assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
-         "Var args not supported with calling convention fastcc or ghc");
+         "Var args not supported with calling convention fastcc, ghc or hipe");
  
    // Analyze operands of the call, assigning locations to each operand.
    SmallVector<CCValAssign, 16> ArgLocs;
@@ -2514,7 +2545,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
        } else if (Subtarget->isPICStyleRIPRel() &&
                   isa<Function>(GV) &&
                   cast<Function>(GV)->getFnAttributes().
-                   hasAttribute(Attributes::NonLazyBind)) {
+                   hasAttribute(Attribute::NonLazyBind)) {
          // If the function is marked as non-lazy, generate an indirect call
          // which loads from the GOT directly. This avoids runtime overhead
          // at the cost of eager binding (and one extra byte of encoding).
@@ -3103,6 +3134,8 @@ bool X86::isCalleePop(CallingConv::ID CallingConv,
      return TailCallOpt;
    case CallingConv::GHC:
      return TailCallOpt;
+  case CallingConv::HiPE:
+    return TailCallOpt;
    }
  }
  
@@ -3233,9 +3266,7 @@ static bool isUndefOrInRange(int Val, int Low, int Hi) {
  /// isUndefOrEqual - Val is either less than zero (undef) or equal to the
  /// specified value.
  static bool isUndefOrEqual(int Val, int CmpVal) {
-  if (Val < 0 || Val == CmpVal)
-    return true;
-  return false;
+  return (Val < 0 || Val == CmpVal);
  }
  
  /// isSequentialOrUndefInRange - Return true if every element in Mask, beginning
@@ -3262,8 +3293,8 @@ static bool isPSHUFDMask(ArrayRef<int> Mask, EVT VT) {
  
  /// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
  /// is suitable for input to PSHUFHW.
-static bool isPSHUFHWMask(ArrayRef<int> Mask, EVT VT, bool HasAVX2) {
-  if (VT != MVT::v8i16 && (!HasAVX2 || VT != MVT::v16i16))
+static bool isPSHUFHWMask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
+  if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
      return false;
  
    // Lower quadword copied in order or undef.
@@ -3291,8 +3322,8 @@ static bool isPSHUFHWMask(ArrayRef<int> Mask, EVT VT, bool HasAVX2) {
  
  /// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
  /// is suitable for input to PSHUFLW.
-static bool isPSHUFLWMask(ArrayRef<int> Mask, EVT VT, bool HasAVX2) {
-  if (VT != MVT::v8i16 && (!HasAVX2 || VT != MVT::v16i16))
+static bool isPSHUFLWMask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
+  if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
      return false;
  
    // Upper quadword copied in order.
@@ -3323,7 +3354,7 @@ static bool isPSHUFLWMask(ArrayRef<int> Mask, EVT VT, bool HasAVX2) {
  static bool isPALIGNRMask(ArrayRef<int> Mask, EVT VT,
                            const X86Subtarget *Subtarget) {
    if ((VT.getSizeInBits() == 128 && !Subtarget->hasSSSE3()) ||
-      (VT.getSizeInBits() == 256 && !Subtarget->hasAVX2()))
+      (VT.getSizeInBits() == 256 && !Subtarget->hasInt256()))
      return false;
  
    unsigned NumElts = VT.getVectorNumElements();
@@ -3410,9 +3441,9 @@ static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask,
  /// specifies a shuffle of elements that is suitable for input to 128/256-bit
  /// SHUFPS and SHUFPD. If Commuted is true, then it checks for sources to be
  /// reverse of what x86 shuffles want.
-static bool isSHUFPMask(ArrayRef<int> Mask, EVT VT, bool HasAVX,
+static bool isSHUFPMask(ArrayRef<int> Mask, EVT VT, bool HasFp256,
                          bool Commuted = false) {
-  if (!HasAVX && VT.getSizeInBits() == 256)
+  if (!HasFp256 && VT.getSizeInBits() == 256)
      return false;
  
    unsigned NumElems = VT.getVectorNumElements();
@@ -3591,14 +3622,14 @@ SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp,
  /// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
  /// specifies a shuffle of elements that is suitable for input to UNPCKL.
  static bool isUNPCKLMask(ArrayRef<int> Mask, EVT VT,
-                         bool HasAVX2, bool V2IsSplat = false) {
+                         bool HasInt256, bool V2IsSplat = false) {
    unsigned NumElts = VT.getVectorNumElements();
  
    assert((VT.is128BitVector() || VT.is256BitVector()) &&
           "Unsupported vector type for unpckh");
  
    if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 &&
-      (!HasAVX2 || (NumElts != 16 && NumElts != 32)))
+      (!HasInt256 || (NumElts != 16 && NumElts != 32)))
      return false;
  
    // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
@@ -3630,14 +3661,14 @@ static bool isUNPCKLMask(ArrayRef<int> Mask, EVT VT,
  /// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
  /// specifies a shuffle of elements that is suitable for input to UNPCKH.
  static bool isUNPCKHMask(ArrayRef<int> Mask, EVT VT,
-                         bool HasAVX2, bool V2IsSplat = false) {
+                         bool HasInt256, bool V2IsSplat = false) {
    unsigned NumElts = VT.getVectorNumElements();
  
    assert((VT.is128BitVector() || VT.is256BitVector()) &&
           "Unsupported vector type for unpckh");
  
    if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 &&
-      (!HasAVX2 || (NumElts != 16 && NumElts != 32)))
+      (!HasInt256 || (NumElts != 16 && NumElts != 32)))
      return false;
  
    // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
@@ -3668,14 +3699,14 @@ static bool isUNPCKHMask(ArrayRef<int> Mask, EVT VT,
  /// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
  /// <0, 0, 1, 1>
  static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, EVT VT,
-                                  bool HasAVX2) {
+                                  bool HasInt256) {
    unsigned NumElts = VT.getVectorNumElements();
  
    assert((VT.is128BitVector() || VT.is256BitVector()) &&
           "Unsupported vector type for unpckh");
  
    if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 &&
-      (!HasAVX2 || (NumElts != 16 && NumElts != 32)))
+      (!HasInt256 || (NumElts != 16 && NumElts != 32)))
      return false;
  
    // For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern
@@ -3710,14 +3741,14 @@ static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, EVT VT,
  /// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
  /// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
  /// <2, 2, 3, 3>
-static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, EVT VT, bool HasAVX2) {
+static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
    unsigned NumElts = VT.getVectorNumElements();
  
    assert((VT.is128BitVector() || VT.is256BitVector()) &&
           "Unsupported vector type for unpckh");
  
    if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 &&
-      (!HasAVX2 || (NumElts != 16 && NumElts != 32)))
+      (!HasInt256 || (NumElts != 16 && NumElts != 32)))
      return false;
  
    // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
@@ -3766,8 +3797,8 @@ static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) {
  ///   vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15>
  /// The first half comes from the second half of V1 and the second half from the
  /// the second half of V2.
-static bool isVPERM2X128Mask(ArrayRef<int> Mask, EVT VT, bool HasAVX) {
-  if (!HasAVX || !VT.is256BitVector())
+static bool isVPERM2X128Mask(ArrayRef<int> Mask, EVT VT, bool HasFp256) {
+  if (!HasFp256 || !VT.is256BitVector())
      return false;
  
    // The shuffle result is divided into half A and half B. In total the two
@@ -3826,8 +3857,8 @@ static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
  /// to the same elements of the low, but to the higher half of the source.
  /// In VPERMILPD the two lanes could be shuffled independently of each other
  /// with the same restriction that lanes can't be crossed. Also handles PSHUFDY.
-static bool isVPERMILPMask(ArrayRef<int> Mask, EVT VT, bool HasAVX) {
-  if (!HasAVX)
+static bool isVPERMILPMask(ArrayRef<int> Mask, EVT VT, bool HasFp256) {
+  if (!HasFp256)
      return false;
  
    unsigned NumElts = VT.getVectorNumElements();
@@ -3927,8 +3958,8 @@ static bool isMOVSLDUPMask(ArrayRef<int> Mask, EVT VT,
  /// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand
  /// specifies a shuffle of elements that is suitable for input to 256-bit
  /// version of MOVDDUP.
-static bool isMOVDDUPYMask(ArrayRef<int> Mask, EVT VT, bool HasAVX) {
-  if (!HasAVX || !VT.is256BitVector())
+static bool isMOVDDUPYMask(ArrayRef<int> Mask, EVT VT, bool HasFp256) {
+  if (!HasFp256 || !VT.is256BitVector())
      return false;
  
    unsigned NumElts = VT.getVectorNumElements();
@@ -4333,7 +4364,7 @@ static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
        Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
      }
    } else if (Size == 256) { // AVX
-    if (Subtarget->hasAVX2()) { // AVX2
+    if (Subtarget->hasInt256()) { // AVX2
        SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
        SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
        Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, 8);
@@ -4354,7 +4385,7 @@ static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
  /// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
  /// no AVX2 supprt, use two <4 x i32> inserted in a <8 x i32> appropriately.
  /// Then bitcast to their original type, ensuring they get CSE'd.
-static SDValue getOnesVector(EVT VT, bool HasAVX2, SelectionDAG &DAG,
+static SDValue getOnesVector(EVT VT, bool HasInt256, SelectionDAG &DAG,
                               DebugLoc dl) {
    assert(VT.isVector() && "Expected a vector type");
    unsigned Size = VT.getSizeInBits();
@@ -4362,7 +4393,7 @@ static SDValue getOnesVector(EVT VT, bool HasAVX2, SelectionDAG &DAG,
    SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
    SDValue Vec;
    if (Size == 256) {
-    if (HasAVX2) { // AVX2
+    if (HasInt256) { // AVX2
        SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
        Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, 8);
      } else { // AVX
@@ -5063,7 +5094,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
  /// or SDValue() otherwise.
  SDValue
  X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const {
-  if (!Subtarget->hasAVX())
+  if (!Subtarget->hasFp256())
      return SDValue();
  
    EVT VT = Op.getValueType();
@@ -5109,7 +5140,7 @@ X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const {
        if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR &&
            Sc.getOpcode() != ISD::BUILD_VECTOR) {
  
-        if (!Subtarget->hasAVX2())
+        if (!Subtarget->hasInt256())
            return SDValue();
  
          // Use the register form of the broadcast instruction available on AVX2.
@@ -5136,7 +5167,7 @@ X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const {
    // Handle the broadcasting a single constant scalar from the constant pool
    // into a vector. On Sandybridge it is still better to load a constant vector
    // from the constant pool and not to broadcast it from a scalar.
-  if (ConstSplatVal && Subtarget->hasAVX2()) {
+  if (ConstSplatVal && Subtarget->hasInt256()) {
      EVT CVT = Ld.getValueType();
      assert(!CVT.isVector() && "Must not broadcast a vector type");
      unsigned ScalarSize = CVT.getSizeInBits();
@@ -5164,7 +5195,7 @@ X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const {
    unsigned ScalarSize = Ld.getValueType().getSizeInBits();
  
    // Handle AVX2 in-register broadcasts.
-  if (!IsLoad && Subtarget->hasAVX2() &&
+  if (!IsLoad && Subtarget->hasInt256() &&
        (ScalarSize == 32 || (Is256 && ScalarSize == 64)))
      return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
  
@@ -5177,7 +5208,7 @@ X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const {
  
    // The integer check is needed for the 64-bit into 128-bit so it doesn't match
    // double since there is no vbroadcastsd xmm
-  if (Subtarget->hasAVX2() && Ld.getValueType().isInteger()) {
+  if (Subtarget->hasInt256() && Ld.getValueType().isInteger()) {
      if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
        return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
    }
@@ -5282,10 +5313,10 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
    // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
    // vpcmpeqd on 256-bit vectors.
    if (ISD::isBuildVectorAllOnes(Op.getNode())) {
-    if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasAVX2()))
+    if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasInt256()))
        return Op;
  
-    return getOnesVector(VT, Subtarget->hasAVX2(), DAG, dl);
+    return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl);
    }
  
    SDValue Broadcast = LowerVectorBroadcast(Op, DAG);
@@ -5622,64 +5653,53 @@ LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
    SDValue V1 = SVOp->getOperand(0);
    SDValue V2 = SVOp->getOperand(1);
    DebugLoc dl = SVOp->getDebugLoc();
-  MVT VT = SVOp->getValueType(0).getSimpleVT();
+  EVT VT = SVOp->getValueType(0);
+  EVT EltVT = VT.getVectorElementType();
    unsigned NumElems = VT.getVectorNumElements();
  
-  if (!Subtarget->hasSSE41())
+  if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
+    return SDValue();
+  if (!Subtarget->hasInt256() && VT == MVT::v16i16)
      return SDValue();
  
-  unsigned ISDNo = 0;
-  MVT OpTy;
-
-  switch (VT.SimpleTy) {
-  default: return SDValue();
-  case MVT::v8i16:
-    ISDNo = X86ISD::BLENDPW;
-    OpTy = MVT::v8i16;
-    break;
-  case MVT::v4i32:
-  case MVT::v4f32:
-    ISDNo = X86ISD::BLENDPS;
-    OpTy = MVT::v4f32;
-    break;
-  case MVT::v2i64:
-  case MVT::v2f64:
-    ISDNo = X86ISD::BLENDPD;
-    OpTy = MVT::v2f64;
-    break;
-  case MVT::v8i32:
-  case MVT::v8f32:
-    if (!Subtarget->hasAVX())
-      return SDValue();
-    ISDNo = X86ISD::BLENDPS;
-    OpTy = MVT::v8f32;
-    break;
-  case MVT::v4i64:
-  case MVT::v4f64:
-    if (!Subtarget->hasAVX())
-      return SDValue();
-    ISDNo = X86ISD::BLENDPD;
-    OpTy = MVT::v4f64;
-    break;
-  }
-  assert(ISDNo && "Invalid Op Number");
+  // Check the mask for BLEND and build the value.
+  unsigned MaskValue = 0;
+  // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
+  unsigned NumLanes = (NumElems-1)/8 + 1; 
+  unsigned NumElemsInLane = NumElems / NumLanes;
  
-  unsigned MaskVals = 0;
+  // Blend for v16i16 should be symetric for the both lanes.
+  for (unsigned i = 0; i < NumElemsInLane; ++i) {
  
-  for (unsigned i = 0; i != NumElems; ++i) {
+    int SndLaneEltIdx = (NumLanes == 2) ? 
+      SVOp->getMaskElt(i + NumElemsInLane) : -1;
      int EltIdx = SVOp->getMaskElt(i);
-    if (EltIdx == (int)i || EltIdx < 0)
-      MaskVals |= (1<<i);
-    else if (EltIdx == (int)(i + NumElems))
-      continue; // Bit is set to zero;
-    else
+
+    if ((EltIdx == -1 || EltIdx == (int)i) && 
+        (SndLaneEltIdx == -1 || SndLaneEltIdx == (int)(i + NumElemsInLane)))
+      continue;
+
+    if (((unsigned)EltIdx == (i + NumElems)) && 
+        (SndLaneEltIdx == -1 || 
+         (unsigned)SndLaneEltIdx == i + NumElems + NumElemsInLane))
+      MaskValue |= (1<<i);
+    else 
        return SDValue();
    }
  
-  V1 = DAG.getNode(ISD::BITCAST, dl, OpTy, V1);
-  V2 = DAG.getNode(ISD::BITCAST, dl, OpTy, V2);
-  SDValue Ret =  DAG.getNode(ISDNo, dl, OpTy, V1, V2,
-                             DAG.getConstant(MaskVals, MVT::i32));
+  // Convert i32 vectors to floating point if it is not AVX2.
+  // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors.
+  EVT BlendVT = VT;
+  if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
+    BlendVT = EVT::getVectorVT(*DAG.getContext(), 
+                              EVT::getFloatingPointVT(EltVT.getSizeInBits()), 
+                              NumElems);
+    V1 = DAG.getNode(ISD::BITCAST, dl, VT, V1);
+    V2 = DAG.getNode(ISD::BITCAST, dl, VT, V2);
+  }
+  
+  SDValue Ret =  DAG.getNode(X86ISD::BLENDI, dl, BlendVT, V1, V2,
+                             DAG.getConstant(MaskValue, MVT::i32));
    return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
  }
  
@@ -6079,7 +6099,7 @@ SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp,
    // (1) one of input vector is undefined or zeroinitializer.
    // The mask value 0x80 puts 0 in the corresponding slot of the vector.
    // And (2) the mask indexes don't cross the 128-bit lane.
-  if (VT != MVT::v32i8 || !Subtarget->hasAVX2() ||
+  if (VT != MVT::v32i8 || !Subtarget->hasInt256() ||
        (!V2IsUndef && !V2IsAllZero && !V1IsAllZero))
      return SDValue();
  
@@ -6452,23 +6472,6 @@ static bool MayFoldVectorLoad(SDValue V) {
    return MayFoldLoad(V);
  }
  
-// FIXME: the version above should always be used. Since there's
-// a bug where several vector shuffles can't be folded because the
-// DAG is not updated during lowering and a node claims to have two
-// uses while it only has one, use this version, and let isel match
-// another instruction if the load really happens to have more than
-// one use. Remove this version after this bug get fixed.
-// rdar://8434668, PR8156
-static bool RelaxedMayFoldVectorLoad(SDValue V) {
-  if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
-    V = V.getOperand(0);
-  if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR)
-    V = V.getOperand(0);
-  if (ISD::isNormalLoad(V.getNode()))
-    return true;
-  return false;
-}
-
  static
  SDValue getMOVDDup(SDValue &Op, DebugLoc &dl, SDValue V1, SelectionDAG &DAG) {
    EVT VT = Op.getValueType();
@@ -6582,7 +6585,7 @@ X86TargetLowering::lowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const {
    EVT VT = Op.getValueType();
  
    // Only AVX2 support 256-bit vector integer extending.
-  if (!Subtarget->hasAVX2() && VT.is256BitVector())
+  if (!Subtarget->hasInt256() && VT.is256BitVector())
      return SDValue();
  
    ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
@@ -6670,7 +6673,7 @@ X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const {
  
      // Handle splats by matching through known shuffle masks
      if ((Size == 128 && NumElem <= 4) ||
-        (Size == 256 && NumElem < 8))
+        (Size == 256 && NumElem <= 8))
        return SDValue();
  
      // All remaning splats are promoted to target supported vector shuffles.
@@ -6728,11 +6731,11 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
    bool V1IsSplat = false;
    bool V2IsSplat = false;
    bool HasSSE2 = Subtarget->hasSSE2();
-  bool HasAVX    = Subtarget->hasAVX();
-  bool HasAVX2   = Subtarget->hasAVX2();
+  bool HasFp256    = Subtarget->hasFp256();
+  bool HasInt256   = Subtarget->hasInt256();
    MachineFunction &MF = DAG.getMachineFunction();
    bool OptForSize = MF.getFunction()->getFnAttributes().
-    hasAttribute(Attributes::OptimizeForSize);
+    hasAttribute(Attribute::OptimizeForSize);
  
    assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles");
  
@@ -6766,20 +6769,20 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
  
    // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and
    // unpckh_undef). Only use pshufd if speed is more important than size.
-  if (OptForSize && isUNPCKL_v_undef_Mask(M, VT, HasAVX2))
+  if (OptForSize && isUNPCKL_v_undef_Mask(M, VT, HasInt256))
      return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
-  if (OptForSize && isUNPCKH_v_undef_Mask(M, VT, HasAVX2))
+  if (OptForSize && isUNPCKH_v_undef_Mask(M, VT, HasInt256))
      return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
  
    if (isMOVDDUPMask(M, VT) && Subtarget->hasSSE3() &&
-      V2IsUndef && RelaxedMayFoldVectorLoad(V1))
+      V2IsUndef && MayFoldVectorLoad(V1))
      return getMOVDDup(Op, dl, V1, DAG);
  
    if (isMOVHLPS_v_undef_Mask(M, VT))
      return getMOVHighToLow(Op, dl, DAG);
  
    // Use to match splats
-  if (HasSSE2 && isUNPCKHMask(M, VT, HasAVX2) && V2IsUndef &&
+  if (HasSSE2 && isUNPCKHMask(M, VT, HasInt256) && V2IsUndef &&
        (VT == MVT::v2f64 || VT == MVT::v2i64))
      return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
  
@@ -6792,12 +6795,13 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
  
      unsigned TargetMask = getShuffleSHUFImmediate(SVOp);
  
-    if (HasAVX && (VT == MVT::v4f32 || VT == MVT::v2f64))
-      return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1, TargetMask, DAG);
-
      if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32))
        return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG);
  
+    if (HasFp256 && (VT == MVT::v4f32 || VT == MVT::v2f64))
+      return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1, TargetMask,
+                                  DAG);
+
      return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V1,
                                  TargetMask, DAG);
    }
@@ -6828,7 +6832,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
    }
  
    // FIXME: fold these into legal mask.
-  if (isMOVLHPSMask(M, VT) && !isUNPCKLMask(M, VT, HasAVX2))
+  if (isMOVLHPSMask(M, VT) && !isUNPCKLMask(M, VT, HasInt256))
      return getMOVLowToHigh(Op, dl, DAG, HasSSE2);
  
    if (isMOVHLPSMask(M, VT))
@@ -6878,10 +6882,10 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
      return getMOVL(DAG, dl, VT, V2, V1);
    }
  
-  if (isUNPCKLMask(M, VT, HasAVX2))
+  if (isUNPCKLMask(M, VT, HasInt256))
      return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
  
-  if (isUNPCKHMask(M, VT, HasAVX2))
+  if (isUNPCKHMask(M, VT, HasInt256))
      return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
  
    if (V2IsSplat) {
@@ -6890,9 +6894,9 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
      // new vector_shuffle with the corrected mask.p
      SmallVector<int, 8> NewMask(M.begin(), M.end());
      NormalizeMask(NewMask, NumElems);
-    if (isUNPCKLMask(NewMask, VT, HasAVX2, true))
+    if (isUNPCKLMask(NewMask, VT, HasInt256, true))
        return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
-    if (isUNPCKHMask(NewMask, VT, HasAVX2, true))
+    if (isUNPCKHMask(NewMask, VT, HasInt256, true))
        return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
    }
  
@@ -6904,15 +6908,15 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
      std::swap(V1IsSplat, V2IsSplat);
      Commuted = false;
  
-    if (isUNPCKLMask(M, VT, HasAVX2))
+    if (isUNPCKLMask(M, VT, HasInt256))
        return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
  
-    if (isUNPCKHMask(M, VT, HasAVX2))
+    if (isUNPCKHMask(M, VT, HasInt256))
        return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
    }
  
    // Normalize the node to match x86 shuffle ops if needed
-  if (!V2IsUndef && (isSHUFPMask(M, VT, HasAVX, /* Commuted */ true)))
+  if (!V2IsUndef && (isSHUFPMask(M, VT, HasFp256, /* Commuted */ true)))
      return CommuteVectorShuffle(SVOp, DAG);
  
    // The checks below are all present in isShuffleMaskLegal, but they are
@@ -6930,23 +6934,23 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
        return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
    }
  
-  if (isPSHUFHWMask(M, VT, HasAVX2))
+  if (isPSHUFHWMask(M, VT, HasInt256))
      return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1,
                                  getShufflePSHUFHWImmediate(SVOp),
                                  DAG);
  
-  if (isPSHUFLWMask(M, VT, HasAVX2))
+  if (isPSHUFLWMask(M, VT, HasInt256))
      return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1,
                                  getShufflePSHUFLWImmediate(SVOp),
                                  DAG);
  
-  if (isSHUFPMask(M, VT, HasAVX))
+  if (isSHUFPMask(M, VT, HasFp256))
      return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V2,
                                  getShuffleSHUFImmediate(SVOp), DAG);
  
-  if (isUNPCKL_v_undef_Mask(M, VT, HasAVX2))
+  if (isUNPCKL_v_undef_Mask(M, VT, HasInt256))
      return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
-  if (isUNPCKH_v_undef_Mask(M, VT, HasAVX2))
+  if (isUNPCKH_v_undef_Mask(M, VT, HasInt256))
      return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
  
    //===--------------------------------------------------------------------===//
@@ -6955,12 +6959,12 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
    //
  
    // Handle VMOVDDUPY permutations
-  if (V2IsUndef && isMOVDDUPYMask(M, VT, HasAVX))
+  if (V2IsUndef && isMOVDDUPYMask(M, VT, HasFp256))
      return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG);
  
    // Handle VPERMILPS/D* permutations
-  if (isVPERMILPMask(M, VT, HasAVX)) {
-    if (HasAVX2 && VT == MVT::v8i32)
+  if (isVPERMILPMask(M, VT, HasFp256)) {
+    if (HasInt256 && VT == MVT::v8i32)
        return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1,
                                    getShuffleSHUFImmediate(SVOp), DAG);
      return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1,
@@ -6968,7 +6972,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
    }
  
    // Handle VPERM2F128/VPERM2I128 permutations
-  if (isVPERM2X128Mask(M, VT, HasAVX))
+  if (isVPERM2X128Mask(M, VT, HasFp256))
      return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1,
                                  V2, getShuffleVPERM2X128Immediate(SVOp), DAG);
  
@@ -6976,7 +6980,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
    if (BlendOp.getNode())
      return BlendOp;
  
-  if (V2IsUndef && HasAVX2 && (VT == MVT::v8i32 || VT == MVT::v8f32)) {
+  if (V2IsUndef && HasInt256 && (VT == MVT::v8i32 || VT == MVT::v8f32)) {
      SmallVector<SDValue, 8> permclMask;
      for (unsigned i = 0; i != 8; ++i) {
        permclMask.push_back(DAG.getConstant((M[i]>=0) ? M[i] : 0, MVT::i32));
@@ -6988,7 +6992,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
                         DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1);
    }
  
-  if (V2IsUndef && HasAVX2 && (VT == MVT::v4i64 || VT == MVT::v4f64))
+  if (V2IsUndef && HasInt256 && (VT == MVT::v4i64 || VT == MVT::v4f64))
      return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1,
                                  getShuffleCLImmediate(SVOp), DAG);
  
@@ -7323,7 +7327,7 @@ static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
  // upper bits of a vector.
  static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
                                        SelectionDAG &DAG) {
-  if (Subtarget->hasAVX()) {
+  if (Subtarget->hasFp256()) {
      DebugLoc dl = Op.getNode()->getDebugLoc();
      SDValue Vec = Op.getNode()->getOperand(0);
      SDValue Idx = Op.getNode()->getOperand(1);
@@ -7343,7 +7347,7 @@ static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
  // the upper bits of a vector.
  static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
                                       SelectionDAG &DAG) {
-  if (Subtarget->hasAVX()) {
+  if (Subtarget->hasFp256()) {
      DebugLoc dl = Op.getNode()->getDebugLoc();
      SDValue Vec = Op.getNode()->getOperand(0);
      SDValue SubVec = Op.getNode()->getOperand(1);
@@ -8305,10 +8309,10 @@ SDValue X86TargetLowering::lowerZERO_EXTEND(SDValue Op, SelectionDAG &DAG) const
        VT.getVectorNumElements() != SVT.getVectorNumElements())
      return SDValue();
  
-  assert(Subtarget->hasAVX() && "256-bit vector is observed without AVX!");
+  assert(Subtarget->hasFp256() && "256-bit vector is observed without AVX!");
  
    // AVX2 has better support of integer extending.
-  if (Subtarget->hasAVX2())
+  if (Subtarget->hasInt256())
      return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
  
    SDValue Lo = DAG.getNode(X86ISD::VZEXT, DL, MVT::v4i32, In);
@@ -8328,7 +8332,7 @@ SDValue X86TargetLowering::lowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
        VT.getVectorNumElements() != SVT.getVectorNumElements())
      return SDValue();
  
-  assert(Subtarget->hasAVX() && "256-bit vector is observed without AVX!");
+  assert(Subtarget->hasFp256() && "256-bit vector is observed without AVX!");
  
    unsigned NumElems = VT.getVectorNumElements();
    EVT NVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
@@ -8899,6 +8903,11 @@ SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
    return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
  }
  
+static bool isAllOnes(SDValue V) {
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
+  return C && C->isAllOnesValue();
+}
+
  /// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node
  /// if it's possible.
  SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
@@ -8947,6 +8956,14 @@ SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
    }
  
    if (LHS.getNode()) {
+    // If the LHS is of the form (x ^ -1) then replace the LHS with x and flip
+    // the condition code later.
+    bool Invert = false;
+    if (LHS.getOpcode() == ISD::XOR && isAllOnes(LHS.getOperand(1))) {
+      Invert = true;
+      LHS = LHS.getOperand(0);
+    }
+
      // If LHS is i8, promote it to i32 with any_extend.  There is no i8 BT
      // instruction.  Since the shift amount is in-range-or-undefined, we know
      // that doing a bittest on the i32 value is ok.  We extend to i32 because
@@ -8962,7 +8979,10 @@ SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
        RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS);
  
      SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS);
-    unsigned Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
+    X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
+    // Flip the condition if the LHS was a not instruction
+    if (Invert)
+      Cond = X86::GetOppositeBranchCondition(Cond);
      return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
                         DAG.getConstant(Cond, MVT::i8), BT);
    }
@@ -9133,7 +9153,7 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const {
    }
  
    // Break 256-bit integer vector compare into smaller ones.
-  if (VT.is256BitVector() && !Subtarget->hasAVX2())
+  if (VT.is256BitVector() && !Subtarget->hasInt256())
      return Lower256IntVSETCC(Op, DAG);
  
    // We are handling one of the integer comparisons here.  Since SSE only has
@@ -9220,11 +9240,6 @@ static bool isZero(SDValue V) {
    return C && C->isNullValue();
  }
  
-static bool isAllOnes(SDValue V) {
-  ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
-  return C && C->isAllOnesValue();
-}
-
  static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
    if (V.getOpcode() != ISD::TRUNCATE)
      return false;
@@ -9877,7 +9892,7 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
      assert(!getTargetMachine().Options.UseSoftFloat &&
             !(DAG.getMachineFunction()
                  .getFunction()->getFnAttributes()
-                .hasAttribute(Attributes::NoImplicitFloat)) &&
+                .hasAttribute(Attribute::NoImplicitFloat)) &&
             Subtarget->hasSSE1());
    }
  
@@ -10082,6 +10097,14 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
      return DAG.getNode(X86ISD::PMULUDQ, dl, Op.getValueType(),
                         Op.getOperand(1), Op.getOperand(2));
  
+  // SSE2/AVX2 sub with unsigned saturation intrinsics
+  case Intrinsic::x86_sse2_psubus_b:
+  case Intrinsic::x86_sse2_psubus_w:
+  case Intrinsic::x86_avx2_psubus_b:
+  case Intrinsic::x86_avx2_psubus_w:
+    return DAG.getNode(X86ISD::SUBUS, dl, Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2));
+
    // SSE3/AVX horizontal add/sub intrinsics
    case Intrinsic::x86_sse3_hadd_ps:
    case Intrinsic::x86_sse3_hadd_pd:
@@ -10710,7 +10733,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
  
        // Check that ECX wasn't needed by an 'inreg' parameter.
        FunctionType *FTy = Func->getFunctionType();
-      const AttrListPtr &Attrs = Func->getAttributes();
+      const AttributeSet &Attrs = Func->getAttributes();
  
        if (!Attrs.isEmpty() && !Func->isVarArg()) {
          unsigned InRegCount = 0;
@@ -10718,7 +10741,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
  
          for (FunctionType::param_iterator I = FTy->param_begin(),
               E = FTy->param_end(); I != E; ++I, ++Idx)
-          if (Attrs.getParamAttributes(Idx).hasAttribute(Attributes::InReg))
+          if (Attrs.getParamAttributes(Idx).hasAttribute(Attribute::InReg))
              // FIXME: should only count parameters that are lowered to integers.
              InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32;
  
@@ -10973,7 +10996,7 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
    EVT VT = Op.getValueType();
  
    // Decompose 256-bit ops into smaller 128-bit ops.
-  if (VT.is256BitVector() && !Subtarget->hasAVX2())
+  if (VT.is256BitVector() && !Subtarget->hasInt256())
      return Lower256IntArith(Op, DAG);
  
    assert((VT == MVT::v2i64 || VT == MVT::v4i64) &&
@@ -11036,7 +11059,7 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
        uint64_t ShiftAmt = C->getZExtValue();
  
        if (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 ||
-          (Subtarget->hasAVX2() &&
+          (Subtarget->hasInt256() &&
             (VT == MVT::v4i64 || VT == MVT::v8i32 || VT == MVT::v16i16))) {
          if (Op.getOpcode() == ISD::SHL)
            return DAG.getNode(X86ISD::VSHLI, dl, VT, R,
@@ -11093,7 +11116,7 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
          llvm_unreachable("Unknown shift opcode.");
        }
  
-      if (Subtarget->hasAVX2() && VT == MVT::v32i8) {
+      if (Subtarget->hasInt256() && VT == MVT::v32i8) {
          if (Op.getOpcode() == ISD::SHL) {
            // Make a large shift.
            SDValue SHL = DAG.getNode(X86ISD::VSHLI, dl, MVT::v16i16, R,
@@ -11336,9 +11359,9 @@ SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
      default: return SDValue();
      case MVT::v8i32:
      case MVT::v16i16:
-      if (!Subtarget->hasAVX())
+      if (!Subtarget->hasFp256())
          return SDValue();
-      if (!Subtarget->hasAVX2()) {
+      if (!Subtarget->hasInt256()) {
          // needs to be split
          unsigned NumElems = VT.getVectorNumElements();
  
@@ -11727,6 +11750,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
                                             SmallVectorImpl<SDValue>&Results,
                                             SelectionDAG &DAG) const {
    DebugLoc dl = N->getDebugLoc();
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    switch (N->getOpcode()) {
    default:
      llvm_unreachable("Do not know how to custom type legalize this operation!");
@@ -11776,6 +11800,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
      return;
    }
    case ISD::FP_ROUND: {
+    if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
+        return;
      SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
      Results.push_back(V);
      return;
@@ -11942,9 +11968,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
    case X86ISD::ANDNP:              return "X86ISD::ANDNP";
    case X86ISD::PSIGN:              return "X86ISD::PSIGN";
    case X86ISD::BLENDV:             return "X86ISD::BLENDV";
-  case X86ISD::BLENDPW:            return "X86ISD::BLENDPW";
-  case X86ISD::BLENDPS:            return "X86ISD::BLENDPS";
-  case X86ISD::BLENDPD:            return "X86ISD::BLENDPD";
+  case X86ISD::BLENDI:             return "X86ISD::BLENDI";
+  case X86ISD::SUBUS:              return "X86ISD::SUBUS";
    case X86ISD::HADD:               return "X86ISD::HADD";
    case X86ISD::HSUB:               return "X86ISD::HSUB";
    case X86ISD::FHADD:              return "X86ISD::FHADD";
@@ -12001,7 +12026,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
    case X86ISD::OR:                 return "X86ISD::OR";
    case X86ISD::XOR:                return "X86ISD::XOR";
    case X86ISD::AND:                return "X86ISD::AND";
-  case X86ISD::ANDN:               return "X86ISD::ANDN";
    case X86ISD::BLSI:               return "X86ISD::BLSI";
    case X86ISD::BLSMSK:             return "X86ISD::BLSMSK";
    case X86ISD::BLSR:               return "X86ISD::BLSR";
@@ -12144,6 +12168,30 @@ bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
    return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit();
  }
  
+bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
+  EVT VT1 = Val.getValueType();
+  if (isZExtFree(VT1, VT2))
+    return true;
+
+  if (Val.getOpcode() != ISD::LOAD)
+    return false;
+
+  if (!VT1.isSimple() || !VT1.isInteger() ||
+      !VT2.isSimple() || !VT2.isInteger())
+    return false;
+
+  switch (VT1.getSimpleVT().SimpleTy) {
+  default: break;
+  case MVT::i8:
+  case MVT::i16:
+  case MVT::i32:
+    // X86 has 8, 16, and 32-bit zero-extending loads.
+    return true;
+  }
+
+  return false;
+}
+
  bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
    // i16 instructions are longer (0x66 prefix) and potentially slower.
    return !(VT1 == MVT::i32 && VT2 == MVT::i16);
@@ -12164,15 +12212,15 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
    return (VT.getVectorNumElements() == 2 ||
            ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
            isMOVLMask(M, VT) ||
-          isSHUFPMask(M, VT, Subtarget->hasAVX()) ||
+          isSHUFPMask(M, VT, Subtarget->hasFp256()) ||
            isPSHUFDMask(M, VT) ||
-          isPSHUFHWMask(M, VT, Subtarget->hasAVX2()) ||
-          isPSHUFLWMask(M, VT, Subtarget->hasAVX2()) ||
+          isPSHUFHWMask(M, VT, Subtarget->hasInt256()) ||
+          isPSHUFLWMask(M, VT, Subtarget->hasInt256()) ||
            isPALIGNRMask(M, VT, Subtarget) ||
-          isUNPCKLMask(M, VT, Subtarget->hasAVX2()) ||
-          isUNPCKHMask(M, VT, Subtarget->hasAVX2()) ||
-          isUNPCKL_v_undef_Mask(M, VT, Subtarget->hasAVX2()) ||
-          isUNPCKH_v_undef_Mask(M, VT, Subtarget->hasAVX2()));
+          isUNPCKLMask(M, VT, Subtarget->hasInt256()) ||
+          isUNPCKHMask(M, VT, Subtarget->hasInt256()) ||
+          isUNPCKL_v_undef_Mask(M, VT, Subtarget->hasInt256()) ||
+          isUNPCKH_v_undef_Mask(M, VT, Subtarget->hasInt256()));
  }
  
  bool
@@ -12185,8 +12233,8 @@ X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
    if (NumElts == 4 && VT.is128BitVector()) {
      return (isMOVLMask(Mask, VT)  ||
              isCommutedMOVLMask(Mask, VT, true) ||
-            isSHUFPMask(Mask, VT, Subtarget->hasAVX()) ||
-            isSHUFPMask(Mask, VT, Subtarget->hasAVX(), /* Commuted */ true));
+            isSHUFPMask(Mask, VT, Subtarget->hasFp256()) ||
+            isSHUFPMask(Mask, VT, Subtarget->hasFp256(), /* Commuted */ true));
    }
    return false;
  }
@@ -13239,7 +13287,7 @@ X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
      MBB->addSuccessor(EndMBB);
    }
  
-  unsigned MOVOpc = Subtarget->hasAVX() ? X86::VMOVAPSmr : X86::MOVAPSmr;
+  unsigned MOVOpc = Subtarget->hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
    // In the XMM save block, save all the XMM argument registers.
    for (int i = 3, e = MI->getNumOperands(); i != e; ++i) {
      int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
@@ -14260,7 +14308,7 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
      return SDValue();
  
    // Combine 256-bit vector shuffles. This is only profitable when in AVX mode
-  if (Subtarget->hasAVX() && VT.is256BitVector() &&
+  if (Subtarget->hasFp256() && VT.is256BitVector() &&
        N->getOpcode() == ISD::VECTOR_SHUFFLE)
      return PerformShuffleCombine256(N, DAG, DCI, Subtarget);
  
@@ -14288,7 +14336,7 @@ static SDValue PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,
    if (!DCI.isBeforeLegalizeOps())
      return SDValue();
  
-  if (!Subtarget->hasAVX())
+  if (!Subtarget->hasFp256())
      return SDValue();
  
    EVT VT = N->getValueType(0);
@@ -14298,7 +14346,7 @@ static SDValue PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,
  
    if ((VT == MVT::v4i32) && (OpVT == MVT::v4i64)) {
  
-    if (Subtarget->hasAVX2()) {
+    if (Subtarget->hasInt256()) {
        // AVX2: v4i64 -> v4i32
  
        // VPERMD
@@ -14337,7 +14385,7 @@ static SDValue PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,
  
    if ((VT == MVT::v8i16) && (OpVT == MVT::v8i32)) {
  
-    if (Subtarget->hasAVX2()) {
+    if (Subtarget->hasInt256()) {
        // AVX2: v8i32 -> v8i16
  
        Op = DAG.getNode(ISD::BITCAST, dl, MVT::v32i8, Op);
@@ -14873,6 +14921,65 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
      }
    }
  
+  // Match VSELECTs into subs with unsigned saturation.
+  if (!DCI.isBeforeLegalize() &&
+      N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
+      // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
+      ((Subtarget->hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
+       (Subtarget->hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
+    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+
+    // Check if one of the arms of the VSELECT is a zero vector. If it's on the
+    // left side invert the predicate to simplify logic below.
+    SDValue Other;
+    if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
+      Other = RHS;
+      CC = ISD::getSetCCInverse(CC, true);
+    } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
+      Other = LHS;
+    }
+
+    if (Other.getNode() && Other->getNumOperands() == 2 &&
+        DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
+      SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
+      SDValue CondRHS = Cond->getOperand(1);
+
+      // Look for a general sub with unsigned saturation first.
+      // x >= y ? x-y : 0 --> subus x, y
+      // x >  y ? x-y : 0 --> subus x, y
+      if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
+          Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
+        return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
+
+      // If the RHS is a constant we have to reverse the const canonicalization.
+      // x > C-1 ? x+-C : 0 --> subus x, C
+      if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
+          isSplatVector(CondRHS.getNode()) && isSplatVector(OpRHS.getNode())) {
+        APInt A = cast<ConstantSDNode>(OpRHS.getOperand(0))->getAPIntValue();
+        if (CondRHS.getConstantOperandVal(0) == -A-1) {
+          SmallVector<SDValue, 32> V(VT.getVectorNumElements(),
+                                     DAG.getConstant(-A, VT.getScalarType()));
+          return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS,
+                             DAG.getNode(ISD::BUILD_VECTOR, DL, VT,
+                                         V.data(), V.size()));
+        }
+      }
+
+      // Another special case: If C was a sign bit, the sub has been
+      // canonicalized into a xor.
+      // FIXME: Would it be better to use ComputeMaskedBits to determine whether
+      //        it's safe to decanonicalize the xor?
+      // x s< 0 ? x^C : 0 --> subus x, C
+      if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
+          ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
+          isSplatVector(OpRHS.getNode())) {
+        APInt A = cast<ConstantSDNode>(OpRHS.getOperand(0))->getAPIntValue();
+        if (A.isSignBit())
+          return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
+      }
+    }
+  }
+
    // If we know that this node is legal then we know that it is going to be
    // matched by one of the SSE/AVX BLEND instructions. These instructions only
    // depend on the highest bit in each word. Try to use SimplifyDemandedBits
@@ -15297,7 +15404,7 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
      return SDValue();
  
    if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
-      (!Subtarget->hasAVX2() ||
+      (!Subtarget->hasInt256() ||
         (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16)))
      return SDValue();
  
@@ -15524,7 +15631,7 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
  
    EVT VT = N->getValueType(0);
  
-  // Create ANDN, BLSI, and BLSR instructions
+  // Create BLSI, and BLSR instructions
    // BLSI is X & (-X)
    // BLSR is X & (X-1)
    if (Subtarget->hasBMI() && (VT == MVT::i32 || VT == MVT::i64)) {
@@ -15532,13 +15639,6 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
      SDValue N1 = N->getOperand(1);
      DebugLoc DL = N->getDebugLoc();
  
-    // Check LHS for not
-    if (N0.getOpcode() == ISD::XOR && isAllOnes(N0.getOperand(1)))
-      return DAG.getNode(X86ISD::ANDN, DL, VT, N0.getOperand(0), N1);
-    // Check RHS for not
-    if (N1.getOpcode() == ISD::XOR && isAllOnes(N1.getOperand(1)))
-      return DAG.getNode(X86ISD::ANDN, DL, VT, N1.getOperand(0), N0);
-
      // Check LHS for neg
      if (N0.getOpcode() == ISD::SUB && N0.getOperand(1) == N1 &&
          isZero(N0.getOperand(0)))
@@ -15606,7 +15706,7 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
    // look for psign/blend
    if (VT == MVT::v2i64 || VT == MVT::v4i64) {
      if (!Subtarget->hasSSSE3() ||
-        (VT == MVT::v4i64 && !Subtarget->hasAVX2()))
+        (VT == MVT::v4i64 && !Subtarget->hasInt256()))
        return SDValue();
  
      // Canonicalize pandn to RHS
@@ -15652,6 +15752,11 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
  
        DebugLoc DL = N->getDebugLoc();
  
+      // We are going to replace the AND, OR, NAND with either BLEND
+      // or PSIGN, which only look at the MSB. The VSRAI instruction
+      // does not affect the highest bit, so we can get rid of it.
+      Mask = Mask.getOperand(0);
+
        // Now we know we at least have a plendvb with the mask val.  See if
        // we can form a psignb/w/d.
        // psign = x.type == y.type == mask.type && y = sub(0, x);
@@ -15660,7 +15765,7 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
            X.getValueType() == MaskVT && Y.getValueType() == MaskVT) {
          assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
                 "Unsupported VT for PSIGN");
-        Mask = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X, Mask.getOperand(0));
+        Mask = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X, Mask);
          return DAG.getNode(ISD::BITCAST, DL, VT, Mask);
        }
        // PBLENDVB only available on SSE 4.1
@@ -15824,10 +15929,13 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
  
    // If this is a vector EXT Load then attempt to optimize it using a
    // shuffle. We need SSSE3 shuffles.
+  // SEXT loads are suppoted starting SSE41.
+  // We generate X86ISD::VSEXT for them.
    // TODO: It is possible to support ZExt by zeroing the undef values
    // during the shuffle phase or after the shuffle.
    if (RegVT.isVector() && RegVT.isInteger() &&
-      Ext == ISD::EXTLOAD && Subtarget->hasSSSE3()) {
+      ((Ext == ISD::EXTLOAD && Subtarget->hasSSSE3()) ||
+       (Ext == ISD::SEXTLOAD && Subtarget->hasSSE41()))){
      assert(MemVT != RegVT && "Cannot extend to the same type");
      assert(MemVT.isVector() && "Must load a vector from memory");
  
@@ -15836,6 +15944,9 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
      unsigned MemSz = MemVT.getSizeInBits();
      assert(RegSz > MemSz && "Register size must be greater than the mem size");
  
+    if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget->hasInt256())
+      return SDValue();
+
      // All sizes must be a power of two.
      if (!isPowerOf2_32(RegSz * MemSz * NumElems))
        return SDValue();
@@ -15859,16 +15970,23 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
      // Calculate the number of scalar loads that we need to perform
      // in order to load our vector from memory.
      unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
+    if (Ext == ISD::SEXTLOAD && NumLoads > 1)
+      return SDValue();
+
+    unsigned loadRegZize = RegSz;
+    if (Ext == ISD::SEXTLOAD && RegSz == 256)
+      loadRegZize /= 2;
  
      // Represent our vector as a sequence of elements which are the
      // largest scalar that we can load.
      EVT LoadUnitVecVT = EVT::getVectorVT(*DAG.getContext(), SclrLoadTy,
-      RegSz/SclrLoadTy.getSizeInBits());
+      loadRegZize/SclrLoadTy.getSizeInBits());
  
      // Represent the data using the same element type that is stored in
      // memory. In practice, we ''widen'' MemVT.
-    EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
-                                  RegSz/MemVT.getScalarType().getSizeInBits());
+    EVT WideVecVT = 
+         EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
+                       loadRegZize/MemVT.getScalarType().getSizeInBits());
  
      assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
        "Invalid vector type");
@@ -15909,6 +16027,10 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
      SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Res);
      unsigned SizeRatio = RegSz/MemSz;
  
+    if (Ext == ISD::SEXTLOAD) {
+      SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec);
+      return DCI.CombineTo(N, Sext, TF, true);
+    }
      // Redistribute the loaded elements into the different locations.
      SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
      for (unsigned i = 0; i != NumElems; ++i)
@@ -15942,7 +16064,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
    // On Sandy Bridge, 256-bit memory operations are executed by two
    // 128-bit ports. However, on Haswell it is better to issue a single 256-bit
    // memory  operation.
-  if (VT.is256BitVector() && !Subtarget->hasAVX2() &&
+  if (VT.is256BitVector() && !Subtarget->hasInt256() &&
        StoredVal.getNode()->getOpcode() == ISD::CONCAT_VECTORS &&
        StoredVal.getNumOperands() == 2) {
      SDValue Value0 = StoredVal.getOperand(0);
@@ -16054,7 +16176,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
  
    const Function *F = DAG.getMachineFunction().getFunction();
    bool NoImplicitFloatOps = F->getFnAttributes().
-    hasAttribute(Attributes::NoImplicitFloat);
+    hasAttribute(Attribute::NoImplicitFloat);
    bool F64IsLegal = !DAG.getTarget().Options.UseSoftFloat && !NoImplicitFloatOps
                       && Subtarget->hasSSE2();
    if ((VT.isVector() ||
@@ -16290,7 +16412,7 @@ static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG,
  
    // Try to synthesize horizontal adds from adds of shuffles.
    if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
-       (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
+       (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
        isHorizontalBinOp(LHS, RHS, true))
      return DAG.getNode(X86ISD::FHADD, N->getDebugLoc(), VT, LHS, RHS);
    return SDValue();
@@ -16305,7 +16427,7 @@ static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG,
  
    // Try to synthesize horizontal subs from subs of shuffles.
    if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
-       (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
+       (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
        isHorizontalBinOp(LHS, RHS, false))
      return DAG.getNode(X86ISD::FHSUB, N->getDebugLoc(), VT, LHS, RHS);
    return SDValue();
@@ -16400,7 +16522,7 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
    if (!DCI.isBeforeLegalizeOps())
      return SDValue();
  
-  if (!Subtarget->hasAVX())
+  if (!Subtarget->hasFp256())
      return SDValue();
  
    EVT VT = N->getValueType(0);
@@ -16411,7 +16533,7 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
    if ((VT == MVT::v4i64 && OpVT == MVT::v4i32) ||
        (VT == MVT::v8i32 && OpVT == MVT::v8i16)) {
  
-    if (Subtarget->hasAVX2())
+    if (Subtarget->hasInt256())
        return DAG.getNode(X86ISD::VSEXT_MOVL, dl, VT, Op);
  
      // Optimize vectors in AVX mode
@@ -16531,13 +16653,13 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
    if (!DCI.isBeforeLegalizeOps())
      return SDValue();
  
-  if (!Subtarget->hasAVX())
+  if (!Subtarget->hasFp256())
      return SDValue();
  
    if (((VT == MVT::v8i32) && (OpVT == MVT::v8i16)) ||
        ((VT == MVT::v4i64) && (OpVT == MVT::v4i32)))  {
  
-    if (Subtarget->hasAVX2())
+    if (Subtarget->hasInt256())
        return DAG.getNode(X86ISD::VZEXT_MOVL, dl, VT, N0);
  
      SDValue ZeroVec = getZeroVector(OpVT, Subtarget, DAG, dl);
@@ -16763,7 +16885,7 @@ static SDValue PerformAddCombine(SDNode *N, SelectionDAG &DAG,
  
    // Try to synthesize horizontal adds from adds of shuffles.
    if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
-       (Subtarget->hasAVX2() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
+       (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
        isHorizontalBinOp(Op0, Op1, true))
      return DAG.getNode(X86ISD::HADD, N->getDebugLoc(), VT, Op0, Op1);
  
@@ -16796,7 +16918,7 @@ static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG,
    // Try to synthesize horizontal adds from adds of shuffles.
    EVT VT = N->getValueType(0);
    if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
-       (Subtarget->hasAVX2() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
+       (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
        isHorizontalBinOp(Op0, Op1, true))
      return DAG.getNode(X86ISD::HSUB, N->getDebugLoc(), VT, Op0, Op1);
  
@@ -17174,7 +17296,7 @@ TargetLowering::ConstraintWeight
    case 'x':
    case 'Y':
      if (((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasSSE1()) ||
-        ((type->getPrimitiveSizeInBits() == 256) && Subtarget->hasAVX()))
+        ((type->getPrimitiveSizeInBits() == 256) && Subtarget->hasFp256()))
        weight = CW_Register;
      break;
    case 'I':
@@ -17651,6 +17773,17 @@ FindInConvertTable(const X86TypeConversionCostTblEntry *Tbl, unsigned len,
    return -1;
  }
  
+ScalarTargetTransformInfo::PopcntHwSupport
+X86ScalarTargetTransformImpl::getPopcntHwSupport(unsigned TyWidth) const {
+  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
+  const X86Subtarget &ST = TLI->getTargetMachine().getSubtarget<X86Subtarget>();
+
+  // TODO: Currently the __builtin_popcount() implementation using SSE3
+  //   instructions is inefficient. Once the problem is fixed, we should
+  //   call ST.hasSSE3() instead of ST.hasSSE4().
+  return ST.hasSSE41() ? Fast : None;
+}
+
  unsigned
  X86VectorTargetTransformInfo::getArithmeticInstrCost(unsigned Opcode,
                                                       Type *Ty) const {
@@ -17685,6 +17818,30 @@ X86VectorTargetTransformInfo::getArithmeticInstrCost(unsigned Opcode,
    return VectorTargetTransformImpl::getArithmeticInstrCost(Opcode, Ty);
  }
  
+
+unsigned
+X86VectorTargetTransformInfo::getMemoryOpCost(unsigned Opcode, Type *Src,
+                                              unsigned Alignment,
+                                              unsigned AddressSpace) const {
+  // Legalize the type.
+  std::pair<unsigned, MVT> LT = getTypeLegalizationCost(Src);
+  assert(Opcode == Instruction::Load || Opcode == Instruction::Store &&
+         "Invalid Opcode");
+
+  const X86Subtarget &ST =
+  TLI->getTargetMachine().getSubtarget<X86Subtarget>();
+
+  // Each load/store unit costs 1.
+  unsigned Cost = LT.first * 1;
+
+  // On Sandybridge 256bit load/stores are double pumped
+  // (but not on Haswell).
+  if (LT.second.getSizeInBits() > 128 && !ST.hasAVX2())
+    Cost*=2;
+
+  return Cost;
+}
+
  unsigned
  X86VectorTargetTransformInfo::getVectorInstrCost(unsigned Opcode, Type *Val,
                                                   unsigned Index) const {
@@ -17750,10 +17907,10 @@ unsigned X86VectorTargetTransformInfo::getCmpSelInstrCost(unsigned Opcode,
      { ISD::SETCC,   MVT::v32i8,   1 },
    };
  
-  if (ST.hasSSE42()) {
-    int Idx = FindInTable(SSE42CostTbl, array_lengthof(SSE42CostTbl), ISD, MTy);
+  if (ST.hasAVX2()) {
+    int Idx = FindInTable(AVX2CostTbl, array_lengthof(AVX2CostTbl), ISD, MTy);
      if (Idx != -1)
-      return LT.first * SSE42CostTbl[Idx].Cost;
+      return LT.first * AVX2CostTbl[Idx].Cost;
    }
  
    if (ST.hasAVX()) {
@@ -17762,10 +17919,10 @@ unsigned X86VectorTargetTransformInfo::getCmpSelInstrCost(unsigned Opcode,
        return LT.first * AVX1CostTbl[Idx].Cost;
    }
  
-  if (ST.hasAVX2()) {
-    int Idx = FindInTable(AVX2CostTbl, array_lengthof(AVX2CostTbl), ISD, MTy);
+  if (ST.hasSSE42()) {
+    int Idx = FindInTable(SSE42CostTbl, array_lengthof(SSE42CostTbl), ISD, MTy);
      if (Idx != -1)
-      return LT.first * AVX2CostTbl[Idx].Cost;
+      return LT.first * SSE42CostTbl[Idx].Cost;
    }
  
    return VectorTargetTransformImpl::getCmpSelInstrCost(Opcode, ValTy, CondTy);