#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/VariadicFunction.h"
#include "llvm/Support/CallSite.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Dwarf.h"
#include "llvm/Support/ErrorHandling.h"
STATISTIC(NumTailCalls, "Number of tail calls");
+static cl::opt<bool> UseRegMask("x86-use-regmask",
+ cl::desc("Use register masks for x86 calls"));
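+// For example, `llc -x86-use-regmask test.ll` enables this; the cl::opt is
+// registered with any tool that links in the X86 backend.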
+
// Forward declarations.
static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
SDValue V2);
-static SDValue Insert128BitVector(SDValue Result,
- SDValue Vec,
- SDValue Idx,
- SelectionDAG &DAG,
- DebugLoc dl);
-
-static SDValue Extract128BitVector(SDValue Vec,
- SDValue Idx,
- SelectionDAG &DAG,
- DebugLoc dl);
-
/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
/// sets things up to match to an AVX VEXTRACTF128 instruction or a
/// simple subregister reference. Idx is an index in the 128 bits we
((DstAlign == 0 || DstAlign >= 16) &&
(SrcAlign == 0 || SrcAlign >= 16))) &&
Subtarget->getStackAlignment() >= 16) {
- if (Subtarget->hasAVX() &&
- Subtarget->getStackAlignment() >= 32)
- return MVT::v8f32;
+ if (Subtarget->getStackAlignment() >= 32) {
+ if (Subtarget->hasAVX2())
+ return MVT::v8i32;
+ if (Subtarget->hasAVX())
+ return MVT::v8f32;
+ }
if (Subtarget->hasSSE2())
return MVT::v4i32;
if (Subtarget->hasSSE1())
}
bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
- if (!CI->isTailCall())
+ if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls)
return false;
CallSite CS(CI);
MachineFrameInfo *MFI = MF.getFrameInfo();
bool Is64Bit = Subtarget->is64Bit();
+ bool IsWindows = Subtarget->isTargetWindows();
bool IsWin64 = Subtarget->isTargetWin64();
assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
} else {
FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
// If this is an sret function, the return should pop the hidden pointer.
- if (!Is64Bit && !IsTailCallConvention(CallConv) && ArgsAreStructReturn(Ins))
+ if (!Is64Bit && !IsTailCallConvention(CallConv) && !IsWindows &&
+ ArgsAreStructReturn(Ins))
FuncInfo->setBytesToPopOnReturn(4);
}
MachineFunction &MF = DAG.getMachineFunction();
bool Is64Bit = Subtarget->is64Bit();
bool IsWin64 = Subtarget->isTargetWin64();
+ bool IsWindows = Subtarget->isTargetWindows();
bool IsStructRet = CallIsStructReturn(Outs);
bool IsSibcall = false;
+ if (MF.getTarget().Options.DisableTailCalls)
+ isTailCall = false;
+
if (isTailCall) {
// Check if it's really possible to do a tail call.
isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
if (Is64Bit && isVarArg && !IsWin64)
Ops.push_back(DAG.getRegister(X86::AL, MVT::i8));
+ // Experimental: Add a register mask operand representing the call-preserved
+ // registers.
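+  // (In a register mask operand, a set bit means the corresponding register
+  // is preserved across the call; clobbered registers have their bit clear.)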
+ if (UseRegMask) {
+ const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+ const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
+ Ops.push_back(DAG.getRegisterMask(Mask));
+ }
+
if (InFlag.getNode())
Ops.push_back(InFlag);
if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
getTargetMachine().Options.GuaranteedTailCallOpt))
NumBytesForCalleeToPush = NumBytes; // Callee pops everything
- else if (!Is64Bit && !IsTailCallConvention(CallConv) && IsStructRet)
+ else if (!Is64Bit && !IsTailCallConvention(CallConv) && !IsWindows &&
+ IsStructRet)
// If this is a call to a struct-return function, the callee
// pops the hidden struct pointer, so we have to push it back.
// This is common for Darwin/X86, Linux & Mingw32 targets.
+ // For MSVC Win32 targets, the caller pops the hidden struct pointer.
NumBytesForCalleeToPush = 4;
else
NumBytesForCalleeToPush = 0; // Callee pops nothing.
case X86ISD::VPERM2X128:
return true;
}
- return false;
}
static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
case X86ISD::MOVDDUP:
return DAG.getNode(Opc, dl, VT, V1);
}
-
- return SDValue();
}
static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
case X86ISD::VPERMILP:
return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8));
}
-
- return SDValue();
}
static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
return DAG.getNode(Opc, dl, VT, V1, V2,
DAG.getConstant(TargetMask, MVT::i8));
}
- return SDValue();
}
static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
case X86ISD::UNPCKH:
return DAG.getNode(Opc, dl, VT, V1, V2);
}
- return SDValue();
}
SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
return (Val < 0) || (Val >= Low && Val < Hi);
}
-/// isUndefOrInRange - Return true if every element in Mask, begining
-/// from position Pos and ending in Pos+Size, falls within the specified
-/// range (L, L+Pos]. or is undef.
-static bool isUndefOrInRange(const SmallVectorImpl<int> &Mask,
- int Pos, int Size, int Low, int Hi) {
- for (int i = Pos, e = Pos+Size; i != e; ++i)
- if (!isUndefOrInRange(Mask[i], Low, Hi))
- return false;
- return true;
-}
-
/// isUndefOrEqual - Val is either less than zero (undef) or equal to the
/// specified value.
static bool isUndefOrEqual(int Val, int CmpVal) {
/// isSequentialOrUndefInRange - Return true if every element in Mask, beginning
/// at position Pos and ending at Pos+Size, falls within the specified
/// sequential range [Low, Low+Size), or is undef.
-static bool isSequentialOrUndefInRange(const SmallVectorImpl<int> &Mask,
+static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
int Pos, int Size, int Low) {
for (int i = Pos, e = Pos+Size; i != e; ++i, ++Low)
if (!isUndefOrEqual(Mask[i], Low))
/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
/// is suitable for input to PSHUFD or PSHUFW. That is, it doesn't reference
/// the second operand.
-static bool isPSHUFDMask(const SmallVectorImpl<int> &Mask, EVT VT) {
+static bool isPSHUFDMask(ArrayRef<int> Mask, EVT VT) {
if (VT == MVT::v4f32 || VT == MVT::v4i32 )
return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4);
if (VT == MVT::v2f64 || VT == MVT::v2i64)
}
bool X86::isPSHUFDMask(ShuffleVectorSDNode *N) {
- SmallVector<int, 8> M;
- N->getMask(M);
- return ::isPSHUFDMask(M, N->getValueType(0));
+ return ::isPSHUFDMask(N->getMask(), N->getValueType(0));
}
/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
/// is suitable for input to PSHUFHW.
-static bool isPSHUFHWMask(const SmallVectorImpl<int> &Mask, EVT VT) {
+static bool isPSHUFHWMask(ArrayRef<int> Mask, EVT VT) {
if (VT != MVT::v8i16)
return false;
}
bool X86::isPSHUFHWMask(ShuffleVectorSDNode *N) {
- SmallVector<int, 8> M;
- N->getMask(M);
- return ::isPSHUFHWMask(M, N->getValueType(0));
+ return ::isPSHUFHWMask(N->getMask(), N->getValueType(0));
}
/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
/// is suitable for input to PSHUFLW.
-static bool isPSHUFLWMask(const SmallVectorImpl<int> &Mask, EVT VT) {
+static bool isPSHUFLWMask(ArrayRef<int> Mask, EVT VT) {
if (VT != MVT::v8i16)
return false;
}
bool X86::isPSHUFLWMask(ShuffleVectorSDNode *N) {
- SmallVector<int, 8> M;
- N->getMask(M);
- return ::isPSHUFLWMask(M, N->getValueType(0));
+ return ::isPSHUFLWMask(N->getMask(), N->getValueType(0));
}
/// isPALIGNRMask - Return true if the node specifies a shuffle of elements that
/// is suitable for input to PALIGNR.
-static bool isPALIGNRMask(const SmallVectorImpl<int> &Mask, EVT VT,
- bool hasSSSE3) {
- int i, e = VT.getVectorNumElements();
- if (VT.getSizeInBits() != 128)
+static bool isPALIGNRMask(ArrayRef<int> Mask, EVT VT,
+ const X86Subtarget *Subtarget) {
+ if ((VT.getSizeInBits() == 128 && !Subtarget->hasSSSE3()) ||
+ (VT.getSizeInBits() == 256 && !Subtarget->hasAVX2()))
return false;
- // Do not handle v2i64 / v2f64 shuffles with palignr.
- if (e < 4 || !hasSSSE3)
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned NumLanes = VT.getSizeInBits()/128;
+ unsigned NumLaneElts = NumElts/NumLanes;
+
+ // Do not handle 64-bit element shuffles with palignr.
+ if (NumLaneElts == 2)
return false;
- for (i = 0; i != e; ++i)
- if (Mask[i] >= 0)
- break;
+ for (unsigned l = 0; l != NumElts; l+=NumLaneElts) {
+ unsigned i;
+ for (i = 0; i != NumLaneElts; ++i) {
+ if (Mask[i+l] >= 0)
+ break;
+ }
- // All undef, not a palignr.
- if (i == e)
- return false;
+ // Lane is all undef, go to next lane
+ if (i == NumLaneElts)
+ continue;
- // Make sure we're shifting in the right direction.
- if (Mask[i] <= i)
- return false;
+ int Start = Mask[i+l];
- int s = Mask[i] - i;
+ // Make sure it's in this lane in one of the sources
+ if (!isUndefOrInRange(Start, l, l+NumLaneElts) &&
+ !isUndefOrInRange(Start, l+NumElts, l+NumElts+NumLaneElts))
+ return false;
- // Check the rest of the elements to see if they are consecutive.
- for (++i; i != e; ++i) {
- int m = Mask[i];
- if (m >= 0 && m != s+i)
+ // If not lane 0, then we must match lane 0
+ if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Start, Mask[i]+l))
return false;
- }
- return true;
-}
-/// isVSHUFPYMask - Return true if the specified VECTOR_SHUFFLE operand
-/// specifies a shuffle of elements that is suitable for input to 256-bit
-/// VSHUFPSY.
-static bool isVSHUFPYMask(const SmallVectorImpl<int> &Mask, EVT VT,
- bool HasAVX, bool Commuted = false) {
- int NumElems = VT.getVectorNumElements();
+ // Correct second source to be contiguous with first source
+ if (Start >= (int)NumElts)
+ Start -= NumElts - NumLaneElts;
- if (!HasAVX || VT.getSizeInBits() != 256)
- return false;
+ // Make sure we're shifting in the right direction.
+ if (Start <= (int)(i+l))
+ return false;
- if (NumElems != 4 && NumElems != 8)
- return false;
+ Start -= i;
- // VSHUFPSY divides the resulting vector into 4 chunks.
- // The sources are also splitted into 4 chunks, and each destination
- // chunk must come from a different source chunk.
- //
- // SRC1 => X7 X6 X5 X4 X3 X2 X1 X0
- // SRC2 => Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y9
- //
- // DST => Y7..Y4, Y7..Y4, X7..X4, X7..X4,
- // Y3..Y0, Y3..Y0, X3..X0, X3..X0
- //
- // VSHUFPDY divides the resulting vector into 4 chunks.
- // The sources are also splitted into 4 chunks, and each destination
- // chunk must come from a different source chunk.
- //
- // SRC1 => X3 X2 X1 X0
- // SRC2 => Y3 Y2 Y1 Y0
- //
- // DST => Y3..Y2, X3..X2, Y1..Y0, X1..X0
- //
- unsigned QuarterSize = NumElems/4;
- unsigned HalfSize = QuarterSize*2;
- for (unsigned l = 0; l != 2; ++l) {
- unsigned LaneStart = l*HalfSize;
- for (unsigned s = 0; s != 2; ++s) {
- unsigned QuarterStart = s*QuarterSize;
- unsigned Src = (Commuted) ? (1-s) : s;
- unsigned SrcStart = Src*NumElems + LaneStart;
- for (unsigned i = 0; i != QuarterSize; ++i) {
- int Idx = Mask[i+QuarterStart+LaneStart];
- if (!isUndefOrInRange(Idx, SrcStart, SrcStart+HalfSize))
- return false;
- // For VSHUFPSY, the mask of the second half must be the same as the
- // first but with the appropriate offsets. This works in the same way as
- // VPERMILPS works with masks.
- if (NumElems == 4 || l == 0 || Mask[i+QuarterStart] < 0)
- continue;
- if (!isUndefOrEqual(Idx, Mask[i+QuarterStart]+LaneStart))
- return false;
- }
- }
- }
+ // Check the rest of the elements to see if they are consecutive.
+ for (++i; i != NumLaneElts; ++i) {
+ int Idx = Mask[i+l];
- return true;
-}
+ // Make sure it's in this lane
+ if (!isUndefOrInRange(Idx, l, l+NumLaneElts) &&
+ !isUndefOrInRange(Idx, l+NumElts, l+NumElts+NumLaneElts))
+ return false;
-/// getShuffleVSHUFPYImmediate - Return the appropriate immediate to shuffle
-/// the specified VECTOR_MASK mask with VSHUFPSY/VSHUFPDY instructions.
-static unsigned getShuffleVSHUFPYImmediate(ShuffleVectorSDNode *SVOp) {
- EVT VT = SVOp->getValueType(0);
- unsigned NumElems = VT.getVectorNumElements();
+ // If not lane 0, then we must match lane 0
+ if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Idx, Mask[i]+l))
+ return false;
- assert(VT.getSizeInBits() == 256 && "Only supports 256-bit types");
- assert((NumElems == 4 || NumElems == 8) && "Only supports v4 and v8 types");
+ if (Idx >= (int)NumElts)
+ Idx -= NumElts - NumLaneElts;
- unsigned HalfSize = NumElems/2;
- unsigned Mul = (NumElems == 8) ? 2 : 1;
- unsigned Mask = 0;
- for (unsigned i = 0; i != NumElems; ++i) {
- int Elt = SVOp->getMaskElt(i);
- if (Elt < 0)
- continue;
- Elt %= HalfSize;
- unsigned Shamt = i;
- // For VSHUFPSY, the mask of the first half must be equal to the second one.
- if (NumElems == 8) Shamt %= HalfSize;
- Mask |= Elt << (Shamt*Mul);
+ if (!isUndefOrEqual(Idx, Start+i))
+ return false;
+ }
}
- return Mask;
+ return true;
}
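+// Illustrative example: for v16i8, the mask <5,6,...,15,16,17,18,19,20>
+// (Mask[i] == i+5) passes the checks above with Start == 5. The low 11 result
+// bytes come from V1[5..15] and the top 5 from V2[0..4], which is exactly
+// PALIGNR with an immediate of 5 (see getShufflePALIGNRImmediate below).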
/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
}
/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
-/// specifies a shuffle of elements that is suitable for input to 128-bit
+/// specifies a shuffle of elements that is suitable for input to 128/256-bit
/// SHUFPS and SHUFPD. If Commuted is true, then it checks for sources to be
/// reverse of what x86 shuffles want.
-static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT,
+static bool isSHUFPMask(ArrayRef<int> Mask, EVT VT, bool HasAVX,
bool Commuted = false) {
- unsigned NumElems = VT.getVectorNumElements();
-
- if (VT.getSizeInBits() != 128)
+ if (!HasAVX && VT.getSizeInBits() == 256)
return false;
- if (NumElems != 2 && NumElems != 4)
+ unsigned NumElems = VT.getVectorNumElements();
+ unsigned NumLanes = VT.getSizeInBits()/128;
+ unsigned NumLaneElems = NumElems/NumLanes;
+
+ if (NumLaneElems != 2 && NumLaneElems != 4)
return false;
- unsigned Half = NumElems / 2;
- unsigned SrcStart = Commuted ? NumElems : 0;
- for (unsigned i = 0; i != Half; ++i)
- if (!isUndefOrInRange(Mask[i], SrcStart, SrcStart+NumElems))
- return false;
- SrcStart = Commuted ? 0 : NumElems;
- for (unsigned i = Half; i != NumElems; ++i)
- if (!isUndefOrInRange(Mask[i], SrcStart, SrcStart+NumElems))
- return false;
+ // VSHUFPSY divides the resulting vector into 4 chunks.
+ // The sources are also split into 4 chunks, and each destination
+ // chunk must come from a different source chunk.
+ //
+ // SRC1 => X7 X6 X5 X4 X3 X2 X1 X0
+ // SRC2 => Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0
+ //
+ // DST => Y7..Y4, Y7..Y4, X7..X4, X7..X4,
+ // Y3..Y0, Y3..Y0, X3..X0, X3..X0
+ //
+ // VSHUFPDY divides the resulting vector into 4 chunks.
+ // The sources are also split into 4 chunks, and each destination
+ // chunk must come from a different source chunk.
+ //
+ // SRC1 => X3 X2 X1 X0
+ // SRC2 => Y3 Y2 Y1 Y0
+ //
+ // DST => Y3..Y2, X3..X2, Y1..Y0, X1..X0
+ //
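+ // For example, the v8f32 mask <0,1,8,9, 4,5,12,13> satisfies these rules:
+ // each lane takes its low half from SRC1 and its high half from SRC2, and
+ // the second lane repeats the first lane's pattern with a +4 offset.
+ //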
+ unsigned HalfLaneElems = NumLaneElems/2;
+ for (unsigned l = 0; l != NumElems; l += NumLaneElems) {
+ for (unsigned i = 0; i != NumLaneElems; ++i) {
+ int Idx = Mask[i+l];
+ unsigned RngStart = l + ((Commuted == (i<HalfLaneElems)) ? NumElems : 0);
+ if (!isUndefOrInRange(Idx, RngStart, RngStart+NumLaneElems))
+ return false;
+ // For VSHUFPSY, the mask of the second half must be the same as the
+ // first but with the appropriate offsets. This works in the same way as
+ // VPERMILPS works with masks.
+ if (NumElems != 8 || l == 0 || Mask[i] < 0)
+ continue;
+ if (!isUndefOrEqual(Idx, Mask[i]+l))
+ return false;
+ }
+ }
return true;
}
-bool X86::isSHUFPMask(ShuffleVectorSDNode *N) {
- SmallVector<int, 8> M;
- N->getMask(M);
- return ::isSHUFPMask(M, N->getValueType(0));
+bool X86::isSHUFPMask(ShuffleVectorSDNode *N, bool HasAVX) {
+ return ::isSHUFPMask(N->getMask(), N->getValueType(0), HasAVX);
}
/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to UNPCKL.
-static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT,
+static bool isUNPCKLMask(ArrayRef<int> Mask, EVT VT,
bool HasAVX2, bool V2IsSplat = false) {
unsigned NumElts = VT.getVectorNumElements();
}
bool X86::isUNPCKLMask(ShuffleVectorSDNode *N, bool HasAVX2, bool V2IsSplat) {
- SmallVector<int, 8> M;
- N->getMask(M);
- return ::isUNPCKLMask(M, N->getValueType(0), HasAVX2, V2IsSplat);
+ return ::isUNPCKLMask(N->getMask(), N->getValueType(0), HasAVX2, V2IsSplat);
}
/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to UNPCKH.
-static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, EVT VT,
+static bool isUNPCKHMask(ArrayRef<int> Mask, EVT VT,
bool HasAVX2, bool V2IsSplat = false) {
unsigned NumElts = VT.getVectorNumElements();
}
bool X86::isUNPCKHMask(ShuffleVectorSDNode *N, bool HasAVX2, bool V2IsSplat) {
- SmallVector<int, 8> M;
- N->getMask(M);
- return ::isUNPCKHMask(M, N->getValueType(0), HasAVX2, V2IsSplat);
+ return ::isUNPCKHMask(N->getMask(), N->getValueType(0), HasAVX2, V2IsSplat);
}
/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
/// <0, 0, 1, 1>
-static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT,
+static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, EVT VT,
bool HasAVX2) {
unsigned NumElts = VT.getVectorNumElements();
}
bool X86::isUNPCKL_v_undef_Mask(ShuffleVectorSDNode *N, bool HasAVX2) {
- SmallVector<int, 8> M;
- N->getMask(M);
- return ::isUNPCKL_v_undef_Mask(M, N->getValueType(0), HasAVX2);
+ return ::isUNPCKL_v_undef_Mask(N->getMask(), N->getValueType(0), HasAVX2);
}
/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
/// <2, 2, 3, 3>
-static bool isUNPCKH_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT,
- bool HasAVX2) {
+static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, EVT VT, bool HasAVX2) {
unsigned NumElts = VT.getVectorNumElements();
assert((VT.is128BitVector() || VT.is256BitVector()) &&
}
bool X86::isUNPCKH_v_undef_Mask(ShuffleVectorSDNode *N, bool HasAVX2) {
- SmallVector<int, 8> M;
- N->getMask(M);
- return ::isUNPCKH_v_undef_Mask(M, N->getValueType(0), HasAVX2);
+ return ::isUNPCKH_v_undef_Mask(N->getMask(), N->getValueType(0), HasAVX2);
}
/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVSS,
/// MOVSD, and MOVD, i.e. setting the lowest element.
-static bool isMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT) {
+static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) {
if (VT.getVectorElementType().getSizeInBits() < 32)
return false;
if (VT.getSizeInBits() == 256)
}
bool X86::isMOVLMask(ShuffleVectorSDNode *N) {
- SmallVector<int, 8> M;
- N->getMask(M);
- return ::isMOVLMask(M, N->getValueType(0));
+ return ::isMOVLMask(N->getMask(), N->getValueType(0));
}
/// isVPERM2X128Mask - Match 256-bit shuffles where the elements are considered
/// vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15>
/// The first half comes from the second half of V1 and the second half from the
/// the second half of V2.
-static bool isVPERM2X128Mask(const SmallVectorImpl<int> &Mask, EVT VT,
- bool HasAVX) {
+static bool isVPERM2X128Mask(ArrayRef<int> Mask, EVT VT, bool HasAVX) {
if (!HasAVX || VT.getSizeInBits() != 256)
return false;
/// to the same elements of the low, but to the higher half of the source.
/// In VPERMILPD the two lanes could be shuffled independently of each other
/// with the same restriction that lanes can't be crossed.
-static bool isVPERMILPMask(const SmallVectorImpl<int> &Mask, EVT VT,
- bool HasAVX) {
+static bool isVPERMILPMask(ArrayRef<int> Mask, EVT VT, bool HasAVX) {
if (!HasAVX)
return false;
unsigned NumLanes = VT.getSizeInBits()/128;
unsigned LaneSize = NumElts/NumLanes;
- for (unsigned l = 0; l != NumLanes; ++l) {
- unsigned LaneStart = l*LaneSize;
+ for (unsigned l = 0; l != NumElts; l += LaneSize) {
for (unsigned i = 0; i != LaneSize; ++i) {
- if (!isUndefOrInRange(Mask[i+LaneStart], LaneStart, LaneStart+LaneSize))
+ if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
return false;
- if (NumElts == 4 || l == 0)
+ if (NumElts != 8 || l == 0)
continue;
// VPERMILPS handling
if (Mask[i] < 0)
continue;
- if (!isUndefOrEqual(Mask[i+LaneStart], Mask[i]+LaneStart))
+ if (!isUndefOrEqual(Mask[i+l], Mask[i]+l))
return false;
}
}
/// isCommutedMOVL - Returns true if the shuffle mask is the reverse of what
/// x86 movss wants: the lowest element must be the lowest element of vector 2,
/// and the other elements must come from vector 1 in order.
-static bool isCommutedMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT,
+static bool isCommutedMOVLMask(ArrayRef<int> Mask, EVT VT,
bool V2IsSplat = false, bool V2IsUndef = false) {
unsigned NumOps = VT.getVectorNumElements();
if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
static bool isCommutedMOVL(ShuffleVectorSDNode *N, bool V2IsSplat = false,
bool V2IsUndef = false) {
- SmallVector<int, 8> M;
- N->getMask(M);
- return isCommutedMOVLMask(M, N->getValueType(0), V2IsSplat, V2IsUndef);
+ return isCommutedMOVLMask(N->getMask(), N->getValueType(0),
+ V2IsSplat, V2IsUndef);
}
/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
/// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to 256-bit
/// version of MOVDDUP.
-static bool isMOVDDUPYMask(const SmallVectorImpl<int> &Mask, EVT VT,
- bool HasAVX) {
+static bool isMOVDDUPYMask(ArrayRef<int> Mask, EVT VT, bool HasAVX) {
unsigned NumElts = VT.getVectorNumElements();
if (!HasAVX || VT.getSizeInBits() != 256 || NumElts != 4)
/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions.
-unsigned X86::getShuffleSHUFImmediate(SDNode *N) {
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
- unsigned NumOperands = SVOp->getValueType(0).getVectorNumElements();
+/// Handles 128-bit and 256-bit.
+unsigned X86::getShuffleSHUFImmediate(ShuffleVectorSDNode *N) {
+ EVT VT = N->getValueType(0);
+
+ assert((VT.is128BitVector() || VT.is256BitVector()) &&
+ "Unsupported vector type for PSHUF/SHUFP");
+
+ // Handle 128 and 256-bit vector lengths. AVX defines PSHUF/SHUFP to operate
+ // independently on 128-bit lanes.
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned NumLanes = VT.getSizeInBits()/128;
+ unsigned NumLaneElts = NumElts/NumLanes;
+
+ assert((NumLaneElts == 2 || NumLaneElts == 4) &&
+ "Only supports 2 or 4 elements per lane");
- unsigned Shift = (NumOperands == 4) ? 2 : 1;
+ unsigned Shift = (NumLaneElts == 4) ? 1 : 0;
unsigned Mask = 0;
- for (unsigned i = 0; i != NumOperands; ++i) {
- int Val = SVOp->getMaskElt(NumOperands-i-1);
- if (Val < 0) Val = 0;
- if (Val >= (int)NumOperands) Val -= NumOperands;
- Mask |= Val;
- if (i != NumOperands - 1)
- Mask <<= Shift;
+ for (unsigned i = 0; i != NumElts; ++i) {
+ int Elt = N->getMaskElt(i);
+ if (Elt < 0) continue;
+ Elt %= NumLaneElts;
+ unsigned ShAmt = i << Shift;
+ if (ShAmt >= 8) ShAmt -= 8;
+ Mask |= Elt << ShAmt;
}
+
return Mask;
}
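+// Example: a v4f32 mask <2,1,0,3> yields 2 | 1<<2 | 0<<4 | 3<<6 == 0xC6, the
+// classic SHUFPS immediate. For 256-bit types both lanes fold onto the same
+// 8 immediate bits, which is why valid masks must repeat per lane.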
static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) {
EVT VT = SVOp->getValueType(0);
unsigned EltSize = VT.getVectorElementType().getSizeInBits() >> 3;
- int Val = 0;
- unsigned i, e;
- for (i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned NumLanes = VT.getSizeInBits()/128;
+ unsigned NumLaneElts = NumElts/NumLanes;
+
+ int Val = 0;
+ unsigned i;
+ for (i = 0; i != NumElts; ++i) {
Val = SVOp->getMaskElt(i);
if (Val >= 0)
break;
}
+ if (Val >= (int)NumElts)
+ Val -= NumElts - NumLaneElts;
+
assert(Val - i > 0 && "PALIGNR imm should be positive");
return (Val - i) * EltSize;
}
/// getZeroVector - Returns a vector of specified type with all zero elements.
///
-static SDValue getZeroVector(EVT VT, bool HasSSE2, SelectionDAG &DAG,
- DebugLoc dl) {
+static SDValue getZeroVector(EVT VT, bool HasSSE2, bool HasAVX2,
+ SelectionDAG &DAG, DebugLoc dl) {
assert(VT.isVector() && "Expected a vector type");
// Always build SSE zero vectors as <4 x i32> bitcasted
Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
}
} else if (VT.getSizeInBits() == 256) { // AVX
- // 256-bit logic and arithmetic instructions in AVX are
- // all floating-point, no support for integer ops. Default
- // to emitting fp zeroed vectors then.
- SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
- SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
- Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops, 8);
+ if (HasAVX2) { // AVX2
+ SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
+ SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, 8);
+ } else {
+ // 256-bit logic and arithmetic instructions in AVX are all
+ // floating-point, no support for integer ops. Emit fp zeroed vectors.
+ SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
+ SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops, 8);
+ }
}
return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
}
unsigned NumElems = VT.getVectorNumElements();
bool Changed = false;
- SmallVector<int, 8> MaskVec;
- SVOp->getMask(MaskVec);
+ SmallVector<int, 8> MaskVec(SVOp->getMask().begin(), SVOp->getMask().end());
for (unsigned i = 0; i != NumElems; ++i) {
if (MaskVec[i] > (int)NumElems) {
// Extract the 128-bit part containing the splat element and update
// the splat element index when it refers to the higher register.
if (Size == 256) {
- unsigned Idx = (EltNo > NumElems/2) ? NumElems/2 : 0;
+ unsigned Idx = (EltNo >= NumElems/2) ? NumElems/2 : 0;
V1 = Extract128BitVector(V1, DAG.getConstant(Idx, MVT::i32), DAG, dl);
if (Idx > 0)
EltNo -= NumElems/2;
/// element of V2 is swizzled into the zero/undef vector, landing at element
/// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
- bool isZero, bool HasSSE2,
+ bool IsZero,
+ const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
EVT VT = V2.getValueType();
- SDValue V1 = isZero
- ? getZeroVector(VT, HasSSE2, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT);
+ SDValue V1 = IsZero
+ ? getZeroVector(VT, Subtarget->hasSSE2(), Subtarget->hasAVX2(), DAG,
+ V2.getDebugLoc()) : DAG.getUNDEF(VT);
unsigned NumElems = VT.getVectorNumElements();
SmallVector<int, 16> MaskVec;
for (unsigned i = 0; i != NumElems; ++i)
bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
if (ThisIsNonZero && First) {
if (NumZero)
- V = getZeroVector(MVT::v8i16, true, DAG, dl);
+ V = getZeroVector(MVT::v8i16, /*HasSSE2*/ true, /*HasAVX2*/ false,
+ DAG, dl);
else
V = DAG.getUNDEF(MVT::v8i16);
First = false;
if (isNonZero) {
if (First) {
if (NumZero)
- V = getZeroVector(MVT::v8i16, true, DAG, dl);
+ V = getZeroVector(MVT::v8i16, /*HasSSE2*/ true, /*HasAVX2*/ false,
+ DAG, dl);
else
V = DAG.getUNDEF(MVT::v8i16);
First = false;
const TargetLowering &TLI, DebugLoc dl) {
assert(VT.getSizeInBits() == 128 && "Unknown type for VShift");
EVT ShVT = MVT::v2i64;
- unsigned Opc = isLeft ? X86ISD::VSHL : X86ISD::VSRL;
+ unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp);
return DAG.getNode(ISD::BITCAST, dl, VT,
DAG.getNode(Opc, dl, ShVT, SrcOp,
/// a scalar load.
/// The scalar load node is returned when a pattern is found,
/// or SDValue() otherwise.
-static SDValue isVectorBroadcast(SDValue &Op, bool hasAVX2) {
+static SDValue isVectorBroadcast(SDValue &Op, const X86Subtarget *Subtarget) {
+ if (!Subtarget->hasAVX())
+ return SDValue();
+
EVT VT = Op.getValueType();
SDValue V = Op;
bool Is128 = VT.getSizeInBits() == 128;
unsigned ScalarSize = Ld.getValueType().getSizeInBits();
- if (hasAVX2) {
- // VBroadcast to YMM
- if (Is256 && (ScalarSize == 8 || ScalarSize == 16 ||
- ScalarSize == 32 || ScalarSize == 64 ))
- return Ld;
-
- // VBroadcast to XMM
- if (Is128 && (ScalarSize == 8 || ScalarSize == 32 ||
- ScalarSize == 16 || ScalarSize == 64 ))
- return Ld;
- }
-
// VBroadcast to YMM
if (Is256 && (ScalarSize == 32 || ScalarSize == 64))
return Ld;
if (Is128 && (ScalarSize == 32))
return Ld;
+ // The integer check is needed for the 64-bit into 128-bit case, so this
+ // doesn't match f64: there is no vbroadcastsd form with an xmm destination.
+ if (Subtarget->hasAVX2() && Ld.getValueType().isInteger()) {
+ // VBroadcast to YMM
+ if (Is256 && (ScalarSize == 8 || ScalarSize == 16))
+ return Ld;
+
+ // VBroadcast to XMM
+ if (Is128 && (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64))
+ return Ld;
+ }
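+  // In instruction terms: the float cases above map to VBROADCASTSS and
+  // VBROADCASTSD (available since AVX), while the integer cases map to the
+  // AVX2-only VPBROADCASTB/W/Q forms.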
// Unsupported broadcast.
return SDValue();
if (ISD::isBuildVectorAllZeros(Op.getNode())) {
// Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
// and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
- if (Op.getValueType() == MVT::v4i32 ||
- Op.getValueType() == MVT::v8i32)
+ if (VT == MVT::v4i32 || VT == MVT::v8i32)
return Op;
- return getZeroVector(Op.getValueType(), Subtarget->hasSSE2(), DAG, dl);
+ return getZeroVector(VT, Subtarget->hasSSE2(),
+ Subtarget->hasAVX2(), DAG, dl);
}
// Vectors containing all ones can be matched by pcmpeqd on 128-bit width
// vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
// vpcmpeqd on 256-bit vectors.
if (ISD::isBuildVectorAllOnes(Op.getNode())) {
- if (Op.getValueType() == MVT::v4i32 ||
- (Op.getValueType() == MVT::v8i32 && Subtarget->hasAVX2()))
+ if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasAVX2()))
return Op;
- return getOnesVector(Op.getValueType(), Subtarget->hasAVX2(), DAG, dl);
+ return getOnesVector(VT, Subtarget->hasAVX2(), DAG, dl);
}
- SDValue LD = isVectorBroadcast(Op, Subtarget->hasAVX2());
- if (Subtarget->hasAVX() && LD.getNode())
- return DAG.getNode(X86ISD::VBROADCAST, dl, VT, LD);
+ SDValue LD = isVectorBroadcast(Op, Subtarget);
+ if (LD.getNode())
+ return DAG.getNode(X86ISD::VBROADCAST, dl, VT, LD);
unsigned EVTBits = ExtVT.getSizeInBits();
// convert it to a vector with movd (S2V+shuffle to zero extend).
Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
- Item = getShuffleVectorZeroOrUndef(Item, 0, true,
- Subtarget->hasSSE2(), DAG);
+ Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
// Now we have our 32-bit value zero extended in the low element of
// a vector. If Idx != 0, swizzle it into place.
DAG.getUNDEF(Item.getValueType()),
&Mask[0]);
}
- return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Item);
+ return DAG.getNode(ISD::BITCAST, dl, VT, Item);
}
}
if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
(ExtVT == MVT::i64 && Subtarget->is64Bit())) {
if (VT.getSizeInBits() == 256) {
- EVT VT128 = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems / 2);
- Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Item);
- SDValue ZeroVec = getZeroVector(VT, true, DAG, dl);
- return Insert128BitVector(ZeroVec, Item, DAG.getConstant(0, MVT::i32),
- DAG, dl);
+ SDValue ZeroVec = getZeroVector(VT, Subtarget->hasSSE2(),
+ Subtarget->hasAVX2(), DAG, dl);
+ return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
+ Item, DAG.getIntPtrConstant(0));
}
assert(VT.getSizeInBits() == 128 && "Expected an SSE value type!");
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
// Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
- return getShuffleVectorZeroOrUndef(Item, 0, true,
- Subtarget->hasSSE2(), DAG);
+ return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
}
if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
if (VT.getSizeInBits() == 256) {
- SDValue ZeroVec = getZeroVector(MVT::v8i32, true, DAG, dl);
+ SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget->hasSSE2(),
+ Subtarget->hasAVX2(), DAG, dl);
Item = Insert128BitVector(ZeroVec, Item, DAG.getConstant(0, MVT::i32),
DAG, dl);
} else {
assert(VT.getSizeInBits() == 128 && "Expected an SSE value type!");
- Item = getShuffleVectorZeroOrUndef(Item, 0, true,
- Subtarget->hasSSE2(), DAG);
+ Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
}
return DAG.getNode(ISD::BITCAST, dl, VT, Item);
}
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
// Turn it into a shuffle of zero and zero-extended scalar to vector.
- Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0,
- Subtarget->hasSSE2(), DAG);
+ Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG);
SmallVector<int, 8> MaskVec;
for (unsigned i = 0; i < NumElems; i++)
MaskVec.push_back(i == Idx ? 0 : 1);
unsigned Idx = CountTrailingZeros_32(NonZeros);
SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
Op.getOperand(Idx));
- return getShuffleVectorZeroOrUndef(V2, Idx, true,
- Subtarget->hasSSE2(), DAG);
+ return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
}
return SDValue();
}
for (unsigned i = 0; i < 4; ++i) {
bool isZero = !(NonZeros & (1 << i));
if (isZero)
- V[i] = getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl);
+ V[i] = getZeroVector(VT, Subtarget->hasSSE2(), Subtarget->hasAVX2(),
+ DAG, dl);
else
V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
}
SDValue V1 = SVOp->getOperand(0);
SDValue V2 = SVOp->getOperand(1);
DebugLoc dl = SVOp->getDebugLoc();
- SmallVector<int, 16> MaskVals;
- SVOp->getMask(MaskVals);
+ ArrayRef<int> MaskVals = SVOp->getMask();
// If we have SSSE3, case 1 is generated when all result bytes come from
// one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is
OpVT, SrcOp)));
}
-/// areShuffleHalvesWithinDisjointLanes - Check whether each half of a vector
-/// shuffle node referes to only one lane in the sources.
-static bool areShuffleHalvesWithinDisjointLanes(ShuffleVectorSDNode *SVOp) {
- EVT VT = SVOp->getValueType(0);
- int NumElems = VT.getVectorNumElements();
- int HalfSize = NumElems/2;
- SmallVector<int, 16> M;
- SVOp->getMask(M);
- bool MatchA = false, MatchB = false;
-
- for (int l = 0; l < NumElems*2; l += HalfSize) {
- if (isUndefOrInRange(M, 0, HalfSize, l, l+HalfSize)) {
- MatchA = true;
- break;
- }
- }
-
- for (int l = 0; l < NumElems*2; l += HalfSize) {
- if (isUndefOrInRange(M, HalfSize, HalfSize, l, l+HalfSize)) {
- MatchB = true;
- break;
- }
- }
-
- return MatchA && MatchB;
-}
-
/// LowerVECTOR_SHUFFLE_256 - Handle all 256-bit wide vector shuffles
/// which could not be matched by any known target specific shuffle
static SDValue
LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
- if (areShuffleHalvesWithinDisjointLanes(SVOp)) {
- // If each half of a vector shuffle node referes to only one lane in the
- // source vectors, extract each used 128-bit lane and shuffle them using
- // 128-bit shuffles. Then, concatenate the results. Otherwise leave
- // the work to the legalizer.
- DebugLoc dl = SVOp->getDebugLoc();
- EVT VT = SVOp->getValueType(0);
- int NumElems = VT.getVectorNumElements();
- int HalfSize = NumElems/2;
-
- // Extract the reference for each half
- int FstVecExtractIdx = 0, SndVecExtractIdx = 0;
- int FstVecOpNum = 0, SndVecOpNum = 0;
- for (int i = 0; i < HalfSize; ++i) {
- int Elt = SVOp->getMaskElt(i);
- if (SVOp->getMaskElt(i) < 0)
+ EVT VT = SVOp->getValueType(0);
+
+ unsigned NumElems = VT.getVectorNumElements();
+ unsigned NumLaneElems = NumElems / 2;
+
+ int MinRange[2][2] = { { static_cast<int>(NumElems),
+ static_cast<int>(NumElems) },
+ { static_cast<int>(NumElems),
+ static_cast<int>(NumElems) } };
+ int MaxRange[2][2] = { { -1, -1 }, { -1, -1 } };
+
+ // Collect used ranges for each source in each lane
+ for (unsigned l = 0; l < 2; ++l) {
+ unsigned LaneStart = l*NumLaneElems;
+ for (unsigned i = 0; i != NumLaneElems; ++i) {
+ int Idx = SVOp->getMaskElt(i+LaneStart);
+ if (Idx < 0)
continue;
- FstVecOpNum = Elt/NumElems;
- FstVecExtractIdx = Elt % NumElems < HalfSize ? 0 : HalfSize;
- break;
+
+ int Input = 0;
+ if (Idx >= (int)NumElems) {
+ Idx -= NumElems;
+ Input = 1;
+ }
+
+ if (Idx > MaxRange[l][Input])
+ MaxRange[l][Input] = Idx;
+ if (Idx < MinRange[l][Input])
+ MinRange[l][Input] = Idx;
}
- for (int i = HalfSize; i < NumElems; ++i) {
- int Elt = SVOp->getMaskElt(i);
- if (SVOp->getMaskElt(i) < 0)
+ }
+
+ // Make sure each used range lies within a single 128-bit half of its source
+ int ExtractIdx[2][2] = { { -1, -1 }, { -1, -1 } };
+ for (unsigned l = 0; l < 2; ++l) {
+ for (unsigned Input = 0; Input < 2; ++Input) {
+ if (MinRange[l][Input] == (int)NumElems && MaxRange[l][Input] < 0)
continue;
- SndVecOpNum = Elt/NumElems;
- SndVecExtractIdx = Elt % NumElems < HalfSize ? 0 : HalfSize;
- break;
+
+ if (MinRange[l][Input] >= 0 && MaxRange[l][Input] < (int)NumLaneElems)
+ ExtractIdx[l][Input] = 0;
+ else if (MinRange[l][Input] >= (int)NumLaneElems &&
+ MaxRange[l][Input] < (int)NumElems)
+ ExtractIdx[l][Input] = NumLaneElems;
+ else
+ return SDValue();
}
+ }
- // Extract the subvectors
- SDValue V1 = Extract128BitVector(SVOp->getOperand(FstVecOpNum),
- DAG.getConstant(FstVecExtractIdx, MVT::i32), DAG, dl);
- SDValue V2 = Extract128BitVector(SVOp->getOperand(SndVecOpNum),
- DAG.getConstant(SndVecExtractIdx, MVT::i32), DAG, dl);
+ DebugLoc dl = SVOp->getDebugLoc();
+ MVT EltVT = VT.getVectorElementType().getSimpleVT();
+ EVT NVT = MVT::getVectorVT(EltVT, NumElems/2);
+
+ SDValue Ops[2][2];
+ for (unsigned l = 0; l < 2; ++l) {
+ for (unsigned Input = 0; Input < 2; ++Input) {
+ if (ExtractIdx[l][Input] >= 0)
+ Ops[l][Input] = Extract128BitVector(SVOp->getOperand(Input),
+ DAG.getConstant(ExtractIdx[l][Input], MVT::i32),
+ DAG, dl);
+ else
+ Ops[l][Input] = DAG.getUNDEF(NVT);
+ }
+ }
- // Generate 128-bit shuffles
- SmallVector<int, 16> MaskV1, MaskV2;
- for (int i = 0; i < HalfSize; ++i) {
- int Elt = SVOp->getMaskElt(i);
- MaskV1.push_back(Elt < 0 ? Elt : Elt % HalfSize);
+ // Generate 128-bit shuffles
+ SmallVector<int, 16> Mask1, Mask2;
+ for (unsigned i = 0; i != NumLaneElems; ++i) {
+ int Elt = SVOp->getMaskElt(i);
+ if (Elt >= (int)NumElems) {
+ Elt %= NumLaneElems;
+ Elt += NumLaneElems;
+ } else if (Elt >= 0) {
+ Elt %= NumLaneElems;
}
- for (int i = HalfSize; i < NumElems; ++i) {
- int Elt = SVOp->getMaskElt(i);
- MaskV2.push_back(Elt < 0 ? Elt : Elt % HalfSize);
+ Mask1.push_back(Elt);
+ }
+ for (unsigned i = NumLaneElems; i != NumElems; ++i) {
+ int Elt = SVOp->getMaskElt(i);
+ if (Elt >= (int)NumElems) {
+ Elt %= NumLaneElems;
+ Elt += NumLaneElems;
+ } else if (Elt >= 0) {
+ Elt %= NumLaneElems;
}
-
- EVT NVT = V1.getValueType();
- V1 = DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &MaskV1[0]);
- V2 = DAG.getVectorShuffle(NVT, dl, V2, DAG.getUNDEF(NVT), &MaskV2[0]);
-
- // Concatenate the result back
- SDValue V = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), V1,
- DAG.getConstant(0, MVT::i32), DAG, dl);
- return Insert128BitVector(V, V2, DAG.getConstant(NumElems/2, MVT::i32),
- DAG, dl);
+ Mask2.push_back(Elt);
}
- return SDValue();
+ SDValue Shuf1 = DAG.getVectorShuffle(NVT, dl, Ops[0][0], Ops[0][1], &Mask1[0]);
+ SDValue Shuf2 = DAG.getVectorShuffle(NVT, dl, Ops[1][0], Ops[1][1], &Mask2[0]);
+
+ // Concatenate the result back
+ SDValue V = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), Shuf1,
+ DAG.getConstant(0, MVT::i32), DAG, dl);
+ return Insert128BitVector(V, Shuf2, DAG.getConstant(NumElems/2, MVT::i32),
+ DAG, dl);
}
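+// Example: a v8f32 mask <0,9,2,11, 4,13,6,15> uses only the low 128-bit half
+// of each source in lane 0 and only the high half in lane 1, so the code
+// above extracts those halves, shuffles each pair with the 128-bit mask
+// <0,5,2,7>, and concatenates the two results.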
/// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with
SmallVector<std::pair<int, int>, 8> Locs;
Locs.resize(4);
SmallVector<int, 8> Mask1(4U, -1);
- SmallVector<int, 8> PermMask;
- SVOp->getMask(PermMask);
+ SmallVector<int, 8> PermMask(SVOp->getMask().begin(), SVOp->getMask().end());
unsigned NumHi = 0;
unsigned NumLo = 0;
int Idx = (Elt > NumElems) ? -1 : SVOp->getMaskElt(Elt);
V = (Idx < (int)NumElems) ? V.getOperand(0) : V.getOperand(1);
+ // If we are accessing the upper part of a YMM register
+ // then the EXTRACT_VECTOR_ELT is likely to be legalized to a sequence of
+ // EXTRACT_SUBVECTOR + EXTRACT_VECTOR_ELT, which are not detected at this point
+ // because the legalization of N has not happened yet.
+ if (Idx >= (int)NumElems/2 && VT.getSizeInBits() == 256)
+ return false;
+
// Skip one more bit_convert if necessary
if (V.getOpcode() == ISD::BITCAST)
V = V.getOperand(0);
SDValue V2 = Op.getOperand(1);
if (isZeroShuffle(SVOp))
- return getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl);
+ return getZeroVector(VT, Subtarget->hasSSE2(), Subtarget->hasAVX2(),
+ DAG, dl);
// Handle splat operations
if (SVOp->isSplat()) {
return Op;
// Use vbroadcast whenever the splat comes from a foldable load
- SDValue LD = isVectorBroadcast(Op, Subtarget->hasAVX2());
- if (Subtarget->hasAVX() && LD.getNode())
+ SDValue LD = isVectorBroadcast(Op, Subtarget);
+ if (LD.getNode())
return DAG.getNode(X86ISD::VBROADCAST, dl, VT, LD);
// Handle splats by matching through known shuffle masks
EVT VT = Op.getValueType();
DebugLoc dl = Op.getDebugLoc();
unsigned NumElems = VT.getVectorNumElements();
+ bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
bool V1IsSplat = false;
bool V2IsSplat = false;
assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles");
- assert(V1.getOpcode() != ISD::UNDEF && "Op 1 of shuffle should not be undef");
+ if (V1IsUndef && V2IsUndef)
+ return DAG.getUNDEF(VT);
+
+ assert(!V1IsUndef && "Op 1 of shuffle should not be undef");
// Vector shuffle lowering takes 3 steps:
//
return CommuteVectorShuffle(SVOp, DAG);
if (isShift) {
- // No better options. Use a vshl / vsrl.
+ // No better options. Use a vshldq / vsrldq.
EVT EltVT = VT.getVectorElementType();
ShAmt *= EltVT.getSizeInBits();
return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
Commuted = true;
}
- SmallVector<int, 32> M;
- SVOp->getMask(M);
+ ArrayRef<int> M = SVOp->getMask();
if (isCommutedMOVLMask(M, VT, V2IsSplat, V2IsUndef)) {
// Shuffling low element of v1 into undef, just return v1.
}
// Normalize the node to match x86 shuffle ops if needed
- if (!V2IsUndef && (isSHUFPMask(M, VT, /* Commuted */ true) ||
- isVSHUFPYMask(M, VT, HasAVX, /* Commuted */ true)))
+ if (!V2IsUndef && (isSHUFPMask(M, VT, HasAVX, /* Commuted */ true)))
return CommuteVectorShuffle(SVOp, DAG);
// The checks below are all present in isShuffleMaskLegal, but they are
// inlined here right now to enable us to directly emit target specific
// nodes, and remove one by one until they don't return Op anymore.
- if (isPALIGNRMask(M, VT, Subtarget->hasSSSE3()))
+ if (isPALIGNRMask(M, VT, Subtarget))
return getTargetShuffleNode(X86ISD::PALIGN, dl, VT, V1, V2,
getShufflePALIGNRImmediate(SVOp),
DAG);
X86::getShufflePSHUFLWImmediate(SVOp),
DAG);
- if (isSHUFPMask(M, VT))
+ if (isSHUFPMask(M, VT, HasAVX))
return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V2,
X86::getShuffleSHUFImmediate(SVOp), DAG);
return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1,
V2, getShuffleVPERM2X128Immediate(SVOp), DAG);
- // Handle VSHUFPS/DY permutations
- if (isVSHUFPYMask(M, VT, HasAVX))
- return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V2,
- getShuffleVSHUFPYImmediate(SVOp), DAG);
-
//===--------------------------------------------------------------------===//
// Since no target specific shuffle was selected for this generic one,
// lower it into other known shuffles. FIXME: this isn't true yet, but
Chain.getValue(1));
}
- assert(false &&
- "TLS not implemented for this target.");
-
- llvm_unreachable("Unreachable");
- return SDValue();
+ llvm_unreachable("TLS not implemented for this target.");
}
Op.getOperand(0));
// Zero out the upper parts of the register.
- Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget->hasSSE2(),
- DAG);
+ Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Load),
return LowerUINT_TO_FP_i64(Op, DAG);
else if (SrcVT == MVT::i32 && X86ScalarSSEf64)
return LowerUINT_TO_FP_i32(Op, DAG);
- else if (SrcVT == MVT::i64 && DstVT == MVT::f32)
+ else if (Subtarget->is64Bit() &&
+ SrcVT == MVT::i64 && DstVT == MVT::f32)
return SDValue();
// Make a 64-bit buffer, and use it to build an FILD.
assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
- StackSlot, MachinePointerInfo(),
+ StackSlot, MachinePointerInfo(),
false, false, 0);
// For i64 source, we need to add the appropriate power of 2 if the input
// was negative. This is the same as the optimization in
EVT EltVT = Op0.getValueType().getVectorElementType();
assert(EltVT == MVT::f32 || EltVT == MVT::f64);
- unsigned Opc = EltVT == MVT::f32 ? X86ISD::CMPPS : X86ISD::CMPPD;
bool Swap = false;
// SSE Condition code mapping:
if (SSECC == 8) {
if (SetCCOpcode == ISD::SETUEQ) {
SDValue UNORD, EQ;
- UNORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(3, MVT::i8));
- EQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(0, MVT::i8));
+ UNORD = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
+ DAG.getConstant(3, MVT::i8));
+ EQ = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
+ DAG.getConstant(0, MVT::i8));
return DAG.getNode(ISD::OR, dl, VT, UNORD, EQ);
} else if (SetCCOpcode == ISD::SETONE) {
SDValue ORD, NEQ;
- ORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(7, MVT::i8));
- NEQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(4, MVT::i8));
+ ORD = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
+ DAG.getConstant(7, MVT::i8));
+ NEQ = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
+ DAG.getConstant(4, MVT::i8));
return DAG.getNode(ISD::AND, dl, VT, ORD, NEQ);
}
llvm_unreachable("Illegal FP comparison");
}
// Handle all other FP comparisons here.
- return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, MVT::i8));
+ return DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
+ DAG.getConstant(SSECC, MVT::i8));
}
// Break 256-bit integer vector compare into smaller ones.
// We are handling one of the integer comparisons here. Since SSE only has
// GT and EQ comparisons for integer, swapping operands and multiple
// operations may be required for some comparisons.
- unsigned Opc = 0, EQOpc = 0, GTOpc = 0;
+ unsigned Opc = 0;
bool Swap = false, Invert = false, FlipSigns = false;
- switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
- default: break;
- case MVT::i8: EQOpc = X86ISD::PCMPEQB; GTOpc = X86ISD::PCMPGTB; break;
- case MVT::i16: EQOpc = X86ISD::PCMPEQW; GTOpc = X86ISD::PCMPGTW; break;
- case MVT::i32: EQOpc = X86ISD::PCMPEQD; GTOpc = X86ISD::PCMPGTD; break;
- case MVT::i64: EQOpc = X86ISD::PCMPEQQ; GTOpc = X86ISD::PCMPGTQ; break;
- }
-
switch (SetCCOpcode) {
default: break;
case ISD::SETNE: Invert = true;
- case ISD::SETEQ: Opc = EQOpc; break;
+ case ISD::SETEQ: Opc = X86ISD::PCMPEQ; break;
case ISD::SETLT: Swap = true;
- case ISD::SETGT: Opc = GTOpc; break;
+ case ISD::SETGT: Opc = X86ISD::PCMPGT; break;
case ISD::SETGE: Swap = true;
- case ISD::SETLE: Opc = GTOpc; Invert = true; break;
+ case ISD::SETLE: Opc = X86ISD::PCMPGT; Invert = true; break;
case ISD::SETULT: Swap = true;
- case ISD::SETUGT: Opc = GTOpc; FlipSigns = true; break;
+ case ISD::SETUGT: Opc = X86ISD::PCMPGT; FlipSigns = true; break;
case ISD::SETUGE: Swap = true;
- case ISD::SETULE: Opc = GTOpc; FlipSigns = true; Invert = true; break;
+ case ISD::SETULE: Opc = X86ISD::PCMPGT; FlipSigns = true; Invert = true; break;
}
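+  // For example, SETULE (x <=u y) has no direct SSE form: it is emitted as
+  // NOT(PCMPGT(x ^ SignBit, y ^ SignBit)): flip the sign bits to turn the
+  // unsigned compare into a signed one, then invert the greater-than result.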
if (Swap)
std::swap(Op0, Op1);
// Check that the operation in question is available (most are plain SSE2,
// but PCMPGTQ and PCMPEQQ have different requirements).
- if (Opc == X86ISD::PCMPGTQ && !Subtarget->hasSSE42())
+ if (Opc == X86ISD::PCMPGT && VT == MVT::v2i64 && !Subtarget->hasSSE42())
return SDValue();
- if (Opc == X86ISD::PCMPEQQ && !Subtarget->hasSSE41())
+ if (Opc == X86ISD::PCMPEQ && VT == MVT::v2i64 && !Subtarget->hasSSE41())
return SDValue();
// Since SSE has no unsigned integer comparisons, we need to flip the sign
MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
}
+// getTargetVShiftNode - Handle vector element shifts where the shift amount
+// may or may not be a constant. Takes the immediate version of the shift
+// opcode as input.
+static SDValue getTargetVShiftNode(unsigned Opc, DebugLoc dl, EVT VT,
+ SDValue SrcOp, SDValue ShAmt,
+ SelectionDAG &DAG) {
+ assert(ShAmt.getValueType() == MVT::i32 && "ShAmt is not i32");
+
+ if (isa<ConstantSDNode>(ShAmt)) {
+ switch (Opc) {
+ default: llvm_unreachable("Unknown target vector shift node");
+ case X86ISD::VSHLI:
+ case X86ISD::VSRLI:
+ case X86ISD::VSRAI:
+ return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
+ }
+ }
+
+ // Change opcode to non-immediate version
+ switch (Opc) {
+ default: llvm_unreachable("Unknown target vector shift node");
+ case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
+ case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
+ case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
+ }
+
+ // Need to build a vector containing shift amount
+ // The shift amount is 32 bits, but SSE instructions read 64 bits, so fill
+ // the upper 32 bits with 0.
+ SDValue ShOps[4];
+ ShOps[0] = ShAmt;
+ ShOps[1] = DAG.getConstant(0, MVT::i32);
+ ShOps[2] = DAG.getUNDEF(MVT::i32);
+ ShOps[3] = DAG.getUNDEF(MVT::i32);
+ ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, &ShOps[0], 4);
+ ShAmt = DAG.getNode(ISD::BITCAST, dl, VT, ShAmt);
+ return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
+}
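+// Usage sketch (illustrative): a constant amount keeps the immediate form,
+//   getTargetVShiftNode(X86ISD::VSHLI, dl, MVT::v4i32, Src,
+//                       DAG.getConstant(5, MVT::i32), DAG);
+// while a variable i32 amount is widened to <amt, 0, undef, undef> and
+// emitted with the register form (VSHL/VSRL/VSRA).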
+
SDValue
X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const {
DebugLoc dl = Op.getDebugLoc();
case Intrinsic::x86_avx2_psrav_d_256:
return DAG.getNode(ISD::SRA, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2));
+ case Intrinsic::x86_sse2_pcmpeq_b:
+ case Intrinsic::x86_sse2_pcmpeq_w:
+ case Intrinsic::x86_sse2_pcmpeq_d:
+ case Intrinsic::x86_sse41_pcmpeqq:
+ case Intrinsic::x86_avx2_pcmpeq_b:
+ case Intrinsic::x86_avx2_pcmpeq_w:
+ case Intrinsic::x86_avx2_pcmpeq_d:
+ case Intrinsic::x86_avx2_pcmpeq_q:
+ return DAG.getNode(X86ISD::PCMPEQ, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+ case Intrinsic::x86_sse2_pcmpgt_b:
+ case Intrinsic::x86_sse2_pcmpgt_w:
+ case Intrinsic::x86_sse2_pcmpgt_d:
+ case Intrinsic::x86_sse42_pcmpgtq:
+ case Intrinsic::x86_avx2_pcmpgt_b:
+ case Intrinsic::x86_avx2_pcmpgt_w:
+ case Intrinsic::x86_avx2_pcmpgt_d:
+ case Intrinsic::x86_avx2_pcmpgt_q:
+ return DAG.getNode(X86ISD::PCMPGT, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
// ptest and testp intrinsics. The intrinsic these come from are designed to
// return an integer value, not just an instruction so lower it to the ptest
return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
}
- // Fix vector shift instructions where the last operand is a non-immediate
- // i32 value.
- case Intrinsic::x86_avx2_pslli_w:
- case Intrinsic::x86_avx2_pslli_d:
- case Intrinsic::x86_avx2_pslli_q:
- case Intrinsic::x86_avx2_psrli_w:
- case Intrinsic::x86_avx2_psrli_d:
- case Intrinsic::x86_avx2_psrli_q:
- case Intrinsic::x86_avx2_psrai_w:
- case Intrinsic::x86_avx2_psrai_d:
+ // SSE/AVX shift intrinsics
+ case Intrinsic::x86_sse2_psll_w:
+ case Intrinsic::x86_sse2_psll_d:
+ case Intrinsic::x86_sse2_psll_q:
+ case Intrinsic::x86_avx2_psll_w:
+ case Intrinsic::x86_avx2_psll_d:
+ case Intrinsic::x86_avx2_psll_q:
+ return DAG.getNode(X86ISD::VSHL, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+ case Intrinsic::x86_sse2_psrl_w:
+ case Intrinsic::x86_sse2_psrl_d:
+ case Intrinsic::x86_sse2_psrl_q:
+ case Intrinsic::x86_avx2_psrl_w:
+ case Intrinsic::x86_avx2_psrl_d:
+ case Intrinsic::x86_avx2_psrl_q:
+ return DAG.getNode(X86ISD::VSRL, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+ case Intrinsic::x86_sse2_psra_w:
+ case Intrinsic::x86_sse2_psra_d:
+ case Intrinsic::x86_avx2_psra_w:
+ case Intrinsic::x86_avx2_psra_d:
+ return DAG.getNode(X86ISD::VSRA, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
case Intrinsic::x86_sse2_pslli_w:
case Intrinsic::x86_sse2_pslli_d:
case Intrinsic::x86_sse2_pslli_q:
+ case Intrinsic::x86_avx2_pslli_w:
+ case Intrinsic::x86_avx2_pslli_d:
+ case Intrinsic::x86_avx2_pslli_q:
+ return getTargetVShiftNode(X86ISD::VSHLI, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2), DAG);
case Intrinsic::x86_sse2_psrli_w:
case Intrinsic::x86_sse2_psrli_d:
case Intrinsic::x86_sse2_psrli_q:
+ case Intrinsic::x86_avx2_psrli_w:
+ case Intrinsic::x86_avx2_psrli_d:
+ case Intrinsic::x86_avx2_psrli_q:
+ return getTargetVShiftNode(X86ISD::VSRLI, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2), DAG);
case Intrinsic::x86_sse2_psrai_w:
case Intrinsic::x86_sse2_psrai_d:
+ case Intrinsic::x86_avx2_psrai_w:
+ case Intrinsic::x86_avx2_psrai_d:
+ return getTargetVShiftNode(X86ISD::VSRAI, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2), DAG);
+ // Fix vector shift instructions where the last operand is a non-immediate
+ // i32 value.
case Intrinsic::x86_mmx_pslli_w:
case Intrinsic::x86_mmx_pslli_d:
case Intrinsic::x86_mmx_pslli_q:
return SDValue();
unsigned NewIntNo = 0;
- EVT ShAmtVT = MVT::v4i32;
switch (IntNo) {
- case Intrinsic::x86_sse2_pslli_w:
- NewIntNo = Intrinsic::x86_sse2_psll_w;
- break;
- case Intrinsic::x86_sse2_pslli_d:
- NewIntNo = Intrinsic::x86_sse2_psll_d;
- break;
- case Intrinsic::x86_sse2_pslli_q:
- NewIntNo = Intrinsic::x86_sse2_psll_q;
- break;
- case Intrinsic::x86_sse2_psrli_w:
- NewIntNo = Intrinsic::x86_sse2_psrl_w;
+ case Intrinsic::x86_mmx_pslli_w:
+ NewIntNo = Intrinsic::x86_mmx_psll_w;
break;
- case Intrinsic::x86_sse2_psrli_d:
- NewIntNo = Intrinsic::x86_sse2_psrl_d;
+ case Intrinsic::x86_mmx_pslli_d:
+ NewIntNo = Intrinsic::x86_mmx_psll_d;
break;
- case Intrinsic::x86_sse2_psrli_q:
- NewIntNo = Intrinsic::x86_sse2_psrl_q;
+ case Intrinsic::x86_mmx_pslli_q:
+ NewIntNo = Intrinsic::x86_mmx_psll_q;
break;
- case Intrinsic::x86_sse2_psrai_w:
- NewIntNo = Intrinsic::x86_sse2_psra_w;
+ case Intrinsic::x86_mmx_psrli_w:
+ NewIntNo = Intrinsic::x86_mmx_psrl_w;
break;
- case Intrinsic::x86_sse2_psrai_d:
- NewIntNo = Intrinsic::x86_sse2_psra_d;
+ case Intrinsic::x86_mmx_psrli_d:
+ NewIntNo = Intrinsic::x86_mmx_psrl_d;
break;
- case Intrinsic::x86_avx2_pslli_w:
- NewIntNo = Intrinsic::x86_avx2_psll_w;
+ case Intrinsic::x86_mmx_psrli_q:
+ NewIntNo = Intrinsic::x86_mmx_psrl_q;
break;
- case Intrinsic::x86_avx2_pslli_d:
- NewIntNo = Intrinsic::x86_avx2_psll_d;
+ case Intrinsic::x86_mmx_psrai_w:
+ NewIntNo = Intrinsic::x86_mmx_psra_w;
break;
- case Intrinsic::x86_avx2_pslli_q:
- NewIntNo = Intrinsic::x86_avx2_psll_q;
- break;
- case Intrinsic::x86_avx2_psrli_w:
- NewIntNo = Intrinsic::x86_avx2_psrl_w;
- break;
- case Intrinsic::x86_avx2_psrli_d:
- NewIntNo = Intrinsic::x86_avx2_psrl_d;
- break;
- case Intrinsic::x86_avx2_psrli_q:
- NewIntNo = Intrinsic::x86_avx2_psrl_q;
- break;
- case Intrinsic::x86_avx2_psrai_w:
- NewIntNo = Intrinsic::x86_avx2_psra_w;
- break;
- case Intrinsic::x86_avx2_psrai_d:
- NewIntNo = Intrinsic::x86_avx2_psra_d;
- break;
- default: {
- ShAmtVT = MVT::v2i32;
- switch (IntNo) {
- case Intrinsic::x86_mmx_pslli_w:
- NewIntNo = Intrinsic::x86_mmx_psll_w;
- break;
- case Intrinsic::x86_mmx_pslli_d:
- NewIntNo = Intrinsic::x86_mmx_psll_d;
- break;
- case Intrinsic::x86_mmx_pslli_q:
- NewIntNo = Intrinsic::x86_mmx_psll_q;
- break;
- case Intrinsic::x86_mmx_psrli_w:
- NewIntNo = Intrinsic::x86_mmx_psrl_w;
- break;
- case Intrinsic::x86_mmx_psrli_d:
- NewIntNo = Intrinsic::x86_mmx_psrl_d;
- break;
- case Intrinsic::x86_mmx_psrli_q:
- NewIntNo = Intrinsic::x86_mmx_psrl_q;
- break;
- case Intrinsic::x86_mmx_psrai_w:
- NewIntNo = Intrinsic::x86_mmx_psra_w;
- break;
- case Intrinsic::x86_mmx_psrai_d:
- NewIntNo = Intrinsic::x86_mmx_psra_d;
- break;
- default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
- }
+ case Intrinsic::x86_mmx_psrai_d:
+ NewIntNo = Intrinsic::x86_mmx_psra_d;
break;
- }
+ default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
}
// The vector shift intrinsics with scalars uses 32b shift amounts but
// the sse2/mmx shift instructions reads 64 bits. Set the upper 32 bits
// to be zero.
- SDValue ShOps[4];
- ShOps[0] = ShAmt;
- ShOps[1] = DAG.getConstant(0, MVT::i32);
- if (ShAmtVT == MVT::v4i32) {
- ShOps[2] = DAG.getUNDEF(MVT::i32);
- ShOps[3] = DAG.getUNDEF(MVT::i32);
- ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 4);
- } else {
- ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 2);
+ ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, ShAmt,
+ DAG.getConstant(0, MVT::i32));
// FIXME this must be lowered to get rid of the invalid type.
- }
EVT VT = Op.getValueType();
ShAmt = DAG.getNode(ISD::BITCAST, dl, VT, ShAmt);
// AhiBlo = __builtin_ia32_psllqi256( AhiBlo, 32 );
// return AloBlo + AloBhi + AhiBlo;
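+  // This is the standard 64x64->64 decomposition: with A = Ahi*2^32 + Alo and
+  // B = Bhi*2^32 + Blo,
+  //   A*B mod 2^64 == Alo*Blo + ((Alo*Bhi + Ahi*Blo) << 32),
+  // since the Ahi*Bhi term is shifted out entirely.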
- SDValue Ahi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
- DAG.getConstant(Intrinsic::x86_avx2_psrli_q, MVT::i32),
- A, DAG.getConstant(32, MVT::i32));
- SDValue Bhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
- DAG.getConstant(Intrinsic::x86_avx2_psrli_q, MVT::i32),
- B, DAG.getConstant(32, MVT::i32));
+ SDValue Ahi = DAG.getNode(X86ISD::VSRLI, dl, VT, A,
+ DAG.getConstant(32, MVT::i32));
+ SDValue Bhi = DAG.getNode(X86ISD::VSRLI, dl, VT, B,
+ DAG.getConstant(32, MVT::i32));
SDValue AloBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
DAG.getConstant(Intrinsic::x86_avx2_pmulu_dq, MVT::i32),
A, B);
SDValue AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
DAG.getConstant(Intrinsic::x86_avx2_pmulu_dq, MVT::i32),
Ahi, B);
- AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
- DAG.getConstant(Intrinsic::x86_avx2_pslli_q, MVT::i32),
- AloBhi, DAG.getConstant(32, MVT::i32));
- AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
- DAG.getConstant(Intrinsic::x86_avx2_pslli_q, MVT::i32),
- AhiBlo, DAG.getConstant(32, MVT::i32));
+ AloBhi = DAG.getNode(X86ISD::VSHLI, dl, VT, AloBhi,
+ DAG.getConstant(32, MVT::i32));
+ AhiBlo = DAG.getNode(X86ISD::VSHLI, dl, VT, AhiBlo,
+ DAG.getConstant(32, MVT::i32));
SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
Res = DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
return Res;
// AhiBlo = __builtin_ia32_psllqi128( AhiBlo, 32 );
// return AloBlo + AloBhi + AhiBlo;
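+ // (Same Alo/Ahi decomposition as the 256-bit case above, using the
+ // SSE2 form of PMULUDQ.)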
- SDValue Ahi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
- DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
- A, DAG.getConstant(32, MVT::i32));
- SDValue Bhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
- DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
- B, DAG.getConstant(32, MVT::i32));
+ SDValue Ahi = DAG.getNode(X86ISD::VSRLI, dl, VT, A,
+ DAG.getConstant(32, MVT::i32));
+ SDValue Bhi = DAG.getNode(X86ISD::VSRLI, dl, VT, B,
+ DAG.getConstant(32, MVT::i32));
SDValue AloBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
A, B);
SDValue AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
Ahi, B);
- AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
- DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
- AloBhi, DAG.getConstant(32, MVT::i32));
- AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
- DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
- AhiBlo, DAG.getConstant(32, MVT::i32));
+ AloBhi = DAG.getNode(X86ISD::VSHLI, dl, VT, AloBhi,
+ DAG.getConstant(32, MVT::i32));
+ AhiBlo = DAG.getNode(X86ISD::VSHLI, dl, VT, AhiBlo,
+ DAG.getConstant(32, MVT::i32));
SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
Res = DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
return Res;
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) {
uint64_t ShiftAmt = C->getZExtValue();
- if (VT == MVT::v16i8 && Op.getOpcode() == ISD::SHL) {
- // Make a large shift.
- SDValue SHL =
- DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
- DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32),
- R, DAG.getConstant(ShiftAmt, MVT::i32));
- // Zero out the rightmost bits.
- SmallVector<SDValue, 16> V(16, DAG.getConstant(uint8_t(-1U << ShiftAmt),
- MVT::i8));
- return DAG.getNode(ISD::AND, dl, VT, SHL,
- DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16));
+ if (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 ||
+ (Subtarget->hasAVX2() &&
+ (VT == MVT::v4i64 || VT == MVT::v8i32 || VT == MVT::v16i16))) {
+ if (Op.getOpcode() == ISD::SHL)
+ return DAG.getNode(X86ISD::VSHLI, dl, VT, R,
+ DAG.getConstant(ShiftAmt, MVT::i32));
+ if (Op.getOpcode() == ISD::SRL)
+ return DAG.getNode(X86ISD::VSRLI, dl, VT, R,
+ DAG.getConstant(ShiftAmt, MVT::i32));
+ if (Op.getOpcode() == ISD::SRA && VT != MVT::v2i64 && VT != MVT::v4i64)
+ return DAG.getNode(X86ISD::VSRAI, dl, VT, R,
+ DAG.getConstant(ShiftAmt, MVT::i32));
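+ // Note that x86 has no 64-bit arithmetic vector shift, hence the
+ // v2i64/v4i64 exclusion for SRA.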
}
- if (VT == MVT::v2i64 && Op.getOpcode() == ISD::SHL)
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
- DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
- R, DAG.getConstant(ShiftAmt, MVT::i32));
-
- if (VT == MVT::v4i32 && Op.getOpcode() == ISD::SHL)
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
- DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32),
- R, DAG.getConstant(ShiftAmt, MVT::i32));
-
- if (VT == MVT::v8i16 && Op.getOpcode() == ISD::SHL)
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
- DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32),
- R, DAG.getConstant(ShiftAmt, MVT::i32));
-
- if (VT == MVT::v16i8 && Op.getOpcode() == ISD::SRL) {
- // Make a large shift.
- SDValue SRL =
- DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
- DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32),
- R, DAG.getConstant(ShiftAmt, MVT::i32));
- // Zero out the leftmost bits.
- SmallVector<SDValue, 16> V(16, DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
- MVT::i8));
- return DAG.getNode(ISD::AND, dl, VT, SRL,
- DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16));
- }
-
- if (VT == MVT::v2i64 && Op.getOpcode() == ISD::SRL)
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
- DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
- R, DAG.getConstant(ShiftAmt, MVT::i32));
-
- if (VT == MVT::v4i32 && Op.getOpcode() == ISD::SRL)
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
- DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32),
- R, DAG.getConstant(ShiftAmt, MVT::i32));
-
- if (VT == MVT::v8i16 && Op.getOpcode() == ISD::SRL)
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
- DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32),
- R, DAG.getConstant(ShiftAmt, MVT::i32));
-
- if (VT == MVT::v4i32 && Op.getOpcode() == ISD::SRA)
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
- DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32),
- R, DAG.getConstant(ShiftAmt, MVT::i32));
-
- if (VT == MVT::v8i16 && Op.getOpcode() == ISD::SRA)
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
- DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32),
- R, DAG.getConstant(ShiftAmt, MVT::i32));
-
- if (VT == MVT::v16i8 && Op.getOpcode() == ISD::SRA) {
- if (ShiftAmt == 7) {
- // R s>> 7 === R s< 0
- SDValue Zeros = getZeroVector(VT, true /* HasSSE2 */, DAG, dl);
- return DAG.getNode(X86ISD::PCMPGTB, dl, VT, Zeros, R);
+ if (VT == MVT::v16i8) {
+ if (Op.getOpcode() == ISD::SHL) {
+ // Make a large shift.
+ SDValue SHL = DAG.getNode(X86ISD::VSHLI, dl, MVT::v8i16, R,
+ DAG.getConstant(ShiftAmt, MVT::i32));
+ SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
+ // Zero out the rightmost bits.
+ SmallVector<SDValue, 16> V(16,
+ DAG.getConstant(uint8_t(-1U << ShiftAmt),
+ MVT::i8));
+ return DAG.getNode(ISD::AND, dl, VT, SHL,
+ DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16));
+ }
+ if (Op.getOpcode() == ISD::SRL) {
+ // Make a large shift.
+ SDValue SRL = DAG.getNode(X86ISD::VSRLI, dl, MVT::v8i16, R,
+ DAG.getConstant(ShiftAmt, MVT::i32));
+ SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
+ // Zero out the leftmost bits.
+ SmallVector<SDValue, 16> V(16,
+ DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
+ MVT::i8));
+ return DAG.getNode(ISD::AND, dl, VT, SRL,
+ DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16));
}
+ if (Op.getOpcode() == ISD::SRA) {
+ if (ShiftAmt == 7) {
+ // R s>> 7 === R s< 0
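+ // (e.g. 0x80 s>> 7 = 0xFF, and pcmpgt(0, 0x80) is likewise all ones)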
+ SDValue Zeros = getZeroVector(VT, true /* HasSSE2 */,
+ false /* HasAVX2 */, DAG, dl);
+ return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
+ }
- // R s>> a === ((R u>> a) ^ m) - m
- SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
- SmallVector<SDValue, 16> V(16, DAG.getConstant(128 >> ShiftAmt,
- MVT::i8));
- SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16);
- Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
- Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
- return Res;
+ // R s>> a === ((R u>> a) ^ m) - m
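+ // with m = 0x80 u>> a. E.g. for a = 2 and R = 0x84 (-124):
+ // (0x84 u>> 2) = 0x21, m = 0x20, and (0x21 ^ 0x20) - 0x20 = 0xE1 = -31.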
+ SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
+ SmallVector<SDValue, 16> V(16, DAG.getConstant(128 >> ShiftAmt,
+ MVT::i8));
+ SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16);
+ Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
+ Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
+ return Res;
+ }
}
if (Subtarget->hasAVX2() && VT == MVT::v32i8) {
if (Op.getOpcode() == ISD::SHL) {
// Make a large shift.
- SDValue SHL =
- DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
- DAG.getConstant(Intrinsic::x86_avx2_pslli_w, MVT::i32),
- R, DAG.getConstant(ShiftAmt, MVT::i32));
+ SDValue SHL = DAG.getNode(X86ISD::VSHLI, dl, MVT::v16i16, R,
+ DAG.getConstant(ShiftAmt, MVT::i32));
+ SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
// Zero out the rightmost bits.
- SmallVector<SDValue, 32> V(32, DAG.getConstant(uint8_t(-1U << ShiftAmt),
- MVT::i8));
+ SmallVector<SDValue, 32> V(32,
+ DAG.getConstant(uint8_t(-1U << ShiftAmt),
+ MVT::i8));
return DAG.getNode(ISD::AND, dl, VT, SHL,
DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32));
}
if (Op.getOpcode() == ISD::SRL) {
// Make a large shift.
- SDValue SRL =
- DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
- DAG.getConstant(Intrinsic::x86_avx2_psrli_w, MVT::i32),
- R, DAG.getConstant(ShiftAmt, MVT::i32));
+ SDValue SRL = DAG.getNode(X86ISD::VSRLI, dl, MVT::v16i16, R,
+ DAG.getConstant(ShiftAmt, MVT::i32));
+ SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
// Zero out the leftmost bits.
- SmallVector<SDValue, 32> V(32, DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
- MVT::i8));
+ SmallVector<SDValue, 32> V(32,
+ DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
+ MVT::i8));
return DAG.getNode(ISD::AND, dl, VT, SRL,
DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32));
}
if (Op.getOpcode() == ISD::SRA) {
if (ShiftAmt == 7) {
// R s>> 7 === R s< 0
- SDValue Zeros = getZeroVector(VT, true /* HasSSE2 */, DAG, dl);
- return DAG.getNode(X86ISD::PCMPGTB, dl, VT, Zeros, R);
+ SDValue Zeros = getZeroVector(VT, true /* HasSSE2 */,
+ true /* HasAVX2 */, DAG, dl);
+ return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
}
// R s>> a === ((R u>> a) ^ m) - m
// Lower SHL with variable shift amount.
if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
- Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
- DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32),
- Op.getOperand(1), DAG.getConstant(23, MVT::i32));
+ Op = DAG.getNode(X86ISD::VSHLI, dl, VT, Op.getOperand(1),
+ DAG.getConstant(23, MVT::i32));
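+ // Each 5-bit amount now sits in a float exponent field: biasing by
+ // 0x3f800000 (the bits of 1.0f) and converting float->int turns each
+ // lane into 2^amt, so the MUL below performs the variable shift.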
ConstantInt *CI = ConstantInt::get(*Context, APInt(32, 0x3f800000U));
return DAG.getNode(ISD::MUL, dl, VT, Op, R);
}
if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) {
- assert((Subtarget->hasSSE2() || Subtarget->hasAVX()) &&
- "Need SSE2 for pslli/pcmpeq.");
+ assert(Subtarget->hasSSE2() && "Need SSE2 for pslli/pcmpeq.");
// a = a << 5;
- Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
- DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32),
- Op.getOperand(1), DAG.getConstant(5, MVT::i32));
+ Op = DAG.getNode(X86ISD::VSHLI, dl, MVT::v8i16, Op.getOperand(1),
+ DAG.getConstant(5, MVT::i32));
+ Op = DAG.getNode(ISD::BITCAST, dl, VT, Op);
// Turn 'a' into a mask suitable for VSELECT
SDValue VSelM = DAG.getConstant(0x80, VT);
SDValue OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
- OpVSel = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
- DAG.getConstant(Intrinsic::x86_sse2_pcmpeq_b, MVT::i32),
- OpVSel, VSelM);
+ OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
SDValue CM1 = DAG.getConstant(0x0f, VT);
SDValue CM2 = DAG.getConstant(0x3f, VT);
// r = VSELECT(r, psllw(r & (char16)15, 4), a);
SDValue M = DAG.getNode(ISD::AND, dl, VT, R, CM1);
- M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
- DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M,
- DAG.getConstant(4, MVT::i32));
+ M = getTargetVShiftNode(X86ISD::VSHLI, dl, MVT::v8i16, M,
+ DAG.getConstant(4, MVT::i32), DAG);
+ M = DAG.getNode(ISD::BITCAST, dl, VT, M);
R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);
// a += a
Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
- OpVSel = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
- DAG.getConstant(Intrinsic::x86_sse2_pcmpeq_b, MVT::i32),
- OpVSel, VSelM);
+ OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
// r = VSELECT(r, psllw(r & (char16)63, 2), a);
M = DAG.getNode(ISD::AND, dl, VT, R, CM2);
- M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
- DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M,
- DAG.getConstant(2, MVT::i32));
+ M = getTargetVShiftNode(X86ISD::VSHLI, dl, MVT::v8i16, M,
+ DAG.getConstant(2, MVT::i32), DAG);
+ M = DAG.getNode(ISD::BITCAST, dl, VT, M);
R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);
// a += a
Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
- OpVSel = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
- DAG.getConstant(Intrinsic::x86_sse2_pcmpeq_b, MVT::i32),
- OpVSel, VSelM);
+ OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
// return VSELECT(r, r+r, a);
R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel,
// Decompose 256-bit shifts into smaller 128-bit shifts.
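+ // For example, a v8i32 shift becomes two v4i32 shifts on the extracted
+ // 128-bit halves, recombined with CONCAT_VECTORS.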
if (VT.getSizeInBits() == 256) {
- int NumElems = VT.getVectorNumElements();
+ unsigned NumElems = VT.getVectorNumElements();
MVT EltVT = VT.getVectorElementType().getSimpleVT();
EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
// Constant shift amount
SmallVector<SDValue, 4> Amt1Csts;
SmallVector<SDValue, 4> Amt2Csts;
- for (int i = 0; i < NumElems/2; ++i)
+ for (unsigned i = 0; i != NumElems/2; ++i)
Amt1Csts.push_back(Amt->getOperand(i));
- for (int i = NumElems/2; i < NumElems; ++i)
+ for (unsigned i = NumElems/2; i != NumElems; ++i)
Amt2Csts.push_back(Amt->getOperand(i));
Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT,
EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
EVT VT = Op.getValueType();
- if (Subtarget->hasSSE2() && VT.isVector()) {
- unsigned BitsDiff = VT.getScalarType().getSizeInBits() -
- ExtraVT.getScalarType().getSizeInBits();
- SDValue ShAmt = DAG.getConstant(BitsDiff, MVT::i32);
+ if (!Subtarget->hasSSE2() || !VT.isVector())
+ return SDValue();
+
+ unsigned BitsDiff = VT.getScalarType().getSizeInBits() -
+ ExtraVT.getScalarType().getSizeInBits();
+ SDValue ShAmt = DAG.getConstant(BitsDiff, MVT::i32);
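+ // sign_extend_inreg becomes a pair of shifts: shift left by BitsDiff to
+ // move the source sign bit to the top, then arithmetic shift right by
+ // the same amount to replicate it.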
- unsigned SHLIntrinsicsID = 0;
- unsigned SRAIntrinsicsID = 0;
- switch (VT.getSimpleVT().SimpleTy) {
- default:
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: return SDValue();
+ case MVT::v8i32:
+ case MVT::v16i16:
+ if (!Subtarget->hasAVX())
return SDValue();
- case MVT::v4i32:
- SHLIntrinsicsID = Intrinsic::x86_sse2_pslli_d;
- SRAIntrinsicsID = Intrinsic::x86_sse2_psrai_d;
- break;
- case MVT::v8i16:
- SHLIntrinsicsID = Intrinsic::x86_sse2_pslli_w;
- SRAIntrinsicsID = Intrinsic::x86_sse2_psrai_w;
- break;
- case MVT::v8i32:
- case MVT::v16i16:
- if (!Subtarget->hasAVX())
- return SDValue();
- if (!Subtarget->hasAVX2()) {
- // needs to be split
- int NumElems = VT.getVectorNumElements();
- SDValue Idx0 = DAG.getConstant(0, MVT::i32);
- SDValue Idx1 = DAG.getConstant(NumElems/2, MVT::i32);
-
- // Extract the LHS vectors
- SDValue LHS = Op.getOperand(0);
- SDValue LHS1 = Extract128BitVector(LHS, Idx0, DAG, dl);
- SDValue LHS2 = Extract128BitVector(LHS, Idx1, DAG, dl);
-
- MVT EltVT = VT.getVectorElementType().getSimpleVT();
- EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
-
- EVT ExtraEltVT = ExtraVT.getVectorElementType();
- int ExtraNumElems = ExtraVT.getVectorNumElements();
- ExtraVT = EVT::getVectorVT(*DAG.getContext(), ExtraEltVT,
- ExtraNumElems/2);
- SDValue Extra = DAG.getValueType(ExtraVT);
-
- LHS1 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, Extra);
- LHS2 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, Extra);
-
- return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2);;
- }
- if (VT == MVT::v8i32) {
- SHLIntrinsicsID = Intrinsic::x86_avx2_pslli_d;
- SRAIntrinsicsID = Intrinsic::x86_avx2_psrai_d;
- } else {
- SHLIntrinsicsID = Intrinsic::x86_avx2_pslli_w;
- SRAIntrinsicsID = Intrinsic::x86_avx2_psrai_w;
- }
+ if (!Subtarget->hasAVX2()) {
+ // Without AVX2, the 256-bit operation must be split into 128-bit halves.
+ int NumElems = VT.getVectorNumElements();
+ SDValue Idx0 = DAG.getConstant(0, MVT::i32);
+ SDValue Idx1 = DAG.getConstant(NumElems/2, MVT::i32);
+
+ // Extract the LHS vectors
+ SDValue LHS = Op.getOperand(0);
+ SDValue LHS1 = Extract128BitVector(LHS, Idx0, DAG, dl);
+ SDValue LHS2 = Extract128BitVector(LHS, Idx1, DAG, dl);
+
+ MVT EltVT = VT.getVectorElementType().getSimpleVT();
+ EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
+
+ EVT ExtraEltVT = ExtraVT.getVectorElementType();
+ int ExtraNumElems = ExtraVT.getVectorNumElements();
+ ExtraVT = EVT::getVectorVT(*DAG.getContext(), ExtraEltVT,
+ ExtraNumElems/2);
+ SDValue Extra = DAG.getValueType(ExtraVT);
+
+ LHS1 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, Extra);
+ LHS2 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, Extra);
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2);
+ }
+ // fall through
+ case MVT::v4i32:
+ case MVT::v8i16: {
+ SDValue Tmp1 = getTargetVShiftNode(X86ISD::VSHLI, dl, VT,
+ Op.getOperand(0), ShAmt, DAG);
+ return getTargetVShiftNode(X86ISD::VSRAI, dl, VT, Tmp1, ShAmt, DAG);
}
-
- SDValue Tmp1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
- DAG.getConstant(SHLIntrinsicsID, MVT::i32),
- Op.getOperand(0), ShAmt);
-
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
- DAG.getConstant(SRAIntrinsicsID, MVT::i32),
- Tmp1, ShAmt);
}
-
- return SDValue();
}
case X86ISD::ATOMNAND64_DAG: return "X86ISD::ATOMNAND64_DAG";
case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
+ case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ";
+ case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ";
case X86ISD::VSHL: return "X86ISD::VSHL";
case X86ISD::VSRL: return "X86ISD::VSRL";
- case X86ISD::CMPPD: return "X86ISD::CMPPD";
- case X86ISD::CMPPS: return "X86ISD::CMPPS";
- case X86ISD::PCMPEQB: return "X86ISD::PCMPEQB";
- case X86ISD::PCMPEQW: return "X86ISD::PCMPEQW";
- case X86ISD::PCMPEQD: return "X86ISD::PCMPEQD";
- case X86ISD::PCMPEQQ: return "X86ISD::PCMPEQQ";
- case X86ISD::PCMPGTB: return "X86ISD::PCMPGTB";
- case X86ISD::PCMPGTW: return "X86ISD::PCMPGTW";
- case X86ISD::PCMPGTD: return "X86ISD::PCMPGTD";
- case X86ISD::PCMPGTQ: return "X86ISD::PCMPGTQ";
+ case X86ISD::VSRA: return "X86ISD::VSRA";
+ case X86ISD::VSHLI: return "X86ISD::VSHLI";
+ case X86ISD::VSRLI: return "X86ISD::VSRLI";
+ case X86ISD::VSRAI: return "X86ISD::VSRAI";
+ case X86ISD::CMPP: return "X86ISD::CMPP";
+ case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ";
+ case X86ISD::PCMPGT: return "X86ISD::PCMPGT";
case X86ISD::ADD: return "X86ISD::ADD";
case X86ISD::SUB: return "X86ISD::SUB";
case X86ISD::ADC: return "X86ISD::ADC";
case X86ISD::PALIGN: return "X86ISD::PALIGN";
case X86ISD::PSHUFD: return "X86ISD::PSHUFD";
case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW";
- case X86ISD::PSHUFHW_LD: return "X86ISD::PSHUFHW_LD";
case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW";
- case X86ISD::PSHUFLW_LD: return "X86ISD::PSHUFLW_LD";
case X86ISD::SHUFP: return "X86ISD::SHUFP";
case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS";
case X86ISD::MOVLHPD: return "X86ISD::MOVLHPD";
case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP";
case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP";
case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP";
- case X86ISD::MOVSHDUP_LD: return "X86ISD::MOVSHDUP_LD";
- case X86ISD::MOVSLDUP_LD: return "X86ISD::MOVSLDUP_LD";
case X86ISD::MOVSD: return "X86ISD::MOVSD";
case X86ISD::MOVSS: return "X86ISD::MOVSS";
case X86ISD::UNPCKL: return "X86ISD::UNPCKL";
return (VT.getVectorNumElements() == 2 ||
ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
isMOVLMask(M, VT) ||
- isSHUFPMask(M, VT) ||
+ isSHUFPMask(M, VT, Subtarget->hasAVX()) ||
isPSHUFDMask(M, VT) ||
isPSHUFHWMask(M, VT) ||
isPSHUFLWMask(M, VT) ||
- isPALIGNRMask(M, VT, Subtarget->hasSSSE3()) ||
+ isPALIGNRMask(M, VT, Subtarget) ||
isUNPCKLMask(M, VT, Subtarget->hasAVX2()) ||
isUNPCKHMask(M, VT, Subtarget->hasAVX2()) ||
isUNPCKL_v_undef_Mask(M, VT, Subtarget->hasAVX2()) ||
if (NumElts == 4 && VT.getSizeInBits() == 128) {
return (isMOVLMask(Mask, VT) ||
isCommutedMOVLMask(Mask, VT, true) ||
- isSHUFPMask(Mask, VT) ||
- isSHUFPMask(Mask, VT, /* Commuted */ true));
+ isSHUFPMask(Mask, VT, Subtarget->hasAVX()) ||
+ isSHUFPMask(Mask, VT, Subtarget->hasAVX(), /* Commuted */ true));
}
return false;
}
BuildMI(BB, DL, TII->get(Is64Bit ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
.addReg(tmpSPVReg).addReg(sizeVReg);
BuildMI(BB, DL, TII->get(Is64Bit ? X86::CMP64mr:X86::CMP32mr))
- .addReg(0).addImm(0).addReg(0).addImm(TlsOffset).addReg(TlsReg)
+ .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
.addReg(SPLimitVReg);
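+ // (x86 memory operands are base, scale, index, disp, segment; the scale
+ // must be 1, 2, 4 or 8, so the previous 0 was invalid.)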
BuildMI(BB, DL, TII->get(X86::JG_4)).addMBB(mallocMBB);
/// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors.
static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI) {
+ TargetLowering::DAGCombinerInfo &DCI,
+ bool HasAVX2) {
DebugLoc dl = N->getDebugLoc();
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
SDValue V1 = SVOp->getOperand(0);
// Emit a zeroed vector and insert the desired subvector on its
// first half.
- SDValue Zeros = getZeroVector(VT, true /* HasSSE2 */, DAG, dl);
+ SDValue Zeros = getZeroVector(VT, true /* HasSSE2 */, HasAVX2, DAG, dl);
SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0),
DAG.getConstant(0, MVT::i32), DAG, dl);
return DCI.CombineTo(N, InsV);
// Combine 256-bit vector shuffles. This is only profitable when in AVX mode
if (Subtarget->hasAVX() && VT.getSizeInBits() == 256 &&
N->getOpcode() == ISD::VECTOR_SHUFFLE)
- return PerformShuffleCombine256(N, DAG, DCI);
+ return PerformShuffleCombine256(N, DAG, DCI, Subtarget->hasAVX2());
// Only handle 128 wide vector from here on.
if (VT.getSizeInBits() != 128)
/// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT
/// nodes.
static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget *Subtarget) {
DebugLoc DL = N->getDebugLoc();
SDValue Cond = N->getOperand(0);
}
}
+ // If we know that this node is legal then we know that it is going to be
+ // matched by one of the SSE/AVX BLEND instructions. These instructions only
+ // depend on the highest bit in each word. Try to use SimplifyDemandedBits
+ // to simplify previous instructions.
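+ // For example, if the condition was computed as (sra X, 31), only its
+ // sign bit is demanded here, so the sra can often be simplified away.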
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
+ !DCI.isBeforeLegalize() &&
+ TLI.isOperationLegal(ISD::VSELECT, VT)) {
+ unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits();
+ assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
+ APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
+
+ APInt KnownZero, KnownOne;
+ TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
+ DCI.isBeforeLegalizeOps());
+ if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) ||
+ TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne, TLO))
+ DCI.CommitTargetLoweringOpt(TLO);
+ }
+
return SDValue();
}
BaseShAmt = Arg;
break;
}
+ // Handle the case where the build_vector is all undef
+ // FIXME: Should DAG allow this?
+ if (i == NumElts)
+ return SDValue();
+
for (; i != NumElts; ++i) {
SDValue Arg = ShAmtOp.getOperand(i);
if (Arg.getOpcode() == ISD::UNDEF) continue;
switch (N->getOpcode()) {
default:
llvm_unreachable("Unknown shift opcode!");
- break;
case ISD::SHL:
- if (VT == MVT::v2i64)
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
- DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
- ValOp, BaseShAmt);
- if (VT == MVT::v4i32)
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
- DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32),
- ValOp, BaseShAmt);
- if (VT == MVT::v8i16)
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
- DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32),
- ValOp, BaseShAmt);
- if (VT == MVT::v4i64)
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
- DAG.getConstant(Intrinsic::x86_avx2_pslli_q, MVT::i32),
- ValOp, BaseShAmt);
- if (VT == MVT::v8i32)
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
- DAG.getConstant(Intrinsic::x86_avx2_pslli_d, MVT::i32),
- ValOp, BaseShAmt);
- if (VT == MVT::v16i16)
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
- DAG.getConstant(Intrinsic::x86_avx2_pslli_w, MVT::i32),
- ValOp, BaseShAmt);
- break;
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: return SDValue();
+ case MVT::v2i64:
+ case MVT::v4i32:
+ case MVT::v8i16:
+ case MVT::v4i64:
+ case MVT::v8i32:
+ case MVT::v16i16:
+ return getTargetVShiftNode(X86ISD::VSHLI, DL, VT, ValOp, BaseShAmt, DAG);
+ }
case ISD::SRA:
- if (VT == MVT::v4i32)
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
- DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32),
- ValOp, BaseShAmt);
- if (VT == MVT::v8i16)
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
- DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32),
- ValOp, BaseShAmt);
- if (VT == MVT::v8i32)
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
- DAG.getConstant(Intrinsic::x86_avx2_psrai_d, MVT::i32),
- ValOp, BaseShAmt);
- if (VT == MVT::v16i16)
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
- DAG.getConstant(Intrinsic::x86_avx2_psrai_w, MVT::i32),
- ValOp, BaseShAmt);
- break;
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: return SDValue();
+ case MVT::v4i32:
+ case MVT::v8i16:
+ case MVT::v8i32:
+ case MVT::v16i16:
+ return getTargetVShiftNode(X86ISD::VSRAI, DL, VT, ValOp, BaseShAmt, DAG);
+ }
case ISD::SRL:
- if (VT == MVT::v2i64)
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
- DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
- ValOp, BaseShAmt);
- if (VT == MVT::v4i32)
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
- DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32),
- ValOp, BaseShAmt);
- if (VT == MVT::v8i16)
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
- DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32),
- ValOp, BaseShAmt);
- if (VT == MVT::v4i64)
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
- DAG.getConstant(Intrinsic::x86_avx2_psrli_q, MVT::i32),
- ValOp, BaseShAmt);
- if (VT == MVT::v8i32)
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
- DAG.getConstant(Intrinsic::x86_avx2_psrli_d, MVT::i32),
- ValOp, BaseShAmt);
- if (VT == MVT::v16i16)
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
- DAG.getConstant(Intrinsic::x86_avx2_psrli_w, MVT::i32),
- ValOp, BaseShAmt);
- break;
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: return SDValue();
+ case MVT::v2i64:
+ case MVT::v4i32:
+ case MVT::v8i16:
+ case MVT::v4i64:
+ case MVT::v8i32:
+ case MVT::v16i16:
+ return getTargetVShiftNode(X86ISD::VSRLI, DL, VT, ValOp, BaseShAmt, DAG);
+ }
}
- return SDValue();
}
// Canonicalize pandn to RHS
if (N0.getOpcode() == X86ISD::ANDNP)
std::swap(N0, N1);
- // or (and (m, x), (pandn m, y))
+ // or (and (m, y), (pandn m, x))
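+ // With m an all-ones/all-zeros lane mask, this selects y where m is set
+ // and x where it is clear, i.e. a lane select that the code below maps
+ // to PSIGN or PBLENDVB.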
if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::ANDNP) {
SDValue Mask = N1.getOperand(0);
SDValue X = N1.getOperand(1);
Mask = Mask.getOperand(0);
EVT MaskVT = Mask.getValueType();
- // Validate that the Mask operand is a vector sra node. The sra node
- // will be an intrinsic.
- if (Mask.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
- return SDValue();
-
+ // Validate that the Mask operand is a vector sra node.
// FIXME: What to do for bytes, since there is a psignb/pblendvb but
// no psrai.b?
- switch (cast<ConstantSDNode>(Mask.getOperand(0))->getZExtValue()) {
- case Intrinsic::x86_sse2_psrai_w:
- case Intrinsic::x86_sse2_psrai_d:
- case Intrinsic::x86_avx2_psrai_w:
- case Intrinsic::x86_avx2_psrai_d:
- break;
- default: return SDValue();
- }
+ if (Mask.getOpcode() != X86ISD::VSRAI)
+ return SDValue();
// Check that the SRA is all signbits.
- SDValue SraC = Mask.getOperand(2);
+ SDValue SraC = Mask.getOperand(1);
unsigned SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue();
unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits();
if ((SraAmt + 1) != EltBits)
Y = Y.getOperand(0);
if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X &&
ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) &&
- X.getValueType() == MaskVT && X.getValueType() == Y.getValueType() &&
- (EltBits == 8 || EltBits == 16 || EltBits == 32)) {
- SDValue Sign = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X,
- Mask.getOperand(1));
- return DAG.getNode(ISD::BITCAST, DL, VT, Sign);
+ X.getValueType() == MaskVT && Y.getValueType() == MaskVT) {
+ assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
+ "Unsupported VT for PSIGN");
+ Mask = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X, Mask.getOperand(0));
+ return DAG.getNode(ISD::BITCAST, DL, VT, Mask);
}
// PBLENDVB only available on SSE 4.1
if (!Subtarget->hasSSE41())
A = LHS.getOperand(0);
if (LHS.getOperand(1).getOpcode() != ISD::UNDEF)
B = LHS.getOperand(1);
- cast<ShuffleVectorSDNode>(LHS.getNode())->getMask(LMask);
+ ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
+ std::copy(Mask.begin(), Mask.end(), LMask.begin());
} else {
if (LHS.getOpcode() != ISD::UNDEF)
A = LHS;
C = RHS.getOperand(0);
if (RHS.getOperand(1).getOpcode() != ISD::UNDEF)
D = RHS.getOperand(1);
- cast<ShuffleVectorSDNode>(RHS.getNode())->getMask(RMask);
+ ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
+ std::copy(Mask.begin(), Mask.end(), RMask.begin());
} else {
if (RHS.getOpcode() != ISD::UNDEF)
C = RHS;
// Try to synthesize horizontal adds from adds of shuffles.
if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
- (Subtarget->hasAVX2() && (VT == MVT::v16i16 || MVT::v8i32))) &&
+ (Subtarget->hasAVX2() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
isHorizontalBinOp(Op0, Op1, true))
return DAG.getNode(X86ISD::HADD, N->getDebugLoc(), VT, Op0, Op1);
case ISD::EXTRACT_VECTOR_ELT:
return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, *this);
case ISD::VSELECT:
- case ISD::SELECT: return PerformSELECTCombine(N, DAG, Subtarget);
+ case ISD::SELECT: return PerformSELECTCombine(N, DAG, DCI, Subtarget);
case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI);
case ISD::ADD: return PerformAddCombine(N, DAG, Subtarget);
case ISD::SUB: return PerformSubCombine(N, DAG, Subtarget);