Remove an unused X86ISD node type.

[oota-llvm.git] / lib / Target / X86 / X86ISelLowering.cpp
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index d9bd8fe156499061a8e63b78fb649ab48e28c185..b79e71cd5d7435108014e84722e5ce906613dc40 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -378,6 +378,10 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
    setOperationAction(ISD::FREM             , MVT::f80  , Expand);
    setOperationAction(ISD::FLT_ROUNDS_      , MVT::i32  , Custom);
  
+  setOperationAction(ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , Expand);
+  setOperationAction(ISD::CTTZ_ZERO_UNDEF  , MVT::i16  , Expand);
+  setOperationAction(ISD::CTTZ_ZERO_UNDEF  , MVT::i32  , Expand);
+  setOperationAction(ISD::CTTZ_ZERO_UNDEF  , MVT::i64  , Expand);
    if (Subtarget->hasBMI()) {
      setOperationAction(ISD::CTTZ           , MVT::i8   , Promote);
    } else {
@@ -388,6 +392,10 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
        setOperationAction(ISD::CTTZ         , MVT::i64  , Custom);
    }
  
+  setOperationAction(ISD::CTLZ_ZERO_UNDEF  , MVT::i8   , Expand);
+  setOperationAction(ISD::CTLZ_ZERO_UNDEF  , MVT::i16  , Expand);
+  setOperationAction(ISD::CTLZ_ZERO_UNDEF  , MVT::i32  , Expand);
+  setOperationAction(ISD::CTLZ_ZERO_UNDEF  , MVT::i64  , Expand);
    if (Subtarget->hasLZCNT()) {
      setOperationAction(ISD::CTLZ           , MVT::i8   , Promote);
    } else {
@@ -663,6 +671,11 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
        setOperationAction(ISD::FCOS           , MVT::f80  , Expand);
      }
  
+    setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
+    setOperationAction(ISD::FCEIL,  MVT::f80, Expand);
+    setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
+    setOperationAction(ISD::FRINT,  MVT::f80, Expand);
+    setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
      setOperationAction(ISD::FMA, MVT::f80, Expand);
    }
  
@@ -714,7 +727,9 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
      setOperationAction(ISD::FPOW, (MVT::SimpleValueType)VT, Expand);
      setOperationAction(ISD::CTPOP, (MVT::SimpleValueType)VT, Expand);
      setOperationAction(ISD::CTTZ, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::CTTZ_ZERO_UNDEF, (MVT::SimpleValueType)VT, Expand);
      setOperationAction(ISD::CTLZ, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::CTLZ_ZERO_UNDEF, (MVT::SimpleValueType)VT, Expand);
      setOperationAction(ISD::SHL, (MVT::SimpleValueType)VT, Expand);
      setOperationAction(ISD::SRA, (MVT::SimpleValueType)VT, Expand);
      setOperationAction(ISD::SRL, (MVT::SimpleValueType)VT, Expand);
@@ -1211,10 +1226,10 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
    maxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
    maxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
    maxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
-  setPrefLoopAlignment(16);
+  setPrefLoopAlignment(4); // 2^4 bytes.
    benefitFromCodePlacementOpt = true;
  
-  setPrefFunctionAlignment(4);
+  setPrefFunctionAlignment(4); // 2^4 bytes.
  }
  
  
@@ -2851,10 +2866,8 @@ static bool isTargetShuffle(unsigned Opcode) {
    case X86ISD::MOVDDUP:
    case X86ISD::MOVSS:
    case X86ISD::MOVSD:
-  case X86ISD::UNPCKLP:
-  case X86ISD::PUNPCKL:
-  case X86ISD::UNPCKHP:
-  case X86ISD::PUNPCKH:
+  case X86ISD::UNPCKL:
+  case X86ISD::UNPCKH:
    case X86ISD::VPERMILP:
    case X86ISD::VPERM2X128:
      return true;
@@ -2914,10 +2927,8 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
    case X86ISD::MOVLPD:
    case X86ISD::MOVSS:
    case X86ISD::MOVSD:
-  case X86ISD::UNPCKLP:
-  case X86ISD::PUNPCKL:
-  case X86ISD::UNPCKHP:
-  case X86ISD::PUNPCKH:
+  case X86ISD::UNPCKL:
+  case X86ISD::UNPCKH:
      return DAG.getNode(Opc, dl, VT, V1, V2);
    }
    return SDValue();
@@ -3217,7 +3228,7 @@ bool X86::isPSHUFLWMask(ShuffleVectorSDNode *N) {
  static bool isPALIGNRMask(const SmallVectorImpl<int> &Mask, EVT VT,
                            bool hasSSSE3OrAVX) {
    int i, e = VT.getVectorNumElements();
-  if (VT.getSizeInBits() != 128 && VT.getSizeInBits() != 64)
+  if (VT.getSizeInBits() != 128)
      return false;
  
    // Do not handle v2i64 / v2f64 shuffles with palignr.
@@ -3251,83 +3262,8 @@ static bool isPALIGNRMask(const SmallVectorImpl<int> &Mask, EVT VT,
  /// specifies a shuffle of elements that is suitable for input to 256-bit
  /// VSHUFPSY.
  static bool isVSHUFPYMask(const SmallVectorImpl<int> &Mask, EVT VT,
-                          bool HasAVX) {
-  int NumElems = VT.getVectorNumElements();
-
-  if (!HasAVX || VT.getSizeInBits() != 256)
-    return false;
-
-  if (NumElems != 4 && NumElems != 8)
-    return false;
-
-  // VSHUFPSY divides the resulting vector into 4 chunks.
-  // The sources are also splitted into 4 chunks, and each destination
-  // chunk must come from a different source chunk.
-  //
-  //  SRC1 =>   X7    X6    X5    X4    X3    X2    X1    X0
-  //  SRC2 =>   Y7    Y6    Y5    Y4    Y3    Y2    Y1    Y9
-  //
-  //  DST  =>  Y7..Y4,   Y7..Y4,   X7..X4,   X7..X4,
-  //           Y3..Y0,   Y3..Y0,   X3..X0,   X3..X0
-  //
-  // VSHUFPDY divides the resulting vector into 4 chunks.
-  // The sources are also splitted into 4 chunks, and each destination
-  // chunk must come from a different source chunk.
-  //
-  //  SRC1 =>      X3       X2       X1       X0
-  //  SRC2 =>      Y3       Y2       Y1       Y0
-  //
-  //  DST  =>  Y3..Y2,  X3..X2,  Y1..Y0,  X1..X0
-  //
-  int QuarterSize = NumElems/4;
-  int HalfSize = QuarterSize*2;
-  for (int i = 0; i < QuarterSize; ++i)
-    if (!isUndefOrInRange(Mask[i], 0, HalfSize))
-      return false;
-  for (int i = QuarterSize; i < QuarterSize*2; ++i)
-    if (!isUndefOrInRange(Mask[i], NumElems, NumElems+HalfSize))
-      return false;
-
-  // For VSHUFPSY, the mask of the second half must be the same as the first
-  // but with the appropriate offsets. This works in the same way as
-  // VPERMILPS works with masks.
-  for (int i = QuarterSize*2; i < QuarterSize*3; ++i) {
-    if (!isUndefOrInRange(Mask[i], HalfSize, NumElems))
-      return false;
-    if (NumElems == 4)
-      continue;
-    // VSHUFPSY handling
-    int FstHalfIdx = i-HalfSize;
-    if (Mask[FstHalfIdx] < 0)
-      continue;
-    if (!isUndefOrEqual(Mask[i], Mask[FstHalfIdx]+HalfSize))
-      return false;
-  }
-  for (int i = QuarterSize*3; i < NumElems; ++i) {
-    if (!isUndefOrInRange(Mask[i], NumElems+HalfSize, NumElems*2))
-      return false;
-    int FstHalfIdx = i-HalfSize;
-    if (NumElems == 4)
-      continue;
-    // VSHUFPSY handling
-    if (Mask[FstHalfIdx] < 0)
-      continue;
-    if (!isUndefOrEqual(Mask[i], Mask[FstHalfIdx]+HalfSize))
-      return false;
-  }
-
-  return true;
-}
-
-/// isCommutedVSHUFP() - Returns true if the shuffle mask is exactly
-/// the reverse of what x86 shuffles want. x86 shuffles requires the lower
-/// half elements to come from vector 1 (which would equal the dest.) and
-/// the upper half to come from vector 2.
-static bool isCommutedVSHUFPY(ShuffleVectorSDNode *N, bool HasAVX) {
-  EVT VT = N->getValueType(0);
+                          bool HasAVX, bool Commuted = false) {
    int NumElems = VT.getVectorNumElements();
-  SmallVector<int, 8> Mask;
-  N->getMask(Mask);
  
    if (!HasAVX || VT.getSizeInBits() != 256)
      return false;
@@ -3354,41 +3290,27 @@ static bool isCommutedVSHUFPY(ShuffleVectorSDNode *N, bool HasAVX) {
    //
    //  DST  =>  Y3..Y2,  X3..X2,  Y1..Y0,  X1..X0
    //
-  int QuarterSize = NumElems/4;
-  int HalfSize = QuarterSize*2;
-  for (int i = 0; i < QuarterSize; ++i)
-    if (!isUndefOrInRange(Mask[i], NumElems, NumElems+HalfSize))
-      return false;
-  for (int i = QuarterSize; i < QuarterSize*2; ++i)
-    if (!isUndefOrInRange(Mask[i], 0, HalfSize))
-      return false;
-
-  // For VSHUFPSY, the mask of the second half must be the same as the first
-  // but with the appropriate offsets. This works in the same way as
-  // VPERMILPS works with masks.
-  for (int i = QuarterSize*2; i < QuarterSize*3; ++i) {
-    if (!isUndefOrInRange(Mask[i], NumElems+HalfSize, NumElems*2))
-      return false;
-    if (NumElems == 4)
-      continue;
-    // VSHUFPSY handling
-    int FstHalfIdx = i-HalfSize;
-    if (Mask[FstHalfIdx] < 0)
-      continue;
-    if (!isUndefOrEqual(Mask[i], Mask[FstHalfIdx]+HalfSize))
-      return false;
-  }
-  for (int i = QuarterSize*3; i < NumElems; ++i) {
-    if (!isUndefOrInRange(Mask[i], HalfSize, NumElems))
-      return false;
-    if (NumElems == 4)
-      continue;
-    // VSHUFPSY handling
-    int FstHalfIdx = i-HalfSize;
-    if (Mask[FstHalfIdx] < 0)
-      continue;
-    if (!isUndefOrEqual(Mask[i], Mask[FstHalfIdx]+HalfSize))
-      return false;
+  unsigned QuarterSize = NumElems/4;
+  unsigned HalfSize = QuarterSize*2;
+  for (unsigned l = 0; l != 2; ++l) {
+    unsigned LaneStart = l*HalfSize;
+    for (unsigned s = 0; s != 2; ++s) {
+      unsigned QuarterStart = s*QuarterSize;
+      unsigned Src = (Commuted) ? (1-s) : s;
+      unsigned SrcStart = Src*NumElems + LaneStart;
+      for (unsigned i = 0; i != QuarterSize; ++i) {
+        int Idx = Mask[i+QuarterStart+LaneStart];
+        if (!isUndefOrInRange(Idx, SrcStart, SrcStart+HalfSize))
+          return false;
+        // For VSHUFPSY, the mask of the second half must be the same as the first
+        // but with the appropriate offsets. This works in the same way as
+        // VPERMILPS works with masks.
+        if (NumElems == 4 || l == 0 || Mask[i+QuarterStart] < 0)
+          continue;
+        if (!isUndefOrEqual(Idx, Mask[i+QuarterStart]+HalfSize))
+          return false;
+      }
+    }
    }
  
    return true;
@@ -3423,8 +3345,8 @@ static unsigned getShuffleVSHUFPYImmediate(SDNode *N) {
  
  /// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
  /// the two vector operands have swapped position.
-static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, EVT VT) {
-  unsigned NumElems = VT.getVectorNumElements();
+static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask,
+                                     unsigned NumElems) {
    for (unsigned i = 0; i != NumElems; ++i) {
      int idx = Mask[i];
      if (idx < 0)
@@ -3438,9 +3360,11 @@ static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, EVT VT) {
  
  /// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
  /// specifies a shuffle of elements that is suitable for input to 128-bit
-/// SHUFPS and SHUFPD.
-static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) {
-  int NumElems = VT.getVectorNumElements();
+/// SHUFPS and SHUFPD. If Commuted is true, then it checks for sources to be
+/// reverse of what x86 shuffles want.
+static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT,
+                        bool Commuted = false) {
+  unsigned NumElems = VT.getVectorNumElements();
  
    if (VT.getSizeInBits() != 128)
      return false;
@@ -3448,12 +3372,14 @@ static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) {
    if (NumElems != 2 && NumElems != 4)
      return false;
  
-  int Half = NumElems / 2;
-  for (int i = 0; i < Half; ++i)
-    if (!isUndefOrInRange(Mask[i], 0, NumElems))
+  unsigned Half = NumElems / 2;
+  unsigned SrcStart = Commuted ? NumElems : 0;
+  for (unsigned i = 0; i != Half; ++i)
+    if (!isUndefOrInRange(Mask[i], SrcStart, SrcStart+NumElems))
        return false;
-  for (int i = Half; i < NumElems; ++i)
-    if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2))
+  SrcStart = Commuted ? 0 : NumElems;
+  for (unsigned i = Half; i != NumElems; ++i)
+    if (!isUndefOrInRange(Mask[i], SrcStart, SrcStart+NumElems))
        return false;
  
    return true;
@@ -3465,32 +3391,6 @@ bool X86::isSHUFPMask(ShuffleVectorSDNode *N) {
    return ::isSHUFPMask(M, N->getValueType(0));
  }
  
-/// isCommutedSHUFPMask - Returns true if the shuffle mask is exactly
-/// the reverse of what x86 shuffles want. x86 shuffles requires the lower
-/// half elements to come from vector 1 (which would equal the dest.) and
-/// the upper half to come from vector 2.
-static bool isCommutedSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) {
-  int NumElems = VT.getVectorNumElements();
-
-  if (NumElems != 2 && NumElems != 4)
-    return false;
-
-  int Half = NumElems / 2;
-  for (int i = 0; i < Half; ++i)
-    if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2))
-      return false;
-  for (int i = Half; i < NumElems; ++i)
-    if (!isUndefOrInRange(Mask[i], 0, NumElems))
-      return false;
-  return true;
-}
-
-static bool isCommutedSHUFP(ShuffleVectorSDNode *N) {
-  SmallVector<int, 8> M;
-  N->getMask(M);
-  return isCommutedSHUFPMask(M, N->getValueType(0));
-}
-
  /// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
  /// specifies a shuffle of elements that is suitable for input to MOVHLPS.
  bool X86::isMOVHLPSMask(ShuffleVectorSDNode *N) {
@@ -3572,7 +3472,7 @@ bool X86::isMOVLHPSMask(ShuffleVectorSDNode *N) {
  /// specifies a shuffle of elements that is suitable for input to UNPCKL.
  static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT,
                           bool HasAVX2, bool V2IsSplat = false) {
-  int NumElts = VT.getVectorNumElements();
+  unsigned NumElts = VT.getVectorNumElements();
  
    assert((VT.is128BitVector() || VT.is256BitVector()) &&
           "Unsupported vector type for unpckh");
@@ -3586,11 +3486,9 @@ static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT,
    unsigned NumLanes = VT.getSizeInBits()/128;
    unsigned NumLaneElts = NumElts/NumLanes;
  
-  unsigned Start = 0;
-  unsigned End = NumLaneElts;
-  for (unsigned s = 0; s < NumLanes; ++s) {
-    for (unsigned i = Start, j = s * NumLaneElts;
-         i != End;
+  for (unsigned l = 0; l != NumLanes; ++l) {
+    for (unsigned i = l*NumLaneElts, j = l*NumLaneElts;
+         i != (l+1)*NumLaneElts;
           i += 2, ++j) {
        int BitI  = Mask[i];
        int BitI1 = Mask[i+1];
@@ -3604,9 +3502,6 @@ static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT,
            return false;
        }
      }
-    // Process the next 128 bits.
-    Start += NumLaneElts;
-    End += NumLaneElts;
    }
  
    return true;
@@ -3622,7 +3517,7 @@ bool X86::isUNPCKLMask(ShuffleVectorSDNode *N, bool HasAVX2, bool V2IsSplat) {
  /// specifies a shuffle of elements that is suitable for input to UNPCKH.
  static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, EVT VT,
                           bool HasAVX2, bool V2IsSplat = false) {
-  int NumElts = VT.getVectorNumElements();
+  unsigned NumElts = VT.getVectorNumElements();
  
    assert((VT.is128BitVector() || VT.is256BitVector()) &&
           "Unsupported vector type for unpckh");
@@ -3636,11 +3531,9 @@ static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, EVT VT,
    unsigned NumLanes = VT.getSizeInBits()/128;
    unsigned NumLaneElts = NumElts/NumLanes;
  
-  unsigned Start = 0;
-  unsigned End = NumLaneElts;
    for (unsigned l = 0; l != NumLanes; ++l) {
-    for (unsigned i = Start, j = (l*NumLaneElts)+NumLaneElts/2;
-                             i != End; i += 2, ++j) {
+    for (unsigned i = l*NumLaneElts, j = (l*NumLaneElts)+NumLaneElts/2;
+         i != (l+1)*NumLaneElts; i += 2, ++j) {
        int BitI  = Mask[i];
        int BitI1 = Mask[i+1];
        if (!isUndefOrEqual(BitI, j))
@@ -3653,9 +3546,6 @@ static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, EVT VT,
            return false;
        }
      }
-    // Process the next 128 bits.
-    Start += NumLaneElts;
-    End += NumLaneElts;
    }
    return true;
  }
@@ -3669,26 +3559,32 @@ bool X86::isUNPCKHMask(ShuffleVectorSDNode *N, bool HasAVX2, bool V2IsSplat) {
  /// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
  /// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
  /// <0, 0, 1, 1>
-static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) {
-  int NumElems = VT.getVectorNumElements();
-  if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
+static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT,
+                                  bool HasAVX2) {
+  unsigned NumElts = VT.getVectorNumElements();
+
+  assert((VT.is128BitVector() || VT.is256BitVector()) &&
+         "Unsupported vector type for unpckh");
+
+  if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 &&
+      (!HasAVX2 || (NumElts != 16 && NumElts != 32)))
      return false;
  
    // For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern
    // FIXME: Need a better way to get rid of this, there's no latency difference
    // between UNPCKLPD and MOVDDUP, the later should always be checked first and
    // the former later. We should also remove the "_undef" special mask.
-  if (NumElems == 4 && VT.getSizeInBits() == 256)
+  if (NumElts == 4 && VT.getSizeInBits() == 256)
      return false;
  
    // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
    // independently on 128-bit lanes.
-  unsigned NumLanes = VT.getSizeInBits() / 128;
-  unsigned NumLaneElts = NumElems / NumLanes;
+  unsigned NumLanes = VT.getSizeInBits()/128;
+  unsigned NumLaneElts = NumElts/NumLanes;
  
-  for (unsigned s = 0; s < NumLanes; ++s) {
-    for (unsigned i = s * NumLaneElts, j = s * NumLaneElts;
-         i != NumLaneElts * (s + 1);
+  for (unsigned l = 0; l != NumLanes; ++l) {
+    for (unsigned i = l*NumLaneElts, j = l*NumLaneElts;
+         i != (l+1)*NumLaneElts;
           i += 2, ++j) {
        int BitI  = Mask[i];
        int BitI1 = Mask[i+1];
@@ -3703,35 +3599,49 @@ static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) {
    return true;
  }
  
-bool X86::isUNPCKL_v_undef_Mask(ShuffleVectorSDNode *N) {
+bool X86::isUNPCKL_v_undef_Mask(ShuffleVectorSDNode *N, bool HasAVX2) {
    SmallVector<int, 8> M;
    N->getMask(M);
-  return ::isUNPCKL_v_undef_Mask(M, N->getValueType(0));
+  return ::isUNPCKL_v_undef_Mask(M, N->getValueType(0), HasAVX2);
  }
  
  /// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
  /// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
  /// <2, 2, 3, 3>
-static bool isUNPCKH_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) {
-  int NumElems = VT.getVectorNumElements();
-  if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
+static bool isUNPCKH_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT,
+                                  bool HasAVX2) {
+  unsigned NumElts = VT.getVectorNumElements();
+
+  assert((VT.is128BitVector() || VT.is256BitVector()) &&
+         "Unsupported vector type for unpckh");
+
+  if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 &&
+      (!HasAVX2 || (NumElts != 16 && NumElts != 32)))
      return false;
  
-  for (int i = 0, j = NumElems / 2; i != NumElems; i += 2, ++j) {
-    int BitI  = Mask[i];
-    int BitI1 = Mask[i+1];
-    if (!isUndefOrEqual(BitI, j))
-      return false;
-    if (!isUndefOrEqual(BitI1, j))
-      return false;
+  // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
+  // independently on 128-bit lanes.
+  unsigned NumLanes = VT.getSizeInBits()/128;
+  unsigned NumLaneElts = NumElts/NumLanes;
+
+  for (unsigned l = 0; l != NumLanes; ++l) {
+    for (unsigned i = l*NumLaneElts, j = (l*NumLaneElts)+NumLaneElts/2;
+         i != (l+1)*NumLaneElts; i += 2, ++j) {
+      int BitI  = Mask[i];
+      int BitI1 = Mask[i+1];
+      if (!isUndefOrEqual(BitI, j))
+        return false;
+      if (!isUndefOrEqual(BitI1, j))
+        return false;
+    }
    }
    return true;
  }
  
-bool X86::isUNPCKH_v_undef_Mask(ShuffleVectorSDNode *N) {
+bool X86::isUNPCKH_v_undef_Mask(ShuffleVectorSDNode *N, bool HasAVX2) {
    SmallVector<int, 8> M;
    N->getMask(M);
-  return ::isUNPCKH_v_undef_Mask(M, N->getValueType(0));
+  return ::isUNPCKH_v_undef_Mask(M, N->getValueType(0), HasAVX2);
  }
  
  /// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
@@ -3797,8 +3707,7 @@ static bool isVPERM2X128Mask(const SmallVectorImpl<int> &Mask, EVT VT,
  
  /// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle
  /// the specified VECTOR_MASK mask with VPERM2F128/VPERM2I128 instructions.
-static unsigned getShuffleVPERM2X128Immediate(SDNode *N) {
-  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
    EVT VT = SVOp->getValueType(0);
  
    int HalfSize = VT.getVectorNumElements()/2;
@@ -3860,8 +3769,7 @@ static bool isVPERMILPMask(const SmallVectorImpl<int> &Mask, EVT VT,
  
  /// getShuffleVPERMILPImmediate - Return the appropriate immediate to shuffle
  /// the specified VECTOR_MASK mask with VPERMILPS/D* instructions.
-static unsigned getShuffleVPERMILPImmediate(SDNode *N) {
-  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+static unsigned getShuffleVPERMILPImmediate(ShuffleVectorSDNode *SVOp) {
    EVT VT = SVOp->getValueType(0);
  
    int NumElts = VT.getVectorNumElements();
@@ -3975,21 +3883,18 @@ bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N,
  /// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand
  /// specifies a shuffle of elements that is suitable for input to 256-bit
  /// version of MOVDDUP.
-static bool isMOVDDUPYMask(ShuffleVectorSDNode *N,
-                           const X86Subtarget *Subtarget) {
-  EVT VT = N->getValueType(0);
+static bool isMOVDDUPYMask(const SmallVectorImpl<int> &Mask, EVT VT,
+                           bool HasAVX) {
    int NumElts = VT.getVectorNumElements();
-  bool V2IsUndef = N->getOperand(1).getOpcode() == ISD::UNDEF;
  
-  if (!Subtarget->hasAVX() || VT.getSizeInBits() != 256 ||
-      !V2IsUndef || NumElts != 4)
+  if (!HasAVX || VT.getSizeInBits() != 256 || NumElts != 4)
      return false;
  
    for (int i = 0; i != NumElts/2; ++i)
-    if (!isUndefOrEqual(N->getMaskElt(i), 0))
+    if (!isUndefOrEqual(Mask[i], 0))
        return false;
    for (int i = NumElts/2; i != NumElts; ++i)
-    if (!isUndefOrEqual(N->getMaskElt(i), NumElts/2))
+    if (!isUndefOrEqual(Mask[i], NumElts/2))
        return false;
    return true;
  }
@@ -4104,14 +4009,13 @@ unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) {
  
  /// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle
  /// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction.
-unsigned X86::getShufflePALIGNRImmediate(SDNode *N) {
-  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
-  EVT VVT = N->getValueType(0);
-  unsigned EltSize = VVT.getVectorElementType().getSizeInBits() >> 3;
+static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) {
+  EVT VT = SVOp->getValueType(0);
+  unsigned EltSize = VT.getVectorElementType().getSizeInBits() >> 3;
    int Val = 0;
  
    unsigned i, e;
-  for (i = 0, e = VVT.getVectorNumElements(); i != e; ++i) {
+  for (i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
      Val = SVOp->getMaskElt(i);
      if (Val >= 0)
        break;
@@ -4574,17 +4478,11 @@ static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG,
        DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(),
                        ShuffleMask);
        break;
-    case X86ISD::PUNPCKH:
-      DecodePUNPCKHMask(NumElems, ShuffleMask);
-      break;
-    case X86ISD::UNPCKHP:
-      DecodeUNPCKHPMask(VT, ShuffleMask);
+    case X86ISD::UNPCKH:
+      DecodeUNPCKHMask(VT, ShuffleMask);
        break;
-    case X86ISD::PUNPCKL:
-      DecodePUNPCKLMask(VT, ShuffleMask);
-      break;
-    case X86ISD::UNPCKLP:
-      DecodeUNPCKLPMask(VT, ShuffleMask);
+    case X86ISD::UNPCKL:
+      DecodeUNPCKLMask(VT, ShuffleMask);
        break;
      case X86ISD::MOVHLPS:
        DecodeMOVHLPSMask(NumElems, ShuffleMask);
@@ -5250,8 +5148,10 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
                                             DAG);
        } else if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
          Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
-        assert(VT.getSizeInBits() == 128 && "Expected an SSE value type!");
-        EVT MiddleVT = MVT::v4i32;
+        unsigned NumBits = VT.getSizeInBits();
+        assert((NumBits == 128 || NumBits == 256) && 
+               "Expected an SSE or AVX value type!");
+        EVT MiddleVT = NumBits == 128 ? MVT::v4i32 : MVT::v8i32;
          Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MiddleVT, Item);
          Item = getShuffleVectorZeroOrUndef(Item, 0, true,
                                             Subtarget->hasXMMInt(), DAG);
@@ -6172,7 +6072,7 @@ LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
      // from X.
      if (NumHi == 3) {
        // Normalize it so the 3 elements come from V1.
-      CommuteVectorShuffleMask(PermMask, VT);
+      CommuteVectorShuffleMask(PermMask, 4);
        std::swap(V1, V2);
      }
  
@@ -6482,50 +6382,6 @@ SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasXMMInt) {
                                X86::getShuffleSHUFImmediate(SVOp), DAG);
  }
  
-static inline unsigned getUNPCKLOpcode(EVT VT, bool HasAVX2) {
-  switch(VT.getSimpleVT().SimpleTy) {
-  case MVT::v32i8:
-  case MVT::v16i8:
-  case MVT::v16i16:
-  case MVT::v8i16:
-  case MVT::v4i32:
-  case MVT::v2i64: return X86ISD::PUNPCKL;
-  case MVT::v8i32:
-  case MVT::v4i64:
-    if (HasAVX2)   return X86ISD::PUNPCKL;
-    // else use fp unit for int unpack.
-  case MVT::v8f32:
-  case MVT::v4f32:
-  case MVT::v4f64:
-  case MVT::v2f64: return X86ISD::UNPCKLP;
-  default:
-    llvm_unreachable("Unknown type for unpckl");
-  }
-  return 0;
-}
-
-static inline unsigned getUNPCKHOpcode(EVT VT, bool HasAVX2) {
-  switch(VT.getSimpleVT().SimpleTy) {
-  case MVT::v32i8:
-  case MVT::v16i8:
-  case MVT::v16i16:
-  case MVT::v8i16:
-  case MVT::v4i32:
-  case MVT::v2i64: return X86ISD::PUNPCKH;
-  case MVT::v4i64:
-  case MVT::v8i32:
-    if (HasAVX2)   return X86ISD::PUNPCKH;
-    // else use fp unit for int unpack.
-  case MVT::v8f32:
-  case MVT::v4f32:
-  case MVT::v4f64:
-  case MVT::v2f64: return X86ISD::UNPCKHP;
-  default:
-    llvm_unreachable("Unknown type for unpckh");
-  }
-  return 0;
-}
-
  static
  SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG,
                                 const TargetLowering &TLI,
@@ -6603,6 +6459,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
    bool V1IsSplat = false;
    bool V2IsSplat = false;
    bool HasXMMInt = Subtarget->hasXMMInt();
+  bool HasAVX    = Subtarget->hasAVX();
    bool HasAVX2   = Subtarget->hasAVX2();
    MachineFunction &MF = DAG.getMachineFunction();
    bool OptForSize = MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize);
@@ -6634,12 +6491,10 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
  
    // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and
    // unpckh_undef). Only use pshufd if speed is more important than size.
-  if (OptForSize && X86::isUNPCKL_v_undef_Mask(SVOp))
-    return getTargetShuffleNode(getUNPCKLOpcode(VT, HasAVX2), dl, VT, V1, V1,
-                                DAG);
-  if (OptForSize && X86::isUNPCKH_v_undef_Mask(SVOp))
-    return getTargetShuffleNode(getUNPCKHOpcode(VT, HasAVX2), dl, VT, V1, V1,
-                                DAG);
+  if (OptForSize && X86::isUNPCKL_v_undef_Mask(SVOp, HasAVX2))
+    return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
+  if (OptForSize && X86::isUNPCKH_v_undef_Mask(SVOp, HasAVX2))
+    return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
  
    if (X86::isMOVDDUPMask(SVOp) && Subtarget->hasSSE3orAVX() &&
        V2IsUndef && RelaxedMayFoldVectorLoad(V1))
@@ -6651,8 +6506,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
    // Use to match splats
    if (HasXMMInt && X86::isUNPCKHMask(SVOp, HasAVX2) && V2IsUndef &&
        (VT == MVT::v2f64 || VT == MVT::v2i64))
-    return getTargetShuffleNode(getUNPCKHOpcode(VT, HasAVX2), dl, VT, V1, V1,
-                                DAG);
+    return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
  
    if (X86::isPSHUFDMask(SVOp)) {
      // The actual implementation will match the mask in the if above and then
@@ -6738,7 +6592,10 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
      Commuted = true;
    }
  
-  if (isCommutedMOVL(SVOp, V2IsSplat, V2IsUndef)) {
+  SmallVector<int, 32> M;
+  SVOp->getMask(M);
+
+  if (isCommutedMOVLMask(M, VT, V2IsSplat, V2IsUndef)) {
      // Shuffling low element of v1 into undef, just return v1.
      if (V2IsUndef)
        return V1;
@@ -6748,13 +6605,11 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
      return getMOVL(DAG, dl, VT, V2, V1);
    }
  
-  if (X86::isUNPCKLMask(SVOp, HasAVX2))
-    return getTargetShuffleNode(getUNPCKLOpcode(VT, HasAVX2), dl, VT, V1, V2,
-                                DAG);
+  if (isUNPCKLMask(M, VT, HasAVX2))
+    return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
  
-  if (X86::isUNPCKHMask(SVOp, HasAVX2))
-    return getTargetShuffleNode(getUNPCKHOpcode(VT, HasAVX2), dl, VT, V1, V2,
-                                DAG);
+  if (isUNPCKHMask(M, VT, HasAVX2))
+    return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
  
    if (V2IsSplat) {
      // Normalize mask so all entries that point to V2 points to its first
@@ -6778,36 +6633,30 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
      ShuffleVectorSDNode *NewSVOp = cast<ShuffleVectorSDNode>(NewOp);
  
      if (X86::isUNPCKLMask(NewSVOp, HasAVX2))
-      return getTargetShuffleNode(getUNPCKLOpcode(VT, HasAVX2), dl, VT, V2, V1,
-                                  DAG);
+      return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V2, V1, DAG);
  
      if (X86::isUNPCKHMask(NewSVOp, HasAVX2))
-      return getTargetShuffleNode(getUNPCKHOpcode(VT, HasAVX2), dl, VT, V2, V1,
-                                  DAG);
+      return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V2, V1, DAG);
    }
  
    // Normalize the node to match x86 shuffle ops if needed
-  if (!V2IsUndef && (isCommutedSHUFP(SVOp) ||
-                     isCommutedVSHUFPY(SVOp, Subtarget->hasAVX())))
+  if (!V2IsUndef && (isSHUFPMask(M, VT, /* Commuted */ true) ||
+                     isVSHUFPYMask(M, VT, HasAVX, /* Commuted */ true)))
      return CommuteVectorShuffle(SVOp, DAG);
  
    // The checks below are all present in isShuffleMaskLegal, but they are
    // inlined here right now to enable us to directly emit target specific
    // nodes, and remove one by one until they don't return Op anymore.
-  SmallVector<int, 16> M;
-  SVOp->getMask(M);
  
    if (isPALIGNRMask(M, VT, Subtarget->hasSSSE3orAVX()))
      return getTargetShuffleNode(X86ISD::PALIGN, dl, VT, V1, V2,
-                                X86::getShufflePALIGNRImmediate(SVOp),
+                                getShufflePALIGNRImmediate(SVOp),
                                  DAG);
  
    if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) &&
        SVOp->getSplatIndex() == 0 && V2IsUndef) {
-    if (VT == MVT::v2f64)
-      return getTargetShuffleNode(X86ISD::UNPCKLP, dl, VT, V1, V1, DAG);
-    if (VT == MVT::v2i64)
-      return getTargetShuffleNode(X86ISD::PUNPCKL, dl, VT, V1, V1, DAG);
+    if (VT == MVT::v2f64 || VT == MVT::v2i64)
+      return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
    }
  
    if (isPSHUFHWMask(M, VT))
@@ -6824,12 +6673,10 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
      return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V1, V2,
                                  X86::getShuffleSHUFImmediate(SVOp), DAG);
  
-  if (X86::isUNPCKL_v_undef_Mask(SVOp))
-    return getTargetShuffleNode(getUNPCKLOpcode(VT, HasAVX2), dl, VT, V1, V1,
-                                DAG);
-  if (X86::isUNPCKH_v_undef_Mask(SVOp))
-    return getTargetShuffleNode(getUNPCKHOpcode(VT, HasAVX2), dl, VT, V1, V1,
-                                DAG);
+  if (isUNPCKL_v_undef_Mask(M, VT, HasAVX2))
+    return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
+  if (isUNPCKH_v_undef_Mask(M, VT, HasAVX2))
+    return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
  
    //===--------------------------------------------------------------------===//
    // Generate target specific nodes for 128 or 256-bit shuffles only
@@ -6837,21 +6684,21 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
    //
  
    // Handle VMOVDDUPY permutations
-  if (isMOVDDUPYMask(SVOp, Subtarget))
+  if (V2IsUndef && isMOVDDUPYMask(M, VT, HasAVX))
      return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG);
  
    // Handle VPERMILPS/D* permutations
-  if (isVPERMILPMask(M, VT, Subtarget->hasAVX()))
+  if (isVPERMILPMask(M, VT, HasAVX))
      return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1,
                                  getShuffleVPERMILPImmediate(SVOp), DAG);
  
    // Handle VPERM2F128/VPERM2I128 permutations
-  if (isVPERM2X128Mask(M, VT, Subtarget->hasAVX()))
+  if (isVPERM2X128Mask(M, VT, HasAVX))
      return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1,
                                  V2, getShuffleVPERM2X128Immediate(SVOp), DAG);
  
    // Handle VSHUFPS/DY permutations
-  if (isVSHUFPYMask(M, VT, Subtarget->hasAVX()))
+  if (isVSHUFPYMask(M, VT, HasAVX))
      return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V1, V2,
                                  getShuffleVSHUFPYImmediate(SVOp), DAG);
  
@@ -7762,7 +7609,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
    LLVMContext *Context = DAG.getContext();
  
    // Build some magic constants.
-  std::vector<Constant*> CV0;
+  SmallVector<Constant*,4> CV0;
    CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x45300000)));
    CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x43300000)));
    CV0.push_back(ConstantInt::get(*Context, APInt(32, 0)));
@@ -7770,7 +7617,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
    Constant *C0 = ConstantVector::get(CV0);
    SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16);
  
-  std::vector<Constant*> CV1;
+  SmallVector<Constant*,2> CV1;
    CV1.push_back(
      ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL))));
    CV1.push_back(
@@ -8050,17 +7897,13 @@ SDValue X86TargetLowering::LowerFABS(SDValue Op,
    EVT EltVT = VT;
    if (VT.isVector())
      EltVT = VT.getVectorElementType();
-  std::vector<Constant*> CV;
+  SmallVector<Constant*,4> CV;
    if (EltVT == MVT::f64) {
      Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63))));
-    CV.push_back(C);
-    CV.push_back(C);
+    CV.assign(2, C);
    } else {
      Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31))));
-    CV.push_back(C);
-    CV.push_back(C);
-    CV.push_back(C);
-    CV.push_back(C);
+    CV.assign(4, C);
    }
    Constant *C = ConstantVector::get(CV);
    SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
@@ -8075,19 +7918,18 @@ SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const {
    DebugLoc dl = Op.getDebugLoc();
    EVT VT = Op.getValueType();
    EVT EltVT = VT;
-  if (VT.isVector())
+  unsigned NumElts = VT == MVT::f64 ? 2 : 4;
+  if (VT.isVector()) {
      EltVT = VT.getVectorElementType();
-  std::vector<Constant*> CV;
+    NumElts = VT.getVectorNumElements();
+  }
+  SmallVector<Constant*,8> CV;
    if (EltVT == MVT::f64) {
      Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63)));
-    CV.push_back(C);
-    CV.push_back(C);
+    CV.assign(NumElts, C);
    } else {
      Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31)));
-    CV.push_back(C);
-    CV.push_back(C);
-    CV.push_back(C);
-    CV.push_back(C);
+    CV.assign(NumElts, C);
    }
    Constant *C = ConstantVector::get(CV);
    SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
@@ -8095,11 +7937,12 @@ SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const {
                               MachinePointerInfo::getConstantPool(),
                               false, false, false, 16);
    if (VT.isVector()) {
+    MVT XORVT = VT.getSizeInBits() == 128 ? MVT::v2i64 : MVT::v4i64;
      return DAG.getNode(ISD::BITCAST, dl, VT,
-                       DAG.getNode(ISD::XOR, dl, MVT::v2i64,
-                    DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
+                       DAG.getNode(ISD::XOR, dl, XORVT,
+                    DAG.getNode(ISD::BITCAST, dl, XORVT,
                                  Op.getOperand(0)),
-                    DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Mask)));
+                    DAG.getNode(ISD::BITCAST, dl, XORVT, Mask)));
    } else {
      return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask);
    }
@@ -8128,7 +7971,7 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
    // type, and that won't be f80 since that is not custom lowered.
  
    // First get the sign bit of second operand.
-  std::vector<Constant*> CV;
+  SmallVector<Constant*,4> CV;
    if (SrcVT == MVT::f64) {
      CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63))));
      CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0))));
@@ -10325,49 +10168,55 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
      return DAG.getNode(ISD::MUL, dl, VT, Op, R);
    }
    if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) {
+    assert((Subtarget->hasSSE2() || Subtarget->hasAVX()) &&
+            "Need SSE2 for pslli/pcmpeq.");
+
      // a = a << 5;
      Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                       DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32),
                       Op.getOperand(1), DAG.getConstant(5, MVT::i32));
  
-    ConstantInt *CM1 = ConstantInt::get(*Context, APInt(8, 15));
-    ConstantInt *CM2 = ConstantInt::get(*Context, APInt(8, 63));
+    // Turn 'a' into a mask suitable for VSELECT
+    SDValue VSelM = DAG.getConstant(0x80, VT);
+    SDValue OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
+    OpVSel = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+                        DAG.getConstant(Intrinsic::x86_sse2_pcmpeq_b, MVT::i32),
+                        OpVSel, VSelM);
  
-    std::vector<Constant*> CVM1(16, CM1);
-    std::vector<Constant*> CVM2(16, CM2);
-    Constant *C = ConstantVector::get(CVM1);
-    SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
-    SDValue M = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
-                            MachinePointerInfo::getConstantPool(),
-                            false, false, false, 16);
+    SDValue CM1 = DAG.getConstant(0x0f, VT);
+    SDValue CM2 = DAG.getConstant(0x3f, VT);
  
-    // r = pblendv(r, psllw(r & (char16)15, 4), a);
-    M = DAG.getNode(ISD::AND, dl, VT, R, M);
+    // r = VSELECT(r, psllw(r & (char16)15, 4), a);
+    SDValue M = DAG.getNode(ISD::AND, dl, VT, R, CM1);
      M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                      DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M,
                      DAG.getConstant(4, MVT::i32));
-    R = DAG.getNode(ISD::VSELECT, dl, VT, Op, R, M);
+    R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);
+
      // a += a
      Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
+    OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
+    OpVSel = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+                        DAG.getConstant(Intrinsic::x86_sse2_pcmpeq_b, MVT::i32),
+                        OpVSel, VSelM);
  
-    C = ConstantVector::get(CVM2);
-    CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
-    M = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
-                    MachinePointerInfo::getConstantPool(),
-                    false, false, false, 16);
-
-    // r = pblendv(r, psllw(r & (char16)63, 2), a);
-    M = DAG.getNode(ISD::AND, dl, VT, R, M);
+    // r = VSELECT(r, psllw(r & (char16)63, 2), a);
+    M = DAG.getNode(ISD::AND, dl, VT, R, CM2);
      M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                      DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M,
                      DAG.getConstant(2, MVT::i32));
-    R = DAG.getNode(ISD::VSELECT, dl, VT, Op, R, M);
+    R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);
+
      // a += a
      Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
-
-    // return pblendv(r, r+r, a);
-    R = DAG.getNode(ISD::VSELECT, dl, VT, Op,
-                    R, DAG.getNode(ISD::ADD, dl, VT, R, R));
+    OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
+    OpVSel = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+                        DAG.getConstant(Intrinsic::x86_sse2_pcmpeq_b, MVT::i32),
+                        OpVSel, VSelM);
+
+    // return VSELECT(r, r+r, a);
+    R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel,
+                    DAG.getNode(ISD::ADD, dl, VT, R, R), R);
      return R;
    }
  
@@ -11068,6 +10917,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
    case X86ISD::ANDNP:              return "X86ISD::ANDNP";
    case X86ISD::PSIGN:              return "X86ISD::PSIGN";
    case X86ISD::BLENDV:             return "X86ISD::BLENDV";
+  case X86ISD::HADD:               return "X86ISD::HADD";
+  case X86ISD::HSUB:               return "X86ISD::HSUB";
    case X86ISD::FHADD:              return "X86ISD::FHADD";
    case X86ISD::FHSUB:              return "X86ISD::FHSUB";
    case X86ISD::FMAX:               return "X86ISD::FMAX";
@@ -11130,7 +10981,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
    case X86ISD::MOVLHPS:            return "X86ISD::MOVLHPS";
    case X86ISD::MOVLHPD:            return "X86ISD::MOVLHPD";
    case X86ISD::MOVHLPS:            return "X86ISD::MOVHLPS";
-  case X86ISD::MOVHLPD:            return "X86ISD::MOVHLPD";
    case X86ISD::MOVLPS:             return "X86ISD::MOVLPS";
    case X86ISD::MOVLPD:             return "X86ISD::MOVLPD";
    case X86ISD::MOVDDUP:            return "X86ISD::MOVDDUP";
@@ -11140,10 +10990,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
    case X86ISD::MOVSLDUP_LD:        return "X86ISD::MOVSLDUP_LD";
    case X86ISD::MOVSD:              return "X86ISD::MOVSD";
    case X86ISD::MOVSS:              return "X86ISD::MOVSS";
-  case X86ISD::UNPCKLP:            return "X86ISD::UNPCKLP";
-  case X86ISD::UNPCKHP:            return "X86ISD::UNPCKHP";
-  case X86ISD::PUNPCKL:            return "X86ISD::PUNPCKL";
-  case X86ISD::PUNPCKH:            return "X86ISD::PUNPCKH";
+  case X86ISD::UNPCKL:             return "X86ISD::UNPCKL";
+  case X86ISD::UNPCKH:             return "X86ISD::UNPCKH";
    case X86ISD::VBROADCAST:         return "X86ISD::VBROADCAST";
    case X86ISD::VPERMILP:           return "X86ISD::VPERMILP";
    case X86ISD::VPERM2X128:         return "X86ISD::VPERM2X128";
@@ -11254,7 +11102,7 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
                                        EVT VT) const {
    // Very little shuffling can be done for 64-bit vectors right now.
    if (VT.getSizeInBits() == 64)
-    return isPALIGNRMask(M, VT, Subtarget->hasSSSE3orAVX());
+    return false;
  
    // FIXME: pshufb, blends, shifts.
    return (VT.getVectorNumElements() == 2 ||
@@ -11267,8 +11115,8 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
            isPALIGNRMask(M, VT, Subtarget->hasSSSE3orAVX()) ||
            isUNPCKLMask(M, VT, Subtarget->hasAVX2()) ||
            isUNPCKHMask(M, VT, Subtarget->hasAVX2()) ||
-          isUNPCKL_v_undef_Mask(M, VT) ||
-          isUNPCKH_v_undef_Mask(M, VT));
+          isUNPCKL_v_undef_Mask(M, VT, Subtarget->hasAVX2()) ||
+          isUNPCKH_v_undef_Mask(M, VT, Subtarget->hasAVX2()));
  }
  
  bool
@@ -11282,7 +11130,7 @@ X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
      return (isMOVLMask(Mask, VT)  ||
              isCommutedMOVLMask(Mask, VT, true) ||
              isSHUFPMask(Mask, VT) ||
-            isCommutedSHUFPMask(Mask, VT));
+            isSHUFPMask(Mask, VT, /* Commuted */ true));
    }
    return false;
  }
@@ -14321,7 +14169,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
  /// set to A, RHS to B, and the routine returns 'true'.
  /// Note that the binary operation should have the property that if one of the
  /// operands is UNDEF then the result is UNDEF.
-static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool isCommutative) {
+static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
    // Look for the following pattern: if
    //   A = < float a0, float a1, float a2, float a3 >
    //   B = < float b0, float b1, float b2, float b3 >
@@ -14399,34 +14247,28 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool isCommutative) {
    // If A and B occur in reverse order in RHS, then "swap" them (which means
    // rewriting the mask).
    if (A != C)
-    for (unsigned i = 0; i != NumElts; ++i) {
-      unsigned Idx = RMask[i];
-      if (Idx < NumElts)
-        RMask[i] += NumElts;
-      else if (Idx < 2*NumElts)
-        RMask[i] -= NumElts;
-    }
+    CommuteVectorShuffleMask(RMask, NumElts);
  
    // At this point LHS and RHS are equivalent to
    //   LHS = VECTOR_SHUFFLE A, B, LMask
    //   RHS = VECTOR_SHUFFLE A, B, RMask
    // Check that the masks correspond to performing a horizontal operation.
    for (unsigned i = 0; i != NumElts; ++i) {
-    unsigned LIdx = LMask[i], RIdx = RMask[i];
+    int LIdx = LMask[i], RIdx = RMask[i];
  
      // Ignore any UNDEF components.
-    if (LIdx >= 2*NumElts || RIdx >= 2*NumElts ||
-        (!A.getNode() && (LIdx < NumElts || RIdx < NumElts)) ||
-        (!B.getNode() && (LIdx >= NumElts || RIdx >= NumElts)))
+    if (LIdx < 0 || RIdx < 0 ||
+        (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
+        (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
        continue;
  
      // Check that successive elements are being operated on.  If not, this is
      // not a horizontal operation.
      unsigned Src = (i/HalfLaneElts) % 2; // each lane is split between srcs
      unsigned LaneStart = (i/NumLaneElts) * NumLaneElts;
-    unsigned Index = 2*(i%HalfLaneElts) + NumElts*Src + LaneStart;
+    int Index = 2*(i%HalfLaneElts) + NumElts*Src + LaneStart;
      if (!(LIdx == Index && RIdx == Index + 1) &&
-        !(isCommutative && LIdx == Index + 1 && RIdx == Index))
+        !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
        return false;
    }
  
@@ -14739,10 +14581,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
    case X86ISD::SHUFPS:      // Handle all target specific shuffles
    case X86ISD::SHUFPD:
    case X86ISD::PALIGN:
-  case X86ISD::PUNPCKH:
-  case X86ISD::UNPCKHP:
-  case X86ISD::PUNPCKL:
-  case X86ISD::UNPCKLP:
+  case X86ISD::UNPCKH:
+  case X86ISD::UNPCKL:
    case X86ISD::MOVHLPS:
    case X86ISD::MOVLHPS:
    case X86ISD::PSHUFD:
@@ -14859,11 +14699,41 @@ bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
  //                           X86 Inline Assembly Support
  //===----------------------------------------------------------------------===//
  
+// Helper to match a string separated by whitespace.
+static bool END_WITH_NULL matchAsm(StringRef s, ...) {
+  va_list ap;
+  va_start(ap, s);
+  s = s.substr(s.find_first_not_of(" \t")); // Skip leading whitespace.
+
+  while (const char *p = va_arg(ap, const char *)) {
+    StringRef piece(p);
+    if (!s.startswith(piece)) { // Check if the piece matches.
+      va_end(ap);
+      return false;
+    }
+
+    s = s.substr(piece.size());
+    StringRef::size_type i = s.find_first_not_of(" \t");
+    if (i == 0) { // We matched a prefix.
+      va_end(ap);
+      return false;
+    }
+    s = s.substr(i);
+  }
+
+  va_end(ap);
+  return s.empty();
+}
+
  bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
    InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
  
    std::string AsmStr = IA->getAsmString();
  
+  IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
+  if (!Ty || Ty->getBitWidth() % 16 != 0)
+    return false;
+
    // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
    SmallVector<StringRef, 4> AsmPieces;
    SplitString(AsmStr, AsmPieces, ";\n");
@@ -14871,35 +14741,27 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
    switch (AsmPieces.size()) {
    default: return false;
    case 1:
-    AsmStr = AsmPieces[0];
-    AsmPieces.clear();
-    SplitString(AsmStr, AsmPieces, " \t");  // Split with whitespace.
-
      // FIXME: this should verify that we are targeting a 486 or better.  If not,
-    // we will turn this bswap into something that will be lowered to logical ops
-    // instead of emitting the bswap asm.  For now, we don't support 486 or lower
-    // so don't worry about this.
+    // we will turn this bswap into something that will be lowered to logical
+    // ops instead of emitting the bswap asm.  For now, we don't support 486 or
+    // lower so don't worry about this.
      // bswap $0
-    if (AsmPieces.size() == 2 &&
-        (AsmPieces[0] == "bswap" ||
-         AsmPieces[0] == "bswapq" ||
-         AsmPieces[0] == "bswapl") &&
-        (AsmPieces[1] == "$0" ||
-         AsmPieces[1] == "${0:q}")) {
+    if (matchAsm(AsmPieces[0], "bswap", "$0", NULL) ||
+        matchAsm(AsmPieces[0], "bswapl", "$0", NULL) ||
+        matchAsm(AsmPieces[0], "bswapq", "$0", NULL) ||
+        matchAsm(AsmPieces[0], "bswap", "${0:q}", NULL) ||
+        matchAsm(AsmPieces[0], "bswapl", "${0:q}", NULL) ||
+        matchAsm(AsmPieces[0], "bswapq", "${0:q}", NULL)) {
        // No need to check constraints, nothing other than the equivalent of
        // "=r,0" would be valid here.
-      IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
-      if (!Ty || Ty->getBitWidth() % 16 != 0)
-        return false;
        return IntrinsicLowering::LowerToByteSwap(CI);
      }
+
      // rorw $$8, ${0:w}  -->  llvm.bswap.i16
      if (CI->getType()->isIntegerTy(16) &&
-        AsmPieces.size() == 3 &&
-        (AsmPieces[0] == "rorw" || AsmPieces[0] == "rolw") &&
-        AsmPieces[1] == "$$8," &&
-        AsmPieces[2] == "${0:w}" &&
-        IA->getConstraintString().compare(0, 5, "=r,0,") == 0) {
+        IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
+        (matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}", NULL) ||
+         matchAsm(AsmPieces[0], "rolw", "$$8,", "${0:w}", NULL))) {
        AsmPieces.clear();
        const std::string &ConstraintsStr = IA->getConstraintString();
        SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
@@ -14908,46 +14770,26 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
            AsmPieces[0] == "~{cc}" &&
            AsmPieces[1] == "~{dirflag}" &&
            AsmPieces[2] == "~{flags}" &&
-          AsmPieces[3] == "~{fpsr}") {
-        IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
-        if (!Ty || Ty->getBitWidth() % 16 != 0)
-          return false;
-        return IntrinsicLowering::LowerToByteSwap(CI);
-      }
+          AsmPieces[3] == "~{fpsr}")
+      return IntrinsicLowering::LowerToByteSwap(CI);
      }
      break;
    case 3:
      if (CI->getType()->isIntegerTy(32) &&
-        IA->getConstraintString().compare(0, 5, "=r,0,") == 0) {
-      SmallVector<StringRef, 4> Words;
-      SplitString(AsmPieces[0], Words, " \t,");
-      if (Words.size() == 3 && Words[0] == "rorw" && Words[1] == "$$8" &&
-          Words[2] == "${0:w}") {
-        Words.clear();
-        SplitString(AsmPieces[1], Words, " \t,");
-        if (Words.size() == 3 && Words[0] == "rorl" && Words[1] == "$$16" &&
-            Words[2] == "$0") {
-          Words.clear();
-          SplitString(AsmPieces[2], Words, " \t,");
-          if (Words.size() == 3 && Words[0] == "rorw" && Words[1] == "$$8" &&
-              Words[2] == "${0:w}") {
-            AsmPieces.clear();
-            const std::string &ConstraintsStr = IA->getConstraintString();
-            SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
-            std::sort(AsmPieces.begin(), AsmPieces.end());
-            if (AsmPieces.size() == 4 &&
-                AsmPieces[0] == "~{cc}" &&
-                AsmPieces[1] == "~{dirflag}" &&
-                AsmPieces[2] == "~{flags}" &&
-                AsmPieces[3] == "~{fpsr}") {
-              IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
-              if (!Ty || Ty->getBitWidth() % 16 != 0)
-                return false;
-              return IntrinsicLowering::LowerToByteSwap(CI);
-            }
-          }
-        }
-      }
+        IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
+        matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}", NULL) &&
+        matchAsm(AsmPieces[1], "rorl", "$$16,", "$0", NULL) &&
+        matchAsm(AsmPieces[2], "rorw", "$$8,", "${0:w}", NULL)) {
+      AsmPieces.clear();
+      const std::string &ConstraintsStr = IA->getConstraintString();
+      SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
+      std::sort(AsmPieces.begin(), AsmPieces.end());
+      if (AsmPieces.size() == 4 &&
+          AsmPieces[0] == "~{cc}" &&
+          AsmPieces[1] == "~{dirflag}" &&
+          AsmPieces[2] == "~{flags}" &&
+          AsmPieces[3] == "~{fpsr}")
+        return IntrinsicLowering::LowerToByteSwap(CI);
      }
  
      if (CI->getType()->isIntegerTy(64)) {
@@ -14956,23 +14798,10 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
            Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
            Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
          // bswap %eax / bswap %edx / xchgl %eax, %edx  -> llvm.bswap.i64
-        SmallVector<StringRef, 4> Words;
-        SplitString(AsmPieces[0], Words, " \t");
-        if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%eax") {
-          Words.clear();
-          SplitString(AsmPieces[1], Words, " \t");
-          if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%edx") {
-            Words.clear();
-            SplitString(AsmPieces[2], Words, " \t,");
-            if (Words.size() == 3 && Words[0] == "xchgl" && Words[1] == "%eax" &&
-                Words[2] == "%edx") {
-              IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
-              if (!Ty || Ty->getBitWidth() % 16 != 0)
-                return false;
-              return IntrinsicLowering::LowerToByteSwap(CI);
-            }
-          }
-        }
+        if (matchAsm(AsmPieces[0], "bswap", "%eax", NULL) &&
+            matchAsm(AsmPieces[1], "bswap", "%edx", NULL) &&
+            matchAsm(AsmPieces[2], "xchgl", "%eax,", "%edx", NULL))
+          return IntrinsicLowering::LowerToByteSwap(CI);
        }
      }
      break;