Explicitly update the number of nodes to be traversed

[oota-llvm.git] / lib / Target / X86 / X86ISelLowering.cpp
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index d1ded33384973332483329d33cadd9d6f2e047d6..340524ca2091b93ba4717a80bb582b1d5896fdea 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -49,6 +49,7 @@
  #include "llvm/Support/MathExtras.h"
  #include "llvm/Target/TargetOptions.h"
  #include <bitset>
+#include <cctype>
  using namespace llvm;
  
  STATISTIC(NumTailCalls, "Number of tail calls");
@@ -62,41 +63,33 @@ static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
  /// simple subregister reference.  Idx is an index in the 128 bits we
  /// want.  It need not be aligned to a 128-bit bounday.  That makes
  /// lowering EXTRACT_VECTOR_ELT operations easier.
-static SDValue Extract128BitVector(SDValue Vec,
-                                   SDValue Idx,
-                                   SelectionDAG &DAG,
-                                   DebugLoc dl) {
+static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
+                                   SelectionDAG &DAG, DebugLoc dl) {
    EVT VT = Vec.getValueType();
-  assert(VT.getSizeInBits() == 256 && "Unexpected vector size!");
+  assert(VT.is256BitVector() && "Unexpected vector size!");
    EVT ElVT = VT.getVectorElementType();
-  int Factor = VT.getSizeInBits()/128;
+  unsigned Factor = VT.getSizeInBits()/128;
    EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
                                    VT.getVectorNumElements()/Factor);
  
    // Extract from UNDEF is UNDEF.
    if (Vec.getOpcode() == ISD::UNDEF)
-    return DAG.getNode(ISD::UNDEF, dl, ResultVT);
-
-  if (isa<ConstantSDNode>(Idx)) {
-    unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
-
-    // Extract the relevant 128 bits.  Generate an EXTRACT_SUBVECTOR
-    // we can match to VEXTRACTF128.
-    unsigned ElemsPerChunk = 128 / ElVT.getSizeInBits();
+    return DAG.getUNDEF(ResultVT);
  
-    // This is the index of the first element of the 128-bit chunk
-    // we want.
-    unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / 128)
-                                 * ElemsPerChunk);
+  // Extract the relevant 128 bits.  Generate an EXTRACT_SUBVECTOR
+  // we can match to VEXTRACTF128.
+  unsigned ElemsPerChunk = 128 / ElVT.getSizeInBits();
  
-    SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32);
-    SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec,
-                                 VecIdx);
+  // This is the index of the first element of the 128-bit chunk
+  // we want.
+  unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / 128)
+                               * ElemsPerChunk);
  
-    return Result;
-  }
+  SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32);
+  SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec,
+                               VecIdx);
  
-  return SDValue();
+  return Result;
  }
  
  /// Generate a DAG to put 128-bits into a vector > 128 bits.  This
@@ -104,34 +97,41 @@ static SDValue Extract128BitVector(SDValue Vec,
  /// simple superregister reference.  Idx is an index in the 128 bits
  /// we want.  It need not be aligned to a 128-bit bounday.  That makes
  /// lowering INSERT_VECTOR_ELT operations easier.
-static SDValue Insert128BitVector(SDValue Result,
-                                  SDValue Vec,
-                                  SDValue Idx,
-                                  SelectionDAG &DAG,
+static SDValue Insert128BitVector(SDValue Result, SDValue Vec,
+                                  unsigned IdxVal, SelectionDAG &DAG,
                                    DebugLoc dl) {
-  if (isa<ConstantSDNode>(Idx)) {
-    EVT VT = Vec.getValueType();
-    assert(VT.getSizeInBits() == 128 && "Unexpected vector size!");
+  // Inserting UNDEF is Result
+  if (Vec.getOpcode() == ISD::UNDEF)
+    return Result;
  
-    EVT ElVT = VT.getVectorElementType();
-    unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
-    EVT ResultVT = Result.getValueType();
+  EVT VT = Vec.getValueType();
+  assert(VT.is128BitVector() && "Unexpected vector size!");
  
-    // Insert the relevant 128 bits.
-    unsigned ElemsPerChunk = 128/ElVT.getSizeInBits();
+  EVT ElVT = VT.getVectorElementType();
+  EVT ResultVT = Result.getValueType();
  
-    // This is the index of the first element of the 128-bit chunk
-    // we want.
-    unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/128)
-                                 * ElemsPerChunk);
+  // Insert the relevant 128 bits.
+  unsigned ElemsPerChunk = 128/ElVT.getSizeInBits();
  
-    SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32);
-    Result = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec,
-                         VecIdx);
-    return Result;
-  }
+  // This is the index of the first element of the 128-bit chunk
+  // we want.
+  unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/128)
+                               * ElemsPerChunk);
  
-  return SDValue();
+  SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32);
+  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec,
+                     VecIdx);
+}
+
+/// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128
+/// instructions. This is used because creating CONCAT_VECTOR nodes of
+/// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
+/// large BUILD_VECTORS.
+static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
+                                   unsigned NumElems, SelectionDAG &DAG,
+                                   DebugLoc dl) {
+  SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
+  return Insert128BitVector(V, V2, NumElems/2, DAG, dl);
  }
  
  static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
@@ -140,10 +140,12 @@ static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
  
    if (Subtarget->isTargetEnvMacho()) {
      if (is64Bit)
-      return new X8664_MachoTargetObjectFile();
+      return new X86_64MachoTargetObjectFile();
      return new TargetLoweringObjectFileMachO();
    }
  
+  if (Subtarget->isTargetLinux())
+    return new X86LinuxTargetObjectFile();
    if (Subtarget->isTargetELF())
      return new TargetLoweringObjectFileELF();
    if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho())
@@ -162,7 +164,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
    TD = getTargetData();
  
    // Set up the TargetLowering object.
-  static MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };
+  static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };
  
    // X86 is weird, it always uses i8 for shift amounts and setcc results.
    setBooleanContents(ZeroOrOneBooleanContent);
@@ -171,11 +173,11 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
  
    // For 64-bit since we have so many registers use the ILP scheduler, for
    // 32-bit code use the register pressure specific scheduling.
-  // For 32 bit Atom, use Hybrid (register pressure + latency) scheduling.
-  if (Subtarget->is64Bit())
+  // For Atom, always use ILP scheduling.
+  if (Subtarget->isAtom())
+    setSchedulingPreference(Sched::ILP);
+  else if (Subtarget->is64Bit())
      setSchedulingPreference(Sched::ILP);
-  else if (Subtarget->isAtom()) 
-    setSchedulingPreference(Sched::Hybrid);
    else
      setSchedulingPreference(Sched::RegPressure);
    setStackPointerRegisterToSaveRestore(X86StackPtr);
@@ -215,11 +217,11 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
    }
  
    // Set up the register classes.
-  addRegisterClass(MVT::i8, X86::GR8RegisterClass);
-  addRegisterClass(MVT::i16, X86::GR16RegisterClass);
-  addRegisterClass(MVT::i32, X86::GR32RegisterClass);
+  addRegisterClass(MVT::i8, &X86::GR8RegClass);
+  addRegisterClass(MVT::i16, &X86::GR16RegClass);
+  addRegisterClass(MVT::i32, &X86::GR32RegClass);
    if (Subtarget->is64Bit())
-    addRegisterClass(MVT::i64, X86::GR64RegisterClass);
+    addRegisterClass(MVT::i64, &X86::GR64RegClass);
  
    setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
  
@@ -345,7 +347,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
    // (low) operations are left as Legal, as there are single-result
    // instructions for this in x86. Using the two-result multiply instructions
    // when both high and low results are needed must be arranged by dagcombine.
-  for (unsigned i = 0, e = 4; i != e; ++i) {
+  for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
      MVT VT = IntVTs[i];
      setOperationAction(ISD::MULHS, VT, Expand);
      setOperationAction(ISD::MULHU, VT, Expand);
@@ -492,7 +494,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
    setShouldFoldAtomicFences(true);
  
    // Expand certain atomics
-  for (unsigned i = 0, e = 4; i != e; ++i) {
+  for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
      MVT VT = IntVTs[i];
      setOperationAction(ISD::ATOMIC_CMP_SWAP, VT, Custom);
      setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
@@ -567,8 +569,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
    if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) {
      // f32 and f64 use SSE.
      // Set up the FP register classes.
-    addRegisterClass(MVT::f32, X86::FR32RegisterClass);
-    addRegisterClass(MVT::f64, X86::FR64RegisterClass);
+    addRegisterClass(MVT::f32, &X86::FR32RegClass);
+    addRegisterClass(MVT::f64, &X86::FR64RegClass);
  
      // Use ANDPD to simulate FABS.
      setOperationAction(ISD::FABS , MVT::f64, Custom);
@@ -599,8 +601,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
    } else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) {
      // Use SSE for f32, x87 for f64.
      // Set up the FP register classes.
-    addRegisterClass(MVT::f32, X86::FR32RegisterClass);
-    addRegisterClass(MVT::f64, X86::RFP64RegisterClass);
+    addRegisterClass(MVT::f32, &X86::FR32RegClass);
+    addRegisterClass(MVT::f64, &X86::RFP64RegClass);
  
      // Use ANDPS to simulate FABS.
      setOperationAction(ISD::FABS , MVT::f32, Custom);
@@ -632,8 +634,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
    } else if (!TM.Options.UseSoftFloat) {
      // f32 and f64 in x87.
      // Set up the FP register classes.
-    addRegisterClass(MVT::f64, X86::RFP64RegisterClass);
-    addRegisterClass(MVT::f32, X86::RFP32RegisterClass);
+    addRegisterClass(MVT::f64, &X86::RFP64RegClass);
+    addRegisterClass(MVT::f32, &X86::RFP32RegClass);
  
      setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
      setOperationAction(ISD::UNDEF,     MVT::f32, Expand);
@@ -660,7 +662,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
  
    // Long double always uses X87.
    if (!TM.Options.UseSoftFloat) {
-    addRegisterClass(MVT::f80, X86::RFP80RegisterClass);
+    addRegisterClass(MVT::f80, &X86::RFP80RegClass);
      setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
      setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
      {
@@ -705,8 +707,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
    // First set operation action for all vector types to either promote
    // (for widening) or expand (for scalarization). Then we will selectively
    // turn on ones that can be effectively codegen'd.
-  for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
-       VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
+  for (int VT = MVT::FIRST_VECTOR_VALUETYPE;
+           VT <= MVT::LAST_VECTOR_VALUETYPE; ++VT) {
      setOperationAction(ISD::ADD , (MVT::SimpleValueType)VT, Expand);
      setOperationAction(ISD::SUB , (MVT::SimpleValueType)VT, Expand);
      setOperationAction(ISD::FADD, (MVT::SimpleValueType)VT, Expand);
@@ -729,6 +731,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
      setOperationAction(ISD::FSIN, (MVT::SimpleValueType)VT, Expand);
      setOperationAction(ISD::FCOS, (MVT::SimpleValueType)VT, Expand);
      setOperationAction(ISD::FREM, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::FMA,  (MVT::SimpleValueType)VT, Expand);
      setOperationAction(ISD::FPOWI, (MVT::SimpleValueType)VT, Expand);
      setOperationAction(ISD::FSQRT, (MVT::SimpleValueType)VT, Expand);
      setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand);
@@ -764,8 +767,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
      setOperationAction(ISD::ZERO_EXTEND,  (MVT::SimpleValueType)VT, Expand);
      setOperationAction(ISD::ANY_EXTEND,  (MVT::SimpleValueType)VT, Expand);
      setOperationAction(ISD::VSELECT,  (MVT::SimpleValueType)VT, Expand);
-    for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
-         InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
+    for (int InnerVT = MVT::FIRST_VECTOR_VALUETYPE;
+             InnerVT <= MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
        setTruncStoreAction((MVT::SimpleValueType)VT,
                            (MVT::SimpleValueType)InnerVT, Expand);
      setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand);
@@ -776,7 +779,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
    // FIXME: In order to prevent SSE instructions being expanded to MMX ones
    // with -msoft-float, disable use of MMX as well.
    if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) {
-    addRegisterClass(MVT::x86mmx, X86::VR64RegisterClass);
+    addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
      // No operations on x86mmx supported, everything uses intrinsics.
    }
  
@@ -813,7 +816,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
    setOperationAction(ISD::BITCAST,            MVT::v1i64, Expand);
  
    if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) {
-    addRegisterClass(MVT::v4f32, X86::VR128RegisterClass);
+    addRegisterClass(MVT::v4f32, &X86::VR128RegClass);
  
      setOperationAction(ISD::FADD,               MVT::v4f32, Legal);
      setOperationAction(ISD::FSUB,               MVT::v4f32, Legal);
@@ -826,18 +829,17 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
      setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
      setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
-    setOperationAction(ISD::SETCC,              MVT::v4f32, Custom);
    }
  
    if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) {
-    addRegisterClass(MVT::v2f64, X86::VR128RegisterClass);
+    addRegisterClass(MVT::v2f64, &X86::VR128RegClass);
  
      // FIXME: Unfortunately -soft-float and -no-implicit-float means XMM
      // registers cannot be used even for integer operations.
-    addRegisterClass(MVT::v16i8, X86::VR128RegisterClass);
-    addRegisterClass(MVT::v8i16, X86::VR128RegisterClass);
-    addRegisterClass(MVT::v4i32, X86::VR128RegisterClass);
-    addRegisterClass(MVT::v2i64, X86::VR128RegisterClass);
+    addRegisterClass(MVT::v16i8, &X86::VR128RegClass);
+    addRegisterClass(MVT::v8i16, &X86::VR128RegClass);
+    addRegisterClass(MVT::v4i32, &X86::VR128RegClass);
+    addRegisterClass(MVT::v2i64, &X86::VR128RegClass);
  
      setOperationAction(ISD::ADD,                MVT::v16i8, Legal);
      setOperationAction(ISD::ADD,                MVT::v8i16, Legal);
@@ -867,27 +869,18 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
      setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
      setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
  
-    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v2f64, Custom);
-    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v2i64, Custom);
-    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i8, Custom);
-    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i16, Custom);
-    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v4i32, Custom);
-
      // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
-    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; ++i) {
-      EVT VT = (MVT::SimpleValueType)i;
+    for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
+      MVT VT = (MVT::SimpleValueType)i;
        // Do not attempt to custom lower non-power-of-2 vectors
        if (!isPowerOf2_32(VT.getVectorNumElements()))
          continue;
        // Do not attempt to custom lower non-128-bit vectors
        if (!VT.is128BitVector())
          continue;
-      setOperationAction(ISD::BUILD_VECTOR,
-                         VT.getSimpleVT().SimpleTy, Custom);
-      setOperationAction(ISD::VECTOR_SHUFFLE,
-                         VT.getSimpleVT().SimpleTy, Custom);
-      setOperationAction(ISD::EXTRACT_VECTOR_ELT,
-                         VT.getSimpleVT().SimpleTy, Custom);
+      setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
+      setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
+      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
      }
  
      setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f64, Custom);
@@ -903,24 +896,23 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
      }
  
      // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
-    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; i++) {
-      MVT::SimpleValueType SVT = (MVT::SimpleValueType)i;
-      EVT VT = SVT;
+    for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
+      MVT VT = (MVT::SimpleValueType)i;
  
        // Do not attempt to promote non-128-bit vectors
        if (!VT.is128BitVector())
          continue;
  
-      setOperationAction(ISD::AND,    SVT, Promote);
-      AddPromotedToType (ISD::AND,    SVT, MVT::v2i64);
-      setOperationAction(ISD::OR,     SVT, Promote);
-      AddPromotedToType (ISD::OR,     SVT, MVT::v2i64);
-      setOperationAction(ISD::XOR,    SVT, Promote);
-      AddPromotedToType (ISD::XOR,    SVT, MVT::v2i64);
-      setOperationAction(ISD::LOAD,   SVT, Promote);
-      AddPromotedToType (ISD::LOAD,   SVT, MVT::v2i64);
-      setOperationAction(ISD::SELECT, SVT, Promote);
-      AddPromotedToType (ISD::SELECT, SVT, MVT::v2i64);
+      setOperationAction(ISD::AND,    VT, Promote);
+      AddPromotedToType (ISD::AND,    VT, MVT::v2i64);
+      setOperationAction(ISD::OR,     VT, Promote);
+      AddPromotedToType (ISD::OR,     VT, MVT::v2i64);
+      setOperationAction(ISD::XOR,    VT, Promote);
+      AddPromotedToType (ISD::XOR,    VT, MVT::v2i64);
+      setOperationAction(ISD::LOAD,   VT, Promote);
+      AddPromotedToType (ISD::LOAD,   VT, MVT::v2i64);
+      setOperationAction(ISD::SELECT, VT, Promote);
+      AddPromotedToType (ISD::SELECT, VT, MVT::v2i64);
      }
  
      setTruncStoreAction(MVT::f64, MVT::f32, Expand);
@@ -1007,16 +999,13 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
      }
    }
  
-  if (Subtarget->hasSSE42())
-    setOperationAction(ISD::SETCC,             MVT::v2i64, Custom);
-
    if (!TM.Options.UseSoftFloat && Subtarget->hasAVX()) {
-    addRegisterClass(MVT::v32i8,  X86::VR256RegisterClass);
-    addRegisterClass(MVT::v16i16, X86::VR256RegisterClass);
-    addRegisterClass(MVT::v8i32,  X86::VR256RegisterClass);
-    addRegisterClass(MVT::v8f32,  X86::VR256RegisterClass);
-    addRegisterClass(MVT::v4i64,  X86::VR256RegisterClass);
-    addRegisterClass(MVT::v4f64,  X86::VR256RegisterClass);
+    addRegisterClass(MVT::v32i8,  &X86::VR256RegClass);
+    addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
+    addRegisterClass(MVT::v8i32,  &X86::VR256RegClass);
+    addRegisterClass(MVT::v8f32,  &X86::VR256RegClass);
+    addRegisterClass(MVT::v4i64,  &X86::VR256RegClass);
+    addRegisterClass(MVT::v4f64,  &X86::VR256RegClass);
  
      setOperationAction(ISD::LOAD,               MVT::v8f32, Legal);
      setOperationAction(ISD::LOAD,               MVT::v4f64, Legal);
@@ -1040,13 +1029,6 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
      setOperationAction(ISD::SINT_TO_FP,         MVT::v8i32, Legal);
      setOperationAction(ISD::FP_ROUND,           MVT::v4f32, Legal);
  
-    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v4f64,  Custom);
-    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v4i64,  Custom);
-    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8f32,  Custom);
-    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i32,  Custom);
-    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v32i8,  Custom);
-    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i16, Custom);
-
      setOperationAction(ISD::SRL,               MVT::v16i16, Custom);
      setOperationAction(ISD::SRL,               MVT::v32i8, Custom);
  
@@ -1070,6 +1052,15 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
      setOperationAction(ISD::VSELECT,           MVT::v8i32, Legal);
      setOperationAction(ISD::VSELECT,           MVT::v8f32, Legal);
  
+    if (Subtarget->hasFMA()) {
+      setOperationAction(ISD::FMA,             MVT::v8f32, Custom);
+      setOperationAction(ISD::FMA,             MVT::v4f64, Custom);
+      setOperationAction(ISD::FMA,             MVT::v4f32, Custom);
+      setOperationAction(ISD::FMA,             MVT::v2f64, Custom);
+      setOperationAction(ISD::FMA,             MVT::f32, Custom);
+      setOperationAction(ISD::FMA,             MVT::f64, Custom);
+    }
+
      if (Subtarget->hasAVX2()) {
        setOperationAction(ISD::ADD,             MVT::v4i64, Legal);
        setOperationAction(ISD::ADD,             MVT::v8i32, Legal);
@@ -1121,60 +1112,60 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
      }
  
      // Custom lower several nodes for 256-bit types.
-    for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
-                  i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) {
-      MVT::SimpleValueType SVT = (MVT::SimpleValueType)i;
-      EVT VT = SVT;
+    for (int i = MVT::FIRST_VECTOR_VALUETYPE;
+             i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
+      MVT VT = (MVT::SimpleValueType)i;
  
        // Extract subvector is special because the value type
        // (result) is 128-bit but the source is 256-bit wide.
        if (VT.is128BitVector())
-        setOperationAction(ISD::EXTRACT_SUBVECTOR, SVT, Custom);
+        setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
  
        // Do not attempt to custom lower other non-256-bit vectors
        if (!VT.is256BitVector())
          continue;
  
-      setOperationAction(ISD::BUILD_VECTOR,       SVT, Custom);
-      setOperationAction(ISD::VECTOR_SHUFFLE,     SVT, Custom);
-      setOperationAction(ISD::INSERT_VECTOR_ELT,  SVT, Custom);
-      setOperationAction(ISD::EXTRACT_VECTOR_ELT, SVT, Custom);
-      setOperationAction(ISD::SCALAR_TO_VECTOR,   SVT, Custom);
-      setOperationAction(ISD::INSERT_SUBVECTOR,   SVT, Custom);
+      setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
+      setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
+      setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
+      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+      setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
+      setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Custom);
+      setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
      }
  
      // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
-    for (unsigned i = (unsigned)MVT::v32i8; i != (unsigned)MVT::v4i64; ++i) {
-      MVT::SimpleValueType SVT = (MVT::SimpleValueType)i;
-      EVT VT = SVT;
+    for (int i = MVT::v32i8; i != MVT::v4i64; ++i) {
+      MVT VT = (MVT::SimpleValueType)i;
  
        // Do not attempt to promote non-256-bit vectors
        if (!VT.is256BitVector())
          continue;
  
-      setOperationAction(ISD::AND,    SVT, Promote);
-      AddPromotedToType (ISD::AND,    SVT, MVT::v4i64);
-      setOperationAction(ISD::OR,     SVT, Promote);
-      AddPromotedToType (ISD::OR,     SVT, MVT::v4i64);
-      setOperationAction(ISD::XOR,    SVT, Promote);
-      AddPromotedToType (ISD::XOR,    SVT, MVT::v4i64);
-      setOperationAction(ISD::LOAD,   SVT, Promote);
-      AddPromotedToType (ISD::LOAD,   SVT, MVT::v4i64);
-      setOperationAction(ISD::SELECT, SVT, Promote);
-      AddPromotedToType (ISD::SELECT, SVT, MVT::v4i64);
+      setOperationAction(ISD::AND,    VT, Promote);
+      AddPromotedToType (ISD::AND,    VT, MVT::v4i64);
+      setOperationAction(ISD::OR,     VT, Promote);
+      AddPromotedToType (ISD::OR,     VT, MVT::v4i64);
+      setOperationAction(ISD::XOR,    VT, Promote);
+      AddPromotedToType (ISD::XOR,    VT, MVT::v4i64);
+      setOperationAction(ISD::LOAD,   VT, Promote);
+      AddPromotedToType (ISD::LOAD,   VT, MVT::v4i64);
+      setOperationAction(ISD::SELECT, VT, Promote);
+      AddPromotedToType (ISD::SELECT, VT, MVT::v4i64);
      }
    }
  
    // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion
    // of this type with custom code.
-  for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
-         VT != (unsigned)MVT::LAST_VECTOR_VALUETYPE; VT++) {
+  for (int VT = MVT::FIRST_VECTOR_VALUETYPE;
+           VT != MVT::LAST_VECTOR_VALUETYPE; VT++) {
      setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,
                         Custom);
    }
  
    // We want to custom lower some of our intrinsics.
    setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
  
  
    // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
@@ -1218,17 +1209,21 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
    setTargetDAGCombine(ISD::ADD);
    setTargetDAGCombine(ISD::FADD);
    setTargetDAGCombine(ISD::FSUB);
+  setTargetDAGCombine(ISD::FMA);
    setTargetDAGCombine(ISD::SUB);
    setTargetDAGCombine(ISD::LOAD);
    setTargetDAGCombine(ISD::STORE);
    setTargetDAGCombine(ISD::ZERO_EXTEND);
+  setTargetDAGCombine(ISD::ANY_EXTEND);
    setTargetDAGCombine(ISD::SIGN_EXTEND);
    setTargetDAGCombine(ISD::TRUNCATE);
+  setTargetDAGCombine(ISD::UINT_TO_FP);
    setTargetDAGCombine(ISD::SINT_TO_FP);
+  setTargetDAGCombine(ISD::SETCC);
+  setTargetDAGCombine(ISD::FP_TO_SINT);
    if (Subtarget->is64Bit())
      setTargetDAGCombine(ISD::MUL);
-  if (Subtarget->hasBMI())
-    setTargetDAGCombine(ISD::XOR);
+  setTargetDAGCombine(ISD::XOR);
  
    computeRegisterProperties();
  
@@ -1243,6 +1238,9 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
    setPrefLoopAlignment(4); // 2^4 bytes.
    benefitFromCodePlacementOpt = true;
  
+  // Predictable cmov don't hurt on atom because it's in-order.
+  predictableSelectIsExpensive = !Subtarget->isAtom();
+
    setPrefFunctionAlignment(4); // 2^4 bytes.
  }
  
@@ -1276,7 +1274,6 @@ static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
          break;
      }
    }
-  return;
  }
  
  /// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
@@ -1411,18 +1408,19 @@ X86TargetLowering::findRepresentativeClass(EVT VT) const{
    default:
      return TargetLowering::findRepresentativeClass(VT);
    case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
-    RRC = (Subtarget->is64Bit()
-           ? X86::GR64RegisterClass : X86::GR32RegisterClass);
+    RRC = Subtarget->is64Bit() ?
+      (const TargetRegisterClass*)&X86::GR64RegClass :
+      (const TargetRegisterClass*)&X86::GR32RegClass;
      break;
    case MVT::x86mmx:
-    RRC = X86::VR64RegisterClass;
+    RRC = &X86::VR64RegClass;
      break;
    case MVT::f32: case MVT::f64:
    case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
    case MVT::v4f32: case MVT::v2f64:
    case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32:
    case MVT::v4f64:
-    RRC = X86::VR128RegisterClass;
+    RRC = &X86::VR128RegClass;
      break;
    }
    return std::make_pair(RRC, Cost);
@@ -1457,7 +1455,7 @@ bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
  
  bool
  X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv,
-                                 MachineFunction &MF, bool isVarArg,
+                                  MachineFunction &MF, bool isVarArg,
                          const SmallVectorImpl<ISD::OutputArg> &Outs,
                          LLVMContext &Context) const {
    SmallVector<CCValAssign, 16> RVLocs;
@@ -1501,6 +1499,16 @@ X86TargetLowering::LowerReturn(SDValue Chain,
      SDValue ValToCopy = OutVals[i];
      EVT ValVT = ValToCopy.getValueType();
  
+    // Promote values to the appropriate types
+    if (VA.getLocInfo() == CCValAssign::SExt)
+      ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
+    else if (VA.getLocInfo() == CCValAssign::ZExt)
+      ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
+    else if (VA.getLocInfo() == CCValAssign::AExt)
+      ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
+    else if (VA.getLocInfo() == CCValAssign::BCvt)
+      ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy);
+
      // If this is x86-64, and we disabled SSE, we can't return FP values,
      // or SSE or MMX vectors.
      if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
@@ -1578,20 +1586,21 @@ X86TargetLowering::LowerReturn(SDValue Chain,
                       MVT::Other, &RetOps[0], RetOps.size());
  }
  
-bool X86TargetLowering::isUsedByReturnOnly(SDNode *N) const {
+bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
    if (N->getNumValues() != 1)
      return false;
    if (!N->hasNUsesOfValue(1, 0))
      return false;
  
+  SDValue TCChain = Chain;
    SDNode *Copy = *N->use_begin();
-  if (Copy->getOpcode() != ISD::CopyToReg &&
-      Copy->getOpcode() != ISD::FP_EXTEND)
-    return false;
-
-  // If anything is glued to the copy, then we can't safely perform a tail call.
-  if (Copy->getOpcode() == ISD::CopyToReg &&
-      Copy->getNumOperands() == 4)
+  if (Copy->getOpcode() == ISD::CopyToReg) {
+    // If the copy has a glue operand, we conservatively assume it isn't safe to
+    // perform a tail call.
+    if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
+      return false;
+    TCChain = Copy->getOperand(0);
+  } else if (Copy->getOpcode() != ISD::FP_EXTEND)
      return false;
  
    bool HasRet = false;
@@ -1602,7 +1611,11 @@ bool X86TargetLowering::isUsedByReturnOnly(SDNode *N) const {
      HasRet = true;
    }
  
-  return HasRet;
+  if (!HasRet)
+    return false;
+
+  Chain = TCChain;
+  return true;
  }
  
  EVT
@@ -1633,7 +1646,7 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
    SmallVector<CCValAssign, 16> RVLocs;
    bool Is64Bit = Subtarget->is64Bit();
    CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                getTargetMachine(), RVLocs, *DAG.getContext());
+                 getTargetMachine(), RVLocs, *DAG.getContext());
    CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
  
    // Copy all of the result registers out of their specified physreg.
@@ -1650,7 +1663,7 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
      SDValue Val;
  
      // If this is a call to a function that returns an fp value on the floating
-    // point stack, we must guarantee the the value is popped from the stack, so
+    // point stack, we must guarantee the value is popped from the stack, so
      // a CopyFromReg is not good enough - the copy instruction may be eliminated
      // if the return value is not used. We use the FpPOP_RETVAL instruction
      // instead.
@@ -1694,21 +1707,37 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
  
  /// CallIsStructReturn - Determines whether a call uses struct return
  /// semantics.
-static bool CallIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
+enum StructReturnType {
+  NotStructReturn,
+  RegStructReturn,
+  StackStructReturn
+};
+static StructReturnType
+callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
    if (Outs.empty())
-    return false;
+    return NotStructReturn;
  
-  return Outs[0].Flags.isSRet();
+  const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
+  if (!Flags.isSRet())
+    return NotStructReturn;
+  if (Flags.isInReg())
+    return RegStructReturn;
+  return StackStructReturn;
  }
  
  /// ArgsAreStructReturn - Determines whether a function uses struct
  /// return semantics.
-static bool
-ArgsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
+static StructReturnType
+argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
    if (Ins.empty())
-    return false;
+    return NotStructReturn;
  
-  return Ins[0].Flags.isSRet();
+  const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
+  if (!Flags.isSRet())
+    return NotStructReturn;
+  if (Flags.isInReg())
+    return RegStructReturn;
+  return StackStructReturn;
  }
  
  /// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified
@@ -1845,19 +1874,19 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
        EVT RegVT = VA.getLocVT();
        const TargetRegisterClass *RC;
        if (RegVT == MVT::i32)
-        RC = X86::GR32RegisterClass;
+        RC = &X86::GR32RegClass;
        else if (Is64Bit && RegVT == MVT::i64)
-        RC = X86::GR64RegisterClass;
+        RC = &X86::GR64RegClass;
        else if (RegVT == MVT::f32)
-        RC = X86::FR32RegisterClass;
+        RC = &X86::FR32RegClass;
        else if (RegVT == MVT::f64)
-        RC = X86::FR64RegisterClass;
-      else if (RegVT.isVector() && RegVT.getSizeInBits() == 256)
-        RC = X86::VR256RegisterClass;
-      else if (RegVT.isVector() && RegVT.getSizeInBits() == 128)
-        RC = X86::VR128RegisterClass;
+        RC = &X86::FR64RegClass;
+      else if (RegVT.is256BitVector())
+        RC = &X86::VR256RegClass;
+      else if (RegVT.is128BitVector())
+        RC = &X86::VR128RegClass;
        else if (RegVT == MVT::x86mmx)
-        RC = X86::VR64RegisterClass;
+        RC = &X86::VR64RegClass;
        else
          llvm_unreachable("Unknown argument type!");
  
@@ -1928,17 +1957,17 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
        unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0;
  
        // FIXME: We should really autogenerate these arrays
-      static const unsigned GPR64ArgRegsWin64[] = {
+      static const uint16_t GPR64ArgRegsWin64[] = {
          X86::RCX, X86::RDX, X86::R8,  X86::R9
        };
-      static const unsigned GPR64ArgRegs64Bit[] = {
+      static const uint16_t GPR64ArgRegs64Bit[] = {
          X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
        };
-      static const unsigned XMMArgRegs64Bit[] = {
+      static const uint16_t XMMArgRegs64Bit[] = {
          X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
          X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
        };
-      const unsigned *GPR64ArgRegs;
+      const uint16_t *GPR64ArgRegs;
        unsigned NumXMMRegs = 0;
  
        if (IsWin64) {
@@ -1999,7 +2028,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
          SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
                                    DAG.getIntPtrConstant(Offset));
          unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs],
-                                     X86::GR64RegisterClass);
+                                     &X86::GR64RegClass);
          SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
          SDValue Store =
            DAG.getStore(Val.getValue(1), dl, Val, FIN,
@@ -2015,7 +2044,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
          SmallVector<SDValue, 11> SaveXMMOps;
          SaveXMMOps.push_back(Chain);
  
-        unsigned AL = MF.addLiveIn(X86::AL, X86::GR8RegisterClass);
+        unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
          SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8);
          SaveXMMOps.push_back(ALVal);
  
@@ -2026,7 +2055,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
  
          for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) {
            unsigned VReg = MF.addLiveIn(XMMArgRegs64Bit[NumXMMRegs],
-                                       X86::VR128RegisterClass);
+                                       &X86::VR128RegClass);
            SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32);
            SaveXMMOps.push_back(Val);
          }
@@ -2049,7 +2078,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
      FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
      // If this is an sret function, the return should pop the hidden pointer.
      if (!Is64Bit && !IsTailCallConvention(CallConv) && !IsWindows &&
-        ArgsAreStructReturn(Ins))
+        argsAreStructReturn(Ins) == StackStructReturn)
        FuncInfo->setBytesToPopOnReturn(4);
    }
  
@@ -2122,19 +2151,24 @@ EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF,
  }
  
  SDValue
-X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
-                             CallingConv::ID CallConv, bool isVarArg,
-                             bool doesNotRet, bool &isTailCall,
-                             const SmallVectorImpl<ISD::OutputArg> &Outs,
-                             const SmallVectorImpl<SDValue> &OutVals,
-                             const SmallVectorImpl<ISD::InputArg> &Ins,
-                             DebugLoc dl, SelectionDAG &DAG,
+X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                               SmallVectorImpl<SDValue> &InVals) const {
+  SelectionDAG &DAG                     = CLI.DAG;
+  DebugLoc &dl                          = CLI.DL;
+  SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
+  SmallVector<SDValue, 32> &OutVals     = CLI.OutVals;
+  SmallVector<ISD::InputArg, 32> &Ins   = CLI.Ins;
+  SDValue Chain                         = CLI.Chain;
+  SDValue Callee                        = CLI.Callee;
+  CallingConv::ID CallConv              = CLI.CallConv;
+  bool &isTailCall                      = CLI.IsTailCall;
+  bool isVarArg                         = CLI.IsVarArg;
+
    MachineFunction &MF = DAG.getMachineFunction();
    bool Is64Bit        = Subtarget->is64Bit();
    bool IsWin64        = Subtarget->isTargetWin64();
    bool IsWindows      = Subtarget->isTargetWindows();
-  bool IsStructRet    = CallIsStructReturn(Outs);
+  StructReturnType SR = callIsStructReturn(Outs);
    bool IsSibcall      = false;
  
    if (MF.getTarget().Options.DisableTailCalls)
@@ -2143,8 +2177,9 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
    if (isTailCall) {
      // Check if it's really possible to do a tail call.
      isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
-                    isVarArg, IsStructRet, MF.getFunction()->hasStructRetAttr(),
-                                                   Outs, OutVals, Ins, DAG);
+                    isVarArg, SR != NotStructReturn,
+                    MF.getFunction()->hasStructRetAttr(),
+                    Outs, OutVals, Ins, DAG);
  
      // Sibcalls are automatically detected tailcalls which do not require
      // ABI changes.
@@ -2226,7 +2261,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
        Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
        break;
      case CCValAssign::AExt:
-      if (RegVT.isVector() && RegVT.getSizeInBits() == 128) {
+      if (RegVT.is128BitVector()) {
          // Special case: passing MMX values in XMM registers.
          Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
          Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
@@ -2277,27 +2312,12 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                          &MemOpChains[0], MemOpChains.size());
  
-  // Build a sequence of copy-to-reg nodes chained together with token chain
-  // and flag operands which copy the outgoing args into registers.
-  SDValue InFlag;
-  // Tail call byval lowering might overwrite argument registers so in case of
-  // tail call optimization the copies to registers are lowered later.
-  if (!isTailCall)
-    for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
-      Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
-                               RegsToPass[i].second, InFlag);
-      InFlag = Chain.getValue(1);
-    }
-
    if (Subtarget->isPICStyleGOT()) {
      // ELF / PIC requires GOT in the EBX register before function calls via PLT
      // GOT pointer.
      if (!isTailCall) {
-      Chain = DAG.getCopyToReg(Chain, dl, X86::EBX,
-                               DAG.getNode(X86ISD::GlobalBaseReg,
-                                           DebugLoc(), getPointerTy()),
-                               InFlag);
-      InFlag = Chain.getValue(1);
+      RegsToPass.push_back(std::make_pair(unsigned(X86::EBX),
+               DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy())));
      } else {
        // If we are tail calling and generating PIC/GOT style code load the
        // address of the callee into ECX. The value in ecx is used as target of
@@ -2327,7 +2347,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
      // registers used and is in the range 0 - 8 inclusive.
  
      // Count the number of XMM registers allocated.
-    static const unsigned XMMArgRegs[] = {
+    static const uint16_t XMMArgRegs[] = {
        X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
        X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
      };
@@ -2335,12 +2355,10 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
      assert((Subtarget->hasSSE1() || !NumXMMRegs)
             && "SSE registers cannot be used when SSE is disabled");
  
-    Chain = DAG.getCopyToReg(Chain, dl, X86::AL,
-                             DAG.getConstant(NumXMMRegs, MVT::i8), InFlag);
-    InFlag = Chain.getValue(1);
+    RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
+                                        DAG.getConstant(NumXMMRegs, MVT::i8)));
    }
  
-
    // For tail calls lower the arguments to the 'real' stack slot.
    if (isTailCall) {
      // Force all the incoming stack arguments to be loaded from the stack
@@ -2354,8 +2372,6 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
      SmallVector<SDValue, 8> MemOpChains2;
      SDValue FIN;
      int FI = 0;
-    // Do not flag preceding copytoreg stuff together with the following stuff.
-    InFlag = SDValue();
      if (getTargetMachine().Options.GuaranteedTailCallOpt) {
        for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
          CCValAssign &VA = ArgLocs[i];
@@ -2395,19 +2411,20 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
        Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                            &MemOpChains2[0], MemOpChains2.size());
  
-    // Copy arguments to their registers.
-    for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
-      Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
-                               RegsToPass[i].second, InFlag);
-      InFlag = Chain.getValue(1);
-    }
-    InFlag =SDValue();
-
      // Store the return address to the appropriate stack slot.
      Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, Is64Bit,
                                       FPDiff, dl);
    }
  
+  // Build a sequence of copy-to-reg nodes chained together with token chain
+  // and flag operands which copy the outgoing args into registers.
+  SDValue InFlag;
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+                             RegsToPass[i].second, InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
    if (getTargetMachine().getCodeModel() == CodeModel::Large) {
      assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
      // In the 64-bit large code model, we have to make all calls
@@ -2509,14 +2526,6 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
      Ops.push_back(DAG.getRegister(RegsToPass[i].first,
                                    RegsToPass[i].second.getValueType()));
  
-  // Add an implicit use GOT pointer in EBX.
-  if (!isTailCall && Subtarget->isPICStyleGOT())
-    Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy()));
-
-  // Add an implicit use of AL for non-Windows x86 64-bit vararg functions.
-  if (Is64Bit && isVarArg && !IsWin64)
-    Ops.push_back(DAG.getRegister(X86::AL, MVT::i8));
-
    // Add a register mask operand representing the call-preserved registers.
    const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
    const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
@@ -2546,7 +2555,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
                         getTargetMachine().Options.GuaranteedTailCallOpt))
      NumBytesForCalleeToPush = NumBytes;    // Callee pops everything
    else if (!Is64Bit && !IsTailCallConvention(CallConv) && !IsWindows &&
-           IsStructRet)
+           SR == StackStructReturn)
      // If this is a call to a struct-return function, the callee
      // pops the hidden struct pointer, so we have to push it back.
      // This is common for Darwin/X86, Linux & Mingw32 targets.
@@ -2738,7 +2747,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
  
      SmallVector<CCValAssign, 16> ArgLocs;
      CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
-                  getTargetMachine(), ArgLocs, *DAG.getContext());
+                   getTargetMachine(), ArgLocs, *DAG.getContext());
  
      CCInfo.AnalyzeCallOperands(Outs, CC_X86);
      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
@@ -2759,7 +2768,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
    if (Unused) {
      SmallVector<CCValAssign, 16> RVLocs;
      CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(),
-                  getTargetMachine(), RVLocs, *DAG.getContext());
+                   getTargetMachine(), RVLocs, *DAG.getContext());
      CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
      for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
        CCValAssign &VA = RVLocs[i];
@@ -2773,12 +2782,12 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
    if (!CCMatch) {
      SmallVector<CCValAssign, 16> RVLocs1;
      CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(),
-                   getTargetMachine(), RVLocs1, *DAG.getContext());
+                    getTargetMachine(), RVLocs1, *DAG.getContext());
      CCInfo1.AnalyzeCallResult(Ins, RetCC_X86);
  
      SmallVector<CCValAssign, 16> RVLocs2;
      CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(),
-                   getTargetMachine(), RVLocs2, *DAG.getContext());
+                    getTargetMachine(), RVLocs2, *DAG.getContext());
      CCInfo2.AnalyzeCallResult(Ins, RetCC_X86);
  
      if (RVLocs1.size() != RVLocs2.size())
@@ -2805,7 +2814,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
      // argument is passed on the stack.
      SmallVector<CCValAssign, 16> ArgLocs;
      CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
-                  getTargetMachine(), ArgLocs, *DAG.getContext());
+                   getTargetMachine(), ArgLocs, *DAG.getContext());
  
      // Allocate shadow area for Win64
      if (Subtarget->isTargetWin64()) {
@@ -2867,8 +2876,9 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
  }
  
  FastISel *
-X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo) const {
-  return X86::createFastISel(funcInfo);
+X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
+                                  const TargetLibraryInfo *libInfo) const {
+  return X86::createFastISel(funcInfo, libInfo);
  }
  
  
@@ -2906,12 +2916,13 @@ static bool isTargetShuffle(unsigned Opcode) {
    case X86ISD::UNPCKH:
    case X86ISD::VPERMILP:
    case X86ISD::VPERM2X128:
+  case X86ISD::VPERMI:
      return true;
    }
  }
  
  static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
-                                               SDValue V1, SelectionDAG &DAG) {
+                                    SDValue V1, SelectionDAG &DAG) {
    switch(Opc) {
    default: llvm_unreachable("Unknown x86 shuffle node");
    case X86ISD::MOVSHDUP:
@@ -2922,19 +2933,22 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
  }
  
  static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
-                          SDValue V1, unsigned TargetMask, SelectionDAG &DAG) {
+                                    SDValue V1, unsigned TargetMask,
+                                    SelectionDAG &DAG) {
    switch(Opc) {
    default: llvm_unreachable("Unknown x86 shuffle node");
    case X86ISD::PSHUFD:
    case X86ISD::PSHUFHW:
    case X86ISD::PSHUFLW:
    case X86ISD::VPERMILP:
+  case X86ISD::VPERMI:
      return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8));
    }
  }
  
  static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
-               SDValue V1, SDValue V2, unsigned TargetMask, SelectionDAG &DAG) {
+                                    SDValue V1, SDValue V2, unsigned TargetMask,
+                                    SelectionDAG &DAG) {
    switch(Opc) {
    default: llvm_unreachable("Unknown x86 shuffle node");
    case X86ISD::PALIGN:
@@ -3043,10 +3057,12 @@ static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
          // X > -1   -> X == 0, jump !sign.
          RHS = DAG.getConstant(0, RHS.getValueType());
          return X86::COND_NS;
-      } else if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
+      }
+      if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
          // X < 0   -> X == 0, jump on sign.
          return X86::COND_S;
-      } else if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
+      }
+      if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
          // X < 1   -> X <= 0
          RHS = DAG.getConstant(0, RHS.getValueType());
          return X86::COND_LE;
@@ -3162,12 +3178,12 @@ static bool isUndefOrEqual(int Val, int CmpVal) {
    return false;
  }
  
-/// isSequentialOrUndefInRange - Return true if every element in Mask, begining
+/// isSequentialOrUndefInRange - Return true if every element in Mask, beginning
  /// from position Pos and ending in Pos+Size, falls within the specified
  /// sequential range (L, L+Pos]. or is undef.
  static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
-                                       int Pos, int Size, int Low) {
-  for (int i = Pos, e = Pos+Size; i != e; ++i, ++Low)
+                                       unsigned Pos, unsigned Size, int Low) {
+  for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
      if (!isUndefOrEqual(Mask[i], Low))
        return false;
    return true;
@@ -3186,8 +3202,8 @@ static bool isPSHUFDMask(ArrayRef<int> Mask, EVT VT) {
  
  /// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
  /// is suitable for input to PSHUFHW.
-static bool isPSHUFHWMask(ArrayRef<int> Mask, EVT VT) {
-  if (VT != MVT::v8i16)
+static bool isPSHUFHWMask(ArrayRef<int> Mask, EVT VT, bool HasAVX2) {
+  if (VT != MVT::v8i16 && (!HasAVX2 || VT != MVT::v16i16))
      return false;
  
    // Lower quadword copied in order or undef.
@@ -3196,16 +3212,27 @@ static bool isPSHUFHWMask(ArrayRef<int> Mask, EVT VT) {
  
    // Upper quadword shuffled.
    for (unsigned i = 4; i != 8; ++i)
-    if (Mask[i] >= 0 && (Mask[i] < 4 || Mask[i] > 7))
+    if (!isUndefOrInRange(Mask[i], 4, 8))
+      return false;
+
+  if (VT == MVT::v16i16) {
+    // Lower quadword copied in order or undef.
+    if (!isSequentialOrUndefInRange(Mask, 8, 4, 8))
        return false;
  
+    // Upper quadword shuffled.
+    for (unsigned i = 12; i != 16; ++i)
+      if (!isUndefOrInRange(Mask[i], 12, 16))
+        return false;
+  }
+
    return true;
  }
  
  /// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
  /// is suitable for input to PSHUFLW.
-static bool isPSHUFLWMask(ArrayRef<int> Mask, EVT VT) {
-  if (VT != MVT::v8i16)
+static bool isPSHUFLWMask(ArrayRef<int> Mask, EVT VT, bool HasAVX2) {
+  if (VT != MVT::v8i16 && (!HasAVX2 || VT != MVT::v16i16))
      return false;
  
    // Upper quadword copied in order.
@@ -3214,9 +3241,20 @@ static bool isPSHUFLWMask(ArrayRef<int> Mask, EVT VT) {
  
    // Lower quadword shuffled.
    for (unsigned i = 0; i != 4; ++i)
-    if (Mask[i] >= 4)
+    if (!isUndefOrInRange(Mask[i], 0, 4))
        return false;
  
+  if (VT == MVT::v16i16) {
+    // Upper quadword copied in order.
+    if (!isSequentialOrUndefInRange(Mask, 12, 4, 12))
+      return false;
+
+    // Lower quadword shuffled.
+    for (unsigned i = 8; i != 12; ++i)
+      if (!isUndefOrInRange(Mask[i], 8, 12))
+        return false;
+  }
+
    return true;
  }
  
@@ -3366,11 +3404,11 @@ static bool isSHUFPMask(ArrayRef<int> Mask, EVT VT, bool HasAVX,
  /// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
  /// specifies a shuffle of elements that is suitable for input to MOVHLPS.
  static bool isMOVHLPSMask(ArrayRef<int> Mask, EVT VT) {
-  unsigned NumElems = VT.getVectorNumElements();
-
-  if (VT.getSizeInBits() != 128)
+  if (!VT.is128BitVector())
      return false;
  
+  unsigned NumElems = VT.getVectorNumElements();
+
    if (NumElems != 4)
      return false;
  
@@ -3385,11 +3423,11 @@ static bool isMOVHLPSMask(ArrayRef<int> Mask, EVT VT) {
  /// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
  /// <2, 3, 2, 3>
  static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, EVT VT) {
-  unsigned NumElems = VT.getVectorNumElements();
-
-  if (VT.getSizeInBits() != 128)
+  if (!VT.is128BitVector())
      return false;
  
+  unsigned NumElems = VT.getVectorNumElements();
+
    if (NumElems != 4)
      return false;
  
@@ -3402,7 +3440,7 @@ static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, EVT VT) {
  /// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
  /// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
  static bool isMOVLPMask(ArrayRef<int> Mask, EVT VT) {
-  if (VT.getSizeInBits() != 128)
+  if (!VT.is128BitVector())
      return false;
  
    unsigned NumElems = VT.getVectorNumElements();
@@ -3410,11 +3448,11 @@ static bool isMOVLPMask(ArrayRef<int> Mask, EVT VT) {
    if (NumElems != 2 && NumElems != 4)
      return false;
  
-  for (unsigned i = 0; i != NumElems/2; ++i)
+  for (unsigned i = 0, e = NumElems/2; i != e; ++i)
      if (!isUndefOrEqual(Mask[i], i + NumElems))
        return false;
  
-  for (unsigned i = NumElems/2; i != NumElems; ++i)
+  for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
      if (!isUndefOrEqual(Mask[i], i))
        return false;
  
@@ -3424,23 +3462,71 @@ static bool isMOVLPMask(ArrayRef<int> Mask, EVT VT) {
  /// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand
  /// specifies a shuffle of elements that is suitable for input to MOVLHPS.
  static bool isMOVLHPSMask(ArrayRef<int> Mask, EVT VT) {
+  if (!VT.is128BitVector())
+    return false;
+
    unsigned NumElems = VT.getVectorNumElements();
  
-  if ((NumElems != 2 && NumElems != 4)
-      || VT.getSizeInBits() > 128)
+  if (NumElems != 2 && NumElems != 4)
      return false;
  
-  for (unsigned i = 0; i != NumElems/2; ++i)
+  for (unsigned i = 0, e = NumElems/2; i != e; ++i)
      if (!isUndefOrEqual(Mask[i], i))
        return false;
  
-  for (unsigned i = 0; i != NumElems/2; ++i)
-    if (!isUndefOrEqual(Mask[i + NumElems/2], i + NumElems))
+  for (unsigned i = 0, e = NumElems/2; i != e; ++i)
+    if (!isUndefOrEqual(Mask[i + e], i + NumElems))
        return false;
  
    return true;
  }
  
+//
+// Some special combinations that can be optimized.
+//
+static
+SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp,
+                               SelectionDAG &DAG) {
+  EVT VT = SVOp->getValueType(0);
+  DebugLoc dl = SVOp->getDebugLoc();
+
+  if (VT != MVT::v8i32 && VT != MVT::v8f32)
+    return SDValue();
+
+  ArrayRef<int> Mask = SVOp->getMask();
+
+  // These are the special masks that may be optimized.
+  static const int MaskToOptimizeEven[] = {0, 8, 2, 10, 4, 12, 6, 14};
+  static const int MaskToOptimizeOdd[]  = {1, 9, 3, 11, 5, 13, 7, 15};
+  bool MatchEvenMask = true;
+  bool MatchOddMask  = true;
+  for (int i=0; i<8; ++i) {
+    if (!isUndefOrEqual(Mask[i], MaskToOptimizeEven[i]))
+      MatchEvenMask = false;
+    if (!isUndefOrEqual(Mask[i], MaskToOptimizeOdd[i]))
+      MatchOddMask = false;
+  }
+  static const int CompactionMaskEven[] = {0, 2, -1, -1, 4, 6, -1, -1};
+  static const int CompactionMaskOdd [] = {1, 3, -1, -1, 5, 7, -1, -1};
+
+  const int *CompactionMask;
+  if (MatchEvenMask)
+    CompactionMask = CompactionMaskEven;
+  else if (MatchOddMask)
+    CompactionMask = CompactionMaskOdd;
+  else
+    return SDValue();
+
+  SDValue UndefNode = DAG.getNode(ISD::UNDEF, dl, VT);
+
+  SDValue Op0 = DAG.getVectorShuffle(VT, dl, SVOp->getOperand(0),
+                                     UndefNode, CompactionMask);
+  SDValue Op1 = DAG.getVectorShuffle(VT, dl, SVOp->getOperand(1),
+                                     UndefNode, CompactionMask);
+  static const int UnpackMask[] = {0, 8, 1, 9, 4, 12, 5, 13};
+  return DAG.getVectorShuffle(VT, dl, Op0, Op1, UnpackMask);
+}
+
  /// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
  /// specifies a shuffle of elements that is suitable for input to UNPCKL.
  static bool isUNPCKLMask(ArrayRef<int> Mask, EVT VT,
@@ -3598,7 +3684,7 @@ static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, EVT VT, bool HasAVX2) {
  static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) {
    if (VT.getVectorElementType().getSizeInBits() < 32)
      return false;
-  if (VT.getSizeInBits() == 256)
+  if (!VT.is128BitVector())
      return false;
  
    unsigned NumElts = VT.getVectorNumElements();
@@ -3620,7 +3706,7 @@ static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) {
  /// The first half comes from the second half of V1 and the second half from the
  /// the second half of V2.
  static bool isVPERM2X128Mask(ArrayRef<int> Mask, EVT VT, bool HasAVX) {
-  if (!HasAVX || VT.getSizeInBits() != 256)
+  if (!HasAVX || !VT.is256BitVector())
      return false;
  
    // The shuffle result is divided into half A and half B. In total the two
@@ -3712,6 +3798,9 @@ static bool isVPERMILPMask(ArrayRef<int> Mask, EVT VT, bool HasAVX) {
  /// element of vector 2 and the other elements to come from vector 1 in order.
  static bool isCommutedMOVLMask(ArrayRef<int> Mask, EVT VT,
                                 bool V2IsSplat = false, bool V2IsUndef = false) {
+  if (!VT.is128BitVector())
+    return false;
+
    unsigned NumOps = VT.getVectorNumElements();
    if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
      return false;
@@ -3778,9 +3867,11 @@ static bool isMOVSLDUPMask(ArrayRef<int> Mask, EVT VT,
  /// specifies a shuffle of elements that is suitable for input to 256-bit
  /// version of MOVDDUP.
  static bool isMOVDDUPYMask(ArrayRef<int> Mask, EVT VT, bool HasAVX) {
-  unsigned NumElts = VT.getVectorNumElements();
+  if (!HasAVX || !VT.is256BitVector())
+    return false;
  
-  if (!HasAVX || VT.getSizeInBits() != 256 || NumElts != 4)
+  unsigned NumElts = VT.getVectorNumElements();
+  if (NumElts != 4)
      return false;
  
    for (unsigned i = 0; i != NumElts/2; ++i)
@@ -3796,7 +3887,7 @@ static bool isMOVDDUPYMask(ArrayRef<int> Mask, EVT VT, bool HasAVX) {
  /// specifies a shuffle of elements that is suitable for input to 128-bit
  /// version of MOVDDUP.
  static bool isMOVDDUPMask(ArrayRef<int> Mask, EVT VT) {
-  if (VT.getSizeInBits() != 128)
+  if (!VT.is128BitVector())
      return false;
  
    unsigned e = VT.getVectorNumElements() / 2;
@@ -3870,9 +3961,8 @@ static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) {
    for (unsigned i = 0; i != NumElts; ++i) {
      int Elt = N->getMaskElt(i);
      if (Elt < 0) continue;
-    Elt %= NumLaneElts;
-    unsigned ShAmt = i << Shift;
-    if (ShAmt >= 8) ShAmt -= 8;
+    Elt &= NumLaneElts - 1;
+    unsigned ShAmt = (i << Shift) % 8;
      Mask |= Elt << ShAmt;
    }
  
@@ -3882,30 +3972,48 @@ static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) {
  /// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
  /// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction.
  static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) {
+  EVT VT = N->getValueType(0);
+
+  assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
+         "Unsupported vector type for PSHUFHW");
+
+  unsigned NumElts = VT.getVectorNumElements();
+
    unsigned Mask = 0;
-  // 8 nodes, but we only care about the last 4.
-  for (unsigned i = 7; i >= 4; --i) {
-    int Val = N->getMaskElt(i);
-    if (Val >= 0)
-      Mask |= (Val - 4);
-    if (i != 4)
-      Mask <<= 2;
+  for (unsigned l = 0; l != NumElts; l += 8) {
+    // 8 nodes per lane, but we only care about the last 4.
+    for (unsigned i = 0; i < 4; ++i) {
+      int Elt = N->getMaskElt(l+i+4);
+      if (Elt < 0) continue;
+      Elt &= 0x3; // only 2-bits.
+      Mask |= Elt << (i * 2);
+    }
    }
+
    return Mask;
  }
  
  /// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
  /// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction.
  static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) {
+  EVT VT = N->getValueType(0);
+
+  assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
+         "Unsupported vector type for PSHUFHW");
+
+  unsigned NumElts = VT.getVectorNumElements();
+
    unsigned Mask = 0;
-  // 8 nodes, but we only care about the first 4.
-  for (int i = 3; i >= 0; --i) {
-    int Val = N->getMaskElt(i);
-    if (Val >= 0)
-      Mask |= Val;
-    if (i != 0)
-      Mask <<= 2;
+  for (unsigned l = 0; l != NumElts; l += 8) {
+    // 8 nodes per lane, but we only care about the first 4.
+    for (unsigned i = 0; i < 4; ++i) {
+      int Elt = N->getMaskElt(l+i);
+      if (Elt < 0) continue;
+      Elt &= 0x3; // only 2-bits
+      Mask |= Elt << (i * 2);
+    }
    }
+
    return Mask;
  }
  
@@ -3967,6 +4075,27 @@ unsigned X86::getInsertVINSERTF128Immediate(SDNode *N) {
    return Index / NumElemsPerChunk;
  }
  
+/// getShuffleCLImmediate - Return the appropriate immediate to shuffle
+/// the specified VECTOR_SHUFFLE mask with VPERMQ and VPERMPD instructions.
+/// Handles 256-bit.
+static unsigned getShuffleCLImmediate(ShuffleVectorSDNode *N) {
+  EVT VT = N->getValueType(0);
+
+  unsigned NumElts = VT.getVectorNumElements();
+
+  assert((VT.is256BitVector() && NumElts == 4) &&
+         "Unsupported vector type for VPERMQ/VPERMPD");
+
+  unsigned Mask = 0;
+  for (unsigned i = 0; i != NumElts; ++i) {
+    int Elt = N->getMaskElt(i);
+    if (Elt < 0)
+      continue;
+    Mask |= Elt << (i*2);
+  }
+
+  return Mask;
+}
  /// isZeroNode - Returns true if Elt is a constant zero or a floating point
  /// constant +0.0.
  bool X86::isZeroNode(SDValue Elt) {
@@ -3985,13 +4114,14 @@ static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp,
    SmallVector<int, 8> MaskVec;
  
    for (unsigned i = 0; i != NumElems; ++i) {
-    int idx = SVOp->getMaskElt(i);
-    if (idx < 0)
-      MaskVec.push_back(idx);
-    else if (idx < (int)NumElems)
-      MaskVec.push_back(idx + NumElems);
-    else
-      MaskVec.push_back(idx - NumElems);
+    int Idx = SVOp->getMaskElt(i);
+    if (Idx >= 0) {
+      if (Idx < (int)NumElems)
+        Idx += NumElems;
+      else
+        Idx -= NumElems;
+    }
+    MaskVec.push_back(Idx);
    }
    return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(1),
                                SVOp->getOperand(0), &MaskVec[0]);
@@ -4002,7 +4132,7 @@ static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp,
  /// V1 (and in order), and the upper half elements should come from the upper
  /// half of V2 (and in order).
  static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, EVT VT) {
-  if (VT.getSizeInBits() != 128)
+  if (!VT.is128BitVector())
      return false;
    if (VT.getVectorNumElements() != 4)
      return false;
@@ -4059,7 +4189,7 @@ static bool WillBeConstantPoolLoad(SDNode *N) {
  /// MOVLP, it must be either a vector load or a scalar load to vector.
  static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
                                 ArrayRef<int> Mask, EVT VT) {
-  if (VT.getSizeInBits() != 128)
+  if (!VT.is128BitVector())
      return false;
  
    if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
@@ -4076,7 +4206,7 @@ static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
    for (unsigned i = 0, e = NumElems/2; i != e; ++i)
      if (!isUndefOrEqual(Mask[i], i))
        return false;
-  for (unsigned i = NumElems/2; i != NumElems; ++i)
+  for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
      if (!isUndefOrEqual(Mask[i], i+NumElems))
        return false;
    return true;
@@ -4128,11 +4258,12 @@ static bool isZeroShuffle(ShuffleVectorSDNode *N) {
  static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
                               SelectionDAG &DAG, DebugLoc dl) {
    assert(VT.isVector() && "Expected a vector type");
+  unsigned Size = VT.getSizeInBits();
  
    // Always build SSE zero vectors as <4 x i32> bitcasted
    // to their dest type. This ensures they get CSE'd.
    SDValue Vec;
-  if (VT.getSizeInBits() == 128) {  // SSE
+  if (Size == 128) {  // SSE
      if (Subtarget->hasSSE2()) {  // SSE2
        SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
        Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
@@ -4140,7 +4271,7 @@ static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
        SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
        Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
      }
-  } else if (VT.getSizeInBits() == 256) { // AVX
+  } else if (Size == 256) { // AVX
      if (Subtarget->hasAVX2()) { // AVX2
        SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
        SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
@@ -4152,7 +4283,9 @@ static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
        SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
        Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops, 8);
      }
-  }
+  } else
+    llvm_unreachable("Unexpected vector type");
+
    return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
  }
  
@@ -4163,25 +4296,22 @@ static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
  static SDValue getOnesVector(EVT VT, bool HasAVX2, SelectionDAG &DAG,
                               DebugLoc dl) {
    assert(VT.isVector() && "Expected a vector type");
-  assert((VT.is128BitVector() || VT.is256BitVector())
-         && "Expected a 128-bit or 256-bit vector type");
+  unsigned Size = VT.getSizeInBits();
  
    SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
    SDValue Vec;
-  if (VT.getSizeInBits() == 256) {
+  if (Size == 256) {
      if (HasAVX2) { // AVX2
        SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
        Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, 8);
      } else { // AVX
        Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
-      SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, MVT::v8i32),
-                                Vec, DAG.getConstant(0, MVT::i32), DAG, dl);
-      Vec = Insert128BitVector(InsV, Vec,
-                    DAG.getConstant(4 /* NumElems/2 */, MVT::i32), DAG, dl);
+      Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
      }
-  } else {
+  } else if (Size == 128) {
      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
-  }
+  } else
+    llvm_unreachable("Unexpected vector type");
  
    return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
  }
@@ -4224,9 +4354,8 @@ static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
  static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
                            SDValue V2) {
    unsigned NumElems = VT.getVectorNumElements();
-  unsigned Half = NumElems/2;
    SmallVector<int, 8> Mask;
-  for (unsigned i = 0; i != Half; ++i) {
+  for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) {
      Mask.push_back(i + Half);
      Mask.push_back(i + NumElems + Half);
    }
@@ -4258,15 +4387,14 @@ static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) {
  static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
    EVT VT = V.getValueType();
    DebugLoc dl = V.getDebugLoc();
-  assert((VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256)
-         && "Vector size not supported");
+  unsigned Size = VT.getSizeInBits();
  
-  if (VT.getSizeInBits() == 128) {
+  if (Size == 128) {
      V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V);
      int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
      V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32),
                               &SplatMask[0]);
-  } else {
+  } else if (Size == 256) {
      // To use VPERMILPS to splat scalars, the second half of indicies must
      // refer to the higher part, which is a duplication of the lower one,
      // because VPERMILPS can only handle in-lane permutations.
@@ -4276,7 +4404,8 @@ static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
      V = DAG.getNode(ISD::BITCAST, dl, MVT::v8f32, V);
      V = DAG.getVectorShuffle(MVT::v8f32, dl, V, DAG.getUNDEF(MVT::v8f32),
                               &SplatMask[0]);
-  }
+  } else
+    llvm_unreachable("Vector size not supported");
  
    return DAG.getNode(ISD::BITCAST, dl, VT, V);
  }
@@ -4297,9 +4426,8 @@ static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
    // Extract the 128-bit part containing the splat element and update
    // the splat element index when it refers to the higher register.
    if (Size == 256) {
-    unsigned Idx = (EltNo >= NumElems/2) ? NumElems/2 : 0;
-    V1 = Extract128BitVector(V1, DAG.getConstant(Idx, MVT::i32), DAG, dl);
-    if (Idx > 0)
+    V1 = Extract128BitVector(V1, EltNo, DAG, dl);
+    if (EltNo >= NumElems/2)
        EltNo -= NumElems/2;
    }
  
@@ -4315,10 +4443,7 @@ static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
    // into the low and high part. This is necessary because we want
    // to use VPERM* to shuffle the vectors
    if (Size == 256) {
-    SDValue InsV = Insert128BitVector(DAG.getUNDEF(SrcVT), V1,
-                         DAG.getConstant(0, MVT::i32), DAG, dl);
-    V1 = Insert128BitVector(InsV, V1,
-               DAG.getConstant(NumElems/2, MVT::i32), DAG, dl);
+    V1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, V1, V1);
    }
  
    return getLegalSplat(DAG, V1, EltNo);
@@ -4343,9 +4468,87 @@ static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
    return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]);
  }
  
+/// getTargetShuffleMask - Calculates the shuffle mask corresponding to the
+/// target specific opcode. Returns true if the Mask could be calculated.
+/// Sets IsUnary to true if only uses one source.
+static bool getTargetShuffleMask(SDNode *N, MVT VT,
+                                 SmallVectorImpl<int> &Mask, bool &IsUnary) {
+  unsigned NumElems = VT.getVectorNumElements();
+  SDValue ImmN;
+
+  IsUnary = false;
+  switch(N->getOpcode()) {
+  case X86ISD::SHUFP:
+    ImmN = N->getOperand(N->getNumOperands()-1);
+    DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+    break;
+  case X86ISD::UNPCKH:
+    DecodeUNPCKHMask(VT, Mask);
+    break;
+  case X86ISD::UNPCKL:
+    DecodeUNPCKLMask(VT, Mask);
+    break;
+  case X86ISD::MOVHLPS:
+    DecodeMOVHLPSMask(NumElems, Mask);
+    break;
+  case X86ISD::MOVLHPS:
+    DecodeMOVLHPSMask(NumElems, Mask);
+    break;
+  case X86ISD::PSHUFD:
+  case X86ISD::VPERMILP:
+    ImmN = N->getOperand(N->getNumOperands()-1);
+    DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+    IsUnary = true;
+    break;
+  case X86ISD::PSHUFHW:
+    ImmN = N->getOperand(N->getNumOperands()-1);
+    DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+    IsUnary = true;
+    break;
+  case X86ISD::PSHUFLW:
+    ImmN = N->getOperand(N->getNumOperands()-1);
+    DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+    IsUnary = true;
+    break;
+  case X86ISD::VPERMI:
+    ImmN = N->getOperand(N->getNumOperands()-1);
+    DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+    IsUnary = true;
+    break;
+  case X86ISD::MOVSS:
+  case X86ISD::MOVSD: {
+    // The index 0 always comes from the first element of the second source,
+    // this is why MOVSS and MOVSD are used in the first place. The other
+    // elements come from the other positions of the first source vector
+    Mask.push_back(NumElems);
+    for (unsigned i = 1; i != NumElems; ++i) {
+      Mask.push_back(i);
+    }
+    break;
+  }
+  case X86ISD::VPERM2X128:
+    ImmN = N->getOperand(N->getNumOperands()-1);
+    DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+    if (Mask.empty()) return false;
+    break;
+  case X86ISD::MOVDDUP:
+  case X86ISD::MOVLHPD:
+  case X86ISD::MOVLPD:
+  case X86ISD::MOVLPS:
+  case X86ISD::MOVSHDUP:
+  case X86ISD::MOVSLDUP:
+  case X86ISD::PALIGN:
+    // Not yet implemented
+    return false;
+  default: llvm_unreachable("unknown target shuffle node");
+  }
+
+  return true;
+}
+
  /// getShuffleScalarElt - Returns the scalar element that will make up the ith
  /// element of the result of the vector shuffle.
-static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG,
+static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
                                     unsigned Depth) {
    if (Depth == 6)
      return SDValue();  // Limit search depth.
@@ -4356,89 +4559,35 @@ static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG,
  
    // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
    if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
-    Index = SV->getMaskElt(Index);
+    int Elt = SV->getMaskElt(Index);
  
-    if (Index < 0)
+    if (Elt < 0)
        return DAG.getUNDEF(VT.getVectorElementType());
  
      unsigned NumElems = VT.getVectorNumElements();
-    SDValue NewV = (Index < (int)NumElems) ? SV->getOperand(0)
-                                           : SV->getOperand(1);
-    return getShuffleScalarElt(NewV.getNode(), Index % NumElems, DAG, Depth+1);
+    SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
+                                         : SV->getOperand(1);
+    return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
    }
  
    // Recurse into target specific vector shuffles to find scalars.
    if (isTargetShuffle(Opcode)) {
-    unsigned NumElems = VT.getVectorNumElements();
-    SmallVector<unsigned, 16> ShuffleMask;
+    MVT ShufVT = V.getValueType().getSimpleVT();
+    unsigned NumElems = ShufVT.getVectorNumElements();
+    SmallVector<int, 16> ShuffleMask;
      SDValue ImmN;
+    bool IsUnary;
  
-    switch(Opcode) {
-    case X86ISD::SHUFP:
-      ImmN = N->getOperand(N->getNumOperands()-1);
-      DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(),
-                      ShuffleMask);
-      break;
-    case X86ISD::UNPCKH:
-      DecodeUNPCKHMask(VT, ShuffleMask);
-      break;
-    case X86ISD::UNPCKL:
-      DecodeUNPCKLMask(VT, ShuffleMask);
-      break;
-    case X86ISD::MOVHLPS:
-      DecodeMOVHLPSMask(NumElems, ShuffleMask);
-      break;
-    case X86ISD::MOVLHPS:
-      DecodeMOVLHPSMask(NumElems, ShuffleMask);
-      break;
-    case X86ISD::PSHUFD:
-    case X86ISD::VPERMILP:
-      ImmN = N->getOperand(N->getNumOperands()-1);
-      DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(),
-                      ShuffleMask);
-      break;
-    case X86ISD::PSHUFHW:
-      ImmN = N->getOperand(N->getNumOperands()-1);
-      DecodePSHUFHWMask(cast<ConstantSDNode>(ImmN)->getZExtValue(),
-                        ShuffleMask);
-      break;
-    case X86ISD::PSHUFLW:
-      ImmN = N->getOperand(N->getNumOperands()-1);
-      DecodePSHUFLWMask(cast<ConstantSDNode>(ImmN)->getZExtValue(),
-                        ShuffleMask);
-      break;
-    case X86ISD::MOVSS:
-    case X86ISD::MOVSD: {
-      // The index 0 always comes from the first element of the second source,
-      // this is why MOVSS and MOVSD are used in the first place. The other
-      // elements come from the other positions of the first source vector.
-      unsigned OpNum = (Index == 0) ? 1 : 0;
-      return getShuffleScalarElt(V.getOperand(OpNum).getNode(), Index, DAG,
-                                 Depth+1);
-    }
-    case X86ISD::VPERM2X128:
-      ImmN = N->getOperand(N->getNumOperands()-1);
-      DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(),
-                           ShuffleMask);
-      break;
-    case X86ISD::MOVDDUP:
-    case X86ISD::MOVLHPD:
-    case X86ISD::MOVLPD:
-    case X86ISD::MOVLPS:
-    case X86ISD::MOVSHDUP:
-    case X86ISD::MOVSLDUP:
-    case X86ISD::PALIGN:
-      return SDValue(); // Not yet implemented.
-    default: llvm_unreachable("unknown target shuffle node");
-    }
-
-    Index = ShuffleMask[Index];
-    if (Index < 0)
-      return DAG.getUNDEF(VT.getVectorElementType());
+    if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary))
+      return SDValue();
+
+    int Elt = ShuffleMask[Index];
+    if (Elt < 0)
+      return DAG.getUNDEF(ShufVT.getVectorElementType());
  
-    SDValue NewV = (Index < (int)NumElems) ? N->getOperand(0)
-                                           : N->getOperand(1);
-    return getShuffleScalarElt(NewV.getNode(), Index % NumElems, DAG,
+    SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0)
+                                         : N->getOperand(1);
+    return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
                                 Depth+1);
    }
  
@@ -4454,7 +4603,7 @@ static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG,
  
    if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
      return (Index == 0) ? V.getOperand(0)
-                          : DAG.getUNDEF(VT.getVectorElementType());
+                        : DAG.getUNDEF(VT.getVectorElementType());
  
    if (V.getOpcode() == ISD::BUILD_VECTOR)
      return V.getOperand(Index);
@@ -4466,38 +4615,37 @@ static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG,
  /// shuffle operation which come from a consecutively from a zero. The
  /// search can start in two different directions, from left or right.
  static
-unsigned getNumOfConsecutiveZeros(SDNode *N, int NumElems,
+unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp, unsigned NumElems,
                                    bool ZerosFromLeft, SelectionDAG &DAG) {
-  int i = 0;
-
-  while (i < NumElems) {
+  unsigned i;
+  for (i = 0; i != NumElems; ++i) {
      unsigned Index = ZerosFromLeft ? i : NumElems-i-1;
-    SDValue Elt = getShuffleScalarElt(N, Index, DAG, 0);
+    SDValue Elt = getShuffleScalarElt(SVOp, Index, DAG, 0);
      if (!(Elt.getNode() &&
           (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt))))
        break;
-    ++i;
    }
  
    return i;
  }
  
-/// isShuffleMaskConsecutive - Check if the shuffle mask indicies from MaskI to
-/// MaskE correspond consecutively to elements from one of the vector operands,
+/// isShuffleMaskConsecutive - Check if the shuffle mask indicies [MaskI, MaskE)
+/// correspond consecutively to elements from one of the vector operands,
  /// starting from its index OpIdx. Also tell OpNum which source vector operand.
  static
-bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp, int MaskI, int MaskE,
-                              int OpIdx, int NumElems, unsigned &OpNum) {
+bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp,
+                              unsigned MaskI, unsigned MaskE, unsigned OpIdx,
+                              unsigned NumElems, unsigned &OpNum) {
    bool SeenV1 = false;
    bool SeenV2 = false;
  
-  for (int i = MaskI; i <= MaskE; ++i, ++OpIdx) {
+  for (unsigned i = MaskI; i != MaskE; ++i, ++OpIdx) {
      int Idx = SVOp->getMaskElt(i);
      // Ignore undef indicies
      if (Idx < 0)
        continue;
  
-    if (Idx < NumElems)
+    if (Idx < (int)NumElems)
        SeenV1 = true;
      else
        SeenV2 = true;
@@ -4532,7 +4680,7 @@ static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
    //
    if (!isShuffleMaskConsecutive(SVOp,
              0,                   // Mask Start Index
-            NumElems-NumZeros-1, // Mask End Index
+            NumElems-NumZeros,   // Mask End Index(exclusive)
              NumZeros,            // Where to start looking in the src vector
              NumElems,            // Number of elements in vector
              OpSrc))              // Which source operand ?
@@ -4565,7 +4713,7 @@ static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
    //
    if (!isShuffleMaskConsecutive(SVOp,
              NumZeros,     // Mask Start Index
-            NumElems-1,   // Mask End Index
+            NumElems,     // Mask End Index(exclusive)
              0,            // Where to start looking in the src vector
              NumElems,     // Number of elements in vector
              OpSrc))       // Which source operand ?
@@ -4583,7 +4731,7 @@ static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
                            bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
    // Although the logic below support any bitwidth size, there are no
    // shift instructions which handle more than 128-bit vectors.
-  if (SVOp->getValueType(0).getSizeInBits() > 128)
+  if (!SVOp->getValueType(0).is128BitVector())
      return false;
  
    if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) ||
@@ -4678,7 +4826,7 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
  static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
                           unsigned NumBits, SelectionDAG &DAG,
                           const TargetLowering &TLI, DebugLoc dl) {
-  assert(VT.getSizeInBits() == 128 && "Unknown type for VShift");
+  assert(VT.is128BitVector() && "Unknown type for VShift");
    EVT ShVT = MVT::v2i64;
    unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
    SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp);
@@ -4746,7 +4894,7 @@ X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl,
                          Ptr,DAG.getConstant(StartOffset, Ptr.getValueType()));
  
      int EltNo = (Offset - StartOffset) >> 2;
-    int NumElems = VT.getVectorNumElements();
+    unsigned NumElems = VT.getVectorNumElements();
  
      EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
      SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
@@ -4754,7 +4902,7 @@ X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl,
                               false, false, false, 0);
  
      SmallVector<int, 8> Mask;
-    for (int i = 0; i < NumElems; ++i)
+    for (unsigned i = 0; i != NumElems; ++i)
        Mask.push_back(EltNo);
  
      return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]);
@@ -4818,8 +4966,9 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
                         LDBase->getPointerInfo(),
                         LDBase->isVolatile(), LDBase->isNonTemporal(),
                         LDBase->isInvariant(), LDBase->getAlignment());
-  } else if (NumElems == 4 && LastLoadedElt == 1 &&
-             DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
+  }
+  if (NumElems == 4 && LastLoadedElt == 1 &&
+      DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
      SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
      SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
      SDValue ResNode =
@@ -4833,41 +4982,45 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
    return SDValue();
  }
  
-/// isVectorBroadcast - Check if the node chain is suitable to be xformed to
-/// a vbroadcast node. We support two patterns:
-/// 1. A splat BUILD_VECTOR which uses a single scalar load.
+/// LowerVectorBroadcast - Attempt to use the vbroadcast instruction
+/// to generate a splat value for the following cases:
+/// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant.
  /// 2. A splat shuffle which uses a scalar_to_vector node which comes from
-/// a scalar load.
-/// The scalar load node is returned when a pattern is found,
+/// a scalar load, or a constant.
+/// The VBROADCAST node is returned when a pattern is found,
  /// or SDValue() otherwise.
-static SDValue isVectorBroadcast(SDValue &Op, const X86Subtarget *Subtarget) {
+SDValue
+X86TargetLowering::LowerVectorBroadcast(SDValue &Op, SelectionDAG &DAG) const {
    if (!Subtarget->hasAVX())
      return SDValue();
  
    EVT VT = Op.getValueType();
-  SDValue V = Op;
+  DebugLoc dl = Op.getDebugLoc();
  
-  if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
-    V = V.getOperand(0);
+  assert((VT.is128BitVector() || VT.is256BitVector()) &&
+         "Unsupported vector type for broadcast.");
  
-  //A suspected load to be broadcasted.
    SDValue Ld;
+  bool ConstSplatVal;
  
-  switch (V.getOpcode()) {
+  switch (Op.getOpcode()) {
      default:
        // Unknown pattern found.
        return SDValue();
  
      case ISD::BUILD_VECTOR: {
        // The BUILD_VECTOR node must be a splat.
-      if (!isSplatVector(V.getNode()))
+      if (!isSplatVector(Op.getNode()))
          return SDValue();
  
-      Ld = V.getOperand(0);
+      Ld = Op.getOperand(0);
+      ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
+                     Ld.getOpcode() == ISD::ConstantFP);
  
        // The suspected load node has several users. Make sure that all
        // of its users are from the BUILD_VECTOR node.
-      if (!Ld->hasNUsesOfValue(VT.getVectorNumElements(), 0))
+      // Constants may have multiple users.
+      if (!ConstSplatVal && !Ld->hasNUsesOfValue(VT.getVectorNumElements(), 0))
          return SDValue();
        break;
      }
@@ -4881,86 +5034,201 @@ static SDValue isVectorBroadcast(SDValue &Op, const X86Subtarget *Subtarget) {
          return SDValue();
  
        SDValue Sc = Op.getOperand(0);
-      if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR)
-        return SDValue();
+      if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR &&
+          Sc.getOpcode() != ISD::BUILD_VECTOR) {
+
+        if (!Subtarget->hasAVX2())
+          return SDValue();
+
+        // Use the register form of the broadcast instruction available on AVX2.
+        if (VT.is256BitVector())
+          Sc = Extract128BitVector(Sc, 0, DAG, dl);
+        return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc);
+      }
  
        Ld = Sc.getOperand(0);
+      ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
+                       Ld.getOpcode() == ISD::ConstantFP);
  
        // The scalar_to_vector node and the suspected
        // load node must have exactly one user.
-      if (!Sc.hasOneUse() || !Ld.hasOneUse())
+      // Constants may have multiple users.
+      if (!ConstSplatVal && (!Sc.hasOneUse() || !Ld.hasOneUse()))
          return SDValue();
        break;
      }
    }
  
-  // The scalar source must be a normal load.
-  if (!ISD::isNormalLoad(Ld.getNode()))
-    return SDValue();
+  bool Is256 = VT.is256BitVector();
  
-  // Reject loads that have uses of the chain result
-  if (Ld->hasAnyUseOfValue(1))
-    return SDValue();
+  // Handle the broadcasting a single constant scalar from the constant pool
+  // into a vector. On Sandybridge it is still better to load a constant vector
+  // from the constant pool and not to broadcast it from a scalar.
+  if (ConstSplatVal && Subtarget->hasAVX2()) {
+    EVT CVT = Ld.getValueType();
+    assert(!CVT.isVector() && "Must not broadcast a vector type");
+    unsigned ScalarSize = CVT.getSizeInBits();
+
+    if (ScalarSize == 32 || (Is256 && ScalarSize == 64)) {
+      const Constant *C = 0;
+      if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
+        C = CI->getConstantIntValue();
+      else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
+        C = CF->getConstantFPValue();
+
+      assert(C && "Invalid constant type");
+
+      SDValue CP = DAG.getConstantPool(C, getPointerTy());
+      unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
+      Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP,
+                       MachinePointerInfo::getConstantPool(),
+                       false, false, false, Alignment);
+
+      return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
+    }
+  }
  
-  bool Is256 = VT.getSizeInBits() == 256;
-  bool Is128 = VT.getSizeInBits() == 128;
+  bool IsLoad = ISD::isNormalLoad(Ld.getNode());
    unsigned ScalarSize = Ld.getValueType().getSizeInBits();
  
-  // VBroadcast to YMM
-  if (Is256 && (ScalarSize == 32 || ScalarSize == 64))
-    return Ld;
+  // Handle AVX2 in-register broadcasts.
+  if (!IsLoad && Subtarget->hasAVX2() &&
+      (ScalarSize == 32 || (Is256 && ScalarSize == 64)))
+    return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
+
+  // The scalar source must be a normal load.
+  if (!IsLoad)
+    return SDValue();
  
-  // VBroadcast to XMM
-  if (Is128 && (ScalarSize == 32))
-    return Ld;
+  if (ScalarSize == 32 || (Is256 && ScalarSize == 64))
+    return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
  
    // The integer check is needed for the 64-bit into 128-bit so it doesn't match
-  // double since there is vbroadcastsd xmm
+  // double since there is no vbroadcastsd xmm
    if (Subtarget->hasAVX2() && Ld.getValueType().isInteger()) {
-    // VBroadcast to YMM
-    if (Is256 && (ScalarSize == 8 || ScalarSize == 16))
-      return Ld;
-
-    // VBroadcast to XMM
-    if (Is128 && (ScalarSize ==  8 || ScalarSize == 16 || ScalarSize == 64))
-      return Ld;
+    if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
+      return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
    }
  
    // Unsupported broadcast.
    return SDValue();
  }
  
+// LowerVectorFpExtend - Recognize the scalarized FP_EXTEND from v2f32 to v2f64
+// and convert it into X86ISD::VFPEXT due to the current ISD::FP_EXTEND has the
+// constraint of matching input/output vector elements.
  SDValue
-X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
-  DebugLoc dl = Op.getDebugLoc();
-
+X86TargetLowering::LowerVectorFpExtend(SDValue &Op, SelectionDAG &DAG) const {
+  DebugLoc DL = Op.getDebugLoc();
+  SDNode *N = Op.getNode();
    EVT VT = Op.getValueType();
-  EVT ExtVT = VT.getVectorElementType();
-  unsigned NumElems = Op.getNumOperands();
+  unsigned NumElts = Op.getNumOperands();
  
-  // Vectors containing all zeros can be matched by pxor and xorps later
-  if (ISD::isBuildVectorAllZeros(Op.getNode())) {
-    // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
-    // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
-    if (VT == MVT::v4i32 || VT == MVT::v8i32)
-      return Op;
+  // Check supported types and sub-targets.
+  //
+  // Only v2f32 -> v2f64 needs special handling.
+  if (VT != MVT::v2f64 || !Subtarget->hasSSE2())
+    return SDValue();
  
-    return getZeroVector(VT, Subtarget, DAG, dl);
-  }
+  SDValue VecIn;
+  EVT VecInVT;
+  SmallVector<int, 8> Mask;
+  EVT SrcVT = MVT::Other;
  
-  // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
-  // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
-  // vpcmpeqd on 256-bit vectors.
-  if (ISD::isBuildVectorAllOnes(Op.getNode())) {
-    if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasAVX2()))
-      return Op;
+  // Check the patterns could be translated into X86vfpext.
+  for (unsigned i = 0; i < NumElts; ++i) {
+    SDValue In = N->getOperand(i);
+    unsigned Opcode = In.getOpcode();
  
-    return getOnesVector(VT, Subtarget->hasAVX2(), DAG, dl);
-  }
+    // Skip if the element is undefined.
+    if (Opcode == ISD::UNDEF) {
+      Mask.push_back(-1);
+      continue;
+    }
+
+    // Quit if one of the elements is not defined from 'fpext'.
+    if (Opcode != ISD::FP_EXTEND)
+      return SDValue();
+
+    // Check how the source of 'fpext' is defined.
+    SDValue L2In = In.getOperand(0);
+    EVT L2InVT = L2In.getValueType();
+
+    // Check the original type
+    if (SrcVT == MVT::Other)
+      SrcVT = L2InVT;
+    else if (SrcVT != L2InVT) // Quit if non-homogenous typed.
+      return SDValue();
+
+    // Check whether the value being 'fpext'ed is extracted from the same
+    // source.
+    Opcode = L2In.getOpcode();
+
+    // Quit if it's not extracted with a constant index.
+    if (Opcode != ISD::EXTRACT_VECTOR_ELT ||
+        !isa<ConstantSDNode>(L2In.getOperand(1)))
+      return SDValue();
+
+    SDValue ExtractedFromVec = L2In.getOperand(0);
+
+    if (VecIn.getNode() == 0) {
+      VecIn = ExtractedFromVec;
+      VecInVT = ExtractedFromVec.getValueType();
+    } else if (VecIn != ExtractedFromVec) // Quit if built from more than 1 vec.
+      return SDValue();
+
+    Mask.push_back(cast<ConstantSDNode>(L2In.getOperand(1))->getZExtValue());
+  }
+
+  // Quit if all operands of BUILD_VECTOR are undefined.
+  if (!VecIn.getNode())
+    return SDValue();
+
+  // Fill the remaining mask as undef.
+  for (unsigned i = NumElts; i < VecInVT.getVectorNumElements(); ++i)
+    Mask.push_back(-1);
+
+  return DAG.getNode(X86ISD::VFPEXT, DL, VT,
+                     DAG.getVectorShuffle(VecInVT, DL,
+                                          VecIn, DAG.getUNDEF(VecInVT),
+                                          &Mask[0]));
+}
+
+SDValue
+X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
+  DebugLoc dl = Op.getDebugLoc();
+
+  EVT VT = Op.getValueType();
+  EVT ExtVT = VT.getVectorElementType();
+  unsigned NumElems = Op.getNumOperands();
+
+  // Vectors containing all zeros can be matched by pxor and xorps later
+  if (ISD::isBuildVectorAllZeros(Op.getNode())) {
+    // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
+    // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
+    if (VT == MVT::v4i32 || VT == MVT::v8i32)
+      return Op;
+
+    return getZeroVector(VT, Subtarget, DAG, dl);
+  }
+
+  // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
+  // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
+  // vpcmpeqd on 256-bit vectors.
+  if (ISD::isBuildVectorAllOnes(Op.getNode())) {
+    if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasAVX2()))
+      return Op;
  
-  SDValue LD = isVectorBroadcast(Op, Subtarget);
-  if (LD.getNode())
-    return DAG.getNode(X86ISD::VBROADCAST, dl, VT, LD);
+    return getOnesVector(VT, Subtarget->hasAVX2(), DAG, dl);
+  }
+
+  SDValue Broadcast = LowerVectorBroadcast(Op, DAG);
+  if (Broadcast.getNode())
+    return Broadcast;
+
+  SDValue FpExt = LowerVectorFpExtend(Op, DAG);
+  if (FpExt.getNode())
+    return FpExt;
  
    unsigned EVTBits = ExtVT.getSizeInBits();
  
@@ -5020,8 +5288,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
            Mask.push_back(Idx);
            for (unsigned i = 1; i != VecElts; ++i)
              Mask.push_back(i);
-          Item = DAG.getVectorShuffle(VecVT, dl, Item,
-                                      DAG.getUNDEF(Item.getValueType()),
+          Item = DAG.getVectorShuffle(VecVT, dl, Item, DAG.getUNDEF(VecVT),
                                        &Mask[0]);
          }
          return DAG.getNode(ISD::BITCAST, dl, VT, Item);
@@ -5038,12 +5305,12 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
  
        if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
            (ExtVT == MVT::i64 && Subtarget->is64Bit())) {
-        if (VT.getSizeInBits() == 256) {
+        if (VT.is256BitVector()) {
            SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
            return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
                               Item, DAG.getIntPtrConstant(0));
          }
-        assert(VT.getSizeInBits() == 128 && "Expected an SSE value type!");
+        assert(VT.is128BitVector() && "Expected an SSE value type!");
          Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
          // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
          return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
@@ -5052,12 +5319,11 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
        if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
          Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
          Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
-        if (VT.getSizeInBits() == 256) {
+        if (VT.is256BitVector()) {
            SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl);
-          Item = Insert128BitVector(ZeroVec, Item, DAG.getConstant(0, MVT::i32),
-                                    DAG, dl);
+          Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl);
          } else {
-          assert(VT.getSizeInBits() == 128 && "Expected an SSE value type!");
+          assert(VT.is128BitVector() && "Expected an SSE value type!");
            Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
          }
          return DAG.getNode(ISD::BITCAST, dl, VT, Item);
@@ -5089,7 +5355,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
        // Turn it into a shuffle of zero and zero-extended scalar to vector.
        Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG);
        SmallVector<int, 8> MaskVec;
-      for (unsigned i = 0; i < NumElems; i++)
+      for (unsigned i = 0; i != NumElems; ++i)
          MaskVec.push_back(i == Idx ? 0 : 1);
        return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]);
      }
@@ -5117,7 +5383,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
  
    // For AVX-length vectors, build the individual 128-bit pieces and use
    // shuffles to put them in place.
-  if (VT.getSizeInBits() == 256) {
+  if (VT.is256BitVector()) {
      SmallVector<SDValue, 32> V;
      for (unsigned i = 0; i != NumElems; ++i)
        V.push_back(Op.getOperand(i));
@@ -5130,10 +5396,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
                                  NumElems/2);
  
      // Recreate the wider vector with the lower and upper part.
-    SDValue Vec = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), Lower,
-                                DAG.getConstant(0, MVT::i32), DAG, dl);
-    return Insert128BitVector(Vec, Upper, DAG.getConstant(NumElems/2, MVT::i32),
-                              DAG, dl);
+    return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
    }
  
    // Let legalizer expand 2-wide build_vectors.
@@ -5201,7 +5464,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
      return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
    }
  
-  if (Values.size() > 1 && VT.getSizeInBits() == 128) {
+  if (Values.size() > 1 && VT.is128BitVector()) {
      // Check for a build vector of consecutive loads.
      for (unsigned i = 0; i < NumElems; ++i)
        V[i] = Op.getOperand(i);
@@ -5262,68 +5525,98 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
    return SDValue();
  }
  
-// LowerMMXCONCAT_VECTORS - We support concatenate two MMX registers and place
-// them in a MMX register.  This is better than doing a stack convert.
-static SDValue LowerMMXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
-  DebugLoc dl = Op.getDebugLoc();
-  EVT ResVT = Op.getValueType();
-
-  assert(ResVT == MVT::v2i64 || ResVT == MVT::v4i32 ||
-         ResVT == MVT::v8i16 || ResVT == MVT::v16i8);
-  int Mask[2];
-  SDValue InVec = DAG.getNode(ISD::BITCAST,dl, MVT::v1i64, Op.getOperand(0));
-  SDValue VecOp = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec);
-  InVec = Op.getOperand(1);
-  if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) {
-    unsigned NumElts = ResVT.getVectorNumElements();
-    VecOp = DAG.getNode(ISD::BITCAST, dl, ResVT, VecOp);
-    VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ResVT, VecOp,
-                       InVec.getOperand(0), DAG.getIntPtrConstant(NumElts/2+1));
-  } else {
-    InVec = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, InVec);
-    SDValue VecOp2 = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec);
-    Mask[0] = 0; Mask[1] = 2;
-    VecOp = DAG.getVectorShuffle(MVT::v2i64, dl, VecOp, VecOp2, Mask);
-  }
-  return DAG.getNode(ISD::BITCAST, dl, ResVT, VecOp);
-}
-
  // LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction
  // to create 256-bit vectors from two other 128-bit ones.
  static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
    DebugLoc dl = Op.getDebugLoc();
    EVT ResVT = Op.getValueType();
  
-  assert(ResVT.getSizeInBits() == 256 && "Value type must be 256-bit wide");
+  assert(ResVT.is256BitVector() && "Value type must be 256-bit wide");
  
    SDValue V1 = Op.getOperand(0);
    SDValue V2 = Op.getOperand(1);
    unsigned NumElems = ResVT.getVectorNumElements();
  
-  SDValue V = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, ResVT), V1,
-                                 DAG.getConstant(0, MVT::i32), DAG, dl);
-  return Insert128BitVector(V, V2, DAG.getConstant(NumElems/2, MVT::i32),
-                            DAG, dl);
+  return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
  }
  
  SDValue
  X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
-  EVT ResVT = Op.getValueType();
-
    assert(Op.getNumOperands() == 2);
-  assert((ResVT.getSizeInBits() == 128 || ResVT.getSizeInBits() == 256) &&
-         "Unsupported CONCAT_VECTORS for value type");
-
-  // We support concatenate two MMX registers and place them in a MMX register.
-  // This is better than doing a stack convert.
-  if (ResVT.is128BitVector())
-    return LowerMMXCONCAT_VECTORS(Op, DAG);
  
    // 256-bit AVX can use the vinsertf128 instruction to create 256-bit vectors
    // from two other 128-bit ones.
    return LowerAVXCONCAT_VECTORS(Op, DAG);
  }
  
+// Try to lower a shuffle node into a simple blend instruction.
+static SDValue LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
+                                          const X86Subtarget *Subtarget,
+                                          SelectionDAG &DAG) {
+  SDValue V1 = SVOp->getOperand(0);
+  SDValue V2 = SVOp->getOperand(1);
+  DebugLoc dl = SVOp->getDebugLoc();
+  MVT VT = SVOp->getValueType(0).getSimpleVT();
+  unsigned NumElems = VT.getVectorNumElements();
+
+  if (!Subtarget->hasSSE41())
+    return SDValue();
+
+  unsigned ISDNo = 0;
+  MVT OpTy;
+
+  switch (VT.SimpleTy) {
+  default: return SDValue();
+  case MVT::v8i16:
+    ISDNo = X86ISD::BLENDPW;
+    OpTy = MVT::v8i16;
+    break;
+  case MVT::v4i32:
+  case MVT::v4f32:
+    ISDNo = X86ISD::BLENDPS;
+    OpTy = MVT::v4f32;
+    break;
+  case MVT::v2i64:
+  case MVT::v2f64:
+    ISDNo = X86ISD::BLENDPD;
+    OpTy = MVT::v2f64;
+    break;
+  case MVT::v8i32:
+  case MVT::v8f32:
+    if (!Subtarget->hasAVX())
+      return SDValue();
+    ISDNo = X86ISD::BLENDPS;
+    OpTy = MVT::v8f32;
+    break;
+  case MVT::v4i64:
+  case MVT::v4f64:
+    if (!Subtarget->hasAVX())
+      return SDValue();
+    ISDNo = X86ISD::BLENDPD;
+    OpTy = MVT::v4f64;
+    break;
+  }
+  assert(ISDNo && "Invalid Op Number");
+
+  unsigned MaskVals = 0;
+
+  for (unsigned i = 0; i != NumElems; ++i) {
+    int EltIdx = SVOp->getMaskElt(i);
+    if (EltIdx == (int)i || EltIdx < 0)
+      MaskVals |= (1<<i);
+    else if (EltIdx == (int)(i + NumElems))
+      continue; // Bit is set to zero;
+    else
+      return SDValue();
+  }
+
+  V1 = DAG.getNode(ISD::BITCAST, dl, OpTy, V1);
+  V2 = DAG.getNode(ISD::BITCAST, dl, OpTy, V2);
+  SDValue Ret =  DAG.getNode(ISDNo, dl, OpTy, V1, V2,
+                             DAG.getConstant(MaskVals, MVT::i32));
+  return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
+}
+
  // v8i16 shuffles - Prefer shuffles in the following order:
  // 1. [all]   pshuflw, pshufhw, optional move
  // 2. [ssse3] 1 x pshufb
@@ -5468,13 +5761,10 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op,
      bool TwoInputs = V1Used && V2Used;
      for (unsigned i = 0; i != 8; ++i) {
        int EltIdx = MaskVals[i] * 2;
-      if (TwoInputs && (EltIdx >= 16)) {
-        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
-        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
-        continue;
-      }
-      pshufbMask.push_back(DAG.getConstant(EltIdx,   MVT::i8));
-      pshufbMask.push_back(DAG.getConstant(EltIdx+1, MVT::i8));
+      int Idx0 = (TwoInputs && (EltIdx >= 16)) ? 0x80 : EltIdx;
+      int Idx1 = (TwoInputs && (EltIdx >= 16)) ? 0x80 : EltIdx+1;
+      pshufbMask.push_back(DAG.getConstant(Idx0,   MVT::i8));
+      pshufbMask.push_back(DAG.getConstant(Idx1, MVT::i8));
      }
      V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V1);
      V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
@@ -5488,13 +5778,10 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op,
      pshufbMask.clear();
      for (unsigned i = 0; i != 8; ++i) {
        int EltIdx = MaskVals[i] * 2;
-      if (EltIdx < 16) {
-        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
-        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
-        continue;
-      }
-      pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8));
-      pshufbMask.push_back(DAG.getConstant(EltIdx - 15, MVT::i8));
+      int Idx0 = (EltIdx < 16) ? 0x80 : EltIdx - 16;
+      int Idx1 = (EltIdx < 16) ? 0x80 : EltIdx - 15;
+      pshufbMask.push_back(DAG.getConstant(Idx0, MVT::i8));
+      pshufbMask.push_back(DAG.getConstant(Idx1, MVT::i8));
      }
      V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V2);
      V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
@@ -5570,10 +5857,10 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op,
      int EltIdx = MaskVals[i];
      if (EltIdx < 0)
        continue;
-    SDValue ExtOp = (EltIdx < 8)
-    ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1,
-                  DAG.getIntPtrConstant(EltIdx))
-    : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2,
+    SDValue ExtOp = (EltIdx < 8) ?
+      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1,
+                  DAG.getIntPtrConstant(EltIdx)) :
+      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2,
                    DAG.getIntPtrConstant(EltIdx - 8));
      NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp,
                         DAG.getIntPtrConstant(i));
@@ -5594,21 +5881,11 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
    DebugLoc dl = SVOp->getDebugLoc();
    ArrayRef<int> MaskVals = SVOp->getMask();
  
+  bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
+
    // If we have SSSE3, case 1 is generated when all result bytes come from
    // one of  the inputs.  Otherwise, case 2 is generated.  If no SSSE3 is
    // present, fall back to case 3.
-  // FIXME: kill V2Only once shuffles are canonizalized by getNode.
-  bool V1Only = true;
-  bool V2Only = true;
-  for (unsigned i = 0; i < 16; ++i) {
-    int EltIdx = MaskVals[i];
-    if (EltIdx < 0)
-      continue;
-    if (EltIdx < 16)
-      V2Only = false;
-    else
-      V1Only = false;
-  }
  
    // If SSSE3, use 1 pshufb instruction per vector with elements in the result.
    if (TLI.getSubtarget()->hasSSSE3()) {
@@ -5620,23 +5897,16 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
      // Otherwise, we have elements from both input vectors, and must zero out
      // elements that come from V2 in the first mask, and V1 in the second mask
      // so that we can OR them together.
-    bool TwoInputs = !(V1Only || V2Only);
      for (unsigned i = 0; i != 16; ++i) {
        int EltIdx = MaskVals[i];
-      if (EltIdx < 0 || (TwoInputs && EltIdx >= 16)) {
-        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
-        continue;
-      }
+      if (EltIdx < 0 || EltIdx >= 16)
+        EltIdx = 0x80;
        pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
      }
-    // If all the elements are from V2, assign it to V1 and return after
-    // building the first pshufb.
-    if (V2Only)
-      V1 = V2;
      V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
                       DAG.getNode(ISD::BUILD_VECTOR, dl,
                                   MVT::v16i8, &pshufbMask[0], 16));
-    if (!TwoInputs)
+    if (V2IsUndef)
        return V1;
  
      // Calculate the shuffle mask for the second input, shuffle it, and
@@ -5644,11 +5914,8 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
      pshufbMask.clear();
      for (unsigned i = 0; i != 16; ++i) {
        int EltIdx = MaskVals[i];
-      if (EltIdx < 16) {
-        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
-        continue;
-      }
-      pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8));
+      EltIdx = (EltIdx < 16) ? 0x80 : EltIdx - 16;
+      pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
      }
      V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
                       DAG.getNode(ISD::BUILD_VECTOR, dl,
@@ -5661,7 +5928,7 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
    // the 16 different words that comprise the two doublequadword input vectors.
    V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
    V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2);
-  SDValue NewV = V2Only ? V2 : V1;
+  SDValue NewV = V1;
    for (int i = 0; i != 8; ++i) {
      int Elt0 = MaskVals[i*2];
      int Elt1 = MaskVals[i*2+1];
@@ -5671,9 +5938,7 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
        continue;
  
      // This word of the result is already in the correct place, skip it.
-    if (V1Only && (Elt0 == i*2) && (Elt1 == i*2+1))
-      continue;
-    if (V2Only && (Elt0 == i*2+16) && (Elt1 == i*2+17))
+    if ((Elt0 == i*2) && (Elt1 == i*2+1))
        continue;
  
      SDValue Elt0Src = Elt0 < 16 ? V1 : V2;
@@ -5735,41 +6000,37 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
  static
  SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
                                   SelectionDAG &DAG, DebugLoc dl) {
-  EVT VT = SVOp->getValueType(0);
-  SDValue V1 = SVOp->getOperand(0);
-  SDValue V2 = SVOp->getOperand(1);
+  MVT VT = SVOp->getValueType(0).getSimpleVT();
    unsigned NumElems = VT.getVectorNumElements();
-  unsigned NewWidth = (NumElems == 4) ? 2 : 4;
-  EVT NewVT;
-  switch (VT.getSimpleVT().SimpleTy) {
+  MVT NewVT;
+  unsigned Scale;
+  switch (VT.SimpleTy) {
    default: llvm_unreachable("Unexpected!");
-  case MVT::v4f32: NewVT = MVT::v2f64; break;
-  case MVT::v4i32: NewVT = MVT::v2i64; break;
-  case MVT::v8i16: NewVT = MVT::v4i32; break;
-  case MVT::v16i8: NewVT = MVT::v4i32; break;
+  case MVT::v4f32:  NewVT = MVT::v2f64; Scale = 2; break;
+  case MVT::v4i32:  NewVT = MVT::v2i64; Scale = 2; break;
+  case MVT::v8i16:  NewVT = MVT::v4i32; Scale = 2; break;
+  case MVT::v16i8:  NewVT = MVT::v4i32; Scale = 4; break;
+  case MVT::v16i16: NewVT = MVT::v8i32; Scale = 2; break;
+  case MVT::v32i8:  NewVT = MVT::v8i32; Scale = 4; break;
    }
  
-  int Scale = NumElems / NewWidth;
    SmallVector<int, 8> MaskVec;
-  for (unsigned i = 0; i < NumElems; i += Scale) {
+  for (unsigned i = 0; i != NumElems; i += Scale) {
      int StartIdx = -1;
-    for (int j = 0; j < Scale; ++j) {
+    for (unsigned j = 0; j != Scale; ++j) {
        int EltIdx = SVOp->getMaskElt(i+j);
        if (EltIdx < 0)
          continue;
-      if (StartIdx == -1)
-        StartIdx = EltIdx - (EltIdx % Scale);
-      if (EltIdx != StartIdx + j)
+      if (StartIdx < 0)
+        StartIdx = (EltIdx / Scale);
+      if (EltIdx != (int)(StartIdx*Scale + j))
          return SDValue();
      }
-    if (StartIdx == -1)
-      MaskVec.push_back(-1);
-    else
-      MaskVec.push_back(StartIdx / Scale);
+    MaskVec.push_back(StartIdx);
    }
  
-  V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1);
-  V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2);
+  SDValue V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(0));
+  SDValue V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(1));
    return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]);
  }
  
@@ -5812,102 +6073,114 @@ static SDValue getVZextMovL(EVT VT, EVT OpVT,
  /// which could not be matched by any known target speficic shuffle
  static SDValue
  LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
+
+  SDValue NewOp = Compact8x32ShuffleNode(SVOp, DAG);
+  if (NewOp.getNode())
+    return NewOp;
+
    EVT VT = SVOp->getValueType(0);
  
    unsigned NumElems = VT.getVectorNumElements();
    unsigned NumLaneElems = NumElems / 2;
  
-  int MinRange[2][2] = { { static_cast<int>(NumElems),
-                           static_cast<int>(NumElems) },
-                         { static_cast<int>(NumElems),
-                           static_cast<int>(NumElems) } };
-  int MaxRange[2][2] = { { -1, -1 }, { -1, -1 } };
+  DebugLoc dl = SVOp->getDebugLoc();
+  MVT EltVT = VT.getVectorElementType().getSimpleVT();
+  EVT NVT = MVT::getVectorVT(EltVT, NumLaneElems);
+  SDValue Output[2];
  
-  // Collect used ranges for each source in each lane
+  SmallVector<int, 16> Mask;
    for (unsigned l = 0; l < 2; ++l) {
-    unsigned LaneStart = l*NumLaneElems;
+    // Build a shuffle mask for the output, discovering on the fly which
+    // input vectors to use as shuffle operands (recorded in InputUsed).
+    // If building a suitable shuffle vector proves too hard, then bail
+    // out with UseBuildVector set.
+    bool UseBuildVector = false;
+    int InputUsed[2] = { -1, -1 }; // Not yet discovered.
+    unsigned LaneStart = l * NumLaneElems;
      for (unsigned i = 0; i != NumLaneElems; ++i) {
+      // The mask element.  This indexes into the input.
        int Idx = SVOp->getMaskElt(i+LaneStart);
-      if (Idx < 0)
+      if (Idx < 0) {
+        // the mask element does not index into any input vector.
+        Mask.push_back(-1);
          continue;
+      }
  
-      int Input = 0;
-      if (Idx >= (int)NumElems) {
-        Idx -= NumElems;
-        Input = 1;
+      // The input vector this mask element indexes into.
+      int Input = Idx / NumLaneElems;
+
+      // Turn the index into an offset from the start of the input vector.
+      Idx -= Input * NumLaneElems;
+
+      // Find or create a shuffle vector operand to hold this input.
+      unsigned OpNo;
+      for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) {
+        if (InputUsed[OpNo] == Input)
+          // This input vector is already an operand.
+          break;
+        if (InputUsed[OpNo] < 0) {
+          // Create a new operand for this input vector.
+          InputUsed[OpNo] = Input;
+          break;
+        }
        }
  
-      if (Idx > MaxRange[l][Input])
-        MaxRange[l][Input] = Idx;
-      if (Idx < MinRange[l][Input])
-        MinRange[l][Input] = Idx;
+      if (OpNo >= array_lengthof(InputUsed)) {
+        // More than two input vectors used!  Give up on trying to create a
+        // shuffle vector.  Insert all elements into a BUILD_VECTOR instead.
+        UseBuildVector = true;
+        break;
+      }
+
+      // Add the mask index for the new shuffle vector.
+      Mask.push_back(Idx + OpNo * NumLaneElems);
      }
-  }
  
-  // Make sure each range is 128-bits
-  int ExtractIdx[2][2] = { { -1, -1 }, { -1, -1 } };
-  for (unsigned l = 0; l < 2; ++l) {
-    for (unsigned Input = 0; Input < 2; ++Input) {
-      if (MinRange[l][Input] == (int)NumElems && MaxRange[l][Input] < 0)
-        continue;
+    if (UseBuildVector) {
+      SmallVector<SDValue, 16> SVOps;
+      for (unsigned i = 0; i != NumLaneElems; ++i) {
+        // The mask element.  This indexes into the input.
+        int Idx = SVOp->getMaskElt(i+LaneStart);
+        if (Idx < 0) {
+          SVOps.push_back(DAG.getUNDEF(EltVT));
+          continue;
+        }
  
-      if (MinRange[l][Input] >= 0 && MaxRange[l][Input] < (int)NumLaneElems)
-        ExtractIdx[l][Input] = 0;
-      else if (MinRange[l][Input] >= (int)NumLaneElems &&
-               MaxRange[l][Input] < (int)NumElems)
-        ExtractIdx[l][Input] = NumLaneElems;
-      else
-        return SDValue();
-    }
-  }
+        // The input vector this mask element indexes into.
+        int Input = Idx / NumElems;
  
-  DebugLoc dl = SVOp->getDebugLoc();
-  MVT EltVT = VT.getVectorElementType().getSimpleVT();
-  EVT NVT = MVT::getVectorVT(EltVT, NumElems/2);
+        // Turn the index into an offset from the start of the input vector.
+        Idx -= Input * NumElems;
  
-  SDValue Ops[2][2];
-  for (unsigned l = 0; l < 2; ++l) {
-    for (unsigned Input = 0; Input < 2; ++Input) {
-      if (ExtractIdx[l][Input] >= 0)
-        Ops[l][Input] = Extract128BitVector(SVOp->getOperand(Input),
-                                DAG.getConstant(ExtractIdx[l][Input], MVT::i32),
-                                                DAG, dl);
-      else
-        Ops[l][Input] = DAG.getUNDEF(NVT);
-    }
-  }
+        // Extract the vector element by hand.
+        SVOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
+                                    SVOp->getOperand(Input),
+                                    DAG.getIntPtrConstant(Idx)));
+      }
  
-  // Generate 128-bit shuffles
-  SmallVector<int, 16> Mask1, Mask2;
-  for (unsigned i = 0; i != NumLaneElems; ++i) {
-    int Elt = SVOp->getMaskElt(i);
-    if (Elt >= (int)NumElems) {
-      Elt %= NumLaneElems;
-      Elt += NumLaneElems;
-    } else if (Elt >= 0) {
-      Elt %= NumLaneElems;
-    }
-    Mask1.push_back(Elt);
-  }
-  for (unsigned i = NumLaneElems; i != NumElems; ++i) {
-    int Elt = SVOp->getMaskElt(i);
-    if (Elt >= (int)NumElems) {
-      Elt %= NumLaneElems;
-      Elt += NumLaneElems;
-    } else if (Elt >= 0) {
-      Elt %= NumLaneElems;
+      // Construct the output using a BUILD_VECTOR.
+      Output[l] = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, &SVOps[0],
+                              SVOps.size());
+    } else if (InputUsed[0] < 0) {
+      // No input vectors were used! The result is undefined.
+      Output[l] = DAG.getUNDEF(NVT);
+    } else {
+      SDValue Op0 = Extract128BitVector(SVOp->getOperand(InputUsed[0] / 2),
+                                        (InputUsed[0] % 2) * NumLaneElems,
+                                        DAG, dl);
+      // If only one input was used, use an undefined vector for the other.
+      SDValue Op1 = (InputUsed[1] < 0) ? DAG.getUNDEF(NVT) :
+        Extract128BitVector(SVOp->getOperand(InputUsed[1] / 2),
+                            (InputUsed[1] % 2) * NumLaneElems, DAG, dl);
+      // At least one input vector was used. Create a new shuffle vector.
+      Output[l] = DAG.getVectorShuffle(NVT, dl, Op0, Op1, &Mask[0]);
      }
-    Mask2.push_back(Elt);
-  }
  
-  SDValue Shuf1 = DAG.getVectorShuffle(NVT, dl, Ops[0][0], Ops[0][1], &Mask1[0]);
-  SDValue Shuf2 = DAG.getVectorShuffle(NVT, dl, Ops[1][0], Ops[1][1], &Mask2[0]);
+    Mask.clear();
+  }
  
    // Concatenate the result back
-  SDValue V = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), Shuf1,
-                                 DAG.getConstant(0, MVT::i32), DAG, dl);
-  return Insert128BitVector(V, Shuf2, DAG.getConstant(NumElems/2, MVT::i32),
-                            DAG, dl);
+  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Output[0], Output[1]);
  }
  
  /// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with
@@ -5919,7 +6192,7 @@ LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
    DebugLoc dl = SVOp->getDebugLoc();
    EVT VT = SVOp->getValueType(0);
  
-  assert(VT.getSizeInBits() == 128 && "Unsupported vector size");
+  assert(VT.is128BitVector() && "Unsupported vector size");
  
    std::pair<int, int> Locs[4];
    int Mask1[] = { -1, -1, -1, -1 };
@@ -5963,7 +6236,9 @@ LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
        }
  
      return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]);
-  } else if (NumLo == 3 || NumHi == 3) {
+  }
+
+  if (NumLo == 3 || NumHi == 3) {
      // Otherwise, we must have three elements from one vector, call it X, and
      // one element from the other, call it Y.  First, use a shufps to build an
      // intermediate vector with the one element from Y and the element from X
@@ -5999,17 +6274,17 @@ LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
        Mask1[2] = HiIndex & 1 ? 6 : 4;
        Mask1[3] = HiIndex & 1 ? 4 : 6;
        return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
-    } else {
-      Mask1[0] = HiIndex & 1 ? 2 : 0;
-      Mask1[1] = HiIndex & 1 ? 0 : 2;
-      Mask1[2] = PermMask[2];
-      Mask1[3] = PermMask[3];
-      if (Mask1[2] >= 0)
-        Mask1[2] += 4;
-      if (Mask1[3] >= 0)
-        Mask1[3] += 4;
-      return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]);
      }
+
+    Mask1[0] = HiIndex & 1 ? 2 : 0;
+    Mask1[1] = HiIndex & 1 ? 0 : 2;
+    Mask1[2] = PermMask[2];
+    Mask1[3] = PermMask[3];
+    if (Mask1[2] >= 0)
+      Mask1[2] += 4;
+    if (Mask1[3] >= 0)
+      Mask1[3] += 4;
+    return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]);
    }
  
    // Break it into (shuffle shuffle_hi, shuffle_lo).
@@ -6081,88 +6356,6 @@ static bool RelaxedMayFoldVectorLoad(SDValue V) {
    return false;
  }
  
-/// CanFoldShuffleIntoVExtract - Check if the current shuffle is used by
-/// a vector extract, and if both can be later optimized into a single load.
-/// This is done in visitEXTRACT_VECTOR_ELT and the conditions are checked
-/// here because otherwise a target specific shuffle node is going to be
-/// emitted for this shuffle, and the optimization not done.
-/// FIXME: This is probably not the best approach, but fix the problem
-/// until the right path is decided.
-static
-bool CanXFormVExtractWithShuffleIntoLoad(SDValue V, SelectionDAG &DAG,
-                                         const TargetLowering &TLI) {
-  EVT VT = V.getValueType();
-  ShuffleVectorSDNode *SVOp = dyn_cast<ShuffleVectorSDNode>(V);
-
-  // Be sure that the vector shuffle is present in a pattern like this:
-  // (vextract (v4f32 shuffle (load $addr), <1,u,u,u>), c) -> (f32 load $addr)
-  if (!V.hasOneUse())
-    return false;
-
-  SDNode *N = *V.getNode()->use_begin();
-  if (N->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
-    return false;
-
-  SDValue EltNo = N->getOperand(1);
-  if (!isa<ConstantSDNode>(EltNo))
-    return false;
-
-  // If the bit convert changed the number of elements, it is unsafe
-  // to examine the mask.
-  bool HasShuffleIntoBitcast = false;
-  if (V.getOpcode() == ISD::BITCAST) {
-    EVT SrcVT = V.getOperand(0).getValueType();
-    if (SrcVT.getVectorNumElements() != VT.getVectorNumElements())
-      return false;
-    V = V.getOperand(0);
-    HasShuffleIntoBitcast = true;
-  }
-
-  // Select the input vector, guarding against out of range extract vector.
-  unsigned NumElems = VT.getVectorNumElements();
-  unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
-  int Idx = (Elt > NumElems) ? -1 : SVOp->getMaskElt(Elt);
-  V = (Idx < (int)NumElems) ? V.getOperand(0) : V.getOperand(1);
-
-  // If we are accessing the upper part of a YMM register
-  // then the EXTRACT_VECTOR_ELT is likely to be legalized to a sequence of
-  // EXTRACT_SUBVECTOR + EXTRACT_VECTOR_ELT, which are not detected at this point
-  // because the legalization of N did not happen yet.
-  if (Idx >= (int)NumElems/2 && VT.getSizeInBits() == 256)
-    return false;
-
-  // Skip one more bit_convert if necessary
-  if (V.getOpcode() == ISD::BITCAST) {
-    if (!V.hasOneUse())
-      return false;
-    V = V.getOperand(0);
-  }
-
-  if (!ISD::isNormalLoad(V.getNode()))
-    return false;
-
-  // Is the original load suitable?
-  LoadSDNode *LN0 = cast<LoadSDNode>(V);
-
-  if (!LN0 || !LN0->hasNUsesOfValue(1,0) || LN0->isVolatile())
-    return false;
-
-  if (!HasShuffleIntoBitcast)
-    return true;
-
-  // If there's a bitcast before the shuffle, check if the load type and
-  // alignment is valid.
-  unsigned Align = LN0->getAlignment();
-  unsigned NewAlign =
-    TLI.getTargetData()->getABITypeAlignment(
-                                  VT.getTypeForEVT(*DAG.getContext()));
-
-  if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VT))
-    return false;
-
-  return true;
-}
-
  static
  SDValue getMOVDDup(SDValue &Op, DebugLoc &dl, SDValue V1, SelectionDAG &DAG) {
    EVT VT = Op.getValueType();
@@ -6240,7 +6433,7 @@ SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
        return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG);
  
      if (NumElems == 4)
-      // If we don't care about the second element, procede to use movss.
+      // If we don't care about the second element, proceed to use movss.
        if (SVOp->getMaskElt(1) != -1)
          return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG);
    }
@@ -6266,10 +6459,8 @@ SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
                                getShuffleSHUFImmediate(SVOp), DAG);
  }
  
-static
-SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG,
-                               const TargetLowering &TLI,
-                               const X86Subtarget *Subtarget) {
+SDValue
+X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const {
    ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
    EVT VT = Op.getValueType();
    DebugLoc dl = Op.getDebugLoc();
@@ -6283,17 +6474,11 @@ SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG,
    if (SVOp->isSplat()) {
      unsigned NumElem = VT.getVectorNumElements();
      int Size = VT.getSizeInBits();
-    // Special case, this is the only place now where it's allowed to return
-    // a vector_shuffle operation without using a target specific node, because
-    // *hopefully* it will be optimized away by the dag combiner. FIXME: should
-    // this be moved to DAGCombine instead?
-    if (NumElem <= 4 && CanXFormVExtractWithShuffleIntoLoad(Op, DAG, TLI))
-      return Op;
  
      // Use vbroadcast whenever the splat comes from a foldable load
-    SDValue LD = isVectorBroadcast(Op, Subtarget);
-    if (LD.getNode())
-      return DAG.getNode(X86ISD::VBROADCAST, dl, VT, LD);
+    SDValue Broadcast = LowerVectorBroadcast(Op, DAG);
+    if (Broadcast.getNode())
+      return Broadcast;
  
      // Handle splats by matching through known shuffle masks
      if ((Size == 128 && NumElem <= 4) ||
@@ -6306,7 +6491,8 @@ SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG,
  
    // If the shuffle can be profitably rewritten as a narrower shuffle, then
    // do it!
-  if (VT == MVT::v8i16 || VT == MVT::v16i8) {
+  if (VT == MVT::v8i16  || VT == MVT::v16i8 ||
+      VT == MVT::v16i16 || VT == MVT::v32i8) {
      SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl);
      if (NewOp.getNode())
        return DAG.getNode(ISD::BITCAST, dl, VT, NewOp);
@@ -6378,7 +6564,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
    // Normalize the input vectors. Here splats, zeroed vectors, profitable
    // narrowing and commutation of operands should be handled. The actual code
    // doesn't include all of those, work in progress...
-  SDValue NewOp = NormalizeVectorShuffle(Op, DAG, *this, Subtarget);
+  SDValue NewOp = NormalizeVectorShuffle(Op, DAG);
    if (NewOp.getNode())
      return NewOp;
  
@@ -6510,11 +6696,10 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
      // new vector_shuffle with the corrected mask.p
      SmallVector<int, 8> NewMask(M.begin(), M.end());
      NormalizeMask(NewMask, NumElems);
-    if (isUNPCKLMask(NewMask, VT, HasAVX2, true)) {
+    if (isUNPCKLMask(NewMask, VT, HasAVX2, true))
        return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
-    } else if (isUNPCKHMask(NewMask, VT, HasAVX2, true)) {
+    if (isUNPCKHMask(NewMask, VT, HasAVX2, true))
        return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
-    }
    }
  
    if (Commuted) {
@@ -6551,12 +6736,12 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
        return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
    }
  
-  if (isPSHUFHWMask(M, VT))
+  if (isPSHUFHWMask(M, VT, HasAVX2))
      return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1,
                                  getShufflePSHUFHWImmediate(SVOp),
                                  DAG);
  
-  if (isPSHUFLWMask(M, VT))
+  if (isPSHUFLWMask(M, VT, HasAVX2))
      return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1,
                                  getShufflePSHUFLWImmediate(SVOp),
                                  DAG);
@@ -6593,6 +6778,27 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
      return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1,
                                  V2, getShuffleVPERM2X128Immediate(SVOp), DAG);
  
+  SDValue BlendOp = LowerVECTOR_SHUFFLEtoBlend(SVOp, Subtarget, DAG);
+  if (BlendOp.getNode())
+    return BlendOp;
+
+  if (V2IsUndef && HasAVX2 && (VT == MVT::v8i32 || VT == MVT::v8f32)) {
+    SmallVector<SDValue, 8> permclMask;
+    for (unsigned i = 0; i != 8; ++i) {
+      permclMask.push_back(DAG.getConstant((M[i]>=0) ? M[i] : 0, MVT::i32));
+    }
+    SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32,
+                               &permclMask[0], 8);
+    // Bitcast is for VPERMPS since mask is v8i32 but node takes v8f32
+    return DAG.getNode(X86ISD::VPERMV, dl, VT,
+                       DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1);
+  }
+
+  if (V2IsUndef && HasAVX2 && (VT == MVT::v4i64 || VT == MVT::v4f64))
+    return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1,
+                                getShuffleCLImmediate(SVOp), DAG);
+
+
    //===--------------------------------------------------------------------===//
    // Since no target specific shuffle was selected for this generic one,
    // lower it into other known shuffles. FIXME: this isn't true yet, but
@@ -6614,7 +6820,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
  
    // Handle all 128-bit wide vectors with 4 elements, and match them with
    // several different shuffle types.
-  if (NumElems == 4 && VT.getSizeInBits() == 128)
+  if (NumElems == 4 && VT.is128BitVector())
      return LowerVECTOR_SHUFFLE_128v4(SVOp, DAG);
  
    // Handle general 256-bit shuffles
@@ -6630,7 +6836,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op,
    EVT VT = Op.getValueType();
    DebugLoc dl = Op.getDebugLoc();
  
-  if (Op.getOperand(0).getValueType().getSizeInBits() != 128)
+  if (!Op.getOperand(0).getValueType().is128BitVector())
      return SDValue();
  
    if (VT.getSizeInBits() == 8) {
@@ -6639,7 +6845,9 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op,
      SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
                                      DAG.getValueType(VT));
      return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
-  } else if (VT.getSizeInBits() == 16) {
+  }
+
+  if (VT.getSizeInBits() == 16) {
      unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
      // If Idx is 0, it's cheaper to do a move instead of a pextrw.
      if (Idx == 0)
@@ -6654,7 +6862,9 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op,
      SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
                                      DAG.getValueType(VT));
      return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
-  } else if (VT == MVT::f32) {
+  }
+
+  if (VT == MVT::f32) {
      // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
      // the result back to FR32 register. It's only worth matching if the
      // result has a single use which is a store or a bitcast to i32.  And in
@@ -6674,7 +6884,9 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op,
                                                Op.getOperand(0)),
                                                Op.getOperand(1));
      return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Extract);
-  } else if (VT == MVT::i32 || VT == MVT::i64) {
+  }
+
+  if (VT == MVT::i32 || VT == MVT::i64) {
      // ExtractPS/pextrq works with constant index.
      if (isa<ConstantSDNode>(Op.getOperand(1)))
        return Op;
@@ -6694,22 +6906,22 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
  
    // If this is a 256-bit vector result, first extract the 128-bit vector and
    // then extract the element from the 128-bit vector.
-  if (VecVT.getSizeInBits() == 256) {
+  if (VecVT.is256BitVector()) {
      DebugLoc dl = Op.getNode()->getDebugLoc();
      unsigned NumElems = VecVT.getVectorNumElements();
      SDValue Idx = Op.getOperand(1);
      unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
  
      // Get the 128-bit vector.
-    bool Upper = IdxVal >= NumElems/2;
-    Vec = Extract128BitVector(Vec,
-                    DAG.getConstant(Upper ? NumElems/2 : 0, MVT::i32), DAG, dl);
+    Vec = Extract128BitVector(Vec, IdxVal, DAG, dl);
  
+    if (IdxVal >= NumElems/2)
+      IdxVal -= NumElems/2;
      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
-                    Upper ? DAG.getConstant(IdxVal-NumElems/2, MVT::i32) : Idx);
+                       DAG.getConstant(IdxVal, MVT::i32));
    }
  
-  assert(Vec.getValueSizeInBits() <= 128 && "Unexpected vector length");
+  assert(VecVT.is128BitVector() && "Unexpected vector length");
  
    if (Subtarget->hasSSE41()) {
      SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG);
@@ -6736,7 +6948,9 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
      SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract,
                                      DAG.getValueType(VT));
      return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
-  } else if (VT.getSizeInBits() == 32) {
+  }
+
+  if (VT.getSizeInBits() == 32) {
      unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
      if (Idx == 0)
        return Op;
@@ -6748,7 +6962,9 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
                                         DAG.getUNDEF(VVT), Mask);
      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
                         DAG.getIntPtrConstant(0));
-  } else if (VT.getSizeInBits() == 64) {
+  }
+
+  if (VT.getSizeInBits() == 64) {
      // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
      // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
      //        to match extract_elt for f64.
@@ -6781,7 +6997,7 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op,
    SDValue N1 = Op.getOperand(1);
    SDValue N2 = Op.getOperand(2);
  
-  if (VT.getSizeInBits() == 256)
+  if (!VT.is128BitVector())
      return SDValue();
  
    if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) &&
@@ -6801,7 +7017,9 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op,
      if (N2.getValueType() != MVT::i32)
        N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
      return DAG.getNode(Opc, dl, VT, N0, N1, N2);
-  } else if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) {
+  }
+
+  if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) {
      // Bits [7:6] of the constant are the source select.  This will always be
      //  zero here.  The DAG Combiner may combine an extract_elt index into these
      //  bits.  For example (insert (extract, 3), 2) could be matched by putting
@@ -6814,8 +7032,9 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op,
      // Create this as a scalar to vector..
      N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
      return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
-  } else if ((EltVT == MVT::i32 || EltVT == MVT::i64) && 
-             isa<ConstantSDNode>(N2)) {
+  }
+
+  if ((EltVT == MVT::i32 || EltVT == MVT::i64) && isa<ConstantSDNode>(N2)) {
      // PINSR* works with constant index.
      return Op;
    }
@@ -6834,23 +7053,22 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
  
    // If this is a 256-bit vector result, first extract the 128-bit vector,
    // insert the element into the extracted half and then place it back.
-  if (VT.getSizeInBits() == 256) {
+  if (VT.is256BitVector()) {
      if (!isa<ConstantSDNode>(N2))
        return SDValue();
  
      // Get the desired 128-bit vector half.
      unsigned NumElems = VT.getVectorNumElements();
      unsigned IdxVal = cast<ConstantSDNode>(N2)->getZExtValue();
-    bool Upper = IdxVal >= NumElems/2;
-    SDValue Ins128Idx = DAG.getConstant(Upper ? NumElems/2 : 0, MVT::i32);
-    SDValue V = Extract128BitVector(N0, Ins128Idx, DAG, dl);
+    SDValue V = Extract128BitVector(N0, IdxVal, DAG, dl);
  
      // Insert the element into the desired half.
-    V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V,
-                 N1, Upper ? DAG.getConstant(IdxVal-NumElems/2, MVT::i32) : N2);
+    bool Upper = IdxVal >= NumElems/2;
+    V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
+                 DAG.getConstant(Upper ? IdxVal-NumElems/2 : IdxVal, MVT::i32));
  
      // Insert the changed part back to the 256-bit vector
-    return Insert128BitVector(N0, V, Ins128Idx, DAG, dl);
+    return Insert128BitVector(N0, V, IdxVal, DAG, dl);
    }
  
    if (Subtarget->hasSSE41())
@@ -6879,7 +7097,7 @@ X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const {
  
    // If this is a 256-bit vector result, first insert into a 128-bit
    // vector and then insert into the 256-bit vector.
-  if (OpVT.getSizeInBits() > 128) {
+  if (!OpVT.is128BitVector()) {
      // Insert into a 128-bit vector.
      EVT VT128 = EVT::getVectorVT(*Context,
                                   OpVT.getVectorElementType(),
@@ -6888,19 +7106,16 @@ X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const {
      Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
  
      // Insert the 128-bit vector.
-    return Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, OpVT), Op,
-                              DAG.getConstant(0, MVT::i32),
-                              DAG, dl);
+    return Insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
    }
  
-  if (Op.getValueType() == MVT::v1i64 &&
+  if (OpVT == MVT::v1i64 &&
        Op.getOperand(0).getValueType() == MVT::i64)
      return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0));
  
    SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
-  assert(Op.getValueType().getSimpleVT().getSizeInBits() == 128 &&
-         "Expected an SSE type!");
-  return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(),
+  assert(OpVT.is128BitVector() && "Expected an SSE type!");
+  return DAG.getNode(ISD::BITCAST, dl, OpVT,
                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,AnyExt));
  }
  
@@ -6914,9 +7129,11 @@ X86TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const {
      SDValue Vec = Op.getNode()->getOperand(0);
      SDValue Idx = Op.getNode()->getOperand(1);
  
-    if (Op.getNode()->getValueType(0).getSizeInBits() == 128
-        && Vec.getNode()->getValueType(0).getSizeInBits() == 256) {
-        return Extract128BitVector(Vec, Idx, DAG, dl);
+    if (Op.getNode()->getValueType(0).is128BitVector() &&
+        Vec.getNode()->getValueType(0).is256BitVector() &&
+        isa<ConstantSDNode>(Idx)) {
+      unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+      return Extract128BitVector(Vec, IdxVal, DAG, dl);
      }
    }
    return SDValue();
@@ -6933,9 +7150,11 @@ X86TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const {
      SDValue SubVec = Op.getNode()->getOperand(1);
      SDValue Idx = Op.getNode()->getOperand(2);
  
-    if (Op.getNode()->getValueType(0).getSizeInBits() == 256
-        && SubVec.getNode()->getValueType(0).getSizeInBits() == 128) {
-      return Insert128BitVector(Vec, SubVec, Idx, DAG, dl);
+    if (Op.getNode()->getValueType(0).is256BitVector() &&
+        SubVec.getNode()->getValueType(0).is128BitVector() &&
+        isa<ConstantSDNode>(Idx)) {
+      unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+      return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
      }
    }
    return SDValue();
@@ -7144,7 +7363,7 @@ X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
  static SDValue
  GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
             SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
-           unsigned char OperandFlags) {
+           unsigned char OperandFlags, bool LocalDynamic = false) {
    MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
    DebugLoc dl = GA->getDebugLoc();
@@ -7152,12 +7371,16 @@ GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
                                             GA->getValueType(0),
                                             GA->getOffset(),
                                             OperandFlags);
+
+  X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
+                                           : X86ISD::TLSADDR;
+
    if (InFlag) {
      SDValue Ops[] = { Chain,  TGA, *InFlag };
-    Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 3);
+    Chain = DAG.getNode(CallType, dl, NodeTys, Ops, 3);
    } else {
      SDValue Ops[]  = { Chain, TGA };
-    Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 2);
+    Chain = DAG.getNode(CallType, dl, NodeTys, Ops, 2);
    }
  
    // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
@@ -7189,11 +7412,49 @@ LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                      X86::RAX, X86II::MO_TLSGD);
  }
  
-// Lower ISD::GlobalTLSAddress using the "initial exec" (for no-pic) or
-// "local exec" model.
+static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
+                                           SelectionDAG &DAG,
+                                           const EVT PtrVT,
+                                           bool is64Bit) {
+  DebugLoc dl = GA->getDebugLoc();
+
+  // Get the start address of the TLS block for this module.
+  X86MachineFunctionInfo* MFI = DAG.getMachineFunction()
+      .getInfo<X86MachineFunctionInfo>();
+  MFI->incNumLocalDynamicTLSAccesses();
+
+  SDValue Base;
+  if (is64Bit) {
+    Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT, X86::RAX,
+                      X86II::MO_TLSLD, /*LocalDynamic=*/true);
+  } else {
+    SDValue InFlag;
+    SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
+        DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), PtrVT), InFlag);
+    InFlag = Chain.getValue(1);
+    Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
+                      X86II::MO_TLSLDM, /*LocalDynamic=*/true);
+  }
+
+  // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
+  // of Base.
+
+  // Build x@dtpoff.
+  unsigned char OperandFlags = X86II::MO_DTPOFF;
+  unsigned WrapperKind = X86ISD::Wrapper;
+  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
+                                           GA->getValueType(0),
+                                           GA->getOffset(), OperandFlags);
+  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
+
+  // Add x@dtpoff with the base.
+  return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
+}
+
+// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
  static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                                     const EVT PtrVT, TLSModel::Model model,
-                                   bool is64Bit) {
+                                   bool is64Bit, bool isPIC) {
    DebugLoc dl = GA->getDebugLoc();
  
    // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
@@ -7211,25 +7472,36 @@ static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
    unsigned WrapperKind = X86ISD::Wrapper;
    if (model == TLSModel::LocalExec) {
      OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
-  } else if (is64Bit) {
-    assert(model == TLSModel::InitialExec);
-    OperandFlags = X86II::MO_GOTTPOFF;
-    WrapperKind = X86ISD::WrapperRIP;
+  } else if (model == TLSModel::InitialExec) {
+    if (is64Bit) {
+      OperandFlags = X86II::MO_GOTTPOFF;
+      WrapperKind = X86ISD::WrapperRIP;
+    } else {
+      OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
+    }
    } else {
-    assert(model == TLSModel::InitialExec);
-    OperandFlags = X86II::MO_INDNTPOFF;
+    llvm_unreachable("Unexpected model");
    }
  
-  // emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax" (initial
-  // exec)
+  // emit "addl x@ntpoff,%eax" (local exec)
+  // or "addl x@indntpoff,%eax" (initial exec)
+  // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
    SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
                                             GA->getValueType(0),
                                             GA->getOffset(), OperandFlags);
    SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
  
-  if (model == TLSModel::InitialExec)
+  if (model == TLSModel::InitialExec) {
+    if (isPIC && !is64Bit) {
+      Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
+                          DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), PtrVT),
+                           Offset);
+    }
+
      Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
-                         MachinePointerInfo::getGOT(), false, false, false, 0);
+                         MachinePointerInfo::getGOT(), false, false, false,
+                         0);
+  }
  
    // The address of the thread local variable is the add of the thread
    // pointer with the offset of the variable.
@@ -7243,30 +7515,26 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
    const GlobalValue *GV = GA->getGlobal();
  
    if (Subtarget->isTargetELF()) {
-    // TODO: implement the "local dynamic" model
-    // TODO: implement the "initial exec"model for pic executables
-
-    // If GV is an alias then use the aliasee for determining
-    // thread-localness.
-    if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
-      GV = GA->resolveAliasedGlobal(false);
-
-    TLSModel::Model model
-      = getTLSModel(GV, getTargetMachine().getRelocationModel());
+    TLSModel::Model model = getTargetMachine().getTLSModel(GV);
  
      switch (model) {
        case TLSModel::GeneralDynamic:
-      case TLSModel::LocalDynamic: // not implemented
          if (Subtarget->is64Bit())
            return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy());
          return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy());
-
+      case TLSModel::LocalDynamic:
+        return LowerToTLSLocalDynamicModel(GA, DAG, getPointerTy(),
+                                           Subtarget->is64Bit());
        case TLSModel::InitialExec:
        case TLSModel::LocalExec:
          return LowerToTLSExecModel(GA, DAG, getPointerTy(), model,
-                                   Subtarget->is64Bit());
+                                   Subtarget->is64Bit(),
+                         getTargetMachine().getRelocationModel() == Reloc::PIC_);
      }
-  } else if (Subtarget->isTargetDarwin()) {
+    llvm_unreachable("Unknown TLS model.");
+  }
+
+  if (Subtarget->isTargetDarwin()) {
      // Darwin only has one model of TLS.  Lower to that.
      unsigned char OpFlag = 0;
      unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ?
@@ -7309,7 +7577,9 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
      unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
      return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy(),
                                Chain.getValue(1));
-  } else if (Subtarget->isTargetWindows()) {
+  }
+
+  if (Subtarget->isTargetWindows()) {
      // Just use the implicit TLS architecture
      // Need to generate someting similar to:
      //   mov     rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
@@ -7355,7 +7625,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
                          false, false, false, 0);
  
      SDValue Scale = DAG.getConstant(Log2_64_Ceil(TD->getPointerSize()),
-                                           getPointerTy());
+                                    getPointerTy());
      IDX = DAG.getNode(ISD::SHL, dl, getPointerTy(), IDX, Scale);
  
      SDValue res = DAG.getNode(ISD::ADD, dl, getPointerTy(), ThreadPointer, IDX);
@@ -7526,9 +7796,9 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
       punpckldq  (c0),  %xmm0  // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
       subpd      (c1),  %xmm0  // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
       #ifdef __SSE3__
-       haddpd   %xmm0, %xmm0          
+       haddpd   %xmm0, %xmm0
       #else
-       pshufd   $0x4e, %xmm0, %xmm1 
+       pshufd   $0x4e, %xmm0, %xmm1
         addpd    %xmm1, %xmm0
       #endif
    */
@@ -7619,12 +7889,11 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
    // Handle final rounding.
    EVT DestVT = Op.getValueType();
  
-  if (DestVT.bitsLT(MVT::f64)) {
+  if (DestVT.bitsLT(MVT::f64))
      return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
                         DAG.getIntPtrConstant(0));
-  } else if (DestVT.bitsGT(MVT::f64)) {
+  if (DestVT.bitsGT(MVT::f64))
      return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
-  }
  
    // Handle final rounding.
    return Sub;
@@ -7645,10 +7914,9 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
    EVT DstVT = Op.getValueType();
    if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
      return LowerUINT_TO_FP_i64(Op, DAG);
-  else if (SrcVT == MVT::i32 && X86ScalarSSEf64)
+  if (SrcVT == MVT::i32 && X86ScalarSSEf64)
      return LowerUINT_TO_FP_i32(Op, DAG);
-  else if (Subtarget->is64Bit() &&
-           SrcVT == MVT::i64 && DstVT == MVT::f32)
+  if (Subtarget->is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
      return SDValue();
  
    // Make a 64-bit buffer, and use it to build an FILD.
@@ -7825,9 +8093,9 @@ SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
      return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(),
                         FIST, StackSlot, MachinePointerInfo(),
                         false, false, false, 0);
-  else
-    // The node is the result.
-    return FIST;
+
+  // The node is the result.
+  return FIST;
  }
  
  SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
@@ -7842,9 +8110,9 @@ SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
      return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(),
                         FIST, StackSlot, MachinePointerInfo(),
                         false, false, false, 0);
-  else
-    // The node is the result.
-    return FIST;
+
+  // The node is the result.
+  return FIST;
  }
  
  SDValue X86TargetLowering::LowerFABS(SDValue Op,
@@ -7857,7 +8125,7 @@ SDValue X86TargetLowering::LowerFABS(SDValue Op,
      EltVT = VT.getVectorElementType();
    Constant *C;
    if (EltVT == MVT::f64) {
-    C = ConstantVector::getSplat(2, 
+    C = ConstantVector::getSplat(2,
                  ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))));
    } else {
      C = ConstantVector::getSplat(4,
@@ -7891,15 +8159,15 @@ SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const {
                               MachinePointerInfo::getConstantPool(),
                               false, false, false, 16);
    if (VT.isVector()) {
-    MVT XORVT = VT.getSizeInBits() == 128 ? MVT::v2i64 : MVT::v4i64;
+    MVT XORVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
      return DAG.getNode(ISD::BITCAST, dl, VT,
                         DAG.getNode(ISD::XOR, dl, XORVT,
-                    DAG.getNode(ISD::BITCAST, dl, XORVT,
-                                Op.getOperand(0)),
-                    DAG.getNode(ISD::BITCAST, dl, XORVT, Mask)));
-  } else {
-    return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask);
+                                   DAG.getNode(ISD::BITCAST, dl, XORVT,
+                                               Op.getOperand(0)),
+                                   DAG.getNode(ISD::BITCAST, dl, XORVT, Mask)));
    }
+
+  return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask);
  }
  
  SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
@@ -8019,7 +8287,33 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
  
    unsigned Opcode = 0;
    unsigned NumOperands = 0;
-  switch (Op.getNode()->getOpcode()) {
+
+  // Truncate operations may prevent the merge of the SETCC instruction
+  // and the arithmetic intruction before it. Attempt to truncate the operands
+  // of the arithmetic instruction and use a reduced bit-width instruction.
+  bool NeedTruncation = false;
+  SDValue ArithOp = Op;
+  if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
+    SDValue Arith = Op->getOperand(0);
+    // Both the trunc and the arithmetic op need to have one user each.
+    if (Arith->hasOneUse())
+      switch (Arith.getOpcode()) {
+        default: break;
+        case ISD::ADD:
+        case ISD::SUB:
+        case ISD::AND:
+        case ISD::OR:
+        case ISD::XOR: {
+          NeedTruncation = true;
+          ArithOp = Arith;
+        }
+      }
+  }
+
+  // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
+  // which may be the result of a CAST.  We use the variable 'Op', which is the
+  // non-casted variable when we check for possible users.
+  switch (ArithOp.getOpcode()) {
    case ISD::ADD:
      // Due to an isel shortcoming, be conservative if this add is likely to be
      // selected as part of a load-modify-store instruction. When the root node
@@ -8039,7 +8333,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
          goto default_case;
  
      if (ConstantSDNode *C =
-        dyn_cast<ConstantSDNode>(Op.getNode()->getOperand(1))) {
+        dyn_cast<ConstantSDNode>(ArithOp.getNode()->getOperand(1))) {
        // An add of one will be selected as an INC.
        if (C->getAPIntValue() == 1) {
          Opcode = X86ISD::INC;
@@ -8075,7 +8369,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
  
        if (User->getOpcode() != ISD::BRCOND &&
            User->getOpcode() != ISD::SETCC &&
-          (User->getOpcode() != ISD::SELECT || UOpNo != 0)) {
+          !(User->getOpcode() == ISD::SELECT && UOpNo == 0)) {
          NonFlagUse = true;
          break;
        }
@@ -8096,7 +8390,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
          goto default_case;
  
      // Otherwise use a regular EFLAGS-setting instruction.
-    switch (Op.getNode()->getOpcode()) {
+    switch (ArithOp.getOpcode()) {
      default: llvm_unreachable("unexpected operator!");
      case ISD::SUB: Opcode = X86ISD::SUB; break;
      case ISD::OR:  Opcode = X86ISD::OR;  break;
@@ -8119,6 +8413,35 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
      break;
    }
  
+  // If we found that truncation is beneficial, perform the truncation and
+  // update 'Op'.
+  if (NeedTruncation) {
+    EVT VT = Op.getValueType();
+    SDValue WideVal = Op->getOperand(0);
+    EVT WideVT = WideVal.getValueType();
+    unsigned ConvertedOp = 0;
+    // Use a target machine opcode to prevent further DAGCombine
+    // optimizations that may separate the arithmetic operations
+    // from the setcc node.
+    switch (WideVal.getOpcode()) {
+      default: break;
+      case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
+      case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
+      case ISD::AND: ConvertedOp = X86ISD::AND; break;
+      case ISD::OR:  ConvertedOp = X86ISD::OR;  break;
+      case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
+    }
+
+    if (ConvertedOp) {
+      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+      if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
+        SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
+        SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
+        Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);
+      }
+    }
+  }
+
    if (Opcode == 0)
      // Emit a CMP with 0, which is the TEST pattern.
      return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
@@ -8143,9 +8466,41 @@ SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
        return EmitTest(Op0, X86CC, DAG);
  
    DebugLoc dl = Op0.getDebugLoc();
+  if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
+       Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
+    // Use SUB instead of CMP to enable CSE between SUB and CMP.
+    SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
+    SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs,
+                              Op0, Op1);
+    return SDValue(Sub.getNode(), 1);
+  }
    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
  }
  
+/// Convert a comparison if required by the subtarget.
+SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
+                                                 SelectionDAG &DAG) const {
+  // If the subtarget does not support the FUCOMI instruction, floating-point
+  // comparisons have to be converted.
+  if (Subtarget->hasCMov() ||
+      Cmp.getOpcode() != X86ISD::CMP ||
+      !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
+      !Cmp.getOperand(1).getValueType().isFloatingPoint())
+    return Cmp;
+
+  // The instruction selector will select an FUCOM instruction instead of
+  // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
+  // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
+  // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
+  DebugLoc dl = Cmp.getDebugLoc();
+  SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
+  SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
+  SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
+                            DAG.getConstant(8, MVT::i8));
+  SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
+  return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
+}
+
  /// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node
  /// if it's possible.
  SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
@@ -8168,8 +8523,8 @@ SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
          unsigned BitWidth = Op0.getValueSizeInBits();
          unsigned AndBitWidth = And.getValueSizeInBits();
          if (BitWidth > AndBitWidth) {
-          APInt Mask = APInt::getAllOnesValue(BitWidth), Zeros, Ones;
-          DAG.ComputeMaskedBits(Op0, Mask, Zeros, Ones);
+          APInt Zeros, Ones;
+          DAG.ComputeMaskedBits(Op0, Zeros, Ones);
            if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth)
              return SDValue();
          }
@@ -8267,6 +8622,7 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
      return SDValue();
  
    SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, DAG);
+  EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
    return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
                       DAG.getConstant(X86CC, MVT::i8), EFLAGS);
  }
@@ -8276,24 +8632,22 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
  static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
    EVT VT = Op.getValueType();
  
-  assert(VT.getSizeInBits() == 256 && Op.getOpcode() == ISD::SETCC &&
+  assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
           "Unsupported value type for operation");
  
-  int NumElems = VT.getVectorNumElements();
+  unsigned NumElems = VT.getVectorNumElements();
    DebugLoc dl = Op.getDebugLoc();
    SDValue CC = Op.getOperand(2);
-  SDValue Idx0 = DAG.getConstant(0, MVT::i32);
-  SDValue Idx1 = DAG.getConstant(NumElems/2, MVT::i32);
  
    // Extract the LHS vectors
    SDValue LHS = Op.getOperand(0);
-  SDValue LHS1 = Extract128BitVector(LHS, Idx0, DAG, dl);
-  SDValue LHS2 = Extract128BitVector(LHS, Idx1, DAG, dl);
+  SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
+  SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
  
    // Extract the RHS vectors
    SDValue RHS = Op.getOperand(1);
-  SDValue RHS1 = Extract128BitVector(RHS, Idx0, DAG, dl);
-  SDValue RHS2 = Extract128BitVector(RHS, Idx1, DAG, dl);
+  SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl);
+  SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);
  
    // Issue the operation on the smaller types and concatenate the result back
    MVT EltVT = VT.getVectorElementType().getSimpleVT();
@@ -8315,10 +8669,12 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const {
    DebugLoc dl = Op.getDebugLoc();
  
    if (isFP) {
-    unsigned SSECC = 8;
+#ifndef NDEBUG
      EVT EltVT = Op0.getValueType().getVectorElementType();
-    assert(EltVT == MVT::f32 || EltVT == MVT::f64); (void)EltVT;
+    assert(EltVT == MVT::f32 || EltVT == MVT::f64);
+#endif
  
+    unsigned SSECC;
      bool Swap = false;
  
      // SSE Condition code mapping:
@@ -8331,7 +8687,7 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const {
      //  6 - NLE
      //  7 - ORD
      switch (SetCCOpcode) {
-    default: break;
+    default: llvm_unreachable("Unexpected SETCC condition");
      case ISD::SETOEQ:
      case ISD::SETEQ:  SSECC = 0; break;
      case ISD::SETOGT:
@@ -8345,33 +8701,33 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const {
      case ISD::SETUO:  SSECC = 3; break;
      case ISD::SETUNE:
      case ISD::SETNE:  SSECC = 4; break;
-    case ISD::SETULE: Swap = true;
+    case ISD::SETULE: Swap = true; // Fallthrough
      case ISD::SETUGE: SSECC = 5; break;
-    case ISD::SETULT: Swap = true;
+    case ISD::SETULT: Swap = true; // Fallthrough
      case ISD::SETUGT: SSECC = 6; break;
      case ISD::SETO:   SSECC = 7; break;
+    case ISD::SETUEQ:
+    case ISD::SETONE: SSECC = 8; break;
      }
      if (Swap)
        std::swap(Op0, Op1);
  
      // In the two special cases we can't handle, emit two comparisons.
      if (SSECC == 8) {
+      unsigned CC0, CC1;
+      unsigned CombineOpc;
        if (SetCCOpcode == ISD::SETUEQ) {
-        SDValue UNORD, EQ;
-        UNORD = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
-                            DAG.getConstant(3, MVT::i8));
-        EQ = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
-                         DAG.getConstant(0, MVT::i8));
-        return DAG.getNode(ISD::OR, dl, VT, UNORD, EQ);
-      } else if (SetCCOpcode == ISD::SETONE) {
-        SDValue ORD, NEQ;
-        ORD = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
-                          DAG.getConstant(7, MVT::i8));
-        NEQ = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
-                          DAG.getConstant(4, MVT::i8));
-        return DAG.getNode(ISD::AND, dl, VT, ORD, NEQ);
+        CC0 = 3; CC1 = 0; CombineOpc = ISD::OR;
+      } else {
+        assert(SetCCOpcode == ISD::SETONE);
+        CC0 = 7; CC1 = 4; CombineOpc = ISD::AND;
        }
-      llvm_unreachable("Illegal FP comparison");
+
+      SDValue Cmp0 = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
+                                 DAG.getConstant(CC0, MVT::i8));
+      SDValue Cmp1 = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
+                                 DAG.getConstant(CC1, MVT::i8));
+      return DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
      }
      // Handle all other FP comparisons here.
      return DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
@@ -8379,17 +8735,17 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const {
    }
  
    // Break 256-bit integer vector compare into smaller ones.
-  if (VT.getSizeInBits() == 256 && !Subtarget->hasAVX2())
+  if (VT.is256BitVector() && !Subtarget->hasAVX2())
      return Lower256IntVSETCC(Op, DAG);
  
    // We are handling one of the integer comparisons here.  Since SSE only has
    // GT and EQ comparisons for integer, swapping operands and multiple
    // operations may be required for some comparisons.
-  unsigned Opc = 0;
+  unsigned Opc;
    bool Swap = false, Invert = false, FlipSigns = false;
  
    switch (SetCCOpcode) {
-  default: break;
+  default: llvm_unreachable("Unexpected SETCC condition");
    case ISD::SETNE:  Invert = true;
    case ISD::SETEQ:  Opc = X86ISD::PCMPEQ; break;
    case ISD::SETLT:  Swap = true;
@@ -8406,10 +8762,12 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const {
  
    // Check that the operation in question is available (most are plain SSE2,
    // but PCMPGTQ and PCMPEQQ have different requirements).
-  if (Opc == X86ISD::PCMPGT && VT == MVT::v2i64 && !Subtarget->hasSSE42())
-    return SDValue();
-  if (Opc == X86ISD::PCMPEQ && VT == MVT::v2i64 && !Subtarget->hasSSE41())
-    return SDValue();
+  if (VT == MVT::v2i64) {
+    if (Opc == X86ISD::PCMPGT && !Subtarget->hasSSE42())
+      return SDValue();
+    if (Opc == X86ISD::PCMPEQ && !Subtarget->hasSSE41())
+      return SDValue();
+  }
  
    // Since SSE has no unsigned integer comparisons, we need to flip  the sign
    // bits of the inputs before performing those operations.
@@ -8436,7 +8794,8 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const {
  // isX86LogicalCmp - Return true if opcode is a X86 logical comparison.
  static bool isX86LogicalCmp(SDValue Op) {
    unsigned Opc = Op.getNode()->getOpcode();
-  if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI)
+  if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
+      Opc == X86ISD::SAHF)
      return true;
    if (Op.getResNo() == 1 &&
        (Opc == X86ISD::ADD ||
@@ -8468,6 +8827,16 @@ static bool isAllOnes(SDValue V) {
    return C && C->isAllOnesValue();
  }
  
+static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
+  if (V.getOpcode() != ISD::TRUNCATE)
+    return false;
+
+  SDValue VOp0 = V.getOperand(0);
+  unsigned InBits = VOp0.getValueSizeInBits();
+  unsigned Bits = V.getValueSizeInBits();
+  return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
+}
+
  SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
    bool addTest = true;
    SDValue Cond  = Op.getOperand(0);
@@ -8498,8 +8867,25 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
        SDValue Y = isAllOnes(Op2) ? Op1 : Op2;
  
        SDValue CmpOp0 = Cmp.getOperand(0);
+      // Apply further optimizations for special cases
+      // (select (x != 0), -1, 0) -> neg & sbb
+      // (select (x == 0), 0, -1) -> neg & sbb
+      if (ConstantSDNode *YC = dyn_cast<ConstantSDNode>(Y))
+        if (YC->isNullValue() &&
+            (isAllOnes(Op1) == (CondCode == X86::COND_NE))) {
+          SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
+          SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs,
+                                    DAG.getConstant(0, CmpOp0.getValueType()),
+                                    CmpOp0);
+          SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
+                                    DAG.getConstant(X86::COND_B, MVT::i8),
+                                    SDValue(Neg.getNode(), 1));
+          return Res;
+        }
+
        Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
                          CmpOp0, DAG.getConstant(1, CmpOp0.getValueType()));
+      Cmp = ConvertCmpIfNecessary(Cmp, DAG);
  
        SDValue Res =   // Res = 0 or -1.
          DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
@@ -8580,9 +8966,9 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
    }
  
    if (addTest) {
-    // Look pass the truncate.
-    if (Cond.getOpcode() == ISD::TRUNCATE)
-      Cond = Cond.getOperand(0);
+    // Look pass the truncate if the high bits are known zero.
+    if (isTruncWithZeroHighBitsInput(Cond, DAG))
+        Cond = Cond.getOperand(0);
  
      // We know the result of AND is compared against zero. Try to match
      // it to BT.
@@ -8605,7 +8991,8 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
    // a <  b ?  0 : -1 -> RES = setcc_carry
    // a >= b ? -1 :  0 -> RES = setcc_carry
    // a >= b ?  0 : -1 -> RES = ~setcc_carry
-  if (Cond.getOpcode() == X86ISD::CMP) {
+  if (Cond.getOpcode() == X86ISD::SUB) {
+    Cond = ConvertCmpIfNecessary(Cond, DAG);
      unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
  
      if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
@@ -8844,6 +9231,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
  
            SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
                                      Cond.getOperand(0), Cond.getOperand(1));
+          Cmp = ConvertCmpIfNecessary(Cmp, DAG);
            CC = DAG.getConstant(X86::COND_NE, MVT::i8);
            Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
                                Chain, Dest, CC, Cmp);
@@ -8873,6 +9261,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
  
            SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
                                      Cond.getOperand(0), Cond.getOperand(1));
+          Cmp = ConvertCmpIfNecessary(Cmp, DAG);
            CC = DAG.getConstant(X86::COND_NE, MVT::i8);
            Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
                                Chain, Dest, CC, Cmp);
@@ -8886,9 +9275,9 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
    }
  
    if (addTest) {
-    // Look pass the truncate.
-    if (Cond.getOpcode() == ISD::TRUNCATE)
-      Cond = Cond.getOperand(0);
+    // Look pass the truncate if the high bits are known zero.
+    if (isTruncWithZeroHighBitsInput(Cond, DAG))
+        Cond = Cond.getOperand(0);
  
      // We know the result of AND is compared against zero. Try to match
      // it to BT.
@@ -8906,6 +9295,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
      CC = DAG.getConstant(X86::COND_NE, MVT::i8);
      Cond = EmitTest(Cond, X86::COND_NE, DAG);
    }
+  Cond = ConvertCmpIfNecessary(Cond, DAG);
    return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
                       Chain, Dest, CC, Cond);
  }
@@ -8944,7 +9334,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
        const Function *F = MF.getFunction();
  
        for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
-           I != E; I++)
+           I != E; ++I)
          if (I->hasNestAttr())
            report_fatal_error("Cannot use segmented stacks with functions that "
                               "have nested arguments.");
@@ -9127,12 +9517,15 @@ static SDValue getTargetVShiftNode(unsigned Opc, DebugLoc dl, EVT VT,
    assert(ShAmt.getValueType() == MVT::i32 && "ShAmt is not i32");
  
    if (isa<ConstantSDNode>(ShAmt)) {
+    // Constant may be a TargetConstant. Use a regular constant.
+    uint32_t ShiftAmt = cast<ConstantSDNode>(ShAmt)->getZExtValue();
      switch (Opc) {
        default: llvm_unreachable("Unknown target vector shift node");
        case X86ISD::VSHLI:
        case X86ISD::VSRLI:
        case X86ISD::VSRAI:
-        return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
+        return DAG.getNode(Opc, dl, VT, SrcOp,
+                           DAG.getConstant(ShiftAmt, MVT::i32));
      }
    }
  
@@ -9149,10 +9542,15 @@ static SDValue getTargetVShiftNode(unsigned Opc, DebugLoc dl, EVT VT,
    SDValue ShOps[4];
    ShOps[0] = ShAmt;
    ShOps[1] = DAG.getConstant(0, MVT::i32);
-  ShOps[2] = DAG.getUNDEF(MVT::i32);
-  ShOps[3] = DAG.getUNDEF(MVT::i32);
+  ShOps[2] = ShOps[3] = DAG.getUNDEF(MVT::i32);
    ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, &ShOps[0], 4);
-  ShAmt = DAG.getNode(ISD::BITCAST, dl, VT, ShAmt);
+
+  // The return type has to be a 128-bit type with the same element
+  // type as the input type.
+  MVT EltVT = VT.getVectorElementType().getSimpleVT();
+  EVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
+
+  ShAmt = DAG.getNode(ISD::BITCAST, dl, ShVT, ShAmt);
    return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
  }
  
@@ -9187,8 +9585,8 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const
    case Intrinsic::x86_sse2_ucomigt_sd:
    case Intrinsic::x86_sse2_ucomige_sd:
    case Intrinsic::x86_sse2_ucomineq_sd: {
-    unsigned Opc = 0;
-    ISD::CondCode CC = ISD::SETCC_INVALID;
+    unsigned Opc;
+    ISD::CondCode CC;
      switch (IntNo) {
      default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
      case Intrinsic::x86_sse_comieq_ss:
@@ -9262,245 +9660,102 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const
                                  DAG.getConstant(X86CC, MVT::i8), Cond);
      return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
    }
-  // XOP comparison intrinsics
-  case Intrinsic::x86_xop_vpcomltb:
-  case Intrinsic::x86_xop_vpcomltw:
-  case Intrinsic::x86_xop_vpcomltd:
-  case Intrinsic::x86_xop_vpcomltq:
-  case Intrinsic::x86_xop_vpcomltub:
-  case Intrinsic::x86_xop_vpcomltuw:
-  case Intrinsic::x86_xop_vpcomltud:
-  case Intrinsic::x86_xop_vpcomltuq:
-  case Intrinsic::x86_xop_vpcomleb:
-  case Intrinsic::x86_xop_vpcomlew:
-  case Intrinsic::x86_xop_vpcomled:
-  case Intrinsic::x86_xop_vpcomleq:
-  case Intrinsic::x86_xop_vpcomleub:
-  case Intrinsic::x86_xop_vpcomleuw:
-  case Intrinsic::x86_xop_vpcomleud:
-  case Intrinsic::x86_xop_vpcomleuq:
-  case Intrinsic::x86_xop_vpcomgtb:
-  case Intrinsic::x86_xop_vpcomgtw:
-  case Intrinsic::x86_xop_vpcomgtd:
-  case Intrinsic::x86_xop_vpcomgtq:
-  case Intrinsic::x86_xop_vpcomgtub:
-  case Intrinsic::x86_xop_vpcomgtuw:
-  case Intrinsic::x86_xop_vpcomgtud:
-  case Intrinsic::x86_xop_vpcomgtuq:
-  case Intrinsic::x86_xop_vpcomgeb:
-  case Intrinsic::x86_xop_vpcomgew:
-  case Intrinsic::x86_xop_vpcomged:
-  case Intrinsic::x86_xop_vpcomgeq:
-  case Intrinsic::x86_xop_vpcomgeub:
-  case Intrinsic::x86_xop_vpcomgeuw:
-  case Intrinsic::x86_xop_vpcomgeud:
-  case Intrinsic::x86_xop_vpcomgeuq:
-  case Intrinsic::x86_xop_vpcomeqb:
-  case Intrinsic::x86_xop_vpcomeqw:
-  case Intrinsic::x86_xop_vpcomeqd:
-  case Intrinsic::x86_xop_vpcomeqq:
-  case Intrinsic::x86_xop_vpcomequb:
-  case Intrinsic::x86_xop_vpcomequw:
-  case Intrinsic::x86_xop_vpcomequd:
-  case Intrinsic::x86_xop_vpcomequq:
-  case Intrinsic::x86_xop_vpcomneb:
-  case Intrinsic::x86_xop_vpcomnew:
-  case Intrinsic::x86_xop_vpcomned:
-  case Intrinsic::x86_xop_vpcomneq:
-  case Intrinsic::x86_xop_vpcomneub:
-  case Intrinsic::x86_xop_vpcomneuw:
-  case Intrinsic::x86_xop_vpcomneud:
-  case Intrinsic::x86_xop_vpcomneuq:
-  case Intrinsic::x86_xop_vpcomfalseb:
-  case Intrinsic::x86_xop_vpcomfalsew:
-  case Intrinsic::x86_xop_vpcomfalsed:
-  case Intrinsic::x86_xop_vpcomfalseq:
-  case Intrinsic::x86_xop_vpcomfalseub:
-  case Intrinsic::x86_xop_vpcomfalseuw:
-  case Intrinsic::x86_xop_vpcomfalseud:
-  case Intrinsic::x86_xop_vpcomfalseuq:
-  case Intrinsic::x86_xop_vpcomtrueb:
-  case Intrinsic::x86_xop_vpcomtruew:
-  case Intrinsic::x86_xop_vpcomtrued:
-  case Intrinsic::x86_xop_vpcomtrueq:
-  case Intrinsic::x86_xop_vpcomtrueub:
-  case Intrinsic::x86_xop_vpcomtrueuw:
-  case Intrinsic::x86_xop_vpcomtrueud:
-  case Intrinsic::x86_xop_vpcomtrueuq: {
-    unsigned CC = 0;
-    unsigned Opc = 0;
-
-    switch (IntNo) {
-    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
-    case Intrinsic::x86_xop_vpcomltb:
-    case Intrinsic::x86_xop_vpcomltw:
-    case Intrinsic::x86_xop_vpcomltd:
-    case Intrinsic::x86_xop_vpcomltq:
-      CC = 0;
-      Opc = X86ISD::VPCOM;
-      break;
-    case Intrinsic::x86_xop_vpcomltub:
-    case Intrinsic::x86_xop_vpcomltuw:
-    case Intrinsic::x86_xop_vpcomltud:
-    case Intrinsic::x86_xop_vpcomltuq:
-      CC = 0;
-      Opc = X86ISD::VPCOMU;
-      break;
-    case Intrinsic::x86_xop_vpcomleb:
-    case Intrinsic::x86_xop_vpcomlew:
-    case Intrinsic::x86_xop_vpcomled:
-    case Intrinsic::x86_xop_vpcomleq:
-      CC = 1;
-      Opc = X86ISD::VPCOM;
-      break;
-    case Intrinsic::x86_xop_vpcomleub:
-    case Intrinsic::x86_xop_vpcomleuw:
-    case Intrinsic::x86_xop_vpcomleud:
-    case Intrinsic::x86_xop_vpcomleuq:
-      CC = 1;
-      Opc = X86ISD::VPCOMU;
-      break;
-    case Intrinsic::x86_xop_vpcomgtb:
-    case Intrinsic::x86_xop_vpcomgtw:
-    case Intrinsic::x86_xop_vpcomgtd:
-    case Intrinsic::x86_xop_vpcomgtq:
-      CC = 2;
-      Opc = X86ISD::VPCOM;
-      break;
-    case Intrinsic::x86_xop_vpcomgtub:
-    case Intrinsic::x86_xop_vpcomgtuw:
-    case Intrinsic::x86_xop_vpcomgtud:
-    case Intrinsic::x86_xop_vpcomgtuq:
-      CC = 2;
-      Opc = X86ISD::VPCOMU;
-      break;
-    case Intrinsic::x86_xop_vpcomgeb:
-    case Intrinsic::x86_xop_vpcomgew:
-    case Intrinsic::x86_xop_vpcomged:
-    case Intrinsic::x86_xop_vpcomgeq:
-      CC = 3;
-      Opc = X86ISD::VPCOM;
-      break;
-    case Intrinsic::x86_xop_vpcomgeub:
-    case Intrinsic::x86_xop_vpcomgeuw:
-    case Intrinsic::x86_xop_vpcomgeud:
-    case Intrinsic::x86_xop_vpcomgeuq:
-      CC = 3;
-      Opc = X86ISD::VPCOMU;
-      break;
-    case Intrinsic::x86_xop_vpcomeqb:
-    case Intrinsic::x86_xop_vpcomeqw:
-    case Intrinsic::x86_xop_vpcomeqd:
-    case Intrinsic::x86_xop_vpcomeqq:
-      CC = 4;
-      Opc = X86ISD::VPCOM;
-      break;
-    case Intrinsic::x86_xop_vpcomequb:
-    case Intrinsic::x86_xop_vpcomequw:
-    case Intrinsic::x86_xop_vpcomequd:
-    case Intrinsic::x86_xop_vpcomequq:
-      CC = 4;
-      Opc = X86ISD::VPCOMU;
-      break;
-    case Intrinsic::x86_xop_vpcomneb:
-    case Intrinsic::x86_xop_vpcomnew:
-    case Intrinsic::x86_xop_vpcomned:
-    case Intrinsic::x86_xop_vpcomneq:
-      CC = 5;
-      Opc = X86ISD::VPCOM;
-      break;
-    case Intrinsic::x86_xop_vpcomneub:
-    case Intrinsic::x86_xop_vpcomneuw:
-    case Intrinsic::x86_xop_vpcomneud:
-    case Intrinsic::x86_xop_vpcomneuq:
-      CC = 5;
-      Opc = X86ISD::VPCOMU;
-      break;
-    case Intrinsic::x86_xop_vpcomfalseb:
-    case Intrinsic::x86_xop_vpcomfalsew:
-    case Intrinsic::x86_xop_vpcomfalsed:
-    case Intrinsic::x86_xop_vpcomfalseq:
-      CC = 6;
-      Opc = X86ISD::VPCOM;
-      break;
-    case Intrinsic::x86_xop_vpcomfalseub:
-    case Intrinsic::x86_xop_vpcomfalseuw:
-    case Intrinsic::x86_xop_vpcomfalseud:
-    case Intrinsic::x86_xop_vpcomfalseuq:
-      CC = 6;
-      Opc = X86ISD::VPCOMU;
-      break;
-    case Intrinsic::x86_xop_vpcomtrueb:
-    case Intrinsic::x86_xop_vpcomtruew:
-    case Intrinsic::x86_xop_vpcomtrued:
-    case Intrinsic::x86_xop_vpcomtrueq:
-      CC = 7;
-      Opc = X86ISD::VPCOM;
-      break;
-    case Intrinsic::x86_xop_vpcomtrueub:
-    case Intrinsic::x86_xop_vpcomtrueuw:
-    case Intrinsic::x86_xop_vpcomtrueud:
-    case Intrinsic::x86_xop_vpcomtrueuq:
-      CC = 7;
-      Opc = X86ISD::VPCOMU;
-      break;
-    }
-
-    SDValue LHS = Op.getOperand(1);
-    SDValue RHS = Op.getOperand(2);
-    return DAG.getNode(Opc, dl, Op.getValueType(), LHS, RHS,
-                       DAG.getConstant(CC, MVT::i8));
-  }
  
    // Arithmetic intrinsics.
    case Intrinsic::x86_sse2_pmulu_dq:
    case Intrinsic::x86_avx2_pmulu_dq:
      return DAG.getNode(X86ISD::PMULUDQ, dl, Op.getValueType(),
                         Op.getOperand(1), Op.getOperand(2));
+
+  // SSE3/AVX horizontal add/sub intrinsics
    case Intrinsic::x86_sse3_hadd_ps:
    case Intrinsic::x86_sse3_hadd_pd:
    case Intrinsic::x86_avx_hadd_ps_256:
    case Intrinsic::x86_avx_hadd_pd_256:
-    return DAG.getNode(X86ISD::FHADD, dl, Op.getValueType(),
-                       Op.getOperand(1), Op.getOperand(2));
    case Intrinsic::x86_sse3_hsub_ps:
    case Intrinsic::x86_sse3_hsub_pd:
    case Intrinsic::x86_avx_hsub_ps_256:
    case Intrinsic::x86_avx_hsub_pd_256:
-    return DAG.getNode(X86ISD::FHSUB, dl, Op.getValueType(),
-                       Op.getOperand(1), Op.getOperand(2));
    case Intrinsic::x86_ssse3_phadd_w_128:
    case Intrinsic::x86_ssse3_phadd_d_128:
    case Intrinsic::x86_avx2_phadd_w:
    case Intrinsic::x86_avx2_phadd_d:
-    return DAG.getNode(X86ISD::HADD, dl, Op.getValueType(),
-                       Op.getOperand(1), Op.getOperand(2));
    case Intrinsic::x86_ssse3_phsub_w_128:
    case Intrinsic::x86_ssse3_phsub_d_128:
    case Intrinsic::x86_avx2_phsub_w:
-  case Intrinsic::x86_avx2_phsub_d:
-    return DAG.getNode(X86ISD::HSUB, dl, Op.getValueType(),
+  case Intrinsic::x86_avx2_phsub_d: {
+    unsigned Opcode;
+    switch (IntNo) {
+    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
+    case Intrinsic::x86_sse3_hadd_ps:
+    case Intrinsic::x86_sse3_hadd_pd:
+    case Intrinsic::x86_avx_hadd_ps_256:
+    case Intrinsic::x86_avx_hadd_pd_256:
+      Opcode = X86ISD::FHADD;
+      break;
+    case Intrinsic::x86_sse3_hsub_ps:
+    case Intrinsic::x86_sse3_hsub_pd:
+    case Intrinsic::x86_avx_hsub_ps_256:
+    case Intrinsic::x86_avx_hsub_pd_256:
+      Opcode = X86ISD::FHSUB;
+      break;
+    case Intrinsic::x86_ssse3_phadd_w_128:
+    case Intrinsic::x86_ssse3_phadd_d_128:
+    case Intrinsic::x86_avx2_phadd_w:
+    case Intrinsic::x86_avx2_phadd_d:
+      Opcode = X86ISD::HADD;
+      break;
+    case Intrinsic::x86_ssse3_phsub_w_128:
+    case Intrinsic::x86_ssse3_phsub_d_128:
+    case Intrinsic::x86_avx2_phsub_w:
+    case Intrinsic::x86_avx2_phsub_d:
+      Opcode = X86ISD::HSUB;
+      break;
+    }
+    return DAG.getNode(Opcode, dl, Op.getValueType(),
                         Op.getOperand(1), Op.getOperand(2));
+  }
+
+  // AVX2 variable shift intrinsics
    case Intrinsic::x86_avx2_psllv_d:
    case Intrinsic::x86_avx2_psllv_q:
    case Intrinsic::x86_avx2_psllv_d_256:
    case Intrinsic::x86_avx2_psllv_q_256:
-    return DAG.getNode(ISD::SHL, dl, Op.getValueType(),
-                      Op.getOperand(1), Op.getOperand(2));
    case Intrinsic::x86_avx2_psrlv_d:
    case Intrinsic::x86_avx2_psrlv_q:
    case Intrinsic::x86_avx2_psrlv_d_256:
    case Intrinsic::x86_avx2_psrlv_q_256:
-    return DAG.getNode(ISD::SRL, dl, Op.getValueType(),
-                      Op.getOperand(1), Op.getOperand(2));
    case Intrinsic::x86_avx2_psrav_d:
-  case Intrinsic::x86_avx2_psrav_d_256:
-    return DAG.getNode(ISD::SRA, dl, Op.getValueType(),
-                      Op.getOperand(1), Op.getOperand(2));
+  case Intrinsic::x86_avx2_psrav_d_256: {
+    unsigned Opcode;
+    switch (IntNo) {
+    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
+    case Intrinsic::x86_avx2_psllv_d:
+    case Intrinsic::x86_avx2_psllv_q:
+    case Intrinsic::x86_avx2_psllv_d_256:
+    case Intrinsic::x86_avx2_psllv_q_256:
+      Opcode = ISD::SHL;
+      break;
+    case Intrinsic::x86_avx2_psrlv_d:
+    case Intrinsic::x86_avx2_psrlv_q:
+    case Intrinsic::x86_avx2_psrlv_d_256:
+    case Intrinsic::x86_avx2_psrlv_q_256:
+      Opcode = ISD::SRL;
+      break;
+    case Intrinsic::x86_avx2_psrav_d:
+    case Intrinsic::x86_avx2_psrav_d_256:
+      Opcode = ISD::SRA;
+      break;
+    }
+    return DAG.getNode(Opcode, dl, Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2));
+  }
+
    case Intrinsic::x86_ssse3_pshuf_b_128:
    case Intrinsic::x86_avx2_pshuf_b:
      return DAG.getNode(X86ISD::PSHUFB, dl, Op.getValueType(),
                         Op.getOperand(1), Op.getOperand(2));
+
    case Intrinsic::x86_ssse3_psign_b_128:
    case Intrinsic::x86_ssse3_psign_w_128:
    case Intrinsic::x86_ssse3_psign_d_128:
@@ -9509,21 +9764,24 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const
    case Intrinsic::x86_avx2_psign_d:
      return DAG.getNode(X86ISD::PSIGN, dl, Op.getValueType(),
                         Op.getOperand(1), Op.getOperand(2));
+
    case Intrinsic::x86_sse41_insertps:
      return DAG.getNode(X86ISD::INSERTPS, dl, Op.getValueType(),
                         Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+
    case Intrinsic::x86_avx_vperm2f128_ps_256:
    case Intrinsic::x86_avx_vperm2f128_pd_256:
    case Intrinsic::x86_avx_vperm2f128_si_256:
    case Intrinsic::x86_avx2_vperm2i128:
      return DAG.getNode(X86ISD::VPERM2X128, dl, Op.getValueType(),
                         Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
-  case Intrinsic::x86_avx_vpermil_ps:
-  case Intrinsic::x86_avx_vpermil_pd:
-  case Intrinsic::x86_avx_vpermil_ps_256:
-  case Intrinsic::x86_avx_vpermil_pd_256:
-    return DAG.getNode(X86ISD::VPERMILP, dl, Op.getValueType(),
-                       Op.getOperand(1), Op.getOperand(2));
+
+  case Intrinsic::x86_avx2_permd:
+  case Intrinsic::x86_avx2_permps:
+    // Operands intentionally swapped. Mask is last operand to intrinsic,
+    // but second operand for node/intruction.
+    return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
+                       Op.getOperand(2), Op.getOperand(1));
  
    // ptest and testp intrinsics. The intrinsic these come from are designed to
    // return an integer value, not just an instruction so lower it to the ptest
@@ -9547,7 +9805,7 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const
    case Intrinsic::x86_avx_vtestc_pd_256:
    case Intrinsic::x86_avx_vtestnzc_pd_256: {
      bool IsTestPacked = false;
-    unsigned X86CC = 0;
+    unsigned X86CC;
      switch (IntNo) {
      default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
      case Intrinsic::x86_avx_vtestz_ps:
@@ -9598,99 +9856,274 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const
    case Intrinsic::x86_avx2_psll_w:
    case Intrinsic::x86_avx2_psll_d:
    case Intrinsic::x86_avx2_psll_q:
-    return DAG.getNode(X86ISD::VSHL, dl, Op.getValueType(),
-                       Op.getOperand(1), Op.getOperand(2));
    case Intrinsic::x86_sse2_psrl_w:
    case Intrinsic::x86_sse2_psrl_d:
    case Intrinsic::x86_sse2_psrl_q:
    case Intrinsic::x86_avx2_psrl_w:
    case Intrinsic::x86_avx2_psrl_d:
    case Intrinsic::x86_avx2_psrl_q:
-    return DAG.getNode(X86ISD::VSRL, dl, Op.getValueType(),
-                       Op.getOperand(1), Op.getOperand(2));
    case Intrinsic::x86_sse2_psra_w:
    case Intrinsic::x86_sse2_psra_d:
    case Intrinsic::x86_avx2_psra_w:
-  case Intrinsic::x86_avx2_psra_d:
-    return DAG.getNode(X86ISD::VSRA, dl, Op.getValueType(),
+  case Intrinsic::x86_avx2_psra_d: {
+    unsigned Opcode;
+    switch (IntNo) {
+    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
+    case Intrinsic::x86_sse2_psll_w:
+    case Intrinsic::x86_sse2_psll_d:
+    case Intrinsic::x86_sse2_psll_q:
+    case Intrinsic::x86_avx2_psll_w:
+    case Intrinsic::x86_avx2_psll_d:
+    case Intrinsic::x86_avx2_psll_q:
+      Opcode = X86ISD::VSHL;
+      break;
+    case Intrinsic::x86_sse2_psrl_w:
+    case Intrinsic::x86_sse2_psrl_d:
+    case Intrinsic::x86_sse2_psrl_q:
+    case Intrinsic::x86_avx2_psrl_w:
+    case Intrinsic::x86_avx2_psrl_d:
+    case Intrinsic::x86_avx2_psrl_q:
+      Opcode = X86ISD::VSRL;
+      break;
+    case Intrinsic::x86_sse2_psra_w:
+    case Intrinsic::x86_sse2_psra_d:
+    case Intrinsic::x86_avx2_psra_w:
+    case Intrinsic::x86_avx2_psra_d:
+      Opcode = X86ISD::VSRA;
+      break;
+    }
+    return DAG.getNode(Opcode, dl, Op.getValueType(),
                         Op.getOperand(1), Op.getOperand(2));
+  }
+
+  // SSE/AVX immediate shift intrinsics
    case Intrinsic::x86_sse2_pslli_w:
    case Intrinsic::x86_sse2_pslli_d:
    case Intrinsic::x86_sse2_pslli_q:
    case Intrinsic::x86_avx2_pslli_w:
    case Intrinsic::x86_avx2_pslli_d:
    case Intrinsic::x86_avx2_pslli_q:
-    return getTargetVShiftNode(X86ISD::VSHLI, dl, Op.getValueType(),
-                               Op.getOperand(1), Op.getOperand(2), DAG);
    case Intrinsic::x86_sse2_psrli_w:
    case Intrinsic::x86_sse2_psrli_d:
    case Intrinsic::x86_sse2_psrli_q:
    case Intrinsic::x86_avx2_psrli_w:
    case Intrinsic::x86_avx2_psrli_d:
    case Intrinsic::x86_avx2_psrli_q:
-    return getTargetVShiftNode(X86ISD::VSRLI, dl, Op.getValueType(),
-                               Op.getOperand(1), Op.getOperand(2), DAG);
    case Intrinsic::x86_sse2_psrai_w:
    case Intrinsic::x86_sse2_psrai_d:
    case Intrinsic::x86_avx2_psrai_w:
-  case Intrinsic::x86_avx2_psrai_d:
-    return getTargetVShiftNode(X86ISD::VSRAI, dl, Op.getValueType(),
+  case Intrinsic::x86_avx2_psrai_d: {
+    unsigned Opcode;
+    switch (IntNo) {
+    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
+    case Intrinsic::x86_sse2_pslli_w:
+    case Intrinsic::x86_sse2_pslli_d:
+    case Intrinsic::x86_sse2_pslli_q:
+    case Intrinsic::x86_avx2_pslli_w:
+    case Intrinsic::x86_avx2_pslli_d:
+    case Intrinsic::x86_avx2_pslli_q:
+      Opcode = X86ISD::VSHLI;
+      break;
+    case Intrinsic::x86_sse2_psrli_w:
+    case Intrinsic::x86_sse2_psrli_d:
+    case Intrinsic::x86_sse2_psrli_q:
+    case Intrinsic::x86_avx2_psrli_w:
+    case Intrinsic::x86_avx2_psrli_d:
+    case Intrinsic::x86_avx2_psrli_q:
+      Opcode = X86ISD::VSRLI;
+      break;
+    case Intrinsic::x86_sse2_psrai_w:
+    case Intrinsic::x86_sse2_psrai_d:
+    case Intrinsic::x86_avx2_psrai_w:
+    case Intrinsic::x86_avx2_psrai_d:
+      Opcode = X86ISD::VSRAI;
+      break;
+    }
+    return getTargetVShiftNode(Opcode, dl, Op.getValueType(),
                                 Op.getOperand(1), Op.getOperand(2), DAG);
-  // Fix vector shift instructions where the last operand is a non-immediate
-  // i32 value.
-  case Intrinsic::x86_mmx_pslli_w:
-  case Intrinsic::x86_mmx_pslli_d:
-  case Intrinsic::x86_mmx_pslli_q:
-  case Intrinsic::x86_mmx_psrli_w:
-  case Intrinsic::x86_mmx_psrli_d:
-  case Intrinsic::x86_mmx_psrli_q:
-  case Intrinsic::x86_mmx_psrai_w:
-  case Intrinsic::x86_mmx_psrai_d: {
-    SDValue ShAmt = Op.getOperand(2);
-    if (isa<ConstantSDNode>(ShAmt))
-      return SDValue();
+  }
  
-    unsigned NewIntNo = 0;
+  case Intrinsic::x86_sse42_pcmpistria128:
+  case Intrinsic::x86_sse42_pcmpestria128:
+  case Intrinsic::x86_sse42_pcmpistric128:
+  case Intrinsic::x86_sse42_pcmpestric128:
+  case Intrinsic::x86_sse42_pcmpistrio128:
+  case Intrinsic::x86_sse42_pcmpestrio128:
+  case Intrinsic::x86_sse42_pcmpistris128:
+  case Intrinsic::x86_sse42_pcmpestris128:
+  case Intrinsic::x86_sse42_pcmpistriz128:
+  case Intrinsic::x86_sse42_pcmpestriz128: {
+    unsigned Opcode;
+    unsigned X86CC;
      switch (IntNo) {
-    case Intrinsic::x86_mmx_pslli_w:
-      NewIntNo = Intrinsic::x86_mmx_psll_w;
+    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
+    case Intrinsic::x86_sse42_pcmpistria128:
+      Opcode = X86ISD::PCMPISTRI;
+      X86CC = X86::COND_A;
+      break;
+    case Intrinsic::x86_sse42_pcmpestria128:
+      Opcode = X86ISD::PCMPESTRI;
+      X86CC = X86::COND_A;
        break;
-    case Intrinsic::x86_mmx_pslli_d:
-      NewIntNo = Intrinsic::x86_mmx_psll_d;
+    case Intrinsic::x86_sse42_pcmpistric128:
+      Opcode = X86ISD::PCMPISTRI;
+      X86CC = X86::COND_B;
+      break;
+    case Intrinsic::x86_sse42_pcmpestric128:
+      Opcode = X86ISD::PCMPESTRI;
+      X86CC = X86::COND_B;
        break;
-    case Intrinsic::x86_mmx_pslli_q:
-      NewIntNo = Intrinsic::x86_mmx_psll_q;
+    case Intrinsic::x86_sse42_pcmpistrio128:
+      Opcode = X86ISD::PCMPISTRI;
+      X86CC = X86::COND_O;
        break;
-    case Intrinsic::x86_mmx_psrli_w:
-      NewIntNo = Intrinsic::x86_mmx_psrl_w;
+    case Intrinsic::x86_sse42_pcmpestrio128:
+      Opcode = X86ISD::PCMPESTRI;
+      X86CC = X86::COND_O;
        break;
-    case Intrinsic::x86_mmx_psrli_d:
-      NewIntNo = Intrinsic::x86_mmx_psrl_d;
+    case Intrinsic::x86_sse42_pcmpistris128:
+      Opcode = X86ISD::PCMPISTRI;
+      X86CC = X86::COND_S;
        break;
-    case Intrinsic::x86_mmx_psrli_q:
-      NewIntNo = Intrinsic::x86_mmx_psrl_q;
+    case Intrinsic::x86_sse42_pcmpestris128:
+      Opcode = X86ISD::PCMPESTRI;
+      X86CC = X86::COND_S;
        break;
-    case Intrinsic::x86_mmx_psrai_w:
-      NewIntNo = Intrinsic::x86_mmx_psra_w;
+    case Intrinsic::x86_sse42_pcmpistriz128:
+      Opcode = X86ISD::PCMPISTRI;
+      X86CC = X86::COND_E;
        break;
-    case Intrinsic::x86_mmx_psrai_d:
-      NewIntNo = Intrinsic::x86_mmx_psra_d;
+    case Intrinsic::x86_sse42_pcmpestriz128:
+      Opcode = X86ISD::PCMPESTRI;
+      X86CC = X86::COND_E;
        break;
+    }
+    SmallVector<SDValue, 5> NewOps;
+    NewOps.append(Op->op_begin()+1, Op->op_end());
+    SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
+    SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps.data(), NewOps.size());
+    SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+                                DAG.getConstant(X86CC, MVT::i8),
+                                SDValue(PCMP.getNode(), 1));
+    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
+  }
+
+  case Intrinsic::x86_sse42_pcmpistri128:
+  case Intrinsic::x86_sse42_pcmpestri128: {
+    unsigned Opcode;
+    if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
+      Opcode = X86ISD::PCMPISTRI;
+    else
+      Opcode = X86ISD::PCMPESTRI;
+
+    SmallVector<SDValue, 5> NewOps;
+    NewOps.append(Op->op_begin()+1, Op->op_end());
+    SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
+    return DAG.getNode(Opcode, dl, VTs, NewOps.data(), NewOps.size());
+  }
+  case Intrinsic::x86_fma_vfmadd_ps:
+  case Intrinsic::x86_fma_vfmadd_pd:
+  case Intrinsic::x86_fma_vfmsub_ps:
+  case Intrinsic::x86_fma_vfmsub_pd:
+  case Intrinsic::x86_fma_vfnmadd_ps:
+  case Intrinsic::x86_fma_vfnmadd_pd:
+  case Intrinsic::x86_fma_vfnmsub_ps:
+  case Intrinsic::x86_fma_vfnmsub_pd:
+  case Intrinsic::x86_fma_vfmaddsub_ps:
+  case Intrinsic::x86_fma_vfmaddsub_pd:
+  case Intrinsic::x86_fma_vfmsubadd_ps:
+  case Intrinsic::x86_fma_vfmsubadd_pd:
+  case Intrinsic::x86_fma_vfmadd_ps_256:
+  case Intrinsic::x86_fma_vfmadd_pd_256:
+  case Intrinsic::x86_fma_vfmsub_ps_256:
+  case Intrinsic::x86_fma_vfmsub_pd_256:
+  case Intrinsic::x86_fma_vfnmadd_ps_256:
+  case Intrinsic::x86_fma_vfnmadd_pd_256:
+  case Intrinsic::x86_fma_vfnmsub_ps_256:
+  case Intrinsic::x86_fma_vfnmsub_pd_256:
+  case Intrinsic::x86_fma_vfmaddsub_ps_256:
+  case Intrinsic::x86_fma_vfmaddsub_pd_256:
+  case Intrinsic::x86_fma_vfmsubadd_ps_256:
+  case Intrinsic::x86_fma_vfmsubadd_pd_256: {
+    // Only lower intrinsics if FMA is enabled. FMA4 still uses patterns.
+    if (!Subtarget->hasFMA())
+      return SDValue();
+
+    unsigned Opc;
+    switch (IntNo) {
      default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
+    case Intrinsic::x86_fma_vfmadd_ps:
+    case Intrinsic::x86_fma_vfmadd_pd:
+    case Intrinsic::x86_fma_vfmadd_ps_256:
+    case Intrinsic::x86_fma_vfmadd_pd_256:
+      Opc = X86ISD::FMADD;
+      break;
+    case Intrinsic::x86_fma_vfmsub_ps:
+    case Intrinsic::x86_fma_vfmsub_pd:
+    case Intrinsic::x86_fma_vfmsub_ps_256:
+    case Intrinsic::x86_fma_vfmsub_pd_256:
+      Opc = X86ISD::FMSUB;
+      break;
+    case Intrinsic::x86_fma_vfnmadd_ps:
+    case Intrinsic::x86_fma_vfnmadd_pd:
+    case Intrinsic::x86_fma_vfnmadd_ps_256:
+    case Intrinsic::x86_fma_vfnmadd_pd_256:
+      Opc = X86ISD::FNMADD;
+      break;
+    case Intrinsic::x86_fma_vfnmsub_ps:
+    case Intrinsic::x86_fma_vfnmsub_pd:
+    case Intrinsic::x86_fma_vfnmsub_ps_256:
+    case Intrinsic::x86_fma_vfnmsub_pd_256:
+      Opc = X86ISD::FNMSUB;
+      break;
+    case Intrinsic::x86_fma_vfmaddsub_ps:
+    case Intrinsic::x86_fma_vfmaddsub_pd:
+    case Intrinsic::x86_fma_vfmaddsub_ps_256:
+    case Intrinsic::x86_fma_vfmaddsub_pd_256:
+      Opc = X86ISD::FMADDSUB;
+      break;
+    case Intrinsic::x86_fma_vfmsubadd_ps:
+    case Intrinsic::x86_fma_vfmsubadd_pd:
+    case Intrinsic::x86_fma_vfmsubadd_ps_256:
+    case Intrinsic::x86_fma_vfmsubadd_pd_256:
+      Opc = X86ISD::FMSUBADD;
+      break;
      }
  
-    // The vector shift intrinsics with scalars uses 32b shift amounts but
-    // the sse2/mmx shift instructions reads 64 bits. Set the upper 32 bits
-    // to be zero.
-    ShAmt =  DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, ShAmt,
-                         DAG.getConstant(0, MVT::i32));
-// FIXME this must be lowered to get rid of the invalid type.
+    return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
+                       Op.getOperand(2), Op.getOperand(3));
+  }
+  }
+}
  
-    EVT VT = Op.getValueType();
-    ShAmt = DAG.getNode(ISD::BITCAST, dl, VT, ShAmt);
-    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
-                       DAG.getConstant(NewIntNo, MVT::i32),
-                       Op.getOperand(1), ShAmt);
+SDValue
+X86TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const {
+  DebugLoc dl = Op.getDebugLoc();
+  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+  switch (IntNo) {
+  default: return SDValue();    // Don't custom lower most intrinsics.
+
+  // RDRAND intrinsics.
+  case Intrinsic::x86_rdrand_16:
+  case Intrinsic::x86_rdrand_32:
+  case Intrinsic::x86_rdrand_64: {
+    // Emit the node with the right value type.
+    SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other);
+    SDValue Result = DAG.getNode(X86ISD::RDRAND, dl, VTs, Op.getOperand(0));
+
+    // If the value returned by RDRAND was valid (CF=1), return 1. Otherwise
+    // return the value from Rand, which is always 0, casted to i32.
+    SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
+                      DAG.getConstant(1, Op->getValueType(1)),
+                      DAG.getConstant(X86::COND_B, MVT::i32),
+                      SDValue(Result.getNode(), 1) };
+    SDValue isValid = DAG.getNode(X86ISD::CMOV, dl,
+                                  DAG.getVTList(Op->getValueType(1), MVT::Glue),
+                                  Ops, 4);
+
+    // Return { result, isValid, chain }.
+    return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
+                       SDValue(Result.getNode(), 2));
    }
    }
  }
@@ -9742,7 +10175,6 @@ SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
  }
  
  SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
-  MachineFunction &MF = DAG.getMachineFunction();
    SDValue Chain     = Op.getOperand(0);
    SDValue Offset    = Op.getOperand(1);
    SDValue Handler   = Op.getOperand(2);
@@ -9759,7 +10191,6 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
    Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(),
                         false, false, 0);
    Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
-  MF.getRegInfo().addLiveOut(StoreAddrReg);
  
    return DAG.getNode(X86ISD::EH_RETURN, dl,
                       MVT::Other,
@@ -10075,23 +10506,21 @@ SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
  static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
    EVT VT = Op.getValueType();
  
-  assert(VT.getSizeInBits() == 256 && VT.isInteger() &&
+  assert(VT.is256BitVector() && VT.isInteger() &&
           "Unsupported value type for operation");
  
-  int NumElems = VT.getVectorNumElements();
+  unsigned NumElems = VT.getVectorNumElements();
    DebugLoc dl = Op.getDebugLoc();
-  SDValue Idx0 = DAG.getConstant(0, MVT::i32);
-  SDValue Idx1 = DAG.getConstant(NumElems/2, MVT::i32);
  
    // Extract the LHS vectors
    SDValue LHS = Op.getOperand(0);
-  SDValue LHS1 = Extract128BitVector(LHS, Idx0, DAG, dl);
-  SDValue LHS2 = Extract128BitVector(LHS, Idx1, DAG, dl);
+  SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
+  SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
  
    // Extract the RHS vectors
    SDValue RHS = Op.getOperand(1);
-  SDValue RHS1 = Extract128BitVector(RHS, Idx0, DAG, dl);
-  SDValue RHS2 = Extract128BitVector(RHS, Idx1, DAG, dl);
+  SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl);
+  SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);
  
    MVT EltVT = VT.getVectorElementType().getSimpleVT();
    EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
@@ -10102,14 +10531,14 @@ static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
  }
  
  SDValue X86TargetLowering::LowerADD(SDValue Op, SelectionDAG &DAG) const {
-  assert(Op.getValueType().getSizeInBits() == 256 &&
+  assert(Op.getValueType().is256BitVector() &&
           Op.getValueType().isInteger() &&
           "Only handle AVX 256-bit vector integer operation");
    return Lower256IntArith(Op, DAG);
  }
  
  SDValue X86TargetLowering::LowerSUB(SDValue Op, SelectionDAG &DAG) const {
-  assert(Op.getValueType().getSizeInBits() == 256 &&
+  assert(Op.getValueType().is256BitVector() &&
           Op.getValueType().isInteger() &&
           "Only handle AVX 256-bit vector integer operation");
    return Lower256IntArith(Op, DAG);
@@ -10119,7 +10548,7 @@ SDValue X86TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
    EVT VT = Op.getValueType();
  
    // Decompose 256-bit ops into smaller 128-bit ops.
-  if (VT.getSizeInBits() == 256 && !Subtarget->hasAVX2())
+  if (VT.is256BitVector() && !Subtarget->hasAVX2())
      return Lower256IntArith(Op, DAG);
  
    assert((VT == MVT::v2i64 || VT == MVT::v4i64) &&
@@ -10236,6 +10665,7 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
            Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
            return Res;
          }
+        llvm_unreachable("Unknown shift opcode.");
        }
  
        if (Subtarget->hasAVX2() && VT == MVT::v32i8) {
@@ -10279,6 +10709,7 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
            Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
            return Res;
          }
+        llvm_unreachable("Unknown shift opcode.");
        }
      }
    }
@@ -10347,15 +10778,14 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
    }
  
    // Decompose 256-bit shifts into smaller 128-bit shifts.
-  if (VT.getSizeInBits() == 256) {
+  if (VT.is256BitVector()) {
      unsigned NumElems = VT.getVectorNumElements();
      MVT EltVT = VT.getVectorElementType().getSimpleVT();
      EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
  
      // Extract the two vectors
-    SDValue V1 = Extract128BitVector(R, DAG.getConstant(0, MVT::i32), DAG, dl);
-    SDValue V2 = Extract128BitVector(R, DAG.getConstant(NumElems/2, MVT::i32),
-                                     DAG, dl);
+    SDValue V1 = Extract128BitVector(R, 0, DAG, dl);
+    SDValue V2 = Extract128BitVector(R, NumElems/2, DAG, dl);
  
      // Recreate the shift amount vectors
      SDValue Amt1, Amt2;
@@ -10374,9 +10804,8 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
                                   &Amt2Csts[0], NumElems/2);
      } else {
        // Variable shift amount
-      Amt1 = Extract128BitVector(Amt, DAG.getConstant(0, MVT::i32), DAG, dl);
-      Amt2 = Extract128BitVector(Amt, DAG.getConstant(NumElems/2, MVT::i32),
-                                 DAG, dl);
+      Amt1 = Extract128BitVector(Amt, 0, DAG, dl);
+      Amt2 = Extract128BitVector(Amt, NumElems/2, DAG, dl);
      }
  
      // Issue new vector shifts for the smaller types
@@ -10486,20 +10915,18 @@ SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
          return SDValue();
        if (!Subtarget->hasAVX2()) {
          // needs to be split
-        int NumElems = VT.getVectorNumElements();
-        SDValue Idx0 = DAG.getConstant(0, MVT::i32);
-        SDValue Idx1 = DAG.getConstant(NumElems/2, MVT::i32);
+        unsigned NumElems = VT.getVectorNumElements();
  
          // Extract the LHS vectors
          SDValue LHS = Op.getOperand(0);
-        SDValue LHS1 = Extract128BitVector(LHS, Idx0, DAG, dl);
-        SDValue LHS2 = Extract128BitVector(LHS, Idx1, DAG, dl);
+        SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
+        SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
  
          MVT EltVT = VT.getVectorElementType().getSimpleVT();
          EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
  
          EVT ExtraEltVT = ExtraVT.getVectorElementType();
-        int ExtraNumElems = ExtraVT.getVectorNumElements();
+        unsigned ExtraNumElems = ExtraVT.getVectorNumElements();
          ExtraVT = EVT::getVectorVT(*DAG.getContext(), ExtraEltVT,
                                     ExtraNumElems/2);
          SDValue Extra = DAG.getValueType(ExtraVT);
@@ -10785,6 +11212,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
    case ISD::VAARG:              return LowerVAARG(Op, DAG);
    case ISD::VACOPY:             return LowerVACOPY(Op, DAG);
    case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+  case ISD::INTRINSIC_W_CHAIN:  return LowerINTRINSIC_W_CHAIN(Op, DAG);
    case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
    case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
    case ISD::FRAME_TO_ARGS_OFFSET:
@@ -10839,9 +11267,9 @@ static void ReplaceATOMIC_LOAD(SDNode *Node,
    Results.push_back(Swap.getValue(1));
  }
  
-void X86TargetLowering::
+static void
  ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results,
-                        SelectionDAG &DAG, unsigned NewOp) const {
+                        SelectionDAG &DAG, unsigned NewOp) {
    DebugLoc dl = Node->getDebugLoc();
    assert (Node->getValueType(0) == MVT::i64 &&
            "Only know how to expand i64 atomics");
@@ -10939,7 +11367,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
                                 Regs64bit ? X86::RBX : X86::EBX,
                                 swapInL, cpInH.getValue(1));
      swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl,
-                               Regs64bit ? X86::RCX : X86::ECX, 
+                               Regs64bit ? X86::RCX : X86::ECX,
                                 swapInH, swapInL.getValue(1));
      SDValue Ops[] = { swapInH.getValue(0),
                        N->getOperand(1),
@@ -10962,26 +11390,40 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
      return;
    }
    case ISD::ATOMIC_LOAD_ADD:
-    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMADD64_DAG);
-    return;
    case ISD::ATOMIC_LOAD_AND:
-    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMAND64_DAG);
-    return;
    case ISD::ATOMIC_LOAD_NAND:
-    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMNAND64_DAG);
-    return;
    case ISD::ATOMIC_LOAD_OR:
-    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMOR64_DAG);
-    return;
    case ISD::ATOMIC_LOAD_SUB:
-    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSUB64_DAG);
-    return;
    case ISD::ATOMIC_LOAD_XOR:
-    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMXOR64_DAG);
-    return;
-  case ISD::ATOMIC_SWAP:
-    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSWAP64_DAG);
+  case ISD::ATOMIC_SWAP: {
+    unsigned Opc;
+    switch (N->getOpcode()) {
+    default: llvm_unreachable("Unexpected opcode");
+    case ISD::ATOMIC_LOAD_ADD:
+      Opc = X86ISD::ATOMADD64_DAG;
+      break;
+    case ISD::ATOMIC_LOAD_AND:
+      Opc = X86ISD::ATOMAND64_DAG;
+      break;
+    case ISD::ATOMIC_LOAD_NAND:
+      Opc = X86ISD::ATOMNAND64_DAG;
+      break;
+    case ISD::ATOMIC_LOAD_OR:
+      Opc = X86ISD::ATOMOR64_DAG;
+      break;
+    case ISD::ATOMIC_LOAD_SUB:
+      Opc = X86ISD::ATOMSUB64_DAG;
+      break;
+    case ISD::ATOMIC_LOAD_XOR:
+      Opc = X86ISD::ATOMXOR64_DAG;
+      break;
+    case ISD::ATOMIC_SWAP:
+      Opc = X86ISD::ATOMSWAP64_DAG;
+      break;
+    }
+    ReplaceATOMIC_BINARY_64(N, Results, DAG, Opc);
      return;
+  }
    case ISD::ATOMIC_LOAD:
      ReplaceATOMIC_LOAD(N, Results, DAG);
    }
@@ -11032,19 +11474,26 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
    case X86ISD::ANDNP:              return "X86ISD::ANDNP";
    case X86ISD::PSIGN:              return "X86ISD::PSIGN";
    case X86ISD::BLENDV:             return "X86ISD::BLENDV";
+  case X86ISD::BLENDPW:            return "X86ISD::BLENDPW";
+  case X86ISD::BLENDPS:            return "X86ISD::BLENDPS";
+  case X86ISD::BLENDPD:            return "X86ISD::BLENDPD";
    case X86ISD::HADD:               return "X86ISD::HADD";
    case X86ISD::HSUB:               return "X86ISD::HSUB";
    case X86ISD::FHADD:              return "X86ISD::FHADD";
    case X86ISD::FHSUB:              return "X86ISD::FHSUB";
    case X86ISD::FMAX:               return "X86ISD::FMAX";
    case X86ISD::FMIN:               return "X86ISD::FMIN";
+  case X86ISD::FMAXC:              return "X86ISD::FMAXC";
+  case X86ISD::FMINC:              return "X86ISD::FMINC";
    case X86ISD::FRSQRT:             return "X86ISD::FRSQRT";
    case X86ISD::FRCP:               return "X86ISD::FRCP";
    case X86ISD::TLSADDR:            return "X86ISD::TLSADDR";
+  case X86ISD::TLSBASEADDR:        return "X86ISD::TLSBASEADDR";
    case X86ISD::TLSCALL:            return "X86ISD::TLSCALL";
    case X86ISD::EH_RETURN:          return "X86ISD::EH_RETURN";
    case X86ISD::TC_RETURN:          return "X86ISD::TC_RETURN";
    case X86ISD::FNSTCW16m:          return "X86ISD::FNSTCW16m";
+  case X86ISD::FNSTSW16r:          return "X86ISD::FNSTSW16r";
    case X86ISD::LCMPXCHG_DAG:       return "X86ISD::LCMPXCHG_DAG";
    case X86ISD::LCMPXCHG8_DAG:      return "X86ISD::LCMPXCHG8_DAG";
    case X86ISD::ATOMADD64_DAG:      return "X86ISD::ATOMADD64_DAG";
@@ -11054,7 +11503,9 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
    case X86ISD::ATOMAND64_DAG:      return "X86ISD::ATOMAND64_DAG";
    case X86ISD::ATOMNAND64_DAG:     return "X86ISD::ATOMNAND64_DAG";
    case X86ISD::VZEXT_MOVL:         return "X86ISD::VZEXT_MOVL";
+  case X86ISD::VSEXT_MOVL:         return "X86ISD::VSEXT_MOVL";
    case X86ISD::VZEXT_LOAD:         return "X86ISD::VZEXT_LOAD";
+  case X86ISD::VFPEXT:             return "X86ISD::VFPEXT";
    case X86ISD::VSHLDQ:             return "X86ISD::VSHLDQ";
    case X86ISD::VSRLDQ:             return "X86ISD::VSRLDQ";
    case X86ISD::VSHL:               return "X86ISD::VSHL";
@@ -11104,6 +11555,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
    case X86ISD::VBROADCAST:         return "X86ISD::VBROADCAST";
    case X86ISD::VPERMILP:           return "X86ISD::VPERMILP";
    case X86ISD::VPERM2X128:         return "X86ISD::VPERM2X128";
+  case X86ISD::VPERMV:             return "X86ISD::VPERMV";
+  case X86ISD::VPERMI:             return "X86ISD::VPERMI";
    case X86ISD::PMULUDQ:            return "X86ISD::PMULUDQ";
    case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
    case X86ISD::VAARG_64:           return "X86ISD::VAARG_64";
@@ -11111,6 +11564,14 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
    case X86ISD::MEMBARRIER:         return "X86ISD::MEMBARRIER";
    case X86ISD::SEG_ALLOCA:         return "X86ISD::SEG_ALLOCA";
    case X86ISD::WIN_FTOL:           return "X86ISD::WIN_FTOL";
+  case X86ISD::SAHF:               return "X86ISD::SAHF";
+  case X86ISD::RDRAND:             return "X86ISD::RDRAND";
+  case X86ISD::FMADD:              return "X86ISD::FMADD";
+  case X86ISD::FMSUB:              return "X86ISD::FMSUB";
+  case X86ISD::FNMADD:             return "X86ISD::FNMADD";
+  case X86ISD::FNMSUB:             return "X86ISD::FNMSUB";
+  case X86ISD::FMADDSUB:           return "X86ISD::FMADDSUB";
+  case X86ISD::FMSUBADD:           return "X86ISD::FMSUBADD";
    }
  }
  
@@ -11179,6 +11640,15 @@ bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
    return true;
  }
  
+bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
+  return Imm == (int32_t)Imm;
+}
+
+bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
+  // Can also use sub to handle negated immediates.
+  return Imm == (int32_t)Imm;
+}
+
  bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
    if (!VT1.isInteger() || !VT2.isInteger())
      return false;
@@ -11221,8 +11691,8 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
            isMOVLMask(M, VT) ||
            isSHUFPMask(M, VT, Subtarget->hasAVX()) ||
            isPSHUFDMask(M, VT) ||
-          isPSHUFHWMask(M, VT) ||
-          isPSHUFLWMask(M, VT) ||
+          isPSHUFHWMask(M, VT, Subtarget->hasAVX2()) ||
+          isPSHUFLWMask(M, VT, Subtarget->hasAVX2()) ||
            isPALIGNRMask(M, VT, Subtarget) ||
            isUNPCKLMask(M, VT, Subtarget->hasAVX2()) ||
            isUNPCKHMask(M, VT, Subtarget->hasAVX2()) ||
@@ -11237,7 +11707,7 @@ X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
    // FIXME: This collection of masks seems suspect.
    if (NumElts == 2)
      return true;
-  if (NumElts == 4 && VT.getSizeInBits() == 128) {
+  if (NumElts == 4 && VT.is128BitVector()) {
      return (isMOVLMask(Mask, VT)  ||
              isCommutedMOVLMask(Mask, VT, true) ||
              isSHUFPMask(Mask, VT, Subtarget->hasAVX()) ||
@@ -11261,14 +11731,15 @@ X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr,
                                                         unsigned notOpc,
                                                         unsigned EAXreg,
                                                   const TargetRegisterClass *RC,
-                                                       bool invSrc) const {
+                                                       bool Invert) const {
    // For the atomic bitwise operator, we generate
    //   thisMBB:
    //   newMBB:
    //     ld  t1 = [bitinstr.addr]
    //     op  t2 = t1, [bitinstr.val]
+  //     not t3 = t2  (if Invert)
    //     mov EAX = t1
-  //     lcs dest = [bitinstr.addr], t2  [EAX is implicit]
+  //     lcs dest = [bitinstr.addr], t3  [EAX is implicit]
    //     bz  newMBB
    //     fallthrough -->nextMBB
    const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
@@ -11316,13 +11787,6 @@ X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr,
    for (int i=0; i <= lastAddrIndx; ++i)
      (*MIB).addOperand(*argOpers[i]);
  
-  unsigned tt = F->getRegInfo().createVirtualRegister(RC);
-  if (invSrc) {
-    MIB = BuildMI(newMBB, dl, TII->get(notOpc), tt).addReg(t1);
-  }
-  else
-    tt = t1;
-
    unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
    assert((argOpers[valArgIndx]->isReg() ||
            argOpers[valArgIndx]->isImm()) &&
@@ -11331,16 +11795,23 @@ X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr,
      MIB = BuildMI(newMBB, dl, TII->get(regOpc), t2);
    else
      MIB = BuildMI(newMBB, dl, TII->get(immOpc), t2);
-  MIB.addReg(tt);
+  MIB.addReg(t1);
    (*MIB).addOperand(*argOpers[valArgIndx]);
  
+  unsigned t3 = F->getRegInfo().createVirtualRegister(RC);
+  if (Invert) {
+    MIB = BuildMI(newMBB, dl, TII->get(notOpc), t3).addReg(t2);
+  }
+  else
+    t3 = t2;
+
    MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), EAXreg);
    MIB.addReg(t1);
  
    MIB = BuildMI(newMBB, dl, TII->get(CXchgOpc));
    for (int i=0; i <= lastAddrIndx; ++i)
      (*MIB).addOperand(*argOpers[i]);
-  MIB.addReg(t2);
+  MIB.addReg(t3);
    assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand");
    (*MIB).setMemRefs(bInstr->memoperands_begin(),
                      bInstr->memoperands_end());
@@ -11363,7 +11834,7 @@ X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr,
                                                         unsigned regOpcH,
                                                         unsigned immOpcL,
                                                         unsigned immOpcH,
-                                                       bool invSrc) const {
+                                                       bool Invert) const {
    // For the atomic bitwise operator, we generate
    //   thisMBB (instructions are in pairs, except cmpxchg8b)
    //     ld t1,t2 = [bitinstr.addr]
@@ -11371,6 +11842,7 @@ X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr,
    //     out1, out2 = phi (thisMBB, t1/t2) (newMBB, t3/t4)
    //     op  t5, t6 <- out1, out2, [bitinstr.val]
    //      (for SWAP, substitute:  mov t5, t6 <- [bitinstr.val])
+  //     neg t7, t8 < t5, t6  (if Invert)
    //     mov ECX, EBX <- t5, t6
    //     mov EAX, EDX <- t1, t2
    //     cmpxchg8b [bitinstr.addr]  [EAX, EDX, EBX, ECX implicit]
@@ -11379,7 +11851,7 @@ X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr,
    //     result in out1, out2
    //     fallthrough -->nextMBB
  
-  const TargetRegisterClass *RC = X86::GR32RegisterClass;
+  const TargetRegisterClass *RC = &X86::GR32RegClass;
    const unsigned LoadOpc = X86::MOV32rm;
    const unsigned NotOpc = X86::NOT32r;
    const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
@@ -11454,16 +11926,9 @@ X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr,
      .addReg(t2).addMBB(thisMBB).addReg(t4).addMBB(newMBB);
  
    // The subsequent operations should be using the destination registers of
-  //the PHI instructions.
-  if (invSrc) {
-    t1 = F->getRegInfo().createVirtualRegister(RC);
-    t2 = F->getRegInfo().createVirtualRegister(RC);
-    MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t1).addReg(dest1Oper.getReg());
-    MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t2).addReg(dest2Oper.getReg());
-  } else {
-    t1 = dest1Oper.getReg();
-    t2 = dest2Oper.getReg();
-  }
+  // the PHI instructions.
+  t1 = dest1Oper.getReg();
+  t2 = dest2Oper.getReg();
  
    int valArgIndx = lastAddrIndx + 1;
    assert((argOpers[valArgIndx]->isReg() ||
@@ -11490,15 +11955,26 @@ X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr,
      MIB.addReg(t2);
    (*MIB).addOperand(*argOpers[valArgIndx + 1]);
  
+  unsigned t7, t8;
+  if (Invert) {
+    t7 = F->getRegInfo().createVirtualRegister(RC);
+    t8 = F->getRegInfo().createVirtualRegister(RC);
+    MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t7).addReg(t5);
+    MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t8).addReg(t6);
+  } else {
+    t7 = t5;
+    t8 = t6;
+  }
+
    MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX);
    MIB.addReg(t1);
    MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EDX);
    MIB.addReg(t2);
  
    MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EBX);
-  MIB.addReg(t5);
+  MIB.addReg(t7);
    MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::ECX);
-  MIB.addReg(t6);
+  MIB.addReg(t8);
  
    MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG8B));
    for (int i=0; i <= lastAddrIndx; ++i)
@@ -11577,7 +12053,7 @@ X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr,
    int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3]
    int valArgIndx = lastAddrIndx + 1;
  
-  unsigned t1 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
+  unsigned t1 = F->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
    MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rm), t1);
    for (int i=0; i <= lastAddrIndx; ++i)
      (*MIB).addOperand(*argOpers[i]);
@@ -11587,7 +12063,7 @@ X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr,
            argOpers[valArgIndx]->isImm()) &&
           "invalid operand");
  
-  unsigned t2 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
+  unsigned t2 = F->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
    if (argOpers[valArgIndx]->isReg())
      MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t2);
    else
@@ -11602,7 +12078,7 @@ X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr,
    MIB.addReg(t2);
  
    // Generate movc
-  unsigned t3 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
+  unsigned t3 = F->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
    MIB = BuildMI(newMBB, dl, TII->get(cmovOpc),t3);
    MIB.addReg(t2);
    MIB.addReg(t1);
@@ -11657,8 +12133,7 @@ X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB,
        MIB.addOperand(Op);
    }
    BuildMI(*BB, MI, dl,
-    TII->get(Subtarget->hasAVX() ? X86::VMOVAPSrr : X86::MOVAPSrr),
-             MI->getOperand(0).getReg())
+    TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
      .addReg(X86::XMM0);
  
    MI->eraseFromParent();
@@ -11690,24 +12165,6 @@ X86TargetLowering::EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB) const {
    return BB;
  }
  
-MachineBasicBlock *
-X86TargetLowering::EmitMwait(MachineInstr *MI, MachineBasicBlock *BB) const {
-  DebugLoc dl = MI->getDebugLoc();
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
-
-  // First arg in ECX, the second in EAX.
-  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
-    .addReg(MI->getOperand(0).getReg());
-  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX)
-    .addReg(MI->getOperand(1).getReg());
-
-  // The instruction doesn't actually take any operands though.
-  BuildMI(*BB, MI, dl, TII->get(X86::MWAITrr));
-
-  MI->eraseFromParent(); // The pseudo is gone now.
-  return BB;
-}
-
  MachineBasicBlock *
  X86TargetLowering::EmitVAARG64WithCustomInserter(
                     MachineInstr *MI,
@@ -12221,8 +12678,9 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB,
      BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
        .addReg(sizeVReg);
      BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
-      .addExternalSymbol("__morestack_allocate_stack_space").addReg(X86::RDI)
+      .addExternalSymbol("__morestack_allocate_stack_space")
        .addRegMask(RegMask)
+      .addReg(X86::RDI, RegState::Implicit)
        .addReg(X86::RAX, RegState::ImplicitDefine);
    } else {
      BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
@@ -12432,7 +12890,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
  
      // Load the old value of the high byte of the control word...
      unsigned OldCW =
-      F->getRegInfo().createVirtualRegister(X86::GR16RegisterClass);
+      F->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
      addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
                        CWFrameIdx);
  
@@ -12497,185 +12955,208 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
      // String/text processing lowering.
    case X86::PCMPISTRM128REG:
    case X86::VPCMPISTRM128REG:
-    return EmitPCMP(MI, BB, 3, false /* in-mem */);
    case X86::PCMPISTRM128MEM:
    case X86::VPCMPISTRM128MEM:
-    return EmitPCMP(MI, BB, 3, true /* in-mem */);
    case X86::PCMPESTRM128REG:
    case X86::VPCMPESTRM128REG:
-    return EmitPCMP(MI, BB, 5, false /* in mem */);
    case X86::PCMPESTRM128MEM:
-  case X86::VPCMPESTRM128MEM:
-    return EmitPCMP(MI, BB, 5, true /* in mem */);
+  case X86::VPCMPESTRM128MEM: {
+    unsigned NumArgs;
+    bool MemArg;
+    switch (MI->getOpcode()) {
+    default: llvm_unreachable("illegal opcode!");
+    case X86::PCMPISTRM128REG:
+    case X86::VPCMPISTRM128REG:
+      NumArgs = 3; MemArg = false; break;
+    case X86::PCMPISTRM128MEM:
+    case X86::VPCMPISTRM128MEM:
+      NumArgs = 3; MemArg = true; break;
+    case X86::PCMPESTRM128REG:
+    case X86::VPCMPESTRM128REG:
+      NumArgs = 5; MemArg = false; break;
+    case X86::PCMPESTRM128MEM:
+    case X86::VPCMPESTRM128MEM:
+      NumArgs = 5; MemArg = true; break;
+    }
+    return EmitPCMP(MI, BB, NumArgs, MemArg);
+  }
  
      // Thread synchronization.
    case X86::MONITOR:
      return EmitMonitor(MI, BB);
-  case X86::MWAIT:
-    return EmitMwait(MI, BB);
  
      // Atomic Lowering.
-  case X86::ATOMAND32:
-    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr,
-                                               X86::AND32ri, X86::MOV32rm,
-                                               X86::LCMPXCHG32,
-                                               X86::NOT32r, X86::EAX,
-                                               X86::GR32RegisterClass);
-  case X86::ATOMOR32:
-    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR32rr,
-                                               X86::OR32ri, X86::MOV32rm,
-                                               X86::LCMPXCHG32,
-                                               X86::NOT32r, X86::EAX,
-                                               X86::GR32RegisterClass);
-  case X86::ATOMXOR32:
-    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR32rr,
-                                               X86::XOR32ri, X86::MOV32rm,
-                                               X86::LCMPXCHG32,
-                                               X86::NOT32r, X86::EAX,
-                                               X86::GR32RegisterClass);
-  case X86::ATOMNAND32:
-    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr,
-                                               X86::AND32ri, X86::MOV32rm,
-                                               X86::LCMPXCHG32,
-                                               X86::NOT32r, X86::EAX,
-                                               X86::GR32RegisterClass, true);
    case X86::ATOMMIN32:
-    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL32rr);
    case X86::ATOMMAX32:
-    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG32rr);
    case X86::ATOMUMIN32:
-    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB32rr);
    case X86::ATOMUMAX32:
-    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA32rr);
+  case X86::ATOMMIN16:
+  case X86::ATOMMAX16:
+  case X86::ATOMUMIN16:
+  case X86::ATOMUMAX16:
+  case X86::ATOMMIN64:
+  case X86::ATOMMAX64:
+  case X86::ATOMUMIN64:
+  case X86::ATOMUMAX64: {
+    unsigned Opc;
+    switch (MI->getOpcode()) {
+    default: llvm_unreachable("illegal opcode!");
+    case X86::ATOMMIN32:  Opc = X86::CMOVL32rr; break;
+    case X86::ATOMMAX32:  Opc = X86::CMOVG32rr; break;
+    case X86::ATOMUMIN32: Opc = X86::CMOVB32rr; break;
+    case X86::ATOMUMAX32: Opc = X86::CMOVA32rr; break;
+    case X86::ATOMMIN16:  Opc = X86::CMOVL16rr; break;
+    case X86::ATOMMAX16:  Opc = X86::CMOVG16rr; break;
+    case X86::ATOMUMIN16: Opc = X86::CMOVB16rr; break;
+    case X86::ATOMUMAX16: Opc = X86::CMOVA16rr; break;
+    case X86::ATOMMIN64:  Opc = X86::CMOVL64rr; break;
+    case X86::ATOMMAX64:  Opc = X86::CMOVG64rr; break;
+    case X86::ATOMUMIN64: Opc = X86::CMOVB64rr; break;
+    case X86::ATOMUMAX64: Opc = X86::CMOVA64rr; break;
+    // FIXME: There are no CMOV8 instructions; MIN/MAX need some other way.
+    }
+    return EmitAtomicMinMaxWithCustomInserter(MI, BB, Opc);
+  }
+
+  case X86::ATOMAND32:
+  case X86::ATOMOR32:
+  case X86::ATOMXOR32:
+  case X86::ATOMNAND32: {
+    bool Invert = false;
+    unsigned RegOpc, ImmOpc;
+    switch (MI->getOpcode()) {
+    default: llvm_unreachable("illegal opcode!");
+    case X86::ATOMAND32:
+      RegOpc = X86::AND32rr; ImmOpc = X86::AND32ri; break;
+    case X86::ATOMOR32:
+      RegOpc = X86::OR32rr;  ImmOpc = X86::OR32ri; break;
+    case X86::ATOMXOR32:
+      RegOpc = X86::XOR32rr; ImmOpc = X86::XOR32ri; break;
+    case X86::ATOMNAND32:
+      RegOpc = X86::AND32rr; ImmOpc = X86::AND32ri; Invert = true; break;
+    }
+    return EmitAtomicBitwiseWithCustomInserter(MI, BB, RegOpc, ImmOpc,
+                                               X86::MOV32rm, X86::LCMPXCHG32,
+                                               X86::NOT32r, X86::EAX,
+                                               &X86::GR32RegClass, Invert);
+  }
  
    case X86::ATOMAND16:
-    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr,
-                                               X86::AND16ri, X86::MOV16rm,
-                                               X86::LCMPXCHG16,
-                                               X86::NOT16r, X86::AX,
-                                               X86::GR16RegisterClass);
    case X86::ATOMOR16:
-    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR16rr,
-                                               X86::OR16ri, X86::MOV16rm,
-                                               X86::LCMPXCHG16,
-                                               X86::NOT16r, X86::AX,
-                                               X86::GR16RegisterClass);
    case X86::ATOMXOR16:
-    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR16rr,
-                                               X86::XOR16ri, X86::MOV16rm,
-                                               X86::LCMPXCHG16,
-                                               X86::NOT16r, X86::AX,
-                                               X86::GR16RegisterClass);
-  case X86::ATOMNAND16:
-    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr,
-                                               X86::AND16ri, X86::MOV16rm,
-                                               X86::LCMPXCHG16,
+  case X86::ATOMNAND16: {
+    bool Invert = false;
+    unsigned RegOpc, ImmOpc;
+    switch (MI->getOpcode()) {
+    default: llvm_unreachable("illegal opcode!");
+    case X86::ATOMAND16:
+      RegOpc = X86::AND16rr; ImmOpc = X86::AND16ri; break;
+    case X86::ATOMOR16:
+      RegOpc = X86::OR16rr;  ImmOpc = X86::OR16ri; break;
+    case X86::ATOMXOR16:
+      RegOpc = X86::XOR16rr; ImmOpc = X86::XOR16ri; break;
+    case X86::ATOMNAND16:
+      RegOpc = X86::AND16rr; ImmOpc = X86::AND16ri; Invert = true; break;
+    }
+    return EmitAtomicBitwiseWithCustomInserter(MI, BB, RegOpc, ImmOpc,
+                                               X86::MOV16rm, X86::LCMPXCHG16,
                                                 X86::NOT16r, X86::AX,
-                                               X86::GR16RegisterClass, true);
-  case X86::ATOMMIN16:
-    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL16rr);
-  case X86::ATOMMAX16:
-    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG16rr);
-  case X86::ATOMUMIN16:
-    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB16rr);
-  case X86::ATOMUMAX16:
-    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA16rr);
+                                               &X86::GR16RegClass, Invert);
+  }
  
    case X86::ATOMAND8:
-    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr,
-                                               X86::AND8ri, X86::MOV8rm,
-                                               X86::LCMPXCHG8,
-                                               X86::NOT8r, X86::AL,
-                                               X86::GR8RegisterClass);
    case X86::ATOMOR8:
-    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR8rr,
-                                               X86::OR8ri, X86::MOV8rm,
-                                               X86::LCMPXCHG8,
-                                               X86::NOT8r, X86::AL,
-                                               X86::GR8RegisterClass);
    case X86::ATOMXOR8:
-    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR8rr,
-                                               X86::XOR8ri, X86::MOV8rm,
-                                               X86::LCMPXCHG8,
-                                               X86::NOT8r, X86::AL,
-                                               X86::GR8RegisterClass);
-  case X86::ATOMNAND8:
-    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr,
-                                               X86::AND8ri, X86::MOV8rm,
-                                               X86::LCMPXCHG8,
+  case X86::ATOMNAND8: {
+    bool Invert = false;
+    unsigned RegOpc, ImmOpc;
+    switch (MI->getOpcode()) {
+    default: llvm_unreachable("illegal opcode!");
+    case X86::ATOMAND8:
+      RegOpc = X86::AND8rr; ImmOpc = X86::AND8ri; break;
+    case X86::ATOMOR8:
+      RegOpc = X86::OR8rr;  ImmOpc = X86::OR8ri; break;
+    case X86::ATOMXOR8:
+      RegOpc = X86::XOR8rr; ImmOpc = X86::XOR8ri; break;
+    case X86::ATOMNAND8:
+      RegOpc = X86::AND8rr; ImmOpc = X86::AND8ri; Invert = true; break;
+    }
+    return EmitAtomicBitwiseWithCustomInserter(MI, BB, RegOpc, ImmOpc,
+                                               X86::MOV8rm, X86::LCMPXCHG8,
                                                 X86::NOT8r, X86::AL,
-                                               X86::GR8RegisterClass, true);
-  // FIXME: There are no CMOV8 instructions; MIN/MAX need some other way.
+                                               &X86::GR8RegClass, Invert);
+  }
+
    // This group is for 64-bit host.
    case X86::ATOMAND64:
-    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr,
-                                               X86::AND64ri32, X86::MOV64rm,
-                                               X86::LCMPXCHG64,
-                                               X86::NOT64r, X86::RAX,
-                                               X86::GR64RegisterClass);
    case X86::ATOMOR64:
-    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR64rr,
-                                               X86::OR64ri32, X86::MOV64rm,
-                                               X86::LCMPXCHG64,
-                                               X86::NOT64r, X86::RAX,
-                                               X86::GR64RegisterClass);
    case X86::ATOMXOR64:
-    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR64rr,
-                                               X86::XOR64ri32, X86::MOV64rm,
-                                               X86::LCMPXCHG64,
-                                               X86::NOT64r, X86::RAX,
-                                               X86::GR64RegisterClass);
-  case X86::ATOMNAND64:
-    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr,
-                                               X86::AND64ri32, X86::MOV64rm,
-                                               X86::LCMPXCHG64,
+  case X86::ATOMNAND64: {
+    bool Invert = false;
+    unsigned RegOpc, ImmOpc;
+    switch (MI->getOpcode()) {
+    default: llvm_unreachable("illegal opcode!");
+    case X86::ATOMAND64:
+      RegOpc = X86::AND64rr; ImmOpc = X86::AND64ri32; break;
+    case X86::ATOMOR64:
+      RegOpc = X86::OR64rr;  ImmOpc = X86::OR64ri32; break;
+    case X86::ATOMXOR64:
+      RegOpc = X86::XOR64rr; ImmOpc = X86::XOR64ri32; break;
+    case X86::ATOMNAND64:
+      RegOpc = X86::AND64rr; ImmOpc = X86::AND64ri32; Invert = true; break;
+    }
+    return EmitAtomicBitwiseWithCustomInserter(MI, BB, RegOpc, ImmOpc,
+                                               X86::MOV64rm, X86::LCMPXCHG64,
                                                 X86::NOT64r, X86::RAX,
-                                               X86::GR64RegisterClass, true);
-  case X86::ATOMMIN64:
-    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL64rr);
-  case X86::ATOMMAX64:
-    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG64rr);
-  case X86::ATOMUMIN64:
-    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB64rr);
-  case X86::ATOMUMAX64:
-    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA64rr);
+                                               &X86::GR64RegClass, Invert);
+  }
  
    // This group does 64-bit operations on a 32-bit host.
    case X86::ATOMAND6432:
-    return EmitAtomicBit6432WithCustomInserter(MI, BB,
-                                               X86::AND32rr, X86::AND32rr,
-                                               X86::AND32ri, X86::AND32ri,
-                                               false);
    case X86::ATOMOR6432:
-    return EmitAtomicBit6432WithCustomInserter(MI, BB,
-                                               X86::OR32rr, X86::OR32rr,
-                                               X86::OR32ri, X86::OR32ri,
-                                               false);
    case X86::ATOMXOR6432:
-    return EmitAtomicBit6432WithCustomInserter(MI, BB,
-                                               X86::XOR32rr, X86::XOR32rr,
-                                               X86::XOR32ri, X86::XOR32ri,
-                                               false);
    case X86::ATOMNAND6432:
-    return EmitAtomicBit6432WithCustomInserter(MI, BB,
-                                               X86::AND32rr, X86::AND32rr,
-                                               X86::AND32ri, X86::AND32ri,
-                                               true);
    case X86::ATOMADD6432:
-    return EmitAtomicBit6432WithCustomInserter(MI, BB,
-                                               X86::ADD32rr, X86::ADC32rr,
-                                               X86::ADD32ri, X86::ADC32ri,
-                                               false);
    case X86::ATOMSUB6432:
-    return EmitAtomicBit6432WithCustomInserter(MI, BB,
-                                               X86::SUB32rr, X86::SBB32rr,
-                                               X86::SUB32ri, X86::SBB32ri,
-                                               false);
-  case X86::ATOMSWAP6432:
-    return EmitAtomicBit6432WithCustomInserter(MI, BB,
-                                               X86::MOV32rr, X86::MOV32rr,
-                                               X86::MOV32ri, X86::MOV32ri,
-                                               false);
+  case X86::ATOMSWAP6432: {
+    bool Invert = false;
+    unsigned RegOpcL, RegOpcH, ImmOpcL, ImmOpcH;
+    switch (MI->getOpcode()) {
+    default: llvm_unreachable("illegal opcode!");
+    case X86::ATOMAND6432:
+      RegOpcL = RegOpcH = X86::AND32rr;
+      ImmOpcL = ImmOpcH = X86::AND32ri;
+      break;
+    case X86::ATOMOR6432:
+      RegOpcL = RegOpcH = X86::OR32rr;
+      ImmOpcL = ImmOpcH = X86::OR32ri;
+      break;
+    case X86::ATOMXOR6432:
+      RegOpcL = RegOpcH = X86::XOR32rr;
+      ImmOpcL = ImmOpcH = X86::XOR32ri;
+      break;
+    case X86::ATOMNAND6432:
+      RegOpcL = RegOpcH = X86::AND32rr;
+      ImmOpcL = ImmOpcH = X86::AND32ri;
+      Invert = true;
+      break;
+    case X86::ATOMADD6432:
+      RegOpcL = X86::ADD32rr; RegOpcH = X86::ADC32rr;
+      ImmOpcL = X86::ADD32ri; ImmOpcH = X86::ADC32ri;
+      break;
+    case X86::ATOMSUB6432:
+      RegOpcL = X86::SUB32rr; RegOpcH = X86::SBB32rr;
+      ImmOpcL = X86::SUB32ri; ImmOpcH = X86::SBB32ri;
+      break;
+    case X86::ATOMSWAP6432:
+      RegOpcL = RegOpcH = X86::MOV32rr;
+      ImmOpcL = ImmOpcH = X86::MOV32ri;
+      break;
+    }
+    return EmitAtomicBit6432WithCustomInserter(MI, BB, RegOpcL, RegOpcH,
+                                               ImmOpcL, ImmOpcH, Invert);
+  }
+
    case X86::VASTART_SAVE_XMM_REGS:
      return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
  
@@ -12689,11 +13170,11 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
  //===----------------------------------------------------------------------===//
  
  void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
-                                                       const APInt &Mask,
                                                         APInt &KnownZero,
                                                         APInt &KnownOne,
                                                         const SelectionDAG &DAG,
                                                         unsigned Depth) const {
+  unsigned BitWidth = KnownZero.getBitWidth();
    unsigned Opc = Op.getOpcode();
    assert((Opc >= ISD::BUILTIN_OP_END ||
            Opc == ISD::INTRINSIC_WO_CHAIN ||
@@ -12702,7 +13183,7 @@ void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
           "Should use MaskedValueIsZero if you don't know whether Op"
           " is a target node!");
  
-  KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0);   // Don't know anything.
+  KnownZero = KnownOne = APInt(BitWidth, 0);   // Don't know anything.
    switch (Opc) {
    default: break;
    case X86ISD::ADD:
@@ -12721,8 +13202,7 @@ void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
        break;
      // Fallthrough
    case X86ISD::SETCC:
-    KnownZero |= APInt::getHighBitsSet(Mask.getBitWidth(),
-                                       Mask.getBitWidth() - 1);
+    KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
      break;
    case ISD::INTRINSIC_WO_CHAIN: {
      unsigned IntId = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
@@ -12747,8 +13227,7 @@ void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
          case Intrinsic::x86_sse2_pmovmskb_128:  NumLoBits = 16; break;
          case Intrinsic::x86_avx2_pmovmskb:      NumLoBits = 32; break;
        }
-      KnownZero = APInt::getHighBitsSet(Mask.getBitWidth(),
-                                        Mask.getBitWidth() - NumLoBits);
+      KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits);
        break;
      }
      }
@@ -12787,10 +13266,10 @@ bool X86TargetLowering::isGAPlusOffset(SDNode *N,
  /// inserting the result into the low part of a new 256-bit vector
  static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) {
    EVT VT = SVOp->getValueType(0);
-  int NumElems = VT.getVectorNumElements();
+  unsigned NumElems = VT.getVectorNumElements();
  
    // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
-  for (int i = 0, j = NumElems/2; i < NumElems/2; ++i, ++j)
+  for (unsigned i = 0, j = NumElems/2; i != NumElems/2; ++i, ++j)
      if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
          SVOp->getMaskElt(j) >= 0)
        return false;
@@ -12803,10 +13282,10 @@ static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) {
  /// inserting the result into the high part of a new 256-bit vector
  static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) {
    EVT VT = SVOp->getValueType(0);
-  int NumElems = VT.getVectorNumElements();
+  unsigned NumElems = VT.getVectorNumElements();
  
    // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
-  for (int i = NumElems/2, j = 0; i < NumElems; ++i, ++j)
+  for (unsigned i = NumElems/2, j = 0; i != NumElems; ++i, ++j)
      if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
          SVOp->getMaskElt(j) >= 0)
        return false;
@@ -12823,7 +13302,7 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
    SDValue V1 = SVOp->getOperand(0);
    SDValue V2 = SVOp->getOperand(1);
    EVT VT = SVOp->getValueType(0);
-  int NumElems = VT.getVectorNumElements();
+  unsigned NumElems = VT.getVectorNumElements();
  
    if (V1.getOpcode() == ISD::CONCAT_VECTORS &&
        V2.getOpcode() == ISD::CONCAT_VECTORS) {
@@ -12848,30 +13327,31 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
      // To match the shuffle mask, the first half of the mask should
      // be exactly the first vector, and all the rest a splat with the
      // first element of the second one.
-    for (int i = 0; i < NumElems/2; ++i)
+    for (unsigned i = 0; i != NumElems/2; ++i)
        if (!isUndefOrEqual(SVOp->getMaskElt(i), i) ||
            !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems))
          return SDValue();
  
      // If V1 is coming from a vector load then just fold to a VZEXT_LOAD.
      if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(V1.getOperand(0))) {
-      SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other);
-      SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() };
-      SDValue ResNode =
-        DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2,
-                                Ld->getMemoryVT(),
-                                Ld->getPointerInfo(),
-                                Ld->getAlignment(),
-                                false/*isVolatile*/, true/*ReadMem*/,
-                                false/*WriteMem*/);
-      return DAG.getNode(ISD::BITCAST, dl, VT, ResNode);
-    } 
+      if (Ld->hasNUsesOfValue(1, 0)) {
+        SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other);
+        SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() };
+        SDValue ResNode =
+          DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2,
+                                  Ld->getMemoryVT(),
+                                  Ld->getPointerInfo(),
+                                  Ld->getAlignment(),
+                                  false/*isVolatile*/, true/*ReadMem*/,
+                                  false/*WriteMem*/);
+        return DAG.getNode(ISD::BITCAST, dl, VT, ResNode);
+      }
+    }
  
      // Emit a zeroed vector and insert the desired subvector on its
      // first half.
      SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
-    SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0),
-                         DAG.getConstant(0, MVT::i32), DAG, dl);
+    SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0), 0, DAG, dl);
      return DCI.CombineTo(N, InsV);
    }
  
@@ -12881,18 +13361,15 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
  
    // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
    if (isShuffleHigh128VectorInsertLow(SVOp)) {
-    SDValue V = Extract128BitVector(V1, DAG.getConstant(NumElems/2, MVT::i32),
-                                    DAG, dl);
-    SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT),
-                                      V, DAG.getConstant(0, MVT::i32), DAG, dl);
+    SDValue V = Extract128BitVector(V1, NumElems/2, DAG, dl);
+    SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, 0, DAG, dl);
      return DCI.CombineTo(N, InsV);
    }
  
    // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
    if (isShuffleLow128VectorInsertHigh(SVOp)) {
-    SDValue V = Extract128BitVector(V1, DAG.getConstant(0, MVT::i32), DAG, dl);
-    SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT),
-                             V, DAG.getConstant(NumElems/2, MVT::i32), DAG, dl);
+    SDValue V = Extract128BitVector(V1, 0, DAG, dl);
+    SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, NumElems/2, DAG, dl);
      return DCI.CombineTo(N, InsV);
    }
  
@@ -12912,12 +13389,12 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
      return SDValue();
  
    // Combine 256-bit vector shuffles. This is only profitable when in AVX mode
-  if (Subtarget->hasAVX() && VT.getSizeInBits() == 256 &&
+  if (Subtarget->hasAVX() && VT.is256BitVector() &&
        N->getOpcode() == ISD::VECTOR_SHUFFLE)
      return PerformShuffleCombine256(N, DAG, DCI, Subtarget);
  
    // Only handle 128 wide vector from here on.
-  if (VT.getSizeInBits() != 128)
+  if (!VT.is128BitVector())
      return SDValue();
  
    // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
@@ -12931,16 +13408,17 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
  }
  
  
-/// PerformTruncateCombine - Converts truncate operation to
+/// DCI, PerformTruncateCombine - Converts truncate operation to
  /// a sequence of vector shuffle operations.
  /// It is possible when we truncate 256-bit vector to 128-bit vector
  
-SDValue X86TargetLowering::PerformTruncateCombine(SDNode *N, SelectionDAG &DAG, 
+SDValue X86TargetLowering::PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,
                                                    DAGCombinerInfo &DCI) const {
    if (!DCI.isBeforeLegalizeOps())
      return SDValue();
  
-  if (!Subtarget->hasAVX()) return SDValue();
+  if (!Subtarget->hasAVX())
+    return SDValue();
  
    EVT VT = N->getValueType(0);
    SDValue Op = N->getOperand(0);
@@ -12949,68 +13427,214 @@ SDValue X86TargetLowering::PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,
  
    if ((VT == MVT::v4i32) && (OpVT == MVT::v4i64)) {
  
-    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op,
-                          DAG.getIntPtrConstant(0));
+    if (Subtarget->hasAVX2()) {
+      // AVX2: v4i64 -> v4i32
+
+      // VPERMD
+      static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
+
+      Op = DAG.getNode(ISD::BITCAST, dl, MVT::v8i32, Op);
+      Op = DAG.getVectorShuffle(MVT::v8i32, dl, Op, DAG.getUNDEF(MVT::v8i32),
+                                ShufMask);
+
+      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Op,
+                         DAG.getIntPtrConstant(0));
+    }
+
+    // AVX: v4i64 -> v4i32
+    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op,
+                               DAG.getIntPtrConstant(0));
+
+    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op,
+                               DAG.getIntPtrConstant(2));
+
+    OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpLo);
+    OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpHi);
+
+    // PSHUFD
+    static const int ShufMask1[] = {0, 2, 0, 0};
+
+    SDValue Undef = DAG.getUNDEF(VT);
+    OpLo = DAG.getVectorShuffle(VT, dl, OpLo, Undef, ShufMask1);
+    OpHi = DAG.getVectorShuffle(VT, dl, OpHi, Undef, ShufMask1);
+
+    // MOVLHPS
+    static const int ShufMask2[] = {0, 1, 4, 5};
+
+    return DAG.getVectorShuffle(VT, dl, OpLo, OpHi, ShufMask2);
+  }
+
+  if ((VT == MVT::v8i16) && (OpVT == MVT::v8i32)) {
+
+    if (Subtarget->hasAVX2()) {
+      // AVX2: v8i32 -> v8i16
+
+      Op = DAG.getNode(ISD::BITCAST, dl, MVT::v32i8, Op);
+
+      // PSHUFB
+      SmallVector<SDValue,32> pshufbMask;
+      for (unsigned i = 0; i < 2; ++i) {
+        pshufbMask.push_back(DAG.getConstant(0x0, MVT::i8));
+        pshufbMask.push_back(DAG.getConstant(0x1, MVT::i8));
+        pshufbMask.push_back(DAG.getConstant(0x4, MVT::i8));
+        pshufbMask.push_back(DAG.getConstant(0x5, MVT::i8));
+        pshufbMask.push_back(DAG.getConstant(0x8, MVT::i8));
+        pshufbMask.push_back(DAG.getConstant(0x9, MVT::i8));
+        pshufbMask.push_back(DAG.getConstant(0xc, MVT::i8));
+        pshufbMask.push_back(DAG.getConstant(0xd, MVT::i8));
+        for (unsigned j = 0; j < 8; ++j)
+          pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
+      }
+      SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v32i8,
+                               &pshufbMask[0], 32);
+      Op = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v32i8, Op, BV);
+
+      Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4i64, Op);
+
+      static const int ShufMask[] = {0,  2,  -1,  -1};
+      Op = DAG.getVectorShuffle(MVT::v4i64, dl,  Op, DAG.getUNDEF(MVT::v4i64),
+                                &ShufMask[0]);
+
+      Op = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op,
+                       DAG.getIntPtrConstant(0));
+
+      return DAG.getNode(ISD::BITCAST, dl, VT, Op);
+    }
+
+    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i32, Op,
+                               DAG.getIntPtrConstant(0));
+
+    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i32, Op,
+                               DAG.getIntPtrConstant(4));
+
+    OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpLo);
+    OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpHi);
+
+    // PSHUFB
+    static const int ShufMask1[] = {0,  1,  4,  5,  8,  9, 12, 13,
+                                   -1, -1, -1, -1, -1, -1, -1, -1};
+
+    SDValue Undef = DAG.getUNDEF(MVT::v16i8);
+    OpLo = DAG.getVectorShuffle(MVT::v16i8, dl, OpLo, Undef, ShufMask1);
+    OpHi = DAG.getVectorShuffle(MVT::v16i8, dl, OpHi, Undef, ShufMask1);
+
+    OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpLo);
+    OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpHi);
+
+    // MOVLHPS
+    static const int ShufMask2[] = {0, 1, 4, 5};
+
+    SDValue res = DAG.getVectorShuffle(MVT::v4i32, dl, OpLo, OpHi, ShufMask2);
+    return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, res);
+  }
+
+  return SDValue();
+}
+
+/// XFormVExtractWithShuffleIntoLoad - Check if a vector extract from a target
+/// specific shuffle of a load can be folded into a single element load.
+/// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
+/// shuffles have been customed lowered so we need to handle those here.
+static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
+                                         TargetLowering::DAGCombinerInfo &DCI) {
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  SDValue InVec = N->getOperand(0);
+  SDValue EltNo = N->getOperand(1);
+
+  if (!isa<ConstantSDNode>(EltNo))
+    return SDValue();
  
-    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op,
-                          DAG.getIntPtrConstant(2));
+  EVT VT = InVec.getValueType();
  
-    OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpLo);
-    OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpHi);
+  bool HasShuffleIntoBitcast = false;
+  if (InVec.getOpcode() == ISD::BITCAST) {
+    // Don't duplicate a load with other uses.
+    if (!InVec.hasOneUse())
+      return SDValue();
+    EVT BCVT = InVec.getOperand(0).getValueType();
+    if (BCVT.getVectorNumElements() != VT.getVectorNumElements())
+      return SDValue();
+    InVec = InVec.getOperand(0);
+    HasShuffleIntoBitcast = true;
+  }
  
-    // PSHUFD
-    int ShufMask1[] = {0, 2, 0, 0};
+  if (!isTargetShuffle(InVec.getOpcode()))
+    return SDValue();
  
-    OpLo = DAG.getVectorShuffle(VT, dl, OpLo, DAG.getUNDEF(VT),
-                                ShufMask1);
-    OpHi = DAG.getVectorShuffle(VT, dl, OpHi, DAG.getUNDEF(VT),
-                                ShufMask1);
+  // Don't duplicate a load with other uses.
+  if (!InVec.hasOneUse())
+    return SDValue();
  
-    // MOVLHPS
-    int ShufMask2[] = {0, 1, 4, 5};
+  SmallVector<int, 16> ShuffleMask;
+  bool UnaryShuffle;
+  if (!getTargetShuffleMask(InVec.getNode(), VT.getSimpleVT(), ShuffleMask,
+                            UnaryShuffle))
+    return SDValue();
  
-    return DAG.getVectorShuffle(VT, dl, OpLo, OpHi, ShufMask2);
-  }
-  if ((VT == MVT::v8i16) && (OpVT == MVT::v8i32)) {
+  // Select the input vector, guarding against out of range extract vector.
+  unsigned NumElems = VT.getVectorNumElements();
+  int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
+  int Idx = (Elt > (int)NumElems) ? -1 : ShuffleMask[Elt];
+  SDValue LdNode = (Idx < (int)NumElems) ? InVec.getOperand(0)
+                                         : InVec.getOperand(1);
  
-    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i32, Op,
-                          DAG.getIntPtrConstant(0));
+  // If inputs to shuffle are the same for both ops, then allow 2 uses
+  unsigned AllowedUses = InVec.getOperand(0) == InVec.getOperand(1) ? 2 : 1;
  
-    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i32, Op,
-                          DAG.getIntPtrConstant(4));
+  if (LdNode.getOpcode() == ISD::BITCAST) {
+    // Don't duplicate a load with other uses.
+    if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
+      return SDValue();
  
-    OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpLo);
-    OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpHi);
+    AllowedUses = 1; // only allow 1 load use if we have a bitcast
+    LdNode = LdNode.getOperand(0);
+  }
  
-    // PSHUFB
-    int ShufMask1[] = {0,  1,  4,  5,  8,  9, 12, 13, 
-                      -1, -1, -1, -1, -1, -1, -1, -1};
+  if (!ISD::isNormalLoad(LdNode.getNode()))
+    return SDValue();
  
-    OpLo = DAG.getVectorShuffle(MVT::v16i8, dl, OpLo,
-                                DAG.getUNDEF(MVT::v16i8),
-                                ShufMask1);
-    OpHi = DAG.getVectorShuffle(MVT::v16i8, dl, OpHi,
-                                DAG.getUNDEF(MVT::v16i8),
-                                ShufMask1);
+  LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
  
-    OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpLo);
-    OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpHi);
+  if (!LN0 ||!LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
+    return SDValue();
  
-    // MOVLHPS
-    int ShufMask2[] = {0, 1, 4, 5};
+  if (HasShuffleIntoBitcast) {
+    // If there's a bitcast before the shuffle, check if the load type and
+    // alignment is valid.
+    unsigned Align = LN0->getAlignment();
+    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+    unsigned NewAlign = TLI.getTargetData()->
+      getABITypeAlignment(VT.getTypeForEVT(*DAG.getContext()));
  
-    SDValue res = DAG.getVectorShuffle(MVT::v4i32, dl, OpLo, OpHi, ShufMask2);
-    return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, res);
+    if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VT))
+      return SDValue();
    }
  
-  return SDValue();
+  // All checks match so transform back to vector_shuffle so that DAG combiner
+  // can finish the job
+  DebugLoc dl = N->getDebugLoc();
+
+  // Create shuffle node taking into account the case that its a unary shuffle
+  SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(VT) : InVec.getOperand(1);
+  Shuffle = DAG.getVectorShuffle(InVec.getValueType(), dl,
+                                 InVec.getOperand(0), Shuffle,
+                                 &ShuffleMask[0]);
+  Shuffle = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
+                     EltNo);
  }
  
  /// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
  /// generation and convert it from being a bunch of shuffles and extracts
  /// to a simple store and scalar loads to extract the elements.
  static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
-                                                const TargetLowering &TLI) {
+                                         TargetLowering::DAGCombinerInfo &DCI) {
+  SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI);
+  if (NewOp.getNode())
+    return NewOp;
+
    SDValue InputVector = N->getOperand(0);
  
    // Only operate on vectors of 4 elements, where the alternative shuffling
@@ -13071,6 +13695,7 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
      unsigned EltSize =
          InputVector.getValueType().getVectorElementType().getSizeInBits()/8;
      uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue();
+    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
      SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy());
  
      SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(),
@@ -13375,9 +14000,13 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
    // to simplify previous instructions.
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
-      !DCI.isBeforeLegalize() &&
-      TLI.isOperationLegal(ISD::VSELECT, VT)) {
+      !DCI.isBeforeLegalize() && TLI.isOperationLegal(ISD::VSELECT, VT)) {
      unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits();
+
+    // Don't optimize vector selects that map to mask-registers.
+    if (BitWidth == 1)
+      return SDValue();
+
      assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
      APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
  
@@ -13392,9 +14021,170 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
    return SDValue();
  }
  
+// Check whether a boolean test is testing a boolean value generated by
+// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
+// code.
+//
+// Simplify the following patterns:
+// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
+// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
+// to (Op EFLAGS Cond)
+//
+// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
+// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
+// to (Op EFLAGS !Cond)
+//
+// where Op could be BRCOND or CMOV.
+//
+static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
+  // Quit if not CMP and SUB with its value result used.
+  if (Cmp.getOpcode() != X86ISD::CMP &&
+      (Cmp.getOpcode() != X86ISD::SUB || Cmp.getNode()->hasAnyUseOfValue(0)))
+      return SDValue();
+
+  // Quit if not used as a boolean value.
+  if (CC != X86::COND_E && CC != X86::COND_NE)
+    return SDValue();
+
+  // Check CMP operands. One of them should be 0 or 1 and the other should be
+  // an SetCC or extended from it.
+  SDValue Op1 = Cmp.getOperand(0);
+  SDValue Op2 = Cmp.getOperand(1);
+
+  SDValue SetCC;
+  const ConstantSDNode* C = 0;
+  bool needOppositeCond = (CC == X86::COND_E);
+
+  if ((C = dyn_cast<ConstantSDNode>(Op1)))
+    SetCC = Op2;
+  else if ((C = dyn_cast<ConstantSDNode>(Op2)))
+    SetCC = Op1;
+  else // Quit if all operands are not constants.
+    return SDValue();
+
+  if (C->getZExtValue() == 1)
+    needOppositeCond = !needOppositeCond;
+  else if (C->getZExtValue() != 0)
+    // Quit if the constant is neither 0 or 1.
+    return SDValue();
+
+  // Skip 'zext' node.
+  if (SetCC.getOpcode() == ISD::ZERO_EXTEND)
+    SetCC = SetCC.getOperand(0);
+
+  // Quit if not SETCC.
+  // FIXME: So far we only handle the boolean value generated from SETCC. If
+  // there is other ways to generate boolean values, we need handle them here
+  // as well.
+  if (SetCC.getOpcode() != X86ISD::SETCC)
+    return SDValue();
+
+  // Set the condition code or opposite one if necessary.
+  CC = X86::CondCode(SetCC.getConstantOperandVal(0));
+  if (needOppositeCond)
+    CC = X86::GetOppositeBranchCondition(CC);
+
+  return SetCC.getOperand(1);
+}
+
+/// checkFlaggedOrCombine - DAG combination on X86ISD::OR, i.e. with EFLAGS
+/// updated. If only flag result is used and the result is evaluated from a
+/// series of element extraction, try to combine it into a PTEST.
+static SDValue checkFlaggedOrCombine(SDValue Or, X86::CondCode &CC,
+                                     SelectionDAG &DAG,
+                                     const X86Subtarget *Subtarget) {
+  SDNode *N = Or.getNode();
+  DebugLoc DL = N->getDebugLoc();
+
+  // Only SSE4.1 and beyond supports PTEST or like.
+  if (!Subtarget->hasSSE41())
+    return SDValue();
+
+  if (N->getOpcode() != X86ISD::OR)
+    return SDValue();
+
+  // Quit if the value result of OR is used.
+  if (N->hasAnyUseOfValue(0))
+    return SDValue();
+
+  // Quit if not used as a boolean value.
+  if (CC != X86::COND_E && CC != X86::COND_NE)
+    return SDValue();
+
+  SmallVector<SDValue, 8> Opnds;
+  SDValue VecIn;
+  EVT VT = MVT::Other;
+  unsigned Mask = 0;
+
+  // Recognize a special case where a vector is casted into wide integer to
+  // test all 0s.
+  Opnds.push_back(N->getOperand(0));
+  Opnds.push_back(N->getOperand(1));
+
+  for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
+    SmallVector<SDValue, 8>::const_iterator I = Opnds.begin() + Slot;
+    // BFS traverse all OR'd operands.
+    if (I->getOpcode() == ISD::OR) {
+      Opnds.push_back(I->getOperand(0));
+      Opnds.push_back(I->getOperand(1));
+      // Re-evaluate the number of nodes to be traversed.
+      e += 2;
+      continue;
+    }
+
+    // Quit if a non-EXTRACT_VECTOR_ELT
+    if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+      return SDValue();
+
+    // Quit if without a constant index.
+    SDValue Idx = I->getOperand(1);
+    if (!isa<ConstantSDNode>(Idx))
+      return SDValue();
+
+    // Check if all elements are extracted from the same vector.
+    SDValue ExtractedFromVec = I->getOperand(0);
+    if (VecIn.getNode() == 0) {
+      VT = ExtractedFromVec.getValueType();
+      // FIXME: only 128-bit vector is supported so far.
+      if (!VT.is128BitVector())
+        return SDValue();
+      VecIn = ExtractedFromVec;
+    } else if (VecIn != ExtractedFromVec)
+      return SDValue();
+
+    // Record the constant index.
+    Mask |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
+  }
+
+  assert(VT.is128BitVector() && "Only 128-bit vector PTEST is supported so far.");
+
+  // Quit if not all elements are used.
+  if (Mask != (1U << VT.getVectorNumElements()) - 1U)
+    return SDValue();
+
+  return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIn, VecIn);
+}
+
+static bool isValidFCMOVCondition(X86::CondCode CC) {
+  switch (CC) {
+  default:
+    return false;
+  case X86::COND_B:
+  case X86::COND_BE:
+  case X86::COND_E:
+  case X86::COND_P:
+  case X86::COND_AE:
+  case X86::COND_A:
+  case X86::COND_NE:
+  case X86::COND_NP:
+    return true;
+  }
+}
+
  /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
  static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
-                                  TargetLowering::DAGCombinerInfo &DCI) {
+                                  TargetLowering::DAGCombinerInfo &DCI,
+                                  const X86Subtarget *Subtarget) {
    DebugLoc DL = N->getDebugLoc();
  
    // If the flag operand isn't dead, don't touch this CMOV.
@@ -13405,6 +14195,7 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
    SDValue TrueOp = N->getOperand(1);
    X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
    SDValue Cond = N->getOperand(3);
+
    if (CC == X86::COND_E || CC == X86::COND_NE) {
      switch (Cond.getOpcode()) {
      default: break;
@@ -13416,6 +14207,26 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
      }
    }
  
+  SDValue Flags;
+
+  Flags = checkBoolTestSetCCCombine(Cond, CC);
+  if (Flags.getNode() &&
+      // Extra check as FCMOV only supports a subset of X86 cond.
+      (FalseOp.getValueType() != MVT::f80 || isValidFCMOVCondition(CC))) {
+    SDValue Ops[] = { FalseOp, TrueOp,
+                      DAG.getConstant(CC, MVT::i8), Flags };
+    return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(),
+                       Ops, array_lengthof(Ops));
+  }
+
+  Flags = checkFlaggedOrCombine(Cond, CC, DAG, Subtarget);
+  if (Flags.getNode()) {
+    SDValue Ops[] = { FalseOp, TrueOp,
+                      DAG.getConstant(CC, MVT::i8), Flags };
+    return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(),
+                       Ops, array_lengthof(Ops));
+  }
+
    // If this is a select between two integer constants, try to do some
    // optimizations.  Note that the operands are ordered the opposite of SELECT
    // operands.
@@ -13838,7 +14649,7 @@ static bool CanFoldXORWithAllOnes(const SDNode *N) {
  
    // Sometimes the operand may come from a insert_subvector building a 256-bit
    // allones vector
-  if (VT.getSizeInBits() == 256 &&
+  if (VT.is256BitVector() &&
        N->getOpcode() == ISD::INSERT_SUBVECTOR) {
      SDValue V1 = N->getOperand(0);
      SDValue V2 = N->getOperand(1);
@@ -13968,13 +14779,14 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
          return SDValue();
  
        // Validate that X, Y, and Mask are BIT_CONVERTS, and see through them.
-      if (Mask.getOpcode() != ISD::BITCAST ||
-          X.getOpcode() != ISD::BITCAST ||
-          Y.getOpcode() != ISD::BITCAST)
-        return SDValue();
-
        // Look through mask bitcast.
-      Mask = Mask.getOperand(0);
+      if (Mask.getOpcode() == ISD::BITCAST)
+        Mask = Mask.getOperand(0);
+      if (X.getOpcode() == ISD::BITCAST)
+        X = X.getOperand(0);
+      if (Y.getOpcode() == ISD::BITCAST)
+        Y = Y.getOperand(0);
+
        EVT MaskVT = Mask.getValueType();
  
        // Validate that the Mask operand is a vector sra node.
@@ -13995,8 +14807,6 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
        // Now we know we at least have a plendvb with the mask val.  See if
        // we can form a psignb/w/d.
        // psign = x.type == y.type == mask.type && y = sub(0, x);
-      X = X.getOperand(0);
-      Y = Y.getOperand(0);
        if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X &&
            ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) &&
            X.getValueType() == MaskVT && Y.getValueType() == MaskVT) {
@@ -14077,6 +14887,41 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
    return SDValue();
  }
  
+// Generate NEG and CMOV for integer abs.
+static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
+  EVT VT = N->getValueType(0);
+
+  // Since X86 does not have CMOV for 8-bit integer, we don't convert
+  // 8-bit integer abs to NEG and CMOV.
+  if (VT.isInteger() && VT.getSizeInBits() == 8)
+    return SDValue();
+
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  DebugLoc DL = N->getDebugLoc();
+
+  // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
+  // and change it to SUB and CMOV.
+  if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
+      N0.getOpcode() == ISD::ADD &&
+      N0.getOperand(1) == N1 &&
+      N1.getOpcode() == ISD::SRA &&
+      N1.getOperand(0) == N0.getOperand(0))
+    if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1)))
+      if (Y1C->getAPIntValue() == VT.getSizeInBits()-1) {
+        // Generate SUB & CMOV.
+        SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
+                                  DAG.getConstant(0, VT), N0.getOperand(0));
+
+        SDValue Ops[] = { N0.getOperand(0), Neg,
+                          DAG.getConstant(X86::COND_GE, MVT::i8),
+                          SDValue(Neg.getNode(), 1) };
+        return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue),
+                           Ops, array_lengthof(Ops));
+      }
+  return SDValue();
+}
+
  // PerformXorCombine - Attempts to turn XOR nodes into BLSMSK nodes
  static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG,
                                   TargetLowering::DAGCombinerInfo &DCI,
@@ -14084,6 +14929,16 @@ static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG,
    if (DCI.isBeforeLegalizeOps())
      return SDValue();
  
+  if (Subtarget->hasCMov()) {
+    SDValue RV = performIntegerAbsCombine(N, DAG);
+    if (RV.getNode())
+      return RV;
+  }
+
+  // Try forming BMI if it is available.
+  if (!Subtarget->hasBMI())
+    return SDValue();
+
    EVT VT = N->getValueType(0);
  
    if (VT != MVT::i32 && VT != MVT::i64)
@@ -14109,7 +14964,8 @@ static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG,
  
  /// PerformLOADCombine - Do target-specific dag combines on LOAD nodes.
  static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
-                                   const X86Subtarget *Subtarget) {
+                                  TargetLowering::DAGCombinerInfo &DCI,
+                                  const X86Subtarget *Subtarget) {
    LoadSDNode *Ld = cast<LoadSDNode>(N);
    EVT RegVT = Ld->getValueType(0);
    EVT MemVT = Ld->getMemoryVT();
@@ -14131,63 +14987,94 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
      unsigned RegSz = RegVT.getSizeInBits();
      unsigned MemSz = MemVT.getSizeInBits();
      assert(RegSz > MemSz && "Register size must be greater than the mem size");
-    // All sizes must be a power of two
-    if (!isPowerOf2_32(RegSz * MemSz * NumElems)) return SDValue();
  
-    // Attempt to load the original value using a single load op.
-    // Find a scalar type which is equal to the loaded word size.
+    // All sizes must be a power of two.
+    if (!isPowerOf2_32(RegSz * MemSz * NumElems))
+      return SDValue();
+
+    // Attempt to load the original value using scalar loads.
+    // Find the largest scalar type that divides the total loaded size.
      MVT SclrLoadTy = MVT::i8;
      for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
           tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
        MVT Tp = (MVT::SimpleValueType)tp;
-      if (TLI.isTypeLegal(Tp) &&  Tp.getSizeInBits() == MemSz) {
+      if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
          SclrLoadTy = Tp;
-        break;
        }
      }
  
-    // Proceed if a load word is found.
-    if (SclrLoadTy.getSizeInBits() != MemSz) return SDValue();
+    // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
+    if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
+        (64 <= MemSz))
+      SclrLoadTy = MVT::f64;
  
+    // Calculate the number of scalar loads that we need to perform
+    // in order to load our vector from memory.
+    unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
+
+    // Represent our vector as a sequence of elements which are the
+    // largest scalar that we can load.
      EVT LoadUnitVecVT = EVT::getVectorVT(*DAG.getContext(), SclrLoadTy,
        RegSz/SclrLoadTy.getSizeInBits());
  
+    // Represent the data using the same element type that is stored in
+    // memory. In practice, we ''widen'' MemVT.
      EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
                                    RegSz/MemVT.getScalarType().getSizeInBits());
-    // Can't shuffle using an illegal type.
-    if (!TLI.isTypeLegal(WideVecVT)) return SDValue();
  
-    // Perform a single load.
-    SDValue ScalarLoad = DAG.getLoad(SclrLoadTy, dl, Ld->getChain(),
-                                  Ld->getBasePtr(),
-                                  Ld->getPointerInfo(), Ld->isVolatile(),
-                                  Ld->isNonTemporal(), Ld->isInvariant(),
-                                  Ld->getAlignment());
+    assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
+      "Invalid vector type");
+
+    // We can't shuffle using an illegal type.
+    if (!TLI.isTypeLegal(WideVecVT))
+      return SDValue();
+
+    SmallVector<SDValue, 8> Chains;
+    SDValue Ptr = Ld->getBasePtr();
+    SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits()/8,
+                                        TLI.getPointerTy());
+    SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
+
+    for (unsigned i = 0; i < NumLoads; ++i) {
+      // Perform a single load.
+      SDValue ScalarLoad = DAG.getLoad(SclrLoadTy, dl, Ld->getChain(),
+                                       Ptr, Ld->getPointerInfo(),
+                                       Ld->isVolatile(), Ld->isNonTemporal(),
+                                       Ld->isInvariant(), Ld->getAlignment());
+      Chains.push_back(ScalarLoad.getValue(1));
+      // Create the first element type using SCALAR_TO_VECTOR in order to avoid
+      // another round of DAGCombining.
+      if (i == 0)
+        Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
+      else
+        Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
+                          ScalarLoad, DAG.getIntPtrConstant(i));
+
+      Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
+    }
  
-    // Insert the word loaded into a vector.
-    SDValue ScalarInVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
-      LoadUnitVecVT, ScalarLoad);
+    SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0],
+                               Chains.size());
  
      // Bitcast the loaded value to a vector of the original element type, in
      // the size of the target vector type.
-    SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT,
-                                    ScalarInVector);
+    SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Res);
      unsigned SizeRatio = RegSz/MemSz;
  
      // Redistribute the loaded elements into the different locations.
      SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
-    for (unsigned i = 0; i < NumElems; i++) ShuffleVec[i*SizeRatio] = i;
+    for (unsigned i = 0; i != NumElems; ++i)
+      ShuffleVec[i*SizeRatio] = i;
  
      SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
-                                DAG.getUNDEF(SlicedVec.getValueType()),
-                                ShuffleVec.data());
+                                         DAG.getUNDEF(WideVecVT),
+                                         &ShuffleVec[0]);
  
      // Bitcast to the requested type.
      Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
      // Replace the original load with the new sequence
      // and return the new chain.
-    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Shuff);
-    return SDValue(ScalarLoad.getNode(), 1);
+    return DCI.CombineTo(N, Shuff, TF, true);
    }
  
    return SDValue();
@@ -14204,13 +15091,12 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  
    // If we are saving a concatenation of two XMM registers, perform two stores.
-  // This is better in Sandy Bridge cause one 256-bit mem op is done via two
-  // 128-bit ones. If in the future the cost becomes only one memory access the
-  // first version would be better.
-  if (VT.getSizeInBits() == 256 &&
-    StoredVal.getNode()->getOpcode() == ISD::CONCAT_VECTORS &&
-    StoredVal.getNumOperands() == 2) {
-
+  // On Sandy Bridge, 256-bit memory operations are executed by two
+  // 128-bit ports. However, on Haswell it is better to issue a single 256-bit
+  // memory  operation.
+  if (VT.is256BitVector() && !Subtarget->hasAVX2() &&
+      StoredVal.getNode()->getOpcode() == ISD::CONCAT_VECTORS &&
+      StoredVal.getNumOperands() == 2) {
      SDValue Value0 = StoredVal.getOperand(0);
      SDValue Value1 = StoredVal.getOperand(1);
  
@@ -14255,14 +15141,16 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
  
      SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, St->getValue());
      SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
-    for (unsigned i = 0; i < NumElems; i++ ) ShuffleVec[i] = i * SizeRatio;
+    for (unsigned i = 0; i != NumElems; ++i)
+      ShuffleVec[i] = i * SizeRatio;
  
-    // Can't shuffle using an illegal type
-    if (!TLI.isTypeLegal(WideVecVT)) return SDValue();
+    // Can't shuffle using an illegal type.
+    if (!TLI.isTypeLegal(WideVecVT))
+      return SDValue();
  
      SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
-                                DAG.getUNDEF(WideVec.getValueType()),
-                                ShuffleVec.data());
+                                         DAG.getUNDEF(WideVecVT),
+                                         &ShuffleVec[0]);
      // At this point all of the data is stored at the bottom of the
      // register. We now need to save it to mem.
  
@@ -14271,13 +15159,18 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
      for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
           tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
        MVT Tp = (MVT::SimpleValueType)tp;
-      if (TLI.isTypeLegal(Tp) && StoreType.getSizeInBits() < NumElems * ToSz)
+      if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
          StoreType = Tp;
      }
  
+    // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
+    if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
+        (64 <= NumElems * ToSz))
+      StoreType = MVT::f64;
+
      // Bitcast the original vector into a vector of store-size units
      EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
-            StoreType, VT.getSizeInBits()/EVT(StoreType).getSizeInBits());
+            StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
      assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
      SDValue ShuffWide = DAG.getNode(ISD::BITCAST, dl, StoreVecVT, Shuff);
      SmallVector<SDValue, 8> Chains;
@@ -14286,7 +15179,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
      SDValue Ptr = St->getBasePtr();
  
      // Perform one or more big stores into memory.
-    for (unsigned i = 0; i < (ToSz*NumElems)/StoreType.getSizeInBits() ; i++) {
+    for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
        SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
                                     StoreType, ShuffWide,
                                     DAG.getIntPtrConstant(i));
@@ -14584,6 +15477,29 @@ static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
    return SDValue();
  }
  
+/// PerformFMinFMaxCombine - Do target-specific dag combines on X86ISD::FMIN and
+/// X86ISD::FMAX nodes.
+static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) {
+  assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
+
+  // Only perform optimizations if UnsafeMath is used.
+  if (!DAG.getTarget().Options.UnsafeFPMath)
+    return SDValue();
+
+  // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
+  // into FMINC and MMAXC, which are Commutative operations.
+  unsigned NewOp = 0;
+  switch (N->getOpcode()) {
+    default: llvm_unreachable("unknown opcode");
+    case X86ISD::FMIN:  NewOp = X86ISD::FMINC; break;
+    case X86ISD::FMAX:  NewOp = X86ISD::FMAXC; break;
+  }
+
+  return DAG.getNode(NewOp, N->getDebugLoc(), N->getValueType(0),
+                     N->getOperand(0), N->getOperand(1));
+}
+
+
  /// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes.
  static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
    // FAND(0.0, x) -> 0.0
@@ -14635,18 +15551,9 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
    if (!DCI.isBeforeLegalizeOps())
      return SDValue();
  
-  if (!Subtarget->hasAVX()) 
+  if (!Subtarget->hasAVX())
      return SDValue();
  
-  // Optimize vectors in AVX mode
-  // Sign extend  v8i16 to v8i32 and
-  //              v4i32 to v4i64
-  //
-  // Divide input vector into two parts
-  // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1}
-  // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
-  // concat the vectors to original VT
-
    EVT VT = N->getValueType(0);
    SDValue Op = N->getOperand(0);
    EVT OpVT = Op.getValueType();
@@ -14655,23 +15562,37 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
    if ((VT == MVT::v4i64 && OpVT == MVT::v4i32) ||
        (VT == MVT::v8i32 && OpVT == MVT::v8i16)) {
  
+    if (Subtarget->hasAVX2())
+      return DAG.getNode(X86ISD::VSEXT_MOVL, dl, VT, Op);
+
+    // Optimize vectors in AVX mode
+    // Sign extend  v8i16 to v8i32 and
+    //              v4i32 to v4i64
+    //
+    // Divide input vector into two parts
+    // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1}
+    // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
+    // concat the vectors to original VT
+
      unsigned NumElems = OpVT.getVectorNumElements();
+    SDValue Undef = DAG.getUNDEF(OpVT);
+
      SmallVector<int,8> ShufMask1(NumElems, -1);
-    for (unsigned i = 0; i < NumElems/2; i++) ShufMask1[i] = i;
+    for (unsigned i = 0; i != NumElems/2; ++i)
+      ShufMask1[i] = i;
  
-    SDValue OpLo = DAG.getVectorShuffle(OpVT, dl, Op, DAG.getUNDEF(OpVT),
-                                        ShufMask1.data());
+    SDValue OpLo = DAG.getVectorShuffle(OpVT, dl, Op, Undef, &ShufMask1[0]);
  
      SmallVector<int,8> ShufMask2(NumElems, -1);
-    for (unsigned i = 0; i < NumElems/2; i++) ShufMask2[i] = i + NumElems/2;
+    for (unsigned i = 0; i != NumElems/2; ++i)
+      ShufMask2[i] = i + NumElems/2;
  
-    SDValue OpHi = DAG.getVectorShuffle(OpVT, dl, Op, DAG.getUNDEF(OpVT),
-                                        ShufMask2.data());
+    SDValue OpHi = DAG.getVectorShuffle(OpVT, dl, Op, Undef, &ShufMask2[0]);
  
-    EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), 
+    EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(),
                                    VT.getVectorNumElements()/2);
  
-    OpLo = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpLo); 
+    OpLo = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpLo);
      OpHi = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpHi);
  
      return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
@@ -14679,7 +15600,42 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
    return SDValue();
  }
  
+static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG,
+                                 const X86Subtarget* Subtarget) {
+  DebugLoc dl = N->getDebugLoc();
+  EVT VT = N->getValueType(0);
+
+  EVT ScalarVT = VT.getScalarType();
+  if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget->hasFMA())
+    return SDValue();
+
+  SDValue A = N->getOperand(0);
+  SDValue B = N->getOperand(1);
+  SDValue C = N->getOperand(2);
+
+  bool NegA = (A.getOpcode() == ISD::FNEG);
+  bool NegB = (B.getOpcode() == ISD::FNEG);
+  bool NegC = (C.getOpcode() == ISD::FNEG);
+
+  // Negative multiplication when NegA xor NegB
+  bool NegMul = (NegA != NegB);
+  if (NegA)
+    A = A.getOperand(0);
+  if (NegB)
+    B = B.getOperand(0);
+  if (NegC)
+    C = C.getOperand(0);
+
+  unsigned Opcode;
+  if (!NegMul)
+    Opcode = (!NegC)? X86ISD::FMADD : X86ISD::FMSUB;
+  else
+    Opcode = (!NegC)? X86ISD::FNMADD : X86ISD::FNMSUB;
+  return DAG.getNode(Opcode, dl, VT, A, B, C);
+}
+
  static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
+                                  TargetLowering::DAGCombinerInfo &DCI,
                                    const X86Subtarget *Subtarget) {
    // (i32 zext (and (i8  x86isd::setcc_carry), 1)) ->
    //           (and (i32 x86isd::setcc_carry), 1)
@@ -14704,6 +15660,7 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
                                     N00.getOperand(0), N00.getOperand(1)),
                         DAG.getConstant(1, VT));
    }
+
    // Optimize vectors in AVX mode:
    //
    //   v8i16 -> v8i32
@@ -14716,50 +15673,154 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
    //   Use vpunpckhdq for 4 upper elements  v4i32 -> v2i64.
    //   Concat upper and lower parts.
    //
-  if (Subtarget->hasAVX()) {
+  if (!DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  if (!Subtarget->hasAVX())
+    return SDValue();
  
-    if (((VT == MVT::v8i32) && (OpVT == MVT::v8i16))  ||
+  if (((VT == MVT::v8i32) && (OpVT == MVT::v8i16)) ||
        ((VT == MVT::v4i64) && (OpVT == MVT::v4i32)))  {
  
-      SDValue ZeroVec = getZeroVector(OpVT, Subtarget, DAG, dl);
-      SDValue OpLo = getTargetShuffleNode(X86ISD::UNPCKL, dl, OpVT, N0, ZeroVec, DAG);
-      SDValue OpHi = getTargetShuffleNode(X86ISD::UNPCKH, dl, OpVT, N0, ZeroVec, DAG);
+    if (Subtarget->hasAVX2())
+      return DAG.getNode(X86ISD::VZEXT_MOVL, dl, VT, N0);
  
-      EVT HVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 
-        VT.getVectorNumElements()/2);
+    SDValue ZeroVec = getZeroVector(OpVT, Subtarget, DAG, dl);
+    SDValue OpLo = getUnpackl(DAG, dl, OpVT, N0, ZeroVec);
+    SDValue OpHi = getUnpackh(DAG, dl, OpVT, N0, ZeroVec);
  
-      OpLo = DAG.getNode(ISD::BITCAST, dl, HVT, OpLo);
-      OpHi = DAG.getNode(ISD::BITCAST, dl, HVT, OpHi);
+    EVT HVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
+                               VT.getVectorNumElements()/2);
  
-      return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
-    }
+    OpLo = DAG.getNode(ISD::BITCAST, dl, HVT, OpLo);
+    OpHi = DAG.getNode(ISD::BITCAST, dl, HVT, OpHi);
+
+    return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
    }
  
+  return SDValue();
+}
+
+// Optimize x == -y --> x+y == 0
+//          x != -y --> x+y != 0
+static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG) {
+  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
  
+  if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB)
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(LHS.getOperand(0)))
+      if (C->getAPIntValue() == 0 && LHS.hasOneUse()) {
+        SDValue addV = DAG.getNode(ISD::ADD, N->getDebugLoc(),
+                                   LHS.getValueType(), RHS, LHS.getOperand(1));
+        return DAG.getSetCC(N->getDebugLoc(), N->getValueType(0),
+                            addV, DAG.getConstant(0, addV.getValueType()), CC);
+      }
+  if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB)
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS.getOperand(0)))
+      if (C->getAPIntValue() == 0 && RHS.hasOneUse()) {
+        SDValue addV = DAG.getNode(ISD::ADD, N->getDebugLoc(),
+                                   RHS.getValueType(), LHS, RHS.getOperand(1));
+        return DAG.getSetCC(N->getDebugLoc(), N->getValueType(0),
+                            addV, DAG.getConstant(0, addV.getValueType()), CC);
+      }
    return SDValue();
  }
  
  // Optimize  RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
-static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG) {
-  unsigned X86CC = N->getConstantOperandVal(0);
-  SDValue EFLAG = N->getOperand(1);
+static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG,
+                                   TargetLowering::DAGCombinerInfo &DCI,
+                                   const X86Subtarget *Subtarget) {
    DebugLoc DL = N->getDebugLoc();
+  X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
+  SDValue EFLAGS = N->getOperand(1);
  
    // Materialize "setb reg" as "sbb reg,reg", since it can be extended without
    // a zext and produces an all-ones bit which is more useful than 0/1 in some
    // cases.
-  if (X86CC == X86::COND_B)
+  if (CC == X86::COND_B)
      return DAG.getNode(ISD::AND, DL, MVT::i8,
                         DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
-                                   DAG.getConstant(X86CC, MVT::i8), EFLAG),
+                                   DAG.getConstant(CC, MVT::i8), EFLAGS),
                         DAG.getConstant(1, MVT::i8));
  
+  SDValue Flags;
+
+  Flags = checkBoolTestSetCCCombine(EFLAGS, CC);
+  if (Flags.getNode()) {
+    SDValue Cond = DAG.getConstant(CC, MVT::i8);
+    return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags);
+  }
+
+  Flags = checkFlaggedOrCombine(EFLAGS, CC, DAG, Subtarget);
+  if (Flags.getNode()) {
+    SDValue Cond = DAG.getConstant(CC, MVT::i8);
+    return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags);
+  }
+
+  return SDValue();
+}
+
+// Optimize branch condition evaluation.
+//
+static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG,
+                                    TargetLowering::DAGCombinerInfo &DCI,
+                                    const X86Subtarget *Subtarget) {
+  DebugLoc DL = N->getDebugLoc();
+  SDValue Chain = N->getOperand(0);
+  SDValue Dest = N->getOperand(1);
+  SDValue EFLAGS = N->getOperand(3);
+  X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
+
+  SDValue Flags;
+
+  Flags = checkBoolTestSetCCCombine(EFLAGS, CC);
+  if (Flags.getNode()) {
+    SDValue Cond = DAG.getConstant(CC, MVT::i8);
+    return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond,
+                       Flags);
+  }
+
+  Flags = checkFlaggedOrCombine(EFLAGS, CC, DAG, Subtarget);
+  if (Flags.getNode()) {
+    SDValue Cond = DAG.getConstant(CC, MVT::i8);
+    return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond,
+                       Flags);
+  }
+
+  return SDValue();
+}
+
+static SDValue PerformUINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG) {
+  SDValue Op0 = N->getOperand(0);
+  EVT InVT = Op0->getValueType(0);
+
+  // UINT_TO_FP(v4i8) -> SINT_TO_FP(ZEXT(v4i8 to v4i32))
+  if (InVT == MVT::v8i8 || InVT == MVT::v4i8) {
+    DebugLoc dl = N->getDebugLoc();
+    MVT DstVT = InVT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32;
+    SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
+    // Notice that we use SINT_TO_FP because we know that the high bits
+    // are zero and SINT_TO_FP is better supported by the hardware.
+    return DAG.getNode(ISD::SINT_TO_FP, dl, N->getValueType(0), P);
+  }
+
    return SDValue();
  }
  
  static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
                                          const X86TargetLowering *XTLI) {
    SDValue Op0 = N->getOperand(0);
+  EVT InVT = Op0->getValueType(0);
+
+  // SINT_TO_FP(v4i8) -> SINT_TO_FP(SEXT(v4i8 to v4i32))
+  if (InVT == MVT::v8i8 || InVT == MVT::v4i8) {
+    DebugLoc dl = N->getDebugLoc();
+    MVT DstVT = InVT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32;
+    SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
+    return DAG.getNode(ISD::SINT_TO_FP, dl, N->getValueType(0), P);
+  }
+
    // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
    // a 32-bit target where SSE doesn't support i64->FP operations.
    if (Op0.getOpcode() == ISD::LOAD) {
@@ -14778,6 +15839,20 @@ static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
    return SDValue();
  }
  
+static SDValue PerformFP_TO_SINTCombine(SDNode *N, SelectionDAG &DAG) {
+  EVT VT = N->getValueType(0);
+
+  // v4i8 = FP_TO_SINT() -> v4i8 = TRUNCATE (V4i32 = FP_TO_SINT()
+  if (VT == MVT::v8i8 || VT == MVT::v4i8) {
+    DebugLoc dl = N->getDebugLoc();
+    MVT DstVT = VT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32;
+    SDValue I = DAG.getNode(ISD::FP_TO_SINT, dl, DstVT, N->getOperand(0));
+    return DAG.getNode(ISD::TRUNCATE, dl, VT, I);
+  }
+
+  return SDValue();
+}
+
  // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
  static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG,
                                   X86TargetLowering::DAGCombinerInfo &DCI) {
@@ -14898,10 +15973,10 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
    switch (N->getOpcode()) {
    default: break;
    case ISD::EXTRACT_VECTOR_ELT:
-    return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, *this);
+    return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, DCI);
    case ISD::VSELECT:
    case ISD::SELECT:         return PerformSELECTCombine(N, DAG, DCI, Subtarget);
-  case X86ISD::CMOV:        return PerformCMOVCombine(N, DAG, DCI);
+  case X86ISD::CMOV:        return PerformCMOVCombine(N, DAG, DCI, Subtarget);
    case ISD::ADD:            return PerformAddCombine(N, DAG, Subtarget);
    case ISD::SUB:            return PerformSubCombine(N, DAG, Subtarget);
    case X86ISD::ADC:         return PerformADCCombine(N, DAG, DCI);
@@ -14912,20 +15987,27 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
    case ISD::AND:            return PerformAndCombine(N, DAG, DCI, Subtarget);
    case ISD::OR:             return PerformOrCombine(N, DAG, DCI, Subtarget);
    case ISD::XOR:            return PerformXorCombine(N, DAG, DCI, Subtarget);
-  case ISD::LOAD:           return PerformLOADCombine(N, DAG, Subtarget);
+  case ISD::LOAD:           return PerformLOADCombine(N, DAG, DCI, Subtarget);
    case ISD::STORE:          return PerformSTORECombine(N, DAG, Subtarget);
+  case ISD::UINT_TO_FP:     return PerformUINT_TO_FPCombine(N, DAG);
    case ISD::SINT_TO_FP:     return PerformSINT_TO_FPCombine(N, DAG, this);
+  case ISD::FP_TO_SINT:     return PerformFP_TO_SINTCombine(N, DAG);
    case ISD::FADD:           return PerformFADDCombine(N, DAG, Subtarget);
    case ISD::FSUB:           return PerformFSUBCombine(N, DAG, Subtarget);
    case X86ISD::FXOR:
    case X86ISD::FOR:         return PerformFORCombine(N, DAG);
+  case X86ISD::FMIN:
+  case X86ISD::FMAX:        return PerformFMinFMaxCombine(N, DAG);
    case X86ISD::FAND:        return PerformFANDCombine(N, DAG);
    case X86ISD::BT:          return PerformBTCombine(N, DAG, DCI);
    case X86ISD::VZEXT_MOVL:  return PerformVZEXT_MOVLCombine(N, DAG);
-  case ISD::ZERO_EXTEND:    return PerformZExtCombine(N, DAG, Subtarget);
+  case ISD::ANY_EXTEND:
+  case ISD::ZERO_EXTEND:    return PerformZExtCombine(N, DAG, DCI, Subtarget);
    case ISD::SIGN_EXTEND:    return PerformSExtCombine(N, DAG, DCI, Subtarget);
    case ISD::TRUNCATE:       return PerformTruncateCombine(N, DAG, DCI);
-  case X86ISD::SETCC:       return PerformSETCCCombine(N, DAG);
+  case ISD::SETCC:          return PerformISDSETCCCombine(N, DAG);
+  case X86ISD::SETCC:       return PerformSETCCCombine(N, DAG, DCI, Subtarget);
+  case X86ISD::BRCOND:      return PerformBrCondCombine(N, DAG, DCI, Subtarget);
    case X86ISD::SHUFP:       // Handle all target specific shuffles
    case X86ISD::PALIGN:
    case X86ISD::UNPCKH:
@@ -14940,6 +16022,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
    case X86ISD::VPERMILP:
    case X86ISD::VPERM2X128:
    case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget);
+  case ISD::FMA:            return PerformFMACombine(N, DAG, Subtarget);
    }
  
    return SDValue();
@@ -15469,55 +16552,55 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
        // in the normal allocation?
      case 'q':   // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
        if (Subtarget->is64Bit()) {
-       if (VT == MVT::i32 || VT == MVT::f32)
-         return std::make_pair(0U, X86::GR32RegisterClass);
-       else if (VT == MVT::i16)
-         return std::make_pair(0U, X86::GR16RegisterClass);
-       else if (VT == MVT::i8 || VT == MVT::i1)
-         return std::make_pair(0U, X86::GR8RegisterClass);
-       else if (VT == MVT::i64 || VT == MVT::f64)
-         return std::make_pair(0U, X86::GR64RegisterClass);
-       break;
+        if (VT == MVT::i32 || VT == MVT::f32)
+          return std::make_pair(0U, &X86::GR32RegClass);
+        if (VT == MVT::i16)
+          return std::make_pair(0U, &X86::GR16RegClass);
+        if (VT == MVT::i8 || VT == MVT::i1)
+          return std::make_pair(0U, &X86::GR8RegClass);
+        if (VT == MVT::i64 || VT == MVT::f64)
+          return std::make_pair(0U, &X86::GR64RegClass);
+        break;
        }
        // 32-bit fallthrough
      case 'Q':   // Q_REGS
        if (VT == MVT::i32 || VT == MVT::f32)
-       return std::make_pair(0U, X86::GR32_ABCDRegisterClass);
-      else if (VT == MVT::i16)
-       return std::make_pair(0U, X86::GR16_ABCDRegisterClass);
-      else if (VT == MVT::i8 || VT == MVT::i1)
-       return std::make_pair(0U, X86::GR8_ABCD_LRegisterClass);
-      else if (VT == MVT::i64)
-       return std::make_pair(0U, X86::GR64_ABCDRegisterClass);
+        return std::make_pair(0U, &X86::GR32_ABCDRegClass);
+      if (VT == MVT::i16)
+        return std::make_pair(0U, &X86::GR16_ABCDRegClass);
+      if (VT == MVT::i8 || VT == MVT::i1)
+        return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
+      if (VT == MVT::i64)
+        return std::make_pair(0U, &X86::GR64_ABCDRegClass);
        break;
      case 'r':   // GENERAL_REGS
      case 'l':   // INDEX_REGS
        if (VT == MVT::i8 || VT == MVT::i1)
-        return std::make_pair(0U, X86::GR8RegisterClass);
+        return std::make_pair(0U, &X86::GR8RegClass);
        if (VT == MVT::i16)
-        return std::make_pair(0U, X86::GR16RegisterClass);
+        return std::make_pair(0U, &X86::GR16RegClass);
        if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget->is64Bit())
-        return std::make_pair(0U, X86::GR32RegisterClass);
-      return std::make_pair(0U, X86::GR64RegisterClass);
+        return std::make_pair(0U, &X86::GR32RegClass);
+      return std::make_pair(0U, &X86::GR64RegClass);
      case 'R':   // LEGACY_REGS
        if (VT == MVT::i8 || VT == MVT::i1)
-        return std::make_pair(0U, X86::GR8_NOREXRegisterClass);
+        return std::make_pair(0U, &X86::GR8_NOREXRegClass);
        if (VT == MVT::i16)
-        return std::make_pair(0U, X86::GR16_NOREXRegisterClass);
+        return std::make_pair(0U, &X86::GR16_NOREXRegClass);
        if (VT == MVT::i32 || !Subtarget->is64Bit())
-        return std::make_pair(0U, X86::GR32_NOREXRegisterClass);
-      return std::make_pair(0U, X86::GR64_NOREXRegisterClass);
+        return std::make_pair(0U, &X86::GR32_NOREXRegClass);
+      return std::make_pair(0U, &X86::GR64_NOREXRegClass);
      case 'f':  // FP Stack registers.
        // If SSE is enabled for this VT, use f80 to ensure the isel moves the
        // value to the correct fpstack register class.
        if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
-        return std::make_pair(0U, X86::RFP32RegisterClass);
+        return std::make_pair(0U, &X86::RFP32RegClass);
        if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
-        return std::make_pair(0U, X86::RFP64RegisterClass);
-      return std::make_pair(0U, X86::RFP80RegisterClass);
+        return std::make_pair(0U, &X86::RFP64RegClass);
+      return std::make_pair(0U, &X86::RFP80RegClass);
      case 'y':   // MMX_REGS if MMX allowed.
        if (!Subtarget->hasMMX()) break;
-      return std::make_pair(0U, X86::VR64RegisterClass);
+      return std::make_pair(0U, &X86::VR64RegClass);
      case 'Y':   // SSE_REGS if SSE2 allowed
        if (!Subtarget->hasSSE2()) break;
        // FALL THROUGH.
@@ -15529,10 +16612,10 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
        // Scalar SSE types.
        case MVT::f32:
        case MVT::i32:
-        return std::make_pair(0U, X86::FR32RegisterClass);
+        return std::make_pair(0U, &X86::FR32RegClass);
        case MVT::f64:
        case MVT::i64:
-        return std::make_pair(0U, X86::FR64RegisterClass);
+        return std::make_pair(0U, &X86::FR64RegClass);
        // Vector types.
        case MVT::v16i8:
        case MVT::v8i16:
@@ -15540,7 +16623,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
        case MVT::v2i64:
        case MVT::v4f32:
        case MVT::v2f64:
-        return std::make_pair(0U, X86::VR128RegisterClass);
+        return std::make_pair(0U, &X86::VR128RegClass);
        // AVX types.
        case MVT::v32i8:
        case MVT::v16i16:
@@ -15548,8 +16631,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
        case MVT::v4i64:
        case MVT::v8f32:
        case MVT::v4f64:
-        return std::make_pair(0U, X86::VR256RegisterClass);
-        
+        return std::make_pair(0U, &X86::VR256RegClass);
        }
        break;
      }
@@ -15572,28 +16654,28 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
          Constraint[6] == '}') {
  
        Res.first = X86::ST0+Constraint[4]-'0';
-      Res.second = X86::RFP80RegisterClass;
+      Res.second = &X86::RFP80RegClass;
        return Res;
      }
  
      // GCC allows "st(0)" to be called just plain "st".
      if (StringRef("{st}").equals_lower(Constraint)) {
        Res.first = X86::ST0;
-      Res.second = X86::RFP80RegisterClass;
+      Res.second = &X86::RFP80RegClass;
        return Res;
      }
  
      // flags -> EFLAGS
      if (StringRef("{flags}").equals_lower(Constraint)) {
        Res.first = X86::EFLAGS;
-      Res.second = X86::CCRRegisterClass;
+      Res.second = &X86::CCRRegClass;
        return Res;
      }
  
      // 'A' means EAX + EDX.
      if (Constraint == "A") {
        Res.first = X86::EAX;
-      Res.second = X86::GR32_ADRegisterClass;
+      Res.second = &X86::GR32_ADRegClass;
        return Res;
      }
      return Res;
@@ -15609,7 +16691,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
    // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp".  If we
    // really want an 8-bit or 32-bit register, map to the appropriate register
    // class and return the appropriate register.
-  if (Res.second == X86::GR16RegisterClass) {
+  if (Res.second == &X86::GR16RegClass) {
      if (VT == MVT::i8) {
        unsigned DestReg = 0;
        switch (Res.first) {
@@ -15621,7 +16703,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
        }
        if (DestReg) {
          Res.first = DestReg;
-        Res.second = X86::GR8RegisterClass;
+        Res.second = &X86::GR8RegClass;
        }
      } else if (VT == MVT::i32) {
        unsigned DestReg = 0;
@@ -15638,7 +16720,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
        }
        if (DestReg) {
          Res.first = DestReg;
-        Res.second = X86::GR32RegisterClass;
+        Res.second = &X86::GR32RegClass;
        }
      } else if (VT == MVT::i64) {
        unsigned DestReg = 0;
@@ -15655,22 +16737,25 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
        }
        if (DestReg) {
          Res.first = DestReg;
-        Res.second = X86::GR64RegisterClass;
+        Res.second = &X86::GR64RegClass;
        }
      }
-  } else if (Res.second == X86::FR32RegisterClass ||
-             Res.second == X86::FR64RegisterClass ||
-             Res.second == X86::VR128RegisterClass) {
+  } else if (Res.second == &X86::FR32RegClass ||
+             Res.second == &X86::FR64RegClass ||
+             Res.second == &X86::VR128RegClass) {
      // Handle references to XMM physical registers that got mapped into the
      // wrong class.  This can happen with constraints like {xmm0} where the
      // target independent register mapper will just pick the first match it can
      // find, ignoring the required type.
-    if (VT == MVT::f32)
-      Res.second = X86::FR32RegisterClass;
-    else if (VT == MVT::f64)
-      Res.second = X86::FR64RegisterClass;
-    else if (X86::VR128RegisterClass->hasType(VT))
-      Res.second = X86::VR128RegisterClass;
+
+    if (VT == MVT::f32 || VT == MVT::i32)
+      Res.second = &X86::FR32RegClass;
+    else if (VT == MVT::f64 || VT == MVT::i64)
+      Res.second = &X86::FR64RegClass;
+    else if (X86::VR128RegClass.hasType(VT))
+      Res.second = &X86::VR128RegClass;
+    else if (X86::VR256RegClass.hasType(VT))
+      Res.second = &X86::VR256RegClass;
    }
  
    return Res;