Introduce a new function to lower 256-bit vectors which are not

[oota-llvm.git] / lib / Target / X86 / X86ISelLowering.cpp
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index f953bf2bc7f5ccb71176e9df162f1167ba4db5b8..85c6f4923510935175adc5258f3a35663cd924f3 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -85,14 +85,10 @@ static SDValue Extract128BitVector(SDValue Vec,
                                     DebugLoc dl) {
    EVT VT = Vec.getValueType();
    assert(VT.getSizeInBits() == 256 && "Unexpected vector size!");
-
    EVT ElVT = VT.getVectorElementType();
-
-  int Factor = VT.getSizeInBits() / 128;
-
-  EVT ResultVT = EVT::getVectorVT(*DAG.getContext(),
-                                  ElVT,
-                                  VT.getVectorNumElements() / Factor);
+  int Factor = VT.getSizeInBits()/128;
+  EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
+                                  VT.getVectorNumElements()/Factor);
  
    // Extract from UNDEF is UNDEF.
    if (Vec.getOpcode() == ISD::UNDEF)
@@ -111,7 +107,6 @@ static SDValue Extract128BitVector(SDValue Vec,
                                   * ElemsPerChunk);
  
      SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32);
-
      SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec,
                                   VecIdx);
  
@@ -136,21 +131,18 @@ static SDValue Insert128BitVector(SDValue Result,
      assert(VT.getSizeInBits() == 128 && "Unexpected vector size!");
  
      EVT ElVT = VT.getVectorElementType();
-
      unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
-
      EVT ResultVT = Result.getValueType();
  
      // Insert the relevant 128 bits.
-    unsigned ElemsPerChunk = 128 / ElVT.getSizeInBits();
+    unsigned ElemsPerChunk = 128/ElVT.getSizeInBits();
  
      // This is the index of the first element of the 128-bit chunk
      // we want.
-    unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / 128)
+    unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/128)
                                   * ElemsPerChunk);
  
      SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32);
-
      Result = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec,
                           VecIdx);
      return Result;
@@ -197,11 +189,8 @@ static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
      return new TargetLoweringObjectFileMachO();
    }
  
-  if (Subtarget->isTargetELF()) {
-    if (is64Bit)
-      return new X8664_ELFTargetObjectFile(TM);
-    return new X8632_ELFTargetObjectFile(TM);
-  }
+  if (Subtarget->isTargetELF())
+    return new TargetLoweringObjectFileELF();
    if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho())
      return new TargetLoweringObjectFileCOFF();
    llvm_unreachable("unknown subtarget type");
@@ -981,14 +970,14 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
      setOperationAction(ISD::VSETCC,             MVT::v2i64, Custom);
  
    if (!UseSoftFloat && Subtarget->hasAVX()) {
-    addRegisterClass(MVT::v8f32, X86::VR256RegisterClass);
-    addRegisterClass(MVT::v4f64, X86::VR256RegisterClass);
-    addRegisterClass(MVT::v8i32, X86::VR256RegisterClass);
-    addRegisterClass(MVT::v4i64, X86::VR256RegisterClass);
-    addRegisterClass(MVT::v32i8, X86::VR256RegisterClass);
+    addRegisterClass(MVT::v32i8,  X86::VR256RegisterClass);
+    addRegisterClass(MVT::v16i16, X86::VR256RegisterClass);
+    addRegisterClass(MVT::v8i32,  X86::VR256RegisterClass);
+    addRegisterClass(MVT::v8f32,  X86::VR256RegisterClass);
+    addRegisterClass(MVT::v4i64,  X86::VR256RegisterClass);
+    addRegisterClass(MVT::v4f64,  X86::VR256RegisterClass);
  
      setOperationAction(ISD::LOAD,               MVT::v8f32, Legal);
-    setOperationAction(ISD::LOAD,               MVT::v8i32, Legal);
      setOperationAction(ISD::LOAD,               MVT::v4f64, Legal);
      setOperationAction(ISD::LOAD,               MVT::v4i64, Legal);
  
@@ -1006,63 +995,58 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
      setOperationAction(ISD::FSQRT,              MVT::v4f64, Legal);
      setOperationAction(ISD::FNEG,               MVT::v4f64, Custom);
  
-    // Custom lower build_vector, vector_shuffle, scalar_to_vector,
-    // insert_vector_elt extract_subvector and extract_vector_elt for
-    // 256-bit types.
+    // Custom lower several nodes for 256-bit types.
      for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
-         i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE;
-         ++i) {
-      MVT::SimpleValueType VT = (MVT::SimpleValueType)i;
-      // Do not attempt to custom lower non-256-bit vectors
-      if (!isPowerOf2_32(MVT(VT).getVectorNumElements())
-          || (MVT(VT).getSizeInBits() < 256))
-        continue;
-      setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
-      setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
-      setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
-      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
-      setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
-    }
-    // Custom-lower insert_subvector and extract_subvector based on
-    // the result type.
-    for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
-         i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE;
-         ++i) {
-      MVT::SimpleValueType VT = (MVT::SimpleValueType)i;
-      // Do not attempt to custom lower non-256-bit vectors
-      if (!isPowerOf2_32(MVT(VT).getVectorNumElements()))
+                  i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) {
+      MVT::SimpleValueType SVT = (MVT::SimpleValueType)i;
+      EVT VT = SVT;
+
+      // Extract subvector is special because the value type
+      // (result) is 128-bit but the source is 256-bit wide.
+      if (VT.is128BitVector())
+        setOperationAction(ISD::EXTRACT_SUBVECTOR, SVT, Custom);
+
+      // Do not attempt to custom lower other non-256-bit vectors
+      if (!VT.is256BitVector())
          continue;
  
-      if (MVT(VT).getSizeInBits() == 128) {
-        setOperationAction(ISD::EXTRACT_SUBVECTOR,  VT, Custom);
-      }
-      else if (MVT(VT).getSizeInBits() == 256) {
-        setOperationAction(ISD::INSERT_SUBVECTOR,  VT, Custom);
-      }
+      setOperationAction(ISD::BUILD_VECTOR,       SVT, Custom);
+      setOperationAction(ISD::VECTOR_SHUFFLE,     SVT, Custom);
+      setOperationAction(ISD::INSERT_VECTOR_ELT,  SVT, Custom);
+      setOperationAction(ISD::EXTRACT_VECTOR_ELT, SVT, Custom);
+      setOperationAction(ISD::SCALAR_TO_VECTOR,   SVT, Custom);
+      setOperationAction(ISD::INSERT_SUBVECTOR,   SVT, Custom);
      }
  
      // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
-    // Don't promote loads because we need them for VPERM vector index versions.
+    for (unsigned i = (unsigned)MVT::v32i8; i != (unsigned)MVT::v4i64; ++i) {
+      MVT::SimpleValueType SVT = (MVT::SimpleValueType)i;
+      EVT VT = SVT;
  
-    for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
-         VT != (unsigned)MVT::LAST_VECTOR_VALUETYPE;
-         VT++) {
-      if (!isPowerOf2_32(MVT((MVT::SimpleValueType)VT).getVectorNumElements())
-          || (MVT((MVT::SimpleValueType)VT).getSizeInBits() < 256))
+      // Do not attempt to promote non-256-bit vectors
+      if (!VT.is256BitVector())
          continue;
-      setOperationAction(ISD::AND,    (MVT::SimpleValueType)VT, Promote);
-      AddPromotedToType (ISD::AND,    (MVT::SimpleValueType)VT, MVT::v4i64);
-      setOperationAction(ISD::OR,     (MVT::SimpleValueType)VT, Promote);
-      AddPromotedToType (ISD::OR,     (MVT::SimpleValueType)VT, MVT::v4i64);
-      setOperationAction(ISD::XOR,    (MVT::SimpleValueType)VT, Promote);
-      AddPromotedToType (ISD::XOR,    (MVT::SimpleValueType)VT, MVT::v4i64);
-      //setOperationAction(ISD::LOAD,   (MVT::SimpleValueType)VT, Promote);
-      //AddPromotedToType (ISD::LOAD,   (MVT::SimpleValueType)VT, MVT::v4i64);
-      setOperationAction(ISD::SELECT, (MVT::SimpleValueType)VT, Promote);
-      AddPromotedToType (ISD::SELECT, (MVT::SimpleValueType)VT, MVT::v4i64);
+
+      setOperationAction(ISD::AND,    SVT, Promote);
+      AddPromotedToType (ISD::AND,    SVT, MVT::v4i64);
+      setOperationAction(ISD::OR,     SVT, Promote);
+      AddPromotedToType (ISD::OR,     SVT, MVT::v4i64);
+      setOperationAction(ISD::XOR,    SVT, Promote);
+      AddPromotedToType (ISD::XOR,    SVT, MVT::v4i64);
+      setOperationAction(ISD::LOAD,   SVT, Promote);
+      AddPromotedToType (ISD::LOAD,   SVT, MVT::v4i64);
+      setOperationAction(ISD::SELECT, SVT, Promote);
+      AddPromotedToType (ISD::SELECT, SVT, MVT::v4i64);
      }
    }
  
+  // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion
+  // of this type with custom code.
+  for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
+         VT != (unsigned)MVT::LAST_VECTOR_VALUETYPE; VT++) {
+    setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT, Custom);
+  }
+
    // We want to custom lower some of our intrinsics.
    setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  
@@ -1137,18 +1121,18 @@ MVT::SimpleValueType X86TargetLowering::getSetCCResultType(EVT VT) const {
  
  /// getMaxByValAlign - Helper for getByValTypeAlignment to determine
  /// the desired ByVal argument alignment.
-static void getMaxByValAlign(const Type *Ty, unsigned &MaxAlign) {
+static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
    if (MaxAlign == 16)
      return;
-  if (const VectorType *VTy = dyn_cast<VectorType>(Ty)) {
+  if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
      if (VTy->getBitWidth() == 128)
        MaxAlign = 16;
-  } else if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
+  } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
      unsigned EltAlign = 0;
      getMaxByValAlign(ATy->getElementType(), EltAlign);
      if (EltAlign > MaxAlign)
        MaxAlign = EltAlign;
-  } else if (const StructType *STy = dyn_cast<StructType>(Ty)) {
+  } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
      for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
        unsigned EltAlign = 0;
        getMaxByValAlign(STy->getElementType(i), EltAlign);
@@ -1165,7 +1149,7 @@ static void getMaxByValAlign(const Type *Ty, unsigned &MaxAlign) {
  /// function arguments in the caller parameter area. For X86, aggregates
  /// that contain SSE vectors are placed at 16-byte boundaries while the rest
  /// are at 4-byte boundaries.
-unsigned X86TargetLowering::getByValTypeAlignment(const Type *Ty) const {
+unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
    if (Subtarget->is64Bit()) {
      // Max of 8 and alignment of type.
      unsigned TyAlign = TD->getABITypeAlignment(Ty);
@@ -2764,6 +2748,7 @@ static bool isTargetShuffle(unsigned Opcode) {
    case X86ISD::PUNPCKHBW:
    case X86ISD::PUNPCKHDQ:
    case X86ISD::PUNPCKHQDQ:
+  case X86ISD::VPERMIL:
      return true;
    }
    return false;
@@ -2789,6 +2774,7 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
    case X86ISD::PSHUFD:
    case X86ISD::PSHUFHW:
    case X86ISD::PSHUFLW:
+  case X86ISD::VPERMIL:
      return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8));
    }
  
@@ -3439,6 +3425,54 @@ bool X86::isMOVLMask(ShuffleVectorSDNode *N) {
    return ::isMOVLMask(M, N->getValueType(0));
  }
  
+/// isVPERMILMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to VPERMIL*.
+///
+/// \param Mask  shuffle mask, one entry per result element; negative entries
+///              are treated as undef.
+/// \param VT    vector type being shuffled.
+///
+/// For 256-bit vectors every defined index must stay inside its own 128-bit
+/// lane, and the high lane must repeat the low lane's permutation (offset by
+/// the lane size) wherever both entries are defined.
+static bool isVPERMILMask(const SmallVectorImpl<int> &Mask, EVT VT) {
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned NumLanes = VT.getSizeInBits()/128;
+
+  // Match any permutation of 128-bit vector with 32/64-bit types
+  // NOTE(review): this path accepts every mask without inspecting it,
+  // including indices that would select from the second shuffle operand.
+  // Confirm that callers filter those out.
+  if (NumLanes == 1) {
+    if (NumElts == 4 || NumElts == 2)
+      return true;
+    return false;
+  }
+
+  // Only match 256-bit with 32/64-bit types. Checking the lane count also
+  // keeps vectors narrower than 128 bits (NumLanes == 0) away from the
+  // division below.
+  if (NumLanes != 2 || (NumElts != 8 && NumElts != 4))
+    return false;
+
+  int LaneSize = NumElts/NumLanes;
+  for (int i = 0; i < LaneSize; ++i) {
+    int LowElt = Mask[i];
+    int HighElt = Mask[i+LaneSize];
+
+    // A defined index must not cross lanes (or reach into the second shuffle
+    // operand): the low lane may only read [0, LaneSize) and the high lane
+    // may only read [LaneSize, 2*LaneSize).
+    if (LowElt >= LaneSize)
+      return false;
+    if (HighElt >= 0 && (HighElt < LaneSize || HighElt >= 2*LaneSize))
+      return false;
+
+    // The mask on the high lane should be the same as the low. Actually,
+    // they can differ if any of the corresponding index in a lane is undef.
+    if (LowElt < 0 || HighElt < 0)
+      continue;
+
+    if (HighElt-LowElt != LaneSize)
+      return false;
+  }
+
+  return true;
+}
+
+/// getShuffleVPERMILImmediate - Return the appropriate immediate to shuffle
+/// the specified VECTOR_SHUFFLE mask with VPERMIL* instructions.
+///
+/// \param N  the node to encode; it must be a ShuffleVectorSDNode.
+/// \returns  the immediate, holding one 2-bit lane-relative index per
+///           element position of a 128-bit lane.
+///
+/// NOTE(review): two bits per element matches lanes of four elements. Types
+/// with two elements per lane would need a different encoding; confirm
+/// before using the result for them.
+static unsigned getShuffleVPERMILImmediate(SDNode *N) {
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+  EVT VT = SVOp->getValueType(0);
+
+  int NumElts = VT.getVectorNumElements();
+  int NumLanes = VT.getSizeInBits()/128;
+  int LaneSize = NumElts/NumLanes;
+
+  // Walk every lane rather than only the first one: a position that is undef
+  // in the low lane may still be defined in the high lane, and its index then
+  // has to come from there. Where both lanes define a position they agree
+  // once reduced to a lane-relative index, so OR-ing them is harmless.
+  unsigned Mask = 0;
+  for (int l = 0; l < NumLanes; ++l) {
+    int LaneStart = l*LaneSize;
+    for (int i = 0; i < LaneSize; ++i) {
+      int MaskElt = SVOp->getMaskElt(LaneStart+i);
+      // Undef entries are negative; leave their immediate bits as zero
+      // instead of shifting a negative value into the result.
+      if (MaskElt < 0)
+        continue;
+
+      // Reduce to an index relative to the start of the lane.
+      MaskElt %= LaneSize;
+      Mask |= MaskElt << (i*2);
+    }
+  }
+
+  return Mask;
+}
+
  /// isCommutedMOVL - Returns true if the shuffle mask is except the reverse
  /// of what x86 movss want. X86 movs requires the lowest  element to be lowest
  /// element of vector 2 and the other elements to come from vector 1 in order.
@@ -3650,7 +3684,6 @@ unsigned X86::getExtractVEXTRACTF128Immediate(SDNode *N) {
    EVT ElVT = VecVT.getVectorElementType();
  
    unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits();
-
    return Index / NumElemsPerChunk;
  }
  
@@ -3668,7 +3701,6 @@ unsigned X86::getInsertVINSERTF128Immediate(SDNode *N) {
    EVT ElVT = VecVT.getVectorElementType();
  
    unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits();
-
    return Index / NumElemsPerChunk;
  }
  
@@ -3844,19 +3876,24 @@ static SDValue getZeroVector(EVT VT, bool HasSSE2, SelectionDAG &DAG,
  }
  
  /// getOnesVector - Returns a vector of specified type with all bits set.
-///
+/// Always build ones vectors as <4 x i32> or <8 x i32> bitcasted to
+/// their original type, ensuring they get CSE'd.
  static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) {
    assert(VT.isVector() && "Expected a vector type");
+  assert((VT.is128BitVector() || VT.is256BitVector())
+         && "Expected a 128-bit or 256-bit vector type");
  
-  // Always build ones vectors as <4 x i32> or <2 x i32> bitcasted to their dest
-  // type.  This ensures they get CSE'd.
    SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
+
    SDValue Vec;
-  Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
+  if (VT.is256BitVector()) {
+    SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
+    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, 8);
+  } else
+    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
    return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
  }
  
-
  /// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
  /// that point to V2 points to its first element.
  static SDValue NormalizeMask(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
@@ -3903,7 +3940,7 @@ static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
    return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
  }
  
-/// getUnpackhMask - Returns a vector_shuffle node for an unpackh operation.
+/// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
  static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
                            SDValue V2) {
    unsigned NumElems = VT.getVectorNumElements();
@@ -3916,31 +3953,89 @@ static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
    return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
  }
  
-/// PromoteSplat - Promote a splat of v4i32, v8i16 or v16i8 to v4f32.
-static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
-  EVT PVT = MVT::v4f32;
-  EVT VT = SV->getValueType(0);
-  DebugLoc dl = SV->getDebugLoc();
-  SDValue V1 = SV->getOperand(0);
+// PromoteSplatv8v16 - All i16 and i8 vector types can't be used directly by
+// a generic shuffle instruction because the target has no such instructions.
+// Generate shuffles which repeat i16 and i8 several times until they can be
+// represented by v4f32 and then be manipulated by target supported shuffles.
+static SDValue PromoteSplatv8v16(SDValue V, SelectionDAG &DAG, int &EltNo) {
+  EVT VT = V.getValueType();
    int NumElems = VT.getVectorNumElements();
-  int EltNo = SV->getSplatIndex();
+  DebugLoc dl = V.getDebugLoc();
  
-  // unpack elements to the correct location
    while (NumElems > 4) {
      if (EltNo < NumElems/2) {
-      V1 = getUnpackl(DAG, dl, VT, V1, V1);
+      V = getUnpackl(DAG, dl, VT, V, V);
      } else {
-      V1 = getUnpackh(DAG, dl, VT, V1, V1);
+      V = getUnpackh(DAG, dl, VT, V, V);
        EltNo -= NumElems/2;
      }
      NumElems >>= 1;
    }
+  return V;
+}
+
+/// getLegalSplat - Build a splat of element EltNo of V out of shuffles the
+/// x86 backend supports. V is bitcast to v4f32 (128-bit) or v8f32 (256-bit),
+/// shuffled against undef, and bitcast back to its original type.
+static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
+  EVT VT = V.getValueType();
+  DebugLoc dl = V.getDebugLoc();
+  unsigned SizeInBits = VT.getSizeInBits();
+  assert((SizeInBits == 128 || SizeInBits == 256)
+         && "Vector size not supported");
+
+  EVT NVT = (SizeInBits == 128) ? MVT::v4f32 : MVT::v8f32;
+  SDValue Src = DAG.getNode(ISD::BITCAST, dl, NVT, V);
+
+  // One mask serves both widths; the shuffle consumes one entry per element
+  // of NVT, so a v4f32 splat only looks at the first four entries. For v8f32
+  // the second half of the indices refers to the higher part, which is a
+  // duplication of the lower one. This makes the shuffle a perfect match for
+  // the VPERM instruction.
+  int HiEltNo = EltNo+4;
+  int SplatMask[8] = { EltNo, EltNo, EltNo, EltNo,
+                       HiEltNo, HiEltNo, HiEltNo, HiEltNo };
+  SDValue Splat = DAG.getVectorShuffle(NVT, dl, Src, DAG.getUNDEF(NVT),
+                                       &SplatMask[0]);
+
+  return DAG.getNode(ISD::BITCAST, dl, VT, Splat);
+}
+
+/// PromoteSplat - Promote a splat of v4i32, v8i16 or v16i8 to v4f32 and
+/// v8i32, v16i16 or v32i8 to v8f32.
+///
+/// \param SV   the splat shuffle to promote; its splat index selects the
+///             element of operand 0 that is replicated.
+/// \param DAG  the DAG in which the replacement nodes are created.
+/// \returns    the value produced by getLegalSplat for the prepared vector.
+static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
+  EVT SrcVT = SV->getValueType(0);
+  SDValue V1 = SV->getOperand(0);
+  DebugLoc dl = SV->getDebugLoc();
+
+  int EltNo = SV->getSplatIndex();
+  int NumElems = SrcVT.getVectorNumElements();
+  unsigned Size = SrcVT.getSizeInBits();
+
+  // Extract the 128-bit part containing the splat element and update
+  // the splat element index when it refers to the higher register.
+  if (Size == 256) {
+    // Element NumElems/2 is the first element of the upper half, so the
+    // comparison has to be inclusive. With a strict '>' that element would
+    // select the lower half and keep an index that is out of range for it.
+    unsigned Idx = (EltNo >= NumElems/2) ? NumElems/2 : 0;
+    V1 = Extract128BitVector(V1, DAG.getConstant(Idx, MVT::i32), DAG, dl);
+    if (Idx > 0)
+      EltNo -= NumElems/2;
+  }
+
+  // Make this 128-bit vector duplicate i8 and i16 elements
+  if (NumElems > 4)
+    V1 = PromoteSplatv8v16(V1, DAG, EltNo);
+
+  // Recreate the 256-bit vector and place the same 128-bit vector
+  // into the low and high part. This is necessary because we want
+  // to use VPERM to shuffle the v8f32 vector, and VPERM only shuffles
+  // inside each separate v4f32 lane.
+  if (Size == 256) {
+    SDValue InsV = Insert128BitVector(DAG.getUNDEF(SrcVT), V1,
+                         DAG.getConstant(0, MVT::i32), DAG, dl);
+    V1 = Insert128BitVector(InsV, V1,
+               DAG.getConstant(NumElems/2, MVT::i32), DAG, dl);
+  }
+
+  return getLegalSplat(DAG, V1, EltNo);
+}
  
  /// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
@@ -4053,6 +4148,10 @@ static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG,
        return getShuffleScalarElt(V.getOperand(OpNum).getNode(), Index, DAG,
                                   Depth+1);
      }
+    case X86ISD::VPERMIL:
+      ImmN = N->getOperand(N->getNumOperands()-1);
+      DecodeVPERMILMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(),
+                        ShuffleMask);
      default:
        assert("not implemented for target shuffle node");
        return SDValue();
@@ -4471,17 +4570,17 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
      return ConcatVectors(Lower, Upper, DAG);
    }
  
-  // All zero's are handled with pxor in SSE2 and above, xorps in SSE1.
-  // All one's are handled with pcmpeqd. In AVX, zero's are handled with
-  // vpxor in 128-bit and xor{pd,ps} in 256-bit, but no 256 version of pcmpeqd
-  // is present, so AllOnes is ignored.
+  // All zero's:
+  //  - pxor (SSE2), xorps (SSE1), vpxor (128 AVX), xorp[s|d] (256 AVX)
+  // All one's:
+  //  - pcmpeqd (SSE2 and 128 AVX), fallback to constant pools (256 AVX)
    if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
-      (Op.getValueType().getSizeInBits() != 256 &&
-       ISD::isBuildVectorAllOnes(Op.getNode()))) {
-    // Canonicalize this to <4 x i32> (SSE) to
+      ISD::isBuildVectorAllOnes(Op.getNode())) {
+    // Canonicalize this to <4 x i32> or <8 x i32> (SSE) to
      // 1) ensure the zero vectors are CSE'd, and 2) ensure that i64 scalars are
      // eliminated on x86-32 hosts.
-    if (Op.getValueType() == MVT::v4i32)
+    if (Op.getValueType() == MVT::v4i32 ||
+        Op.getValueType() == MVT::v8i32)
        return Op;
  
      if (ISD::isBuildVectorAllOnes(Op.getNode()))
@@ -5277,15 +5376,24 @@ static SDValue getVZextMovL(EVT VT, EVT OpVT,
                                               OpVT, SrcOp)));
  }
  
-/// LowerVECTOR_SHUFFLE_4wide - Handle all 4 wide cases with a number of
-/// shuffles.
+/// LowerVECTOR_SHUFFLE_256 - Handle all 256-bit wide vectors shuffles
+/// which could not be matched by any known target specific shuffle.
+///
+/// This is currently a placeholder: it ignores both arguments and always
+/// returns an empty SDValue.
+static SDValue
+LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
+  return SDValue();
+}
+
+/// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with
+/// 4 elements, and match them with several different shuffle types.
  static SDValue
-LowerVECTOR_SHUFFLE_4wide(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
+LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
    SDValue V1 = SVOp->getOperand(0);
    SDValue V2 = SVOp->getOperand(1);
    DebugLoc dl = SVOp->getDebugLoc();
    EVT VT = SVOp->getValueType(0);
  
+  assert(VT.getSizeInBits() == 128 && "Unsupported vector size");
+
    SmallVector<std::pair<int, int>, 8> Locs;
    Locs.resize(4);
    SmallVector<int, 8> Mask1(4U, -1);
@@ -5677,19 +5785,24 @@ SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG,
  
    // Handle splat operations
    if (SVOp->isSplat()) {
-    // Special case, this is the only place now where it's
-    // allowed to return a vector_shuffle operation without
-    // using a target specific node, because *hopefully* it
-    // will be optimized away by the dag combiner.
-    if (VT.getVectorNumElements() <= 4 &&
-        CanXFormVExtractWithShuffleIntoLoad(Op, DAG, TLI))
+    unsigned NumElem = VT.getVectorNumElements();
+    // Special case, this is the only place now where it's allowed to return
+    // a vector_shuffle operation without using a target specific node, because
+    // *hopefully* it will be optimized away by the dag combiner. FIXME: should
+    // this be moved to DAGCombine instead?
+    if (NumElem <= 4 && CanXFormVExtractWithShuffleIntoLoad(Op, DAG, TLI))
        return Op;
  
      // Handle splats by matching through known masks
-    if (VT.getVectorNumElements() <= 4)
+    if ((VT.is128BitVector() && NumElem <= 4) ||
+        (VT.is256BitVector() && NumElem <= 8))
        return SDValue();
  
-    // Canonicalize all of the remaining to v4f32.
+    // All i16 and i8 vector types can't be used directly by a generic shuffle
+    // instruction because the target has no such instruction. Generate shuffles
+    // which repeat i16 and i8 several times until they fit in i32, and then can
+    // be manipulated by target supported shuffles. After the insertion of the
+    // necessary shuffles, the result is bitcasted back to v4f32 or v8f32.
      return PromoteSplat(SVOp, DAG);
    }
  
@@ -5990,9 +6103,24 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
        return NewOp;
    }
  
-  // Handle all 4 wide cases with a number of shuffles.
-  if (NumElems == 4)
-    return LowerVECTOR_SHUFFLE_4wide(SVOp, DAG);
+  // Handle all 128-bit wide vectors with 4 elements, and match them with
+  // several different shuffle types.
+  if (NumElems == 4 && VT.getSizeInBits() == 128)
+    return LowerVECTOR_SHUFFLE_128v4(SVOp, DAG);
+
+  //===--------------------------------------------------------------------===//
+  //  Custom lower or generate target specific nodes for 256-bit shuffles.
+
+  // Handle VPERMIL permutations
+  if (isVPERMILMask(M, VT)) {
+    unsigned TargetMask = getShuffleVPERMILImmediate(SVOp);
+    if (VT == MVT::v8f32)
+      return getTargetShuffleNode(X86ISD::VPERMIL, dl, VT, V1, TargetMask, DAG);
+  }
+
+  // Handle general 256-bit shuffles
+  if (VT.is256BitVector())
+    return LowerVECTOR_SHUFFLE_256(SVOp, DAG);
  
    return SDValue();
  }
@@ -8119,7 +8247,7 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
    DebugLoc dl = Op.getDebugLoc();
  
    EVT ArgVT = Op.getNode()->getValueType(0);
-  const Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
    uint32_t ArgSize = getTargetData()->getTypeAllocSize(ArgTy);
    uint8_t ArgMode;
  
@@ -8553,8 +8681,8 @@ SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op,
      const unsigned char JMP64r  = 0xFF; // 64-bit jmp through register opcode.
      const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
  
-    const unsigned char N86R10 = RegInfo->getX86RegNum(X86::R10);
-    const unsigned char N86R11 = RegInfo->getX86RegNum(X86::R11);
+    const unsigned char N86R10 = X86_MC::getX86RegNum(X86::R10);
+    const unsigned char N86R11 = X86_MC::getX86RegNum(X86::R11);
  
      const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
  
@@ -8620,7 +8748,7 @@ SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op,
        NestReg = X86::ECX;
  
        // Check that ECX wasn't needed by an 'inreg' parameter.
-      const FunctionType *FTy = Func->getFunctionType();
+      FunctionType *FTy = Func->getFunctionType();
        const AttrListPtr &Attrs = Func->getAttributes();
  
        if (!Attrs.isEmpty() && !Func->isVarArg()) {
@@ -8658,7 +8786,7 @@ SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op,
  
      // This is storing the opcode for MOV32ri.
      const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
-    const unsigned char N86Reg = RegInfo->getX86RegNum(NestReg);
+    const unsigned char N86Reg = X86_MC::getX86RegNum(NestReg);
      OutChains[0] = DAG.getStore(Root, dl,
                                  DAG.getConstant(MOV32ri|N86Reg, MVT::i8),
                                  Trmp, MachinePointerInfo(TrmpAddr),
@@ -8928,8 +9056,8 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
    }
  
    // Lower SHL with variable shift amount.
-  // Cannot lower SHL without SSE4.1 or later.
-  if (!Subtarget->hasSSE41()) return SDValue();
+  // Cannot lower SHL without SSE2 or later.
+  if (!Subtarget->hasSSE2()) return SDValue();
  
    if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
      Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
@@ -9076,6 +9204,58 @@ SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const {
    return Sum;
  }
  
+/// LowerSIGN_EXTEND_INREG - Custom lower a vector SIGN_EXTEND_INREG as a
+/// shift left followed by an arithmetic shift right, both by the difference
+/// between the result element width and the width of the type being extended
+/// from (operand 1).
+///
+/// Only v4i32 and v8i16 are handled, and only when SSE2 is available. Every
+/// other case returns an empty SDValue. v2i64 is deliberately not handled:
+/// no arithmetic right shift intrinsic is used for it here, and emitting the
+/// left shift on its own would return a value that is not sign extended.
+SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const{
+  DebugLoc dl = Op.getDebugLoc();
+  SDNode* Node = Op.getNode();
+  EVT ExtraVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
+  EVT VT = Node->getValueType(0);
+
+  if (Subtarget->hasSSE2() && VT.isVector()) {
+    unsigned BitsDiff = VT.getScalarType().getSizeInBits() -
+                        ExtraVT.getScalarType().getSizeInBits();
+    SDValue ShAmt = DAG.getConstant(BitsDiff, MVT::i32);
+
+    unsigned SHLIntrinsicsID = 0;
+    unsigned SRAIntrinsicsID = 0;
+    switch (VT.getSimpleVT().SimpleTy) {
+      default:
+        return SDValue();
+      case MVT::v4i32: {
+        SHLIntrinsicsID = Intrinsic::x86_sse2_pslli_d;
+        SRAIntrinsicsID = Intrinsic::x86_sse2_psrai_d;
+        break;
+      }
+      case MVT::v8i16: {
+        SHLIntrinsicsID = Intrinsic::x86_sse2_pslli_w;
+        SRAIntrinsicsID = Intrinsic::x86_sse2_psrai_w;
+        break;
+      }
+    }
+
+    // Move the bits to extend from up to the top of each element.
+    SDValue Tmp1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+                         DAG.getConstant(SHLIntrinsicsID, MVT::i32),
+                         Node->getOperand(0), ShAmt);
+
+    // In case of 1 bit sext, no need to shr
+    // NOTE(review): this returns only the left-shifted value, so just the
+    // top bit of each element reflects the source bit. Confirm that every
+    // consumer of a 1-bit extension looks at the sign bit only.
+    if (ExtraVT.getScalarType().getSizeInBits() == 1) return Tmp1;
+
+    // Shift back arithmetically to replicate the sign bit downwards.
+    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+                       DAG.getConstant(SRAIntrinsicsID, MVT::i32),
+                       Tmp1, ShAmt);
+  }
+
+  return SDValue();
+}
+
+
  SDValue X86TargetLowering::LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const{
    DebugLoc dl = Op.getDebugLoc();
  
@@ -9238,6 +9418,7 @@ static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
  SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
    switch (Op.getOpcode()) {
    default: llvm_unreachable("Should not custom lower this!");
+  case ISD::SIGN_EXTEND_INREG:  return LowerSIGN_EXTEND_INREG(Op,DAG);
    case ISD::MEMBARRIER:         return LowerMEMBARRIER(Op,DAG);
    case ISD::ATOMIC_CMP_SWAP:    return LowerCMP_SWAP(Op,DAG);
    case ISD::ATOMIC_LOAD_SUB:    return LowerLOAD_SUB(Op,DAG);
@@ -9336,6 +9517,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
    default:
      assert(false && "Do not know how to custom type legalize this operation!");
      return;
+  case ISD::SIGN_EXTEND_INREG:
    case ISD::ADDC:
    case ISD::ADDE:
    case ISD::SUBC:
@@ -9557,6 +9739,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
    case X86ISD::PUNPCKHWD:          return "X86ISD::PUNPCKHWD";
    case X86ISD::PUNPCKHDQ:          return "X86ISD::PUNPCKHDQ";
    case X86ISD::PUNPCKHQDQ:         return "X86ISD::PUNPCKHQDQ";
+  case X86ISD::VPERMIL:            return "X86ISD::VPERMIL";
    case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
    case X86ISD::VAARG_64:           return "X86ISD::VAARG_64";
    case X86ISD::WIN_ALLOCA:         return "X86ISD::WIN_ALLOCA";
@@ -9566,7 +9749,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
  // isLegalAddressingMode - Return true if the addressing mode represented
  // by AM is legal for this target, for a load/store of the specified type.
  bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
-                                              const Type *Ty) const {
+                                              Type *Ty) const {
    // X86 supports extremely general addressing modes.
    CodeModel::Model M = getTargetMachine().getCodeModel();
    Reloc::Model R = getTargetMachine().getRelocationModel();
@@ -9618,7 +9801,7 @@ bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
  }
  
  
-bool X86TargetLowering::isTruncateFree(const Type *Ty1, const Type *Ty2) const {
+bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
    if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
      return false;
    unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
@@ -9638,7 +9821,7 @@ bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
    return true;
  }
  
-bool X86TargetLowering::isZExtFree(const Type *Ty1, const Type *Ty2) const {
+bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
    // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
    return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit();
  }
@@ -11821,10 +12004,12 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
    if (R.getNode())
      return R;
  
-  // Want to form ANDNP nodes, in the hopes of then easily combining them with
-  // OR and AND nodes to form PBLEND/PSIGN.
+  // Want to form ANDNP nodes:
+  // 1) In the hopes of then easily combining them with OR and AND nodes
+  //    to form PBLEND/PSIGN.
+  // 2) To match ANDN packed intrinsics.
    EVT VT = N->getValueType(0);
-  if (VT != MVT::v2i64)
+  if (VT != MVT::v2i64 && VT != MVT::v4i64)
      return SDValue();
  
    SDValue N0 = N->getOperand(0);
@@ -12360,6 +12545,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
    case X86ISD::PSHUFLW:
    case X86ISD::MOVSS:
    case X86ISD::MOVSD:
+  case X86ISD::VPERMIL:
    case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI);
    }
  
@@ -12496,7 +12682,7 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
           AsmPieces[1] == "${0:q}")) {
        // No need to check constraints, nothing other than the equivalent of
        // "=r,0" would be valid here.
-      const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
+      IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
        if (!Ty || Ty->getBitWidth() % 16 != 0)
          return false;
        return IntrinsicLowering::LowerToByteSwap(CI);
@@ -12517,7 +12703,7 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
            AsmPieces[1] == "~{dirflag}" &&
            AsmPieces[2] == "~{flags}" &&
            AsmPieces[3] == "~{fpsr}") {
-        const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
+        IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
          if (!Ty || Ty->getBitWidth() % 16 != 0)
            return false;
          return IntrinsicLowering::LowerToByteSwap(CI);
@@ -12548,7 +12734,7 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
                  AsmPieces[1] == "~{dirflag}" &&
                  AsmPieces[2] == "~{flags}" &&
                  AsmPieces[3] == "~{fpsr}") {
-              const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
+              IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
                if (!Ty || Ty->getBitWidth() % 16 != 0)
                  return false;
                return IntrinsicLowering::LowerToByteSwap(CI);
@@ -12574,7 +12760,7 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
              SplitString(AsmPieces[2], Words, " \t,");
              if (Words.size() == 3 && Words[0] == "xchgl" && Words[1] == "%eax" &&
                  Words[2] == "%edx") {
-              const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
+              IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
                if (!Ty || Ty->getBitWidth() % 16 != 0)
                  return false;
                return IntrinsicLowering::LowerToByteSwap(CI);
@@ -12645,7 +12831,7 @@ TargetLowering::ConstraintWeight
      // but allow it at the lowest weight.
    if (CallOperandVal == NULL)
      return CW_Default;
-  const Type *type = CallOperandVal->getType();
+  Type *type = CallOperandVal->getType();
    // Look at the constraint type.
    switch (*constraint) {
    default:
@@ -12907,7 +13093,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
           return std::make_pair(0U, X86::GR32RegisterClass);
         else if (VT == MVT::i16)
           return std::make_pair(0U, X86::GR16RegisterClass);
-       else if (VT == MVT::i8)
+       else if (VT == MVT::i8 || VT == MVT::i1)
           return std::make_pair(0U, X86::GR8RegisterClass);
         else if (VT == MVT::i64 || VT == MVT::f64)
           return std::make_pair(0U, X86::GR64RegisterClass);
@@ -12919,14 +13105,14 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
         return std::make_pair(0U, X86::GR32_ABCDRegisterClass);
        else if (VT == MVT::i16)
         return std::make_pair(0U, X86::GR16_ABCDRegisterClass);
-      else if (VT == MVT::i8)
+      else if (VT == MVT::i8 || VT == MVT::i1)
         return std::make_pair(0U, X86::GR8_ABCD_LRegisterClass);
        else if (VT == MVT::i64)
         return std::make_pair(0U, X86::GR64_ABCDRegisterClass);
        break;
      case 'r':   // GENERAL_REGS
      case 'l':   // INDEX_REGS
-      if (VT == MVT::i8)
+      if (VT == MVT::i8 || VT == MVT::i1)
          return std::make_pair(0U, X86::GR8RegisterClass);
        if (VT == MVT::i16)
          return std::make_pair(0U, X86::GR16RegisterClass);
@@ -12934,7 +13120,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
          return std::make_pair(0U, X86::GR32RegisterClass);
        return std::make_pair(0U, X86::GR64RegisterClass);
      case 'R':   // LEGACY_REGS
-      if (VT == MVT::i8)
+      if (VT == MVT::i8 || VT == MVT::i1)
          return std::make_pair(0U, X86::GR8_NOREXRegisterClass);
        if (VT == MVT::i16)
          return std::make_pair(0U, X86::GR16_NOREXRegisterClass);