DebugLoc dl) {
EVT VT = Vec.getValueType();
assert(VT.getSizeInBits() == 256 && "Unexpected vector size!");
-
EVT ElVT = VT.getVectorElementType();
-
- int Factor = VT.getSizeInBits() / 128;
-
- EVT ResultVT = EVT::getVectorVT(*DAG.getContext(),
- ElVT,
- VT.getVectorNumElements() / Factor);
+ int Factor = VT.getSizeInBits()/128;
+ EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
+ VT.getVectorNumElements()/Factor);
// Extract from UNDEF is UNDEF.
if (Vec.getOpcode() == ISD::UNDEF)
* ElemsPerChunk);
SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32);
-
SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec,
VecIdx);
assert(VT.getSizeInBits() == 128 && "Unexpected vector size!");
EVT ElVT = VT.getVectorElementType();
-
unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
-
EVT ResultVT = Result.getValueType();
// Insert the relevant 128 bits.
- unsigned ElemsPerChunk = 128 / ElVT.getSizeInBits();
+ unsigned ElemsPerChunk = 128/ElVT.getSizeInBits();
// This is the index of the first element of the 128-bit chunk
// we want.
- unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / 128)
+ unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/128)
* ElemsPerChunk);
SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32);
-
Result = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec,
VecIdx);
return Result;
return new TargetLoweringObjectFileMachO();
}
- if (Subtarget->isTargetELF()) {
- if (is64Bit)
- return new X8664_ELFTargetObjectFile(TM);
- return new X8632_ELFTargetObjectFile(TM);
- }
+ if (Subtarget->isTargetELF())
+ return new TargetLoweringObjectFileELF();
if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho())
return new TargetLoweringObjectFileCOFF();
llvm_unreachable("unknown subtarget type");
setOperationAction(ISD::VSETCC, MVT::v2i64, Custom);
if (!UseSoftFloat && Subtarget->hasAVX()) {
- addRegisterClass(MVT::v8f32, X86::VR256RegisterClass);
- addRegisterClass(MVT::v4f64, X86::VR256RegisterClass);
- addRegisterClass(MVT::v8i32, X86::VR256RegisterClass);
- addRegisterClass(MVT::v4i64, X86::VR256RegisterClass);
- addRegisterClass(MVT::v32i8, X86::VR256RegisterClass);
+ addRegisterClass(MVT::v32i8, X86::VR256RegisterClass);
+ addRegisterClass(MVT::v16i16, X86::VR256RegisterClass);
+ addRegisterClass(MVT::v8i32, X86::VR256RegisterClass);
+ addRegisterClass(MVT::v8f32, X86::VR256RegisterClass);
+ addRegisterClass(MVT::v4i64, X86::VR256RegisterClass);
+ addRegisterClass(MVT::v4f64, X86::VR256RegisterClass);
setOperationAction(ISD::LOAD, MVT::v8f32, Legal);
- setOperationAction(ISD::LOAD, MVT::v8i32, Legal);
setOperationAction(ISD::LOAD, MVT::v4f64, Legal);
setOperationAction(ISD::LOAD, MVT::v4i64, Legal);
setOperationAction(ISD::FSQRT, MVT::v4f64, Legal);
setOperationAction(ISD::FNEG, MVT::v4f64, Custom);
- // Custom lower build_vector, vector_shuffle, scalar_to_vector,
- // insert_vector_elt extract_subvector and extract_vector_elt for
- // 256-bit types.
+ // Custom lower several nodes for 256-bit types.
for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
- i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE;
- ++i) {
- MVT::SimpleValueType VT = (MVT::SimpleValueType)i;
- // Do not attempt to custom lower non-256-bit vectors
- if (!isPowerOf2_32(MVT(VT).getVectorNumElements())
- || (MVT(VT).getSizeInBits() < 256))
- continue;
- setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
- setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
- setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
- }
- // Custom-lower insert_subvector and extract_subvector based on
- // the result type.
- for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
- i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE;
- ++i) {
- MVT::SimpleValueType VT = (MVT::SimpleValueType)i;
- // Do not attempt to custom lower non-256-bit vectors
- if (!isPowerOf2_32(MVT(VT).getVectorNumElements()))
+ i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) {
+ MVT::SimpleValueType SVT = (MVT::SimpleValueType)i;
+ EVT VT = SVT;
+
+ // Extract subvector is special because the value type
+ // (result) is 128-bit but the source is 256-bit wide.
+ if (VT.is128BitVector())
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, SVT, Custom);
+
+ // Do not attempt to custom lower other non-256-bit vectors
+ if (!VT.is256BitVector())
continue;
- if (MVT(VT).getSizeInBits() == 128) {
- setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
- }
- else if (MVT(VT).getSizeInBits() == 256) {
- setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
- }
+ setOperationAction(ISD::BUILD_VECTOR, SVT, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, SVT, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, SVT, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, SVT, Custom);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, SVT, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, SVT, Custom);
}
// Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
- // Don't promote loads because we need them for VPERM vector index versions.
+ for (unsigned i = (unsigned)MVT::v32i8; i != (unsigned)MVT::v4i64; ++i) {
+ MVT::SimpleValueType SVT = (MVT::SimpleValueType)i;
+ EVT VT = SVT;
- for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
- VT != (unsigned)MVT::LAST_VECTOR_VALUETYPE;
- VT++) {
- if (!isPowerOf2_32(MVT((MVT::SimpleValueType)VT).getVectorNumElements())
- || (MVT((MVT::SimpleValueType)VT).getSizeInBits() < 256))
+ // Do not attempt to promote non-256-bit vectors
+ if (!VT.is256BitVector())
continue;
- setOperationAction(ISD::AND, (MVT::SimpleValueType)VT, Promote);
- AddPromotedToType (ISD::AND, (MVT::SimpleValueType)VT, MVT::v4i64);
- setOperationAction(ISD::OR, (MVT::SimpleValueType)VT, Promote);
- AddPromotedToType (ISD::OR, (MVT::SimpleValueType)VT, MVT::v4i64);
- setOperationAction(ISD::XOR, (MVT::SimpleValueType)VT, Promote);
- AddPromotedToType (ISD::XOR, (MVT::SimpleValueType)VT, MVT::v4i64);
- //setOperationAction(ISD::LOAD, (MVT::SimpleValueType)VT, Promote);
- //AddPromotedToType (ISD::LOAD, (MVT::SimpleValueType)VT, MVT::v4i64);
- setOperationAction(ISD::SELECT, (MVT::SimpleValueType)VT, Promote);
- AddPromotedToType (ISD::SELECT, (MVT::SimpleValueType)VT, MVT::v4i64);
+
+ setOperationAction(ISD::AND, SVT, Promote);
+ AddPromotedToType (ISD::AND, SVT, MVT::v4i64);
+ setOperationAction(ISD::OR, SVT, Promote);
+ AddPromotedToType (ISD::OR, SVT, MVT::v4i64);
+ setOperationAction(ISD::XOR, SVT, Promote);
+ AddPromotedToType (ISD::XOR, SVT, MVT::v4i64);
+ setOperationAction(ISD::LOAD, SVT, Promote);
+ AddPromotedToType (ISD::LOAD, SVT, MVT::v4i64);
+ setOperationAction(ISD::SELECT, SVT, Promote);
+ AddPromotedToType (ISD::SELECT, SVT, MVT::v4i64);
}
}
+ // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion
+ // of this type with custom code.
+ for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
+ VT != (unsigned)MVT::LAST_VECTOR_VALUETYPE; VT++) {
+ setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT, Custom);
+ }
+
// We want to custom lower some of our intrinsics.
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
/// the desired ByVal argument alignment.
-static void getMaxByValAlign(const Type *Ty, unsigned &MaxAlign) {
+static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
if (MaxAlign == 16)
return;
- if (const VectorType *VTy = dyn_cast<VectorType>(Ty)) {
+ if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
if (VTy->getBitWidth() == 128)
MaxAlign = 16;
- } else if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
+ } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
unsigned EltAlign = 0;
getMaxByValAlign(ATy->getElementType(), EltAlign);
if (EltAlign > MaxAlign)
MaxAlign = EltAlign;
- } else if (const StructType *STy = dyn_cast<StructType>(Ty)) {
+ } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
unsigned EltAlign = 0;
getMaxByValAlign(STy->getElementType(i), EltAlign);
/// function arguments in the caller parameter area. For X86, aggregates
/// that contain SSE vectors are placed at 16-byte boundaries while the rest
/// are at 4-byte boundaries.
-unsigned X86TargetLowering::getByValTypeAlignment(const Type *Ty) const {
+unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
if (Subtarget->is64Bit()) {
// Max of 8 and alignment of type.
unsigned TyAlign = TD->getABITypeAlignment(Ty);
case X86ISD::PUNPCKHBW:
case X86ISD::PUNPCKHDQ:
case X86ISD::PUNPCKHQDQ:
+ case X86ISD::VPERMIL:
return true;
}
return false;
case X86ISD::PSHUFD:
case X86ISD::PSHUFHW:
case X86ISD::PSHUFLW:
+ case X86ISD::VPERMIL:
return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8));
}
return ::isMOVLMask(M, N->getValueType(0));
}
+/// isVPERMILMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to VPERMIL*.
+static bool isVPERMILMask(const SmallVectorImpl<int> &Mask, EVT VT) {
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned NumLanes = VT.getSizeInBits()/128;
+
+ // Match any permutation of 128-bit vector with 32/64-bit types
+ if (NumLanes == 1) {
+ if (NumElts == 4 || NumElts == 2)
+ return true;
+ return false;
+ }
+
+ // Only match 256-bit with 32/64-bit types
+ if (NumElts != 8 && NumElts != 4)
+ return false;
+
+ // The mask on the high lane should be the same as the low. Actually,
+ // they can differ if any of the corresponding index in a lane is undef.
+ int LaneSize = NumElts/NumLanes;
+ for (int i = 0; i < LaneSize; ++i) {
+ int HighElt = i+LaneSize;
+ if (Mask[i] < 0 || Mask[HighElt] < 0)
+ continue;
+
+ if (Mask[HighElt]-Mask[i] != LaneSize)
+ return false;
+ }
+
+ return true;
+}
+
+/// getShuffleVPERMILImmediateediate - Return the appropriate immediate to shuffle
+/// the specified VECTOR_MASK mask with VPERMIL* instructions.
+static unsigned getShuffleVPERMILImmediate(SDNode *N) {
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+ EVT VT = SVOp->getValueType(0);
+
+ int NumElts = VT.getVectorNumElements();
+ int NumLanes = VT.getSizeInBits()/128;
+
+ unsigned Mask = 0;
+ for (int i = 0; i < NumElts/NumLanes /* lane size */; ++i)
+ Mask |= SVOp->getMaskElt(i) << (i*2);
+
+ return Mask;
+}
+
/// isCommutedMOVL - Returns true if the shuffle mask is except the reverse
/// of what x86 movss want. X86 movs requires the lowest element to be lowest
/// element of vector 2 and the other elements to come from vector 1 in order.
EVT ElVT = VecVT.getVectorElementType();
unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits();
-
return Index / NumElemsPerChunk;
}
EVT ElVT = VecVT.getVectorElementType();
unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits();
-
return Index / NumElemsPerChunk;
}
}
/// getOnesVector - Returns a vector of specified type with all bits set.
-///
+/// Always build ones vectors as <4 x i32> or <8 x i32> bitcasted to
+/// their original type, ensuring they get CSE'd.
static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) {
assert(VT.isVector() && "Expected a vector type");
+ assert((VT.is128BitVector() || VT.is256BitVector())
+ && "Expected a 128-bit or 256-bit vector type");
- // Always build ones vectors as <4 x i32> or <2 x i32> bitcasted to their dest
- // type. This ensures they get CSE'd.
SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
+
SDValue Vec;
- Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
+ if (VT.is256BitVector()) {
+ SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, 8);
+ } else
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
}
-
/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
/// that point to V2 points to its first element.
static SDValue NormalizeMask(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
}
-/// getUnpackhMask - Returns a vector_shuffle node for an unpackh operation.
+/// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
SDValue V2) {
unsigned NumElems = VT.getVectorNumElements();
return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
}
-/// PromoteSplat - Promote a splat of v4i32, v8i16 or v16i8 to v4f32.
-static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
- EVT PVT = MVT::v4f32;
- EVT VT = SV->getValueType(0);
- DebugLoc dl = SV->getDebugLoc();
- SDValue V1 = SV->getOperand(0);
+// PromoteSplatv8v16 - All i16 and i8 vector types can't be used directly by
+// a generic shuffle instruction because the target has no such instructions.
+// Generate shuffles which repeat i16 and i8 several times until they can be
+// represented by v4f32 and then be manipulated by target suported shuffles.
+static SDValue PromoteSplatv8v16(SDValue V, SelectionDAG &DAG, int &EltNo) {
+ EVT VT = V.getValueType();
int NumElems = VT.getVectorNumElements();
- int EltNo = SV->getSplatIndex();
+ DebugLoc dl = V.getDebugLoc();
- // unpack elements to the correct location
while (NumElems > 4) {
if (EltNo < NumElems/2) {
- V1 = getUnpackl(DAG, dl, VT, V1, V1);
+ V = getUnpackl(DAG, dl, VT, V, V);
} else {
- V1 = getUnpackh(DAG, dl, VT, V1, V1);
+ V = getUnpackh(DAG, dl, VT, V, V);
EltNo -= NumElems/2;
}
NumElems >>= 1;
}
+ return V;
+}
+
+/// getLegalSplat - Generate a legal splat with supported x86 shuffles
+static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
+ EVT VT = V.getValueType();
+ DebugLoc dl = V.getDebugLoc();
+ assert((VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256)
+ && "Vector size not supported");
+
+ bool Is128 = VT.getSizeInBits() == 128;
+ EVT NVT = Is128 ? MVT::v4f32 : MVT::v8f32;
+ V = DAG.getNode(ISD::BITCAST, dl, NVT, V);
+
+ if (Is128) {
+ int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
+ V = DAG.getVectorShuffle(NVT, dl, V, DAG.getUNDEF(NVT), &SplatMask[0]);
+ } else {
+ // The second half of indicies refer to the higher part, which is a
+ // duplication of the lower one. This makes this shuffle a perfect match
+ // for the VPERM instruction.
+ int SplatMask[8] = { EltNo, EltNo, EltNo, EltNo,
+ EltNo+4, EltNo+4, EltNo+4, EltNo+4 };
+ V = DAG.getVectorShuffle(NVT, dl, V, DAG.getUNDEF(NVT), &SplatMask[0]);
+ }
- // Perform the splat.
- int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
- V1 = DAG.getNode(ISD::BITCAST, dl, PVT, V1);
- V1 = DAG.getVectorShuffle(PVT, dl, V1, DAG.getUNDEF(PVT), &SplatMask[0]);
- return DAG.getNode(ISD::BITCAST, dl, VT, V1);
+ return DAG.getNode(ISD::BITCAST, dl, VT, V);
+}
+
+/// PromoteSplat - Promote a splat of v4i32, v8i16 or v16i8 to v4f32 and
+/// v8i32, v16i16 or v32i8 to v8f32.
+static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
+ EVT SrcVT = SV->getValueType(0);
+ SDValue V1 = SV->getOperand(0);
+ DebugLoc dl = SV->getDebugLoc();
+
+ int EltNo = SV->getSplatIndex();
+ int NumElems = SrcVT.getVectorNumElements();
+ unsigned Size = SrcVT.getSizeInBits();
+
+ // Extract the 128-bit part containing the splat element and update
+ // the splat element index when it refers to the higher register.
+ if (Size == 256) {
+ unsigned Idx = (EltNo > NumElems/2) ? NumElems/2 : 0;
+ V1 = Extract128BitVector(V1, DAG.getConstant(Idx, MVT::i32), DAG, dl);
+ if (Idx > 0)
+ EltNo -= NumElems/2;
+ }
+
+ // Make this 128-bit vector duplicate i8 and i16 elements
+ if (NumElems > 4)
+ V1 = PromoteSplatv8v16(V1, DAG, EltNo);
+
+ // Recreate the 256-bit vector and place the same 128-bit vector
+ // into the low and high part. This is necessary because we want
+ // to use VPERM to shuffle the v8f32 vector, and VPERM only shuffles
+ // inside each separate v4f32 lane.
+ if (Size == 256) {
+ SDValue InsV = Insert128BitVector(DAG.getUNDEF(SrcVT), V1,
+ DAG.getConstant(0, MVT::i32), DAG, dl);
+ V1 = Insert128BitVector(InsV, V1,
+ DAG.getConstant(NumElems/2, MVT::i32), DAG, dl);
+ }
+
+ return getLegalSplat(DAG, V1, EltNo);
}
/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
return getShuffleScalarElt(V.getOperand(OpNum).getNode(), Index, DAG,
Depth+1);
}
+ case X86ISD::VPERMIL:
+ ImmN = N->getOperand(N->getNumOperands()-1);
+ DecodeVPERMILMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(),
+ ShuffleMask);
default:
assert("not implemented for target shuffle node");
return SDValue();
return ConcatVectors(Lower, Upper, DAG);
}
- // All zero's are handled with pxor in SSE2 and above, xorps in SSE1.
- // All one's are handled with pcmpeqd. In AVX, zero's are handled with
- // vpxor in 128-bit and xor{pd,ps} in 256-bit, but no 256 version of pcmpeqd
- // is present, so AllOnes is ignored.
+ // All zero's:
+ // - pxor (SSE2), xorps (SSE1), vpxor (128 AVX), xorp[s|d] (256 AVX)
+ // All one's:
+ // - pcmpeqd (SSE2 and 128 AVX), fallback to constant pools (256 AVX)
if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
- (Op.getValueType().getSizeInBits() != 256 &&
- ISD::isBuildVectorAllOnes(Op.getNode()))) {
- // Canonicalize this to <4 x i32> (SSE) to
+ ISD::isBuildVectorAllOnes(Op.getNode())) {
+ // Canonicalize this to <4 x i32> or <8 x 32> (SSE) to
// 1) ensure the zero vectors are CSE'd, and 2) ensure that i64 scalars are
// eliminated on x86-32 hosts.
- if (Op.getValueType() == MVT::v4i32)
+ if (Op.getValueType() == MVT::v4i32 ||
+ Op.getValueType() == MVT::v8i32)
return Op;
if (ISD::isBuildVectorAllOnes(Op.getNode()))
OpVT, SrcOp)));
}
-/// LowerVECTOR_SHUFFLE_4wide - Handle all 4 wide cases with a number of
-/// shuffles.
+/// LowerVECTOR_SHUFFLE_256 - Handle all 256-bit wide vectors shuffles
+/// which could not be matched by any known target speficic shuffle
+static SDValue
+LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
+ return SDValue();
+}
+
+/// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with
+/// 4 elements, and match them with several different shuffle types.
static SDValue
-LowerVECTOR_SHUFFLE_4wide(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
+LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
SDValue V1 = SVOp->getOperand(0);
SDValue V2 = SVOp->getOperand(1);
DebugLoc dl = SVOp->getDebugLoc();
EVT VT = SVOp->getValueType(0);
+ assert(VT.getSizeInBits() == 128 && "Unsupported vector size");
+
SmallVector<std::pair<int, int>, 8> Locs;
Locs.resize(4);
SmallVector<int, 8> Mask1(4U, -1);
// Handle splat operations
if (SVOp->isSplat()) {
- // Special case, this is the only place now where it's
- // allowed to return a vector_shuffle operation without
- // using a target specific node, because *hopefully* it
- // will be optimized away by the dag combiner.
- if (VT.getVectorNumElements() <= 4 &&
- CanXFormVExtractWithShuffleIntoLoad(Op, DAG, TLI))
+ unsigned NumElem = VT.getVectorNumElements();
+ // Special case, this is the only place now where it's allowed to return
+ // a vector_shuffle operation without using a target specific node, because
+ // *hopefully* it will be optimized away by the dag combiner. FIXME: should
+ // this be moved to DAGCombine instead?
+ if (NumElem <= 4 && CanXFormVExtractWithShuffleIntoLoad(Op, DAG, TLI))
return Op;
// Handle splats by matching through known masks
- if (VT.getVectorNumElements() <= 4)
+ if ((VT.is128BitVector() && NumElem <= 4) ||
+ (VT.is256BitVector() && NumElem <= 8))
return SDValue();
- // Canonicalize all of the remaining to v4f32.
+ // All i16 and i8 vector types can't be used directly by a generic shuffle
+ // instruction because the target has no such instruction. Generate shuffles
+ // which repeat i16 and i8 several times until they fit in i32, and then can
+ // be manipulated by target suported shuffles. After the insertion of the
+ // necessary shuffles, the result is bitcasted back to v4f32 or v8f32.
return PromoteSplat(SVOp, DAG);
}
return NewOp;
}
- // Handle all 4 wide cases with a number of shuffles.
- if (NumElems == 4)
- return LowerVECTOR_SHUFFLE_4wide(SVOp, DAG);
+ // Handle all 128-bit wide vectors with 4 elements, and match them with
+ // several different shuffle types.
+ if (NumElems == 4 && VT.getSizeInBits() == 128)
+ return LowerVECTOR_SHUFFLE_128v4(SVOp, DAG);
+
+ //===--------------------------------------------------------------------===//
+ // Custom lower or generate target specific nodes for 256-bit shuffles.
+
+ // Handle VPERMIL permutations
+ if (isVPERMILMask(M, VT)) {
+ unsigned TargetMask = getShuffleVPERMILImmediate(SVOp);
+ if (VT == MVT::v8f32)
+ return getTargetShuffleNode(X86ISD::VPERMIL, dl, VT, V1, TargetMask, DAG);
+ }
+
+ // Handle general 256-bit shuffles
+ if (VT.is256BitVector())
+ return LowerVECTOR_SHUFFLE_256(SVOp, DAG);
return SDValue();
}
DebugLoc dl = Op.getDebugLoc();
EVT ArgVT = Op.getNode()->getValueType(0);
- const Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+ Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
uint32_t ArgSize = getTargetData()->getTypeAllocSize(ArgTy);
uint8_t ArgMode;
const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
- const unsigned char N86R10 = RegInfo->getX86RegNum(X86::R10);
- const unsigned char N86R11 = RegInfo->getX86RegNum(X86::R11);
+ const unsigned char N86R10 = X86_MC::getX86RegNum(X86::R10);
+ const unsigned char N86R11 = X86_MC::getX86RegNum(X86::R11);
const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
NestReg = X86::ECX;
// Check that ECX wasn't needed by an 'inreg' parameter.
- const FunctionType *FTy = Func->getFunctionType();
+ FunctionType *FTy = Func->getFunctionType();
const AttrListPtr &Attrs = Func->getAttributes();
if (!Attrs.isEmpty() && !Func->isVarArg()) {
// This is storing the opcode for MOV32ri.
const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
- const unsigned char N86Reg = RegInfo->getX86RegNum(NestReg);
+ const unsigned char N86Reg = X86_MC::getX86RegNum(NestReg);
OutChains[0] = DAG.getStore(Root, dl,
DAG.getConstant(MOV32ri|N86Reg, MVT::i8),
Trmp, MachinePointerInfo(TrmpAddr),
}
// Lower SHL with variable shift amount.
- // Cannot lower SHL without SSE4.1 or later.
- if (!Subtarget->hasSSE41()) return SDValue();
+ // Cannot lower SHL without SSE2 or later.
+ if (!Subtarget->hasSSE2()) return SDValue();
if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
return Sum;
}
+SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const{
+ DebugLoc dl = Op.getDebugLoc();
+ SDNode* Node = Op.getNode();
+ EVT ExtraVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
+ EVT VT = Node->getValueType(0);
+
+ if (Subtarget->hasSSE2() && VT.isVector()) {
+ unsigned BitsDiff = VT.getScalarType().getSizeInBits() -
+ ExtraVT.getScalarType().getSizeInBits();
+ SDValue ShAmt = DAG.getConstant(BitsDiff, MVT::i32);
+
+ unsigned SHLIntrinsicsID = 0;
+ unsigned SRAIntrinsicsID = 0;
+ switch (VT.getSimpleVT().SimpleTy) {
+ default:
+ return SDValue();
+ case MVT::v2i64: {
+ SHLIntrinsicsID = Intrinsic::x86_sse2_pslli_q;
+ SRAIntrinsicsID = 0;
+ break;
+ }
+ case MVT::v4i32: {
+ SHLIntrinsicsID = Intrinsic::x86_sse2_pslli_d;
+ SRAIntrinsicsID = Intrinsic::x86_sse2_psrai_d;
+ break;
+ }
+ case MVT::v8i16: {
+ SHLIntrinsicsID = Intrinsic::x86_sse2_pslli_w;
+ SRAIntrinsicsID = Intrinsic::x86_sse2_psrai_w;
+ break;
+ }
+ }
+
+ SDValue Tmp1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(SHLIntrinsicsID, MVT::i32),
+ Node->getOperand(0), ShAmt);
+
+ // In case of 1 bit sext, no need to shr
+ if (ExtraVT.getScalarType().getSizeInBits() == 1) return Tmp1;
+
+ if (SRAIntrinsicsID) {
+ Tmp1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(SRAIntrinsicsID, MVT::i32),
+ Tmp1, ShAmt);
+ }
+ return Tmp1;
+ }
+
+ return SDValue();
+}
+
+
SDValue X86TargetLowering::LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const{
DebugLoc dl = Op.getDebugLoc();
SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
default: llvm_unreachable("Should not custom lower this!");
+ case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op,DAG);
case ISD::MEMBARRIER: return LowerMEMBARRIER(Op,DAG);
case ISD::ATOMIC_CMP_SWAP: return LowerCMP_SWAP(Op,DAG);
case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG);
default:
assert(false && "Do not know how to custom type legalize this operation!");
return;
+ case ISD::SIGN_EXTEND_INREG:
case ISD::ADDC:
case ISD::ADDE:
case ISD::SUBC:
case X86ISD::PUNPCKHWD: return "X86ISD::PUNPCKHWD";
case X86ISD::PUNPCKHDQ: return "X86ISD::PUNPCKHDQ";
case X86ISD::PUNPCKHQDQ: return "X86ISD::PUNPCKHQDQ";
+ case X86ISD::VPERMIL: return "X86ISD::VPERMIL";
case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
case X86ISD::VAARG_64: return "X86ISD::VAARG_64";
case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA";
// isLegalAddressingMode - Return true if the addressing mode represented
// by AM is legal for this target, for a load/store of the specified type.
bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
- const Type *Ty) const {
+ Type *Ty) const {
// X86 supports extremely general addressing modes.
CodeModel::Model M = getTargetMachine().getCodeModel();
Reloc::Model R = getTargetMachine().getRelocationModel();
}
-bool X86TargetLowering::isTruncateFree(const Type *Ty1, const Type *Ty2) const {
+bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
return false;
unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
return true;
}
-bool X86TargetLowering::isZExtFree(const Type *Ty1, const Type *Ty2) const {
+bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
// x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit();
}
if (R.getNode())
return R;
- // Want to form ANDNP nodes, in the hopes of then easily combining them with
- // OR and AND nodes to form PBLEND/PSIGN.
+ // Want to form ANDNP nodes:
+ // 1) In the hopes of then easily combining them with OR and AND nodes
+ // to form PBLEND/PSIGN.
+ // 2) To match ANDN packed intrinsics
EVT VT = N->getValueType(0);
- if (VT != MVT::v2i64)
+ if (VT != MVT::v2i64 && VT != MVT::v4i64)
return SDValue();
SDValue N0 = N->getOperand(0);
case X86ISD::PSHUFLW:
case X86ISD::MOVSS:
case X86ISD::MOVSD:
+ case X86ISD::VPERMIL:
case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI);
}
AsmPieces[1] == "${0:q}")) {
// No need to check constraints, nothing other than the equivalent of
// "=r,0" would be valid here.
- const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
+ IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
if (!Ty || Ty->getBitWidth() % 16 != 0)
return false;
return IntrinsicLowering::LowerToByteSwap(CI);
AsmPieces[1] == "~{dirflag}" &&
AsmPieces[2] == "~{flags}" &&
AsmPieces[3] == "~{fpsr}") {
- const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
+ IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
if (!Ty || Ty->getBitWidth() % 16 != 0)
return false;
return IntrinsicLowering::LowerToByteSwap(CI);
AsmPieces[1] == "~{dirflag}" &&
AsmPieces[2] == "~{flags}" &&
AsmPieces[3] == "~{fpsr}") {
- const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
+ IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
if (!Ty || Ty->getBitWidth() % 16 != 0)
return false;
return IntrinsicLowering::LowerToByteSwap(CI);
SplitString(AsmPieces[2], Words, " \t,");
if (Words.size() == 3 && Words[0] == "xchgl" && Words[1] == "%eax" &&
Words[2] == "%edx") {
- const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
+ IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
if (!Ty || Ty->getBitWidth() % 16 != 0)
return false;
return IntrinsicLowering::LowerToByteSwap(CI);
// but allow it at the lowest weight.
if (CallOperandVal == NULL)
return CW_Default;
- const Type *type = CallOperandVal->getType();
+ Type *type = CallOperandVal->getType();
// Look at the constraint type.
switch (*constraint) {
default:
return std::make_pair(0U, X86::GR32RegisterClass);
else if (VT == MVT::i16)
return std::make_pair(0U, X86::GR16RegisterClass);
- else if (VT == MVT::i8)
+ else if (VT == MVT::i8 || VT == MVT::i1)
return std::make_pair(0U, X86::GR8RegisterClass);
else if (VT == MVT::i64 || VT == MVT::f64)
return std::make_pair(0U, X86::GR64RegisterClass);
return std::make_pair(0U, X86::GR32_ABCDRegisterClass);
else if (VT == MVT::i16)
return std::make_pair(0U, X86::GR16_ABCDRegisterClass);
- else if (VT == MVT::i8)
+ else if (VT == MVT::i8 || VT == MVT::i1)
return std::make_pair(0U, X86::GR8_ABCD_LRegisterClass);
else if (VT == MVT::i64)
return std::make_pair(0U, X86::GR64_ABCDRegisterClass);
break;
case 'r': // GENERAL_REGS
case 'l': // INDEX_REGS
- if (VT == MVT::i8)
+ if (VT == MVT::i8 || VT == MVT::i1)
return std::make_pair(0U, X86::GR8RegisterClass);
if (VT == MVT::i16)
return std::make_pair(0U, X86::GR16RegisterClass);
return std::make_pair(0U, X86::GR32RegisterClass);
return std::make_pair(0U, X86::GR64RegisterClass);
case 'R': // LEGACY_REGS
- if (VT == MVT::i8)
+ if (VT == MVT::i8 || VT == MVT::i1)
return std::make_pair(0U, X86::GR8_NOREXRegisterClass);
if (VT == MVT::i16)
return std::make_pair(0U, X86::GR16_NOREXRegisterClass);