#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetOptions.h"
using namespace llvm;
using namespace dwarf;
// X86 is weird, it always uses i8 for shift amounts and setcc results.
setBooleanContents(ZeroOrOneBooleanContent);
+ // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
+ setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
// For 64-bit since we have so many registers use the ILP scheduler, for
// 32-bit code use the register pressure specific scheduling.
setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand);
setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote);
} else if (!UseSoftFloat) {
- if (X86ScalarSSEf32 && !Subtarget->hasSSE3())
+ // Since AVX is a superset of SSE3, only check for SSE here.
+ if (Subtarget->hasSSE1() && !Subtarget->hasSSE3())
// Expand FP_TO_UINT into a select.
// FIXME: We would like to use a Custom expander here eventually to do
// the optimal thing for SSE vs. the default expansion in the legalizer.
if (Subtarget->hasXMM())
setOperationAction(ISD::PREFETCH , MVT::Other, Legal);
- // We may not have a libcall for MEMBARRIER so we should lower this.
setOperationAction(ISD::MEMBARRIER , MVT::Other, Custom);
+ setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);
// On X86 and X86-64, atomic operations are lowered to locked instructions.
// Locked instructions, in turn, have implicit fence semantics (all memory
MVT VT = IntVTs[i];
setOperationAction(ISD::ATOMIC_CMP_SWAP, VT, Custom);
setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
+ setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
}
if (!Subtarget->is64Bit()) {
+ setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom);
setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);
}
+ if (Subtarget->hasCmpxchg16b()) {
+ setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
+ }
+
// FIXME - use subtarget debug flags
if (!Subtarget->isTargetDarwin() &&
!Subtarget->isTargetELF() &&
setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
- setOperationAction(ISD::TRAMPOLINE, MVT::Other, Custom);
+ setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
+ setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
setOperationAction(ISD::TRAP, MVT::Other, Legal);
setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
- setOperationAction(ISD::DYNAMIC_STACKALLOC,
- (Subtarget->is64Bit() ? MVT::i64 : MVT::i32),
- (Subtarget->isTargetCOFF()
- && !Subtarget->isTargetEnvMacho()
- ? Custom : Expand));
+
+ if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho())
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
+ MVT::i64 : MVT::i32, Custom);
+ else if (EnableSegmentedStacks)
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
+ MVT::i64 : MVT::i32, Custom);
+ else
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
+ MVT::i64 : MVT::i32, Expand);
if (!UseSoftFloat && X86ScalarSSEf64) {
// f32 and f64 use SSE.
setOperationAction(ISD::ROTL, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::ROTR, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand);
- setOperationAction(ISD::VSETCC, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::SETCC, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::FLOG, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::FLOG2, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::FLOG10, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::SIGN_EXTEND, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::ZERO_EXTEND, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::ANY_EXTEND, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::VSELECT, (MVT::SimpleValueType)VT, Expand);
for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
setTruncStoreAction((MVT::SimpleValueType)VT,
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
- setOperationAction(ISD::VSETCC, MVT::v4f32, Custom);
+ setOperationAction(ISD::SETCC, MVT::v4f32, Custom);
}
if (!UseSoftFloat && Subtarget->hasXMMInt()) {
setOperationAction(ISD::FSQRT, MVT::v2f64, Legal);
setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
- setOperationAction(ISD::VSETCC, MVT::v2f64, Custom);
- setOperationAction(ISD::VSETCC, MVT::v16i8, Custom);
- setOperationAction(ISD::VSETCC, MVT::v8i16, Custom);
- setOperationAction(ISD::VSETCC, MVT::v4i32, Custom);
+ setOperationAction(ISD::SETCC, MVT::v2f64, Custom);
+ setOperationAction(ISD::SETCC, MVT::v16i8, Custom);
+ setOperationAction(ISD::SETCC, MVT::v8i16, Custom);
+ setOperationAction(ISD::SETCC, MVT::v4i32, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
}
- if (Subtarget->hasSSE41()) {
+ if (Subtarget->hasSSE41() || Subtarget->hasAVX()) {
setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
setOperationAction(ISD::FCEIL, MVT::f32, Legal);
setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
setOperationAction(ISD::SHL, MVT::v4i32, Custom);
setOperationAction(ISD::SHL, MVT::v16i8, Custom);
+ setOperationAction(ISD::VSELECT, MVT::v2f64, Legal);
+ setOperationAction(ISD::VSELECT, MVT::v2i64, Legal);
+ setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
+ setOperationAction(ISD::VSELECT, MVT::v4i32, Legal);
+ setOperationAction(ISD::VSELECT, MVT::v4f32, Legal);
+
// i8 and i16 vectors are custom , because the source register and source
// source memory operand types are not the same width. f32 vectors are
// custom since the immediate controlling the insert encodes additional
}
}
- if (Subtarget->hasSSE2()) {
+ if (Subtarget->hasXMMInt()) {
setOperationAction(ISD::SRL, MVT::v2i64, Custom);
setOperationAction(ISD::SRL, MVT::v4i32, Custom);
setOperationAction(ISD::SRL, MVT::v16i8, Custom);
+ setOperationAction(ISD::SRL, MVT::v8i16, Custom);
setOperationAction(ISD::SHL, MVT::v2i64, Custom);
setOperationAction(ISD::SHL, MVT::v4i32, Custom);
setOperationAction(ISD::SRA, MVT::v8i16, Custom);
}
- if (Subtarget->hasSSE42())
- setOperationAction(ISD::VSETCC, MVT::v2i64, Custom);
+ if (Subtarget->hasSSE42() || Subtarget->hasAVX())
+ setOperationAction(ISD::SETCC, MVT::v2i64, Custom);
if (!UseSoftFloat && Subtarget->hasAVX()) {
addRegisterClass(MVT::v32i8, X86::VR256RegisterClass);
setOperationAction(ISD::FSQRT, MVT::v4f64, Legal);
setOperationAction(ISD::FNEG, MVT::v4f64, Custom);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
+ setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal);
+
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f64, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i64, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i8, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i16, Custom);
+
+ setOperationAction(ISD::SRL, MVT::v4i64, Custom);
+ setOperationAction(ISD::SRL, MVT::v8i32, Custom);
+ setOperationAction(ISD::SRL, MVT::v16i16, Custom);
+ setOperationAction(ISD::SRL, MVT::v32i8, Custom);
+
+ setOperationAction(ISD::SHL, MVT::v4i64, Custom);
+ setOperationAction(ISD::SHL, MVT::v8i32, Custom);
+ setOperationAction(ISD::SHL, MVT::v16i16, Custom);
+ setOperationAction(ISD::SHL, MVT::v32i8, Custom);
+
+ setOperationAction(ISD::SRA, MVT::v8i32, Custom);
+ setOperationAction(ISD::SRA, MVT::v16i16, Custom);
+
+ setOperationAction(ISD::SETCC, MVT::v32i8, Custom);
+ setOperationAction(ISD::SETCC, MVT::v16i16, Custom);
+ setOperationAction(ISD::SETCC, MVT::v8i32, Custom);
+ setOperationAction(ISD::SETCC, MVT::v4i64, Custom);
+
+ setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
+ setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
+ setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
+
+ setOperationAction(ISD::VSELECT, MVT::v4f64, Legal);
+ setOperationAction(ISD::VSELECT, MVT::v4i64, Legal);
+ setOperationAction(ISD::VSELECT, MVT::v8i32, Legal);
+ setOperationAction(ISD::VSELECT, MVT::v8f32, Legal);
+
+ setOperationAction(ISD::ADD, MVT::v4i64, Custom);
+ setOperationAction(ISD::ADD, MVT::v8i32, Custom);
+ setOperationAction(ISD::ADD, MVT::v16i16, Custom);
+ setOperationAction(ISD::ADD, MVT::v32i8, Custom);
+
+ setOperationAction(ISD::SUB, MVT::v4i64, Custom);
+ setOperationAction(ISD::SUB, MVT::v8i32, Custom);
+ setOperationAction(ISD::SUB, MVT::v16i16, Custom);
+ setOperationAction(ISD::SUB, MVT::v32i8, Custom);
+
+ setOperationAction(ISD::MUL, MVT::v4i64, Custom);
+ setOperationAction(ISD::MUL, MVT::v8i32, Custom);
+ setOperationAction(ISD::MUL, MVT::v16i16, Custom);
+ // Don't lower v32i8 because there is no 128-bit byte mul
+
// Custom lower several nodes for 256-bit types.
for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) {
setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
setTargetDAGCombine(ISD::BUILD_VECTOR);
+ setTargetDAGCombine(ISD::VSELECT);
setTargetDAGCombine(ISD::SELECT);
setTargetDAGCombine(ISD::SHL);
setTargetDAGCombine(ISD::SRA);
}
-MVT::SimpleValueType X86TargetLowering::getSetCCResultType(EVT VT) const {
- return MVT::i8;
+EVT X86TargetLowering::getSetCCResultType(EVT VT) const {
+ if (!VT.isVector()) return MVT::i8;
+ return VT.changeVectorElementTypeToInteger();
}
((DstAlign == 0 || DstAlign >= 16) &&
(SrcAlign == 0 || SrcAlign >= 16))) &&
Subtarget->getStackAlignment() >= 16) {
- if (Subtarget->hasSSE2())
+ if (Subtarget->hasAVX() &&
+ Subtarget->getStackAlignment() >= 32)
+ return MVT::v8f32;
+ if (Subtarget->hasXMMInt())
return MVT::v4i32;
- if (Subtarget->hasSSE1())
+ if (Subtarget->hasXMM())
return MVT::v4f32;
} else if (!MemcpyStrSrc && Size >= 8 &&
!Subtarget->is64Bit() &&
ValToCopy);
// If we don't have SSE2 available, convert to v4f32 so the generated
// register is legal.
- if (!Subtarget->hasSSE2())
+ if (!Subtarget->hasXMMInt())
ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy);
}
}
FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
}
+ FuncInfo->setArgumentStackSize(StackSize);
+
return Chain;
}
case X86ISD::PUNPCKHBW:
case X86ISD::PUNPCKHDQ:
case X86ISD::PUNPCKHQDQ:
- case X86ISD::VPERMIL:
+ case X86ISD::VPERMILPS:
+ case X86ISD::VPERMILPSY:
+ case X86ISD::VPERMILPD:
+ case X86ISD::VPERMILPDY:
+ case X86ISD::VPERM2F128:
return true;
}
return false;
case X86ISD::PSHUFD:
case X86ISD::PSHUFHW:
case X86ISD::PSHUFLW:
- case X86ISD::VPERMIL:
+ case X86ISD::VPERMILPS:
+ case X86ISD::VPERMILPSY:
+ case X86ISD::VPERMILPD:
+ case X86ISD::VPERMILPDY:
return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8));
}
case X86ISD::PALIGN:
case X86ISD::SHUFPD:
case X86ISD::SHUFPS:
+ case X86ISD::VPERM2F128:
return DAG.getNode(Opc, dl, VT, V1, V2,
DAG.getConstant(TargetMask, MVT::i8));
}
return (Val < 0) || (Val >= Low && Val < Hi);
}
+/// isUndefOrInRange - Return true if every element in Mask, begining
+/// from position Pos and ending in Pos+Size, falls within the specified
+/// range (L, L+Pos]. or is undef.
+static bool isUndefOrInRange(const SmallVectorImpl<int> &Mask,
+ int Pos, int Size, int Low, int Hi) {
+ for (int i = Pos, e = Pos+Size; i != e; ++i)
+ if (!isUndefOrInRange(Mask[i], Low, Hi))
+ return false;
+ return true;
+}
+
/// isUndefOrEqual - Val is either less than zero (undef) or equal to the
/// specified value.
static bool isUndefOrEqual(int Val, int CmpVal) {
return false;
}
+/// isSequentialOrUndefInRange - Return true if every element in Mask, begining
+/// from position Pos and ending in Pos+Size, falls within the specified
+/// sequential range (L, L+Pos]. or is undef.
+static bool isSequentialOrUndefInRange(const SmallVectorImpl<int> &Mask,
+ int Pos, int Size, int Low) {
+ for (int i = Pos, e = Pos+Size; i != e; ++i, ++Low)
+ if (!isUndefOrEqual(Mask[i], Low))
+ return false;
+ return true;
+}
+
/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
/// is suitable for input to PSHUFD or PSHUFW. That is, it doesn't reference
/// the second operand.
/// isPALIGNRMask - Return true if the node specifies a shuffle of elements that
/// is suitable for input to PALIGNR.
static bool isPALIGNRMask(const SmallVectorImpl<int> &Mask, EVT VT,
- bool hasSSSE3) {
+ bool hasSSSE3OrAVX) {
int i, e = VT.getVectorNumElements();
+ if (VT.getSizeInBits() != 128 && VT.getSizeInBits() != 64)
+ return false;
// Do not handle v2i64 / v2f64 shuffles with palignr.
- if (e < 4 || !hasSSSE3)
+ if (e < 4 || !hasSSSE3OrAVX)
return false;
for (i = 0; i != e; ++i)
return true;
}
-bool X86::isPALIGNRMask(ShuffleVectorSDNode *N) {
- SmallVector<int, 8> M;
- N->getMask(M);
- return ::isPALIGNRMask(M, N->getValueType(0), true);
+/// isVSHUFPSYMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to 256-bit
+/// VSHUFPSY.
+static bool isVSHUFPSYMask(const SmallVectorImpl<int> &Mask, EVT VT,
+ const X86Subtarget *Subtarget) {
+ int NumElems = VT.getVectorNumElements();
+
+ if (!Subtarget->hasAVX() || VT.getSizeInBits() != 256)
+ return false;
+
+ if (NumElems != 8)
+ return false;
+
+ // VSHUFPSY divides the resulting vector into 4 chunks.
+ // The sources are also splitted into 4 chunks, and each destination
+ // chunk must come from a different source chunk.
+ //
+ // SRC1 => X7 X6 X5 X4 X3 X2 X1 X0
+ // SRC2 => Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y9
+ //
+ // DST => Y7..Y4, Y7..Y4, X7..X4, X7..X4,
+ // Y3..Y0, Y3..Y0, X3..X0, X3..X0
+ //
+ int QuarterSize = NumElems/4;
+ int HalfSize = QuarterSize*2;
+ for (int i = 0; i < QuarterSize; ++i)
+ if (!isUndefOrInRange(Mask[i], 0, HalfSize))
+ return false;
+ for (int i = QuarterSize; i < QuarterSize*2; ++i)
+ if (!isUndefOrInRange(Mask[i], NumElems, NumElems+HalfSize))
+ return false;
+
+ // The mask of the second half must be the same as the first but with
+ // the appropriate offsets. This works in the same way as VPERMILPS
+ // works with masks.
+ for (int i = QuarterSize*2; i < QuarterSize*3; ++i) {
+ if (!isUndefOrInRange(Mask[i], HalfSize, NumElems))
+ return false;
+ int FstHalfIdx = i-HalfSize;
+ if (Mask[FstHalfIdx] < 0)
+ continue;
+ if (!isUndefOrEqual(Mask[i], Mask[FstHalfIdx]+HalfSize))
+ return false;
+ }
+ for (int i = QuarterSize*3; i < NumElems; ++i) {
+ if (!isUndefOrInRange(Mask[i], NumElems+HalfSize, NumElems*2))
+ return false;
+ int FstHalfIdx = i-HalfSize;
+ if (Mask[FstHalfIdx] < 0)
+ continue;
+ if (!isUndefOrEqual(Mask[i], Mask[FstHalfIdx]+HalfSize))
+ return false;
+
+ }
+
+ return true;
+}
+
+/// getShuffleVSHUFPSYImmediate - Return the appropriate immediate to shuffle
+/// the specified VECTOR_MASK mask with VSHUFPSY instruction.
+static unsigned getShuffleVSHUFPSYImmediate(SDNode *N) {
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+ EVT VT = SVOp->getValueType(0);
+ int NumElems = VT.getVectorNumElements();
+
+ assert(NumElems == 8 && VT.getSizeInBits() == 256 &&
+ "Only supports v8i32 and v8f32 types");
+
+ int HalfSize = NumElems/2;
+ unsigned Mask = 0;
+ for (int i = 0; i != NumElems ; ++i) {
+ if (SVOp->getMaskElt(i) < 0)
+ continue;
+ // The mask of the first half must be equal to the second one.
+ unsigned Shamt = (i%HalfSize)*2;
+ unsigned Elt = SVOp->getMaskElt(i) % HalfSize;
+ Mask |= Elt << Shamt;
+ }
+
+ return Mask;
+}
+
+/// isVSHUFPDYMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to 256-bit
+/// VSHUFPDY. This shuffle doesn't have the same restriction as the PS
+/// version and the mask of the second half isn't binded with the first
+/// one.
+static bool isVSHUFPDYMask(const SmallVectorImpl<int> &Mask, EVT VT,
+ const X86Subtarget *Subtarget) {
+ int NumElems = VT.getVectorNumElements();
+
+ if (!Subtarget->hasAVX() || VT.getSizeInBits() != 256)
+ return false;
+
+ if (NumElems != 4)
+ return false;
+
+ // VSHUFPSY divides the resulting vector into 4 chunks.
+ // The sources are also splitted into 4 chunks, and each destination
+ // chunk must come from a different source chunk.
+ //
+ // SRC1 => X3 X2 X1 X0
+ // SRC2 => Y3 Y2 Y1 Y0
+ //
+ // DST => Y2..Y3, X2..X3, Y1..Y0, X1..X0
+ //
+ int QuarterSize = NumElems/4;
+ int HalfSize = QuarterSize*2;
+ for (int i = 0; i < QuarterSize; ++i)
+ if (!isUndefOrInRange(Mask[i], 0, HalfSize))
+ return false;
+ for (int i = QuarterSize; i < QuarterSize*2; ++i)
+ if (!isUndefOrInRange(Mask[i], NumElems, NumElems+HalfSize))
+ return false;
+ for (int i = QuarterSize*2; i < QuarterSize*3; ++i)
+ if (!isUndefOrInRange(Mask[i], HalfSize, NumElems))
+ return false;
+ for (int i = QuarterSize*3; i < NumElems; ++i)
+ if (!isUndefOrInRange(Mask[i], NumElems+HalfSize, NumElems*2))
+ return false;
+
+ return true;
+}
+
+/// getShuffleVSHUFPDYImmediate - Return the appropriate immediate to shuffle
+/// the specified VECTOR_MASK mask with VSHUFPDY instruction.
+static unsigned getShuffleVSHUFPDYImmediate(SDNode *N) {
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+ EVT VT = SVOp->getValueType(0);
+ int NumElems = VT.getVectorNumElements();
+
+ assert(NumElems == 4 && VT.getSizeInBits() == 256 &&
+ "Only supports v4i64 and v4f64 types");
+
+ int HalfSize = NumElems/2;
+ unsigned Mask = 0;
+ for (int i = 0; i != NumElems ; ++i) {
+ if (SVOp->getMaskElt(i) < 0)
+ continue;
+ int Elt = SVOp->getMaskElt(i) % HalfSize;
+ Mask |= Elt << i;
+ }
+
+ return Mask;
}
/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
-/// specifies a shuffle of elements that is suitable for input to SHUFP*.
+/// specifies a shuffle of elements that is suitable for input to 128-bit
+/// SHUFPS and SHUFPD.
static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) {
int NumElems = VT.getVectorNumElements();
+
+ if (VT.getSizeInBits() != 128)
+ return false;
+
if (NumElems != 2 && NumElems != 4)
return false;
/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVHLPS.
bool X86::isMOVHLPSMask(ShuffleVectorSDNode *N) {
- if (N->getValueType(0).getVectorNumElements() != 4)
+ EVT VT = N->getValueType(0);
+ unsigned NumElems = VT.getVectorNumElements();
+
+ if (VT.getSizeInBits() != 128)
+ return false;
+
+ if (NumElems != 4)
return false;
// Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3
/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
/// <2, 3, 2, 3>
bool X86::isMOVHLPS_v_undef_Mask(ShuffleVectorSDNode *N) {
- unsigned NumElems = N->getValueType(0).getVectorNumElements();
+ EVT VT = N->getValueType(0);
+ unsigned NumElems = VT.getVectorNumElements();
+
+ if (VT.getSizeInBits() != 128)
+ return false;
if (NumElems != 4)
return false;
return isUndefOrEqual(N->getMaskElt(0), 2) &&
- isUndefOrEqual(N->getMaskElt(1), 3) &&
- isUndefOrEqual(N->getMaskElt(2), 2) &&
- isUndefOrEqual(N->getMaskElt(3), 3);
+ isUndefOrEqual(N->getMaskElt(1), 3) &&
+ isUndefOrEqual(N->getMaskElt(2), 2) &&
+ isUndefOrEqual(N->getMaskElt(3), 3);
}
/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
return false;
+ // For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern
+ // FIXME: Need a better way to get rid of this, there's no latency difference
+ // between UNPCKLPD and MOVDDUP, the later should always be checked first and
+ // the former later. We should also remove the "_undef" special mask.
+ if (NumElems == 4 && VT.getSizeInBits() == 256)
+ return false;
+
// Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
// independently on 128-bit lanes.
unsigned NumLanes = VT.getSizeInBits() / 128;
return ::isMOVLMask(M, N->getValueType(0));
}
-/// isVPERMILMask - Return true if the specified VECTOR_SHUFFLE operand
-/// specifies a shuffle of elements that is suitable for input to VPERMIL*.
-static bool isVPERMILMask(const SmallVectorImpl<int> &Mask, EVT VT) {
+/// isVPERM2F128Mask - Match 256-bit shuffles where the elements are considered
+/// as permutations between 128-bit chunks or halves. As an example: this
+/// shuffle bellow:
+/// vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15>
+/// The first half comes from the second half of V1 and the second half from the
+/// the second half of V2.
+static bool isVPERM2F128Mask(const SmallVectorImpl<int> &Mask, EVT VT,
+ const X86Subtarget *Subtarget) {
+ if (!Subtarget->hasAVX() || VT.getSizeInBits() != 256)
+ return false;
+
+ // The shuffle result is divided into half A and half B. In total the two
+ // sources have 4 halves, namely: C, D, E, F. The final values of A and
+ // B must come from C, D, E or F.
+ int HalfSize = VT.getVectorNumElements()/2;
+ bool MatchA = false, MatchB = false;
+
+ // Check if A comes from one of C, D, E, F.
+ for (int Half = 0; Half < 4; ++Half) {
+ if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) {
+ MatchA = true;
+ break;
+ }
+ }
+
+ // Check if B comes from one of C, D, E, F.
+ for (int Half = 0; Half < 4; ++Half) {
+ if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) {
+ MatchB = true;
+ break;
+ }
+ }
+
+ return MatchA && MatchB;
+}
+
+/// getShuffleVPERM2F128Immediate - Return the appropriate immediate to shuffle
+/// the specified VECTOR_MASK mask with VPERM2F128 instructions.
+static unsigned getShuffleVPERM2F128Immediate(SDNode *N) {
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+ EVT VT = SVOp->getValueType(0);
+
+ int HalfSize = VT.getVectorNumElements()/2;
+
+ int FstHalf = 0, SndHalf = 0;
+ for (int i = 0; i < HalfSize; ++i) {
+ if (SVOp->getMaskElt(i) > 0) {
+ FstHalf = SVOp->getMaskElt(i)/HalfSize;
+ break;
+ }
+ }
+ for (int i = HalfSize; i < HalfSize*2; ++i) {
+ if (SVOp->getMaskElt(i) > 0) {
+ SndHalf = SVOp->getMaskElt(i)/HalfSize;
+ break;
+ }
+ }
+
+ return (FstHalf | (SndHalf << 4));
+}
+
+/// isVPERMILPDMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to VPERMILPD*.
+/// Note that VPERMIL mask matching is different depending whether theunderlying
+/// type is 32 or 64. In the VPERMILPS the high half of the mask should point
+/// to the same elements of the low, but to the higher half of the source.
+/// In VPERMILPD the two lanes could be shuffled independently of each other
+/// with the same restriction that lanes can't be crossed.
+static bool isVPERMILPDMask(const SmallVectorImpl<int> &Mask, EVT VT,
+ const X86Subtarget *Subtarget) {
+ int NumElts = VT.getVectorNumElements();
+ int NumLanes = VT.getSizeInBits()/128;
+
+ if (!Subtarget->hasAVX())
+ return false;
+
+ // Match any permutation of 128-bit vector with 64-bit types
+ if (NumLanes == 1 && NumElts != 2)
+ return false;
+
+ // Only match 256-bit with 32 types
+ if (VT.getSizeInBits() == 256 && NumElts != 4)
+ return false;
+
+ // The mask on the high lane is independent of the low. Both can match
+ // any element in inside its own lane, but can't cross.
+ int LaneSize = NumElts/NumLanes;
+ for (int l = 0; l < NumLanes; ++l)
+ for (int i = l*LaneSize; i < LaneSize*(l+1); ++i) {
+ int LaneStart = l*LaneSize;
+ if (!isUndefOrInRange(Mask[i], LaneStart, LaneStart+LaneSize))
+ return false;
+ }
+
+ return true;
+}
+
+/// isVPERMILPSMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to VPERMILPS*.
+/// Note that VPERMIL mask matching is different depending whether theunderlying
+/// type is 32 or 64. In the VPERMILPS the high half of the mask should point
+/// to the same elements of the low, but to the higher half of the source.
+/// In VPERMILPD the two lanes could be shuffled independently of each other
+/// with the same restriction that lanes can't be crossed.
+static bool isVPERMILPSMask(const SmallVectorImpl<int> &Mask, EVT VT,
+ const X86Subtarget *Subtarget) {
unsigned NumElts = VT.getVectorNumElements();
unsigned NumLanes = VT.getSizeInBits()/128;
- // Match any permutation of 128-bit vector with 32/64-bit types
- if (NumLanes == 1) {
- if (NumElts == 4 || NumElts == 2)
- return true;
+ if (!Subtarget->hasAVX())
return false;
- }
- // Only match 256-bit with 32/64-bit types
- if (NumElts != 8 && NumElts != 4)
+ // Match any permutation of 128-bit vector with 32-bit types
+ if (NumLanes == 1 && NumElts != 4)
+ return false;
+
+ // Only match 256-bit with 32 types
+ if (VT.getSizeInBits() == 256 && NumElts != 8)
return false;
// The mask on the high lane should be the same as the low. Actually,
- // they can differ if any of the corresponding index in a lane is undef.
+ // they can differ if any of the corresponding index in a lane is undef
+ // and the other stays in range.
int LaneSize = NumElts/NumLanes;
for (int i = 0; i < LaneSize; ++i) {
int HighElt = i+LaneSize;
+ bool HighValid = isUndefOrInRange(Mask[HighElt], LaneSize, NumElts);
+ bool LowValid = isUndefOrInRange(Mask[i], 0, LaneSize);
+
+ if (!HighValid || !LowValid)
+ return false;
if (Mask[i] < 0 || Mask[HighElt] < 0)
continue;
-
if (Mask[HighElt]-Mask[i] != LaneSize)
return false;
}
return true;
}
-/// getShuffleVPERMILImmediateediate - Return the appropriate immediate to shuffle
-/// the specified VECTOR_MASK mask with VPERMIL* instructions.
-static unsigned getShuffleVPERMILImmediate(SDNode *N) {
+/// getShuffleVPERMILPSImmediate - Return the appropriate immediate to shuffle
+/// the specified VECTOR_MASK mask with VPERMILPS* instructions.
+static unsigned getShuffleVPERMILPSImmediate(SDNode *N) {
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+ EVT VT = SVOp->getValueType(0);
+
+ int NumElts = VT.getVectorNumElements();
+ int NumLanes = VT.getSizeInBits()/128;
+ int LaneSize = NumElts/NumLanes;
+
+ // Although the mask is equal for both lanes do it twice to get the cases
+ // where a mask will match because the same mask element is undef on the
+ // first half but valid on the second. This would get pathological cases
+ // such as: shuffle <u, 0, 1, 2, 4, 4, 5, 6>, which is completely valid.
+ unsigned Mask = 0;
+ for (int l = 0; l < NumLanes; ++l) {
+ for (int i = 0; i < LaneSize; ++i) {
+ int MaskElt = SVOp->getMaskElt(i+(l*LaneSize));
+ if (MaskElt < 0)
+ continue;
+ if (MaskElt >= LaneSize)
+ MaskElt -= LaneSize;
+ Mask |= MaskElt << (i*2);
+ }
+ }
+
+ return Mask;
+}
+
+/// getShuffleVPERMILPDImmediate - Return the appropriate immediate to shuffle
+/// the specified VECTOR_MASK mask with VPERMILPD* instructions.
+static unsigned getShuffleVPERMILPDImmediate(SDNode *N) {
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
EVT VT = SVOp->getValueType(0);
int NumLanes = VT.getSizeInBits()/128;
unsigned Mask = 0;
- for (int i = 0; i < NumElts/NumLanes /* lane size */; ++i)
- Mask |= SVOp->getMaskElt(i) << (i*2);
+ int LaneSize = NumElts/NumLanes;
+ for (int l = 0; l < NumLanes; ++l)
+ for (int i = l*LaneSize; i < LaneSize*(l+1); ++i) {
+ int MaskElt = SVOp->getMaskElt(i);
+ if (MaskElt < 0)
+ continue;
+ Mask |= (MaskElt-l*LaneSize) << i;
+ }
return Mask;
}
return true;
}
+/// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to 256-bit
+/// version of MOVDDUP.
+static bool isMOVDDUPYMask(ShuffleVectorSDNode *N,
+ const X86Subtarget *Subtarget) {
+ EVT VT = N->getValueType(0);
+ int NumElts = VT.getVectorNumElements();
+ bool V2IsUndef = N->getOperand(1).getOpcode() == ISD::UNDEF;
+
+ if (!Subtarget->hasAVX() || VT.getSizeInBits() != 256 ||
+ !V2IsUndef || NumElts != 4)
+ return false;
+
+ for (int i = 0; i != NumElts/2; ++i)
+ if (!isUndefOrEqual(N->getMaskElt(i), 0))
+ return false;
+ for (int i = NumElts/2; i != NumElts; ++i)
+ if (!isUndefOrEqual(N->getMaskElt(i), NumElts/2))
+ return false;
+ return true;
+}
+
/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
-/// specifies a shuffle of elements that is suitable for input to MOVDDUP.
+/// specifies a shuffle of elements that is suitable for input to 128-bit
+/// version of MOVDDUP.
bool X86::isMOVDDUPMask(ShuffleVectorSDNode *N) {
- int e = N->getValueType(0).getVectorNumElements() / 2;
+ EVT VT = N->getValueType(0);
+
+ if (VT.getSizeInBits() != 128)
+ return false;
+ int e = VT.getVectorNumElements() / 2;
for (int i = 0; i < e; ++i)
if (!isUndefOrEqual(N->getMaskElt(i), i))
return false;
/// V1 (and in order), and the upper half elements should come from the upper
/// half of V2 (and in order).
static bool ShouldXformToMOVHLPS(ShuffleVectorSDNode *Op) {
- if (Op->getValueType(0).getVectorNumElements() != 4)
+ EVT VT = Op->getValueType(0);
+ if (VT.getSizeInBits() != 128)
+ return false;
+ if (VT.getVectorNumElements() != 4)
return false;
for (unsigned i = 0, e = 2; i != e; ++i)
if (!isUndefOrEqual(Op->getMaskElt(i), i+2))
/// MOVLP, it must be either a vector load or a scalar load to vector.
static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
ShuffleVectorSDNode *Op) {
+ EVT VT = Op->getValueType(0);
+ if (VT.getSizeInBits() != 128)
+ return false;
+
if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
return false;
// Is V2 is a vector load, don't do this transformation. We will try to use
if (ISD::isNON_EXTLoad(V2))
return false;
- unsigned NumElems = Op->getValueType(0).getVectorNumElements();
+ unsigned NumElems = VT.getVectorNumElements();
if (NumElems != 2 && NumElems != 4)
return false;
/// getZeroVector - Returns a vector of specified type with all zero elements.
///
-static SDValue getZeroVector(EVT VT, bool HasSSE2, SelectionDAG &DAG,
+static SDValue getZeroVector(EVT VT, bool HasXMMInt, SelectionDAG &DAG,
DebugLoc dl) {
assert(VT.isVector() && "Expected a vector type");
// to their dest type. This ensures they get CSE'd.
SDValue Vec;
if (VT.getSizeInBits() == 128) { // SSE
- if (HasSSE2) { // SSE2
+ if (HasXMMInt) { // SSE2
SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
} else { // SSE1
return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
}
-// PromoteSplatv8v16 - All i16 and i8 vector types can't be used directly by
+// PromoteSplati8i16 - All i16 and i8 vector types can't be used directly by
// a generic shuffle instruction because the target has no such instructions.
// Generate shuffles which repeat i16 and i8 several times until they can be
// represented by v4f32 and then be manipulated by target suported shuffles.
-static SDValue PromoteSplatv8v16(SDValue V, SelectionDAG &DAG, int &EltNo) {
+static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) {
EVT VT = V.getValueType();
int NumElems = VT.getVectorNumElements();
DebugLoc dl = V.getDebugLoc();
assert((VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256)
&& "Vector size not supported");
- bool Is128 = VT.getSizeInBits() == 128;
- EVT NVT = Is128 ? MVT::v4f32 : MVT::v8f32;
- V = DAG.getNode(ISD::BITCAST, dl, NVT, V);
-
- if (Is128) {
+ if (VT.getSizeInBits() == 128) {
+ V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V);
int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
- V = DAG.getVectorShuffle(NVT, dl, V, DAG.getUNDEF(NVT), &SplatMask[0]);
+ V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32),
+ &SplatMask[0]);
} else {
- // The second half of indicies refer to the higher part, which is a
- // duplication of the lower one. This makes this shuffle a perfect match
- // for the VPERM instruction.
+ // To use VPERMILPS to splat scalars, the second half of indicies must
+ // refer to the higher part, which is a duplication of the lower one,
+ // because VPERMILPS can only handle in-lane permutations.
int SplatMask[8] = { EltNo, EltNo, EltNo, EltNo,
EltNo+4, EltNo+4, EltNo+4, EltNo+4 };
- V = DAG.getVectorShuffle(NVT, dl, V, DAG.getUNDEF(NVT), &SplatMask[0]);
+
+ V = DAG.getNode(ISD::BITCAST, dl, MVT::v8f32, V);
+ V = DAG.getVectorShuffle(MVT::v8f32, dl, V, DAG.getUNDEF(MVT::v8f32),
+ &SplatMask[0]);
}
return DAG.getNode(ISD::BITCAST, dl, VT, V);
}
-/// PromoteVectorToScalarSplat - Since there's no native support for
-/// scalar_to_vector for 256-bit AVX, a 128-bit scalar_to_vector +
-/// INSERT_SUBVECTOR is generated. Recognize this idiom and do the
-/// shuffle before the insertion, this yields less instructions in the end.
-static SDValue PromoteVectorToScalarSplat(ShuffleVectorSDNode *SV,
- SelectionDAG &DAG) {
- EVT SrcVT = SV->getValueType(0);
- SDValue V1 = SV->getOperand(0);
- DebugLoc dl = SV->getDebugLoc();
- int NumElems = SrcVT.getVectorNumElements();
-
- assert(SrcVT.is256BitVector() && "unknown howto handle vector type");
-
- SmallVector<int, 4> Mask;
- for (int i = 0; i < NumElems/2; ++i)
- Mask.push_back(SV->getMaskElt(i));
-
- EVT SVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getVectorElementType(),
- NumElems/2);
- SDValue SV1 = DAG.getVectorShuffle(SVT, dl, V1.getOperand(1),
- DAG.getUNDEF(SVT), &Mask[0]);
- SDValue InsV = Insert128BitVector(DAG.getUNDEF(SrcVT), SV1,
- DAG.getConstant(0, MVT::i32), DAG, dl);
-
- return Insert128BitVector(InsV, SV1,
- DAG.getConstant(NumElems/2, MVT::i32), DAG, dl);
-}
-
-/// PromoteSplat - Promote a splat of v4i32, v8i16 or v16i8 to v4f32 and
-/// v8i32, v16i16 or v32i8 to v8f32.
+/// PromoteSplat - Splat is promoted to target supported vector shuffles.
static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
EVT SrcVT = SV->getValueType(0);
SDValue V1 = SV->getOperand(0);
int NumElems = SrcVT.getVectorNumElements();
unsigned Size = SrcVT.getSizeInBits();
+ assert(((Size == 128 && NumElems > 4) || Size == 256) &&
+ "Unknown how to promote splat for type");
+
// Extract the 128-bit part containing the splat element and update
// the splat element index when it refers to the higher register.
if (Size == 256) {
EltNo -= NumElems/2;
}
- // Make this 128-bit vector duplicate i8 and i16 elements
- if (NumElems > 4)
- V1 = PromoteSplatv8v16(V1, DAG, EltNo);
+ // All i16 and i8 vector types can't be used directly by a generic shuffle
+ // instruction because the target has no such instruction. Generate shuffles
+ // which repeat i16 and i8 several times until they fit in i32, and then can
+ // be manipulated by target suported shuffles.
+ EVT EltVT = SrcVT.getVectorElementType();
+ if (EltVT == MVT::i8 || EltVT == MVT::i16)
+ V1 = PromoteSplati8i16(V1, DAG, EltNo);
// Recreate the 256-bit vector and place the same 128-bit vector
// into the low and high part. This is necessary because we want
- // to use VPERM to shuffle the v8f32 vector, and VPERM only shuffles
- // inside each separate v4f32 lane.
+ // to use VPERM* to shuffle the vectors
if (Size == 256) {
SDValue InsV = Insert128BitVector(DAG.getUNDEF(SrcVT), V1,
DAG.getConstant(0, MVT::i32), DAG, dl);
/// element of V2 is swizzled into the zero/undef vector, landing at element
/// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
- bool isZero, bool HasSSE2,
- SelectionDAG &DAG) {
+ bool isZero, bool HasXMMInt,
+ SelectionDAG &DAG) {
EVT VT = V2.getValueType();
SDValue V1 = isZero
- ? getZeroVector(VT, HasSSE2, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT);
+ ? getZeroVector(VT, HasXMMInt, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT);
unsigned NumElems = VT.getVectorNumElements();
SmallVector<int, 16> MaskVec;
for (unsigned i = 0; i != NumElems; ++i)
return getShuffleScalarElt(V.getOperand(OpNum).getNode(), Index, DAG,
Depth+1);
}
- case X86ISD::VPERMIL:
+ case X86ISD::VPERMILPS:
+ ImmN = N->getOperand(N->getNumOperands()-1);
+ DecodeVPERMILPSMask(4, cast<ConstantSDNode>(ImmN)->getZExtValue(),
+ ShuffleMask);
+ break;
+ case X86ISD::VPERMILPSY:
+ ImmN = N->getOperand(N->getNumOperands()-1);
+ DecodeVPERMILPSMask(8, cast<ConstantSDNode>(ImmN)->getZExtValue(),
+ ShuffleMask);
+ break;
+ case X86ISD::VPERMILPD:
ImmN = N->getOperand(N->getNumOperands()-1);
- DecodeVPERMILMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(),
+ DecodeVPERMILPDMask(2, cast<ConstantSDNode>(ImmN)->getZExtValue(),
ShuffleMask);
+ break;
+ case X86ISD::VPERMILPDY:
+ ImmN = N->getOperand(N->getNumOperands()-1);
+ DecodeVPERMILPDMask(4, cast<ConstantSDNode>(ImmN)->getZExtValue(),
+ ShuffleMask);
+ break;
+ case X86ISD::VPERM2F128:
+ ImmN = N->getOperand(N->getNumOperands()-1);
+ DecodeVPERM2F128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(),
+ ShuffleMask);
+ break;
+ case X86ISD::MOVDDUP:
+ case X86ISD::MOVLHPD:
+ case X86ISD::MOVLPD:
+ case X86ISD::MOVLPS:
+ case X86ISD::MOVSHDUP:
+ case X86ISD::MOVSLDUP:
+ case X86ISD::PALIGN:
+ return SDValue(); // Not yet implemented.
default:
- assert("not implemented for target shuffle node");
+ assert(0 && "unknown target shuffle node");
return SDValue();
}
/// logical left or right shift of a vector.
static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
+ // Although the logic below support any bitwidth size, there are no
+ // shift instructions which handle more than 128-bit vectors.
+ if (SVOp->getValueType(0).getSizeInBits() > 128)
+ return false;
+
if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) ||
isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt))
return true;
static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
unsigned NumBits, SelectionDAG &DAG,
const TargetLowering &TLI, DebugLoc dl) {
+ assert(VT.getSizeInBits() == 128 && "Unknown type for VShift");
EVT ShVT = MVT::v2i64;
unsigned Opc = isLeft ? X86ISD::VSHL : X86ISD::VSRL;
SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp);
return SDValue();
}
+ // FIXME: 256-bit vector instructions don't require a strict alignment,
+ // improve this code to support it better.
+ unsigned RequiredAlign = VT.getSizeInBits()/8;
SDValue Chain = LD->getChain();
- // Make sure the stack object alignment is at least 16.
+ // Make sure the stack object alignment is at least 16 or 32.
MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
- if (DAG.InferPtrAlignment(Ptr) < 16) {
+ if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
if (MFI->isFixedObjectIndex(FI)) {
// Can't change the alignment. FIXME: It's possible to compute
// the exact stack offset and reference FI + adjust offset instead.
// If someone *really* cares about this. That's the way to implement it.
return SDValue();
} else {
- MFI->setObjectAlignment(FI, 16);
+ MFI->setObjectAlignment(FI, RequiredAlign);
}
}
- // (Offset % 16) must be multiple of 4. Then address is then
+ // (Offset % 16 or 32) must be multiple of 4. Then address is then
// Ptr + (Offset & ~15).
if (Offset < 0)
return SDValue();
- if ((Offset % 16) & 3)
+ if ((Offset % RequiredAlign) & 3)
return SDValue();
- int64_t StartOffset = Offset & ~15;
+ int64_t StartOffset = Offset & ~(RequiredAlign-1);
if (StartOffset)
Ptr = DAG.getNode(ISD::ADD, Ptr.getDebugLoc(), Ptr.getValueType(),
Ptr,DAG.getConstant(StartOffset, Ptr.getValueType()));
int EltNo = (Offset - StartOffset) >> 2;
- int Mask[4] = { EltNo, EltNo, EltNo, EltNo };
- EVT VT = (PVT == MVT::i32) ? MVT::v4i32 : MVT::v4f32;
- SDValue V1 = DAG.getLoad(VT, dl, Chain, Ptr,
+ int NumElems = VT.getVectorNumElements();
+
+ EVT CanonVT = VT.getSizeInBits() == 128 ? MVT::v4i32 : MVT::v8i32;
+ EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
+ SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
LD->getPointerInfo().getWithOffset(StartOffset),
false, false, 0);
- // Canonicalize it to a v4i32 shuffle.
- V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1);
- return DAG.getNode(ISD::BITCAST, dl, VT,
- DAG.getVectorShuffle(MVT::v4i32, dl, V1,
- DAG.getUNDEF(MVT::v4i32),&Mask[0]));
+
+ // Canonicalize it to a v4i32 or v8i32 shuffle.
+ SmallVector<int, 8> Mask;
+ for (int i = 0; i < NumElems; ++i)
+ Mask.push_back(EltNo);
+
+ V1 = DAG.getNode(ISD::BITCAST, dl, CanonVT, V1);
+ return DAG.getNode(ISD::BITCAST, dl, NVT,
+ DAG.getVectorShuffle(CanonVT, dl, V1,
+ DAG.getUNDEF(CanonVT),&Mask[0]));
}
return SDValue();
DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
- SDValue ResNode = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys,
- Ops, 2, MVT::i32,
- LDBase->getMemOperand());
+ SDValue ResNode =
+ DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, 2, MVT::i64,
+ LDBase->getPointerInfo(),
+ LDBase->getAlignment(),
+ false/*isVolatile*/, true/*ReadMem*/,
+ false/*WriteMem*/);
return DAG.getNode(ISD::BITCAST, DL, VT, ResNode);
}
return SDValue();
EVT ExtVT = VT.getVectorElementType();
unsigned NumElems = Op.getNumOperands();
- // All zero's:
- // - pxor (SSE2), xorps (SSE1), vpxor (128 AVX), xorp[s|d] (256 AVX)
- // All one's:
- // - pcmpeqd (SSE2 and 128 AVX), fallback to constant pools (256 AVX)
- if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
- ISD::isBuildVectorAllOnes(Op.getNode())) {
- // Canonicalize this to <4 x i32> or <8 x 32> (SSE) to
- // 1) ensure the zero vectors are CSE'd, and 2) ensure that i64 scalars are
- // eliminated on x86-32 hosts.
+ // Vectors containing all zeros can be matched by pxor and xorps later
+ if (ISD::isBuildVectorAllZeros(Op.getNode())) {
+ // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
+ // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
if (Op.getValueType() == MVT::v4i32 ||
Op.getValueType() == MVT::v8i32)
return Op;
- if (ISD::isBuildVectorAllOnes(Op.getNode()))
- return getOnesVector(Op.getValueType(), DAG, dl);
- return getZeroVector(Op.getValueType(), Subtarget->hasSSE2(), DAG, dl);
+ return getZeroVector(Op.getValueType(), Subtarget->hasXMMInt(), DAG, dl);
+ }
+
+ // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
+ // vectors or broken into v4i32 operations on 256-bit vectors.
+ if (ISD::isBuildVectorAllOnes(Op.getNode())) {
+ if (Op.getValueType() == MVT::v4i32)
+ return Op;
+
+ return getOnesVector(Op.getValueType(), DAG, dl);
}
unsigned EVTBits = ExtVT.getSizeInBits();
Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
Item = getShuffleVectorZeroOrUndef(Item, 0, true,
- Subtarget->hasSSE2(), DAG);
+ Subtarget->hasXMMInt(), DAG);
// Now we have our 32-bit value zero extended in the low element of
// a vector. If Idx != 0, swizzle it into place.
(ExtVT == MVT::i64 && Subtarget->is64Bit())) {
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
// Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
- return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget->hasSSE2(),
+ return getShuffleVectorZeroOrUndef(Item, 0, true,Subtarget->hasXMMInt(),
DAG);
} else if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
EVT MiddleVT = MVT::v4i32;
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MiddleVT, Item);
Item = getShuffleVectorZeroOrUndef(Item, 0, true,
- Subtarget->hasSSE2(), DAG);
+ Subtarget->hasXMMInt(), DAG);
return DAG.getNode(ISD::BITCAST, dl, VT, Item);
}
}
// Turn it into a shuffle of zero and zero-extended scalar to vector.
Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0,
- Subtarget->hasSSE2(), DAG);
+ Subtarget->hasXMMInt(), DAG);
SmallVector<int, 8> MaskVec;
for (unsigned i = 0; i < NumElems; i++)
MaskVec.push_back(i == Idx ? 0 : 1);
NumElems/2);
// Recreate the wider vector with the lower and upper part.
- SDValue Vec = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), Upper,
- DAG.getConstant(NumElems/2, MVT::i32), DAG, dl);
- return Insert128BitVector(Vec, Lower, DAG.getConstant(0, MVT::i32),
+ SDValue Vec = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), Lower,
+ DAG.getConstant(0, MVT::i32), DAG, dl);
+ return Insert128BitVector(Vec, Upper, DAG.getConstant(NumElems/2, MVT::i32),
DAG, dl);
}
SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
Op.getOperand(Idx));
return getShuffleVectorZeroOrUndef(V2, Idx, true,
- Subtarget->hasSSE2(), DAG);
+ Subtarget->hasXMMInt(), DAG);
}
return SDValue();
}
for (unsigned i = 0; i < 4; ++i) {
bool isZero = !(NonZeros & (1 << i));
if (isZero)
- V[i] = getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl);
+ V[i] = getZeroVector(VT, Subtarget->hasXMMInt(), DAG, dl);
else
V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
}
return LD;
// For SSE 4.1, use insertps to put the high elements into the low element.
- if (getSubtarget()->hasSSE41()) {
+ if (getSubtarget()->hasSSE41() || getSubtarget()->hasAVX()) {
SDValue Result;
if (Op.getOperand(0).getOpcode() != ISD::UNDEF)
Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
return SDValue();
}
-SDValue
-X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
- // We support concatenate two MMX registers and place them in a MMX
- // register. This is better than doing a stack convert.
+// LowerMMXCONCAT_VECTORS - We support concatenate two MMX registers and place
+// them in a MMX register. This is better than doing a stack convert.
+static SDValue LowerMMXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
DebugLoc dl = Op.getDebugLoc();
EVT ResVT = Op.getValueType();
- assert(Op.getNumOperands() == 2);
+
assert(ResVT == MVT::v2i64 || ResVT == MVT::v4i32 ||
ResVT == MVT::v8i16 || ResVT == MVT::v16i8);
int Mask[2];
return DAG.getNode(ISD::BITCAST, dl, ResVT, VecOp);
}
+// LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction
+// to create 256-bit vectors from two other 128-bit ones.
+static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
+ DebugLoc dl = Op.getDebugLoc();
+ EVT ResVT = Op.getValueType();
+
+ assert(ResVT.getSizeInBits() == 256 && "Value type must be 256-bit wide");
+
+ SDValue V1 = Op.getOperand(0);
+ SDValue V2 = Op.getOperand(1);
+ unsigned NumElems = ResVT.getVectorNumElements();
+
+ SDValue V = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, ResVT), V1,
+ DAG.getConstant(0, MVT::i32), DAG, dl);
+ return Insert128BitVector(V, V2, DAG.getConstant(NumElems/2, MVT::i32),
+ DAG, dl);
+}
+
+SDValue
+X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
+ EVT ResVT = Op.getValueType();
+
+ assert(Op.getNumOperands() == 2);
+ assert((ResVT.getSizeInBits() == 128 || ResVT.getSizeInBits() == 256) &&
+ "Unsupported CONCAT_VECTORS for value type");
+
+ // We support concatenate two MMX registers and place them in a MMX register.
+ // This is better than doing a stack convert.
+ if (ResVT.is128BitVector())
+ return LowerMMXCONCAT_VECTORS(Op, DAG);
+
+ // 256-bit AVX can use the vinsertf128 instruction to create 256-bit vectors
+ // from two other 128-bit ones.
+ return LowerAVXCONCAT_VECTORS(Op, DAG);
+}
+
// v8i16 shuffles - Prefer shuffles in the following order:
// 1. [all] pshuflw, pshufhw, optional move
// 2. [ssse3] 1 x pshufb
// quads, disable the next transformation since it does not help SSSE3.
bool V1Used = InputQuads[0] || InputQuads[1];
bool V2Used = InputQuads[2] || InputQuads[3];
- if (Subtarget->hasSSSE3()) {
+ if (Subtarget->hasSSSE3() || Subtarget->hasAVX()) {
if (InputQuads.count() == 2 && V1Used && V2Used) {
BestLoQuad = InputQuads.find_first();
BestHiQuad = InputQuads.find_next(BestLoQuad);
// If we have SSSE3, and all words of the result are from 1 input vector,
// case 2 is generated, otherwise case 3 is generated. If no SSSE3
// is present, fall back to case 4.
- if (Subtarget->hasSSSE3()) {
+ if (Subtarget->hasSSSE3() || Subtarget->hasAVX()) {
SmallVector<SDValue,16> pshufbMask;
// If we have elements from both input vectors, set the high bit of the
NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
&MaskV[0]);
- if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3())
+ if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE &&
+ (Subtarget->hasSSSE3() || Subtarget->hasAVX()))
NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16,
NewV.getOperand(0),
X86::getShufflePSHUFLWImmediate(NewV.getNode()),
NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
&MaskV[0]);
- if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3())
+ if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE &&
+ (Subtarget->hasSSSE3() || Subtarget->hasAVX()))
NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16,
NewV.getOperand(0),
X86::getShufflePSHUFHWImmediate(NewV.getNode()),
}
// If SSSE3, use 1 pshufb instruction per vector with elements in the result.
- if (TLI.getSubtarget()->hasSSSE3()) {
+ if (TLI.getSubtarget()->hasSSSE3() || TLI.getSubtarget()->hasAVX()) {
SmallVector<SDValue,16> pshufbMask;
// If all result elements are from one input vector, then only translate
OpVT, SrcOp)));
}
-/// LowerVECTOR_SHUFFLE_256 - Handle all 256-bit wide vectors shuffles
-/// which could not be matched by any known target speficic shuffle
-static SDValue
-LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
+/// areShuffleHalvesWithinDisjointLanes - Check whether each half of a vector
+/// shuffle node referes to only one lane in the sources.
+static bool areShuffleHalvesWithinDisjointLanes(ShuffleVectorSDNode *SVOp) {
+ EVT VT = SVOp->getValueType(0);
+ int NumElems = VT.getVectorNumElements();
+ int HalfSize = NumElems/2;
+ SmallVector<int, 16> M;
+ SVOp->getMask(M);
+ bool MatchA = false, MatchB = false;
+
+ for (int l = 0; l < NumElems*2; l += HalfSize) {
+ if (isUndefOrInRange(M, 0, HalfSize, l, l+HalfSize)) {
+ MatchA = true;
+ break;
+ }
+ }
+
+ for (int l = 0; l < NumElems*2; l += HalfSize) {
+ if (isUndefOrInRange(M, HalfSize, HalfSize, l, l+HalfSize)) {
+ MatchB = true;
+ break;
+ }
+ }
+
+ return MatchA && MatchB;
+}
+
+/// LowerVECTOR_SHUFFLE_256 - Handle all 256-bit wide vectors shuffles
+/// which could not be matched by any known target speficic shuffle
+static SDValue
+LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
+ if (areShuffleHalvesWithinDisjointLanes(SVOp)) {
+ // If each half of a vector shuffle node referes to only one lane in the
+ // source vectors, extract each used 128-bit lane and shuffle them using
+ // 128-bit shuffles. Then, concatenate the results. Otherwise leave
+ // the work to the legalizer.
+ DebugLoc dl = SVOp->getDebugLoc();
+ EVT VT = SVOp->getValueType(0);
+ int NumElems = VT.getVectorNumElements();
+ int HalfSize = NumElems/2;
+
+ // Extract the reference for each half
+ int FstVecExtractIdx = 0, SndVecExtractIdx = 0;
+ int FstVecOpNum = 0, SndVecOpNum = 0;
+ for (int i = 0; i < HalfSize; ++i) {
+ int Elt = SVOp->getMaskElt(i);
+ if (SVOp->getMaskElt(i) < 0)
+ continue;
+ FstVecOpNum = Elt/NumElems;
+ FstVecExtractIdx = Elt % NumElems < HalfSize ? 0 : HalfSize;
+ break;
+ }
+ for (int i = HalfSize; i < NumElems; ++i) {
+ int Elt = SVOp->getMaskElt(i);
+ if (SVOp->getMaskElt(i) < 0)
+ continue;
+ SndVecOpNum = Elt/NumElems;
+ SndVecExtractIdx = Elt % NumElems < HalfSize ? 0 : HalfSize;
+ break;
+ }
+
+ // Extract the subvectors
+ SDValue V1 = Extract128BitVector(SVOp->getOperand(FstVecOpNum),
+ DAG.getConstant(FstVecExtractIdx, MVT::i32), DAG, dl);
+ SDValue V2 = Extract128BitVector(SVOp->getOperand(SndVecOpNum),
+ DAG.getConstant(SndVecExtractIdx, MVT::i32), DAG, dl);
+
+ // Generate 128-bit shuffles
+ SmallVector<int, 16> MaskV1, MaskV2;
+ for (int i = 0; i < HalfSize; ++i) {
+ int Elt = SVOp->getMaskElt(i);
+ MaskV1.push_back(Elt < 0 ? Elt : Elt % HalfSize);
+ }
+ for (int i = HalfSize; i < NumElems; ++i) {
+ int Elt = SVOp->getMaskElt(i);
+ MaskV2.push_back(Elt < 0 ? Elt : Elt % HalfSize);
+ }
+
+ EVT NVT = V1.getValueType();
+ V1 = DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &MaskV1[0]);
+ V2 = DAG.getVectorShuffle(NVT, dl, V2, DAG.getUNDEF(NVT), &MaskV2[0]);
+
+ // Concatenate the result back
+ SDValue V = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), V1,
+ DAG.getConstant(0, MVT::i32), DAG, dl);
+ return Insert128BitVector(V, V2, DAG.getConstant(NumElems/2, MVT::i32),
+ DAG, dl);
+ }
+
return SDValue();
}
static
SDValue getMOVLowToHigh(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG,
- bool HasSSE2) {
+ bool HasXMMInt) {
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
EVT VT = Op.getValueType();
assert(VT != MVT::v2i64 && "unsupported shuffle type");
- if (HasSSE2 && VT == MVT::v2f64)
+ if (HasXMMInt && VT == MVT::v2f64)
return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG);
- // v4f32 or v4i32
- return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V2, DAG);
+ // v4f32 or v4i32: canonizalized to v4f32 (which is legal for SSE1)
+ return DAG.getNode(ISD::BITCAST, dl, VT,
+ getTargetShuffleNode(X86ISD::MOVLHPS, dl, MVT::v4f32,
+ DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V1),
+ DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V2), DAG));
}
static
return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG);
}
+static inline unsigned getSHUFPOpcode(EVT VT) {
+ switch(VT.getSimpleVT().SimpleTy) {
+ case MVT::v8i32: // Use fp unit for int unpack.
+ case MVT::v8f32:
+ case MVT::v4i32: // Use fp unit for int unpack.
+ case MVT::v4f32: return X86ISD::SHUFPS;
+ case MVT::v4i64: // Use fp unit for int unpack.
+ case MVT::v4f64:
+ case MVT::v2i64: // Use fp unit for int unpack.
+ case MVT::v2f64: return X86ISD::SHUFPD;
+ default:
+ llvm_unreachable("Unknown type for shufp*");
+ }
+ return 0;
+}
+
static
-SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
+SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasXMMInt) {
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
EVT VT = Op.getValueType();
CanFoldLoad = false;
if (CanFoldLoad) {
- if (HasSSE2 && NumElems == 2)
+ if (HasXMMInt && NumElems == 2)
return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG);
if (NumElems == 4)
// this is horrible, but will stay like this until we move all shuffle
// matching to x86 specific nodes. Note that for the 1st condition all
// types are matched with movsd.
- if ((HasSSE2 && NumElems == 2) || !X86::isMOVLMask(SVOp))
- return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
- else if (HasSSE2)
+ if (HasXMMInt) {
+ // FIXME: isMOVLMask should be checked and matched before getMOVLP,
+ // as to remove this logic from here, as much as possible
+ if (NumElems == 2 || !X86::isMOVLMask(SVOp))
+ return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
-
+ }
assert(VT != MVT::v4i32 && "unsupported shuffle type");
// Invert the operand order and use SHUFPS to match it.
- return getTargetShuffleNode(X86ISD::SHUFPS, dl, VT, V2, V1,
+ return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V2, V1,
X86::getShuffleSHUFImmediate(SVOp), DAG);
}
case MVT::v2i64: return X86ISD::PUNPCKLQDQ;
case MVT::v4f32: return X86ISD::UNPCKLPS;
case MVT::v2f64: return X86ISD::UNPCKLPD;
+ case MVT::v8i32: // Use fp unit for int unpack.
case MVT::v8f32: return X86ISD::VUNPCKLPSY;
+ case MVT::v4i64: // Use fp unit for int unpack.
case MVT::v4f64: return X86ISD::VUNPCKLPDY;
case MVT::v16i8: return X86ISD::PUNPCKLBW;
case MVT::v8i16: return X86ISD::PUNPCKLWD;
case MVT::v2i64: return X86ISD::PUNPCKHQDQ;
case MVT::v4f32: return X86ISD::UNPCKHPS;
case MVT::v2f64: return X86ISD::UNPCKHPD;
+ case MVT::v8i32: // Use fp unit for int unpack.
case MVT::v8f32: return X86ISD::VUNPCKHPSY;
+ case MVT::v4i64: // Use fp unit for int unpack.
case MVT::v4f64: return X86ISD::VUNPCKHPDY;
case MVT::v16i8: return X86ISD::PUNPCKHBW;
case MVT::v8i16: return X86ISD::PUNPCKHWD;
return 0;
}
+static inline unsigned getVPERMILOpcode(EVT VT) {
+ switch(VT.getSimpleVT().SimpleTy) {
+ case MVT::v4i32:
+ case MVT::v4f32: return X86ISD::VPERMILPS;
+ case MVT::v2i64:
+ case MVT::v2f64: return X86ISD::VPERMILPD;
+ case MVT::v8i32:
+ case MVT::v8f32: return X86ISD::VPERMILPSY;
+ case MVT::v4i64:
+ case MVT::v4f64: return X86ISD::VPERMILPDY;
+ default:
+ llvm_unreachable("Unknown type for vpermil");
+ }
+ return 0;
+}
+
+/// isVectorBroadcast - Check if the node chain is suitable to be xformed to
+/// a vbroadcast node. The nodes are suitable whenever we can fold a load coming
+/// from a 32 or 64 bit scalar. Update Op to the desired load to be folded.
+static bool isVectorBroadcast(SDValue &Op) {
+ EVT VT = Op.getValueType();
+ bool Is256 = VT.getSizeInBits() == 256;
+
+ assert((VT.getSizeInBits() == 128 || Is256) &&
+ "Unsupported type for vbroadcast node");
+
+ SDValue V = Op;
+ if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
+ V = V.getOperand(0);
+
+ if (Is256 && !(V.hasOneUse() &&
+ V.getOpcode() == ISD::INSERT_SUBVECTOR &&
+ V.getOperand(0).getOpcode() == ISD::UNDEF))
+ return false;
+
+ if (Is256)
+ V = V.getOperand(1);
+
+ if (!V.hasOneUse())
+ return false;
+
+ // Check the source scalar_to_vector type. 256-bit broadcasts are
+ // supported for 32/64-bit sizes, while 128-bit ones are only supported
+ // for 32-bit scalars.
+ if (V.getOpcode() != ISD::SCALAR_TO_VECTOR)
+ return false;
+
+ unsigned ScalarSize = V.getOperand(0).getValueType().getSizeInBits();
+ if (ScalarSize != 32 && ScalarSize != 64)
+ return false;
+ if (!Is256 && ScalarSize == 64)
+ return false;
+
+ V = V.getOperand(0);
+ if (!MayFoldLoad(V))
+ return false;
+
+ // Return the load node
+ Op = V;
+ return true;
+}
+
static
SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG,
const TargetLowering &TLI,
SDValue V2 = Op.getOperand(1);
if (isZeroShuffle(SVOp))
- return getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl);
+ return getZeroVector(VT, Subtarget->hasXMMInt(), DAG, dl);
// Handle splat operations
if (SVOp->isSplat()) {
unsigned NumElem = VT.getVectorNumElements();
+ int Size = VT.getSizeInBits();
// Special case, this is the only place now where it's allowed to return
// a vector_shuffle operation without using a target specific node, because
// *hopefully* it will be optimized away by the dag combiner. FIXME: should
if (NumElem <= 4 && CanXFormVExtractWithShuffleIntoLoad(Op, DAG, TLI))
return Op;
- // Since there's no native support for scalar_to_vector for 256-bit AVX, a
- // 128-bit scalar_to_vector + INSERT_SUBVECTOR is generated. Recognize this
- // idiom and do the shuffle before the insertion, this yields less
- // instructions in the end.
- if (VT.is256BitVector() &&
- V1.getOpcode() == ISD::INSERT_SUBVECTOR &&
- V1.getOperand(0).getOpcode() == ISD::UNDEF &&
- V1.getOperand(1).getOpcode() == ISD::SCALAR_TO_VECTOR)
- return PromoteVectorToScalarSplat(SVOp, DAG);
+ // Use vbroadcast whenever the splat comes from a foldable load
+ if (Subtarget->hasAVX() && isVectorBroadcast(V1))
+ return DAG.getNode(X86ISD::VBROADCAST, dl, VT, V1);
// Handle splats by matching through known shuffle masks
- if ((VT.is128BitVector() && NumElem <= 4) ||
- (VT.is256BitVector() && NumElem <= 8))
+ if ((Size == 128 && NumElem <= 4) ||
+ (Size == 256 && NumElem < 8))
return SDValue();
- // All i16 and i8 vector types can't be used directly by a generic shuffle
- // instruction because the target has no such instruction. Generate shuffles
- // which repeat i16 and i8 several times until they fit in i32, and then can
- // be manipulated by target suported shuffles. After the insertion of the
- // necessary shuffles, the result is bitcasted back to v4f32 or v8f32.
+ // All remaning splats are promoted to target supported vector shuffles.
return PromoteSplat(SVOp, DAG);
}
SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl);
if (NewOp.getNode())
return DAG.getNode(ISD::BITCAST, dl, VT, NewOp);
- } else if ((VT == MVT::v4i32 || (VT == MVT::v4f32 && Subtarget->hasSSE2()))) {
+ } else if ((VT == MVT::v4i32 ||
+ (VT == MVT::v4f32 && Subtarget->hasXMMInt()))) {
// FIXME: Figure out a cleaner way to do this.
// Try to make use of movq to zero out the top part.
if (ISD::isBuildVectorAllZeros(V2.getNode())) {
bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
bool V1IsSplat = false;
bool V2IsSplat = false;
- bool HasSSE2 = Subtarget->hasSSE2() || Subtarget->hasAVX();
- bool HasSSE3 = Subtarget->hasSSE3() || Subtarget->hasAVX();
- bool HasSSSE3 = Subtarget->hasSSSE3() || Subtarget->hasAVX();
+ bool HasXMMInt = Subtarget->hasXMMInt();
MachineFunction &MF = DAG.getMachineFunction();
bool OptForSize = MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize);
if (OptForSize && X86::isUNPCKH_v_undef_Mask(SVOp))
return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG);
- if (X86::isMOVDDUPMask(SVOp) && HasSSE3 && V2IsUndef &&
- RelaxedMayFoldVectorLoad(V1))
+ if (X86::isMOVDDUPMask(SVOp) &&
+ (Subtarget->hasSSE3() || Subtarget->hasAVX()) &&
+ V2IsUndef && RelaxedMayFoldVectorLoad(V1))
return getMOVDDup(Op, dl, V1, DAG);
if (X86::isMOVHLPS_v_undef_Mask(SVOp))
return getMOVHighToLow(Op, dl, DAG);
// Use to match splats
- if (HasSSE2 && X86::isUNPCKHMask(SVOp) && V2IsUndef &&
+ if (HasXMMInt && X86::isUNPCKHMask(SVOp) && V2IsUndef &&
(VT == MVT::v2f64 || VT == MVT::v2i64))
return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG);
unsigned TargetMask = X86::getShuffleSHUFImmediate(SVOp);
- if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32))
+ if (HasXMMInt && (VT == MVT::v4f32 || VT == MVT::v4i32))
return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG);
- if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64))
- return getTargetShuffleNode(X86ISD::SHUFPD, dl, VT, V1, V1,
- TargetMask, DAG);
-
- if (VT == MVT::v4f32)
- return getTargetShuffleNode(X86ISD::SHUFPS, dl, VT, V1, V1,
- TargetMask, DAG);
+ return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V1, V1,
+ TargetMask, DAG);
}
// Check if this can be converted into a logical shift.
bool isLeft = false;
unsigned ShAmt = 0;
SDValue ShVal;
- bool isShift = getSubtarget()->hasSSE2() &&
- isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
+ bool isShift = getSubtarget()->hasXMMInt() &&
+ isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
if (isShift && ShVal.hasOneUse()) {
// If the shifted value has multiple uses, it may be cheaper to use
// v_set0 + movlhps or movhlps, etc.
if (ISD::isBuildVectorAllZeros(V1.getNode()))
return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl);
if (!X86::isMOVLPMask(SVOp)) {
- if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64))
+ if (HasXMMInt && (VT == MVT::v2i64 || VT == MVT::v2f64))
return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
if (VT == MVT::v4i32 || VT == MVT::v4f32)
// FIXME: fold these into legal mask.
if (X86::isMOVLHPSMask(SVOp) && !X86::isUNPCKLMask(SVOp))
- return getMOVLowToHigh(Op, dl, DAG, HasSSE2);
+ return getMOVLowToHigh(Op, dl, DAG, HasXMMInt);
if (X86::isMOVHLPSMask(SVOp))
return getMOVHighToLow(Op, dl, DAG);
return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG);
if (X86::isMOVLPMask(SVOp))
- return getMOVLP(Op, dl, DAG, HasSSE2);
+ return getMOVLP(Op, dl, DAG, HasXMMInt);
if (ShouldXformToMOVHLPS(SVOp) ||
ShouldXformToMOVLP(V1.getNode(), V2.getNode(), SVOp))
SmallVector<int, 16> M;
SVOp->getMask(M);
- if (isPALIGNRMask(M, VT, HasSSSE3))
+ if (isPALIGNRMask(M, VT, Subtarget->hasSSSE3() || Subtarget->hasAVX()))
return getTargetShuffleNode(X86ISD::PALIGN, dl, VT, V1, V2,
X86::getShufflePALIGNRImmediate(SVOp),
DAG);
X86::getShufflePSHUFLWImmediate(SVOp),
DAG);
- if (isSHUFPMask(M, VT)) {
- unsigned TargetMask = X86::getShuffleSHUFImmediate(SVOp);
- if (VT == MVT::v4f32 || VT == MVT::v4i32)
- return getTargetShuffleNode(X86ISD::SHUFPS, dl, VT, V1, V2,
- TargetMask, DAG);
- if (VT == MVT::v2f64 || VT == MVT::v2i64)
- return getTargetShuffleNode(X86ISD::SHUFPD, dl, VT, V1, V2,
- TargetMask, DAG);
- }
+ if (isSHUFPMask(M, VT))
+ return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V1, V2,
+ X86::getShuffleSHUFImmediate(SVOp), DAG);
if (X86::isUNPCKL_v_undef_Mask(SVOp))
return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V1, DAG);
if (X86::isUNPCKH_v_undef_Mask(SVOp))
return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG);
+ //===--------------------------------------------------------------------===//
+ // Generate target specific nodes for 128 or 256-bit shuffles only
+ // supported in the AVX instruction set.
+ //
+
+ // Handle VMOVDDUPY permutations
+ if (isMOVDDUPYMask(SVOp, Subtarget))
+ return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG);
+
+ // Handle VPERMILPS* permutations
+ if (isVPERMILPSMask(M, VT, Subtarget))
+ return getTargetShuffleNode(getVPERMILOpcode(VT), dl, VT, V1,
+ getShuffleVPERMILPSImmediate(SVOp), DAG);
+
+ // Handle VPERMILPD* permutations
+ if (isVPERMILPDMask(M, VT, Subtarget))
+ return getTargetShuffleNode(getVPERMILOpcode(VT), dl, VT, V1,
+ getShuffleVPERMILPDImmediate(SVOp), DAG);
+
+ // Handle VPERM2F128 permutations
+ if (isVPERM2F128Mask(M, VT, Subtarget))
+ return getTargetShuffleNode(X86ISD::VPERM2F128, dl, VT, V1, V2,
+ getShuffleVPERM2F128Immediate(SVOp), DAG);
+
+ // Handle VSHUFPSY permutations
+ if (isVSHUFPSYMask(M, VT, Subtarget))
+ return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V1, V2,
+ getShuffleVSHUFPSYImmediate(SVOp), DAG);
+
+ // Handle VSHUFPDY permutations
+ if (isVSHUFPDYMask(M, VT, Subtarget))
+ return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V1, V2,
+ getShuffleVSHUFPDYImmediate(SVOp), DAG);
+
+ //===--------------------------------------------------------------------===//
+ // Since no target specific shuffle was selected for this generic one,
+ // lower it into other known shuffles. FIXME: this isn't true yet, but
+ // this is the plan.
+ //
+
// Handle v8i16 specifically since SSE can do byte extraction and insertion.
if (VT == MVT::v8i16) {
SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, DAG);
if (NumElems == 4 && VT.getSizeInBits() == 128)
return LowerVECTOR_SHUFFLE_128v4(SVOp, DAG);
- //===--------------------------------------------------------------------===//
- // Custom lower or generate target specific nodes for 256-bit shuffles.
-
- // Handle VPERMIL permutations
- if (isVPERMILMask(M, VT)) {
- unsigned TargetMask = getShuffleVPERMILImmediate(SVOp);
- if (VT == MVT::v8f32)
- return getTargetShuffleNode(X86ISD::VPERMIL, dl, VT, V1, TargetMask, DAG);
- }
-
// Handle general 256-bit shuffles
if (VT.is256BitVector())
return LowerVECTOR_SHUFFLE_256(SVOp, DAG);
SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
DebugLoc dl = Op.getDebugLoc();
+
+ if (Op.getOperand(0).getValueType().getSizeInBits() != 128)
+ return SDValue();
+
if (VT.getSizeInBits() == 8) {
SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
Op.getOperand(0), Op.getOperand(1));
SDValue Vec = Op.getOperand(0);
EVT VecVT = Vec.getValueType();
- // If this is a 256-bit vector result, first extract the 128-bit
- // vector and then extract from the 128-bit vector.
- if (VecVT.getSizeInBits() > 128) {
+ // If this is a 256-bit vector result, first extract the 128-bit vector and
+ // then extract the element from the 128-bit vector.
+ if (VecVT.getSizeInBits() == 256) {
DebugLoc dl = Op.getNode()->getDebugLoc();
unsigned NumElems = VecVT.getVectorNumElements();
SDValue Idx = Op.getOperand(1);
-
- if (!isa<ConstantSDNode>(Idx))
- return SDValue();
-
- unsigned ExtractNumElems = NumElems / (VecVT.getSizeInBits() / 128);
unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
// Get the 128-bit vector.
- bool Upper = IdxVal >= ExtractNumElems;
- Vec = Extract128BitVector(Vec, Idx, DAG, dl);
-
- // Extract from it.
- SDValue ScaledIdx = Idx;
- if (Upper)
- ScaledIdx = DAG.getNode(ISD::SUB, dl, Idx.getValueType(), Idx,
- DAG.getConstant(ExtractNumElems,
- Idx.getValueType()));
+ bool Upper = IdxVal >= NumElems/2;
+ Vec = Extract128BitVector(Vec,
+ DAG.getConstant(Upper ? NumElems/2 : 0, MVT::i32), DAG, dl);
+
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
- ScaledIdx);
+ Upper ? DAG.getConstant(IdxVal-NumElems/2, MVT::i32) : Idx);
}
assert(Vec.getValueSizeInBits() <= 128 && "Unexpected vector length");
- if (Subtarget->hasSSE41()) {
+ if (Subtarget->hasSSE41() || Subtarget->hasAVX()) {
SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG);
if (Res.getNode())
return Res;
return Op;
// SHUFPS the element to the lowest double word, then movss.
- int Mask[4] = { Idx, -1, -1, -1 };
+ int Mask[4] = { static_cast<int>(Idx), -1, -1, -1 };
EVT VVT = Op.getOperand(0).getValueType();
SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
DAG.getUNDEF(VVT), Mask);
SDValue N1 = Op.getOperand(1);
SDValue N2 = Op.getOperand(2);
+ if (VT.getSizeInBits() == 256)
+ return SDValue();
+
if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) &&
isa<ConstantSDNode>(N2)) {
unsigned Opc;
SDValue N1 = Op.getOperand(1);
SDValue N2 = Op.getOperand(2);
- // If this is a 256-bit vector result, first insert into a 128-bit
- // vector and then insert into the 256-bit vector.
- if (VT.getSizeInBits() > 128) {
+ // If this is a 256-bit vector result, first extract the 128-bit vector,
+ // insert the element into the extracted half and then place it back.
+ if (VT.getSizeInBits() == 256) {
if (!isa<ConstantSDNode>(N2))
return SDValue();
- // Get the 128-bit vector.
+ // Get the desired 128-bit vector half.
unsigned NumElems = VT.getVectorNumElements();
unsigned IdxVal = cast<ConstantSDNode>(N2)->getZExtValue();
- bool Upper = IdxVal >= NumElems / 2;
+ bool Upper = IdxVal >= NumElems/2;
+ SDValue Ins128Idx = DAG.getConstant(Upper ? NumElems/2 : 0, MVT::i32);
+ SDValue V = Extract128BitVector(N0, Ins128Idx, DAG, dl);
- SDValue SubN0 = Extract128BitVector(N0, N2, DAG, dl);
+ // Insert the element into the desired half.
+ V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V,
+ N1, Upper ? DAG.getConstant(IdxVal-NumElems/2, MVT::i32) : N2);
- // Insert into it.
- SDValue ScaledN2 = N2;
- if (Upper)
- ScaledN2 = DAG.getNode(ISD::SUB, dl, N2.getValueType(), N2,
- DAG.getConstant(NumElems /
- (VT.getSizeInBits() / 128),
- N2.getValueType()));
- Op = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubN0.getValueType(), SubN0,
- N1, ScaledN2);
-
- // Insert the 128-bit vector
- // FIXME: Why UNDEF?
- return Insert128BitVector(N0, Op, N2, DAG, dl);
+ // Insert the changed part back to the 256-bit vector
+ return Insert128BitVector(N0, V, Ins128Idx, DAG, dl);
}
- if (Subtarget->hasSSE41())
+ if (Subtarget->hasSSE41() || Subtarget->hasAVX())
return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG);
if (EltVT == MVT::i8)
CodeModel::Model M = getTargetMachine().getCodeModel();
if (Subtarget->isPICStyleRIPRel() &&
- (M == CodeModel::Small || M == CodeModel::Kernel))
+ (M == CodeModel::Small || M == CodeModel::Kernel)) {
+ if (Subtarget->isTargetDarwin() || Subtarget->isTargetELF())
+ OpFlag = X86II::MO_GOTPCREL;
WrapperKind = X86ISD::WrapperRIP;
- else if (Subtarget->isPICStyleGOT())
- OpFlag = X86II::MO_GOTOFF;
- else if (Subtarget->isPICStyleStubPIC())
- OpFlag = X86II::MO_PIC_BASE_OFFSET;
+ } else if (Subtarget->isPICStyleGOT()) {
+ OpFlag = X86II::MO_GOT;
+ } else if (Subtarget->isPICStyleStubPIC()) {
+ OpFlag = X86II::MO_DARWIN_NONLAZY_PIC_BASE;
+ } else if (Subtarget->isPICStyleStubNoDynamic()) {
+ OpFlag = X86II::MO_DARWIN_NONLAZY;
+ }
SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag);
Result);
}
+ // For symbols that require a load from a stub to get the address, emit the
+ // load.
+ if (isGlobalStubReference(OpFlag))
+ Result = DAG.getLoad(getPointerTy(), DL, DAG.getEntryNode(), Result,
+ MachinePointerInfo::getGOT(), false, false, 0);
+
return Result;
}
// Load the 32-bit value into an XMM register.
SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
- DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
- Op.getOperand(0),
- DAG.getIntPtrConstant(0)));
+ Op.getOperand(0));
+
+ // Zero out the upper parts of the register.
+ Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget->hasXMMInt(),
+ DAG);
Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Load),
}
SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
+
+ if (Op.getValueType().isVector()) return LowerVSETCC(Op, DAG);
+
assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer");
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
DAG.getConstant(X86CC, MVT::i8), EFLAGS);
}
+// Lower256IntVETCC - Break a VSETCC 256-bit integer VSETCC into two new 128
+// ones, and then concatenate the result back.
+static SDValue Lower256IntVETCC(SDValue Op, SelectionDAG &DAG) {
+ EVT VT = Op.getValueType();
+
+ assert(VT.getSizeInBits() == 256 && Op.getOpcode() == ISD::SETCC &&
+ "Unsupported value type for operation");
+
+ int NumElems = VT.getVectorNumElements();
+ DebugLoc dl = Op.getDebugLoc();
+ SDValue CC = Op.getOperand(2);
+ SDValue Idx0 = DAG.getConstant(0, MVT::i32);
+ SDValue Idx1 = DAG.getConstant(NumElems/2, MVT::i32);
+
+ // Extract the LHS vectors
+ SDValue LHS = Op.getOperand(0);
+ SDValue LHS1 = Extract128BitVector(LHS, Idx0, DAG, dl);
+ SDValue LHS2 = Extract128BitVector(LHS, Idx1, DAG, dl);
+
+ // Extract the RHS vectors
+ SDValue RHS = Op.getOperand(1);
+ SDValue RHS1 = Extract128BitVector(RHS, Idx0, DAG, dl);
+ SDValue RHS2 = Extract128BitVector(RHS, Idx1, DAG, dl);
+
+ // Issue the operation on the smaller types and concatenate the result back
+ MVT EltVT = VT.getVectorElementType().getSimpleVT();
+ EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
+ DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
+ DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
+}
+
+
SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const {
SDValue Cond;
SDValue Op0 = Op.getOperand(0);
if (isFP) {
unsigned SSECC = 8;
- EVT VT0 = Op0.getValueType();
- assert(VT0 == MVT::v4f32 || VT0 == MVT::v2f64);
- unsigned Opc = VT0 == MVT::v4f32 ? X86ISD::CMPPS : X86ISD::CMPPD;
+ EVT EltVT = Op0.getValueType().getVectorElementType();
+ assert(EltVT == MVT::f32 || EltVT == MVT::f64);
+
+ unsigned Opc = EltVT == MVT::f32 ? X86ISD::CMPPS : X86ISD::CMPPD;
bool Swap = false;
+ // SSE Condition code mapping:
+ // 0 - EQ
+ // 1 - LT
+ // 2 - LE
+ // 3 - UNORD
+ // 4 - NEQ
+ // 5 - NLT
+ // 6 - NLE
+ // 7 - ORD
switch (SetCCOpcode) {
default: break;
case ISD::SETOEQ:
return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, MVT::i8));
}
+ // Break 256-bit integer vector compare into smaller ones.
+ if (!isFP && VT.getSizeInBits() == 256)
+ return Lower256IntVETCC(Op, DAG);
+
// We are handling one of the integer comparisons here. Since SSE only has
// GT and EQ comparisons for integer, swapping operands and multiple
// operations may be required for some comparisons.
SDValue
X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SelectionDAG &DAG) const {
- assert((Subtarget->isTargetCygMing() || Subtarget->isTargetWindows()) &&
- "This should be used only on Windows targets");
- assert(!Subtarget->isTargetEnvMacho());
+ assert((Subtarget->isTargetCygMing() || Subtarget->isTargetWindows() ||
+ EnableSegmentedStacks) &&
+ "This should be used only on Windows targets or when segmented stacks "
+ "are being used");
+ assert(!Subtarget->isTargetEnvMacho() && "Not implemented");
DebugLoc dl = Op.getDebugLoc();
// Get the inputs.
SDValue Size = Op.getOperand(1);
// FIXME: Ensure alignment here
- SDValue Flag;
+ bool Is64Bit = Subtarget->is64Bit();
+ EVT SPTy = Is64Bit ? MVT::i64 : MVT::i32;
- EVT SPTy = Subtarget->is64Bit() ? MVT::i64 : MVT::i32;
- unsigned Reg = (Subtarget->is64Bit() ? X86::RAX : X86::EAX);
+ if (EnableSegmentedStacks) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
- Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag);
- Flag = Chain.getValue(1);
+ if (Is64Bit) {
+ // The 64 bit implementation of segmented stacks needs to clobber both r10
+ // r11. This makes it impossible to use it along with nested parameters.
+ const Function *F = MF.getFunction();
+
+ for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
+ I != E; I++)
+ if (I->hasNestAttr())
+ report_fatal_error("Cannot use segmented stacks with functions that "
+ "have nested arguments.");
+ }
+
+ const TargetRegisterClass *AddrRegClass =
+ getRegClassFor(Subtarget->is64Bit() ? MVT::i64:MVT::i32);
+ unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
+ Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
+ SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
+ DAG.getRegister(Vreg, SPTy));
+ SDValue Ops1[2] = { Value, Chain };
+ return DAG.getMergeValues(Ops1, 2, dl);
+ } else {
+ SDValue Flag;
+ unsigned Reg = (Subtarget->is64Bit() ? X86::RAX : X86::EAX);
- SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+ Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag);
+ Flag = Chain.getValue(1);
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
- Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag);
- Flag = Chain.getValue(1);
+ Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag);
+ Flag = Chain.getValue(1);
- Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, SPTy).getValue(1);
+ Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, SPTy).getValue(1);
- SDValue Ops1[2] = { Chain.getValue(0), Chain };
- return DAG.getMergeValues(Ops1, 2, dl);
+ SDValue Ops1[2] = { Chain.getValue(0), Chain };
+ return DAG.getMergeValues(Ops1, 2, dl);
+ }
}
SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
Chain, DAG.getRegister(StoreAddrReg, getPointerTy()));
}
-SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op,
- SelectionDAG &DAG) const {
+SDValue X86TargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op,
+ SelectionDAG &DAG) const {
+ return Op.getOperand(0);
+}
+
+SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
+ SelectionDAG &DAG) const {
SDValue Root = Op.getOperand(0);
SDValue Trmp = Op.getOperand(1); // trampoline
SDValue FPtr = Op.getOperand(2); // nested function
MachinePointerInfo(TrmpAddr, 22),
false, false, 0);
- SDValue Ops[] =
- { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6) };
- return DAG.getMergeValues(Ops, 2, dl);
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6);
} else {
const Function *Func =
cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
MachinePointerInfo(TrmpAddr, 6),
false, false, 1);
- SDValue Ops[] =
- { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4) };
- return DAG.getMergeValues(Ops, 2, dl);
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4);
}
}
return Op;
}
-SDValue X86TargetLowering::LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) const {
+// Lower256IntArith - Break a 256-bit integer operation into two new 128-bit
+// ones, and then concatenate the result back.
+static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
+ EVT VT = Op.getValueType();
+
+ assert(VT.getSizeInBits() == 256 && VT.isInteger() &&
+ "Unsupported value type for operation");
+
+ int NumElems = VT.getVectorNumElements();
+ DebugLoc dl = Op.getDebugLoc();
+ SDValue Idx0 = DAG.getConstant(0, MVT::i32);
+ SDValue Idx1 = DAG.getConstant(NumElems/2, MVT::i32);
+
+ // Extract the LHS vectors
+ SDValue LHS = Op.getOperand(0);
+ SDValue LHS1 = Extract128BitVector(LHS, Idx0, DAG, dl);
+ SDValue LHS2 = Extract128BitVector(LHS, Idx1, DAG, dl);
+
+ // Extract the RHS vectors
+ SDValue RHS = Op.getOperand(1);
+ SDValue RHS1 = Extract128BitVector(RHS, Idx0, DAG, dl);
+ SDValue RHS2 = Extract128BitVector(RHS, Idx1, DAG, dl);
+
+ MVT EltVT = VT.getVectorElementType().getSimpleVT();
+ EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
+ DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
+ DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
+}
+
+SDValue X86TargetLowering::LowerADD(SDValue Op, SelectionDAG &DAG) const {
+ assert(Op.getValueType().getSizeInBits() == 256 &&
+ Op.getValueType().isInteger() &&
+ "Only handle AVX 256-bit vector integer operation");
+ return Lower256IntArith(Op, DAG);
+}
+
+SDValue X86TargetLowering::LowerSUB(SDValue Op, SelectionDAG &DAG) const {
+ assert(Op.getValueType().getSizeInBits() == 256 &&
+ Op.getValueType().isInteger() &&
+ "Only handle AVX 256-bit vector integer operation");
+ return Lower256IntArith(Op, DAG);
+}
+
+SDValue X86TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
+
+ // Decompose 256-bit ops into smaller 128-bit ops.
+ if (VT.getSizeInBits() == 256)
+ return Lower256IntArith(Op, DAG);
+
assert(VT == MVT::v2i64 && "Only know how to lower V2I64 multiply");
DebugLoc dl = Op.getDebugLoc();
DebugLoc dl = Op.getDebugLoc();
SDValue R = Op.getOperand(0);
SDValue Amt = Op.getOperand(1);
-
LLVMContext *Context = DAG.getContext();
- // Must have SSE2.
- if (!Subtarget->hasSSE2()) return SDValue();
+ if (!Subtarget->hasXMMInt())
+ return SDValue();
+
+ // Decompose 256-bit shifts into smaller 128-bit shifts.
+ if (VT.getSizeInBits() == 256) {
+ int NumElems = VT.getVectorNumElements();
+ MVT EltVT = VT.getVectorElementType().getSimpleVT();
+ EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
+
+ // Extract the two vectors
+ SDValue V1 = Extract128BitVector(R, DAG.getConstant(0, MVT::i32), DAG, dl);
+ SDValue V2 = Extract128BitVector(R, DAG.getConstant(NumElems/2, MVT::i32),
+ DAG, dl);
+
+ // Recreate the shift amount vectors
+ SDValue Amt1, Amt2;
+ if (Amt.getOpcode() == ISD::BUILD_VECTOR) {
+ // Constant shift amount
+ SmallVector<SDValue, 4> Amt1Csts;
+ SmallVector<SDValue, 4> Amt2Csts;
+ for (int i = 0; i < NumElems/2; ++i)
+ Amt1Csts.push_back(Amt->getOperand(i));
+ for (int i = NumElems/2; i < NumElems; ++i)
+ Amt2Csts.push_back(Amt->getOperand(i));
+
+ Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT,
+ &Amt1Csts[0], NumElems/2);
+ Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT,
+ &Amt2Csts[0], NumElems/2);
+ } else {
+ // Variable shift amount
+ Amt1 = Extract128BitVector(Amt, DAG.getConstant(0, MVT::i32), DAG, dl);
+ Amt2 = Extract128BitVector(Amt, DAG.getConstant(NumElems/2, MVT::i32),
+ DAG, dl);
+ }
+
+ // Issue new vector shifts for the smaller types
+ V1 = DAG.getNode(Op.getOpcode(), dl, NewVT, V1, Amt1);
+ V2 = DAG.getNode(Op.getOpcode(), dl, NewVT, V2, Amt2);
+
+ // Concatenate the result back
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, V1, V2);
+ }
// Optimize shl/srl/sra with constant shift amount.
if (isSplatVector(Amt.getNode())) {
}
// Lower SHL with variable shift amount.
- // Cannot lower SHL without SSE2 or later.
- if (!Subtarget->hasSSE2()) return SDValue();
-
if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32),
M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M,
DAG.getConstant(4, MVT::i32));
- R = DAG.getNode(X86ISD::PBLENDVB, dl, VT, R, M, Op);
+ R = DAG.getNode(ISD::VSELECT, dl, VT, Op, R, M);
// a += a
Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M,
DAG.getConstant(2, MVT::i32));
- R = DAG.getNode(X86ISD::PBLENDVB, dl, VT, R, M, Op);
+ R = DAG.getNode(ISD::VSELECT, dl, VT, Op, R, M);
// a += a
Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
// return pblendv(r, r+r, a);
- R = DAG.getNode(X86ISD::PBLENDVB, dl, VT,
- R, DAG.getNode(ISD::ADD, dl, VT, R, R), Op);
+ R = DAG.getNode(ISD::VSELECT, dl, VT, Op,
+ R, DAG.getNode(ISD::ADD, dl, VT, R, R));
return R;
}
return SDValue();
SDNode* Node = Op.getNode();
EVT ExtraVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
EVT VT = Node->getValueType(0);
-
- if (Subtarget->hasSSE2() && VT.isVector()) {
+ if (Subtarget->hasXMMInt() && VT.isVector()) {
unsigned BitsDiff = VT.getScalarType().getSizeInBits() -
ExtraVT.getScalarType().getSizeInBits();
SDValue ShAmt = DAG.getConstant(BitsDiff, MVT::i32);
// Go ahead and emit the fence on x86-64 even if we asked for no-sse2.
// There isn't any reason to disable it if the target processor supports it.
- if (!Subtarget->hasSSE2() && !Subtarget->is64Bit()) {
+ if (!Subtarget->hasXMMInt() && !Subtarget->is64Bit()) {
SDValue Chain = Op.getOperand(0);
SDValue Zero = DAG.getConstant(0, MVT::i32);
SDValue Ops[] = {
return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
}
+SDValue X86TargetLowering::LowerATOMIC_FENCE(SDValue Op,
+ SelectionDAG &DAG) const {
+ DebugLoc dl = Op.getDebugLoc();
+ AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
+ cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
+ SynchronizationScope FenceScope = static_cast<SynchronizationScope>(
+ cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
+
+ // The only fence that needs an instruction is a sequentially-consistent
+ // cross-thread fence.
+ if (FenceOrdering == SequentiallyConsistent && FenceScope == CrossThread) {
+ // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for
+ // no-sse2). There isn't any reason to disable it if the target processor
+ // supports it.
+ if (Subtarget->hasXMMInt() || Subtarget->is64Bit())
+ return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
+
+ SDValue Chain = Op.getOperand(0);
+ SDValue Zero = DAG.getConstant(0, MVT::i32);
+ SDValue Ops[] = {
+ DAG.getRegister(X86::ESP, MVT::i32), // Base
+ DAG.getTargetConstant(1, MVT::i8), // Scale
+ DAG.getRegister(0, MVT::i32), // Index
+ DAG.getTargetConstant(0, MVT::i32), // Disp
+ DAG.getRegister(0, MVT::i32), // Segment.
+ Zero,
+ Chain
+ };
+ SDNode *Res =
+ DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops,
+ array_lengthof(Ops));
+ return SDValue(Res, 0);
+ }
+
+ // MEMBARRIER is a compiler barrier; it codegens to a no-op.
+ return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
+}
+
+
SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
EVT T = Op.getValueType();
DebugLoc DL = Op.getDebugLoc();
SelectionDAG &DAG) const {
EVT SrcVT = Op.getOperand(0).getValueType();
EVT DstVT = Op.getValueType();
- assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() &&
+ assert(Subtarget->is64Bit() && !Subtarget->hasXMMInt() &&
Subtarget->hasMMX() && "Unexpected custom BITCAST");
assert((DstVT == MVT::i64 ||
(DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
Node->getOperand(0),
Node->getOperand(1), negOp,
cast<AtomicSDNode>(Node)->getSrcValue(),
- cast<AtomicSDNode>(Node)->getAlignment());
+ cast<AtomicSDNode>(Node)->getAlignment(),
+ cast<AtomicSDNode>(Node)->getOrdering(),
+ cast<AtomicSDNode>(Node)->getSynchScope());
+}
+
+static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
+ SDNode *Node = Op.getNode();
+ DebugLoc dl = Node->getDebugLoc();
+ EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
+
+ // Convert seq_cst store -> xchg
+ // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
+ // FIXME: On 32-bit, store -> fist or movq would be more efficient
+ // (The only way to get a 16-byte store is cmpxchg16b)
+ // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
+ if (cast<AtomicSDNode>(Node)->getOrdering() == SequentiallyConsistent ||
+ !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
+ SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
+ cast<AtomicSDNode>(Node)->getMemoryVT(),
+ Node->getOperand(0),
+ Node->getOperand(1), Node->getOperand(2),
+ cast<AtomicSDNode>(Node)->getMemOperand(),
+ cast<AtomicSDNode>(Node)->getOrdering(),
+ cast<AtomicSDNode>(Node)->getSynchScope());
+ return Swap.getValue(1);
+ }
+ // Other atomic stores have a simple pattern.
+ return Op;
}
static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
default: llvm_unreachable("Should not custom lower this!");
case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op,DAG);
case ISD::MEMBARRIER: return LowerMEMBARRIER(Op,DAG);
+ case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op,DAG);
case ISD::ATOMIC_CMP_SWAP: return LowerCMP_SWAP(Op,DAG);
case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG);
+ case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op,DAG);
case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
case ISD::SETCC: return LowerSETCC(Op, DAG);
- case ISD::VSETCC: return LowerVSETCC(Op, DAG);
case ISD::SELECT: return LowerSELECT(Op, DAG);
case ISD::BRCOND: return LowerBRCOND(Op, DAG);
case ISD::JumpTable: return LowerJumpTable(Op, DAG);
return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
- case ISD::TRAMPOLINE: return LowerTRAMPOLINE(Op, DAG);
+ case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
+ case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
case ISD::CTLZ: return LowerCTLZ(Op, DAG);
case ISD::CTTZ: return LowerCTTZ(Op, DAG);
- case ISD::MUL: return LowerMUL_V2I64(Op, DAG);
+ case ISD::MUL: return LowerMUL(Op, DAG);
case ISD::SRA:
case ISD::SRL:
case ISD::SHL: return LowerShift(Op, DAG);
case ISD::ADDE:
case ISD::SUBC:
case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
+ case ISD::ADD: return LowerADD(Op, DAG);
+ case ISD::SUB: return LowerSUB(Op, DAG);
}
}
+static void ReplaceATOMIC_LOAD(SDNode *Node,
+ SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) {
+ DebugLoc dl = Node->getDebugLoc();
+ EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
+
+ // Convert wide load -> cmpxchg8b/cmpxchg16b
+ // FIXME: On 32-bit, load -> fild or movq would be more efficient
+ // (The only way to get a 16-byte load is cmpxchg16b)
+ // FIXME: 16-byte ATOMIC_CMP_SWAP isn't actually hooked up at the moment.
+ SDValue Zero = DAG.getConstant(0, VT);
+ SDValue Swap = DAG.getAtomic(ISD::ATOMIC_CMP_SWAP, dl, VT,
+ Node->getOperand(0),
+ Node->getOperand(1), Zero, Zero,
+ cast<AtomicSDNode>(Node)->getMemOperand(),
+ cast<AtomicSDNode>(Node)->getOrdering(),
+ cast<AtomicSDNode>(Node)->getSynchScope());
+ Results.push_back(Swap.getValue(0));
+ Results.push_back(Swap.getValue(1));
+}
+
void X86TargetLowering::
ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results,
SelectionDAG &DAG, unsigned NewOp) const {
}
case ISD::ATOMIC_CMP_SWAP: {
EVT T = N->getValueType(0);
- assert (T == MVT::i64 && "Only know how to expand i64 Cmp and Swap");
+ assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
+ bool Regs64bit = T == MVT::i128;
+ EVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
SDValue cpInL, cpInH;
- cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2),
- DAG.getConstant(0, MVT::i32));
- cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2),
- DAG.getConstant(1, MVT::i32));
- cpInL = DAG.getCopyToReg(N->getOperand(0), dl, X86::EAX, cpInL, SDValue());
- cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, X86::EDX, cpInH,
- cpInL.getValue(1));
+ cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
+ DAG.getConstant(0, HalfT));
+ cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
+ DAG.getConstant(1, HalfT));
+ cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
+ Regs64bit ? X86::RAX : X86::EAX,
+ cpInL, SDValue());
+ cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
+ Regs64bit ? X86::RDX : X86::EDX,
+ cpInH, cpInL.getValue(1));
SDValue swapInL, swapInH;
- swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3),
- DAG.getConstant(0, MVT::i32));
- swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3),
- DAG.getConstant(1, MVT::i32));
- swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl, X86::EBX, swapInL,
- cpInH.getValue(1));
- swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl, X86::ECX, swapInH,
- swapInL.getValue(1));
+ swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
+ DAG.getConstant(0, HalfT));
+ swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
+ DAG.getConstant(1, HalfT));
+ swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl,
+ Regs64bit ? X86::RBX : X86::EBX,
+ swapInL, cpInH.getValue(1));
+ swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl,
+ Regs64bit ? X86::RCX : X86::ECX,
+ swapInH, swapInL.getValue(1));
SDValue Ops[] = { swapInH.getValue(0),
N->getOperand(1),
swapInH.getValue(1) };
SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
- SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys,
+ unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_DAG :
+ X86ISD::LCMPXCHG8_DAG;
+ SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys,
Ops, 3, T, MMO);
- SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, X86::EAX,
- MVT::i32, Result.getValue(1));
- SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, X86::EDX,
- MVT::i32, cpOutL.getValue(2));
+ SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
+ Regs64bit ? X86::RAX : X86::EAX,
+ HalfT, Result.getValue(1));
+ SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
+ Regs64bit ? X86::RDX : X86::EDX,
+ HalfT, cpOutL.getValue(2));
SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
- Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
+ Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF, 2));
Results.push_back(cpOutH.getValue(1));
return;
}
case ISD::ATOMIC_SWAP:
ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSWAP64_DAG);
return;
+ case ISD::ATOMIC_LOAD:
+ ReplaceATOMIC_LOAD(N, Results, DAG);
}
}
case X86ISD::PSIGNB: return "X86ISD::PSIGNB";
case X86ISD::PSIGNW: return "X86ISD::PSIGNW";
case X86ISD::PSIGND: return "X86ISD::PSIGND";
- case X86ISD::PBLENDVB: return "X86ISD::PBLENDVB";
case X86ISD::FMAX: return "X86ISD::FMAX";
case X86ISD::FMIN: return "X86ISD::FMIN";
case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
case X86ISD::PUNPCKHWD: return "X86ISD::PUNPCKHWD";
case X86ISD::PUNPCKHDQ: return "X86ISD::PUNPCKHDQ";
case X86ISD::PUNPCKHQDQ: return "X86ISD::PUNPCKHQDQ";
- case X86ISD::VPERMIL: return "X86ISD::VPERMIL";
+ case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST";
+ case X86ISD::VPERMILPS: return "X86ISD::VPERMILPS";
+ case X86ISD::VPERMILPSY: return "X86ISD::VPERMILPSY";
+ case X86ISD::VPERMILPD: return "X86ISD::VPERMILPD";
+ case X86ISD::VPERMILPDY: return "X86ISD::VPERMILPDY";
+ case X86ISD::VPERM2F128: return "X86ISD::VPERM2F128";
case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
case X86ISD::VAARG_64: return "X86ISD::VAARG_64";
case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA";
+ case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER";
+ case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA";
}
}
EVT VT) const {
// Very little shuffling can be done for 64-bit vectors right now.
if (VT.getSizeInBits() == 64)
- return isPALIGNRMask(M, VT, Subtarget->hasSSSE3());
+ return isPALIGNRMask(M, VT, Subtarget->hasSSSE3() || Subtarget->hasAVX());
// FIXME: pshufb, blends, shifts.
return (VT.getVectorNumElements() == 2 ||
isPSHUFDMask(M, VT) ||
isPSHUFHWMask(M, VT) ||
isPSHUFLWMask(M, VT) ||
- isPALIGNRMask(M, VT, Subtarget->hasSSSE3()) ||
+ isPALIGNRMask(M, VT, Subtarget->hasSSSE3() || Subtarget->hasAVX()) ||
isUNPCKLMask(M, VT) ||
isUNPCKHMask(M, VT) ||
isUNPCKL_v_undef_Mask(M, VT) ||
if (!(Op.isReg() && Op.isImplicit()))
MIB.addOperand(Op);
}
- BuildMI(*BB, MI, dl, TII->get(X86::MOVAPSrr), MI->getOperand(0).getReg())
+ BuildMI(*BB, MI, dl,
+ TII->get(Subtarget->hasAVX() ? X86::VMOVAPSrr : X86::MOVAPSrr),
+ MI->getOperand(0).getReg())
.addReg(X86::XMM0);
MI->eraseFromParent();
MBB->addSuccessor(EndMBB);
}
+ unsigned MOVOpc = Subtarget->hasAVX() ? X86::VMOVAPSmr : X86::MOVAPSmr;
// In the XMM save block, save all the XMM argument registers.
for (int i = 3, e = MI->getNumOperands(); i != e; ++i) {
int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
MachinePointerInfo::getFixedStack(RegSaveFrameIndex, Offset),
MachineMemOperand::MOStore,
/*Size=*/16, /*Align=*/16);
- BuildMI(XMMSaveMBB, DL, TII->get(X86::MOVAPSmr))
+ BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
.addFrameIndex(RegSaveFrameIndex)
.addImm(/*Scale=*/1)
.addReg(/*IndexReg=*/0)
// If the EFLAGS register isn't dead in the terminator, then claim that it's
// live into the sink and copy blocks.
- const MachineFunction *MF = BB->getParent();
- const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo();
- BitVector ReservedRegs = TRI->getReservedRegs(*MF);
-
- for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) {
- const MachineOperand &MO = MI->getOperand(I);
- if (!MO.isReg() || !MO.isUse() || MO.isKill()) continue;
- unsigned Reg = MO.getReg();
- if (Reg != X86::EFLAGS) continue;
- copy0MBB->addLiveIn(Reg);
- sinkMBB->addLiveIn(Reg);
+ if (!MI->killsRegister(X86::EFLAGS)) {
+ copy0MBB->addLiveIn(X86::EFLAGS);
+ sinkMBB->addLiveIn(X86::EFLAGS);
}
// Transfer the remainder of BB and its successor edges to sinkMBB.
return sinkMBB;
}
+MachineBasicBlock *
+X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB,
+ bool Is64Bit) const {
+ const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+ DebugLoc DL = MI->getDebugLoc();
+ MachineFunction *MF = BB->getParent();
+ const BasicBlock *LLVM_BB = BB->getBasicBlock();
+
+ assert(EnableSegmentedStacks);
+
+ unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
+ unsigned TlsOffset = Is64Bit ? 0x70 : 0x30;
+
+ // BB:
+ // ... [Till the alloca]
+ // If stacklet is not large enough, jump to mallocMBB
+ //
+ // bumpMBB:
+ // Allocate by subtracting from RSP
+ // Jump to continueMBB
+ //
+ // mallocMBB:
+ // Allocate by call to runtime
+ //
+ // continueMBB:
+ // ...
+ // [rest of original BB]
+ //
+
+ MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ const TargetRegisterClass *AddrRegClass =
+ getRegClassFor(Is64Bit ? MVT::i64:MVT::i32);
+
+ unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
+ bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
+ tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
+ sizeVReg = MI->getOperand(1).getReg(),
+ physSPReg = Is64Bit ? X86::RSP : X86::ESP;
+
+ MachineFunction::iterator MBBIter = BB;
+ ++MBBIter;
+
+ MF->insert(MBBIter, bumpMBB);
+ MF->insert(MBBIter, mallocMBB);
+ MF->insert(MBBIter, continueMBB);
+
+ continueMBB->splice(continueMBB->begin(), BB, llvm::next
+ (MachineBasicBlock::iterator(MI)), BB->end());
+ continueMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+ // Add code to the main basic block to check if the stack limit has been hit,
+ // and if so, jump to mallocMBB otherwise to bumpMBB.
+ BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
+ BuildMI(BB, DL, TII->get(Is64Bit ? X86::SUB64rr:X86::SUB32rr), tmpSPVReg)
+ .addReg(tmpSPVReg).addReg(sizeVReg);
+ BuildMI(BB, DL, TII->get(Is64Bit ? X86::CMP64mr:X86::CMP32mr))
+ .addReg(0).addImm(0).addReg(0).addImm(TlsOffset).addReg(TlsReg)
+ .addReg(tmpSPVReg);
+ BuildMI(BB, DL, TII->get(X86::JG_4)).addMBB(mallocMBB);
+
+ // bumpMBB simply decreases the stack pointer, since we know the current
+ // stacklet has enough space.
+ BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
+ .addReg(tmpSPVReg);
+ BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
+ .addReg(tmpSPVReg);
+ BuildMI(bumpMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB);
+
+ // Calls into a routine in libgcc to allocate more space from the heap.
+ if (Is64Bit) {
+ BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
+ .addReg(sizeVReg);
+ BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
+ .addExternalSymbol("__morestack_allocate_stack_space").addReg(X86::RDI);
+ } else {
+ BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
+ .addImm(12);
+ BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
+ BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
+ .addExternalSymbol("__morestack_allocate_stack_space");
+ }
+
+ if (!Is64Bit)
+ BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
+ .addImm(16);
+
+ BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
+ .addReg(Is64Bit ? X86::RAX : X86::EAX);
+ BuildMI(mallocMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB);
+
+ // Set up the CFG correctly.
+ BB->addSuccessor(bumpMBB);
+ BB->addSuccessor(mallocMBB);
+ mallocMBB->addSuccessor(continueMBB);
+ bumpMBB->addSuccessor(continueMBB);
+
+ // Take care of the PHI nodes.
+ BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
+ MI->getOperand(0).getReg())
+ .addReg(mallocPtrVReg).addMBB(mallocMBB)
+ .addReg(bumpSPPtrVReg).addMBB(bumpMBB);
+
+ // Delete the original pseudo instruction.
+ MI->eraseFromParent();
+
+ // And we're done.
+ return continueMBB;
+}
+
MachineBasicBlock *
X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI,
MachineBasicBlock *BB) const {
return BB;
case X86::WIN_ALLOCA:
return EmitLoweredWinAlloca(MI, BB);
+ case X86::SEG_ALLOCA_32:
+ return EmitLoweredSegAlloca(MI, BB, false);
+ case X86::SEG_ALLOCA_64:
+ return EmitLoweredSegAlloca(MI, BB, true);
case X86::TLSCall_32:
case X86::TLSCall_64:
return EmitLoweredTLSCall(MI, BB);
case X86::CMOV_V4F32:
case X86::CMOV_V2F64:
case X86::CMOV_V2I64:
+ case X86::CMOV_V8F32:
+ case X86::CMOV_V4F64:
+ case X86::CMOV_V4I64:
case X86::CMOV_GR16:
case X86::CMOV_GR32:
case X86::CMOV_RFP32:
return TargetLowering::isGAPlusOffset(N, GA, Offset);
}
+/// isShuffleHigh128VectorInsertLow - Checks whether the shuffle node is the
+/// same as extracting the high 128-bit part of 256-bit vector and then
+/// inserting the result into the low part of a new 256-bit vector
+static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) {
+ EVT VT = SVOp->getValueType(0);
+ int NumElems = VT.getVectorNumElements();
+
+ // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
+ for (int i = 0, j = NumElems/2; i < NumElems/2; ++i, ++j)
+ if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
+ SVOp->getMaskElt(j) >= 0)
+ return false;
+
+ return true;
+}
+
+/// isShuffleLow128VectorInsertHigh - Checks whether the shuffle node is the
+/// same as extracting the low 128-bit part of 256-bit vector and then
+/// inserting the result into the high part of a new 256-bit vector
+static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) {
+ EVT VT = SVOp->getValueType(0);
+ int NumElems = VT.getVectorNumElements();
+
+ // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
+ for (int i = NumElems/2, j = 0; i < NumElems; ++i, ++j)
+ if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
+ SVOp->getMaskElt(j) >= 0)
+ return false;
+
+ return true;
+}
+
/// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors.
static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
SDValue V1 = SVOp->getOperand(0);
SDValue V2 = SVOp->getOperand(1);
EVT VT = SVOp->getValueType(0);
+ int NumElems = VT.getVectorNumElements();
if (V1.getOpcode() == ISD::CONCAT_VECTORS &&
V2.getOpcode() == ISD::CONCAT_VECTORS) {
// To match the shuffle mask, the first half of the mask should
// be exactly the first vector, and all the rest a splat with the
// first element of the second one.
- int NumElems = VT.getVectorNumElements();
for (int i = 0; i < NumElems/2; ++i)
if (!isUndefOrEqual(SVOp->getMaskElt(i), i) ||
!isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems))
// Emit a zeroed vector and insert the desired subvector on its
// first half.
- SDValue Zeros = getZeroVector(VT, true /* HasSSE2 */, DAG, dl);
+ SDValue Zeros = getZeroVector(VT, true /* HasXMMInt */, DAG, dl);
SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0),
DAG.getConstant(0, MVT::i32), DAG, dl);
return DCI.CombineTo(N, InsV);
}
+ //===--------------------------------------------------------------------===//
+ // Combine some shuffles into subvector extracts and inserts:
+ //
+
+ // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
+ if (isShuffleHigh128VectorInsertLow(SVOp)) {
+ SDValue V = Extract128BitVector(V1, DAG.getConstant(NumElems/2, MVT::i32),
+ DAG, dl);
+ SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT),
+ V, DAG.getConstant(0, MVT::i32), DAG, dl);
+ return DCI.CombineTo(N, InsV);
+ }
+
+ // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
+ if (isShuffleLow128VectorInsertHigh(SVOp)) {
+ SDValue V = Extract128BitVector(V1, DAG.getConstant(0, MVT::i32), DAG, dl);
+ SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT),
+ V, DAG.getConstant(NumElems/2, MVT::i32), DAG, dl);
+ return DCI.CombineTo(N, InsV);
+ }
+
return SDValue();
}
/// PerformShuffleCombine - Performs several different shuffle combines.
static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI) {
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget *Subtarget) {
DebugLoc dl = N->getDebugLoc();
EVT VT = N->getValueType(0);
if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType()))
return SDValue();
- // Only handle pure VECTOR_SHUFFLE nodes.
- if (VT.getSizeInBits() == 256 && N->getOpcode() == ISD::VECTOR_SHUFFLE)
+ // Combine 256-bit vector shuffles. This is only profitable when in AVX mode
+ if (Subtarget->hasAVX() && VT.getSizeInBits() == 256 &&
+ N->getOpcode() == ISD::VECTOR_SHUFFLE)
return PerformShuffleCombine256(N, DAG, DCI);
// Only handle 128 wide vector from here on.
return SDValue();
}
-/// PerformSELECTCombine - Do target-specific dag combines on SELECT nodes.
+/// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT
+/// nodes.
static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
const X86Subtarget *Subtarget) {
DebugLoc DL = N->getDebugLoc();
// instructions match the semantics of the common C idiom x<y?x:y but not
// x<=y?x:y, because of how they handle negative zero (which can be
// ignored in unsafe-math mode).
- if (Subtarget->hasSSE2() &&
- (LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64) &&
- Cond.getOpcode() == ISD::SETCC) {
+ if (Subtarget->hasXMMInt() && Cond.getOpcode() == ISD::SETCC &&
+ (LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64 ||
+ LHS.getValueType() == MVT::v4f32 || LHS.getValueType() == MVT::v2f64)) {
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
unsigned Opcode = 0;
// Converting this to a max would handle comparisons between positive
// and negative zero incorrectly.
if (!UnsafeFPMath &&
- !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(LHS))
+ !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
break;
Opcode = X86ISD::FMAX;
break;
// all elements are shifted by the same amount. We can't do this in legalize
// because the a constant vector is typically transformed to a constant pool
// so we have no knowledge of the shift amount.
- if (!Subtarget->hasSSE2())
+ if (!Subtarget->hasXMMInt())
return SDValue();
if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16)
// SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
// we're requiring SSE2 for both.
- if (Subtarget->hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
+ if (Subtarget->hasXMMInt() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDValue CMP0 = N0->getOperand(1);
// Sometimes the operand may come from a insert_subvector building a 256-bit
// allones vector
- SDValue V1 = N->getOperand(0);
- SDValue V2 = N->getOperand(1);
-
if (VT.getSizeInBits() == 256 &&
- N->getOpcode() == ISD::INSERT_SUBVECTOR &&
- V1.getOpcode() == ISD::INSERT_SUBVECTOR &&
- V1.getOperand(0).getOpcode() == ISD::UNDEF &&
- ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) &&
- ISD::isBuildVectorAllOnes(V2.getNode()))
- return true;
+ N->getOpcode() == ISD::INSERT_SUBVECTOR) {
+ SDValue V1 = N->getOperand(0);
+ SDValue V2 = N->getOperand(1);
+
+ if (V1.getOpcode() == ISD::INSERT_SUBVECTOR &&
+ V1.getOperand(0).getOpcode() == ISD::UNDEF &&
+ ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) &&
+ ISD::isBuildVectorAllOnes(V2.getNode()))
+ return true;
+ }
return false;
}
SDValue N1 = N->getOperand(1);
// look for psign/blend
- if (Subtarget->hasSSSE3()) {
+ if (Subtarget->hasSSSE3() || Subtarget->hasAVX()) {
if (VT == MVT::v2i64) {
// Canonicalize pandn to RHS
if (N0.getOpcode() == X86ISD::ANDNP)
}
}
// PBLENDVB only available on SSE 4.1
- if (!Subtarget->hasSSE41())
+ if (!(Subtarget->hasSSE41() || Subtarget->hasAVX()))
return SDValue();
X = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, X);
Y = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Y);
Mask = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Mask);
- Mask = DAG.getNode(X86ISD::PBLENDVB, DL, MVT::v16i8, X, Y, Mask);
+ Mask = DAG.getNode(ISD::VSELECT, DL, MVT::v16i8, Mask, X, Y);
return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Mask);
}
}
/// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
const X86Subtarget *Subtarget) {
+ StoreSDNode *St = cast<StoreSDNode>(N);
+ EVT VT = St->getValue().getValueType();
+ EVT StVT = St->getMemoryVT();
+ DebugLoc dl = St->getDebugLoc();
+ SDValue StoredVal = St->getOperand(1);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+ // If we are saving a concatination of two XMM registers, perform two stores.
+ // This is better in Sandy Bridge cause one 256-bit mem op is done via two
+ // 128-bit ones. If in the future the cost becomes only one memory access the
+ // first version would be better.
+ if (VT.getSizeInBits() == 256 &&
+ StoredVal.getNode()->getOpcode() == ISD::CONCAT_VECTORS &&
+ StoredVal.getNumOperands() == 2) {
+
+ SDValue Value0 = StoredVal.getOperand(0);
+ SDValue Value1 = StoredVal.getOperand(1);
+
+ SDValue Stride = DAG.getConstant(16, TLI.getPointerTy());
+ SDValue Ptr0 = St->getBasePtr();
+ SDValue Ptr1 = DAG.getNode(ISD::ADD, dl, Ptr0.getValueType(), Ptr0, Stride);
+
+ SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0,
+ St->getPointerInfo(), St->isVolatile(),
+ St->isNonTemporal(), St->getAlignment());
+ SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1,
+ St->getPointerInfo(), St->isVolatile(),
+ St->isNonTemporal(), St->getAlignment());
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
+ }
+
+ // Optimize trunc store (of multiple scalars) to shuffle and store.
+ // First, pack all of the elements in one place. Next, store to memory
+ // in fewer chunks.
+ if (St->isTruncatingStore() && VT.isVector()) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ unsigned NumElems = VT.getVectorNumElements();
+ assert(StVT != VT && "Cannot truncate to the same type");
+ unsigned FromSz = VT.getVectorElementType().getSizeInBits();
+ unsigned ToSz = StVT.getVectorElementType().getSizeInBits();
+
+ // From, To sizes and ElemCount must be pow of two
+ if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
+ // We are going to use the original vector elt for storing.
+ // accumulated smaller vector elements must be a multiple of bigger size.
+ if (0 != (NumElems * ToSz) % FromSz) return SDValue();
+ unsigned SizeRatio = FromSz / ToSz;
+
+ assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
+
+ // Create a type on which we perform the shuffle
+ EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
+ StVT.getScalarType(), NumElems*SizeRatio);
+
+ assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
+
+ SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, St->getValue());
+ SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
+ for (unsigned i = 0; i < NumElems; i++ ) ShuffleVec[i] = i * SizeRatio;
+
+ // Can't shuffle using an illegal type
+ if (!TLI.isTypeLegal(WideVecVT)) return SDValue();
+
+ SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
+ DAG.getUNDEF(WideVec.getValueType()),
+ ShuffleVec.data());
+ // At this point all of the data is stored at the bottom of the
+ // register. We now need to save it to mem.
+
+ // Find the largest store unit
+ MVT StoreType = MVT::i8;
+ for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
+ tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
+ MVT Tp = (MVT::SimpleValueType)tp;
+ if (TLI.isTypeLegal(Tp) && StoreType.getSizeInBits() < NumElems * ToSz)
+ StoreType = Tp;
+ }
+
+ // Bitcast the original vector into a vector of store-size units
+ EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
+ StoreType, VT.getSizeInBits()/EVT(StoreType).getSizeInBits());
+ assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
+ SDValue ShuffWide = DAG.getNode(ISD::BITCAST, dl, StoreVecVT, Shuff);
+ SmallVector<SDValue, 8> Chains;
+ SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8,
+ TLI.getPointerTy());
+ SDValue Ptr = St->getBasePtr();
+
+ // Perform one or more big stores into memory.
+ for (unsigned i = 0; i < (ToSz*NumElems)/StoreType.getSizeInBits() ; i++) {
+ SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
+ StoreType, ShuffWide,
+ DAG.getIntPtrConstant(i));
+ SDValue Ch = DAG.getStore(St->getChain(), dl, SubVec, Ptr,
+ St->getPointerInfo(), St->isVolatile(),
+ St->isNonTemporal(), St->getAlignment());
+ Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
+ Chains.push_back(Ch);
+ }
+
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0],
+ Chains.size());
+ }
+
+
// Turn load->store of MMX types into GPR load/stores. This avoids clobbering
// the FP state in cases where an emms may be missing.
// A preferable solution to the general problem is to figure out the right
// places to insert EMMS. This qualifies as a quick hack.
// Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
- StoreSDNode *St = cast<StoreSDNode>(N);
- EVT VT = St->getValue().getValueType();
if (VT.getSizeInBits() != 64)
return SDValue();
const Function *F = DAG.getMachineFunction().getFunction();
bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat);
bool F64IsLegal = !UseSoftFloat && !NoImplicitFloatOps
- && Subtarget->hasSSE2();
+ && Subtarget->hasXMMInt();
if ((VT.isVector() ||
(VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) &&
isa<LoadSDNode>(St->getValue()) &&
// X86 can't encode an immediate LHS of a sub. See if we can push the
// negation into a preceding instruction.
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
- uint64_t Op0C = C->getSExtValue();
-
// If the RHS of the sub is a XOR with one use and a constant, invert the
// immediate. Then add one to the LHS of the sub so we can turn
// X-Y -> X+~Y+1, saving one register.
if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
isa<ConstantSDNode>(Op1.getOperand(1))) {
- uint64_t XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getSExtValue();
+ APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
EVT VT = Op0.getValueType();
SDValue NewXor = DAG.getNode(ISD::XOR, Op1.getDebugLoc(), VT,
Op1.getOperand(0),
DAG.getConstant(~XorC, VT));
return DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, NewXor,
- DAG.getConstant(Op0C+1, VT));
+ DAG.getConstant(C->getAPIntValue()+1, VT));
}
}
default: break;
case ISD::EXTRACT_VECTOR_ELT:
return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, *this);
+ case ISD::VSELECT:
case ISD::SELECT: return PerformSELECTCombine(N, DAG, Subtarget);
case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI);
case ISD::ADD: return OptimizeConditionalInDecrement(N, DAG);
case X86ISD::PSHUFLW:
case X86ISD::MOVSS:
case X86ISD::MOVSD:
- case X86ISD::VPERMIL:
- case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI);
+ case X86ISD::VPERMILPS:
+ case X86ISD::VPERMILPSY:
+ case X86ISD::VPERMILPD:
+ case X86ISD::VPERMILPDY:
+ case X86ISD::VPERM2F128:
+ case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget);
}
return SDValue();