static bool isTargetShuffle(unsigned Opcode) {
switch(Opcode) {
default: return false;
+ case X86ISD::BLENDI:
case X86ISD::PSHUFB:
case X86ISD::PSHUFD:
case X86ISD::PSHUFHW:
case X86ISD::MOVSD:
case X86ISD::UNPCKL:
case X86ISD::UNPCKH:
- case X86ISD::VPERMILP:
+ case X86ISD::VPERMILPI:
case X86ISD::VPERM2X128:
case X86ISD::VPERMI:
return true;
case X86ISD::PSHUFD:
case X86ISD::PSHUFHW:
case X86ISD::PSHUFLW:
- case X86ISD::VPERMILP:
+ case X86ISD::VPERMILPI:
case X86ISD::VPERMI:
return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8));
}
IsUnary = false;
bool IsFakeUnary = false;
switch(N->getOpcode()) {
+ case X86ISD::BLENDI:
+ ImmN = N->getOperand(N->getNumOperands()-1);
+ DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ break;
case X86ISD::SHUFP:
ImmN = N->getOperand(N->getNumOperands()-1);
DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
break;
case X86ISD::PSHUFD:
- case X86ISD::VPERMILP:
+ case X86ISD::VPERMILPI:
ImmN = N->getOperand(N->getNumOperands()-1);
DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
IsUnary = true;
SmallVector<uint64_t, 32> RawMask;
for (int i = 0, e = MaskNode->getNumOperands(); i < e; ++i) {
- auto *CN = dyn_cast<ConstantSDNode>(MaskNode->getOperand(i));
+ SDValue Op = MaskNode->getOperand(i);
+ if (Op->getOpcode() == ISD::UNDEF) {
+ RawMask.push_back((uint64_t)SM_SentinelUndef);
+ continue;
+ }
+ auto *CN = dyn_cast<ConstantSDNode>(Op.getNode());
if (!CN)
return false;
APInt MaskElement = CN->getAPIntValue();
if (!MaskCP || MaskCP->isMachineConstantPoolEntry())
return false;
- if (auto *C = dyn_cast<ConstantDataSequential>(MaskCP->getConstVal())) {
+ if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) {
// FIXME: Support AVX-512 here.
- if (!C->getType()->isVectorTy() ||
- (C->getNumElements() != 16 && C->getNumElements() != 32))
+ Type *Ty = C->getType();
+ if (!Ty->isVectorTy() || (Ty->getVectorNumElements() != 16 &&
+ Ty->getVectorNumElements() != 32))
return false;
- assert(C->getType()->isVectorTy() && "Expected a vector constant.");
DecodePSHUFBMask(C, Mask);
break;
}
/// or SDValue() otherwise.
static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
SelectionDAG &DAG) {
- if (!Subtarget->hasFp256())
+ // VBROADCAST requires AVX.
+ // TODO: Splats could be generated for non-AVX CPUs using SSE
+ // instructions, but there's less potential gain for only 128-bit vectors.
+ if (!Subtarget->hasAVX())
return SDValue();
MVT VT = Op.getSimpleValueType();
}
}
+ unsigned ScalarSize = Ld.getValueType().getSizeInBits();
bool IsGE256 = (VT.getSizeInBits() >= 256);
- // Handle the broadcasting a single constant scalar from the constant pool
- // into a vector. On Sandybridge it is still better to load a constant vector
+ // When optimizing for size, generate up to 5 extra bytes for a broadcast
+ // instruction to save 8 or more bytes of constant pool data.
+ // TODO: If multiple splats are generated to load the same constant,
+ // it may be detrimental to overall size. There needs to be a way to detect
+ // that condition to know if this is truly a size win.
+ const Function *F = DAG.getMachineFunction().getFunction();
+ bool OptForSize = F->getAttributes().
+ hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
+
+ // Handle broadcasting a single constant scalar from the constant pool
+ // into a vector.
+ // On Sandybridge (no AVX2), it is still better to load a constant vector
// from the constant pool and not to broadcast it from a scalar.
- if (ConstSplatVal && Subtarget->hasInt256()) {
+ // But override that restriction when optimizing for size.
+ // TODO: Check if splatting is recommended for other AVX-capable CPUs.
+ if (ConstSplatVal && (Subtarget->hasAVX2() || OptForSize)) {
EVT CVT = Ld.getValueType();
assert(!CVT.isVector() && "Must not broadcast a vector type");
- unsigned ScalarSize = CVT.getSizeInBits();
- if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)) {
+ // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
+ // For size optimization, also splat v2f64 and v2i64, and for size opt
+ // with AVX2, also splat i8 and i16.
+ // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
+ if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
+ (OptForSize && (ScalarSize == 64 || Subtarget->hasAVX2()))) {
const Constant *C = nullptr;
if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
C = CI->getConstantIntValue();
}
bool IsLoad = ISD::isNormalLoad(Ld.getNode());
- unsigned ScalarSize = Ld.getValueType().getSizeInBits();
// Handle AVX2 in-register broadcasts.
if (!IsLoad && Subtarget->hasInt256() &&
return true;
}
+/// \brief Report whether a shuffle mask moves any element across a 128-bit
+/// lane boundary.
+///
+/// X86 splits its shuffle instructions into in-lane and cross-lane forms, so
+/// this predicate is queried routinely. Undef (negative) mask entries are
+/// ignored, and indices that select from the second input are folded onto the
+/// first (modulo the mask size) before their source lane is computed.
+///
+/// \param VT the vector type; only its scalar size is used, to derive the
+/// number of elements per 128-bit lane.
+/// \param Mask the shuffle mask to inspect.
+/// \returns true if some defined entry reads from a lane other than the one it
+/// writes to.
+static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
+  const int NumElts = Mask.size();
+  const int EltsPerLane = 128 / VT.getScalarSizeInBits();
+  for (int Idx = 0; Idx != NumElts; ++Idx) {
+    const int M = Mask[Idx];
+    if (M < 0)
+      continue; // Undef entries cannot cross a lane.
+    const int SrcLane = (M % NumElts) / EltsPerLane;
+    const int DstLane = Idx / EltsPerLane;
+    if (SrcLane != DstLane)
+      return true;
+  }
+  return false;
+}
+
+/// \brief Test whether a shuffle mask is equivalent within each 128-bit lane.
+///
+/// This checks a shuffle mask to see if it is performing the same
+/// 128-bit lane-relative shuffle in each 128-bit lane. This trivially implies
+/// that it is also not lane-crossing. It may however involve a blend from the
+/// same lane of a second vector.
+///
+/// \param VT the vector type; only its scalar size is used, to derive the
+/// number of elements per 128-bit lane.
+/// \param Mask the shuffle mask; negative entries are undef and are skipped.
+/// \param RepeatedMask overwritten with the per-lane pattern, as it is
+/// non-trivial to compute in the face of undef lanes. Entries taken from the
+/// first input are lane-relative indices; entries taken from the second input
+/// are the lane-relative index plus Mask.size(). Slots that are undef in every
+/// lane stay -1. The representation is *not* suitable for use with existing
+/// 128-bit shuffles as it will contain entries from both V1 and V2 inputs to
+/// the wider mask.
+/// \returns true if every lane agrees on a single pattern. On a false return
+/// \p RepeatedMask may be partially populated and should be ignored.
+static bool
+is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
+                                SmallVectorImpl<int> &RepeatedMask) {
+  int LaneSize = 128 / VT.getScalarSizeInBits();
+  // Reset the output with assign() rather than resize(): resize() keeps
+  // whatever a reused vector already holds, and a stale non-negative entry
+  // would be treated below as an already-recorded lane slot.
+  RepeatedMask.assign(LaneSize, -1);
+  int Size = Mask.size();
+  for (int i = 0; i < Size; ++i) {
+    if (Mask[i] < 0)
+      continue;
+    if ((Mask[i] % Size) / LaneSize != i / LaneSize)
+      // This entry crosses lanes, so there is no way to model this shuffle.
+      return false;
+
+    // Ok, handle the in-lane shuffles by detecting if and when they repeat.
+    int &Slot = RepeatedMask[i % LaneSize];
+    if (Slot == -1)
+      // This is the first non-undef entry in this slot of a 128-bit lane.
+      // Second-input entries are offset by Size so they remain distinguishable.
+      Slot = Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + Size;
+    else if (Slot + (i / LaneSize) * LaneSize != Mask[i])
+      // Found a mismatch with the repeated mask.
+      return false;
+  }
+  return true;
+}
+
// Hide this symbol with an anonymous namespace instead of 'static' so that MSVC
// 2013 will allow us to use it as a non-type template parameter.
namespace {
/// that the shuffle mask is in fact a blend.
static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
+ const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
unsigned BlendMask = 0;
return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
DAG.getConstant(BlendMask, MVT::i8));
- case MVT::v8i16:
+ case MVT::v4i64:
+ case MVT::v8i32:
+ assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
+ // FALLTHROUGH
+ case MVT::v2i64:
case MVT::v4i32:
- case MVT::v2i64: {
+ // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
+ // that instruction.
+ if (Subtarget->hasAVX2()) {
+ // Scale the blend by the number of 32-bit dwords per element.
+ int Scale = VT.getScalarSizeInBits() / 32;
+ BlendMask = 0;
+ for (int i = 0, Size = Mask.size(); i < Size; ++i)
+ if (Mask[i] >= Size)
+ for (int j = 0; j < Scale; ++j)
+ BlendMask |= 1u << (i * Scale + j);
+
+ MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
+ V1 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V1);
+ V2 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V2);
+ return DAG.getNode(ISD::BITCAST, DL, VT,
+ DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
+ DAG.getConstant(BlendMask, MVT::i8)));
+ }
+ // FALLTHROUGH
+ case MVT::v8i16: {
// For integer shuffles we need to expand the mask and cast the inputs to
// v8i16s prior to blending.
int Scale = 8 / VT.getVectorNumElements();
DAG.getConstant(BlendMask, MVT::i8)));
}
+ case MVT::v16i16: {
+ assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
+ SmallVector<int, 8> RepeatedMask;
+ if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
+ // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
+ assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
+ BlendMask = 0;
+ for (int i = 0; i < 8; ++i)
+ if (RepeatedMask[i] >= 16)
+ BlendMask |= 1u << i;
+ return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
+ DAG.getConstant(BlendMask, MVT::i8));
+ }
+
+ // Fall back to a fully general variable byte blend.
+ SDValue PBLENDVMask[32];
+ // Scale the blend by the number of bytes per element.
+ int Scale = VT.getScalarSizeInBits() / 8;
+ assert(Mask.size() * Scale == 32 && "Not a 256-bit vector!");
+ for (int i = 0, Size = Mask.size(); i < Size; ++i)
+ for (int j = 0; j < Scale; ++j)
+ PBLENDVMask[Scale * i + j] =
+ Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
+ : DAG.getConstant(Mask[i] < Size ? 0 : 0x80, MVT::i8);
+
+ V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V1);
+ V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V2);
+ return DAG.getNode(
+ ISD::BITCAST, DL, VT,
+ DAG.getNode(ISD::VSELECT, DL, MVT::v32i8,
+ DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PBLENDVMask),
+ V1, V2));
+ }
+
default:
llvm_unreachable("Not a supported integer vector type!");
}
}
+/// \brief Lower a combined shuffle+blend as two single-input shuffles followed
+/// by a blend that does no further shuffling.
+///
+/// This matches the extremely common pattern for handling combined
+/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
+/// operations.
+///
+/// \param DL debug location attached to the new nodes.
+/// \param VT the vector type of the inputs and of the result.
+/// \param V1, V2 the two shuffle inputs.
+/// \param Mask the shuffle mask; indices below Mask.size() select from \p V1,
+/// the rest select from \p V2, and negative entries are undef.
+/// \param DAG the DAG in which the nodes are created.
+/// \returns the vector shuffle node that blends the two pre-shuffled inputs.
+static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(SDLoc DL, MVT VT,
+                                                          SDValue V1,
+                                                          SDValue V2,
+                                                          ArrayRef<int> Mask,
+                                                          SelectionDAG &DAG) {
+  // One mask per input to put its elements in their final positions, plus an
+  // identity-position mask that picks each element from the proper input.
+  SmallVector<int, 32> V1Mask(Mask.size(), -1);
+  SmallVector<int, 32> V2Mask(Mask.size(), -1);
+  SmallVector<int, 32> BlendMask(Mask.size(), -1);
+  const int NumElts = Mask.size();
+  for (int Idx = 0; Idx != NumElts; ++Idx) {
+    const int M = Mask[Idx];
+    if (M < 0)
+      continue; // Undef stays undef in all three masks.
+    if (M < NumElts) {
+      V1Mask[Idx] = M;
+      BlendMask[Idx] = Idx;
+    } else {
+      V2Mask[Idx] = M - NumElts;
+      BlendMask[Idx] = Idx + NumElts;
+    }
+  }
+
+  SDValue ShuffledV1 =
+      DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
+  SDValue ShuffledV2 =
+      DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
+  return DAG.getVectorShuffle(VT, DL, ShuffledV1, ShuffledV2, BlendMask);
+}
+
/// \brief Try to lower a vector shuffle as a byte rotation.
///
/// We have a generic PALIGNR instruction in x86 that will do an arbitrary
if (Subtarget->hasAVX()) {
// If we have AVX, we can use VPERMILPS which will allow folding a load
// into the shuffle.
- return DAG.getNode(X86ISD::VPERMILP, DL, MVT::v2f64, V1,
+ return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
DAG.getConstant(SHUFPDMask, MVT::i8));
}
return Insertion;
if (Subtarget->hasSSE41())
- if (SDValue Blend =
- lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask, DAG))
+ if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
+ Subtarget, DAG))
return Blend;
unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
return Insertion;
if (Subtarget->hasSSE41())
- if (SDValue Blend =
- lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask, DAG))
+ if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
+ Subtarget, DAG))
return Blend;
// Try to use rotation instructions if available.
if (Subtarget->hasAVX()) {
// If we have AVX, we can use VPERMILPS which will allow folding a load
// into the shuffle.
- return DAG.getNode(X86ISD::VPERMILP, DL, MVT::v4f32, V1,
+ return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
getV4X86ShuffleImm8ForMask(Mask, DAG));
}
return V;
if (Subtarget->hasSSE41())
- if (SDValue Blend =
- lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask, DAG))
+ if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
+ Subtarget, DAG))
return Blend;
// Check for whether we can use INSERTPS to perform the blend. We only use
return V;
if (Subtarget->hasSSE41())
- if (SDValue Blend =
- lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask, DAG))
+ if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
+ Subtarget, DAG))
return Blend;
// Try to use rotation instructions if available.
return V;
if (Subtarget->hasSSE41())
- if (SDValue Blend =
- lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
+ if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
+ Subtarget, DAG))
return Blend;
// Try to use rotation instructions if available.
SDValue V2Mask[16];
for (int i = 0; i < 16; ++i)
if (Mask[i] == -1) {
- V1Mask[i] = V2Mask[i] = DAG.getConstant(0x80, MVT::i8);
+ V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
} else {
V1Mask[i] = DAG.getConstant(Mask[i] < 16 ? Mask[i] : 0x80, MVT::i8);
V2Mask[i] =
}
}
-/// \brief Test whether there are elements crossing 128-bit lanes in this
-/// shuffle mask.
-///
-/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
-/// and we routinely test for these.
-static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
- int LaneSize = 128 / VT.getScalarSizeInBits();
- int Size = Mask.size();
- for (int i = 0; i < Size; ++i)
- if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
- return true;
- return false;
-}
-
-/// \brief Test whether a shuffle mask is equivalent within each 128-bit lane.
-///
-/// This checks a shuffle mask to see if it is performing the same
-/// 128-bit lane-relative shuffle in each 128-bit lane. This trivially implies
-/// that it is also not lane-crossing.
-static bool is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
- int LaneSize = 128 / VT.getScalarSizeInBits();
- int Size = Mask.size();
- for (int i = LaneSize; i < Size; ++i)
- if (Mask[i] >= 0 && Mask[i] != (Mask[i % LaneSize] + (i / LaneSize) * LaneSize))
- return false;
- return true;
-}
-
/// \brief Generic routine to split a 256-bit vector shuffle into 128-bit
/// shuffles.
///
// interleaved permutation.
unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
- return DAG.getNode(X86ISD::VPERMILP, DL, MVT::v4f64, V1,
+ return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
DAG.getConstant(VPERMILPMask, MVT::i8));
}
MVT::v4f64, DL, V1, V2, Mask, Subtarget, DAG))
return Insertion;
- if (SDValue Blend =
- lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask, DAG))
+ if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
+ Subtarget, DAG))
return Blend;
// Check if the blend happens to exactly fit that of SHUFPD.
DAG.getConstant(SHUFPDMask, MVT::i8));
}
- // Shuffle the input elements into the desired positions in V1 and V2 and
- // blend them together.
- int V1Mask[] = {-1, -1, -1, -1};
- int V2Mask[] = {-1, -1, -1, -1};
- for (int i = 0; i < 4; ++i)
- if (Mask[i] >= 0 && Mask[i] < 4)
- V1Mask[i] = Mask[i];
- else if (Mask[i] >= 4)
- V2Mask[i] = Mask[i] - 4;
+ // Otherwise fall back on generic blend lowering.
+ return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
+ Mask, DAG);
+}
+
+/// \brief Handle lowering of 4-lane 64-bit integer shuffles.
+///
+/// Requires AVX2 (asserted). Tries a blend, lane-repeated PSHUFD/UNPCK, and
+/// single-input VPERMI before the decomposed shuffle + blend fallback.
+static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+ ArrayRef<int> Mask = SVOp->getMask();
+ assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
+ assert(Subtarget->hasAVX2() && "We can only lower v4i64 with AVX2!");
- V1 = DAG.getVectorShuffle(MVT::v4f64, DL, V1, DAG.getUNDEF(MVT::v4f64), V1Mask);
- V2 = DAG.getVectorShuffle(MVT::v4f64, DL, V2, DAG.getUNDEF(MVT::v4f64), V2Mask);
+ if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
+ Subtarget, DAG))
+ return Blend;
- unsigned BlendMask = 0;
- for (int i = 0; i < 4; ++i)
- if (Mask[i] >= 4)
- BlendMask |= 1 << i;
+ // When the shuffle is mirrored between the 128-bit lanes of the unit, we can
+ // use lower latency in-lane instructions (PSHUFD on v8i32, UNPCKL/UNPCKH).
+ SmallVector<int, 2> RepeatedMask;
+ if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
+ if (isSingleInputShuffleMask(Mask)) {
+ int PSHUFDMask[] = {-1, -1, -1, -1};
+ for (int i = 0; i < 2; ++i)
+ if (RepeatedMask[i] >= 0) {
+ PSHUFDMask[2 * i] = 2 * RepeatedMask[i];
+ PSHUFDMask[2 * i + 1] = 2 * RepeatedMask[i] + 1;
+ }
+ return DAG.getNode(
+ ISD::BITCAST, DL, MVT::v4i64,
+ DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
+ DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, V1),
+ getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
+ }
+
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (isShuffleEquivalent(Mask, 0, 4, 2, 6))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V1, V2);
+ if (isShuffleEquivalent(Mask, 1, 5, 3, 7))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V1, V2);
+ }
- return DAG.getNode(X86ISD::BLENDI, DL, MVT::v4f64, V1, V2,
- DAG.getConstant(BlendMask, MVT::i8));
+ // AVX2 provides a direct instruction for permuting a single input across
+ // lanes; the 4-element mask is encoded directly as its 8-bit immediate.
+ if (isSingleInputShuffleMask(Mask))
+ return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
+ getV4X86ShuffleImm8ForMask(Mask, DAG));
+
+ // Otherwise fall back on generic blend lowering.
+ return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
+ Mask, DAG);
}
/// \brief Handle lowering of 8-lane 32-bit floating point shuffles.
if (is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
return splitAndLower256BitVectorShuffle(Op, V1, V2, Subtarget, DAG);
- if (SDValue Blend =
- lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask, DAG))
+ if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
+ Subtarget, DAG))
return Blend;
// If the shuffle mask is repeated in each 128-bit lane, we have many more
// options to efficiently lower the shuffle.
- if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask)) {
- ArrayRef<int> LoMask = Mask.slice(0, 4);
+ SmallVector<int, 2> RepeatedMask;
+ if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
if (isSingleInputShuffleMask(Mask))
- return DAG.getNode(X86ISD::VPERMILP, DL, MVT::v8f32, V1,
- getV4X86ShuffleImm8ForMask(LoMask, DAG));
+ return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
+ getV4X86ShuffleImm8ForMask(RepeatedMask, DAG));
// Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(LoMask, 0, 8, 1, 9))
+ if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 4, 12, 5, 13))
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f32, V1, V2);
- if (isShuffleEquivalent(LoMask, 2, 10, 3, 11))
+ if (isShuffleEquivalent(Mask, 2, 10, 3, 11, 6, 14, 7, 15))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f32, V1, V2);
// Otherwise, fall back to a SHUFPS sequence. Here it is important that we
}
// If we have a single input shuffle with different shuffle patterns in the
- // two 128-bit lanes, just do two shuffles and blend them together. This will
- // be faster than extracting the high 128-bit lane, shuffling it, and
- // re-inserting it. Especially on newer processors where blending is *the*
- // fastest operation.
+ // two 128-bit lanes use the variable mask to VPERMILPS.
if (isSingleInputShuffleMask(Mask)) {
- int LoMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
- int HiMask[4] = {Mask[4], Mask[5], Mask[6], Mask[7]};
- for (int &M : HiMask)
- if (M >= 0)
- M -= 4;
- SDValue Lo = V1, Hi = V1;
- if (!isNoopShuffleMask(LoMask))
- Lo = DAG.getNode(X86ISD::VPERMILP, DL, MVT::v8f32, Lo,
- getV4X86ShuffleImm8ForMask(LoMask, DAG));
- if (!isNoopShuffleMask(HiMask))
- Hi = DAG.getNode(X86ISD::VPERMILP, DL, MVT::v8f32, Hi,
- getV4X86ShuffleImm8ForMask(HiMask, DAG));
- unsigned BlendMask = 1 << 4 | 1 << 5 | 1 << 6 | 1 << 7;
- return DAG.getNode(X86ISD::BLENDI, DL, MVT::v8f32, Lo, Hi,
- DAG.getConstant(BlendMask, MVT::i8));
+ SDValue VPermMask[8];
+ for (int i = 0; i < 8; ++i)
+ VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32)
+ : DAG.getConstant(Mask[i], MVT::i32);
+ return DAG.getNode(
+ X86ISD::VPERMILPV, DL, MVT::v8f32, V1,
+ DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask));
}
- // Shuffle the input elements into the desired positions in V1 and V2 and
- // blend them together.
- int V1Mask[] = {-1, -1, -1, -1, -1, -1, -1, -1};
- int V2Mask[] = {-1, -1, -1, -1, -1, -1, -1, -1};
- unsigned BlendMask = 0;
- for (int i = 0; i < 8; ++i)
- if (Mask[i] >= 0 && Mask[i] < 8) {
- V1Mask[i] = Mask[i];
- } else if (Mask[i] >= 8) {
- V2Mask[i] = Mask[i] - 8;
- BlendMask |= 1 << i;
+ // Otherwise fall back on generic blend lowering.
+ return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
+ Mask, DAG);
+}
+
+/// \brief Handle lowering of 8-lane 32-bit integer shuffles.
+///
+/// This routine is only called when we have AVX2 and thus a reasonable
+/// instruction set for v8i32 shuffling.
+///
+/// Strategies are tried in this order:
+///  1. a blend via lowerVectorShuffleAsBlend;
+///  2. if both 128-bit lanes repeat one pattern: PSHUFD for a single input,
+///     or UNPCKL/UNPCKH when the mask matches their interleaving;
+///  3. a VPERMV node with a constant index vector for any other single-input
+///     mask;
+///  4. the generic decomposed shuffle + blend fallback.
+///
+/// \param Op the shuffle node being lowered (cast to ShuffleVectorSDNode).
+/// \param V1, V2 the two v8i32 shuffle inputs.
+/// \param Subtarget checked for AVX2 and forwarded to the blend lowering.
+/// \param DAG the DAG in which replacement nodes are created.
+/// \returns the node produced by the first strategy that applies.
+static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+                                       const X86Subtarget *Subtarget,
+                                       SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
+  assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+  ArrayRef<int> Mask = SVOp->getMask();
+  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
+  assert(Subtarget->hasAVX2() && "We can only lower v8i32 with AVX2!");
+
+  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
+                                                Subtarget, DAG))
+    return Blend;
+
+  // If the shuffle mask is repeated in each 128-bit lane we can use more
+  // efficient instructions that mirror the shuffles across the two 128-bit
+  // lanes.
+  SmallVector<int, 4> RepeatedMask;
+  if (is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask)) {
+    assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
+    if (isSingleInputShuffleMask(Mask))
+      return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
+                         getV4X86ShuffleImm8ForMask(RepeatedMask, DAG));
+
+    // Use dedicated unpack instructions for masks that match their pattern.
+    // The pattern spells out all 8 result elements (low 128-bit lane first,
+    // then the high lane), exactly as the v8f32 and v16i16 lowerings do for
+    // their full-width masks. A 4-element pattern does not describe the high
+    // lane and cannot correspond to this 8-element mask.
+    if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 4, 12, 5, 13))
+      return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i32, V1, V2);
+    if (isShuffleEquivalent(Mask, 2, 10, 3, 11, 6, 14, 7, 15))
+      return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i32, V1, V2);
+  }
+
+  // If the shuffle patterns aren't repeated but it is a single input, directly
+  // generate a cross-lane VPERMD instruction.
+  if (isSingleInputShuffleMask(Mask)) {
+    SDValue VPermMask[8];
+    for (int i = 0; i < 8; ++i)
+      VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32)
+                                 : DAG.getConstant(Mask[i], MVT::i32);
+    // Note the operand order: the index vector comes first, then the input.
+    return DAG.getNode(
+        X86ISD::VPERMV, DL, MVT::v8i32,
+        DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask), V1);
+  }
+
+  // Otherwise fall back on generic blend lowering.
+  return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
+                                                    Mask, DAG);
+}
+
+/// \brief Handle lowering of 16-lane 16-bit integer shuffles.
+///
+/// Requires AVX2 (asserted). Tries a blend and lane-repeated UNPCK, splits
+/// lane-crossing masks, uses PSHUFB for one input, else decomposes + blends.
+static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+ ArrayRef<int> Mask = SVOp->getMask();
+ assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
+ assert(Subtarget->hasAVX2() && "We can only lower v16i16 with AVX2!");
+
+ if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
+ Subtarget, DAG))
+ return Blend;
+
+ // If the shuffle mask is repeated in each 128-bit lane we can use more
+ // efficient lane-mirrored instructions. NOTE(review): the inline capacity
+ // below is 4 but 8 entries are asserted; consider SmallVector<int, 8>.
+ SmallVector<int, 4> RepeatedMask;
+ if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
+ assert(RepeatedMask.size() == 8 && "Unexpected repeated mask size!");
+ // FIXME: It might be worth it to call into the (terribly complex) v8i16
+ // lowering here.
+
+ // Use dedicated unpack instructions for masks that match their pattern.
+ // Each pattern spells out all 16 result elements, one 128-bit lane per row.
+ if (isShuffleEquivalent(Mask,
+ // First 128-bit lane:
+ 0, 16, 1, 17, 2, 18, 3, 19,
+ // Second 128-bit lane:
+ 8, 24, 9, 25, 10, 26, 11, 27))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i16, V1, V2);
+ if (isShuffleEquivalent(Mask,
+ // First 128-bit lane:
+ 4, 20, 5, 21, 6, 22, 7, 23,
+ // Second 128-bit lane:
+ 12, 28, 13, 29, 14, 30, 15, 31))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i16, V1, V2);
+ }
+
+ // There are no generalized cross-lane shuffle operations available on i16
+ // element types.
+ // FIXME: We should teach the "split and lower" path to do something more
+ // clever, or do it ourselves here. The optimal lowering of cross-lane
+ // shuffles I am aware of is to swap the lanes into a copy, shuffle both the
+ // original and the copy, and then blend to pick up the cross-lane elements.
+ // This is four instructions with a tree height of three which is better than
+ // the worst case for a gather-cross-scatter approach such as used in SSE2
+ // v8i16 lowering (where we don't have blends). While for cross-lane blends it
+ // results in a blend tree, blends are very cheap in AVX2 and newer chips. We
+ // might also want to special case situations where we can always do a single
+ // VPERMD to produce a non-lane-crossing shuffle.
+ if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
+ return splitAndLower256BitVectorShuffle(Op, V1, V2, Subtarget, DAG);
+
+ if (isSingleInputShuffleMask(Mask)) {
+ SDValue PSHUFBMask[32];
+ for (int i = 0; i < 16; ++i) {
+ if (Mask[i] == -1) {
+ PSHUFBMask[2 * i] = PSHUFBMask[2 * i + 1] = DAG.getUNDEF(MVT::i8);
+ continue;
+ }
+
+ int M = i < 8 ? Mask[i] : Mask[i] - 8;
+ assert(M >= 0 && M < 8 && "Invalid single-input mask!");
+ PSHUFBMask[2 * i] = DAG.getConstant(2 * M, MVT::i8);
+ PSHUFBMask[2 * i + 1] = DAG.getConstant(2 * M + 1, MVT::i8);
}
+ return DAG.getNode(
+ ISD::BITCAST, DL, MVT::v16i16,
+ DAG.getNode(
+ X86ISD::PSHUFB, DL, MVT::v32i8,
+ DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V1),
+ DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask)));
+ }
- V1 = DAG.getVectorShuffle(MVT::v8f32, DL, V1, DAG.getUNDEF(MVT::v8f32), V1Mask);
- V2 = DAG.getVectorShuffle(MVT::v8f32, DL, V2, DAG.getUNDEF(MVT::v8f32), V2Mask);
+ // Otherwise (two inputs, no lane crossing) fall back on generic blend lowering.
+ return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i16, V1, V2,
+ Mask, DAG);
+}
+
+/// \brief Handle lowering of 32-lane 8-bit integer shuffles.
+///
+/// Requires AVX2 (asserted), but currently only validates its operands and
+/// then defers to splitAndLower256BitVectorShuffle; see the FIXME below.
+static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+ ArrayRef<int> Mask = SVOp->getMask();
+ assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
+ assert(Subtarget->hasAVX2() && "We can only lower v32i8 with AVX2!");
- return DAG.getNode(X86ISD::BLENDI, DL, MVT::v8f32, V1, V2,
- DAG.getConstant(BlendMask, MVT::i8));
+ // FIXME: Actually implement this using AVX2!!! Until then Mask is only read by the assert above.
+ (void)Mask;
+ return splitAndLower256BitVectorShuffle(Op, V1, V2, Subtarget, DAG);
}
/// \brief High-level routine to lower various 256-bit x86 vector shuffles.
// ability to manipulate a 256-bit vector with integer types. Since we'll use
// floating point types there eventually, just immediately cast everything to
// a float and operate entirely in that domain.
- // FIXME: Actually test for AVX2 when we have implemented it.
- if (VT.isInteger()) {
+ if (VT.isInteger() && !Subtarget->hasAVX2()) {
int ElementBits = VT.getScalarSizeInBits();
if (ElementBits < 32)
// No floating point type available, decompose into 128-bit vectors.
case MVT::v4f64:
return lowerV4F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
case MVT::v4i64:
- llvm_unreachable("AVX2 integer support not yet implemented!");
+ return lowerV4I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
case MVT::v8f32:
return lowerV8F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
case MVT::v8i32:
+ return lowerV8I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
case MVT::v16i16:
+ return lowerV16I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
case MVT::v32i8:
- llvm_unreachable("AVX2 integer support not yet implemented!");
+ return lowerV32I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
default:
llvm_unreachable("Not a valid 256-bit x86 vector type!");
return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG);
if (HasFp256 && (VT == MVT::v4f32 || VT == MVT::v2f64))
- return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1, TargetMask,
+ return getTargetShuffleNode(X86ISD::VPERMILPI, dl, VT, V1, TargetMask,
DAG);
return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V1,
if ((HasInt256 && VT == MVT::v8i32) || VT == MVT::v16i32)
return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1,
getShuffleSHUFImmediate(SVOp), DAG);
- return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1,
+ return getTargetShuffleNode(X86ISD::VPERMILPI, dl, VT, V1,
getShuffleSHUFImmediate(SVOp), DAG);
}
EVT VT = Op.getNode()->getValueType(0);
bool Is64Bit = Subtarget->is64Bit();
- EVT SPTy = Is64Bit ? MVT::i64 : MVT::i32;
+ EVT SPTy = getPointerTy();
if (SplitStack) {
MachineRegisterInfo &MRI = MF.getRegInfo();
}
const TargetRegisterClass *AddrRegClass =
- getRegClassFor(Subtarget->is64Bit() ? MVT::i64:MVT::i32);
+ getRegClassFor(getPointerTy());
unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
return DAG.getMergeValues(Ops1, dl);
} else {
SDValue Flag;
- unsigned Reg = (Subtarget->is64Bit() ? X86::RAX : X86::EAX);
+ const unsigned Reg = (Subtarget->isTarget64BitLP64() ? X86::RAX : X86::EAX);
Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag);
Flag = Chain.getValue(1);
return needsCmpXchgNb(SI->getValueOperand()->getType());
}
-bool X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *SI) const {
- return false; // FIXME, currently these are expanded separately in this file.
+// Note: returning true here turns large atomic loads into lock cmpxchg8b/16b.
+// FIXME: On 32-bit x86, fild/movq might be faster than lock cmpxchg8b.
+bool X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
+ auto PTy = cast<PointerType>(LI->getPointerOperand()->getType());
+ return needsCmpXchgNb(PTy->getElementType());
}
bool X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
}
}
-static void ReplaceATOMIC_LOAD(SDNode *Node,
- SmallVectorImpl<SDValue> &Results,
- SelectionDAG &DAG) {
- SDLoc dl(Node);
- EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
-
- // Convert wide load -> cmpxchg8b/cmpxchg16b
- // FIXME: On 32-bit, load -> fild or movq would be more efficient
- // (The only way to get a 16-byte load is cmpxchg16b)
- // FIXME: 16-byte ATOMIC_CMP_SWAP isn't actually hooked up at the moment.
- SDValue Zero = DAG.getConstant(0, VT);
- SDVTList VTs = DAG.getVTList(VT, MVT::i1, MVT::Other);
- SDValue Swap =
- DAG.getAtomicCmpSwap(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, dl, VT, VTs,
- Node->getOperand(0), Node->getOperand(1), Zero, Zero,
- cast<AtomicSDNode>(Node)->getMemOperand(),
- cast<AtomicSDNode>(Node)->getOrdering(),
- cast<AtomicSDNode>(Node)->getOrdering(),
- cast<AtomicSDNode>(Node)->getSynchScope());
- Results.push_back(Swap.getValue(0));
- Results.push_back(Swap.getValue(2));
-}
-
/// ReplaceNodeResults - Replace a node with an illegal result type
/// with a new node built out of custom code.
void X86TargetLowering::ReplaceNodeResults(SDNode *N,
case ISD::ATOMIC_LOAD_MAX:
case ISD::ATOMIC_LOAD_UMIN:
case ISD::ATOMIC_LOAD_UMAX:
+ case ISD::ATOMIC_LOAD: {
// Delegate to generic TypeLegalization. Situations we can really handle
// should have already been dealt with by AtomicExpandPass.cpp.
break;
- case ISD::ATOMIC_LOAD: {
- ReplaceATOMIC_LOAD(N, Results, DAG);
- return;
}
case ISD::BITCAST: {
assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
case X86ISD::PSHUFB: return "X86ISD::PSHUFB";
case X86ISD::ANDNP: return "X86ISD::ANDNP";
case X86ISD::PSIGN: return "X86ISD::PSIGN";
- case X86ISD::BLENDV: return "X86ISD::BLENDV";
case X86ISD::BLENDI: return "X86ISD::BLENDI";
case X86ISD::SUBUS: return "X86ISD::SUBUS";
case X86ISD::HADD: return "X86ISD::HADD";
case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST";
case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM";
case X86ISD::VEXTRACT: return "X86ISD::VEXTRACT";
- case X86ISD::VPERMILP: return "X86ISD::VPERMILP";
+ case X86ISD::VPERMILPI: return "X86ISD::VPERMILPI";
case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128";
case X86ISD::VPERMV: return "X86ISD::VPERMV";
case X86ISD::VPERMV3: return "X86ISD::VPERMV3";
}
MachineBasicBlock *
-X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB,
- bool Is64Bit) const {
+X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI,
+ MachineBasicBlock *BB) const {
MachineFunction *MF = BB->getParent();
const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
DebugLoc DL = MI->getDebugLoc();
assert(MF->shouldSplitStack());
- unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
- unsigned TlsOffset = Is64Bit ? 0x70 : 0x30;
+ const bool Is64Bit = Subtarget->is64Bit();
+ const bool IsLP64 = Subtarget->isTarget64BitLP64();
+
+ const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
+ const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
// BB:
// ... [Till the alloca]
MachineRegisterInfo &MRI = MF->getRegInfo();
const TargetRegisterClass *AddrRegClass =
- getRegClassFor(Is64Bit ? MVT::i64:MVT::i32);
+ getRegClassFor(getPointerTy());
unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
sizeVReg = MI->getOperand(1).getReg(),
- physSPReg = Is64Bit ? X86::RSP : X86::ESP;
+ physSPReg = IsLP64 || Subtarget->isTargetNaCl64() ? X86::RSP : X86::ESP;
MachineFunction::iterator MBBIter = BB;
++MBBIter;
// Add code to the main basic block to check if the stack limit has been hit,
// and if so, jump to mallocMBB otherwise to bumpMBB.
BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
- BuildMI(BB, DL, TII->get(Is64Bit ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
+ BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
.addReg(tmpSPVReg).addReg(sizeVReg);
- BuildMI(BB, DL, TII->get(Is64Bit ? X86::CMP64mr:X86::CMP32mr))
+ BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
.addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
.addReg(SPLimitVReg);
BuildMI(BB, DL, TII->get(X86::JG_4)).addMBB(mallocMBB);
.getSubtargetImpl()
->getRegisterInfo()
->getCallPreservedMask(CallingConv::C);
- if (Is64Bit) {
+ if (IsLP64) {
BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
.addReg(sizeVReg);
BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
.addRegMask(RegMask)
.addReg(X86::RDI, RegState::Implicit)
.addReg(X86::RAX, RegState::ImplicitDefine);
+ } else if (Is64Bit) {
+ BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
+ .addReg(sizeVReg);
+ BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
+ .addExternalSymbol("__morestack_allocate_stack_space")
+ .addRegMask(RegMask)
+ .addReg(X86::EDI, RegState::Implicit)
+ .addReg(X86::EAX, RegState::ImplicitDefine);
} else {
BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
.addImm(12);
.addImm(16);
BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
- .addReg(Is64Bit ? X86::RAX : X86::EAX);
+ .addReg(IsLP64 ? X86::RAX : X86::EAX);
BuildMI(mallocMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB);
// Set up the CFG correctly.
case X86::WIN_ALLOCA:
return EmitLoweredWinAlloca(MI, BB);
case X86::SEG_ALLOCA_32:
- return EmitLoweredSegAlloca(MI, BB, false);
case X86::SEG_ALLOCA_64:
- return EmitLoweredSegAlloca(MI, BB, true);
+ return EmitLoweredSegAlloca(MI, BB);
case X86::TLSCall_32:
case X86::TLSCall_64:
return EmitLoweredTLSCall(MI, BB);
assert(Mask.size() <= 16 && "Can't shuffle elements smaller than bytes!");
int Ratio = 16 / Mask.size();
for (unsigned i = 0; i < 16; ++i) {
+ if (Mask[i / Ratio] == SM_SentinelUndef) {
+ PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
+ continue;
+ }
int M = Mask[i / Ratio] != SM_SentinelZero
? Ratio * Mask[i / Ratio] + i % Ratio
: 255;
// for this order is that we are recursing up the operation chain.
for (int i = 0, e = std::max(OpMask.size(), RootMask.size()); i < e; ++i) {
int RootIdx = i / RootRatio;
- if (RootMask[RootIdx] == SM_SentinelZero) {
- // This is a zero-ed lane, we're done.
- Mask.push_back(SM_SentinelZero);
+ if (RootMask[RootIdx] < 0) {
+ // This is a zero or undef lane, we're done.
+ Mask.push_back(RootMask[RootIdx]);
continue;
}
int RootMaskedIdx = RootMask[RootIdx] * RootRatio + i % RootRatio;
int OpIdx = RootMaskedIdx / OpRatio;
- if (OpMask[OpIdx] == SM_SentinelZero) {
- // The incoming lanes are zero, it doesn't matter which ones we are using.
- Mask.push_back(SM_SentinelZero);
+ if (OpMask[OpIdx] < 0) {
+ // The incoming lanes are zero or undef, it doesn't matter which ones we
+ // are using.
+ Mask.push_back(OpMask[OpIdx]);
continue;
}
case X86ISD::PSHUFLW:
case X86ISD::MOVSS:
case X86ISD::MOVSD:
- case X86ISD::VPERMILP:
+ case X86ISD::VPERMILPI:
case X86ISD::VPERM2X128:
case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget);
case ISD::FMA: return PerformFMACombine(N, DAG, Subtarget);