cl::Hidden);
static cl::opt<bool> ExperimentalVectorShuffleLowering(
- "x86-experimental-vector-shuffle-lowering", cl::init(false),
+ "x86-experimental-vector-shuffle-lowering", cl::init(true),
cl::desc("Enable an experimental vector shuffle lowering code path."),
cl::Hidden);
// FIXME: This should stop caching the target machine as soon as
// we can remove resetOperationActions et al.
-X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
- : TargetLowering(TM, createTLOF(Triple(TM.getTargetTriple()))) {
+X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM)
+ : TargetLowering(TM, createTLOF(Triple(TM.getTargetTriple()))) {
Subtarget = &TM.getSubtarget<X86Subtarget>();
X86ScalarSSEf64 = Subtarget->hasSSE2();
X86ScalarSSEf32 = Subtarget->hasSSE1();
setOperationAction(ISD::FLOG10, MVT::f80, Expand);
setOperationAction(ISD::FEXP, MVT::f80, Expand);
setOperationAction(ISD::FEXP2, MVT::f80, Expand);
+ setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
+ setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
// First set operation action for all vector types to either promote
// (for widening) or expand (for scalarization). Then we will selectively
setOperationAction(ISD::SETCC, MVT::v4i1, Custom);
setOperationAction(ISD::SETCC, MVT::v2i1, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Legal);
}
// SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion
setOperationAction(ISD::UMULO, VT, Custom);
}
- // There are no 8-bit 3-address imul/mul instructions
- setOperationAction(ISD::SMULO, MVT::i8, Expand);
- setOperationAction(ISD::UMULO, MVT::i8, Expand);
if (!Subtarget->is64Bit()) {
// These libcalls are not available in 32-bit.
/// \brief Try to lower a vector shuffle as a byte rotation.
///
/// We have a generic PALIGNR instruction in x86 that will do an arbitrary
-/// byte-rotation of a the concatentation of two vectors. This routine will
+/// byte-rotation of the concatenation of two vectors. This routine will
/// try to generically lower a vector shuffle through such an instruction. It
/// does not check for the availability of PALIGNR-based lowerings, only the
/// applicability of this strategy to the given mask. This matches shuffle
return SDValue();
}
+/// \brief Try to get a scalar value for a specific element of a vector.
+///
+/// Looks through any chain of BITCAST nodes on \p V. If the node underneath is
+/// a BUILD_VECTOR (or a SCALAR_TO_VECTOR when \p Idx is zero), operand \p Idx
+/// of that node is returned, bitcast to the element type \p V had on entry.
+///
+/// \param V   The vector value to inspect.
+/// \param Idx Element index; used directly as an operand index of the
+///            underlying node.
+/// \param DAG Used to create the BITCAST of the scalar that was found.
+/// \returns The scalar, or an empty SDValue when the bitcasts change the
+/// element size, when \p Idx does not name an operand, or when the underlying
+/// node is not one of the recognized opcodes.
+static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
+                                              SelectionDAG &DAG) {
+  // Record the element type before peeling bitcasts; the result is cast back
+  // to this type.
+  MVT VT = V.getSimpleValueType();
+  MVT EltVT = VT.getVectorElementType();
+  while (V.getOpcode() == ISD::BITCAST)
+    V = V.getOperand(0);
+  // If the bitcasts shift the element size, we can't extract an equivalent
+  // element from it.
+  MVT NewVT = V.getSimpleValueType();
+  if (!NewVT.isVector() ||
+      NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
+    return SDValue();
+
+  // Callers derive Idx from shuffle mask entries, which may be negative
+  // (undef). Such an index, or one past the last operand, names no operand, so
+  // report "not found" instead of indexing out of range below.
+  if (Idx < 0 || Idx >= (int)V.getNumOperands())
+    return SDValue();
+
+  if (V.getOpcode() == ISD::BUILD_VECTOR ||
+      (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR))
+    return DAG.getNode(ISD::BITCAST, SDLoc(V), EltVT, V.getOperand(Idx));
+
+  return SDValue();
+}
+
+/// \brief Report whether a value is a load that x86 shuffles can fold.
+///
+/// Which instructions are available differs significantly depending on
+/// whether an operand is a load, so lowering code asks this before choosing
+/// an instruction. Any BITCAST nodes wrapping \p V are ignored.
+static bool isShuffleFoldableLoad(SDValue V) {
+  // Walk down to the first node that is not a bitcast.
+  SDValue Source = V;
+  while (Source.getOpcode() == ISD::BITCAST)
+    Source = Source.getOperand(0);
+
+  const bool IsPlainLoad = ISD::isNON_EXTLoad(Source.getNode());
+  return IsPlainLoad;
+}
+
/// \brief Try to lower insertion of a single element into a zero vector.
///
/// This is a common pattern that we have especially efficient patterns to lower
MVT VT, SDLoc DL, SDValue V1, SDValue V2, ArrayRef<int> Mask,
const X86Subtarget *Subtarget, SelectionDAG &DAG) {
SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+ MVT ExtVT = VT;
+ MVT EltVT = VT.getVectorElementType();
int V2Index = std::find_if(Mask.begin(), Mask.end(),
[&Mask](int M) { return M >= (int)Mask.size(); }) -
Mask.begin();
- if (Mask.size() == 2) {
- if (!Zeroable[V2Index ^ 1]) {
- // For 2-wide masks we may be able to just invert the inputs. We use an xor
- // with 2 to flip from {2,3} to {0,1} and vice versa.
- int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
- Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
- if (Zeroable[V2Index])
- return lowerVectorShuffleAsElementInsertion(VT, DL, V2, V1, InverseMask,
- Subtarget, DAG);
- else
- return SDValue();
+ bool IsV1Zeroable = true;
+ for (int i = 0, Size = Mask.size(); i < Size; ++i)
+ if (i != V2Index && !Zeroable[i]) {
+ IsV1Zeroable = false;
+ break;
}
- } else {
- for (int i = 0, Size = Mask.size(); i < Size; ++i)
- if (i != V2Index && !Zeroable[i])
- return SDValue(); // Not inserting into a zero vector.
- }
-
- // Step over any bitcasts on either input so we can scan the actual
- // BUILD_VECTOR nodes.
- while (V1.getOpcode() == ISD::BITCAST)
- V1 = V1.getOperand(0);
- while (V2.getOpcode() == ISD::BITCAST)
- V2 = V2.getOperand(0);
// Check for a single input from a SCALAR_TO_VECTOR node.
// FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
// all the smarts here sunk into that routine. However, the current
// lowering of BUILD_VECTOR makes that nearly impossible until the old
// vector shuffle lowering is dead.
- if (!((V2.getOpcode() == ISD::SCALAR_TO_VECTOR &&
- Mask[V2Index] == (int)Mask.size()) ||
- V2.getOpcode() == ISD::BUILD_VECTOR))
+ if (SDValue V2S = getScalarValueForVectorElement(
+ V2, Mask[V2Index] - Mask.size(), DAG)) {
+ // We need to zext the scalar if it is smaller than an i32.
+ V2S = DAG.getNode(ISD::BITCAST, DL, EltVT, V2S);
+ if (EltVT == MVT::i8 || EltVT == MVT::i16) {
+ // Using zext to expand a narrow element won't work for non-zero
+ // insertions.
+ if (!IsV1Zeroable)
+ return SDValue();
+
+ // Zero-extend directly to i32.
+ ExtVT = MVT::v4i32;
+ V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
+ }
+ V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
+ } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
+ EltVT == MVT::i16) {
+ // Either not inserting from the low element of the input or the input
+ // element size is too small to use VZEXT_MOVL to clear the high bits.
return SDValue();
+ }
- SDValue V2S = V2.getOperand(Mask[V2Index] - Mask.size());
+ if (!IsV1Zeroable) {
+ // If V1 can't be treated as a zero vector we have fewer options to lower
+ // this. We can't support integer vectors or non-zero targets cheaply, and
+ // the V1 elements can't be permuted in any way.
+ assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
+ if (!VT.isFloatingPoint() || V2Index != 0)
+ return SDValue();
+ SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
+ V1Mask[V2Index] = -1;
+ if (!isNoopShuffleMask(V1Mask))
+ return SDValue();
+ // This is essentially a special case blend operation, but if we have
+ // general purpose blend operations, they are always faster. Bail and let
+ // the rest of the lowering handle these as blends.
+ if (Subtarget->hasSSE41())
+ return SDValue();
- // First, we need to zext the scalar if it is smaller than an i32.
- MVT ExtVT = VT;
- MVT EltVT = VT.getVectorElementType();
- V2S = DAG.getNode(ISD::BITCAST, DL, EltVT, V2S);
- if (EltVT == MVT::i8 || EltVT == MVT::i16) {
- // Zero-extend directly to i32.
- ExtVT = MVT::v4i32;
- V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
+ // Otherwise, use MOVSD or MOVSS.
+ assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
+ "Only two types of floating point element types to handle!");
+ return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
+ ExtVT, V1, V2);
}
- V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT,
- DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S));
+ V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
if (ExtVT != VT)
V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2);
return V2;
}
+/// \brief Try to lower broadcast of a single element.
+///
+/// For convenience, this code also bundles all of the subtarget feature set
+/// filtering. While a little annoying to re-dispatch on type here, there isn't
+/// a convenient way to factor it out.
+///
+/// \param VT        Result vector type of the shuffle.
+/// \param DL        Debug location for the created node.
+/// \param V         The shuffle input the mask reads from.
+/// \param Mask      Shuffle mask; negative entries are ignored when deciding
+///                  whether the mask is a broadcast.
+/// \param Subtarget Queried for AVX / AVX2 availability.
+/// \param DAG       Used to build the X86ISD::VBROADCAST node.
+/// \returns A VBROADCAST node, or an empty SDValue when the mask is not a
+/// broadcast of one element or the subtarget checks below reject it.
+static SDValue lowerVectorShuffleAsBroadcast(MVT VT, SDLoc DL, SDValue V,
+                                             ArrayRef<int> Mask,
+                                             const X86Subtarget *Subtarget,
+                                             SelectionDAG &DAG) {
+  if (!Subtarget->hasAVX())
+    return SDValue();
+  if (VT.isInteger() && !Subtarget->hasAVX2())
+    return SDValue();
+
+  // Check that the mask is a broadcast: every non-negative entry must name
+  // the same source element.
+  int BroadcastIdx = -1;
+  for (int M : Mask) {
+    if (M >= 0 && BroadcastIdx == -1)
+      BroadcastIdx = M;
+    else if (M >= 0 && M != BroadcastIdx)
+      return SDValue();
+  }
+
+  // A mask with no non-negative entry selects no element at all. Bail out
+  // rather than carrying the -1 placeholder into the operand lookups below.
+  if (BroadcastIdx < 0)
+    return SDValue();
+
+  assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
+                                            "a sorted mask where the broadcast "
+                                            "comes from V1.");
+
+  // Go up the chain of (vector) values to try and find a scalar load that
+  // we can combine with the broadcast.
+  for (;;) {
+    switch (V.getOpcode()) {
+    case ISD::CONCAT_VECTORS: {
+      // Size the operands from the node itself. V may already be narrower
+      // than the mask if we descended into an inserted subvector, so
+      // Mask.size() would give a stale element count here.
+      int OperandSize =
+          (int)V.getOperand(0).getValueType().getVectorNumElements();
+      V = V.getOperand(BroadcastIdx / OperandSize);
+      BroadcastIdx %= OperandSize;
+      continue;
+    }
+
+    case ISD::INSERT_SUBVECTOR: {
+      SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
+      auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
+      // Without a constant insertion index we cannot tell which input holds
+      // the element; stop walking (this leaves the switch, then the loop).
+      if (!ConstantIdx)
+        break;
+
+      int BeginIdx = (int)ConstantIdx->getZExtValue();
+      int EndIdx =
+          BeginIdx + (int)VInner.getValueType().getVectorNumElements();
+      if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) {
+        // The element lives in the inserted subvector; rebase the index.
+        BroadcastIdx -= BeginIdx;
+        V = VInner;
+      } else {
+        V = VOuter;
+      }
+      continue;
+    }
+    }
+    break;
+  }
+
+  // Check if this is a broadcast of a scalar. We special case lowering
+  // for scalars so that we can more effectively fold with loads.
+  if (V.getOpcode() == ISD::BUILD_VECTOR ||
+      (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
+    V = V.getOperand(BroadcastIdx);
+
+    // If the scalar isn't a load we can't broadcast from it in AVX1, only with
+    // AVX2.
+    if (!Subtarget->hasAVX2() && !isShuffleFoldableLoad(V))
+      return SDValue();
+  } else if (BroadcastIdx != 0 || !Subtarget->hasAVX2()) {
+    // We can't broadcast from a vector register w/o AVX2, and we can only
+    // broadcast from the zero-element of a vector register.
+    return SDValue();
+  }
+
+  return DAG.getNode(X86ISD::VBROADCAST, DL, VT, V);
+}
+
/// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
///
/// This is the basis function for the 2-lane 64-bit shuffles as we have full
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2f64, V1, V2);
// If we have a single input, insert that into V1 if we can do so cheaply.
- if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1)
+ if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
MVT::v2f64, DL, V1, V2, Mask, Subtarget, DAG))
return Insertion;
+ // Try inverting the insertion since for v2 masks it is easy to do and we
+ // can't reliably sort the mask one way or the other.
+ int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
+ Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
+ if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
+ MVT::v2f64, DL, V2, V1, InverseMask, Subtarget, DAG))
+ return Insertion;
+ }
+
+ // Try to use one of the special instruction patterns to handle two common
+ // blend patterns if a zero-blend above didn't work.
+ if (isShuffleEquivalent(Mask, 0, 3) || isShuffleEquivalent(Mask, 1, 3))
+ if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
+ // We can either use a special instruction to load over the low double or
+ // to move just the low double.
+ return DAG.getNode(
+ isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD,
+ DL, MVT::v2f64, V2,
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
if (Subtarget->hasSSE41())
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
if (isSingleInputShuffleMask(Mask)) {
+ // Check for being able to broadcast a single element.
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v2i64, DL, V1,
+ Mask, Subtarget, DAG))
+ return Broadcast;
+
// Straight shuffle of a single input vector. For everything from SSE2
// onward this has a single fast instruction with no scary immediates.
// We have to map the mask as it is actually a v4i32 shuffle instruction.
getV4X86ShuffleImm8ForMask(WidenedMask, DAG)));
}
- // Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(Mask, 0, 2))
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, V1, V2);
- if (isShuffleEquivalent(Mask, 1, 3))
- return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2i64, V1, V2);
-
// If we have a single input from V2 insert that into V1 if we can do so
// cheaply.
- if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1)
+ if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
MVT::v2i64, DL, V1, V2, Mask, Subtarget, DAG))
return Insertion;
+ // Try inverting the insertion since for v2 masks it is easy to do and we
+ // can't reliably sort the mask one way or the other.
+ int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
+ Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
+ if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
+ MVT::v2i64, DL, V2, V1, InverseMask, Subtarget, DAG))
+ return Insertion;
+ }
+
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (isShuffleEquivalent(Mask, 0, 2))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, V1, V2);
+ if (isShuffleEquivalent(Mask, 1, 3))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2i64, V1, V2);
if (Subtarget->hasSSE41())
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
if (NumV2Elements == 0) {
+ // Check for being able to broadcast a single element.
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4f32, DL, V1,
+ Mask, Subtarget, DAG))
+ return Broadcast;
+
if (Subtarget->hasAVX()) {
// If we have AVX, we can use VPERMILPS which will allow folding a load
// into the shuffle.
ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
+ // Whenever we can lower this as a zext, that instruction is strictly faster
+ // than any alternative. It also allows us to fold memory operands into the
+ // shuffle in many cases.
+ if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2,
+ Mask, Subtarget, DAG))
+ return ZExt;
+
int NumV2Elements =
std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
if (NumV2Elements == 0) {
+ // Check for being able to broadcast a single element.
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4i32, DL, V1,
+ Mask, Subtarget, DAG))
+ return Broadcast;
+
// Straight shuffle of a single input vector. For everything from SSE2
// onward this has a single fast instruction with no scary immediates.
// We coerce the shuffle pattern to be compatible with UNPCK instructions
getV4X86ShuffleImm8ForMask(Mask, DAG));
}
- // Whenever we can lower this as a zext, that instruction is strictly faster
- // than any alternative.
- if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2,
+ // There are special ways we can lower some single-element blends.
+ if (NumV2Elements == 1)
+ if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4i32, DL, V1, V2,
Mask, Subtarget, DAG))
- return ZExt;
+ return V;
// Use dedicated unpack instructions for masks that match their pattern.
if (isShuffleEquivalent(Mask, 0, 4, 1, 5))
if (isShuffleEquivalent(Mask, 2, 6, 3, 7))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V1, V2);
- // There are special ways we can lower some single-element blends.
- if (NumV2Elements == 1)
- if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4i32, DL, V1, V2,
- Mask, Subtarget, DAG))
- return V;
-
if (Subtarget->hasSSE41())
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
Subtarget, DAG))
MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
+ // Check for being able to broadcast a single element.
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8i16, DL, V,
+ Mask, Subtarget, DAG))
+ return Broadcast;
+
// Use dedicated unpack instructions for masks that match their pattern.
if (isShuffleEquivalent(Mask, 0, 0, 1, 1, 2, 2, 3, 3))
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V, V);
// Try to use rotation instructions if available.
if (Subtarget->hasSSSE3())
- if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask, DAG))
+ if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
+ DL, MVT::v8i16, V1, V2, Mask, DAG))
return Rotate;
if (NumV1Inputs + NumV2Inputs <= 4)
// Try to use rotation instructions if available.
if (Subtarget->hasSSSE3())
- if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v16i8, V1, V2,
- OrigMask, DAG))
+ if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
+ DL, MVT::v16i8, V1, V2, OrigMask, DAG))
return Rotate;
// Try to use a zext lowering.
// For single-input shuffles, there are some nicer lowering tricks we can use.
if (NumV2Elements == 0) {
+ // Check for being able to broadcast a single element.
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v16i8, DL, V1,
+ Mask, Subtarget, DAG))
+ return Broadcast;
+
// Check whether we can widen this to an i16 shuffle by duplicating bytes.
// Notably, this handles splat and partial-splat shuffles more efficiently.
// However, it only makes sense if the pre-duplication shuffle simplifies
//
// FIXME: We need to handle other interleaving widths (i16, i32, ...).
if (shouldLowerAsInterleaving(Mask)) {
- // FIXME: Figure out whether we should pack these into the low or high
- // halves.
-
- int EMask[16], OMask[16];
+ int NumLoHalf = std::count_if(Mask.begin(), Mask.end(), [](int M) {
+ return (M >= 0 && M < 8) || (M >= 16 && M < 24);
+ });
+ int NumHiHalf = std::count_if(Mask.begin(), Mask.end(), [](int M) {
+ return (M >= 8 && M < 16) || M >= 24;
+ });
+ int EMask[16] = {-1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1};
+ int OMask[16] = {-1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1};
+ bool UnpackLo = NumLoHalf >= NumHiHalf;
+ MutableArrayRef<int> TargetEMask(UnpackLo ? EMask : EMask + 8, 8);
+ MutableArrayRef<int> TargetOMask(UnpackLo ? OMask : OMask + 8, 8);
for (int i = 0; i < 8; ++i) {
- EMask[i] = Mask[2*i];
- OMask[i] = Mask[2*i + 1];
- EMask[i + 8] = -1;
- OMask[i + 8] = -1;
+ TargetEMask[i] = Mask[2 * i];
+ TargetOMask[i] = Mask[2 * i + 1];
}
SDValue Evens = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, EMask);
SDValue Odds = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, OMask);
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, Evens, Odds);
+ return DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
+ MVT::v16i8, Evens, Odds);
}
// Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
}
}
+/// \brief Test whether adjacent pairs of shuffle mask entries can be merged
+/// into single entries that address elements of twice the width.
+///
+/// On success the half-length mask is appended to \p WidenedMask. On failure
+/// \p WidenedMask is left in an unspecified state and must not be used.
+///
+/// NOTE: Both ordinary vector shuffle masks and *target* vector shuffle masks
+/// are accepted. Target masks may additionally contain the special '-2' value
+/// that represents a zero-ed lane of a vector.
+static bool canWidenShuffleElements(ArrayRef<int> Mask,
+                                    SmallVectorImpl<int> &WidenedMask) {
+  const int NumElts = Mask.size();
+  for (int Pos = 0; Pos < NumElts; Pos += 2) {
+    const int Lo = Mask[Pos];
+    const int Hi = Mask[Pos + 1];
+    const bool LoUndef = Lo == SM_SentinelUndef;
+    const bool HiUndef = Hi == SM_SentinelUndef;
+    const bool LoZero = Lo == SM_SentinelZero;
+    const bool HiZero = Hi == SM_SentinelZero;
+
+    int Wide;
+    if (LoUndef && HiUndef) {
+      // Nothing is demanded from either half, so the wide lane is undef too.
+      Wide = SM_SentinelUndef;
+    } else if (LoUndef && Hi >= 0 && Hi % 2 == 1) {
+      // Only the upper half is defined and it is the odd member of a pair.
+      Wide = Hi / 2;
+    } else if (HiUndef && Lo >= 0 && Lo % 2 == 0) {
+      // Only the lower half is defined and it is the even member of a pair.
+      Wide = Lo / 2;
+    } else if (LoZero || HiZero) {
+      // Zeroing has to cover the whole wide lane: each half must be either
+      // zero or undef, otherwise the pair cannot be merged.
+      if (!((LoZero || LoUndef) && (HiZero || HiUndef)))
+        return false;
+      Wide = SM_SentinelZero;
+    } else if (!LoUndef && Lo % 2 == 0 && Lo + 1 == Hi) {
+      // Both halves are defined, adjacent, and aligned to a pair boundary.
+      Wide = Lo / 2;
+    } else {
+      // Any other combination cannot be expressed with wider elements.
+      return false;
+    }
+    WidenedMask.push_back(Wide);
+  }
+  assert(WidenedMask.size() == Mask.size() / 2 &&
+         "Incorrect size of mask after widening the elements!");
+
+  return true;
+}
+
/// \brief Generic routine to split ector shuffle into half-sized shuffles.
///
/// This routine just extracts two subvectors, shuffles them independently, and
return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
}
+/// \brief Handle lowering 2-lane 128-bit shuffles.
+///
+/// The 4-element \p Mask is treated as two halves, each selecting one
+/// 128-bit half of \p V1 or \p V2. Lowering is attempted, in order, as a
+/// blend, as a concatenation of extracted 128-bit subvectors, and finally as
+/// an X86ISD::VPERM2X128 node.
+///
+/// \param DL        Debug location for created nodes.
+/// \param VT        The 4-element vector type being shuffled.
+/// \param V1        First shuffle input (mask values 0-3).
+/// \param V2        Second shuffle input (mask values 4-7).
+/// \param Mask      4-entry shuffle mask; negative entries are undef.
+/// \param Subtarget Forwarded to the blend lowering.
+/// \param DAG       Used to create the new nodes.
+/// \returns The lowered shuffle.
+static SDValue lowerV2X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1,
+                                        SDValue V2, ArrayRef<int> Mask,
+                                        const X86Subtarget *Subtarget,
+                                        SelectionDAG &DAG) {
+  // Blends are faster and handle all the non-lane-crossing cases.
+  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
+                                                Subtarget, DAG))
+    return Blend;
+
+  MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
+                               VT.getVectorNumElements() / 2);
+  // Check for patterns which can be matched with a single insert of a 128-bit
+  // subvector.
+  // Decide the source of the high half from which pattern matched, not from
+  // Mask[2] itself: Mask[2] may be undef (negative) while Mask[3] still names
+  // an element of V2, and a plain "Mask[2] < 4" test would then pick V1.
+  bool OnlyUsesV1 = isShuffleEquivalent(Mask, 0, 1, 0, 1);
+  if (OnlyUsesV1 || isShuffleEquivalent(Mask, 0, 1, 4, 5)) {
+    SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
+                              DAG.getIntPtrConstant(0));
+    SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
+                              OnlyUsesV1 ? V1 : V2, DAG.getIntPtrConstant(0));
+    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
+  }
+  if (isShuffleEquivalent(Mask, 0, 1, 6, 7)) {
+    SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
+                              DAG.getIntPtrConstant(0));
+    SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
+                              DAG.getIntPtrConstant(2));
+    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
+  }
+
+  // Otherwise form a 128-bit permutation.
+  // FIXME: Detect zero-vector inputs and use the VPERM2X128 to zero that half.
+  // Each half of the result is described by a pair of mask entries, either of
+  // which may be undef. Take the lane from whichever entry of the pair is
+  // defined; if both are undef any lane will do, so use lane 0.
+  int MaskLo = Mask[0];
+  if (MaskLo < 0)
+    MaskLo = Mask[1] < 0 ? 0 : Mask[1];
+  int MaskHi = Mask[2];
+  if (MaskHi < 0)
+    MaskHi = Mask[3] < 0 ? 0 : Mask[3];
+
+  // Element index / 2 is the 128-bit lane index (0-1 in V1, 2-3 in V2). The
+  // low-half selector goes in the low bits, the high-half selector at bit 4.
+  unsigned PermMask = MaskLo / 2 | (MaskHi / 2) << 4;
+  return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
+                     DAG.getConstant(PermMask, MVT::i8));
+}
+
/// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
///
/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
+ SmallVector<int, 4> WidenedMask;
+ if (canWidenShuffleElements(Mask, WidenedMask))
+ return lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask, Subtarget,
+ DAG);
+
if (isSingleInputShuffleMask(Mask)) {
+ // Check for being able to broadcast a single element.
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4f64, DL, V1,
+ Mask, Subtarget, DAG))
+ return Broadcast;
+
if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
// Non-half-crossing single input shuffles can be lowerid with an
// interleaved permutation.
assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
assert(Subtarget->hasAVX2() && "We can only lower v4i64 with AVX2!");
+ SmallVector<int, 4> WidenedMask;
+ if (canWidenShuffleElements(Mask, WidenedMask))
+ return lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask, Subtarget,
+ DAG);
+
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
Subtarget, DAG))
return Blend;
+ // Check for being able to broadcast a single element.
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4i64, DL, V1,
+ Mask, Subtarget, DAG))
+ return Broadcast;
+
// When the shuffle is mirrored between the 128-bit lanes of the unit, we can
// use lower latency instructions that will operate on both 128-bit lanes.
SmallVector<int, 2> RepeatedMask;
Subtarget, DAG))
return Blend;
+ // Check for being able to broadcast a single element.
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8f32, DL, V1,
+ Mask, Subtarget, DAG))
+ return Broadcast;
+
// If the shuffle mask is repeated in each 128-bit lane, we have many more
// options to efficiently lower the shuffle.
SmallVector<int, 4> RepeatedMask;
Subtarget, DAG))
return Blend;
+ // Check for being able to broadcast a single element.
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8i32, DL, V1,
+ Mask, Subtarget, DAG))
+ return Broadcast;
+
// If the shuffle mask is repeated in each 128-bit lane we can use more
// efficient instructions that mirror the shuffles across the two 128-bit
// lanes.
assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
assert(Subtarget->hasAVX2() && "We can only lower v16i16 with AVX2!");
+ // Check for being able to broadcast a single element.
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v16i16, DL, V1,
+ Mask, Subtarget, DAG))
+ return Broadcast;
+
// There are no generalized cross-lane shuffle operations available on i16
// element types.
if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
assert(Subtarget->hasAVX2() && "We can only lower v32i8 with AVX2!");
+ // Check for being able to broadcast a single element.
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v32i8, DL, V1,
+ Mask, Subtarget, DAG))
+ return Broadcast;
+
// There are no generalized cross-lane shuffle operations available on i8
// element types.
if (is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
}
-/// \brief Helper function to test whether a shuffle mask could be
-/// simplified by widening the elements being shuffled.
-///
-/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
-/// leaves it in an unspecified state.
-///
-/// NOTE: This must handle normal vector shuffle masks and *target* vector
-/// shuffle masks. The latter have the special property of a '-2' representing
-/// a zero-ed lane of a vector.
-static bool canWidenShuffleElements(ArrayRef<int> Mask,
- SmallVectorImpl<int> &WidenedMask) {
- for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
- // If both elements are undef, its trivial.
- if (Mask[i] == SM_SentinelUndef && Mask[i + 1] == SM_SentinelUndef) {
- WidenedMask.push_back(SM_SentinelUndef);
- continue;
- }
-
- // Check for an undef mask and a mask value properly aligned to fit with
- // a pair of values. If we find such a case, use the non-undef mask's value.
- if (Mask[i] == SM_SentinelUndef && Mask[i + 1] >= 0 && Mask[i + 1] % 2 == 1) {
- WidenedMask.push_back(Mask[i + 1] / 2);
- continue;
- }
- if (Mask[i + 1] == SM_SentinelUndef && Mask[i] >= 0 && Mask[i] % 2 == 0) {
- WidenedMask.push_back(Mask[i] / 2);
- continue;
- }
-
- // When zeroing, we need to spread the zeroing across both lanes to widen.
- if (Mask[i] == SM_SentinelZero || Mask[i + 1] == SM_SentinelZero) {
- if ((Mask[i] == SM_SentinelZero || Mask[i] == SM_SentinelUndef) &&
- (Mask[i + 1] == SM_SentinelZero || Mask[i + 1] == SM_SentinelUndef)) {
- WidenedMask.push_back(SM_SentinelZero);
- continue;
- }
- return false;
- }
-
- // Finally check if the two mask values are adjacent and aligned with
- // a pair.
- if (Mask[i] != SM_SentinelUndef && Mask[i] % 2 == 0 && Mask[i] + 1 == Mask[i + 1]) {
- WidenedMask.push_back(Mask[i] / 2);
- continue;
- }
-
- // Otherwise we can't safely widen the elements used in this shuffle.
- return false;
- }
- assert(WidenedMask.size() == Mask.size() / 2 &&
- "Incorrect size of mask after widening the elements!");
-
- return true;
-}
-
/// \brief Top-level lowering for x86 vector shuffles.
///
/// This handles decomposition, canonicalization, and lowering of all x86
return DAG.getVectorShuffle(VT, dl, V1, V2, NewMask);
}
- // For integer vector shuffles, try to collapse them into a shuffle of fewer
- // lanes but wider integers. We cap this to not form integers larger than i64
- // but it might be interesting to form i128 integers to handle flipping the
- // low and high halves of AVX 256-bit vectors.
+ // Try to collapse shuffles into using a vector type with fewer elements but
+ // wider element types. We cap this to not form integers or floating point
+ // elements wider than 64 bits, but it might be interesting to form i128
+ // integers to handle flipping the low and high halves of AVX 256-bit vectors.
SmallVector<int, 16> WidenedMask;
- if (VT.isInteger() && VT.getScalarSizeInBits() < 64 &&
+ if (VT.getScalarSizeInBits() < 64 &&
canWidenShuffleElements(Mask, WidenedMask)) {
- MVT NewVT =
- MVT::getVectorVT(MVT::getIntegerVT(VT.getScalarSizeInBits() * 2),
- VT.getVectorNumElements() / 2);
- V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1);
- V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2);
- return DAG.getNode(ISD::BITCAST, dl, VT,
- DAG.getVectorShuffle(NewVT, dl, V1, V2, WidenedMask));
+ MVT NewEltVT = VT.isFloatingPoint()
+ ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
+ : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
+ MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
+ // Make sure that the new vector type is legal. For example, v2f64 isn't
+ // legal on SSE1.
+ if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
+ V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1);
+ V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2);
+ return DAG.getNode(ISD::BITCAST, dl, VT,
+ DAG.getVectorShuffle(NewVT, dl, V1, V2, WidenedMask));
+ }
}
int NumV1Elements = 0, NumUndefElements = 0, NumV2Elements = 0;
if (!DAG.getTargetLoweringInfo().isTypeLegal(NVT))
return SDValue();
- // Simplify the operand as it's prepared to be fed into shuffle.
- unsigned SignificantBits = NVT.getSizeInBits() >> Shift;
- if (V1.getOpcode() == ISD::BITCAST &&
- V1.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
- V1.getOperand(0).getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
- V1.getOperand(0).getOperand(0)
- .getSimpleValueType().getSizeInBits() == SignificantBits) {
- // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
- SDValue V = V1.getOperand(0).getOperand(0).getOperand(0);
- ConstantSDNode *CIdx =
- dyn_cast<ConstantSDNode>(V1.getOperand(0).getOperand(0).getOperand(1));
- // If it's foldable, i.e. normal load with single use, we will let code
- // selection to fold it. Otherwise, we will short the conversion sequence.
- if (CIdx && CIdx->getZExtValue() == 0 &&
- (!ISD::isNormalLoad(V.getNode()) || !V.hasOneUse())) {
- MVT FullVT = V.getSimpleValueType();
- MVT V1VT = V1.getSimpleValueType();
- if (FullVT.getSizeInBits() > V1VT.getSizeInBits()) {
- // The "ext_vec_elt" node is wider than the result node.
- // In this case we should extract subvector from V.
- // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast (extract_subvector x)).
- unsigned Ratio = FullVT.getSizeInBits() / V1VT.getSizeInBits();
- MVT SubVecVT = MVT::getVectorVT(FullVT.getVectorElementType(),
- FullVT.getVectorNumElements()/Ratio);
- V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVecVT, V,
- DAG.getIntPtrConstant(0));
- }
- V1 = DAG.getNode(ISD::BITCAST, DL, V1VT, V);
- }
- }
-
return DAG.getNode(ISD::BITCAST, DL, VT,
DAG.getNode(X86ISD::VZEXT, DL, NVT, V1));
}
SDValue Cond = Op.getOperand(0);
SDValue LHS = Op.getOperand(1);
SDValue RHS = Op.getOperand(2);
- SDLoc DL(Op);
+ SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
MVT EltVT = VT.getVectorElementType();
+ unsigned NumElems = VT.getVectorNumElements();
// There is no blend with immediate in AVX-512.
if (VT.is512BitVector())
return SDValue();
- // No blend instruction before SSE4.1.
- if (!Subtarget->hasSSE41())
+ if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
return SDValue();
- // There is no byte-blend immediate controlled instruction.
- if (EltVT == MVT::i8)
+ if (!Subtarget->hasInt256() && VT == MVT::v16i16)
return SDValue();
if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
return SDValue();
- auto *CondBV = cast<BuildVectorSDNode>(Cond);
+ // Check the mask for BLEND and build the value.
+ unsigned MaskValue = 0;
+ if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
+ return SDValue();
- unsigned BlendMask = 0;
+ // Convert i32 vectors to floating point if it is not AVX2.
+ // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors.
MVT BlendVT = VT;
- if (VT == MVT::v16i16) {
- // v16i16 blends are completely special. We can only do them when we have
- // a repeated blend across the two 128-bit halves and we have AVX2.
- if (!Subtarget->hasAVX2())
- return SDValue();
-
- for (int i = 0; i < 8; ++i) {
- SDValue Lo = CondBV->getOperand(i);
- SDValue Hi = CondBV->getOperand(i + 8);
- bool IsLoZero = X86::isZeroNode(Lo);
- bool IsHiZero = X86::isZeroNode(Hi);
- if (Lo->getOpcode() != ISD::UNDEF && Hi->getOpcode() != ISD::UNDEF &&
- IsLoZero != IsHiZero)
- // Asymmetric blends, bail.
- return SDValue();
- BlendMask |= (unsigned)(IsLoZero || IsHiZero) << i;
- }
- } else {
- // Everything else uses a generic blend mask computation with a custom type.
- if (VT.isInteger()) {
- if (VT.is256BitVector()) {
- // The 256-bit integer blend instructions are only available on AVX2.
- if (!Subtarget->hasAVX2())
- return SDValue();
-
- // We do the blend on v8i32 for 256-bit integer types.
- BlendVT = MVT::v8i32;
- } else {
- // For 128-bit vectors we do the blend on v8i16 types.
- BlendVT = MVT::v8i16;
- }
- }
- assert(BlendVT.getVectorNumElements() <= 8 &&
- "Cannot blend more than 8 elements with an immediate!");
- // Scale the blend mask based on the number of elements in the selected
- // blend type.
- int Scale = BlendVT.getVectorNumElements() / VT.getVectorNumElements();
- for (int i = 0, e = CondBV->getNumOperands(); i < e; ++i) {
- SDValue CondElement = CondBV->getOperand(i);
- if (CondElement->getOpcode() != ISD::UNDEF &&
- X86::isZeroNode(CondElement))
- for (int j = 0; j < Scale; ++j)
- BlendMask |= 1u << (i * Scale + j);
- }
+ if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
+ BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()),
+ NumElems);
+ LHS = DAG.getNode(ISD::BITCAST, dl, VT, LHS);
+ RHS = DAG.getNode(ISD::BITCAST, dl, VT, RHS);
}
- LHS = DAG.getNode(ISD::BITCAST, DL, BlendVT, LHS);
- RHS = DAG.getNode(ISD::BITCAST, DL, BlendVT, RHS);
-
- return DAG.getNode(ISD::BITCAST, DL, VT,
- DAG.getNode(X86ISD::BLENDI, DL, BlendVT, LHS, RHS,
- DAG.getConstant(BlendMask, MVT::i8)));
+ SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, LHS, RHS,
+ DAG.getConstant(MaskValue, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
}
SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
In, DAG.getUNDEF(SVT)));
}
-// The only differences between FABS and FNEG are the mask and the logic op.
+/// The only differences between FABS and FNEG are the mask and the logic op.
+/// FNEG also has a folding opportunity for FNEG(FABS(x)).
static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
"Wrong opcode for lowering FABS or FNEG.");
bool IsFABS = (Op.getOpcode() == ISD::FABS);
+
+ // If this is a FABS and it has an FNEG user, bail out to fold the combination
+ // into an FNABS. We'll lower the FABS after that if it is still in use.
+ if (IsFABS)
+ for (SDNode *User : Op->uses())
+ if (User->getOpcode() == ISD::FNEG)
+ return Op;
+
+ SDValue Op0 = Op.getOperand(0);
+ bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
+
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
// Assume scalar op for initialization; update for vector if needed.
// For a vector, cast operands to a vector type, perform the logic op,
// and cast the result back to the original value type.
MVT VecVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
- SDValue Op0Casted = DAG.getNode(ISD::BITCAST, dl, VecVT, Op.getOperand(0));
SDValue MaskCasted = DAG.getNode(ISD::BITCAST, dl, VecVT, Mask);
- unsigned LogicOp = IsFABS ? ISD::AND : ISD::XOR;
+ SDValue Operand = IsFNABS ?
+ DAG.getNode(ISD::BITCAST, dl, VecVT, Op0.getOperand(0)) :
+ DAG.getNode(ISD::BITCAST, dl, VecVT, Op0);
+ unsigned BitOp = IsFABS ? ISD::AND : IsFNABS ? ISD::OR : ISD::XOR;
return DAG.getNode(ISD::BITCAST, dl, VT,
- DAG.getNode(LogicOp, dl, VecVT, Op0Casted, MaskCasted));
+ DAG.getNode(BitOp, dl, VecVT, Operand, MaskCasted));
}
+
// If not vector, then scalar.
- unsigned LogicOp = IsFABS ? X86ISD::FAND : X86ISD::FXOR;
- return DAG.getNode(LogicOp, dl, VT, Op.getOperand(0), Mask);
+ unsigned BitOp = IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
+ SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
+ return DAG.getNode(BitOp, dl, VT, Operand, Mask);
}
static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, VT));
}
-// LowerVectorAllZeroTest - Check whether an OR'd tree is PTEST-able.
-//
+// Check whether an OR'd tree is PTEST-able.
static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
}
+/// Emit an X86ISD::FRSQRT estimate node for \p Op, or an empty SDValue when no estimate is produced. The minimum architected relative accuracy is 2^-12. We need one
+/// Newton-Raphson step to have a good float result (24 bits of precision).
+SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op,
+ DAGCombinerInfo &DCI,
+ unsigned &RefinementSteps, // out: set to 1 when an estimate is returned
+ bool &UseOneConstNR) const { // out: set to false when an estimate is returned
+ // FIXME: We should use instruction latency models to calculate the cost of
+ // each potential sequence, but this is very hard to do reliably because
+ // at least Intel's Core* chips have variable timing based on the number of
+ // significant digits in the divisor and/or sqrt operand.
+ if (!Subtarget->useSqrtEst())
+ return SDValue(); // subtarget does not ask for sqrt estimates
+
+ EVT VT = Op.getValueType();
+
+ // SSE1 has rsqrtss and rsqrtps.
+ // TODO: Add support for AVX512 (v16f32).
+ // It is likely not profitable to do this for f64 because a double-precision
+ // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
+ // instructions: convert to single, rsqrtss, convert back to double, refine
+ // (3 steps = at least 13 insts). If an 'rsqrtsd' variant were added to the ISA
+ // along with FMA, this could be a throughput win.
+ if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
+ (Subtarget->hasAVX() && VT == MVT::v8f32)) { // v8f32 additionally requires AVX
+ RefinementSteps = 1;
+ UseOneConstNR = false;
+ return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
+ }
+ return SDValue(); // any other type/feature combination: no estimate
+}
+
static bool isAllOnes(SDValue V) {
ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
return C && C->isAllOnesValue();
return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops);
}
-static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, SelectionDAG &DAG) {
+static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
MVT VT = Op->getSimpleValueType(0);
SDValue In = Op->getOperand(0);
MVT InVT = In.getSimpleValueType();
+ MVT VTElt = VT.getVectorElementType();
+ MVT InVTElt = InVT.getVectorElementType();
SDLoc dl(Op);
+ // SKX processor
+ if ((InVTElt == MVT::i1) &&
+ (((Subtarget->hasBWI() && Subtarget->hasVLX() &&
+ VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() <= 16)) ||
+
+ ((Subtarget->hasBWI() && VT.is512BitVector() &&
+ VTElt.getSizeInBits() <= 16)) ||
+
+ ((Subtarget->hasDQI() && Subtarget->hasVLX() &&
+ VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() >= 32)) ||
+
+ ((Subtarget->hasDQI() && VT.is512BitVector() &&
+ VTElt.getSizeInBits() >= 32))))
+ return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
+
unsigned int NumElts = VT.getVectorNumElements();
+
if (NumElts != 8 && NumElts != 16)
return SDValue();
SDLoc dl(Op);
if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
- return LowerSIGN_EXTEND_AVX512(Op, DAG);
+ return LowerSIGN_EXTEND_AVX512(Op, Subtarget, DAG);
if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
(VT != MVT::v8i32 || InVT != MVT::v8i16) &&
return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
}
-/// \brief Return (vselect \p Mask, \p Op, \p PreservedSrc) along with the
+/// \brief Return (and \p Op, \p Mask) for compare instructions or
+/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
/// necessary casting for \p Mask when lowering masking intrinsics.
/// \p Op is returned unchanged when \p Mask is an all-ones constant.
static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
SDValue PreservedSrc, SelectionDAG &DAG) {
EVT VT = Op.getValueType();
EVT MaskVT = EVT::getVectorVT(*DAG.getContext(),
MVT::i1, VT.getVectorNumElements()); // one i1 per element of Op
+ EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+ Mask.getValueType().getSizeInBits()); // one i1 per bit of the incoming mask value
SDLoc dl(Op);
assert(MaskVT.isSimple() && "invalid mask type");
- return DAG.getNode(ISD::VSELECT, dl, VT,
- DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask),
- Op, PreservedSrc);
+ // A constant all-ones mask keeps every element of Op, so no masking node is built.
+ if (isAllOnes(Mask))
+ return Op;
+
+ // The mask is first bitcast to BitcastVT. When MaskVT is narrower (e.g. v2i1
+ // or v4i1), the low 2 or 4 elements are extracted by EXTRACT_SUBVECTOR.
+ SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
+ DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
+ DAG.getIntPtrConstant(0));
+ // Compare opcodes are masked with an AND instead of a VSELECT; PreservedSrc is not used for them.
+ switch (Op.getOpcode()) {
+ default: break;
+ case X86ISD::PCMPEQM:
+ case X86ISD::PCMPGTM:
+ case X86ISD::CMPM:
+ case X86ISD::CMPMU:
+ return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
+ }
+ // Every other opcode: select between Op and PreservedSrc under VMask.
+ return DAG.getNode(ISD::VSELECT, dl, VT, VMask, Op, PreservedSrc);
}
static unsigned getOpcodeForFMAIntrinsic(unsigned IntNo) {
case INTR_TYPE_3OP:
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
Op.getOperand(2), Op.getOperand(3));
+ case CMP_MASK:
+ case CMP_MASK_CC: {
+ // Comparison intrinsics with masks.
+ // Example of transformation:
+ // (i8 (int_x86_avx512_mask_pcmpeq_q_128
+ // (v2i64 %a), (v2i64 %b), (i8 %mask))) ->
+ // (i8 (bitcast
+ // (v8i1 (insert_subvector undef,
+ // (v2i1 (and (PCMPEQM %a, %b),
+ // (extract_subvector
+ // (v8i1 (bitcast %mask)), 0))), 0))))
+ EVT VT = Op.getOperand(1).getValueType();
+ EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+ VT.getVectorNumElements());
+ SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3);
+ EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+ Mask.getValueType().getSizeInBits());
+ SDValue Cmp;
+ if (IntrData->Type == CMP_MASK_CC) {
+ Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
+ Op.getOperand(2), Op.getOperand(3));
+ } else {
+ assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!");
+ Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
+ Op.getOperand(2));
+ }
+ SDValue CmpMask = getVectorMaskingNode(Cmp, Mask,
+ DAG.getTargetConstant(0, MaskVT), DAG);
+ SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
+ DAG.getUNDEF(BitcastVT), CmpMask,
+ DAG.getIntPtrConstant(0));
+ return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
+ }
case COMI: { // Comparison intrinsics
ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
SDValue LHS = Op.getOperand(1);
Cond = X86::COND_B;
break;
case ISD::SMULO:
- BaseOp = X86ISD::SMUL;
+ BaseOp = N->getValueType(0) == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL;
Cond = X86::COND_O;
break;
case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
+ if (N->getValueType(0) == MVT::i8) {
+ BaseOp = X86ISD::UMUL8;
+ Cond = X86::COND_O;
+ break;
+ }
SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
MVT::i32);
SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);
TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
DCI.isBeforeLegalizeOps());
if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) ||
- TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne, TLO))
+ (TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne,
+ TLO) &&
+ // Don't optimize vector of constants. Those are handled by
+ // the generic code and all the bits must be properly set for
+ // the generic optimizer.
+ !ISD::isBuildVectorOfConstantSDNodes(TLO.New.getNode())))
DCI.CommitTargetLoweringOpt(TLO);
}
/// performVZEXTCombine - Performs X86ISD::VZEXT combines on \p N: folds a
/// nested vzext reached through bitcasts, and bypasses an extract/re-insert of
/// element 0 feeding the vzext. Returns an empty SDValue when nothing applies.
static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget *Subtarget) {
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget *Subtarget) {
+ SDLoc DL(N);
+ MVT VT = N->getSimpleValueType(0);
+ SDValue Op = N->getOperand(0);
+ MVT OpVT = Op.getSimpleValueType();
+ MVT OpEltVT = OpVT.getVectorElementType();
+ unsigned InputBits = OpEltVT.getSizeInBits() * VT.getVectorNumElements(); // Op element width times the result element count
+
// (vzext (bitcast (vzext x))) -> (vzext x)
- SDValue In = N->getOperand(0);
- while (In.getOpcode() == ISD::BITCAST)
- In = In.getOperand(0);
+ SDValue V = Op;
+ while (V.getOpcode() == ISD::BITCAST)
+ V = V.getOperand(0);
- if (In.getOpcode() != X86ISD::VZEXT)
- return SDValue();
+ if (V != Op && V.getOpcode() == X86ISD::VZEXT) { // V != Op: at least one bitcast was peeled
+ MVT InnerVT = V.getSimpleValueType();
+ MVT InnerEltVT = InnerVT.getVectorElementType();
+
+ // If the element sizes match exactly, we can just do one larger vzext. This
+ // is always an exact type match as vzext operates on integer types.
+ if (OpEltVT == InnerEltVT) {
+ assert(OpVT == InnerVT && "Types must match for vzext!");
+ return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0));
+ }
+
+ // The only other way we can combine them is if only a single element of the
+ // inner vzext is used in the input to the outer vzext.
+ if (InnerEltVT.getSizeInBits() < InputBits)
+ return SDValue();
- return DAG.getNode(X86ISD::VZEXT, SDLoc(N), N->getValueType(0),
- In.getOperand(0));
+ // In this case, the inner vzext is completely dead because we're going to
+ // only look at bits inside of the low element. Just do the outer vzext on
+ // a bitcast of the input to the inner.
+ return DAG.getNode(X86ISD::VZEXT, DL, VT,
+ DAG.getNode(ISD::BITCAST, DL, OpVT, V)); // NOTE(review): this bitcasts V (the inner vzext itself), not V.getOperand(0) as the comment above describes - confirm which is intended
+ }
+
+ // Check if we can bypass extracting and re-inserting an element of an input
+ // vector. Essentially:
+ // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
+ if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
+ V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) { // the extracted scalar must be exactly InputBits wide
+ SDValue ExtractedV = V.getOperand(0);
+ SDValue OrigV = ExtractedV.getOperand(0);
+ if (auto *ExtractIdx = dyn_cast<ConstantSDNode>(ExtractedV.getOperand(1)))
+ if (ExtractIdx->getZExtValue() == 0) { // only a constant index of 0 is handled
+ MVT OrigVT = OrigV.getSimpleValueType();
+ // Extract a subvector if necessary...
+ if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) {
+ int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits();
+ OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(),
+ OrigVT.getVectorNumElements() / Ratio);
+ OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV,
+ DAG.getIntPtrConstant(0));
+ }
+ Op = DAG.getNode(ISD::BITCAST, DL, OpVT, OrigV);
+ return DAG.getNode(X86ISD::VZEXT, DL, VT, Op);
+ }
+ }
+
+ return SDValue();
}
SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,