setOperationAction(ISD::XOR, MVT::v4i32, Legal);
}
- // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion
- // of this type with custom code.
- for (MVT VT : MVT::vector_valuetypes())
- setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Custom);
-
// We want to custom lower some of our intrinsics.
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
setTargetDAGCombine(ISD::MUL);
setTargetDAGCombine(ISD::XOR);
- computeRegisterProperties();
+ computeRegisterProperties(Subtarget->getRegisterInfo());
// On Darwin, -Os means optimize for size without hurting performance, so
// do not reduce the limit.
return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
}
-// FIXME: Why this routine is here? Move to RegInfo!
-std::pair<const TargetRegisterClass*, uint8_t>
-X86TargetLowering::findRepresentativeClass(MVT VT) const{
+std::pair<const TargetRegisterClass *, uint8_t>
+X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
+ MVT VT) const {
const TargetRegisterClass *RRC = nullptr;
uint8_t Cost = 1;
switch (VT.SimpleTy) {
default:
- return TargetLowering::findRepresentativeClass(VT);
+ return TargetLowering::findRepresentativeClass(TRI, VT);
case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
RRC = Subtarget->is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
break;
// Find the first unallocated argument registers.
ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
- unsigned NumIntRegs =
- CCInfo.getFirstUnallocated(ArgGPRs.data(), ArgGPRs.size());
- unsigned NumXMMRegs =
- CCInfo.getFirstUnallocated(ArgXMMs.data(), ArgXMMs.size());
+ unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
+ unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
"SSE register cannot be used when SSE is disabled!");
X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
};
- unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
+ unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
assert((Subtarget->hasSSE1() || !NumXMMRegs)
&& "SSE registers cannot be used when SSE is disabled");
bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC);
bool IsCallerWin64 = Subtarget->isCallingConvWin64(CallerCC);
+ // Win64 functions have extra shadow space for argument homing. Don't do the
+ // sibcall if the caller and callee have mismatched expectations for this
+ // space.
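+ // (For example, a Win64 caller reserves 32 bytes of shadow space on the
+ // stack for the callee, which a SysV function knows nothing about.)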
+ if (IsCalleeWin64 != IsCallerWin64)
+ return false;
+
if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
if (IsTailCallConvention(CalleeCC) && CCMatch)
return true;
return true;
}
-/// \brief Base case helper for testing a single mask element.
-static bool isShuffleEquivalentImpl(SDValue V1, SDValue V2,
- BuildVectorSDNode *BV1,
- BuildVectorSDNode *BV2, ArrayRef<int> Mask,
- int i, int Arg) {
- int Size = Mask.size();
- if (Mask[i] != -1 && Mask[i] != Arg) {
- auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
- auto *ArgsBV = Arg < Size ? BV1 : BV2;
- if (!MaskBV || !ArgsBV ||
- MaskBV->getOperand(Mask[i] % Size) != ArgsBV->getOperand(Arg % Size))
- return false;
- }
- return true;
-}
-
-/// \brief Recursive helper to peel off and test each mask element.
-template <typename... Ts>
-static bool isShuffleEquivalentImpl(SDValue V1, SDValue V2,
- BuildVectorSDNode *BV1,
- BuildVectorSDNode *BV2, ArrayRef<int> Mask,
- int i, int Arg, Ts... Args) {
- if (!isShuffleEquivalentImpl(V1, V2, BV1, BV2, Mask, i, Arg))
- return false;
-
- return isShuffleEquivalentImpl(V1, V2, BV1, BV2, Mask, i + 1, Args...);
-}
-
/// \brief Checks whether a shuffle mask is equivalent to an explicit list of
/// arguments.
///
/// This is a fast way to test a shuffle mask against a fixed pattern:
///
-/// if (isShuffleEquivalent(Mask, 3, 2, 1, 0)) { ... }
+/// if (isShuffleEquivalent(Mask, {3, 2, 1, 0})) { ... }
///
/// It returns true if the mask is exactly as wide as the argument list, and
/// each element of the mask is either -1 (signifying undef) or the value given
/// in the argument.
-template <typename... Ts>
static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
- Ts... Args) {
- if (Mask.size() != sizeof...(Args))
+ ArrayRef<int> ExpectedMask) {
+ if (Mask.size() != ExpectedMask.size())
return false;
+ int Size = Mask.size();
+
// If the values are build vectors, we can look through them to find
// equivalent inputs that make the shuffles equivalent.
auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);
- // Recursively peel off arguments and test them against the mask.
- return isShuffleEquivalentImpl(V1, V2, BV1, BV2, Mask, 0, Args...);
+ for (int i = 0; i < Size; ++i)
+ if (Mask[i] != -1 && Mask[i] != ExpectedMask[i]) {
+ auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
+ auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
+ if (!MaskBV || !ExpectedBV ||
+ MaskBV->getOperand(Mask[i] % Size) !=
+ ExpectedBV->getOperand(ExpectedMask[i] % Size))
+ return false;
+ }
+
+ return true;
}
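// For instance, with ExpectedMask = {0, 2, 1, 3}, the mask {0, -1, 1, 3} is
// equivalent: the -1 element is undef and matches anything, and the rest
// match exactly. With build_vector inputs, Mask[i] == 0 can even match
// ExpectedMask[i] == 4 when operand 0 of V1 and operand 0 of V2 are the same
// scalar SDValue.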
/// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
// FALLTHROUGH
case MVT::v16i8:
case MVT::v32i8: {
+ assert((VT.getSizeInBits() == 128 || Subtarget->hasAVX2()) &&
+ "256-bit byte-blends require AVX2 support!");
+
// Scale the blend by the number of bytes per element.
int Scale = VT.getScalarSizeInBits() / 8;
/// This is a common pattern that we have especially efficient patterns to lower
/// across all subtarget feature sets.
static SDValue lowerVectorShuffleAsElementInsertion(
- MVT VT, SDLoc DL, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+ SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
const X86Subtarget *Subtarget, SelectionDAG &DAG) {
SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
MVT ExtVT = VT;
/// For convenience, this code also bundles all of the subtarget feature set
/// filtering. While a little annoying to re-dispatch on type here, there isn't
/// a convenient way to factor it out.
-static SDValue lowerVectorShuffleAsBroadcast(MVT VT, SDLoc DL, SDValue V,
+static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V,
ArrayRef<int> Mask,
const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
/// because for floating point vectors we have a generalized SHUFPS lowering
/// strategy that handles everything that doesn't *exactly* match an unpack,
/// making this clever lowering unnecessary.
-static SDValue lowerVectorShuffleAsUnpack(MVT VT, SDLoc DL, SDValue V1,
+static SDValue lowerVectorShuffleAsUnpack(SDLoc DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
SelectionDAG &DAG) {
assert(!VT.isFloatingPoint() &&
if (isSingleInputShuffleMask(Mask)) {
// Use low duplicate instructions for masks that match their pattern.
if (Subtarget->hasSSE3())
- if (isShuffleEquivalent(V1, V2, Mask, 0, 0))
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 0}))
return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, V1);
// Straight shuffle of a single input vector. Simulate this by using the
// If we have a single input, insert that into V1 if we can do so cheaply.
if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
- MVT::v2f64, DL, V1, V2, Mask, Subtarget, DAG))
+ DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG))
return Insertion;
// Try inverting the insertion since for v2 masks it is easy to do and we
// can't reliably sort the mask one way or the other.
int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
- MVT::v2f64, DL, V2, V1, InverseMask, Subtarget, DAG))
+ DL, MVT::v2f64, V2, V1, InverseMask, Subtarget, DAG))
return Insertion;
}
// Try to use one of the special instruction patterns to handle two common
// blend patterns if a zero-blend above didn't work.
- if (isShuffleEquivalent(V1, V2, Mask, 0, 3) || isShuffleEquivalent(V1, V2, Mask, 1, 3))
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
+ isShuffleEquivalent(V1, V2, Mask, {1, 3}))
if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
// We can either use a special instruction to load over the low double or
// to move just the low double.
return Blend;
// Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(V1, V2, Mask, 0, 2))
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 2}))
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2f64, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask, 1, 3))
+ if (isShuffleEquivalent(V1, V2, Mask, {1, 3}))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2f64, V1, V2);
unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
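// E.g. the mask {1, 3} produces SHUFPDMask == 3: take the high element of V1
// for lane 0 and the high element of V2 for lane 1.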
if (isSingleInputShuffleMask(Mask)) {
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v2i64, DL, V1,
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v2i64, V1,
Mask, Subtarget, DAG))
return Broadcast;
// When loading a scalar and then shuffling it into a vector we can often do
// the insertion cheaply.
if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
- MVT::v2i64, DL, V1, V2, Mask, Subtarget, DAG))
+ DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
return Insertion;
// Try inverting the insertion since for v2 masks it is easy to do and we
// can't reliably sort the mask one way or the other.
int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
- MVT::v2i64, DL, V2, V1, InverseMask, Subtarget, DAG))
+ DL, MVT::v2i64, V2, V1, InverseMask, Subtarget, DAG))
return Insertion;
// We have different paths for blend lowering, but they all must use the
return Blend;
// Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(V1, V2, Mask, 0, 2))
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 2}))
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask, 1, 3))
+ if (isShuffleEquivalent(V1, V2, Mask, {1, 3}))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2i64, V1, V2);
// Try to use byte rotation instructions.
if (NumV2Elements == 0) {
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4f32, DL, V1,
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4f32, V1,
Mask, Subtarget, DAG))
return Broadcast;
// Use even/odd duplicate instructions for masks that match their pattern.
if (Subtarget->hasSSE3()) {
- if (isShuffleEquivalent(V1, V2, Mask, 0, 0, 2, 2))
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
- if (isShuffleEquivalent(V1, V2, Mask, 1, 1, 3, 3))
+ if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3}))
return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
}
// when the V2 input is targeting element 0 of the mask -- that is the fast
// case here.
if (NumV2Elements == 1 && Mask[0] >= 4)
- if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4f32, DL, V1, V2,
+ if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v4f32, V1, V2,
Mask, Subtarget, DAG))
return V;
}
// Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(V1, V2, Mask, 0, 4, 1, 5))
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 4, 1, 5}))
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask, 2, 6, 3, 7))
+ if (isShuffleEquivalent(V1, V2, Mask, {2, 6, 3, 7}))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask, 4, 0, 5, 1))
+ if (isShuffleEquivalent(V1, V2, Mask, {4, 0, 5, 1}))
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V2, V1);
- if (isShuffleEquivalent(V1, V2, Mask, 6, 2, 7, 3))
+ if (isShuffleEquivalent(V1, V2, Mask, {6, 2, 7, 3}))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V2, V1);
// Otherwise fall back to a SHUFPS lowering strategy.
if (NumV2Elements == 0) {
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4i32, DL, V1,
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i32, V1,
Mask, Subtarget, DAG))
return Broadcast;
// so prevents folding a load into this instruction or making a copy.
const int UnpackLoMask[] = {0, 0, 1, 1};
const int UnpackHiMask[] = {2, 2, 3, 3};
- if (isShuffleEquivalent(V1, V2, Mask, 0, 0, 1, 1))
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1}))
Mask = UnpackLoMask;
- else if (isShuffleEquivalent(V1, V2, Mask, 2, 2, 3, 3))
+ else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3}))
Mask = UnpackHiMask;
return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
// There are special ways we can lower some single-element blends.
if (NumV2Elements == 1)
- if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4i32, DL, V1, V2,
+ if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v4i32, V1, V2,
Mask, Subtarget, DAG))
return V;
return Masked;
// Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(V1, V2, Mask, 0, 4, 1, 5))
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 4, 1, 5}))
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask, 2, 6, 3, 7))
+ if (isShuffleEquivalent(V1, V2, Mask, {2, 6, 3, 7}))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask, 4, 0, 5, 1))
+ if (isShuffleEquivalent(V1, V2, Mask, {4, 0, 5, 1}))
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V2, V1);
- if (isShuffleEquivalent(V1, V2, Mask, 6, 2, 7, 3))
+ if (isShuffleEquivalent(V1, V2, Mask, {6, 2, 7, 3}))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V2, V1);
// Try to use byte rotation instructions.
// Try to lower by permuting the inputs into an unpack instruction.
if (SDValue Unpack =
- lowerVectorShuffleAsUnpack(MVT::v4i32, DL, V1, V2, Mask, DAG))
+ lowerVectorShuffleAsUnpack(DL, MVT::v4i32, V1, V2, Mask, DAG))
return Unpack;
// We implement this with SHUFPS because it can blend from two vectors.
/// The exact breakdown of how to form these dword pairs and align them on the
/// correct sides is really tricky. See the comments within the function for
/// more of the details.
-static SDValue lowerV8I16SingleInputVectorShuffle(
- SDLoc DL, SDValue V, MutableArrayRef<int> Mask,
+///
+/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
+/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
+/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
+/// vector, form the analogous 128-bit 8-element Mask.
+static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
+ SDLoc DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
const X86Subtarget *Subtarget, SelectionDAG &DAG) {
- assert(V.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
+ assert(VT.getScalarType() == MVT::i16 && "Bad input type!");
+ MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
+
+ assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
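+ // For example, a lane-repeating v16i16 mask such as
+ // <1,0,3,2,5,4,7,6, 9,8,11,10,13,12,15,14> is handled by passing in the
+ // collapsed 8-element mask <1,0,3,2,5,4,7,6>.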
MutableArrayRef<int> LoMask = Mask.slice(0, 4);
MutableArrayRef<int> HiMask = Mask.slice(4, 4);
MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
- // Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8i16, DL, V,
- Mask, Subtarget, DAG))
- return Broadcast;
-
- // Try to use shift instructions.
- if (SDValue Shift =
- lowerVectorShuffleAsShift(DL, MVT::v8i16, V, V, Mask, DAG))
- return Shift;
-
- // Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(V, V, Mask, 0, 0, 1, 1, 2, 2, 3, 3))
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V, V);
- if (isShuffleEquivalent(V, V, Mask, 4, 4, 5, 5, 6, 6, 7, 7))
- return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V, V);
-
- // Try to use byte rotation instructions.
- if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
- DL, MVT::v8i16, V, V, Mask, Subtarget, DAG))
- return Rotate;
-
// Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
// such inputs we can swap two of the dwords across the half mark and end up
// with <=2 inputs to each half in each half. Once there, we can fall through
int PSHUFDMask[] = {0, 1, 2, 3};
PSHUFDMask[ADWord] = BDWord;
PSHUFDMask[BDWord] = ADWord;
- V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
- DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
- DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V),
+ V = DAG.getNode(ISD::BITCAST, DL, VT,
+ DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT,
+ DAG.getNode(ISD::BITCAST, DL, PSHUFDVT, V),
getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
// Adjust the mask to match the new locations of A and B.
// Recurse back into this routine to re-compute state now that this isn't
// a 3 and 1 problem.
- return DAG.getVectorShuffle(MVT::v8i16, DL, V, DAG.getUNDEF(MVT::v8i16),
- Mask);
+ return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V, Mask, Subtarget,
+ DAG);
};
if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
// Now enact all the shuffles we've computed to move the inputs into their
// target half.
if (!isNoopShuffleMask(PSHUFLMask))
- V = DAG.getNode(X86ISD::PSHUFLW, DL, MVT::v8i16, V,
+ V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
getV4X86ShuffleImm8ForMask(PSHUFLMask, DAG));
if (!isNoopShuffleMask(PSHUFHMask))
- V = DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, V,
+ V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
getV4X86ShuffleImm8ForMask(PSHUFHMask, DAG));
if (!isNoopShuffleMask(PSHUFDMask))
- V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
- DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
- DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V),
+ V = DAG.getNode(ISD::BITCAST, DL, VT,
+ DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT,
+ DAG.getNode(ISD::BITCAST, DL, PSHUFDVT, V),
getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
// At this point, each half should contain all its inputs, and we can then
// Do a half shuffle for the low mask.
if (!isNoopShuffleMask(LoMask))
- V = DAG.getNode(X86ISD::PSHUFLW, DL, MVT::v8i16, V,
+ V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
getV4X86ShuffleImm8ForMask(LoMask, DAG));
// Do a half shuffle with the high mask after shifting its values down.
if (M >= 0)
M -= 4;
if (!isNoopShuffleMask(HiMask))
- V = DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, V,
+ V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
getV4X86ShuffleImm8ForMask(HiMask, DAG));
return V;
int NumV2Inputs = std::count_if(Mask.begin(), Mask.end(), isV2);
- if (NumV2Inputs == 0)
- return lowerV8I16SingleInputVectorShuffle(DL, V1, Mask, Subtarget, DAG);
+ if (NumV2Inputs == 0) {
+ // Check for being able to broadcast a single element.
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i16, V1,
+ Mask, Subtarget, DAG))
+ return Broadcast;
+
+ // Try to use shift instructions.
+ if (SDValue Shift =
+ lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, DAG))
+ return Shift;
+
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (isShuffleEquivalent(V1, V1, Mask, {0, 0, 1, 1, 2, 2, 3, 3}))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V1, V1);
+ if (isShuffleEquivalent(V1, V1, Mask, {4, 4, 5, 5, 6, 6, 7, 7}))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V1, V1);
+
+ // Try to use byte rotation instructions.
+ if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1,
+ Mask, Subtarget, DAG))
+ return Rotate;
+
+ return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1, Mask,
+ Subtarget, DAG);
+ }
assert(std::any_of(Mask.begin(), Mask.end(), isV1) &&
"All single-input shuffles should be canonicalized to be V1-input "
// There are special ways we can lower some single-element blends.
if (NumV2Inputs == 1)
- if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v8i16, DL, V1, V2,
+ if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v8i16, V1, V2,
Mask, Subtarget, DAG))
return V;
return Masked;
// Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(V1, V2, Mask, 0, 8, 1, 9, 2, 10, 3, 11))
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 1, 9, 2, 10, 3, 11}))
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask, 4, 12, 5, 13, 6, 14, 7, 15))
+ if (isShuffleEquivalent(V1, V2, Mask, {4, 12, 5, 13, 6, 14, 7, 15}))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V1, V2);
// Try to use byte rotation instructions.
return BitBlend;
if (SDValue Unpack =
- lowerVectorShuffleAsUnpack(MVT::v8i16, DL, V1, V2, Mask, DAG))
+ lowerVectorShuffleAsUnpack(DL, MVT::v8i16, V1, V2, Mask, DAG))
return Unpack;
// If we can't directly blend but can use PSHUFB, that will be better as it
// For single-input shuffles, there are some nicer lowering tricks we can use.
if (NumV2Elements == 0) {
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v16i8, DL, V1,
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i8, V1,
Mask, Subtarget, DAG))
return Broadcast;
}
// Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(V1, V2, Mask,
- 0, 16, 1, 17, 2, 18, 3, 19,
- 4, 20, 5, 21, 6, 22, 7, 23))
+ if (isShuffleEquivalent(V1, V2, Mask, {// Low half.
+ 0, 16, 1, 17, 2, 18, 3, 19,
+ // High half.
+ 4, 20, 5, 21, 6, 22, 7, 23}))
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask,
- 8, 24, 9, 25, 10, 26, 11, 27,
- 12, 28, 13, 29, 14, 30, 15, 31))
+ if (isShuffleEquivalent(V1, V2, Mask, {// Low half.
+ 8, 24, 9, 25, 10, 26, 11, 27,
+ // High half.
+ 12, 28, 13, 29, 14, 30, 15, 31}))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V1, V2);
// Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
// shuffles will both be pshufb, in which case we shouldn't bother with
// this.
if (SDValue Unpack =
- lowerVectorShuffleAsUnpack(MVT::v16i8, DL, V1, V2, Mask, DAG))
+ lowerVectorShuffleAsUnpack(DL, MVT::v16i8, V1, V2, Mask, DAG))
return Unpack;
}
// There are special ways we can lower some single-element blends.
if (NumV2Elements == 1)
- if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v16i8, DL, V1, V2,
+ if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v16i8, V1, V2,
Mask, Subtarget, DAG))
return V;
VT.getVectorNumElements() / 2);
// Check for patterns which can be matched with a single insert of a 128-bit
// subvector.
- if (isShuffleEquivalent(V1, V2, Mask, 0, 1, 0, 1) ||
- isShuffleEquivalent(V1, V2, Mask, 0, 1, 4, 5)) {
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1}) ||
+ isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {
SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
DAG.getIntPtrConstant(0));
SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
Mask[2] < 4 ? V1 : V2, DAG.getIntPtrConstant(0));
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
}
- if (isShuffleEquivalent(V1, V2, Mask, 0, 1, 6, 7)) {
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 6, 7})) {
SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
DAG.getIntPtrConstant(0));
SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
if (isSingleInputShuffleMask(Mask)) {
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4f64, DL, V1,
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4f64, V1,
Mask, Subtarget, DAG))
return Broadcast;
// Use low duplicate instructions for masks that match their pattern.
- if (isShuffleEquivalent(V1, V2, Mask, 0, 0, 2, 2))
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
// X86 has dedicated unpack instructions that can handle specific blend
// operations: UNPCKH and UNPCKL.
- if (isShuffleEquivalent(V1, V2, Mask, 0, 4, 2, 6))
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 4, 2, 6}))
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f64, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask, 1, 5, 3, 7))
+ if (isShuffleEquivalent(V1, V2, Mask, {1, 5, 3, 7}))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask, 4, 0, 6, 2))
+ if (isShuffleEquivalent(V1, V2, Mask, {4, 0, 6, 2}))
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f64, V2, V1);
- if (isShuffleEquivalent(V1, V2, Mask, 5, 1, 7, 3))
+ if (isShuffleEquivalent(V1, V2, Mask, {5, 1, 7, 3}))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V2, V1);
// If we have a single input to the zero element, insert that into V1 if we
std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
if (NumV2Elements == 1 && Mask[0] >= 4)
if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
- MVT::v4f64, DL, V1, V2, Mask, Subtarget, DAG))
+ DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
return Insertion;
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
return Blend;
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4i64, DL, V1,
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1,
Mask, Subtarget, DAG))
return Broadcast;
return Shift;
// Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(V1, V2, Mask, 0, 4, 2, 6))
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 4, 2, 6}))
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask, 1, 5, 3, 7))
+ if (isShuffleEquivalent(V1, V2, Mask, {1, 5, 3, 7}))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask, 4, 0, 6, 2))
+ if (isShuffleEquivalent(V1, V2, Mask, {4, 0, 6, 2}))
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V2, V1);
- if (isShuffleEquivalent(V1, V2, Mask, 5, 1, 7, 3))
+ if (isShuffleEquivalent(V1, V2, Mask, {5, 1, 7, 3}))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V2, V1);
// Try to simplify this by merging 128-bit lanes to enable a lane-based
return Blend;
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8f32, DL, V1,
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1,
Mask, Subtarget, DAG))
return Broadcast;
"Repeated masks must be half the mask width!");
// Use even/odd duplicate instructions for masks that match their pattern.
- if (isShuffleEquivalent(V1, V2, Mask, 0, 0, 2, 2, 4, 4, 6, 6))
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
- if (isShuffleEquivalent(V1, V2, Mask, 1, 1, 3, 3, 5, 5, 7, 7))
+ if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3, 5, 5, 7, 7}))
return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
if (isSingleInputShuffleMask(Mask))
getV4X86ShuffleImm8ForMask(RepeatedMask, DAG));
// Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(V1, V2, Mask, 0, 8, 1, 9, 4, 12, 5, 13))
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 1, 9, 4, 12, 5, 13}))
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f32, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask, 2, 10, 3, 11, 6, 14, 7, 15))
+ if (isShuffleEquivalent(V1, V2, Mask, {2, 10, 3, 11, 6, 14, 7, 15}))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f32, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask, 8, 0, 9, 1, 12, 4, 13, 5))
+ if (isShuffleEquivalent(V1, V2, Mask, {8, 0, 9, 1, 12, 4, 13, 5}))
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f32, V2, V1);
- if (isShuffleEquivalent(V1, V2, Mask, 10, 2, 11, 3, 14, 6, 15, 7))
+ if (isShuffleEquivalent(V1, V2, Mask, {10, 2, 11, 3, 14, 6, 15, 7}))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f32, V2, V1);
// Otherwise, fall back to a SHUFPS sequence. Here it is important that we
return Blend;
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8i32, DL, V1,
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1,
Mask, Subtarget, DAG))
return Broadcast;
getV4X86ShuffleImm8ForMask(RepeatedMask, DAG));
// Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(V1, V2, Mask, 0, 8, 1, 9, 4, 12, 5, 13))
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 1, 9, 4, 12, 5, 13}))
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i32, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask, 2, 10, 3, 11, 6, 14, 7, 15))
+ if (isShuffleEquivalent(V1, V2, Mask, {2, 10, 3, 11, 6, 14, 7, 15}))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i32, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask, 8, 0, 9, 1, 12, 4, 13, 5))
+ if (isShuffleEquivalent(V1, V2, Mask, {8, 0, 9, 1, 12, 4, 13, 5}))
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i32, V2, V1);
- if (isShuffleEquivalent(V1, V2, Mask, 10, 2, 11, 3, 14, 6, 15, 7))
+ if (isShuffleEquivalent(V1, V2, Mask, {10, 2, 11, 3, 14, 6, 15, 7}))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i32, V2, V1);
}
return ZExt;
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v16i16, DL, V1,
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1,
Mask, Subtarget, DAG))
return Broadcast;
// Use dedicated unpack instructions for masks that match their pattern.
if (isShuffleEquivalent(V1, V2, Mask,
- // First 128-bit lane:
- 0, 16, 1, 17, 2, 18, 3, 19,
- // Second 128-bit lane:
- 8, 24, 9, 25, 10, 26, 11, 27))
+ {// First 128-bit lane:
+ 0, 16, 1, 17, 2, 18, 3, 19,
+ // Second 128-bit lane:
+ 8, 24, 9, 25, 10, 26, 11, 27}))
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i16, V1, V2);
if (isShuffleEquivalent(V1, V2, Mask,
- // First 128-bit lane:
- 4, 20, 5, 21, 6, 22, 7, 23,
- // Second 128-bit lane:
- 12, 28, 13, 29, 14, 30, 15, 31))
+ {// First 128-bit lane:
+ 4, 20, 5, 21, 6, 22, 7, 23,
+ // Second 128-bit lane:
+ 12, 28, 13, 29, 14, 30, 15, 31}))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i16, V1, V2);
// Try to use shift instructions.
return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
Mask, DAG);
+ SmallVector<int, 8> RepeatedMask;
+ if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
+ // As this is a single-input shuffle, the repeated mask should be
+ // a strictly valid v8i16 mask that we can pass through to the v8i16
+ // lowering to handle even the v16 case.
+ return lowerV8I16GeneralSingleInputVectorShuffle(
+ DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
+ }
+
SDValue PSHUFBMask[32];
for (int i = 0; i < 16; ++i) {
if (Mask[i] == -1) {
return ZExt;
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v32i8, DL, V1,
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1,
Mask, Subtarget, DAG))
return Broadcast;
// 256-bit lanes.
if (isShuffleEquivalent(
V1, V2, Mask,
- // First 128-bit lane:
- 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39,
- // Second 128-bit lane:
- 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55))
+ {// First 128-bit lane:
+ 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39,
+ // Second 128-bit lane:
+ 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55}))
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v32i8, V1, V2);
if (isShuffleEquivalent(
V1, V2, Mask,
- // First 128-bit lane:
- 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47,
- // Second 128-bit lane:
- 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63))
+ {// First 128-bit lane:
+ 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47,
+ // Second 128-bit lane:
+ 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63}))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v32i8, V1, V2);
// Try to use shift instructions.
// X86 has dedicated unpack instructions that can handle specific blend
// operations: UNPCKH and UNPCKL.
- if (isShuffleEquivalent(V1, V2, Mask, 0, 8, 2, 10, 4, 12, 6, 14))
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 2, 10, 4, 12, 6, 14}))
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f64, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask, 1, 9, 3, 11, 5, 13, 7, 15))
+ if (isShuffleEquivalent(V1, V2, Mask, {1, 9, 3, 11, 5, 13, 7, 15}))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f64, V1, V2);
// FIXME: Implement direct support for this type!
// Use dedicated unpack instructions for masks that match their pattern.
if (isShuffleEquivalent(V1, V2, Mask,
- 0, 16, 1, 17, 4, 20, 5, 21,
- 8, 24, 9, 25, 12, 28, 13, 29))
+ {// First 128-bit lane.
+ 0, 16, 1, 17, 4, 20, 5, 21,
+ // Second 128-bit lane.
+ 8, 24, 9, 25, 12, 28, 13, 29}))
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16f32, V1, V2);
if (isShuffleEquivalent(V1, V2, Mask,
- 2, 18, 3, 19, 6, 22, 7, 23,
- 10, 26, 11, 27, 14, 30, 15, 31))
+ {// First 128-bit lane.
+ 2, 18, 3, 19, 6, 22, 7, 23,
+ // Second 128-bit lane.
+ 10, 26, 11, 27, 14, 30, 15, 31}))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16f32, V1, V2);
// FIXME: Implement direct support for this type!
// X86 has dedicated unpack instructions that can handle specific blend
// operations: UNPCKH and UNPCKL.
- if (isShuffleEquivalent(V1, V2, Mask, 0, 8, 2, 10, 4, 12, 6, 14))
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 2, 10, 4, 12, 6, 14}))
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i64, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask, 1, 9, 3, 11, 5, 13, 7, 15))
+ if (isShuffleEquivalent(V1, V2, Mask, {1, 9, 3, 11, 5, 13, 7, 15}))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i64, V1, V2);
// FIXME: Implement direct support for this type!
// Use dedicated unpack instructions for masks that match their pattern.
if (isShuffleEquivalent(V1, V2, Mask,
- 0, 16, 1, 17, 4, 20, 5, 21,
- 8, 24, 9, 25, 12, 28, 13, 29))
+ {// First 128-bit lane.
+ 0, 16, 1, 17, 4, 20, 5, 21,
+ // Second 128-bit lane.
+ 8, 24, 9, 25, 12, 28, 13, 29}))
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i32, V1, V2);
if (isShuffleEquivalent(V1, V2, Mask,
- 2, 18, 3, 19, 6, 22, 7, 23,
- 10, 26, 11, 27, 14, 30, 15, 31))
+ {// First 128-bit lane.
+ 2, 18, 3, 19, 6, 22, 7, 23,
+ // Second 128-bit lane.
+ 10, 26, 11, 27, 14, 30, 15, 31}))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i32, V1, V2);
// FIXME: Implement direct support for this type!
"Cannot lower 512-bit vectors w/ basic ISA!");
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(VT.SimpleTy, DL, V1,
- Mask, Subtarget, DAG))
+ if (SDValue Broadcast =
+ lowerVectorShuffleAsBroadcast(DL, VT, V1, Mask, Subtarget, DAG))
return Broadcast;
// Dispatch to each element type for lowering. If we don't have support for
if (!Subtarget->hasSSE41())
return SDValue();
- // Some types for vselect were previously set to Expand, not Legal or
- // Custom. Return an empty SDValue so we fall-through to Expand, after
- // the Custom lowering phase.
- MVT VT = Op.getSimpleValueType();
- switch (VT.SimpleTy) {
+ // Only some types will be legal on some subtargets. If we can emit a legal
+ // VSELECT-matching blend, return Op, but if we need to expand, return
+ // a null value.
+ switch (Op.getSimpleValueType().SimpleTy) {
default:
- break;
+ // Most of the vector types have blends past SSE4.1.
+ return Op;
+
+ case MVT::v32i8:
+ // The byte blends for AVX vectors were introduced only in AVX2.
+ if (Subtarget->hasAVX2())
+ return Op;
+
+ return SDValue();
+
case MVT::v8i16:
case MVT::v16i16:
+ // AVX-512 BWI and VLX features support VSELECT with i16 elements.
if (Subtarget->hasBWI() && Subtarget->hasVLX())
- break;
+ return Op;
+
+ // FIXME: We should custom lower this by fixing the condition and using i8
+ // blends.
return SDValue();
}
-
- // We couldn't create a "Blend with immediate" node.
- // This node should still be legal, but we'll have to emit a blendv*
- // instruction.
- return Op;
}
static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
SDValue Src2 = Op.getOperand(2);
SDValue Src0 = Op.getOperand(3);
SDValue Mask = Op.getOperand(4);
- SDValue RoundingMode = Op.getOperand(5);
+ // There are 2 kinds of intrinsics in this group:
+ // (1) With suppress-all-exceptions (sae) - 6 operands
+ // (2) With rounding mode and sae - 7 operands.
+ if (Op.getNumOperands() == 6) {
+ SDValue Sae = Op.getOperand(5);
+ return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
+ Sae),
+ Mask, Src0, Subtarget, DAG);
+ }
+ assert(Op.getNumOperands() == 7 && "Unexpected intrinsic form");
+ SDValue RoundingMode = Op.getOperand(5);
+ SDValue Sae = Op.getOperand(6);
return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
- RoundingMode),
+ RoundingMode, Sae),
Mask, Src0, Subtarget, DAG);
}
case INTR_TYPE_2OP_MASK: {
DAG);
}
- if (VT == MVT::v16i8) {
- if (Op.getOpcode() == ISD::SHL) {
- // Make a large shift.
- SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl,
- MVT::v8i16, R, ShiftAmt,
- DAG);
- SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
- // Zero out the rightmost bits.
- SmallVector<SDValue, 16> V(16,
- DAG.getConstant(uint8_t(-1U << ShiftAmt),
- MVT::i8));
- return DAG.getNode(ISD::AND, dl, VT, SHL,
- DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
- }
- if (Op.getOpcode() == ISD::SRL) {
- // Make a large shift.
- SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl,
- MVT::v8i16, R, ShiftAmt,
- DAG);
- SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
- // Zero out the leftmost bits.
- SmallVector<SDValue, 16> V(16,
- DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
- MVT::i8));
- return DAG.getNode(ISD::AND, dl, VT, SRL,
- DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
- }
- if (Op.getOpcode() == ISD::SRA) {
- if (ShiftAmt == 7) {
- // R s>> 7 === R s< 0
- SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
- return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
- }
-
- // R s>> a === ((R u>> a) ^ m) - m
- SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
- SmallVector<SDValue, 16> V(16, DAG.getConstant(128 >> ShiftAmt,
- MVT::i8));
- SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V);
- Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
- Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
- return Res;
- }
- llvm_unreachable("Unknown shift opcode.");
- }
+ if (VT == MVT::v16i8 || (Subtarget->hasInt256() && VT == MVT::v32i8)) {
+ unsigned NumElts = VT.getVectorNumElements();
+ MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
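+ // There is no byte-element shift instruction, so v16i8/v32i8 shifts are
+ // emulated with a v8i16/v16i16 shift plus a fix-up of the bits that cross
+ // byte boundaries.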
- if (Subtarget->hasInt256() && VT == MVT::v32i8) {
if (Op.getOpcode() == ISD::SHL) {
// Make a large shift.
- SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl,
- MVT::v16i16, R, ShiftAmt,
- DAG);
+ SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT,
+ R, ShiftAmt, DAG);
SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
// Zero out the rightmost bits.
- SmallVector<SDValue, 32> V(32,
- DAG.getConstant(uint8_t(-1U << ShiftAmt),
- MVT::i8));
+ SmallVector<SDValue, 32> V(
+ NumElts, DAG.getConstant(uint8_t(-1U << ShiftAmt), MVT::i8));
return DAG.getNode(ISD::AND, dl, VT, SHL,
DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
}
if (Op.getOpcode() == ISD::SRL) {
// Make a large shift.
- SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl,
- MVT::v16i16, R, ShiftAmt,
- DAG);
+ SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT,
+ R, ShiftAmt, DAG);
SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
// Zero out the leftmost bits.
- SmallVector<SDValue, 32> V(32,
- DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
- MVT::i8));
+ SmallVector<SDValue, 32> V(
+ NumElts, DAG.getConstant(uint8_t(-1U) >> ShiftAmt, MVT::i8));
return DAG.getNode(ISD::AND, dl, VT, SRL,
DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
}
// R s>> a === ((R u>> a) ^ m) - m
SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
- SmallVector<SDValue, 32> V(32, DAG.getConstant(128 >> ShiftAmt,
- MVT::i8));
+ SmallVector<SDValue, 32> V(NumElts,
+ DAG.getConstant(128 >> ShiftAmt, MVT::i8));
SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V);
Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
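// For example, with i8 elements and ShiftAmt == 4, m is 128 >> 4 == 0x08;
// for R == 0xF0 (-16) this computes (0x0F ^ 0x08) - 0x08 == 0x07 - 0x08 == -1,
// i.e. 0xFF, which is exactly 0xF0 s>> 4.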
Amt = DAG.getNode(ISD::ANY_EXTEND, dl, NewVT, Amt);
return DAG.getNode(ISD::TRUNCATE, dl, VT,
DAG.getNode(Op.getOpcode(), dl, NewVT, R, Amt));
- }
+ }
// Decompose 256-bit shifts into smaller 128-bit shifts.
if (VT.is256BitVector()) {
SDValue Amt1, Amt2;
if (Amt.getOpcode() == ISD::BUILD_VECTOR) {
// Constant shift amount
- SmallVector<SDValue, 4> Amt1Csts;
- SmallVector<SDValue, 4> Amt2Csts;
- for (unsigned i = 0; i != NumElems/2; ++i)
- Amt1Csts.push_back(Amt->getOperand(i));
- for (unsigned i = NumElems/2; i != NumElems; ++i)
- Amt2Csts.push_back(Amt->getOperand(i));
+ SmallVector<SDValue, 8> Ops(Amt->op_begin(), Amt->op_begin() + NumElems);
+ ArrayRef<SDValue> Amt1Csts = makeArrayRef(Ops).slice(0, NumElems / 2);
+ ArrayRef<SDValue> Amt2Csts = makeArrayRef(Ops).slice(NumElems / 2);
Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt1Csts);
Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt2Csts);
return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
}
-// Sign extension of the low part of vector elements. This may be used either
-// when sign extend instructions are not available or if the vector element
-// sizes already match the sign-extended size. If the vector elements are in
-// their pre-extended size and sign extend instructions are available, that will
-// be handled by LowerSIGN_EXTEND.
-SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
- SelectionDAG &DAG) const {
- SDLoc dl(Op);
- EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
- MVT VT = Op.getSimpleValueType();
-
- if (!Subtarget->hasSSE2() || !VT.isVector())
- return SDValue();
-
- unsigned BitsDiff = VT.getScalarType().getSizeInBits() -
- ExtraVT.getScalarType().getSizeInBits();
-
- switch (VT.SimpleTy) {
- default: return SDValue();
- case MVT::v8i32:
- case MVT::v16i16:
- if (!Subtarget->hasFp256())
- return SDValue();
- if (!Subtarget->hasInt256()) {
- // needs to be split
- unsigned NumElems = VT.getVectorNumElements();
-
- // Extract the LHS vectors
- SDValue LHS = Op.getOperand(0);
- SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
- SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
-
- MVT EltVT = VT.getVectorElementType();
- EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
-
- EVT ExtraEltVT = ExtraVT.getVectorElementType();
- unsigned ExtraNumElems = ExtraVT.getVectorNumElements();
- ExtraVT = EVT::getVectorVT(*DAG.getContext(), ExtraEltVT,
- ExtraNumElems/2);
- SDValue Extra = DAG.getValueType(ExtraVT);
-
- LHS1 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, Extra);
- LHS2 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, Extra);
-
- return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2);
- }
- // fall through
- case MVT::v4i32:
- case MVT::v8i16: {
- SDValue Op0 = Op.getOperand(0);
-
- // This is a sign extension of some low part of vector elements without
- // changing the size of the vector elements themselves:
- // Shift-Left + Shift-Right-Algebraic.
- SDValue Shl = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Op0,
- BitsDiff, DAG);
- return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Shl, BitsDiff,
- DAG);
- }
- }
-}
-
/// Returns true if the operand type is exactly twice the native width, and
/// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
/// Used to know whether to use cmpxchg8/16b when expanding atomic operations
SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
default: llvm_unreachable("Should not custom lower this!");
- case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op,DAG);
case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
return LowerCMP_SWAP(Op, Subtarget, DAG);
// Note that even with AVX we prefer the PSHUFD form of shuffle for integer
// vectors because it can have a load folded into it that UNPCK cannot. This
// doesn't preclude something switching to the shorter encoding post-RA.
- if (FloatDomain) {
+ //
+ // FIXME: Should teach these routines about AVX vector widths.
+ if (FloatDomain && VT.getSizeInBits() == 128) {
if (Mask.equals(0, 0) || Mask.equals(1, 1)) {
bool Lo = Mask.equals(0, 0);
unsigned Shuffle;
// We always canonicalize the 8 x i16 and 16 x i8 shuffles into their UNPCK
// variants as none of these have single-instruction variants that are
// superior to the UNPCK formulation.
- if (!FloatDomain &&
+ if (!FloatDomain && VT.getSizeInBits() == 128 &&
(Mask.equals(0, 0, 1, 1, 2, 2, 3, 3) ||
Mask.equals(4, 4, 5, 5, 6, 6, 7, 7) ||
Mask.equals(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7) ||
// in practice PSHUFB tends to be *very* fast so we're more aggressive.
if ((Depth >= 3 || HasPSHUFB) && Subtarget->hasSSSE3()) {
SmallVector<SDValue, 16> PSHUFBMask;
- assert(Mask.size() <= 16 && "Can't shuffle elements smaller than bytes!");
- int Ratio = 16 / Mask.size();
- for (unsigned i = 0; i < 16; ++i) {
+ int NumBytes = VT.getSizeInBits() / 8;
+ int Ratio = NumBytes / Mask.size();
+ for (int i = 0; i < NumBytes; ++i) {
if (Mask[i / Ratio] == SM_SentinelUndef) {
PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
continue;
: 255;
PSHUFBMask.push_back(DAG.getConstant(M, MVT::i8));
}
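// E.g. for a v8i16 shuffle Ratio is 2, so mask element M expands to the byte
// pair {2*M, 2*M+1}; zeroed elements become 255, whose high bit makes PSHUFB
// write a zero byte.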
- Op = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Input);
+ MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
+ Op = DAG.getNode(ISD::BITCAST, DL, ByteVT, Input);
DCI.AddToWorklist(Op.getNode());
SDValue PSHUFBMaskOp =
- DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, PSHUFBMask);
+ DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVT, PSHUFBMask);
DCI.AddToWorklist(PSHUFBMaskOp.getNode());
- Op = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, Op, PSHUFBMaskOp);
+ Op = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Op, PSHUFBMaskOp);
DCI.AddToWorklist(Op.getNode());
DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
/*AddTo*/ true);
MVT VT = Op.getSimpleValueType();
if (!VT.isVector())
return false; // Bail if we hit a non-vector.
- // FIXME: This routine should be taught about 256-bit shuffles, or a 256-bit
- // version should be added.
- if (VT.getSizeInBits() != 128)
- return false;
assert(Root.getSimpleValueType().isVector() &&
"Shuffles operate on vector types!");
/// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
/// PSHUF-style masks that can be reused with such instructions.
static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
+ MVT VT = N.getSimpleValueType();
SmallVector<int, 4> Mask;
bool IsUnary;
- bool HaveMask = getTargetShuffleMask(N.getNode(), N.getSimpleValueType(), Mask, IsUnary);
+ bool HaveMask = getTargetShuffleMask(N.getNode(), VT, Mask, IsUnary);
(void)HaveMask;
assert(HaveMask);
+ // If we have more than 128-bits, only the low 128-bits of shuffle mask
+ // matter. Check that the upper masks are repeats and remove them.
+ if (VT.getSizeInBits() > 128) {
+ int LaneElts = 128 / VT.getScalarSizeInBits();
+#ifndef NDEBUG
+ for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
+ for (int j = 0; j < LaneElts; ++j)
+ assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
+ "Mask doesn't repeat in high 128-bit lanes!");
+#endif
+ Mask.resize(LaneElts);
+ }
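+ // E.g. a v8i32 PSHUFD mask <2,3,0,1, 6,7,4,5> trims down to <2,3,0,1>, the
+ // lane-sized mask that callers actually inspect.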
+
switch (N.getOpcode()) {
case X86ISD::PSHUFD:
return Mask;
case X86ISD::UNPCKH:
// For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
// shuffle into a preceding word shuffle.
- if (V.getValueType() != MVT::v16i8 && V.getValueType() != MVT::v8i16)
+ if (V.getSimpleValueType().getScalarType() != MVT::i8 &&
+ V.getSimpleValueType().getScalarType() != MVT::i16)
return SDValue();
// Search for a half-shuffle which we can combine with.
break;
case X86ISD::PSHUFLW:
case X86ISD::PSHUFHW:
- assert(VT == MVT::v8i16);
- (void)VT;
+ assert(VT.getScalarType() == MVT::i16 && "Bad word shuffle type!");
if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
return SDValue(); // We combined away this shuffle, so we're done.
int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
DMask[DOffset + 0] = DOffset + 1;
DMask[DOffset + 1] = DOffset + 0;
- V = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V);
+ MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
+ V = DAG.getNode(ISD::BITCAST, DL, DVT, V);
DCI.AddToWorklist(V.getNode());
- V = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V,
+ V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
getV4X86ShuffleImm8ForMask(DMask, DAG));
DCI.AddToWorklist(V.getNode());
- return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V);
+ return DAG.getNode(ISD::BITCAST, DL, VT, V);
}
// Look for shuffle patterns which can be implemented as a single unpack.
std::equal(std::begin(MappedMask), std::end(MappedMask),
std::begin(UnpackHiMask))) {
// We can replace all three shuffles with an unpack.
- V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, D.getOperand(0));
+ V = DAG.getNode(ISD::BITCAST, DL, VT, D.getOperand(0));
DCI.AddToWorklist(V.getNode());
return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
: X86ISD::UNPCKH,
- DL, MVT::v8i16, V, V);
+ DL, VT, V, V);
}
}
}
// We're looking for blends between FADD and FSUB nodes. We insist on these
// nodes being lined up in a specific expected pattern.
- if (!(isShuffleEquivalent(V1, V2, Mask, 0, 3) ||
- isShuffleEquivalent(V1, V2, Mask, 0, 5, 2, 7) ||
- isShuffleEquivalent(V1, V2, Mask, 0, 9, 2, 11, 4, 13, 6, 15)))
+ if (!(isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
+ isShuffleEquivalent(V1, V2, Mask, {0, 5, 2, 7}) ||
+ isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15})))
return SDValue();
// Only specific types are legal at this point, assert so we notice if and
}
}
- // Only handle 128 wide vector from here on.
- if (!VT.is128BitVector())
- return SDValue();
-
// Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
// load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
// consecutive, non-overlapping, and in the right order.
}
}
- // If we know that this node is legal then we know that it is going to be
- // matched by one of the SSE/AVX BLEND instructions. These instructions only
- // depend on the highest bit in each word. Try to use SimplifyDemandedBits
- // to simplify previous instructions.
+ // We should generate an X86ISD::BLENDI from a vselect if its argument
+ // is a sign_extend_inreg of an any_extend of a BUILD_VECTOR of
+ // constants. This specific pattern gets generated when we split a
+ // selector for a 512 bit vector in a machine without AVX512 (but with
+ // 256-bit vectors), during legalization:
+ //
+ // (vselect (sign_extend (any_extend (BUILD_VECTOR)) i1) LHS RHS)
+ //
+ // Iff we find this pattern and the build_vectors are built from
+ // constants, we translate the vselect into a shuffle_vector that we
+ // know will be matched by LowerVECTOR_SHUFFLEtoBlend.
+ if ((N->getOpcode() == ISD::VSELECT ||
+ N->getOpcode() == X86ISD::SHRUNKBLEND) &&
+ !DCI.isBeforeLegalize()) {
+ SDValue Shuffle = transformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget);
+ if (Shuffle.getNode())
+ return Shuffle;
+ }
+
+ // If this is a *dynamic* select (non-constant condition) and we can match
+ // this node with one of the variable blend instructions, restructure the
+ // condition so that the blends can use the high bit of each element and use
+ // SimplifyDemandedBits to simplify the condition operand.
if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
!DCI.isBeforeLegalize() &&
- // We explicitly check against SSE4.1, v8i16 and v16i16 because, although
- // vselect nodes may be marked as Custom, they might only be legal when
- // Cond is a build_vector of constants. This will be taken care in
- // a later condition.
- (TLI.isOperationLegalOrCustom(ISD::VSELECT, VT) &&
- Subtarget->hasSSE41() && VT != MVT::v16i16 && VT != MVT::v8i16) &&
- // Don't optimize vector of constants. Those are handled by
- // the generic code and all the bits must be properly set for
- // the generic optimizer.
!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits();
if (BitWidth == 1)
return SDValue();
+ // We can only handle the cases where VSELECT is directly legal on the
+ // subtarget. We custom lower VSELECT nodes with constant conditions and
+ // this makes it hard to see whether a dynamic VSELECT will correctly
+ // lower, so we both check the operation's status and explicitly handle the
+ // cases where a *dynamic* blend will fail even though a constant-condition
+ // blend could be custom lowered.
+ // FIXME: We should find a better way to handle this class of problems.
+ // Potentially, we should combine constant-condition vselect nodes
+ // pre-legalization into shuffles and not mark as many types as custom
+ // lowered.
+ if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
+ return SDValue();
+ // FIXME: We don't support i16-element blends currently. We could and
+ // should support them by making *all* the bits in the condition be set
+ // rather than just the high bit and using an i8-element blend.
+ if (VT.getScalarType() == MVT::i16)
+ return SDValue();
+ // Dynamic blending was only available from SSE4.1 onward.
+ if (VT.getSizeInBits() == 128 && !Subtarget->hasSSE41())
+ return SDValue();
+ // Byte blends are only available in AVX2.
+ if (VT.getSizeInBits() == 256 && VT.getScalarType() == MVT::i8 &&
+ !Subtarget->hasAVX2())
+ return SDValue();
+
assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
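// For a v4i32 condition, e.g., BitWidth is 32 and DemandedMask is
// 0x80000000; the variable blend instructions (BLENDVPS & co.) read only the
// sign bit of each condition element.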
}
}
- // We should generate an X86ISD::BLENDI from a vselect if its argument
- // is a sign_extend_inreg of an any_extend of a BUILD_VECTOR of
- // constants. This specific pattern gets generated when we split a
- // selector for a 512 bit vector in a machine without AVX512 (but with
- // 256-bit vectors), during legalization:
- //
- // (vselect (sign_extend (any_extend (BUILD_VECTOR)) i1) LHS RHS)
- //
- // Iff we find this pattern and the build_vectors are built from
- // constants, we translate the vselect into a shuffle_vector that we
- // know will be matched by LowerVECTOR_SHUFFLEtoBlend.
- if ((N->getOpcode() == ISD::VSELECT ||
- N->getOpcode() == X86ISD::SHRUNKBLEND) &&
- !DCI.isBeforeLegalize()) {
- SDValue Shuffle = transformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget);
- if (Shuffle.getNode())
- return Shuffle;
- }
-
return SDValue();
}
if (MayFoldLoad(Ld)) {
// Extract the countS bits from the immediate so we can get the proper
// address when narrowing the vector load to a specific element.
- // When the second source op is a memory address, interps doesn't use
+ // When the second source op is a memory address, insertps doesn't use
// countS and just gets an f32 from that address.
unsigned DestIndex =
cast<ConstantSDNode>(N->getOperand(2))->getZExtValue() >> 6;
+
Ld = NarrowVectorLoadToElement(cast<LoadSDNode>(Ld), DestIndex, DAG);
- } else
- return SDValue();
- // Create this as a scalar to vector to match the instruction pattern.
- SDValue LoadScalarToVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Ld);
- // countS bits are ignored when loading from memory on insertps, which
- // means we don't need to explicitly set them to 0.
- return DAG.getNode(X86ISD::INSERTPS, dl, VT, N->getOperand(0),
- LoadScalarToVector, N->getOperand(2));
+ // Create this as a scalar to vector to match the instruction pattern.
+ SDValue LoadScalarToVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Ld);
+ // countS bits are ignored when loading from memory on insertps, which
+ // means we don't need to explicitly set them to 0.
+ return DAG.getNode(X86ISD::INSERTPS, dl, VT, N->getOperand(0),
+ LoadScalarToVector, N->getOperand(2));
+ }
+ return SDValue();
+}
+
+static SDValue PerformBLENDICombine(SDNode *N, SelectionDAG &DAG) {
+ SDValue V0 = N->getOperand(0);
+ SDValue V1 = N->getOperand(1);
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+
+ // Canonicalize a v2f64 blend with a mask of 2 by swapping the vector
+ // operands and changing the mask to 1. This saves us a bunch of
+ // pattern-matching possibilities related to scalar math ops in SSE/AVX.
+ // x86InstrInfo knows how to commute this back after instruction selection
+ // if it would help register allocation.
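+ // Concretely, BLENDPD with mask 2 takes element 0 from V0 and element 1
+ // from V1; after swapping the operands, mask 1 selects exactly the same
+ // elements.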
+
+ // TODO: If optimizing for size or a processor that doesn't suffer from
+ // partial register update stalls, this should be transformed into a MOVSD
+ // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD.
+
+ if (VT == MVT::v2f64)
+ if (auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(2)))
+ if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) {
+ SDValue NewMask = DAG.getConstant(1, MVT::i8);
+ return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask);
+ }
+
+ return SDValue();
}
// Helper function of PerformSETCCCombine. It materializes "setb reg"
return PerformINSERTPSCombine(N, DAG, Subtarget);
break;
}
+ case X86ISD::BLENDI: return PerformBLENDICombine(N, DAG);
case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DAG, Subtarget);
}
return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}
-std::pair<unsigned, const TargetRegisterClass*>
-X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
+std::pair<unsigned, const TargetRegisterClass *>
+X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+ const std::string &Constraint,
MVT VT) const {
// First, see if this is a constraint that directly corresponds to an LLVM
// register class.
// Use the default implementation in TargetLowering to convert the register
// constraint into a member of a register class.
std::pair<unsigned, const TargetRegisterClass*> Res;
- Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+ Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
// Not found as a standard register?
if (!Res.second) {