#include "X86MachineFunctionInfo.h"
#include "X86TargetMachine.h"
#include "X86TargetObjectFile.h"
+#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
OpFlags);
+ } else if (Subtarget->isTarget64BitILP32() &&
+ Callee->getValueType(0) == MVT::i32) {
+ // Zero-extend the 32-bit Callee address into a 64-bit value, as required
+ // by the x32 ABI.
+ Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
}
// Returns a chain & a flag for retval copy to use.
static bool isTargetShuffle(unsigned Opcode) {
switch(Opcode) {
default: return false;
+ case X86ISD::BLENDI:
case X86ISD::PSHUFB:
case X86ISD::PSHUFD:
case X86ISD::PSHUFHW:
case X86ISD::MOVSD:
case X86ISD::UNPCKL:
case X86ISD::UNPCKH:
- case X86ISD::VPERMILP:
+ case X86ISD::VPERMILPI:
case X86ISD::VPERM2X128:
case X86ISD::VPERMI:
return true;
case X86ISD::PSHUFD:
case X86ISD::PSHUFHW:
case X86ISD::PSHUFLW:
- case X86ISD::VPERMILP:
+ case X86ISD::VPERMILPI:
case X86ISD::VPERMI:
return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8));
}
IsUnary = false;
bool IsFakeUnary = false;
switch(N->getOpcode()) {
+ case X86ISD::BLENDI:
+ ImmN = N->getOperand(N->getNumOperands()-1);
+ DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ break;
case X86ISD::SHUFP:
ImmN = N->getOperand(N->getNumOperands()-1);
DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
break;
case X86ISD::PSHUFD:
- case X86ISD::VPERMILP:
+ case X86ISD::VPERMILPI:
ImmN = N->getOperand(N->getNumOperands()-1);
DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
IsUnary = true;
SmallVector<uint64_t, 32> RawMask;
for (int i = 0, e = MaskNode->getNumOperands(); i < e; ++i) {
- auto *CN = dyn_cast<ConstantSDNode>(MaskNode->getOperand(i));
+ SDValue Op = MaskNode->getOperand(i);
+ if (Op->getOpcode() == ISD::UNDEF) {
+ RawMask.push_back((uint64_t)SM_SentinelUndef);
+ continue;
+ }
+ auto *CN = dyn_cast<ConstantSDNode>(Op.getNode());
if (!CN)
return false;
APInt MaskElement = CN->getAPIntValue();
if (!MaskCP || MaskCP->isMachineConstantPoolEntry())
return false;
- if (auto *C = dyn_cast<ConstantDataSequential>(MaskCP->getConstVal())) {
+ if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) {
// FIXME: Support AVX-512 here.
- if (!C->getType()->isVectorTy() ||
- (C->getNumElements() != 16 && C->getNumElements() != 32))
+ Type *Ty = C->getType();
+ if (!Ty->isVectorTy() || (Ty->getVectorNumElements() != 16 &&
+ Ty->getVectorNumElements() != 32))
return false;
- assert(C->getType()->isVectorTy() && "Expected a vector constant.");
DecodePSHUFBMask(C, Mask);
break;
}
/// or SDValue() otherwise.
static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
SelectionDAG &DAG) {
- if (!Subtarget->hasFp256())
+ // VBROADCAST requires AVX.
+ // TODO: Splats could be generated for non-AVX CPUs using SSE
+ // instructions, but there's less potential gain for only 128-bit vectors.
+ if (!Subtarget->hasAVX())
return SDValue();
MVT VT = Op.getSimpleValueType();
}
}
+ unsigned ScalarSize = Ld.getValueType().getSizeInBits();
bool IsGE256 = (VT.getSizeInBits() >= 256);
- // Handle the broadcasting a single constant scalar from the constant pool
- // into a vector. On Sandybridge it is still better to load a constant vector
+ // When optimizing for size, generate up to 5 extra bytes for a broadcast
+ // instruction to save 8 or more bytes of constant pool data.
+ // TODO: If multiple splats are generated to load the same constant,
+ // it may be detrimental to overall size. There needs to be a way to detect
+ // that condition to know if this is truly a size win.
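+ // For example, broadcasting one f64 constant into v4f64 keeps an 8-byte
+ // constant pool entry instead of a full 32-byte vector constant.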
+ const Function *F = DAG.getMachineFunction().getFunction();
+ bool OptForSize = F->getAttributes().
+ hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
+
+ // Handle broadcasting a single constant scalar from the constant pool
+ // into a vector.
+ // On Sandybridge (no AVX2), it is still better to load a constant vector
// from the constant pool and not to broadcast it from a scalar.
- if (ConstSplatVal && Subtarget->hasInt256()) {
+ // But override that restriction when optimizing for size.
+ // TODO: Check if splatting is recommended for other AVX-capable CPUs.
+ if (ConstSplatVal && (Subtarget->hasAVX2() || OptForSize)) {
EVT CVT = Ld.getValueType();
assert(!CVT.isVector() && "Must not broadcast a vector type");
- unsigned ScalarSize = CVT.getSizeInBits();
- if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)) {
+ // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
+ // For size optimization, also splat v2f64 and v2i64, and for size opt
+ // with AVX2, also splat i8 and i16.
+ // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
+ if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
+ (OptForSize && (ScalarSize == 64 || Subtarget->hasAVX2()))) {
const Constant *C = nullptr;
if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
C = CI->getConstantIntValue();
}
bool IsLoad = ISD::isNormalLoad(Ld.getNode());
- unsigned ScalarSize = Ld.getValueType().getSizeInBits();
// Handle AVX2 in-register broadcasts.
if (!IsLoad && Subtarget->hasInt256() &&
return true;
}
+/// \brief Test whether there are elements crossing 128-bit lanes in this
+/// shuffle mask.
+///
+/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
+/// and we routinely test for these.
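+///
+/// For example, the v8f32 mask <4, 5, 6, 7, 0, 1, 2, 3> swaps the two
+/// 128-bit lanes of V1, so every element is lane-crossing.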
+static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
+ int LaneSize = 128 / VT.getScalarSizeInBits();
+ int Size = Mask.size();
+ for (int i = 0; i < Size; ++i)
+ if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
+ return true;
+ return false;
+}
+
+/// \brief Test whether a shuffle mask is equivalent within each 128-bit lane.
+///
+/// This checks a shuffle mask to see if it is performing the same
+/// 128-bit lane-relative shuffle in each 128-bit lane. This trivially implies
+/// that it is also not lane-crossing. It may however involve a blend from the
+/// same lane of a second vector.
+///
+/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
+/// non-trivial to compute in the face of undef lanes. The representation is
+/// *not* suitable for use with existing 128-bit shuffles as it will contain
+/// entries from both V1 and V2 inputs to the wider mask.
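+///
+/// For example, the v8f32 mask <0, 8, 1, 9, 4, 12, 5, 13> performs the same
+/// blend in both lanes, so \p RepeatedMask becomes <0, 8, 1, 9>, where values
+/// of 8 and above select from the corresponding lane of V2.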
+static bool
+is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
+ SmallVectorImpl<int> &RepeatedMask) {
+ int LaneSize = 128 / VT.getScalarSizeInBits();
+ RepeatedMask.resize(LaneSize, -1);
+ int Size = Mask.size();
+ for (int i = 0; i < Size; ++i) {
+ if (Mask[i] < 0)
+ continue;
+ if ((Mask[i] % Size) / LaneSize != i / LaneSize)
+ // This entry crosses lanes, so there is no way to model this shuffle.
+ return false;
+
+ // Ok, handle the in-lane shuffles by detecting if and when they repeat.
+ if (RepeatedMask[i % LaneSize] == -1)
+ // This is the first non-undef entry in this slot of a 128-bit lane.
+ RepeatedMask[i % LaneSize] =
+ Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + Size;
+ else if (RepeatedMask[i % LaneSize] + (i / LaneSize) * LaneSize != Mask[i])
+ // Found a mismatch with the repeated mask.
+ return false;
+ }
+ return true;
+}
+
// Hide this symbol with an anonymous namespace instead of 'static' so that MSVC
// 2013 will allow us to use it as a non-type template parameter.
namespace {
return false;
for (int i = 0, e = Mask.size(); i < e; ++i) {
assert(*Args[i] >= 0 && "Arguments must be positive integers!");
- assert(*Args[i] < (int)Args.size() * 2 &&
- "Argument outside the range of possible shuffle inputs!");
if (Mask[i] != -1 && Mask[i] != *Args[i])
return false;
}
/// that the shuffle mask is in fact a blend.
static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
+ const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
unsigned BlendMask = 0;
if (Mask[i] >= 0 && Mask[i] != i)
return SDValue(); // Shuffled V1 input!
}
- if (VT == MVT::v4f32 || VT == MVT::v2f64)
+ switch (VT.SimpleTy) {
+ case MVT::v2f64:
+ case MVT::v4f32:
+ case MVT::v4f64:
+ case MVT::v8f32:
return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
DAG.getConstant(BlendMask, MVT::i8));
- assert(!VT.isFloatingPoint() && "Only v4f32 and v2f64 are supported!");
-
- // For integer shuffles we need to expand the mask and cast the inputs to
- // v8i16s prior to blending.
- assert((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v2i64) &&
- "Not a supported integer vector type!");
- int Scale = 8 / VT.getVectorNumElements();
- BlendMask = 0;
- for (int i = 0, Size = Mask.size(); i < Size; ++i)
- if (Mask[i] >= Size)
+
+ case MVT::v4i64:
+ case MVT::v8i32:
+ assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
+ // FALLTHROUGH
+ case MVT::v2i64:
+ case MVT::v4i32:
+ // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
+ // that instruction.
+ if (Subtarget->hasAVX2()) {
+ // Scale the blend by the number of 32-bit dwords per element.
+ int Scale = VT.getScalarSizeInBits() / 32;
+ BlendMask = 0;
+ for (int i = 0, Size = Mask.size(); i < Size; ++i)
+ if (Mask[i] >= Size)
+ for (int j = 0; j < Scale; ++j)
+ BlendMask |= 1u << (i * Scale + j);
+
+ MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
+ V1 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V1);
+ V2 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V2);
+ return DAG.getNode(ISD::BITCAST, DL, VT,
+ DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
+ DAG.getConstant(BlendMask, MVT::i8)));
+ }
+ // FALLTHROUGH
+ case MVT::v8i16: {
+ // For integer shuffles we need to expand the mask and cast the inputs to
+ // v8i16s prior to blending.
+ int Scale = 8 / VT.getVectorNumElements();
+ BlendMask = 0;
+ for (int i = 0, Size = Mask.size(); i < Size; ++i)
+ if (Mask[i] >= Size)
+ for (int j = 0; j < Scale; ++j)
+ BlendMask |= 1u << (i * Scale + j);
+
+ V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1);
+ V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V2);
+ return DAG.getNode(ISD::BITCAST, DL, VT,
+ DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
+ DAG.getConstant(BlendMask, MVT::i8)));
+ }
+
+ case MVT::v16i16: {
+ assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
+ SmallVector<int, 8> RepeatedMask;
+ if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
+ // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
+ assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
+ BlendMask = 0;
+ for (int i = 0; i < 8; ++i)
+ if (RepeatedMask[i] >= 16)
+ BlendMask |= 1u << i;
+ return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
+ DAG.getConstant(BlendMask, MVT::i8));
+ }
+ }
+ // FALLTHROUGH
+ case MVT::v32i8: {
+ assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
+ SDValue PBLENDVMask[32];
+ // Scale the blend by the number of bytes per element.
+ int Scale = VT.getScalarSizeInBits() / 8;
+ assert(Mask.size() * Scale == 32 && "Not a 256-bit vector!");
+ for (int i = 0, Size = Mask.size(); i < Size; ++i)
for (int j = 0; j < Scale; ++j)
- BlendMask |= 1u << (i * Scale + j);
+ PBLENDVMask[Scale * i + j] =
+ Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
+ : DAG.getConstant(Mask[i] < Size ? 0 : 0x80, MVT::i8);
+
+ V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V1);
+ V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V2);
+ return DAG.getNode(
+ ISD::BITCAST, DL, VT,
+ DAG.getNode(ISD::VSELECT, DL, MVT::v32i8,
+ DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PBLENDVMask),
+ V1, V2));
+ }
+
+ default:
+ llvm_unreachable("Not a supported integer vector type!");
+ }
+}
+
+/// \brief Generic routine to lower a shuffle and blend as a decomposed set of
+/// unblended shuffles followed by an unshuffled blend.
+///
+/// This matches the extremely common pattern for handling combined
+/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
+/// operations.
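+///
+/// For example, the v4i32 mask <5, 0, 6, 1> decomposes into the single-input
+/// shuffles V1Mask <-1, 0, -1, 1> and V2Mask <1, -1, 2, -1>, followed by the
+/// blend mask <4, 1, 6, 3>.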
+static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(SDLoc DL, MVT VT,
+ SDValue V1,
+ SDValue V2,
+ ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
+ // Shuffle the input elements into the desired positions in V1 and V2 and
+ // blend them together.
+ SmallVector<int, 32> V1Mask(Mask.size(), -1);
+ SmallVector<int, 32> V2Mask(Mask.size(), -1);
+ SmallVector<int, 32> BlendMask(Mask.size(), -1);
+ for (int i = 0, Size = Mask.size(); i < Size; ++i)
+ if (Mask[i] >= 0 && Mask[i] < Size) {
+ V1Mask[i] = Mask[i];
+ BlendMask[i] = i;
+ } else if (Mask[i] >= Size) {
+ V2Mask[i] = Mask[i] - Size;
+ BlendMask[i] = i + Size;
+ }
+
+ V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
+ V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
+ return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
+}
+
+/// \brief Try to lower a vector shuffle as a byte rotation.
+///
+/// We have a generic PALIGNR instruction in x86 that will do an arbitrary
+/// byte-rotation of the concatenation of two vectors. This routine will
+/// try to generically lower a vector shuffle through such an instruction. It
+/// does not check for the availability of PALIGNR-based lowerings, only the
+/// applicability of this strategy to the given mask. This matches shuffle
+/// vectors that look like:
+///
+/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
+///
+/// Essentially it concatenates V1 and V2, shifts right by some number of
+/// elements, and takes the low elements as the result. Note that while this is
+/// specified as a *right shift* because x86 is little-endian, it is a *left
+/// rotate* of the vector lanes.
+///
+/// Note that this only handles 128-bit vector widths currently.
+static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1,
+ SDValue V2,
+ ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
+ assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
+
+ // We need to detect various ways of spelling a rotation:
+ // [11, 12, 13, 14, 15, 0, 1, 2]
+ // [-1, 12, 13, 14, -1, -1, 1, -1]
+ // [-1, -1, -1, -1, -1, -1, 1, 2]
+ // [ 3, 4, 5, 6, 7, 8, 9, 10]
+ // [-1, 4, 5, 6, -1, -1, 9, -1]
+ // [-1, 4, 5, 6, -1, -1, -1, -1]
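+ //
+ // For example, with the first mask above every defined element gives
+ // StartIdx = i - (Mask[i] % Size) of either -3 or 5, so the rotation is 3,
+ // with V2 as the Hi input and V1 as the Lo input.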
+ int Rotation = 0;
+ SDValue Lo, Hi;
+ for (int i = 0, Size = Mask.size(); i < Size; ++i) {
+ if (Mask[i] == -1)
+ continue;
+ assert(Mask[i] >= 0 && "Only -1 is a valid negative mask element!");
+
+ // Based on the mod-Size value of this mask element determine where
+ // a rotated vector would have started.
+ int StartIdx = i - (Mask[i] % Size);
+ if (StartIdx == 0)
+ // The identity rotation isn't interesting, stop.
+ return SDValue();
+
+ // If we found the tail of a vector (StartIdx < 0), the rotation is the
+ // missing front. If we found the head of a vector, the rotation is the
+ // count of elements before that head, i.e. Size - StartIdx.
+ int CandidateRotation = StartIdx < 0 ? -StartIdx : Size - StartIdx;
+
+ if (Rotation == 0)
+ Rotation = CandidateRotation;
+ else if (Rotation != CandidateRotation)
+ // The rotations don't match, so we can't match this mask.
+ return SDValue();
+
+ // Compute which value this mask is pointing at.
+ SDValue MaskV = Mask[i] < Size ? V1 : V2;
+
+ // Compute which of the two target values this index should be assigned to.
+ // This reflects whether the high elements are remaining or the low elements
+ // are remaining.
+ SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
+
+ // Either set up this value if we've not encountered it before, or check
+ // that it remains consistent.
+ if (!TargetV)
+ TargetV = MaskV;
+ else if (TargetV != MaskV)
+ // This may be a rotation, but it pulls from the inputs in some
+ // unsupported interleaving.
+ return SDValue();
+ }
+
+ // Check that we successfully analyzed the mask, and normalize the results.
+ assert(Rotation != 0 && "Failed to locate a viable rotation!");
+ assert((Lo || Hi) && "Failed to find a rotated input vector!");
+ if (!Lo)
+ Lo = Hi;
+ else if (!Hi)
+ Hi = Lo;
+
+ // Cast the inputs to v16i8 to match PALIGNR.
+ Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Lo);
+ Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Hi);
+
+ assert(VT.getSizeInBits() == 128 &&
+ "Rotate-based lowering only supports 128-bit lowering!");
+ assert(Mask.size() <= 16 &&
+ "Can shuffle at most 16 bytes in a 128-bit vector!");
+ // The actual rotate instruction rotates bytes, so we need to scale the
+ // rotation based on how many bytes are in the vector.
+ int Scale = 16 / Mask.size();
- V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1);
- V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V2);
return DAG.getNode(ISD::BITCAST, DL, VT,
- DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
- DAG.getConstant(BlendMask, MVT::i8)));
+ DAG.getNode(X86ISD::PALIGNR, DL, MVT::v16i8, Hi, Lo,
+ DAG.getConstant(Rotation * Scale, MVT::i8)));
+}
+
+/// \brief Compute whether each element of a shuffle is zeroable.
+///
+/// A "zeroable" vector shuffle element is one which can be lowered to zero.
+/// Either it is an undef element in the shuffle mask, the element of the input
+/// referenced is undef, or the element of the input referenced is known to be
+/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
+/// as many lanes with this technique as possible to simplify the remaining
+/// shuffle.
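+///
+/// For example, with the v4i32 mask <0, 4, -1, 5> and an all-zeros V2, the
+/// resulting bit vector marks elements 1, 2, and 3 as zeroable.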
+static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
+ SDValue V1, SDValue V2) {
+ SmallBitVector Zeroable(Mask.size(), false);
+
+ bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
+ bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
+
+ for (int i = 0, Size = Mask.size(); i < Size; ++i) {
+ int M = Mask[i];
+ // Handle the easy cases.
+ if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
+ Zeroable[i] = true;
+ continue;
+ }
+
+ // If this is an index into a build_vector node, dig out the input value and
+ // use it.
+ SDValue V = M < Size ? V1 : V2;
+ if (V.getOpcode() != ISD::BUILD_VECTOR)
+ continue;
+
+ SDValue Input = V.getOperand(M % Size);
+ // The UNDEF opcode check really should be dead code here, but not quite
+ // worth asserting on (it isn't invalid, just unexpected).
+ if (Input.getOpcode() == ISD::UNDEF || X86::isZeroNode(Input))
+ Zeroable[i] = true;
+ }
+
+ return Zeroable;
+}
+
+/// \brief Lower a vector shuffle as a zero or any extension.
+///
+/// Given a specific number of elements, element bit width, and extension
+/// stride, produce either a zero or any extension based on the available
+/// features of the subtarget.
+static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
+ SDLoc DL, MVT VT, int NumElements, int Scale, bool AnyExt, SDValue InputV,
+ const X86Subtarget *Subtarget, SelectionDAG &DAG) {
+ assert(Scale > 1 && "Need a scale to extend.");
+ int EltBits = VT.getSizeInBits() / NumElements;
+ assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
+ "Only 8, 16, and 32 bit elements can be extended.");
+ assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
+
+ // Found a valid zext mask! Try various lowering strategies based on the
+ // input type and available ISA extensions.
+ if (Subtarget->hasSSE41()) {
+ MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
+ MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
+ NumElements / Scale);
+ InputV = DAG.getNode(ISD::BITCAST, DL, InputVT, InputV);
+ return DAG.getNode(ISD::BITCAST, DL, VT,
+ DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV));
+ }
+
+ // For any extends we can cheat for larger element sizes and use shuffle
+ // instructions that can fold with a load and/or copy.
+ if (AnyExt && EltBits == 32) {
+ int PSHUFDMask[4] = {0, -1, 1, -1};
+ return DAG.getNode(
+ ISD::BITCAST, DL, VT,
+ DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
+ DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, InputV),
+ getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
+ }
+ if (AnyExt && EltBits == 16 && Scale > 2) {
+ int PSHUFDMask[4] = {0, -1, 0, -1};
+ InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
+ DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, InputV),
+ getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG));
+ int PSHUFHWMask[4] = {1, -1, -1, -1};
+ return DAG.getNode(
+ ISD::BITCAST, DL, VT,
+ DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16,
+ DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, InputV),
+ getV4X86ShuffleImm8ForMask(PSHUFHWMask, DAG)));
+ }
+
+ // If this would require more than 2 unpack instructions to expand, use
+ // pshufb when available. We can only use more than 2 unpack instructions
+ // when zero extending i8 elements which also makes it easier to use pshufb.
+ if (Scale > 4 && EltBits == 8 && Subtarget->hasSSSE3()) {
+ assert(NumElements == 16 && "Unexpected byte vector width!");
+ SDValue PSHUFBMask[16];
+ for (int i = 0; i < 16; ++i)
+ PSHUFBMask[i] =
+ DAG.getConstant((i % Scale == 0) ? i / Scale : 0x80, MVT::i8);
+ InputV = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, InputV);
+ return DAG.getNode(ISD::BITCAST, DL, VT,
+ DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
+ DAG.getNode(ISD::BUILD_VECTOR, DL,
+ MVT::v16i8, PSHUFBMask)));
+ }
+
+ // Otherwise emit a sequence of unpacks.
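+ // For example, zero-extending i8 elements with Scale == 4 takes two
+ // unpacks, conceptually widening v16i8 -> v8i16 -> v4i32 while interleaving
+ // zeros (or undef for an any-extend) at each step.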
+ do {
+ MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
+ SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
+ : getZeroVector(InputVT, Subtarget, DAG, DL);
+ InputV = DAG.getNode(ISD::BITCAST, DL, InputVT, InputV);
+ InputV = DAG.getNode(X86ISD::UNPCKL, DL, InputVT, InputV, Ext);
+ Scale /= 2;
+ EltBits *= 2;
+ NumElements /= 2;
+ } while (Scale > 1);
+ return DAG.getNode(ISD::BITCAST, DL, VT, InputV);
+}
+
+/// \brief Try to lower a vector shuffle as a zero extension on any microarch.
+///
+/// This routine will try to do everything in its power to cleverly lower
+/// a shuffle which happens to match the pattern of a zero extend. It doesn't
+/// check for the profitability of this lowering; it tries to aggressively
+/// match this pattern. It will use all of the micro-architectural details it
+/// can to emit an efficient lowering. It handles both blends with all-zero
+/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
+/// masking out later).
+///
+/// The reason we have dedicated lowering for zext-style shuffles is that they
+/// are both incredibly common and often quite performance sensitive.
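+///
+/// For example, the v8i16 mask <0, 8, 1, 8, 2, 8, 3, 8> with an all-zeros V2
+/// matches with Scale == 2 and, given SSE4.1, lowers to a single
+/// PMOVZXWD-style VZEXT node.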
+static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
+ SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+ const X86Subtarget *Subtarget, SelectionDAG &DAG) {
+ SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+
+ int Bits = VT.getSizeInBits();
+ int NumElements = Mask.size();
+
+ // Define a helper function to check a particular ext-scale and lower to it if
+ // valid.
+ auto Lower = [&](int Scale) -> SDValue {
+ SDValue InputV;
+ bool AnyExt = true;
+ for (int i = 0; i < NumElements; ++i) {
+ if (Mask[i] == -1)
+ continue; // Valid anywhere but doesn't tell us anything.
+ if (i % Scale != 0) {
+ // Each of the extend elements needs to be zeroable.
+ if (!Zeroable[i])
+ return SDValue();
+
+ // We are no longer in the anyext case.
+ AnyExt = false;
+ continue;
+ }
+
+ // Each of the base elements needs to be consecutive indices into the
+ // same input vector.
+ SDValue V = Mask[i] < NumElements ? V1 : V2;
+ if (!InputV)
+ InputV = V;
+ else if (InputV != V)
+ return SDValue(); // Flip-flopping inputs.
+
+ if (Mask[i] % NumElements != i / Scale)
+ return SDValue(); // Non-consecutive strided elements.
+ }
+
+ // If we fail to find an input, we have a zero-shuffle which should always
+ // have already been handled.
+ // FIXME: Maybe handle this here in case during blending we end up with one?
+ if (!InputV)
+ return SDValue();
+
+ return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
+ DL, VT, NumElements, Scale, AnyExt, InputV, Subtarget, DAG);
+ };
+
+ // The widest scale possible for extending is to a 64-bit integer.
+ assert(Bits % 64 == 0 &&
+ "The number of bits in a vector must be divisible by 64 on x86!");
+ int NumExtElements = Bits / 64;
+
+ // Each iteration, try extending the elements half as much, but into twice as
+ // many elements.
+ for (; NumExtElements < NumElements; NumExtElements *= 2) {
+ assert(NumElements % NumExtElements == 0 &&
+ "The input vector size must be divisble by the extended size.");
+ if (SDValue V = Lower(NumElements / NumExtElements))
+ return V;
+ }
+
+ // No viable ext lowering found.
+ return SDValue();
+}
+
+/// \brief Try to lower insertion of a single element into a zero vector.
+///
+/// This is a common pattern that we have especially efficient patterns to lower
+/// across all subtarget feature sets.
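+///
+/// For example, the v4i32 mask <4, 1, 2, 3> with an all-zeros V1 and a V2
+/// built from a scalar lowers to a single VZEXT_MOVL that moves the scalar
+/// into lane 0 and zeros the remaining lanes.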
+static SDValue lowerVectorShuffleAsElementInsertion(
+ MVT VT, SDLoc DL, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+ const X86Subtarget *Subtarget, SelectionDAG &DAG) {
+ SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+
+ int V2Index = std::find_if(Mask.begin(), Mask.end(),
+ [&Mask](int M) { return M >= (int)Mask.size(); }) -
+ Mask.begin();
+ if (Mask.size() == 2) {
+ if (!Zeroable[V2Index ^ 1]) {
+ // For 2-wide masks we may be able to just invert the inputs. We use an xor
+ // with 2 to flip from {2,3} to {0,1} and vice versa.
+ int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
+ Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
+ if (Zeroable[V2Index])
+ return lowerVectorShuffleAsElementInsertion(VT, DL, V2, V1, InverseMask,
+ Subtarget, DAG);
+ else
+ return SDValue();
+ }
+ } else {
+ for (int i = 0, Size = Mask.size(); i < Size; ++i)
+ if (i != V2Index && !Zeroable[i])
+ return SDValue(); // Not inserting into a zero vector.
+ }
+
+ // Step over any bitcasts on either input so we can scan the actual
+ // BUILD_VECTOR nodes.
+ while (V1.getOpcode() == ISD::BITCAST)
+ V1 = V1.getOperand(0);
+ while (V2.getOpcode() == ISD::BITCAST)
+ V2 = V2.getOperand(0);
+
+ // Check for a single input from a SCALAR_TO_VECTOR node.
+ // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
+ // all the smarts here sunk into that routine. However, the current
+ // lowering of BUILD_VECTOR makes that nearly impossible until the old
+ // vector shuffle lowering is dead.
+ if (!((V2.getOpcode() == ISD::SCALAR_TO_VECTOR &&
+ Mask[V2Index] == (int)Mask.size()) ||
+ V2.getOpcode() == ISD::BUILD_VECTOR))
+ return SDValue();
+
+ SDValue V2S = V2.getOperand(Mask[V2Index] - Mask.size());
+
+ // First, we need to zext the scalar if it is smaller than an i32.
+ MVT ExtVT = VT;
+ MVT EltVT = VT.getVectorElementType();
+ V2S = DAG.getNode(ISD::BITCAST, DL, EltVT, V2S);
+ if (EltVT == MVT::i8 || EltVT == MVT::i16) {
+ // Zero-extend directly to i32.
+ ExtVT = MVT::v4i32;
+ V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
+ }
+
+ V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT,
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S));
+ if (ExtVT != VT)
+ V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2);
+
+ if (V2Index != 0) {
+ // If we have 4 or fewer lanes we can cheaply shuffle the element into
+ // the desired position. Otherwise it is more efficient to do a vector
+ // shift left. We know that we can do a vector shift left because all
+ // the inputs are zero.
+ if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
+ SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
+ V2Shuffle[V2Index] = 0;
+ V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
+ } else {
+ V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, V2);
+ V2 = DAG.getNode(
+ X86ISD::VSHLDQ, DL, MVT::v2i64, V2,
+ DAG.getConstant(
+ V2Index * EltVT.getSizeInBits(),
+ DAG.getTargetLoweringInfo().getScalarShiftAmountTy(MVT::v2i64)));
+ V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2);
+ }
+ }
+ return V2;
}
/// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
// Straight shuffle of a single input vector. Simulate this by using the
// single input as both of the "inputs" to this instruction.
unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
+
+ if (Subtarget->hasAVX()) {
+ // If we have AVX, we can use VPERMILPD, which will allow folding a load
+ // into the shuffle.
+ return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
+ DAG.getConstant(SHUFPDMask, MVT::i8));
+ }
+
return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V1,
DAG.getConstant(SHUFPDMask, MVT::i8));
}
if (isShuffleEquivalent(Mask, 1, 3))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2f64, V1, V2);
+ // If we have a single input, insert that into V1 if we can do so cheaply.
+ if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1)
+ if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
+ MVT::v2f64, DL, V1, V2, Mask, Subtarget, DAG))
+ return Insertion;
+
if (Subtarget->hasSSE41())
- if (SDValue Blend =
- lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask, DAG))
+ if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
+ Subtarget, DAG))
return Blend;
unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
if (isShuffleEquivalent(Mask, 1, 3))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2i64, V1, V2);
+ // If we have a single input from V2 insert that into V1 if we can do so
+ // cheaply.
+ if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1)
+ if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
+ MVT::v2i64, DL, V1, V2, Mask, Subtarget, DAG))
+ return Insertion;
+
if (Subtarget->hasSSE41())
- if (SDValue Blend =
- lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask, DAG))
+ if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
+ Subtarget, DAG))
return Blend;
+ // Try to use rotation instructions if available.
+ if (Subtarget->hasSSSE3())
+ if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
+ DL, MVT::v2i64, V1, V2, Mask, DAG))
+ return Rotate;
+
// We implement this with SHUFPD which is pretty lame because it will likely
// incur 2 cycles of stall for integer vectors on Nehalem and older chips.
// However, all the alternatives are still more cycles and newer chips don't
DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
}
-/// \brief Lower 4-lane 32-bit floating point shuffles.
+/// \brief Lower a vector shuffle using the SHUFPS instruction.
///
-/// Uses instructions exclusively from the floating point unit to minimize
-/// domain crossing penalties, as these are sufficient to implement all v4f32
-/// shuffles.
-static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
- const X86Subtarget *Subtarget,
- SelectionDAG &DAG) {
- SDLoc DL(Op);
- assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!");
- assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
- assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
- ArrayRef<int> Mask = SVOp->getMask();
- assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
-
+/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
+/// It makes no assumptions about whether this is the *best* lowering, it simply
+/// uses it.
+static SDValue lowerVectorShuffleWithSHUFPS(SDLoc DL, MVT VT,
+ ArrayRef<int> Mask, SDValue V1,
+ SDValue V2, SelectionDAG &DAG) {
SDValue LowV = V1, HighV = V2;
int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
int NumV2Elements =
std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
- if (NumV2Elements == 0)
- // Straight shuffle of a single input vector. We pass the input vector to
- // both operands to simulate this with a SHUFPS.
- return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
- getV4X86ShuffleImm8ForMask(Mask, DAG));
-
- // Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(Mask, 0, 4, 1, 5))
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V1, V2);
- if (isShuffleEquivalent(Mask, 2, 6, 3, 7))
- return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V1, V2);
-
- if (Subtarget->hasSSE41())
- if (SDValue Blend =
- lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask, DAG))
- return Blend;
-
if (NumV2Elements == 1) {
int V2Index =
std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) -
Mask.begin();
- // Check for whether we can use INSERTPS to perform the blend. We only use
- // INSERTPS when the V1 elements are already in the correct locations
- // because otherwise we can just always use two SHUFPS instructions which
- // are much smaller to encode than a SHUFPS and an INSERTPS.
- if (Subtarget->hasSSE41()) {
- // When using INSERTPS we can zero any lane of the destination. Collect
- // the zero inputs into a mask and drop them from the lanes of V1 which
- // actually need to be present as inputs to the INSERTPS.
- unsigned ZMask = 0;
- if (ISD::isBuildVectorAllZeros(V1.getNode())) {
- ZMask = 0xF ^ (1 << V2Index);
- } else if (V1.getOpcode() == ISD::BUILD_VECTOR) {
- for (int i = 0; i < 4; ++i) {
- int M = Mask[i];
- if (M >= 4)
- continue;
- if (M > -1) {
- SDValue Input = V1.getOperand(M);
- if (Input.getOpcode() != ISD::UNDEF &&
- !X86::isZeroNode(Input)) {
- // A non-zero input!
- ZMask = 0;
- break;
- }
- }
- ZMask |= 1 << i;
- }
- }
-
- // Synthesize a shuffle mask for the non-zero and non-v2 inputs.
- int InsertShuffleMask[4] = {-1, -1, -1, -1};
- for (int i = 0; i < 4; ++i)
- if (i != V2Index && (ZMask & (1 << i)) == 0)
- InsertShuffleMask[i] = Mask[i];
-
- if (isNoopShuffleMask(InsertShuffleMask)) {
- // Replace V1 with undef if nothing from V1 survives the INSERTPS.
- if ((ZMask | 1 << V2Index) == 0xF)
- V1 = DAG.getUNDEF(MVT::v4f32);
-
- unsigned InsertPSMask = (Mask[V2Index] - 4) << 6 | V2Index << 4 | ZMask;
- assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
-
- // Insert the V2 element into the desired position.
- return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
- DAG.getConstant(InsertPSMask, MVT::i8));
- }
- }
-
// Compute the index adjacent to V2Index and in the same half by toggling
// the low bit.
int V2AdjIndex = V2Index ^ 1;
// To make this work, blend them together as the first step.
int V1Index = V2AdjIndex;
int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
- V2 = DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V2, V1,
+ V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
getV4X86ShuffleImm8ForMask(BlendMask, DAG));
// Now proceed to reconstruct the final blend as we have the necessary
} else if (NumV2Elements == 2) {
if (Mask[0] < 4 && Mask[1] < 4) {
// Handle the easy case where we have V1 in the low lanes and V2 in the
- // high lanes. We never see this reversed because we sort the shuffle.
+ // high lanes.
NewMask[2] -= 4;
NewMask[3] -= 4;
+ } else if (Mask[2] < 4 && Mask[3] < 4) {
+ // We also handle the reversed case because this utility may get called
+ // when we detect a SHUFPS pattern but can't easily commute the shuffle to
+ // arrange things in the right direction.
+ NewMask[0] -= 4;
+ NewMask[1] -= 4;
+ HighV = V1;
+ LowV = V2;
} else {
// We have a mixture of V1 and V2 in both low and high lanes. Rather than
// trying to place elements directly, just blend them and set up the final
Mask[2] < 4 ? Mask[2] : Mask[3],
(Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
(Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
- V1 = DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V2,
+ V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
getV4X86ShuffleImm8ForMask(BlendMask, DAG));
// Now we do a normal shuffle of V1 by giving V1 as both operands to
NewMask[3] = Mask[2] < 4 ? 3 : 1;
}
}
- return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, LowV, HighV,
+ return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
getV4X86ShuffleImm8ForMask(NewMask, DAG));
}
-static SDValue lowerIntegerElementInsertionVectorShuffle(
- MVT VT, SDLoc DL, SDValue V1, SDValue V2, ArrayRef<int> Mask,
- const X86Subtarget *Subtarget, SelectionDAG &DAG) {
- int V2Index = std::find_if(Mask.begin(), Mask.end(),
- [&Mask](int M) { return M >= (int)Mask.size(); }) -
- Mask.begin();
+/// \brief Lower 4-lane 32-bit floating point shuffles.
+///
+/// Uses instructions exclusively from the floating point unit to minimize
+/// domain crossing penalties, as these are sufficient to implement all v4f32
+/// shuffles.
+static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!");
+ assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+ ArrayRef<int> Mask = SVOp->getMask();
+ assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
- // Check for a single input from a SCALAR_TO_VECTOR node.
- // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
- // all the smarts here sunk into that routine. However, the current
- // lowering of BUILD_VECTOR makes that nearly impossible until the old
- // vector shuffle lowering is dead.
- if ((Mask[V2Index] == (int)Mask.size() &&
- V2.getOpcode() == ISD::SCALAR_TO_VECTOR) ||
- V2.getOpcode() == ISD::BUILD_VECTOR) {
- SDValue V2S = V2.getOperand(Mask[V2Index] - Mask.size());
-
- bool V1IsAllZero = false;
- if (ISD::isBuildVectorAllZeros(V1.getNode())) {
- V1IsAllZero = true;
- } else if (V1.getOpcode() == ISD::BUILD_VECTOR) {
- V1IsAllZero = true;
- for (int M : Mask) {
- if (M < 0 || M >= (int)Mask.size())
- continue;
- SDValue Input = V1.getOperand(M);
- if (Input.getOpcode() != ISD::UNDEF && !X86::isZeroNode(Input)) {
- // A non-zero input!
- V1IsAllZero = false;
- break;
- }
- }
+ int NumV2Elements =
+ std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
+
+ if (NumV2Elements == 0) {
+ if (Subtarget->hasAVX()) {
+ // If we have AVX, we can use VPERMILPS which will allow folding a load
+ // into the shuffle.
+ return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
+ getV4X86ShuffleImm8ForMask(Mask, DAG));
}
- if (V1IsAllZero) {
- // First, we need to zext the scalar if it is smaller than an i32.
- MVT EltVT = VT.getVectorElementType();
- assert(EltVT == V2S.getSimpleValueType() &&
- "Different scalar and element types!");
- MVT ExtVT = VT;
- if (EltVT == MVT::i8 || EltVT == MVT::i16) {
- // Zero-extend directly to i32.
- ExtVT = MVT::v4i32;
- V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
- }
- V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT,
- DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S));
- if (ExtVT != VT)
- V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2);
-
- if (V2Index != 0) {
- // If we have 4 or fewer lanes we can cheaply shuffle the element into
- // the desired position. Otherwise it is more efficient to do a vector
- // shift left. We know that we can do a vector shift left because all
- // the inputs are zero.
- if (VT.getVectorNumElements() <= 4) {
- SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
- V2Shuffle[V2Index] = 0;
- V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
- } else {
- V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, V2);
- V2 = DAG.getNode(
- X86ISD::VSHLDQ, DL, MVT::v2i64, V2,
- DAG.getConstant(
- V2Index * EltVT.getSizeInBits(),
- DAG.getTargetLoweringInfo().getScalarShiftAmountTy(MVT::v2i64)));
- V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2);
+ // Otherwise, use a straight shuffle of a single input vector. We pass the
+ // input vector to both operands to simulate this with a SHUFPS.
+ return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
+ getV4X86ShuffleImm8ForMask(Mask, DAG));
+ }
+
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (isShuffleEquivalent(Mask, 0, 4, 1, 5))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V1, V2);
+ if (isShuffleEquivalent(Mask, 2, 6, 3, 7))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V1, V2);
+
+ // There are special ways we can lower some single-element blends. However, we
+ // have custom ways we can lower more complex single-element blends below that
+ // we defer to if both this and BLENDPS fail to match, so restrict this to
+ // when the V2 input is targeting element 0 of the mask -- that is the fast
+ // case here.
+ if (NumV2Elements == 1 && Mask[0] >= 4)
+ if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4f32, DL, V1, V2,
+ Mask, Subtarget, DAG))
+ return V;
+
+ if (Subtarget->hasSSE41())
+ if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
+ Subtarget, DAG))
+ return Blend;
+
+ // Check for whether we can use INSERTPS to perform the blend. We only use
+ // INSERTPS when the V1 elements are already in the correct locations
+ // because otherwise we can just always use two SHUFPS instructions which
+ // are much smaller to encode than a SHUFPS and an INSERTPS.
+ if (NumV2Elements == 1 && Subtarget->hasSSE41()) {
+ int V2Index =
+ std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) -
+ Mask.begin();
+
+ // When using INSERTPS we can zero any lane of the destination. Collect
+ // the zero inputs into a mask and drop them from the lanes of V1 which
+ // actually need to be present as inputs to the INSERTPS.
+ SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+
+ // Walk the non-V2 lanes: accumulate zeroable lanes into ZMask and detect
+ // whether any remaining V1 lane is out of place.
+ bool InsertNeedsShuffle = false;
+ unsigned ZMask = 0;
+ for (int i = 0; i < 4; ++i)
+ if (i != V2Index) {
+ if (Zeroable[i]) {
+ ZMask |= 1 << i;
+ } else if (Mask[i] != i) {
+ InsertNeedsShuffle = true;
+ break;
}
}
- return V2;
+
+ // We don't want to use INSERTPS or other insertion techniques if it will
+ // require shuffling anyway.
+ if (!InsertNeedsShuffle) {
+ // If all of V1 is zeroable, replace it with undef.
+ if ((ZMask | 1 << V2Index) == 0xF)
+ V1 = DAG.getUNDEF(MVT::v4f32);
+
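+ // The INSERTPS immediate encodes the V2 source element in bits [7:6], the
+ // destination lane in bits [5:4], and the lanes to zero in bits [3:0].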
+ unsigned InsertPSMask = (Mask[V2Index] - 4) << 6 | V2Index << 4 | ZMask;
+ assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
+
+ // Insert the V2 element into the desired position.
+ return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
+ DAG.getConstant(InsertPSMask, MVT::i8));
}
}
- return SDValue();
+
+ // Otherwise fall back to a SHUFPS lowering strategy.
+ return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
}
/// \brief Lower 4-lane i32 vector shuffles.
getV4X86ShuffleImm8ForMask(Mask, DAG));
}
+ // Whenever we can lower this as a zext, that instruction is strictly faster
+ // than any alternative.
+ if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2,
+ Mask, Subtarget, DAG))
+ return ZExt;
+
// Use dedicated unpack instructions for masks that match their pattern.
if (isShuffleEquivalent(Mask, 0, 4, 1, 5))
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V1, V2);
// There are special ways we can lower some single-element blends.
if (NumV2Elements == 1)
- if (SDValue V = lowerIntegerElementInsertionVectorShuffle(
- MVT::v4i32, DL, V1, V2, Mask, Subtarget, DAG))
+ if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4i32, DL, V1, V2,
+ Mask, Subtarget, DAG))
return V;
if (Subtarget->hasSSE41())
- if (SDValue Blend =
- lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask, DAG))
+ if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
+ Subtarget, DAG))
return Blend;
+ // Try to use rotation instructions if available.
+ if (Subtarget->hasSSSE3())
+ if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
+ DL, MVT::v4i32, V1, V2, Mask, DAG))
+ return Rotate;
+
// We implement this with SHUFPS because it can blend from two vectors.
// Because we're going to eventually use SHUFPS, we use SHUFPS even to build
// up the inputs, bypassing domain shift penalties that we would incur if we
if (isShuffleEquivalent(Mask, 4, 4, 5, 5, 6, 6, 7, 7))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V, V);
+ // Try to use rotation instructions if available.
+ if (Subtarget->hasSSSE3())
+ if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
+ DL, MVT::v8i16, V, V, Mask, DAG))
+ return Rotate;
+
// Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
// such inputs we can swap two of the dwords across the half mark and end up
// with <=2 inputs to each half in each half. Once there, we can fall through
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
+ // Whenever we can lower this as a zext, that instruction is strictly faster
+ // than any alternative.
+ if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
+ DL, MVT::v8i16, V1, V2, OrigMask, Subtarget, DAG))
+ return ZExt;
+
auto isV1 = [](int M) { return M >= 0 && M < 8; };
auto isV2 = [](int M) { return M >= 8; };
// There are special ways we can lower some single-element blends.
if (NumV2Inputs == 1)
- if (SDValue V = lowerIntegerElementInsertionVectorShuffle(
- MVT::v8i16, DL, V1, V2, Mask, Subtarget, DAG))
+ if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v8i16, DL, V1, V2,
+ Mask, Subtarget, DAG))
return V;
if (Subtarget->hasSSE41())
- if (SDValue Blend =
- lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
+ if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
+ Subtarget, DAG))
return Blend;
+ // Try to use rotation instructions if available.
+ if (Subtarget->hasSSSE3())
+ if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V2,
+ Mask, DAG))
+ return Rotate;
+
if (NumV1Inputs + NumV2Inputs <= 4)
return lowerV8I16BasicBlendVectorShuffle(DL, V1, V2, Mask, Subtarget, DAG);
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
ArrayRef<int> OrigMask = SVOp->getMask();
assert(OrigMask.size() == 16 && "Unexpected mask size for v16 shuffle!");
+
+ // Try to use rotation instructions if available.
+ if (Subtarget->hasSSSE3())
+ if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v16i8, V1, V2,
+ OrigMask, DAG))
+ return Rotate;
+
+ // Try to use a zext lowering.
+ if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
+ DL, MVT::v16i8, V1, V2, OrigMask, Subtarget, DAG))
+ return ZExt;
+
int MaskStorage[16] = {
OrigMask[0], OrigMask[1], OrigMask[2], OrigMask[3],
OrigMask[4], OrigMask[5], OrigMask[6], OrigMask[7],
// FIXME: We should check for other patterns which can be widened into an
// i16 shuffle as well.
auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
- for (int i = 0; i < 16; i += 2) {
- if (Mask[i] != Mask[i + 1])
+ for (int i = 0; i < 16; i += 2)
+ if (Mask[i] != -1 && Mask[i + 1] != -1 && Mask[i] != Mask[i + 1])
return false;
- }
+
return true;
};
auto tryToWidenViaDuplication = [&]() -> SDValue {
MVT::v16i8, V1, V1);
int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
- for (int i = 0; i < 16; i += 2) {
- if (Mask[i] != -1)
- PostDupI16Shuffle[i / 2] = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
- assert(PostDupI16Shuffle[i / 2] < 8 && "Invalid v8 shuffle mask!");
- }
+ for (int i = 0; i < 16; ++i)
+ if (Mask[i] != -1) {
+ int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
+ assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
+ if (PostDupI16Shuffle[i / 2] == -1)
+ PostDupI16Shuffle[i / 2] = MappedMask;
+ else
+ assert(PostDupI16Shuffle[i / 2] == MappedMask &&
+ "Conflicting entrties in the original shuffle!");
+ }
return DAG.getNode(
ISD::BITCAST, DL, MVT::v16i8,
DAG.getVectorShuffle(MVT::v8i16, DL,
SDValue V2Mask[16];
for (int i = 0; i < 16; ++i)
if (Mask[i] == -1) {
- V1Mask[i] = V2Mask[i] = DAG.getConstant(0x80, MVT::i8);
+ V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
} else {
V1Mask[i] = DAG.getConstant(Mask[i] < 16 ? Mask[i] : 0x80, MVT::i8);
V2Mask[i] =
// There are special ways we can lower some single-element blends.
if (NumV2Elements == 1)
- if (SDValue V = lowerIntegerElementInsertionVectorShuffle(
- MVT::v16i8, DL, V1, V2, Mask, Subtarget, DAG))
+ if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v16i8, DL, V1, V2,
+ Mask, Subtarget, DAG))
return V;
// Check whether a compaction lowering can be done. This handles shuffles
}
}
-static bool isHalfCrossingShuffleMask(ArrayRef<int> Mask) {
- int Size = Mask.size();
- for (int M : Mask.slice(0, Size / 2))
- if (M >= 0 && (M % Size) >= Size / 2)
- return true;
- for (int M : Mask.slice(Size / 2, Size / 2))
- if (M >= 0 && (M % Size) < Size / 2)
- return true;
- return false;
-}
-
/// \brief Generic routine to split a 256-bit vector shuffle into 128-bit
/// shuffles.
///
/// shuffles. This can be done generically for any 256-bit vector shuffle and so
/// we encode the logic here for specific shuffle lowering routines to bail to
/// when they exhaust the features available to more directly handle the shuffle.
-static SDValue splitAndLower256BitVectorShuffle(SDValue Op, SDValue V1,
- SDValue V2,
- const X86Subtarget *Subtarget,
+static SDValue splitAndLower256BitVectorShuffle(SDLoc DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
SelectionDAG &DAG) {
- SDLoc DL(Op);
- MVT VT = Op.getSimpleValueType();
assert(VT.getSizeInBits() == 256 && "Only for 256-bit vector shuffles!");
assert(V1.getSimpleValueType() == VT && "Bad operand type!");
assert(V2.getSimpleValueType() == VT && "Bad operand type!");
- ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
- ArrayRef<int> Mask = SVOp->getMask();
ArrayRef<int> LoMask = Mask.slice(0, Mask.size()/2);
ArrayRef<int> HiMask = Mask.slice(Mask.size()/2);
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
}
+/// \brief Lower a vector shuffle crossing multiple 128-bit lanes as
+/// a permutation and blend of those lanes.
+///
+/// This essentially blends the out-of-lane inputs to each lane into the lane
+/// from a permuted copy of the vector. This lowering strategy results in four
+/// instructions in the worst case for a single-input cross lane shuffle which
+/// is lower than any other fully general cross-lane shuffle strategy I'm aware
+/// of. Special cases for each particular shuffle pattern should be handled
+/// prior to trying this lowering.
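+///
+/// For example, the single-input v4f64 mask <2, 1, 0, 3> becomes a
+/// VPERM2X128 that swaps the halves of V1, followed by the in-lane blend
+/// <4, 1, 6, 3> of V1 with the flipped copy.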
+static SDValue lowerVectorShuffleAsLanePermuteAndBlend(SDLoc DL, MVT VT,
+ SDValue V1, SDValue V2,
+ ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
+ // FIXME: This should probably be generalized for 512-bit vectors as well.
+ assert(VT.getSizeInBits() == 256 && "Only for 256-bit vector shuffles!");
+ int LaneSize = Mask.size() / 2;
+
+ // If there are only inputs from one 128-bit lane, splitting will in fact be
+ // less expensive. The flags track whether the given lane contains an element
+ // that crosses to another lane.
+ bool LaneCrossing[2] = {false, false};
+ for (int i = 0, Size = Mask.size(); i < Size; ++i)
+ if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
+ LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
+ if (!LaneCrossing[0] || !LaneCrossing[1])
+ return splitAndLower256BitVectorShuffle(DL, VT, V1, V2, Mask, DAG);
+
+ if (isSingleInputShuffleMask(Mask)) {
+ SmallVector<int, 32> FlippedBlendMask;
+ for (int i = 0, Size = Mask.size(); i < Size; ++i)
+ FlippedBlendMask.push_back(
+ Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize)
+ ? Mask[i]
+ : Mask[i] % LaneSize +
+ (i / LaneSize) * LaneSize + Size));
+
+ // Flip the vector, and blend the results which should now be in-lane. The
+ // VPERM2X128 mask uses the low 2 bits for the low source and bits 4 and
+ // 5 for the high source. The value 3 selects the high half of source 2 and
+ // the value 2 selects the low half of source 2. We only use source 2 to
+ // allow folding it into a memory operand.
+ unsigned PERMMask = 3 | 2 << 4;
+ SDValue Flipped = DAG.getNode(X86ISD::VPERM2X128, DL, VT, DAG.getUNDEF(VT),
+ V1, DAG.getConstant(PERMMask, MVT::i8));
+ return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
+ }
+
+ // This now reduces to two single-input shuffles of V1 and V2 which at worst
+ // will be handled by the above logic and a blend of the results, much like
+ // other patterns in AVX.
+ return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
+}
+
/// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
///
/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
- // FIXME: If we have AVX2, we should delegate to generic code as crossing
- // shuffles aren't a problem and FP and int have the same patterns.
-
- // FIXME: We can handle these more cleverly than splitting for v4f64.
- if (isHalfCrossingShuffleMask(Mask))
- return splitAndLower256BitVectorShuffle(Op, V1, V2, Subtarget, DAG);
-
if (isSingleInputShuffleMask(Mask)) {
- // Non-half-crossing single input shuffles can be lowerid with an
- // interleaved permutation.
- unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
- ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
- return DAG.getNode(X86ISD::VPERMILP, DL, MVT::v4f64, V1,
- DAG.getConstant(VPERMILPMask, MVT::i8));
+ if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
+ // Non-half-crossing single input shuffles can be lowered with an
+ // interleaved permutation.
+ unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
+ ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
+ return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
+ DAG.getConstant(VPERMILPMask, MVT::i8));
+ }
+
+ // With AVX2 we have direct support for this permutation.
+ if (Subtarget->hasAVX2())
+ return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
+ getV4X86ShuffleImm8ForMask(Mask, DAG));
+
+ // Otherwise, fall back.
+ return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
+ DAG);
}
// X86 has dedicated unpack instructions that can handle specific blend
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f64, V1, V2);
if (isShuffleEquivalent(Mask, 1, 5, 3, 7))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V1, V2);
- // FIXME: It would be nice to find a way to get canonicalization to commute
- // these patterns.
- if (isShuffleEquivalent(Mask, 4, 0, 6, 2))
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f64, V2, V1);
- if (isShuffleEquivalent(Mask, 5, 1, 7, 3))
- return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V2, V1);
+
+ // If we have a single input to the zero element, insert that into V1 if we
+ // can do so cheaply.
+ int NumV2Elements =
+ std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
+ if (NumV2Elements == 1 && Mask[0] >= 4)
+ if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
+ MVT::v4f64, DL, V1, V2, Mask, Subtarget, DAG))
+ return Insertion;
+
+ if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
+ Subtarget, DAG))
+ return Blend;
// Check if the blend happens to exactly fit that of SHUFPD.
- if (Mask[0] < 4 && (Mask[1] == -1 || Mask[1] >= 4) &&
- Mask[2] < 4 && (Mask[3] == -1 || Mask[3] >= 4)) {
+ if ((Mask[0] == -1 || Mask[0] < 2) &&
+ (Mask[1] == -1 || (Mask[1] >= 4 && Mask[1] < 6)) &&
+ (Mask[2] == -1 || (Mask[2] >= 2 && Mask[2] < 4)) &&
+ (Mask[3] == -1 || Mask[3] >= 6)) {
unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 5) << 1) |
((Mask[2] == 3) << 2) | ((Mask[3] == 7) << 3);
return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f64, V1, V2,
DAG.getConstant(SHUFPDMask, MVT::i8));
}
- if ((Mask[0] == -1 || Mask[0] >= 4) && Mask[1] < 4 &&
- (Mask[2] == -1 || Mask[2] >= 4) && Mask[3] < 4) {
+ if ((Mask[0] == -1 || (Mask[0] >= 4 && Mask[0] < 6)) &&
+ (Mask[1] == -1 || Mask[1] < 2) &&
+ (Mask[2] == -1 || Mask[2] >= 6) &&
+ (Mask[3] == -1 || (Mask[3] >= 2 && Mask[3] < 4))) {
unsigned SHUFPDMask = (Mask[0] == 5) | ((Mask[1] == 1) << 1) |
((Mask[2] == 7) << 2) | ((Mask[3] == 3) << 3);
return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f64, V2, V1,
DAG.getConstant(SHUFPDMask, MVT::i8));
}
- // Shuffle the input elements into the desired positions in V1 and V2 and
- // blend them together.
- int V1Mask[] = {-1, -1, -1, -1};
- int V2Mask[] = {-1, -1, -1, -1};
- for (int i = 0; i < 4; ++i)
- if (Mask[i] >= 0 && Mask[i] < 4)
- V1Mask[i] = Mask[i];
- else if (Mask[i] >= 4)
- V2Mask[i] = Mask[i] - 4;
-
- V1 = DAG.getVectorShuffle(MVT::v4f64, DL, V1, DAG.getUNDEF(MVT::v4f64), V1Mask);
- V2 = DAG.getVectorShuffle(MVT::v4f64, DL, V2, DAG.getUNDEF(MVT::v4f64), V2Mask);
-
- unsigned BlendMask = 0;
- for (int i = 0; i < 4; ++i)
- if (Mask[i] >= 4)
- BlendMask |= 1 << i;
-
- return DAG.getNode(X86ISD::BLENDI, DL, MVT::v4f64, V1, V2,
- DAG.getConstant(BlendMask, MVT::i8));
+ // Otherwise fall back on generic blend lowering.
+ return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
+ Mask, DAG);
}
/// \brief Handle lowering of 4-lane 64-bit integer shuffles.
///
-/// Largely delegates to common code when we have AVX2 and to the floating-point
-/// code when we only have AVX.
+/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v4i64 shuffling.
static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
SDLoc DL(Op);
- assert(Op.getSimpleValueType() == MVT::v4i64 && "Bad shuffle type!");
assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
+ assert(Subtarget->hasAVX2() && "We can only lower v4i64 with AVX2!");
+
+ if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
+ Subtarget, DAG))
+ return Blend;
+
+ // When the shuffle is mirrored between the two 128-bit lanes of the
+ // vector, we can use lower-latency instructions that will operate on both
+ // 128-bit lanes.
+ SmallVector<int, 2> RepeatedMask;
+ if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
+ if (isSingleInputShuffleMask(Mask)) {
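+ // Widen the repeated 64-bit mask into a v8i32 PSHUFD mask by expanding
+ // each index into a pair of adjacent dword indices, e.g. a repeated mask
+ // of <1, 0> becomes the PSHUFD mask <2, 3, 0, 1>.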
+ int PSHUFDMask[] = {-1, -1, -1, -1};
+ for (int i = 0; i < 2; ++i)
+ if (RepeatedMask[i] >= 0) {
+ PSHUFDMask[2 * i] = 2 * RepeatedMask[i];
+ PSHUFDMask[2 * i + 1] = 2 * RepeatedMask[i] + 1;
+ }
+ return DAG.getNode(
+ ISD::BITCAST, DL, MVT::v4i64,
+ DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
+ DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, V1),
+ getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
+ }
+
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (isShuffleEquivalent(Mask, 0, 4, 2, 6))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V1, V2);
+ if (isShuffleEquivalent(Mask, 1, 5, 3, 7))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V1, V2);
+ }
+
+ // AVX2 provides a direct instruction for permuting a single input across
+ // lanes.
+ if (isSingleInputShuffleMask(Mask))
+ return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
+ getV4X86ShuffleImm8ForMask(Mask, DAG));
+
+ // Otherwise fall back on generic blend lowering.
+ return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
+ Mask, DAG);
+}
+
+/// \brief Handle lowering of 8-lane 32-bit floating point shuffles.
+///
+/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
+/// isn't available.
+static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+ ArrayRef<int> Mask = SVOp->getMask();
+ assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
+
+ if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
+ Subtarget, DAG))
+ return Blend;
+
+ // If the shuffle mask is repeated in each 128-bit lane, we have many more
+ // options to efficiently lower the shuffle.
+ SmallVector<int, 4> RepeatedMask;
+ if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
+ assert(RepeatedMask.size() == 4 &&
+ "Repeated masks must be half the mask width!");
+ if (isSingleInputShuffleMask(Mask))
+ return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
+ getV4X86ShuffleImm8ForMask(RepeatedMask, DAG));
+
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 4, 12, 5, 13))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f32, V1, V2);
+ if (isShuffleEquivalent(Mask, 2, 10, 3, 11, 6, 14, 7, 15))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f32, V1, V2);
+
+ // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
+ // have already handled any direct blends. We also need to squash the
+ // repeated mask into a simulated v4f32 mask.
+ for (int i = 0; i < 4; ++i)
+ if (RepeatedMask[i] >= 8)
+ RepeatedMask[i] -= 4;
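+ // E.g. the repeated mask <0, 8, 1, 9> becomes <0, 4, 1, 5>, where indices
+ // 4..7 denote elements of V2 in the simulated v4f32 shuffle.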
+ return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
+ }
+
+ // If we have a single input shuffle with different shuffle patterns in the
+ // two 128-bit lanes, use a variable mask with VPERMILPS.
+ if (isSingleInputShuffleMask(Mask)) {
+ SDValue VPermMask[8];
+ for (int i = 0; i < 8; ++i)
+ VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32)
+ : DAG.getConstant(Mask[i], MVT::i32);
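+ // VPERMILPS only moves elements within their 128-bit lane, so it handles
+ // any non-lane-crossing mask; crossing masks need the AVX2 VPERMPS below.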
+ if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
+ return DAG.getNode(
+ X86ISD::VPERMILPV, DL, MVT::v8f32, V1,
+ DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask));
+
+ if (Subtarget->hasAVX2())
+ return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32,
+ DAG.getNode(ISD::BITCAST, DL, MVT::v8f32,
+ DAG.getNode(ISD::BUILD_VECTOR, DL,
+ MVT::v8i32, VPermMask)),
+ V1);
+
+ // Otherwise, fall back.
+ return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
+ DAG);
+ }
+
+ // Otherwise fall back on generic blend lowering.
+ return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
+ Mask, DAG);
+}
- // FIXME: If we have AVX2, we should delegate to generic code as crossing
- // shuffles aren't a problem and FP and int have the same patterns.
+/// \brief Handle lowering of 8-lane 32-bit integer shuffles.
+///
+/// This routine is only called when we have AVX2 and thus a reasonable
+/// instruction set for v8i32 shuffling.
+static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+ ArrayRef<int> Mask = SVOp->getMask();
+ assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
+ assert(Subtarget->hasAVX2() && "We can only lower v8i32 with AVX2!");
+
+ if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
+ Subtarget, DAG))
+ return Blend;
+
+ // If the shuffle mask is repeated in each 128-bit lane, we can use more
+ // efficient instructions that mirror the shuffles across the two 128-bit
+ // lanes.
+ SmallVector<int, 4> RepeatedMask;
+ if (is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask)) {
+ assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
+ if (isSingleInputShuffleMask(Mask))
+ return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
+ getV4X86ShuffleImm8ForMask(RepeatedMask, DAG));
+
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 4, 12, 5, 13))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i32, V1, V2);
+ if (isShuffleEquivalent(Mask, 2, 10, 3, 11, 6, 14, 7, 15))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i32, V1, V2);
+ }
+
+ // If the shuffle patterns aren't repeated but it is a single input, directly
+ // generate a cross-lane VPERMD instruction.
+ if (isSingleInputShuffleMask(Mask)) {
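+ // VPERMD takes its shuffle control in a vector register rather than an
+ // immediate, so materialize the mask as a v8i32 build vector.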
+ SDValue VPermMask[8];
+ for (int i = 0; i < 8; ++i)
+ VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32)
+ : DAG.getConstant(Mask[i], MVT::i32);
+ return DAG.getNode(
+ X86ISD::VPERMV, DL, MVT::v8i32,
+ DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask), V1);
+ }
+
+ // Otherwise fall back on generic blend lowering.
+ return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
+ Mask, DAG);
+}
+
+/// \brief Handle lowering of 16-lane 16-bit integer shuffles.
+///
+/// This routine is only called when we have AVX2 and thus a reasonable
+/// instruction set for v16i16 shuffling.
+static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+ ArrayRef<int> Mask = SVOp->getMask();
+ assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
+ assert(Subtarget->hasAVX2() && "We can only lower v16i16 with AVX2!");
+
+ // There are no generalized cross-lane shuffle operations available on i16
+ // element types.
+ if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
+ return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
+ Mask, DAG);
+
+ if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
+ Subtarget, DAG))
+ return Blend;
+
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (isShuffleEquivalent(Mask,
+ // First 128-bit lane:
+ 0, 16, 1, 17, 2, 18, 3, 19,
+ // Second 128-bit lane:
+ 8, 24, 9, 25, 10, 26, 11, 27))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i16, V1, V2);
+ if (isShuffleEquivalent(Mask,
+ // First 128-bit lane:
+ 4, 20, 5, 21, 6, 22, 7, 23,
+ // Second 128-bit lane:
+ 12, 28, 13, 29, 14, 30, 15, 31))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i16, V1, V2);
- if (isHalfCrossingShuffleMask(Mask))
- return splitAndLower256BitVectorShuffle(Op, V1, V2, Subtarget, DAG);
+ if (isSingleInputShuffleMask(Mask)) {
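+ // PSHUFB operates on bytes within each 128-bit lane, so expand each i16
+ // index into its pair of byte indices, e.g. a word index of 3 becomes the
+ // byte indices 6 and 7.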
+ SDValue PSHUFBMask[32];
+ for (int i = 0; i < 16; ++i) {
+ if (Mask[i] == -1) {
+ PSHUFBMask[2 * i] = PSHUFBMask[2 * i + 1] = DAG.getUNDEF(MVT::i8);
+ continue;
+ }
+
+ int M = i < 8 ? Mask[i] : Mask[i] - 8;
+ assert(M >= 0 && M < 8 && "Invalid single-input mask!");
+ PSHUFBMask[2 * i] = DAG.getConstant(2 * M, MVT::i8);
+ PSHUFBMask[2 * i + 1] = DAG.getConstant(2 * M + 1, MVT::i8);
+ }
+ return DAG.getNode(
+ ISD::BITCAST, DL, MVT::v16i16,
+ DAG.getNode(
+ X86ISD::PSHUFB, DL, MVT::v32i8,
+ DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V1),
+ DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask)));
+ }
- // AVX1 doesn't provide any facilities for v4i64 shuffles, bitcast and
- // delegate to floating point code.
- V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v4f64, V1);
- V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v4f64, V2);
- return DAG.getNode(ISD::BITCAST, DL, MVT::v4i64,
- lowerV4F64VectorShuffle(Op, V1, V2, Subtarget, DAG));
+ // Otherwise fall back on generic blend lowering.
+ return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i16, V1, V2,
+ Mask, DAG);
+}
+
+/// \brief Handle lowering of 32-lane 8-bit integer shuffles.
+///
+/// This routine is only called when we have AVX2 and thus a reasonable
+/// instruction set for v32i8 shuffling.
+static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+ ArrayRef<int> Mask = SVOp->getMask();
+ assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
+ assert(Subtarget->hasAVX2() && "We can only lower v32i8 with AVX2!");
+
+ // There are no generalized cross-lane shuffle operations available on i8
+ // element types.
+ if (is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
+ return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2,
+ Mask, DAG);
+
+ if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
+ Subtarget, DAG))
+ return Blend;
+
+ // Use dedicated unpack instructions for masks that match their pattern.
+ // Note that these are repeated 128-bit lane unpacks, not unpacks across the
+ // full 256-bit vector.
+ if (isShuffleEquivalent(
+ Mask,
+ // First 128-bit lane:
+ 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39,
+ // Second 128-bit lane:
+ 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v32i8, V1, V2);
+ if (isShuffleEquivalent(
+ Mask,
+ // First 128-bit lane:
+ 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47,
+ // Second 128-bit lane:
+ 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v32i8, V1, V2);
+
+ if (isSingleInputShuffleMask(Mask)) {
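+ // PSHUFB indexes bytes within each 128-bit lane, so rebase the high-lane
+ // mask entries (16..31) into the 0..15 range when building the byte mask.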
+ SDValue PSHUFBMask[32];
+ for (int i = 0; i < 32; ++i)
+ PSHUFBMask[i] =
+ Mask[i] < 0
+ ? DAG.getUNDEF(MVT::i8)
+ : DAG.getConstant(Mask[i] < 16 ? Mask[i] : Mask[i] - 16, MVT::i8);
+
+ return DAG.getNode(
+ X86ISD::PSHUFB, DL, MVT::v32i8, V1,
+ DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask));
+ }
+
+ // Otherwise fall back on generic blend lowering.
+ return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v32i8, V1, V2,
+ Mask, DAG);
}
/// \brief High-level routine to lower various 256-bit x86 vector shuffles.
static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
MVT VT, const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+ ArrayRef<int> Mask = SVOp->getMask();
+
+ // There is a really nice hard cut-over between AVX1 and AVX2 that means we can
+ // check for those subtargets here and avoid much of the subtarget querying in
+ // the per-vector-type lowering routines. With AVX1 we have essentially *zero*
+ // ability to manipulate a 256-bit vector with integer types. Since we'll use
+ // floating point types there eventually, just immediately cast everything to
+ // a float and operate entirely in that domain.
+ if (VT.isInteger() && !Subtarget->hasAVX2()) {
+ int ElementBits = VT.getScalarSizeInBits();
+ if (ElementBits < 32)
+ // No floating point type available, decompose into 128-bit vectors.
+ return splitAndLower256BitVectorShuffle(DL, VT, V1, V2, Mask, DAG);
+
+ MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
+ VT.getVectorNumElements());
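+ // E.g. v4i64 shuffles are lowered as v4f64 and v8i32 shuffles as v8f32;
+ // the resulting shuffle is then handled by the routines above.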
+ V1 = DAG.getNode(ISD::BITCAST, DL, FpVT, V1);
+ V2 = DAG.getNode(ISD::BITCAST, DL, FpVT, V2);
+ return DAG.getNode(ISD::BITCAST, DL, VT,
+ DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
+ }
+
switch (VT.SimpleTy) {
case MVT::v4f64:
return lowerV4F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
case MVT::v4i64:
return lowerV4I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
- case MVT::v8i32:
case MVT::v8f32:
+ return lowerV8F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
+ case MVT::v8i32:
+ return lowerV8I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
case MVT::v16i16:
+ return lowerV16I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
case MVT::v32i8:
- // Fall back to the basic pattern of extracting the high half and forming
- // a 4-way blend.
- // FIXME: Add targeted lowering for each type that can document rationale
- // for delegating to this when necessary.
- return splitAndLower256BitVectorShuffle(Op, V1, V2, Subtarget, DAG);
+ return lowerV32I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
default:
llvm_unreachable("Not a valid 256-bit x86 vector type!");
}
}
-/// \brief Tiny helper function to test whether a shuffle mask could be
+/// \brief Helper function to test whether a shuffle mask could be
/// simplified by widening the elements being shuffled.
-static bool canWidenShuffleElements(ArrayRef<int> Mask) {
- for (int i = 0, Size = Mask.size(); i < Size; i += 2)
- if ((Mask[i] != -1 && Mask[i] % 2 != 0) ||
- (Mask[i + 1] != -1 && (Mask[i + 1] % 2 != 1 ||
- (Mask[i] != -1 && Mask[i] + 1 != Mask[i + 1]))))
- return false;
+///
+/// Appends the widened mask to WidenedMask if the elements can be widened;
+/// otherwise leaves WidenedMask in an unspecified state.
+///
+/// NOTE: This must handle normal vector shuffle masks and *target* vector
+/// shuffle masks. The latter have the special property of a '-2' representing
+/// a zeroed lane of a vector.
+static bool canWidenShuffleElements(ArrayRef<int> Mask,
+ SmallVectorImpl<int> &WidenedMask) {
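+ // E.g. the v4i32 mask <0, 1, 6, 7> widens to the v2i64 mask <0, 3>, while
+ // <0, 2, 4, 6> cannot be widened because its pairs are not adjacent.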
+ for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
+ // If both elements are the same (negative) sentinel value, we can widen
+ // directly to that value.
+ if (Mask[i] < 0 && Mask[i] == Mask[i + 1]) {
+ WidenedMask.push_back(Mask[i]);
+ continue;
+ }
+
+ // Check for an undef element paired with a defined element that is
+ // properly aligned for the widened position; in that case, use the
+ // defined element's value.
+ if (Mask[i] == -1 && Mask[i + 1] >= 0 && Mask[i + 1] % 2 == 1) {
+ WidenedMask.push_back(Mask[i + 1] / 2);
+ continue;
+ }
+ if (Mask[i + 1] == -1 && Mask[i] >= 0 && Mask[i] % 2 == 0) {
+ WidenedMask.push_back(Mask[i] / 2);
+ continue;
+ }
+
+ // Finally, check whether the two mask values are adjacent and properly
+ // aligned to form a pair.
+ if (Mask[i] != -1 && Mask[i] % 2 == 0 && Mask[i] + 1 == Mask[i + 1]) {
+ WidenedMask.push_back(Mask[i] / 2);
+ continue;
+ }
+
+ // Otherwise we can't safely widen the elements used in this shuffle.
+ return false;
+ }
+ assert(WidenedMask.size() == Mask.size() / 2 &&
+ "Incorrect size of mask after widening the elements!");
return true;
}
// lanes but wider integers. We cap this to not form integers larger than i64
// but it might be interesting to form i128 integers to handle flipping the
// low and high halves of AVX 256-bit vectors.
+ SmallVector<int, 16> WidenedMask;
if (VT.isInteger() && VT.getScalarSizeInBits() < 64 &&
- canWidenShuffleElements(Mask)) {
- SmallVector<int, 8> NewMask;
- for (int i = 0, Size = Mask.size(); i < Size; i += 2)
- NewMask.push_back(Mask[i] != -1
- ? Mask[i] / 2
- : (Mask[i + 1] != -1 ? Mask[i + 1] / 2 : -1));
+ canWidenShuffleElements(Mask, WidenedMask)) {
MVT NewVT =
MVT::getVectorVT(MVT::getIntegerVT(VT.getScalarSizeInBits() * 2),
VT.getVectorNumElements() / 2);
V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1);
V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2);
return DAG.getNode(ISD::BITCAST, dl, VT,
- DAG.getVectorShuffle(NewVT, dl, V1, V2, NewMask));
+ DAG.getVectorShuffle(NewVT, dl, V1, V2, WidenedMask));
}
int NumV1Elements = 0, NumUndefElements = 0, NumV2Elements = 0;
return DAG.getCommutedVectorShuffle(*SVOp);
// When the number of V1 and V2 elements are the same, try to minimize the
- // number of uses of V2 in the low half of the vector.
+ // number of uses of V2 in the low half of the vector. When that is tied,
+ // ensure that the sum of indices for V1 is equal to or lower than the sum
+ // of indices for V2.
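+ // E.g. the mask <4, 1, 6, 3> ties on low-half uses but has a smaller index
+ // sum for V2 (0 + 2) than for V1 (1 + 3), so it is commuted to <0, 5, 2, 7>.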
if (NumV1Elements == NumV2Elements) {
int LowV1Elements = 0, LowV2Elements = 0;
for (int M : SVOp->getMask().slice(0, NumElements / 2))
if (M >= NumElements)
++LowV2Elements;
else if (M >= 0)
++LowV1Elements;
- if (LowV2Elements > LowV1Elements)
+ if (LowV2Elements > LowV1Elements) {
return DAG.getCommutedVectorShuffle(*SVOp);
+ } else if (LowV2Elements == LowV1Elements) {
+ int SumV1Indices = 0, SumV2Indices = 0;
+ for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i)
+ if (SVOp->getMask()[i] >= NumElements)
+ SumV2Indices += i;
+ else if (SVOp->getMask()[i] >= 0)
+ SumV1Indices += i;
+ if (SumV2Indices < SumV1Indices)
+ return DAG.getCommutedVectorShuffle(*SVOp);
+ }
}
// For each vector width, delegate to a specialized lowering routine.
return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG);
if (HasFp256 && (VT == MVT::v4f32 || VT == MVT::v2f64))
- return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1, TargetMask,
+ return getTargetShuffleNode(X86ISD::VPERMILPI, dl, VT, V1, TargetMask,
DAG);
return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V1,
if ((HasInt256 && VT == MVT::v8i32) || VT == MVT::v16i32)
return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1,
getShuffleSHUFImmediate(SVOp), DAG);
- return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1,
+ return getTargetShuffleNode(X86ISD::VPERMILPI, dl, VT, V1,
getShuffleSHUFImmediate(SVOp), DAG);
}
EVT VT = Op.getNode()->getValueType(0);
bool Is64Bit = Subtarget->is64Bit();
- EVT SPTy = Is64Bit ? MVT::i64 : MVT::i32;
+ EVT SPTy = getPointerTy();
if (SplitStack) {
MachineRegisterInfo &MRI = MF.getRegInfo();
}
const TargetRegisterClass *AddrRegClass =
- getRegClassFor(Subtarget->is64Bit() ? MVT::i64:MVT::i32);
+ getRegClassFor(getPointerTy());
unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
return DAG.getMergeValues(Ops1, dl);
} else {
SDValue Flag;
- unsigned Reg = (Subtarget->is64Bit() ? X86::RAX : X86::EAX);
+ const unsigned Reg = (Subtarget->isTarget64BitLP64() ? X86::RAX : X86::EAX);
Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag);
Flag = Chain.getValue(1);
return needsCmpXchgNb(SI->getValueOperand()->getType());
}
-bool X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *SI) const {
- return false; // FIXME, currently these are expanded separately in this file.
+// Note: this turns large loads into lock cmpxchg8b/16b.
+// FIXME: On 32-bit x86, fild/movq might be faster than lock cmpxchg8b.
+bool X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
+ auto PTy = cast<PointerType>(LI->getPointerOperand()->getType());
+ return needsCmpXchgNb(PTy->getElementType());
}
bool X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
}
}
+static bool hasMFENCE(const X86Subtarget& Subtarget) {
+ // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for
+ // no-sse2). There isn't any reason to disable it if the target processor
+ // supports it.
+ return Subtarget.hasSSE2() || Subtarget.is64Bit();
+}
+
+LoadInst *
+X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
+ const X86Subtarget &Subtarget =
+ getTargetMachine().getSubtarget<X86Subtarget>();
+ unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
+ const Type *MemType = AI->getType();
+ // Accesses larger than the native width are turned into cmpxchg/libcalls, so
+ // there is no benefit in turning such RMWs into loads, and it is actually
+ // harmful as it introduces a mfence.
+ if (MemType->getPrimitiveSizeInBits() > NativeWidth)
+ return nullptr;
+
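+ // At this point the RMW is known to be idempotent, i.e. it leaves memory
+ // unchanged; the canonical example is:
+ //   %old = atomicrmw or i32* %p, i32 0 seq_cst
+ // We rewrite it as a suitable fence followed by a plain atomic load.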
+ auto Builder = IRBuilder<>(AI);
+ Module *M = Builder.GetInsertBlock()->getParent()->getParent();
+ auto SynchScope = AI->getSynchScope();
+ // We must restrict the ordering to avoid generating loads with Release or
+ // ReleaseAcquire orderings.
+ auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
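+ // E.g. a 'release' RMW becomes a monotonic load here and an 'acq_rel' RMW
+ // becomes an acquire load; both are legal load orderings.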
+ auto Ptr = AI->getPointerOperand();
+
+ // Before the load we need a fence. Here is an example lifted from
+ // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
+ // is required:
+ // Thread 0:
+ // x.store(1, relaxed);
+ // r1 = y.fetch_add(0, release);
+ // Thread 1:
+ // y.fetch_add(42, acquire);
+ // r2 = x.load(relaxed);
+ // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
+ // lowered to just a load without a fence. An mfence flushes the store
+ // buffer, making the optimization clearly correct.
+ // FIXME: the fence is required if isAtLeastRelease(Order), but it is not
+ // clearly needed otherwise; we might be able to be more aggressive on
+ // relaxed idempotent rmws. In practice, they do not look useful, so we
+ // don't try to be especially clever.
+ if (SynchScope == SingleThread) {
+ // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
+ // the IR level, so we must wrap it in an intrinsic.
+ return nullptr;
+ } else if (hasMFENCE(Subtarget)) {
+ Function *MFence = llvm::Intrinsic::getDeclaration(M,
+ Intrinsic::x86_sse2_mfence);
+ Builder.CreateCall(MFence);
+ } else {
+ // FIXME: it might make sense to use a locked operation here but on a
+ // different cache-line to prevent cache-line bouncing. In practice it
+ // is probably a small win, and x86 processors without mfence are rare
+ // enough that we do not bother.
+ return nullptr;
+ }
+
+ // Finally we can emit the atomic load.
+ // Note that getPrimitiveSizeInBits() is in bits while CreateAlignedLoad
+ // expects its alignment argument in bytes.
+ LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr,
+ AI->getType()->getPrimitiveSizeInBits() / 8);
+ Loaded->setAtomic(Order, SynchScope);
+ AI->replaceAllUsesWith(Loaded);
+ AI->eraseFromParent();
+ return Loaded;
+}
+
static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
SDLoc dl(Op);
// The only fence that needs an instruction is a sequentially-consistent
// cross-thread fence.
if (FenceOrdering == SequentiallyConsistent && FenceScope == CrossThread) {
- // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for
- // no-sse2). There isn't any reason to disable it if the target processor
- // supports it.
- if (Subtarget->hasSSE2() || Subtarget->is64Bit())
+ if (hasMFENCE(*Subtarget))
return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
SDValue Chain = Op.getOperand(0);
}
}
-static void ReplaceATOMIC_LOAD(SDNode *Node,
- SmallVectorImpl<SDValue> &Results,
- SelectionDAG &DAG) {
- SDLoc dl(Node);
- EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
-
- // Convert wide load -> cmpxchg8b/cmpxchg16b
- // FIXME: On 32-bit, load -> fild or movq would be more efficient
- // (The only way to get a 16-byte load is cmpxchg16b)
- // FIXME: 16-byte ATOMIC_CMP_SWAP isn't actually hooked up at the moment.
- SDValue Zero = DAG.getConstant(0, VT);
- SDVTList VTs = DAG.getVTList(VT, MVT::i1, MVT::Other);
- SDValue Swap =
- DAG.getAtomicCmpSwap(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, dl, VT, VTs,
- Node->getOperand(0), Node->getOperand(1), Zero, Zero,
- cast<AtomicSDNode>(Node)->getMemOperand(),
- cast<AtomicSDNode>(Node)->getOrdering(),
- cast<AtomicSDNode>(Node)->getOrdering(),
- cast<AtomicSDNode>(Node)->getSynchScope());
- Results.push_back(Swap.getValue(0));
- Results.push_back(Swap.getValue(2));
-}
-
/// ReplaceNodeResults - Replace a node with an illegal result type
/// with a new node built out of custom code.
void X86TargetLowering::ReplaceNodeResults(SDNode *N,
case ISD::ATOMIC_LOAD_MAX:
case ISD::ATOMIC_LOAD_UMIN:
case ISD::ATOMIC_LOAD_UMAX:
+ case ISD::ATOMIC_LOAD: {
// Delegate to generic TypeLegalization. Situations we can really handle
// should have already been dealt with by AtomicExpandPass.cpp.
break;
- case ISD::ATOMIC_LOAD: {
- ReplaceATOMIC_LOAD(N, Results, DAG);
- return;
}
case ISD::BITCAST: {
assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
case X86ISD::PSHUFB: return "X86ISD::PSHUFB";
case X86ISD::ANDNP: return "X86ISD::ANDNP";
case X86ISD::PSIGN: return "X86ISD::PSIGN";
- case X86ISD::BLENDV: return "X86ISD::BLENDV";
case X86ISD::BLENDI: return "X86ISD::BLENDI";
case X86ISD::SUBUS: return "X86ISD::SUBUS";
case X86ISD::HADD: return "X86ISD::HADD";
case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST";
case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM";
case X86ISD::VEXTRACT: return "X86ISD::VEXTRACT";
- case X86ISD::VPERMILP: return "X86ISD::VPERMILP";
+ case X86ISD::VPERMILPI: return "X86ISD::VPERMILPI";
case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128";
case X86ISD::VPERMV: return "X86ISD::VPERMV";
case X86ISD::VPERMV3: return "X86ISD::VPERMV3";
}
MachineBasicBlock *
-X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB,
- bool Is64Bit) const {
+X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI,
+ MachineBasicBlock *BB) const {
MachineFunction *MF = BB->getParent();
const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
DebugLoc DL = MI->getDebugLoc();
assert(MF->shouldSplitStack());
- unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
- unsigned TlsOffset = Is64Bit ? 0x70 : 0x30;
+ const bool Is64Bit = Subtarget->is64Bit();
+ const bool IsLP64 = Subtarget->isTarget64BitLP64();
+
+ const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
+ const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
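+ // These offsets point at the per-thread stack limit slot used by segmented
+ // stacks: fs:0x70 for LP64, fs:0x40 for x32, and gs:0x30 for 32-bit.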
// BB:
// ... [Till the alloca]
MachineRegisterInfo &MRI = MF->getRegInfo();
const TargetRegisterClass *AddrRegClass =
- getRegClassFor(Is64Bit ? MVT::i64:MVT::i32);
+ getRegClassFor(getPointerTy());
unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
sizeVReg = MI->getOperand(1).getReg(),
- physSPReg = Is64Bit ? X86::RSP : X86::ESP;
+ physSPReg = IsLP64 || Subtarget->isTargetNaCl64() ? X86::RSP : X86::ESP;
MachineFunction::iterator MBBIter = BB;
++MBBIter;
// Add code to the main basic block to check if the stack limit has been hit,
// and if so, jump to mallocMBB otherwise to bumpMBB.
BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
- BuildMI(BB, DL, TII->get(Is64Bit ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
+ BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
.addReg(tmpSPVReg).addReg(sizeVReg);
- BuildMI(BB, DL, TII->get(Is64Bit ? X86::CMP64mr:X86::CMP32mr))
+ BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
.addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
.addReg(SPLimitVReg);
BuildMI(BB, DL, TII->get(X86::JG_4)).addMBB(mallocMBB);
.getSubtargetImpl()
->getRegisterInfo()
->getCallPreservedMask(CallingConv::C);
- if (Is64Bit) {
+ if (IsLP64) {
BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
.addReg(sizeVReg);
BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
.addRegMask(RegMask)
.addReg(X86::RDI, RegState::Implicit)
.addReg(X86::RAX, RegState::ImplicitDefine);
+ } else if (Is64Bit) {
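+ // x32: pointers and size_t are 32 bits, so pass the size in EDI and read
+ // the result from EAX, while still emitting a 64-bit call.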
+ BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
+ .addReg(sizeVReg);
+ BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
+ .addExternalSymbol("__morestack_allocate_stack_space")
+ .addRegMask(RegMask)
+ .addReg(X86::EDI, RegState::Implicit)
+ .addReg(X86::EAX, RegState::ImplicitDefine);
} else {
BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
.addImm(12);
.addImm(16);
BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
- .addReg(Is64Bit ? X86::RAX : X86::EAX);
+ .addReg(IsLP64 ? X86::RAX : X86::EAX);
BuildMI(mallocMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB);
// Set up the CFG correctly.
case X86::WIN_ALLOCA:
return EmitLoweredWinAlloca(MI, BB);
case X86::SEG_ALLOCA_32:
- return EmitLoweredSegAlloca(MI, BB, false);
case X86::SEG_ALLOCA_64:
- return EmitLoweredSegAlloca(MI, BB, true);
+ return EmitLoweredSegAlloca(MI, BB);
case X86::TLSCall_32:
case X86::TLSCall_64:
return EmitLoweredTLSCall(MI, BB);
assert(Mask.size() <= 16 && "Can't shuffle elements smaller than bytes!");
int Ratio = 16 / Mask.size();
for (unsigned i = 0; i < 16; ++i) {
+ if (Mask[i / Ratio] == SM_SentinelUndef) {
+ PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
+ continue;
+ }
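+ // Each input mask element covers Ratio output bytes, e.g. with a v4i32
+ // mask the element M expands to bytes 4*M .. 4*M+3; SM_SentinelZero lanes
+ // use index 255, which PSHUFB treats as a zeroing control byte.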
int M = Mask[i / Ratio] != SM_SentinelZero
? Ratio * Mask[i / Ratio] + i % Ratio
: 255;
// for this order is that we are recursing up the operation chain.
for (int i = 0, e = std::max(OpMask.size(), RootMask.size()); i < e; ++i) {
int RootIdx = i / RootRatio;
- if (RootMask[RootIdx] == SM_SentinelZero) {
- // This is a zero-ed lane, we're done.
- Mask.push_back(SM_SentinelZero);
+ if (RootMask[RootIdx] < 0) {
+ // This is a zero or undef lane, we're done.
+ Mask.push_back(RootMask[RootIdx]);
continue;
}
int RootMaskedIdx = RootMask[RootIdx] * RootRatio + i % RootRatio;
int OpIdx = RootMaskedIdx / OpRatio;
- if (OpMask[OpIdx] == SM_SentinelZero) {
- // The incoming lanes are zero, it doesn't matter which ones we are using.
- Mask.push_back(SM_SentinelZero);
+ if (OpMask[OpIdx] < 0) {
+ // The incoming lanes are zero or undef, it doesn't matter which ones we
+ // are using.
+ Mask.push_back(OpMask[OpIdx]);
continue;
}
// elements, and shrink them to the half-width mask. It does this in a loop
// so it will reduce the size of the mask to the minimal width mask which
// performs an equivalent shuffle.
- while (Mask.size() > 1 && canWidenShuffleElements(Mask)) {
- for (int i = 0, e = Mask.size() / 2; i < e; ++i)
- Mask[i] = Mask[2 * i] / 2;
- Mask.resize(Mask.size() / 2);
+ SmallVector<int, 16> WidenedMask;
+ while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
+ Mask = std::move(WidenedMask);
+ WidenedMask.clear();
}
return combineX86ShuffleChain(Op, Root, Mask, Depth, HasPSHUFB, DAG, DCI,
return SDValue(); // We combined away this shuffle, so we're done.
// See if this reduces to a PSHUFD which is no more expensive and can
- // combine with more operations.
- if (canWidenShuffleElements(Mask)) {
- int DMask[] = {-1, -1, -1, -1};
+ // combine with more operations. Note that it has to at least flip the
+ // dwords as otherwise it would have been removed as a no-op.
+ if (Mask[0] == 2 && Mask[1] == 3 && Mask[2] == 0 && Mask[3] == 1) {
+ int DMask[] = {0, 1, 2, 3};
int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
- DMask[DOffset + 0] = DOffset + Mask[0] / 2;
- DMask[DOffset + 1] = DOffset + Mask[2] / 2;
+ DMask[DOffset + 0] = DOffset + 1;
+ DMask[DOffset + 1] = DOffset + 0;
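+ // E.g. a PSHUFLW mask of <2, 3, 0, 1> swaps the two low dwords, which the
+ // equivalent PSHUFD immediate performs in a single instruction.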
V = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V);
DCI.AddToWorklist(V.getNode());
V = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V,
case X86ISD::PSHUFLW:
case X86ISD::MOVSS:
case X86ISD::MOVSD:
- case X86ISD::VPERMILP:
+ case X86ISD::VPERMILPI:
case X86ISD::VPERM2X128:
case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget);
case ISD::FMA: return PerformFMACombine(N, DAG, Subtarget);