X-Git-Url: http://plrg.eecs.uci.edu/git/?p=oota-llvm.git;a=blobdiff_plain;f=lib%2FTarget%2FX86%2FX86ISelLowering.cpp;h=656c1dea1f7ebbd98fe9aeab15fc9decda273b96;hp=a3871339a495971e41d2b79d95a3208fd832e7be;hb=4a524934577d85e5095df8ea62ad6a3261076d0c;hpb=29720a4bad5f6ca271843f31cf3f03865ccb73cb

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index a3871339a49..656c1dea1f7 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -67,7 +67,7 @@ static cl::opt<bool> ExperimentalVectorWideningLegalization(
     cl::Hidden);
 
 static cl::opt<bool> ExperimentalVectorShuffleLowering(
-    "x86-experimental-vector-shuffle-lowering", cl::init(false),
+    "x86-experimental-vector-shuffle-lowering", cl::init(true),
     cl::desc("Enable an experimental vector shuffle lowering code path."),
     cl::Hidden);
 
@@ -213,8 +213,8 @@ static TargetLoweringObjectFile *createTLOF(const Triple &TT) {
 
 // FIXME: This should stop caching the target machine as soon as
 // we can remove resetOperationActions et al.
-X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
-    : TargetLowering(TM, createTLOF(Triple(TM.getTargetTriple()))) {
+X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM)
+    : TargetLowering(TM, createTLOF(Triple(TM.getTargetTriple()))) {
   Subtarget = &TM.getSubtarget<X86Subtarget>();
   X86ScalarSSEf64 = Subtarget->hasSSE2();
   X86ScalarSSEf32 = Subtarget->hasSSE1();
@@ -811,6 +811,8 @@ void X86TargetLowering::resetOperationActions() {
   setOperationAction(ISD::FLOG10, MVT::f80, Expand);
   setOperationAction(ISD::FEXP, MVT::f80, Expand);
   setOperationAction(ISD::FEXP2, MVT::f80, Expand);
+  setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
+  setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
 
   // First set operation action for all vector types to either promote
   // (for widening) or expand (for scalarization). Then we will selectively
@@ -1560,6 +1562,7 @@ void X86TargetLowering::resetOperationActions() {
 
     setOperationAction(ISD::SETCC, MVT::v4i1, Custom);
     setOperationAction(ISD::SETCC, MVT::v2i1, Custom);
+    setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Legal);
   }
 
   // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion
@@ -1594,9 +1597,6 @@ void X86TargetLowering::resetOperationActions() {
     setOperationAction(ISD::UMULO, VT, Custom);
   }
 
-  // There are no 8-bit 3-address imul/mul instructions
-  setOperationAction(ISD::SMULO, MVT::i8, Expand);
-  setOperationAction(ISD::UMULO, MVT::i8, Expand);
 
   if (!Subtarget->is64Bit()) {
     // These libcalls are not available in 32-bit.
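Note on the first hunk: flipping cl::init(false) to cl::init(true) turns the experimental vector shuffle lowering path on by default, which is why the remaining hunks below extend it so heavily. If the old path is needed for comparison, the flag can presumably still be disabled explicitly, e.g. llc -x86-experimental-vector-shuffle-lowering=false (standard cl::opt<bool> syntax; not something this patch itself documents).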
@@ -3539,6 +3539,7 @@ static bool MayFoldIntoStore(SDValue Op) { static bool isTargetShuffle(unsigned Opcode) { switch(Opcode) { default: return false; + case X86ISD::BLENDI: case X86ISD::PSHUFB: case X86ISD::PSHUFD: case X86ISD::PSHUFHW: @@ -3557,7 +3558,7 @@ static bool isTargetShuffle(unsigned Opcode) { case X86ISD::MOVSD: case X86ISD::UNPCKL: case X86ISD::UNPCKH: - case X86ISD::VPERMILP: + case X86ISD::VPERMILPI: case X86ISD::VPERM2X128: case X86ISD::VPERMI: return true; @@ -3583,7 +3584,7 @@ static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT, case X86ISD::PSHUFD: case X86ISD::PSHUFHW: case X86ISD::PSHUFLW: - case X86ISD::VPERMILP: + case X86ISD::VPERMILPI: case X86ISD::VPERMI: return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8)); } @@ -5288,6 +5289,10 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, IsUnary = false; bool IsFakeUnary = false; switch(N->getOpcode()) { + case X86ISD::BLENDI: + ImmN = N->getOperand(N->getNumOperands()-1); + DecodeBLENDMask(VT, cast(ImmN)->getZExtValue(), Mask); + break; case X86ISD::SHUFP: ImmN = N->getOperand(N->getNumOperands()-1); DecodeSHUFPMask(VT, cast(ImmN)->getZExtValue(), Mask); @@ -5314,7 +5319,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, DecodePALIGNRMask(VT, cast(ImmN)->getZExtValue(), Mask); break; case X86ISD::PSHUFD: - case X86ISD::VPERMILP: + case X86ISD::VPERMILPI: ImmN = N->getOperand(N->getNumOperands()-1); DecodePSHUFMask(VT, cast(ImmN)->getZExtValue(), Mask); IsUnary = true; @@ -5347,7 +5352,12 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, SmallVector RawMask; for (int i = 0, e = MaskNode->getNumOperands(); i < e; ++i) { - auto *CN = dyn_cast(MaskNode->getOperand(i)); + SDValue Op = MaskNode->getOperand(i); + if (Op->getOpcode() == ISD::UNDEF) { + RawMask.push_back((uint64_t)SM_SentinelUndef); + continue; + } + auto *CN = dyn_cast(Op.getNode()); if (!CN) return false; APInt MaskElement = CN->getAPIntValue(); @@ -5377,13 +5387,13 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, if (!MaskCP || MaskCP->isMachineConstantPoolEntry()) return false; - if (auto *C = dyn_cast(MaskCP->getConstVal())) { + if (auto *C = dyn_cast(MaskCP->getConstVal())) { // FIXME: Support AVX-512 here. - if (!C->getType()->isVectorTy() || - (C->getNumElements() != 16 && C->getNumElements() != 32)) + Type *Ty = C->getType(); + if (!Ty->isVectorTy() || (Ty->getVectorNumElements() != 16 && + Ty->getVectorNumElements() != 32)) return false; - assert(C->getType()->isVectorTy() && "Expected a vector constant."); DecodePSHUFBMask(C, Mask); break; } @@ -5996,7 +6006,10 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl &Elts, /// or SDValue() otherwise. static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget, SelectionDAG &DAG) { - if (!Subtarget->hasFp256()) + // VBROADCAST requires AVX. + // TODO: Splats could be generated for non-AVX CPUs using SSE + // instructions, but there's less potential gain for only 128-bit vectors. + if (!Subtarget->hasAVX()) return SDValue(); MVT VT = Op.getSimpleValueType(); @@ -6073,17 +6086,34 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget, } } + unsigned ScalarSize = Ld.getValueType().getSizeInBits(); bool IsGE256 = (VT.getSizeInBits() >= 256); - // Handle the broadcasting a single constant scalar from the constant pool - // into a vector. 
On Sandybridge it is still better to load a constant vector + // When optimizing for size, generate up to 5 extra bytes for a broadcast + // instruction to save 8 or more bytes of constant pool data. + // TODO: If multiple splats are generated to load the same constant, + // it may be detrimental to overall size. There needs to be a way to detect + // that condition to know if this is truly a size win. + const Function *F = DAG.getMachineFunction().getFunction(); + bool OptForSize = F->getAttributes(). + hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize); + + // Handle broadcasting a single constant scalar from the constant pool + // into a vector. + // On Sandybridge (no AVX2), it is still better to load a constant vector // from the constant pool and not to broadcast it from a scalar. - if (ConstSplatVal && Subtarget->hasInt256()) { + // But override that restriction when optimizing for size. + // TODO: Check if splatting is recommended for other AVX-capable CPUs. + if (ConstSplatVal && (Subtarget->hasAVX2() || OptForSize)) { EVT CVT = Ld.getValueType(); assert(!CVT.isVector() && "Must not broadcast a vector type"); - unsigned ScalarSize = CVT.getSizeInBits(); - if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)) { + // Splat f32, i32, v4f64, v4i64 in all cases with AVX2. + // For size optimization, also splat v2f64 and v2i64, and for size opt + // with AVX2, also splat i8 and i16. + // With pattern matching, the VBROADCAST node may become a VMOVDDUP. + if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) || + (OptForSize && (ScalarSize == 64 || Subtarget->hasAVX2()))) { const Constant *C = nullptr; if (ConstantSDNode *CI = dyn_cast(Ld)) C = CI->getConstantIntValue(); @@ -6104,7 +6134,6 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget, } bool IsLoad = ISD::isNormalLoad(Ld.getNode()); - unsigned ScalarSize = Ld.getValueType().getSizeInBits(); // Handle AVX2 in-register broadcasts. if (!IsLoad && Subtarget->hasInt256() && @@ -7154,6 +7183,56 @@ static bool isSingleInputShuffleMask(ArrayRef Mask) { return true; } +/// \brief Test whether there are elements crossing 128-bit lanes in this +/// shuffle mask. +/// +/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations +/// and we routinely test for these. +static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef Mask) { + int LaneSize = 128 / VT.getScalarSizeInBits(); + int Size = Mask.size(); + for (int i = 0; i < Size; ++i) + if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize) + return true; + return false; +} + +/// \brief Test whether a shuffle mask is equivalent within each 128-bit lane. +/// +/// This checks a shuffle mask to see if it is performing the same +/// 128-bit lane-relative shuffle in each 128-bit lane. This trivially implies +/// that it is also not lane-crossing. It may however involve a blend from the +/// same lane of a second vector. +/// +/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is +/// non-trivial to compute in the face of undef lanes. The representation is +/// *not* suitable for use with existing 128-bit shuffles as it will contain +/// entries from both V1 and V2 inputs to the wider mask. 
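As a concrete illustration of the check described above, here is a small standalone sketch; it is not part of the patch, and the helper name and sample mask are invented for this example. It mirrors the repeated-lane test for a v8f32-style mask (8 elements, 4 per 128-bit lane), where entries of 8 or more denote elements taken from V2:

#include <cstdio>
#include <vector>

// Returns true if every 128-bit lane of Mask applies the same LaneSize-wide
// pattern, and fills RepeatedMask with that pattern (entries >= Mask.size()
// refer to the second input vector).
static bool isRepeated128BitLaneMask(const std::vector<int> &Mask, int LaneSize,
                                     std::vector<int> &RepeatedMask) {
  int Size = (int)Mask.size();
  RepeatedMask.assign(LaneSize, -1);
  for (int i = 0; i < Size; ++i) {
    if (Mask[i] < 0)
      continue; // undef entries match anything
    if ((Mask[i] % Size) / LaneSize != i / LaneSize)
      return false; // element crosses a 128-bit lane
    int Slot = i % LaneSize;
    if (RepeatedMask[Slot] == -1)
      RepeatedMask[Slot] = Mask[i] < Size ? Mask[i] % LaneSize
                                          : Mask[i] % LaneSize + Size;
    else if (RepeatedMask[Slot] + (i / LaneSize) * LaneSize != Mask[i])
      return false; // the lanes disagree
  }
  return true;
}

int main() {
  // Same 4-element pattern in both lanes; entries >= 8 refer to V2.
  std::vector<int> Mask = {0, 9, 2, 11, 4, 13, 6, 15};
  std::vector<int> Repeated;
  if (isRepeated128BitLaneMask(Mask, /*LaneSize=*/4, Repeated))
    for (int M : Repeated)
      std::printf("%d ", M); // prints: 0 9 2 11
  std::printf("\n");
  return 0;
}

For the sample mask, both lanes apply the same pattern, so the routine succeeds with RepeatedMask = <0, 9, 2, 11>, which is exactly the form the blend and permute lowerings below consume.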
+static bool +is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef Mask, + SmallVectorImpl &RepeatedMask) { + int LaneSize = 128 / VT.getScalarSizeInBits(); + RepeatedMask.resize(LaneSize, -1); + int Size = Mask.size(); + for (int i = 0; i < Size; ++i) { + if (Mask[i] < 0) + continue; + if ((Mask[i] % Size) / LaneSize != i / LaneSize) + // This entry crosses lanes, so there is no way to model this shuffle. + return false; + + // Ok, handle the in-lane shuffles by detecting if and when they repeat. + if (RepeatedMask[i % LaneSize] == -1) + // This is the first non-undef entry in this slot of a 128-bit lane. + RepeatedMask[i % LaneSize] = + Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + Size; + else if (RepeatedMask[i % LaneSize] + (i / LaneSize) * LaneSize != Mask[i]) + // Found a mismatch with the repeated mask. + return false; + } + return true; +} + // Hide this symbol with an anonymous namespace instead of 'static' so that MSVC // 2013 will allow us to use it as a non-type template parameter. namespace { @@ -7166,8 +7245,6 @@ bool isShuffleEquivalentImpl(ArrayRef Mask, ArrayRef Args) { return false; for (int i = 0, e = Mask.size(); i < e; ++i) { assert(*Args[i] >= 0 && "Arguments must be positive integers!"); - assert(*Args[i] < (int)Args.size() * 2 && - "Argument outside the range of possible shuffle inputs!"); if (Mask[i] != -1 && Mask[i] != *Args[i]) return false; } @@ -7221,6 +7298,7 @@ static SDValue getV4X86ShuffleImm8ForMask(ArrayRef Mask, /// that the shuffle mask is in fact a blend. static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, + const X86Subtarget *Subtarget, SelectionDAG &DAG) { unsigned BlendMask = 0; @@ -7238,12 +7316,36 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1, case MVT::v2f64: case MVT::v4f32: case MVT::v4f64: + case MVT::v8f32: return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2, DAG.getConstant(BlendMask, MVT::i8)); - case MVT::v8i16: + case MVT::v4i64: + case MVT::v8i32: + assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!"); + // FALLTHROUGH + case MVT::v2i64: case MVT::v4i32: - case MVT::v2i64: { + // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into + // that instruction. + if (Subtarget->hasAVX2()) { + // Scale the blend by the number of 32-bit dwords per element. + int Scale = VT.getScalarSizeInBits() / 32; + BlendMask = 0; + for (int i = 0, Size = Mask.size(); i < Size; ++i) + if (Mask[i] >= Size) + for (int j = 0; j < Scale; ++j) + BlendMask |= 1u << (i * Scale + j); + + MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32; + V1 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V1); + V2 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V2); + return DAG.getNode(ISD::BITCAST, DL, VT, + DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2, + DAG.getConstant(BlendMask, MVT::i8))); + } + // FALLTHROUGH + case MVT::v8i16: { // For integer shuffles we need to expand the mask and cast the inputs to // v8i16s prior to blending. int Scale = 8 / VT.getVectorNumElements(); @@ -7260,15 +7362,93 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1, DAG.getConstant(BlendMask, MVT::i8))); } + case MVT::v16i16: { + assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!"); + SmallVector RepeatedMask; + if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) { + // We can lower these with PBLENDW which is mirrored across 128-bit lanes. 
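      // Illustrative example (not from the patch): a repeated 8-entry mask of
      // <0, 17, 2, 19, 4, 21, 6, 23> takes every odd element from V2, so the
      // loop below computes BlendMask = 0b10101010 (0xAA), and VPBLENDW then
      // applies that same 8-bit immediate to both 128-bit lanes.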
+ assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!"); + BlendMask = 0; + for (int i = 0; i < 8; ++i) + if (RepeatedMask[i] >= 16) + BlendMask |= 1u << i; + return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2, + DAG.getConstant(BlendMask, MVT::i8)); + } + } + // FALLTHROUGH + case MVT::v32i8: { + assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!"); + // Scale the blend by the number of bytes per element. + int Scale = VT.getScalarSizeInBits() / 8; + assert(Mask.size() * Scale == 32 && "Not a 256-bit vector!"); + + // Compute the VSELECT mask. Note that VSELECT is really confusing in the + // mix of LLVM's code generator and the x86 backend. We tell the code + // generator that boolean values in the elements of an x86 vector register + // are -1 for true and 0 for false. We then use the LLVM semantics of 'true' + // mapping a select to operand #1, and 'false' mapping to operand #2. The + // reality in x86 is that vector masks (pre-AVX-512) use only the high bit + // of the element (the remaining are ignored) and 0 in that high bit would + // mean operand #1 while 1 in the high bit would mean operand #2. So while + // the LLVM model for boolean values in vector elements gets the relevant + // bit set, it is set backwards and over constrained relative to x86's + // actual model. + SDValue VSELECTMask[32]; + for (int i = 0, Size = Mask.size(); i < Size; ++i) + for (int j = 0; j < Scale; ++j) + VSELECTMask[Scale * i + j] = + Mask[i] < 0 ? DAG.getUNDEF(MVT::i8) + : DAG.getConstant(Mask[i] < Size ? -1 : 0, MVT::i8); + + V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V1); + V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V2); + return DAG.getNode( + ISD::BITCAST, DL, VT, + DAG.getNode(ISD::VSELECT, DL, MVT::v32i8, + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, VSELECTMask), + V1, V2)); + } + default: llvm_unreachable("Not a supported integer vector type!"); } } +/// \brief Generic routine to lower a shuffle and blend as a decomposed set of +/// unblended shuffles followed by an unshuffled blend. +/// +/// This matches the extremely common pattern for handling combined +/// shuffle+blend operations on newer X86 ISAs where we have very fast blend +/// operations. +static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(SDLoc DL, MVT VT, + SDValue V1, + SDValue V2, + ArrayRef Mask, + SelectionDAG &DAG) { + // Shuffle the input elements into the desired positions in V1 and V2 and + // blend them together. + SmallVector V1Mask(Mask.size(), -1); + SmallVector V2Mask(Mask.size(), -1); + SmallVector BlendMask(Mask.size(), -1); + for (int i = 0, Size = Mask.size(); i < Size; ++i) + if (Mask[i] >= 0 && Mask[i] < Size) { + V1Mask[i] = Mask[i]; + BlendMask[i] = i; + } else if (Mask[i] >= Size) { + V2Mask[i] = Mask[i] - Size; + BlendMask[i] = i + Size; + } + + V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask); + V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask); + return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask); +} + /// \brief Try to lower a vector shuffle as a byte rotation. /// /// We have a generic PALIGNR instruction in x86 that will do an arbitrary -/// byte-rotation of a the concatentation of two vectors. This routine will +/// byte-rotation of the concatenation of two vectors. This routine will /// try to generically lower a vector shuffle through such an instruction. It /// does not check for the availability of PALIGNR-based lowerings, only the /// applicability of this strategy to the given mask. 
This matches shuffle @@ -7557,6 +7737,39 @@ static SDValue lowerVectorShuffleAsZeroOrAnyExtend( return SDValue(); } +/// \brief Try to get a scalar value for a specific element of a vector. +/// +/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar. +static SDValue getScalarValueForVectorElement(SDValue V, int Idx, + SelectionDAG &DAG) { + MVT VT = V.getSimpleValueType(); + MVT EltVT = VT.getVectorElementType(); + while (V.getOpcode() == ISD::BITCAST) + V = V.getOperand(0); + // If the bitcasts shift the element size, we can't extract an equivalent + // element from it. + MVT NewVT = V.getSimpleValueType(); + if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) + return SDValue(); + + if (V.getOpcode() == ISD::BUILD_VECTOR || + (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) + return DAG.getNode(ISD::BITCAST, SDLoc(V), EltVT, V.getOperand(Idx)); + + return SDValue(); +} + +/// \brief Helper to test for a load that can be folded with x86 shuffles. +/// +/// This is particularly important because the set of instructions varies +/// significantly based on whether the operand is a load or not. +static bool isShuffleFoldableLoad(SDValue V) { + while (V.getOpcode() == ISD::BITCAST) + V = V.getOperand(0); + + return ISD::isNON_EXTLoad(V.getNode()); +} + /// \brief Try to lower insertion of a single element into a zero vector. /// /// This is a common pattern that we have especially efficient patterns to lower @@ -7565,59 +7778,71 @@ static SDValue lowerVectorShuffleAsElementInsertion( MVT VT, SDLoc DL, SDValue V1, SDValue V2, ArrayRef Mask, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); + MVT ExtVT = VT; + MVT EltVT = VT.getVectorElementType(); int V2Index = std::find_if(Mask.begin(), Mask.end(), [&Mask](int M) { return M >= (int)Mask.size(); }) - Mask.begin(); - if (Mask.size() == 2) { - if (!Zeroable[V2Index ^ 1]) { - // For 2-wide masks we may be able to just invert the inputs. We use an xor - // with 2 to flip from {2,3} to {0,1} and vice versa. - int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2), - Mask[1] < 0 ? -1 : (Mask[1] ^ 2)}; - if (Zeroable[V2Index]) - return lowerVectorShuffleAsElementInsertion(VT, DL, V2, V1, InverseMask, - Subtarget, DAG); - else - return SDValue(); + bool IsV1Zeroable = true; + for (int i = 0, Size = Mask.size(); i < Size; ++i) + if (i != V2Index && !Zeroable[i]) { + IsV1Zeroable = false; + break; } - } else { - for (int i = 0, Size = Mask.size(); i < Size; ++i) - if (i != V2Index && !Zeroable[i]) - return SDValue(); // Not inserting into a zero vector. - } - - // Step over any bitcasts on either input so we can scan the actual - // BUILD_VECTOR nodes. - while (V1.getOpcode() == ISD::BITCAST) - V1 = V1.getOperand(0); - while (V2.getOpcode() == ISD::BITCAST) - V2 = V2.getOperand(0); // Check for a single input from a SCALAR_TO_VECTOR node. // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and // all the smarts here sunk into that routine. However, the current // lowering of BUILD_VECTOR makes that nearly impossible until the old // vector shuffle lowering is dead. - if (!((V2.getOpcode() == ISD::SCALAR_TO_VECTOR && - Mask[V2Index] == (int)Mask.size()) || - V2.getOpcode() == ISD::BUILD_VECTOR)) + if (SDValue V2S = getScalarValueForVectorElement( + V2, Mask[V2Index] - Mask.size(), DAG)) { + // We need to zext the scalar if it is smaller than an i32. 
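      // Example (illustration only): inserting an i16 scalar into an
      // otherwise-zero v8i16 is done by zero-extending the scalar to i32 and
      // building a v4i32 SCALAR_TO_VECTOR, so the VZEXT_MOVL below can clear
      // the remaining lanes; the result is bitcast back to v8i16 at the end.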
+ V2S = DAG.getNode(ISD::BITCAST, DL, EltVT, V2S); + if (EltVT == MVT::i8 || EltVT == MVT::i16) { + // Using zext to expand a narrow element won't work for non-zero + // insertions. + if (!IsV1Zeroable) + return SDValue(); + + // Zero-extend directly to i32. + ExtVT = MVT::v4i32; + V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S); + } + V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S); + } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 || + EltVT == MVT::i16) { + // Either not inserting from the low element of the input or the input + // element size is too small to use VZEXT_MOVL to clear the high bits. return SDValue(); + } - SDValue V2S = V2.getOperand(Mask[V2Index] - Mask.size()); + if (!IsV1Zeroable) { + // If V1 can't be treated as a zero vector we have fewer options to lower + // this. We can't support integer vectors or non-zero targets cheaply, and + // the V1 elements can't be permuted in any way. + assert(VT == ExtVT && "Cannot change extended type when non-zeroable!"); + if (!VT.isFloatingPoint() || V2Index != 0) + return SDValue(); + SmallVector V1Mask(Mask.begin(), Mask.end()); + V1Mask[V2Index] = -1; + if (!isNoopShuffleMask(V1Mask)) + return SDValue(); + // This is essentially a special case blend operation, but if we have + // general purpose blend operations, they are always faster. Bail and let + // the rest of the lowering handle these as blends. + if (Subtarget->hasSSE41()) + return SDValue(); - // First, we need to zext the scalar if it is smaller than an i32. - MVT ExtVT = VT; - MVT EltVT = VT.getVectorElementType(); - V2S = DAG.getNode(ISD::BITCAST, DL, EltVT, V2S); - if (EltVT == MVT::i8 || EltVT == MVT::i16) { - // Zero-extend directly to i32. - ExtVT = MVT::v4i32; - V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S); + // Otherwise, use MOVSD or MOVSS. + assert((EltVT == MVT::f32 || EltVT == MVT::f64) && + "Only two types of floating point element types to handle!"); + return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL, + ExtVT, V1, V2); } - V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, - DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S)); + V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2); if (ExtVT != VT) V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2); @@ -7643,6 +7868,83 @@ static SDValue lowerVectorShuffleAsElementInsertion( return V2; } +/// \brief Try to lower broadcast of a single element. +/// +/// For convenience, this code also bundles all of the subtarget feature set +/// filtering. While a little annoying to re-dispatch on type here, there isn't +/// a convenient way to factor it out. +static SDValue lowerVectorShuffleAsBroadcast(MVT VT, SDLoc DL, SDValue V, + ArrayRef Mask, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + if (!Subtarget->hasAVX()) + return SDValue(); + if (VT.isInteger() && !Subtarget->hasAVX2()) + return SDValue(); + + // Check that the mask is a broadcast. + int BroadcastIdx = -1; + for (int M : Mask) + if (M >= 0 && BroadcastIdx == -1) + BroadcastIdx = M; + else if (M >= 0 && M != BroadcastIdx) + return SDValue(); + + assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with " + "a sorted mask where the broadcast " + "comes from V1."); + + // Go up the chain of (vector) values to try and find a scalar load that + // we can combine with the broadcast. 
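  // Illustration (values invented for this comment): broadcasting element 4 of
  // a v8f32 that was built as a CONCAT_VECTORS of two v4f32 halves steps into
  // the second half and rescales the index to 0; if that half is a
  // SCALAR_TO_VECTOR of a loaded float, the load can be folded directly into
  // the VBROADCAST.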
+ for (;;) { + switch (V.getOpcode()) { + case ISD::CONCAT_VECTORS: { + int OperandSize = Mask.size() / V.getNumOperands(); + V = V.getOperand(BroadcastIdx / OperandSize); + BroadcastIdx %= OperandSize; + continue; + } + + case ISD::INSERT_SUBVECTOR: { + SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1); + auto ConstantIdx = dyn_cast(V.getOperand(2)); + if (!ConstantIdx) + break; + + int BeginIdx = (int)ConstantIdx->getZExtValue(); + int EndIdx = + BeginIdx + (int)VInner.getValueType().getVectorNumElements(); + if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) { + BroadcastIdx -= BeginIdx; + V = VInner; + } else { + V = VOuter; + } + continue; + } + } + break; + } + + // Check if this is a broadcast of a scalar. We special case lowering + // for scalars so that we can more effectively fold with loads. + if (V.getOpcode() == ISD::BUILD_VECTOR || + (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) { + V = V.getOperand(BroadcastIdx); + + // If the scalar isn't a load we can't broadcast from it in AVX1, only with + // AVX2. + if (!Subtarget->hasAVX2() && !isShuffleFoldableLoad(V)) + return SDValue(); + } else if (BroadcastIdx != 0 || !Subtarget->hasAVX2()) { + // We can't broadcast from a vector register w/o AVX2, and we can only + // broadcast from the zero-element of a vector register. + return SDValue(); + } + + return DAG.getNode(X86ISD::VBROADCAST, DL, VT, V); +} + /// \brief Handle lowering of 2-lane 64-bit floating point shuffles. /// /// This is the basis function for the 2-lane 64-bit shuffles as we have full @@ -7669,7 +7971,7 @@ static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, if (Subtarget->hasAVX()) { // If we have AVX, we can use VPERMILPS which will allow folding a load // into the shuffle. - return DAG.getNode(X86ISD::VPERMILP, DL, MVT::v2f64, V1, + return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1, DAG.getConstant(SHUFPDMask, MVT::i8)); } @@ -7686,14 +7988,33 @@ static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2f64, V1, V2); // If we have a single input, insert that into V1 if we can do so cheaply. - if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) + if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) { if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( MVT::v2f64, DL, V1, V2, Mask, Subtarget, DAG)) return Insertion; + // Try inverting the insertion since for v2 masks it is easy to do and we + // can't reliably sort the mask one way or the other. + int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2), + Mask[1] < 0 ? -1 : (Mask[1] ^ 2)}; + if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( + MVT::v2f64, DL, V2, V1, InverseMask, Subtarget, DAG)) + return Insertion; + } + + // Try to use one of the special instruction patterns to handle two common + // blend patterns if a zero-blend above didn't work. + if (isShuffleEquivalent(Mask, 0, 3) || isShuffleEquivalent(Mask, 1, 3)) + if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG)) + // We can either use a special instruction to load over the low double or + // to move just the low double. + return DAG.getNode( + isShuffleFoldableLoad(V1S) ? 
X86ISD::MOVLPD : X86ISD::MOVSD, + DL, MVT::v2f64, V2, + DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S)); if (Subtarget->hasSSE41()) - if (SDValue Blend = - lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask, DAG)) + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask, + Subtarget, DAG)) return Blend; unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1); @@ -7719,6 +8040,11 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!"); if (isSingleInputShuffleMask(Mask)) { + // Check for being able to broadcast a single element. + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v2i64, DL, V1, + Mask, Subtarget, DAG)) + return Broadcast; + // Straight shuffle of a single input vector. For everything from SSE2 // onward this has a single fast instruction with no scary immediates. // We have to map the mask as it is actually a v4i32 shuffle instruction. @@ -7732,22 +8058,30 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, getV4X86ShuffleImm8ForMask(WidenedMask, DAG))); } - // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(Mask, 0, 2)) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, V1, V2); - if (isShuffleEquivalent(Mask, 1, 3)) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2i64, V1, V2); - // If we have a single input from V2 insert that into V1 if we can do so // cheaply. - if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) + if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) { if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( MVT::v2i64, DL, V1, V2, Mask, Subtarget, DAG)) return Insertion; + // Try inverting the insertion since for v2 masks it is easy to do and we + // can't reliably sort the mask one way or the other. + int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2), + Mask[1] < 0 ? -1 : (Mask[1] ^ 2)}; + if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( + MVT::v2i64, DL, V2, V1, InverseMask, Subtarget, DAG)) + return Insertion; + } + + // Use dedicated unpack instructions for masks that match their pattern. + if (isShuffleEquivalent(Mask, 0, 2)) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, V1, V2); + if (isShuffleEquivalent(Mask, 1, 3)) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2i64, V1, V2); if (Subtarget->hasSSE41()) - if (SDValue Blend = - lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask, DAG)) + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask, + Subtarget, DAG)) return Blend; // Try to use rotation instructions if available. @@ -7766,107 +8100,25 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask)); } -/// \brief Lower 4-lane 32-bit floating point shuffles. +/// \brief Lower a vector shuffle using the SHUFPS instruction. /// -/// Uses instructions exclusively from the floating point unit to minimize -/// domain crossing penalties, as these are sufficient to implement all v4f32 -/// shuffles. 
-static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, - const X86Subtarget *Subtarget, - SelectionDAG &DAG) { - SDLoc DL(Op); - assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!"); - assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); - assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); - ShuffleVectorSDNode *SVOp = cast(Op); - ArrayRef Mask = SVOp->getMask(); - assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); - +/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS. +/// It makes no assumptions about whether this is the *best* lowering, it simply +/// uses it. +static SDValue lowerVectorShuffleWithSHUFPS(SDLoc DL, MVT VT, + ArrayRef Mask, SDValue V1, + SDValue V2, SelectionDAG &DAG) { SDValue LowV = V1, HighV = V2; int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]}; int NumV2Elements = std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }); - if (NumV2Elements == 0) { - if (Subtarget->hasAVX()) { - // If we have AVX, we can use VPERMILPS which will allow folding a load - // into the shuffle. - return DAG.getNode(X86ISD::VPERMILP, DL, MVT::v4f32, V1, - getV4X86ShuffleImm8ForMask(Mask, DAG)); - } - - // Otherwise, use a straight shuffle of a single input vector. We pass the - // input vector to both operands to simulate this with a SHUFPS. - return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1, - getV4X86ShuffleImm8ForMask(Mask, DAG)); - } - - // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(Mask, 0, 4, 1, 5)) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V1, V2); - if (isShuffleEquivalent(Mask, 2, 6, 3, 7)) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V1, V2); - - // There are special ways we can lower some single-element blends. However, we - // have custom ways we can lower more complex single-element blends below that - // we defer to if both this and BLENDPS fail to match, so restrict this to - // when the V2 input is targeting element 0 of the mask -- that is the fast - // case here. - if (NumV2Elements == 1 && Mask[0] >= 4) - if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4f32, DL, V1, V2, - Mask, Subtarget, DAG)) - return V; - - if (Subtarget->hasSSE41()) - if (SDValue Blend = - lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask, DAG)) - return Blend; - if (NumV2Elements == 1) { int V2Index = std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) - Mask.begin(); - // Check for whether we can use INSERTPS to perform the blend. We only use - // INSERTPS when the V1 elements are already in the correct locations - // because otherwise we can just always use two SHUFPS instructions which - // are much smaller to encode than a SHUFPS and an INSERTPS. - if (Subtarget->hasSSE41()) { - // When using INSERTPS we can zero any lane of the destination. Collect - // the zero inputs into a mask and drop them from the lanes of V1 which - // actually need to be present as inputs to the INSERTPS. - SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); - - // Synthesize a shuffle mask for the non-zero and non-v2 inputs. - bool InsertNeedsShuffle = false; - unsigned ZMask = 0; - for (int i = 0; i < 4; ++i) - if (i != V2Index) { - if (Zeroable[i]) { - ZMask |= 1 << i; - } else if (Mask[i] != i) { - InsertNeedsShuffle = true; - break; - } - } - - // We don't want to use INSERTPS or other insertion techniques if it will - // require shuffling anyways. 
- if (!InsertNeedsShuffle) { - // If all of V1 is zeroable, replace it with undef. - if ((ZMask | 1 << V2Index) == 0xF) - V1 = DAG.getUNDEF(MVT::v4f32); - - unsigned InsertPSMask = (Mask[V2Index] - 4) << 6 | V2Index << 4 | ZMask; - assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!"); - - // Insert the V2 element into the desired position. - return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2, - DAG.getConstant(InsertPSMask, MVT::i8)); - } - } - // Compute the index adjacent to V2Index and in the same half by toggling // the low bit. int V2AdjIndex = V2Index ^ 1; @@ -7883,7 +8135,7 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, // To make this work, blend them together as the first step. int V1Index = V2AdjIndex; int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0}; - V2 = DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V2, V1, + V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1, getV4X86ShuffleImm8ForMask(BlendMask, DAG)); // Now proceed to reconstruct the final blend as we have the necessary @@ -7900,9 +8152,17 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, } else if (NumV2Elements == 2) { if (Mask[0] < 4 && Mask[1] < 4) { // Handle the easy case where we have V1 in the low lanes and V2 in the - // high lanes. We never see this reversed because we sort the shuffle. + // high lanes. NewMask[2] -= 4; NewMask[3] -= 4; + } else if (Mask[2] < 4 && Mask[3] < 4) { + // We also handle the reversed case because this utility may get called + // when we detect a SHUFPS pattern but can't easily commute the shuffle to + // arrange things in the right direction. + NewMask[0] -= 4; + NewMask[1] -= 4; + HighV = V1; + LowV = V2; } else { // We have a mixture of V1 and V2 in both low and high lanes. Rather than // trying to place elements directly, just blend them and set up the final @@ -7914,7 +8174,7 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, Mask[2] < 4 ? Mask[2] : Mask[3], (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4, (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4}; - V1 = DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V2, + V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2, getV4X86ShuffleImm8ForMask(BlendMask, DAG)); // Now we do a normal shuffle of V1 by giving V1 as both operands to @@ -7926,10 +8186,116 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, NewMask[3] = Mask[2] < 4 ? 3 : 1; } } - return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, LowV, HighV, + return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV, getV4X86ShuffleImm8ForMask(NewMask, DAG)); } +/// \brief Lower 4-lane 32-bit floating point shuffles. +/// +/// Uses instructions exclusively from the floating point unit to minimize +/// domain crossing penalties, as these are sufficient to implement all v4f32 +/// shuffles. +static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!"); + assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast(Op); + ArrayRef Mask = SVOp->getMask(); + assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); + + int NumV2Elements = + std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }); + + if (NumV2Elements == 0) { + // Check for being able to broadcast a single element. 
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4f32, DL, V1, + Mask, Subtarget, DAG)) + return Broadcast; + + if (Subtarget->hasAVX()) { + // If we have AVX, we can use VPERMILPS which will allow folding a load + // into the shuffle. + return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1, + getV4X86ShuffleImm8ForMask(Mask, DAG)); + } + + // Otherwise, use a straight shuffle of a single input vector. We pass the + // input vector to both operands to simulate this with a SHUFPS. + return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1, + getV4X86ShuffleImm8ForMask(Mask, DAG)); + } + + // Use dedicated unpack instructions for masks that match their pattern. + if (isShuffleEquivalent(Mask, 0, 4, 1, 5)) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V1, V2); + if (isShuffleEquivalent(Mask, 2, 6, 3, 7)) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V1, V2); + + // There are special ways we can lower some single-element blends. However, we + // have custom ways we can lower more complex single-element blends below that + // we defer to if both this and BLENDPS fail to match, so restrict this to + // when the V2 input is targeting element 0 of the mask -- that is the fast + // case here. + if (NumV2Elements == 1 && Mask[0] >= 4) + if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4f32, DL, V1, V2, + Mask, Subtarget, DAG)) + return V; + + if (Subtarget->hasSSE41()) + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask, + Subtarget, DAG)) + return Blend; + + // Check for whether we can use INSERTPS to perform the blend. We only use + // INSERTPS when the V1 elements are already in the correct locations + // because otherwise we can just always use two SHUFPS instructions which + // are much smaller to encode than a SHUFPS and an INSERTPS. + if (NumV2Elements == 1 && Subtarget->hasSSE41()) { + int V2Index = + std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) - + Mask.begin(); + + // When using INSERTPS we can zero any lane of the destination. Collect + // the zero inputs into a mask and drop them from the lanes of V1 which + // actually need to be present as inputs to the INSERTPS. + SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); + + // Synthesize a shuffle mask for the non-zero and non-v2 inputs. + bool InsertNeedsShuffle = false; + unsigned ZMask = 0; + for (int i = 0; i < 4; ++i) + if (i != V2Index) { + if (Zeroable[i]) { + ZMask |= 1 << i; + } else if (Mask[i] != i) { + InsertNeedsShuffle = true; + break; + } + } + + // We don't want to use INSERTPS or other insertion techniques if it will + // require shuffling anyways. + if (!InsertNeedsShuffle) { + // If all of V1 is zeroable, replace it with undef. + if ((ZMask | 1 << V2Index) == 0xF) + V1 = DAG.getUNDEF(MVT::v4f32); + + unsigned InsertPSMask = (Mask[V2Index] - 4) << 6 | V2Index << 4 | ZMask; + assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!"); + + // Insert the V2 element into the desired position. + return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2, + DAG.getConstant(InsertPSMask, MVT::i8)); + } + } + + // Otherwise fall back to a SHUFPS lowering strategy. + return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG); +} + /// \brief Lower 4-lane i32 vector shuffles. 
/// /// We try to handle these with integer-domain shuffles where we can, but for @@ -7945,10 +8311,22 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, ArrayRef Mask = SVOp->getMask(); assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); + // Whenever we can lower this as a zext, that instruction is strictly faster + // than any alternative. It also allows us to fold memory operands into the + // shuffle in many cases. + if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, + Mask, Subtarget, DAG)) + return ZExt; + int NumV2Elements = std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }); if (NumV2Elements == 0) { + // Check for being able to broadcast a single element. + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4i32, DL, V1, + Mask, Subtarget, DAG)) + return Broadcast; + // Straight shuffle of a single input vector. For everything from SSE2 // onward this has a single fast instruction with no scary immediates. // We coerce the shuffle pattern to be compatible with UNPCK instructions @@ -7965,11 +8343,11 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, getV4X86ShuffleImm8ForMask(Mask, DAG)); } - // Whenever we can lower this as a zext, that instruction is strictly faster - // than any alternative. - if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, + // There are special ways we can lower some single-element blends. + if (NumV2Elements == 1) + if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4i32, DL, V1, V2, Mask, Subtarget, DAG)) - return ZExt; + return V; // Use dedicated unpack instructions for masks that match their pattern. if (isShuffleEquivalent(Mask, 0, 4, 1, 5)) @@ -7977,15 +8355,9 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, if (isShuffleEquivalent(Mask, 2, 6, 3, 7)) return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V1, V2); - // There are special ways we can lower some single-element blends. - if (NumV2Elements == 1) - if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4i32, DL, V1, V2, - Mask, Subtarget, DAG)) - return V; - if (Subtarget->hasSSE41()) - if (SDValue Blend = - lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask, DAG)) + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask, + Subtarget, DAG)) return Blend; // Try to use rotation instructions if available. @@ -8045,6 +8417,11 @@ static SDValue lowerV8I16SingleInputVectorShuffle( MutableArrayRef HToLInputs(LoInputs.data() + NumLToL, NumHToL); MutableArrayRef HToHInputs(HiInputs.data() + NumLToH, NumHToH); + // Check for being able to broadcast a single element. + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8i16, DL, V, + Mask, Subtarget, DAG)) + return Broadcast; + // Use dedicated unpack instructions for masks that match their pattern. if (isShuffleEquivalent(Mask, 0, 0, 1, 1, 2, 2, 3, 3)) return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V, V); @@ -8665,13 +9042,14 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return V; if (Subtarget->hasSSE41()) - if (SDValue Blend = - lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask, DAG)) + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask, + Subtarget, DAG)) return Blend; // Try to use rotation instructions if available. 
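  // Illustration (mask invented for this comment): a v8i16 mask such as
  // <1, 2, 3, 4, 5, 6, 7, 8> reads seven elements of V1 followed by the first
  // element of V2, i.e. a rotation of the concatenated inputs, which the
  // helper below can turn into a single PALIGNR when SSSE3 is available.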
if (Subtarget->hasSSSE3()) - if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask, DAG)) + if (SDValue Rotate = lowerVectorShuffleAsByteRotate( + DL, MVT::v8i16, V1, V2, Mask, DAG)) return Rotate; if (NumV1Inputs + NumV2Inputs <= 4) @@ -8805,8 +9183,8 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, // Try to use rotation instructions if available. if (Subtarget->hasSSSE3()) - if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, - OrigMask, DAG)) + if (SDValue Rotate = lowerVectorShuffleAsByteRotate( + DL, MVT::v16i8, V1, V2, OrigMask, DAG)) return Rotate; // Try to use a zext lowering. @@ -8828,6 +9206,11 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, // For single-input shuffles, there are some nicer lowering tricks we can use. if (NumV2Elements == 0) { + // Check for being able to broadcast a single element. + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v16i8, DL, V1, + Mask, Subtarget, DAG)) + return Broadcast; + // Check whether we can widen this to an i16 shuffle by duplicating bytes. // Notably, this handles splat and partial-splat shuffles more efficiently. // However, it only makes sense if the pre-duplication shuffle simplifies @@ -8901,11 +9284,16 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, MVT::v16i8, V1, V1); int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; - for (int i = 0; i < 16; i += 2) { - if (Mask[i] != -1) - PostDupI16Shuffle[i / 2] = LaneMap[Mask[i]] - (TargetLo ? 0 : 8); - assert(PostDupI16Shuffle[i / 2] < 8 && "Invalid v8 shuffle mask!"); - } + for (int i = 0; i < 16; ++i) + if (Mask[i] != -1) { + int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8); + assert(MappedMask < 8 && "Invalid v8 shuffle mask!"); + if (PostDupI16Shuffle[i / 2] == -1) + PostDupI16Shuffle[i / 2] = MappedMask; + else + assert(PostDupI16Shuffle[i / 2] == MappedMask && + "Conflicting entrties in the original shuffle!"); + } return DAG.getNode( ISD::BITCAST, DL, MVT::v16i8, DAG.getVectorShuffle(MVT::v8i16, DL, @@ -8922,21 +9310,29 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, // // FIXME: We need to handle other interleaving widths (i16, i32, ...). if (shouldLowerAsInterleaving(Mask)) { - // FIXME: Figure out whether we should pack these into the low or high - // halves. - - int EMask[16], OMask[16]; + int NumLoHalf = std::count_if(Mask.begin(), Mask.end(), [](int M) { + return (M >= 0 && M < 8) || (M >= 16 && M < 24); + }); + int NumHiHalf = std::count_if(Mask.begin(), Mask.end(), [](int M) { + return (M >= 8 && M < 16) || M >= 24; + }); + int EMask[16] = {-1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1}; + int OMask[16] = {-1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1}; + bool UnpackLo = NumLoHalf >= NumHiHalf; + MutableArrayRef TargetEMask(UnpackLo ? EMask : EMask + 8, 8); + MutableArrayRef TargetOMask(UnpackLo ? OMask : OMask + 8, 8); for (int i = 0; i < 8; ++i) { - EMask[i] = Mask[2*i]; - OMask[i] = Mask[2*i + 1]; - EMask[i + 8] = -1; - OMask[i + 8] = -1; + TargetEMask[i] = Mask[2 * i]; + TargetOMask[i] = Mask[2 * i + 1]; } SDValue Evens = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, EMask); SDValue Odds = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, OMask); - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, Evens, Odds); + return DAG.getNode(UnpackLo ? 
X86ISD::UNPCKL : X86ISD::UNPCKH, DL, + MVT::v16i8, Evens, Odds); } // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly @@ -8957,7 +9353,7 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, SDValue V2Mask[16]; for (int i = 0; i < 16; ++i) if (Mask[i] == -1) { - V1Mask[i] = V2Mask[i] = DAG.getConstant(0x80, MVT::i8); + V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8); } else { V1Mask[i] = DAG.getConstant(Mask[i] < 16 ? Mask[i] : 0x80, MVT::i8); V2Mask[i] = @@ -9114,42 +9510,76 @@ static SDValue lower128BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2, } } -/// \brief Test whether there are elements crossing 128-bit lanes in this -/// shuffle mask. +/// \brief Helper function to test whether a shuffle mask could be +/// simplified by widening the elements being shuffled. /// -/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations -/// and we routinely test for these. -static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef Mask) { - int LaneSize = 128 / VT.getScalarSizeInBits(); - int Size = Mask.size(); - for (int i = 0; i < Size; ++i) - if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize) - return true; - return false; +/// Appends the mask for wider elements in WidenedMask if valid. Otherwise +/// leaves it in an unspecified state. +/// +/// NOTE: This must handle normal vector shuffle masks and *target* vector +/// shuffle masks. The latter have the special property of a '-2' representing +/// a zero-ed lane of a vector. +static bool canWidenShuffleElements(ArrayRef Mask, + SmallVectorImpl &WidenedMask) { + for (int i = 0, Size = Mask.size(); i < Size; i += 2) { + // If both elements are undef, its trivial. + if (Mask[i] == SM_SentinelUndef && Mask[i + 1] == SM_SentinelUndef) { + WidenedMask.push_back(SM_SentinelUndef); + continue; + } + + // Check for an undef mask and a mask value properly aligned to fit with + // a pair of values. If we find such a case, use the non-undef mask's value. + if (Mask[i] == SM_SentinelUndef && Mask[i + 1] >= 0 && Mask[i + 1] % 2 == 1) { + WidenedMask.push_back(Mask[i + 1] / 2); + continue; + } + if (Mask[i + 1] == SM_SentinelUndef && Mask[i] >= 0 && Mask[i] % 2 == 0) { + WidenedMask.push_back(Mask[i] / 2); + continue; + } + + // When zeroing, we need to spread the zeroing across both lanes to widen. + if (Mask[i] == SM_SentinelZero || Mask[i + 1] == SM_SentinelZero) { + if ((Mask[i] == SM_SentinelZero || Mask[i] == SM_SentinelUndef) && + (Mask[i + 1] == SM_SentinelZero || Mask[i + 1] == SM_SentinelUndef)) { + WidenedMask.push_back(SM_SentinelZero); + continue; + } + return false; + } + + // Finally check if the two mask values are adjacent and aligned with + // a pair. + if (Mask[i] != SM_SentinelUndef && Mask[i] % 2 == 0 && Mask[i] + 1 == Mask[i + 1]) { + WidenedMask.push_back(Mask[i] / 2); + continue; + } + + // Otherwise we can't safely widen the elements used in this shuffle. + return false; + } + assert(WidenedMask.size() == Mask.size() / 2 && + "Incorrect size of mask after widening the elements!"); + + return true; } -/// \brief Generic routine to split a 256-bit vector shuffle into 128-bit -/// shuffles. +/// \brief Generic routine to split ector shuffle into half-sized shuffles. /// -/// There is a severely limited set of shuffles available in AVX1 for 256-bit -/// vectors resulting in routinely needing to split the shuffle into two 128-bit -/// shuffles. 
This can be done generically for any 256-bit vector shuffle and so -/// we encode the logic here for specific shuffle lowering routines to bail to -/// when they exhaust the features avaible to more directly handle the shuffle. -static SDValue splitAndLower256BitVectorShuffle(SDValue Op, SDValue V1, - SDValue V2, - const X86Subtarget *Subtarget, - SelectionDAG &DAG) { - SDLoc DL(Op); - MVT VT = Op.getSimpleValueType(); - assert(VT.getSizeInBits() == 256 && "Only for 256-bit vector shuffles!"); +/// This routine just extracts two subvectors, shuffles them independently, and +/// then concatenates them back together. This should work effectively with all +/// AVX vector shuffle types. +static SDValue splitAndLowerVectorShuffle(SDLoc DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef Mask, + SelectionDAG &DAG) { + assert(VT.getSizeInBits() >= 256 && + "Only for 256-bit or wider vector shuffles!"); assert(V1.getSimpleValueType() == VT && "Bad operand type!"); assert(V2.getSimpleValueType() == VT && "Bad operand type!"); - ShuffleVectorSDNode *SVOp = cast(Op); - ArrayRef Mask = SVOp->getMask(); - ArrayRef LoMask = Mask.slice(0, Mask.size()/2); - ArrayRef HiMask = Mask.slice(Mask.size()/2); + ArrayRef LoMask = Mask.slice(0, Mask.size() / 2); + ArrayRef HiMask = Mask.slice(Mask.size() / 2); int NumElements = VT.getVectorNumElements(); int SplitNumElements = NumElements / 2; @@ -9167,7 +9597,7 @@ static SDValue splitAndLower256BitVectorShuffle(SDValue Op, SDValue V1, // Now create two 4-way blends of these half-width vectors. auto HalfBlend = [&](ArrayRef HalfMask) { - SmallVector V1BlendMask, V2BlendMask, BlendMask; + SmallVector V1BlendMask, V2BlendMask, BlendMask; for (int i = 0; i < SplitNumElements; ++i) { int M = HalfMask[i]; if (M >= NumElements) { @@ -9184,8 +9614,10 @@ static SDValue splitAndLower256BitVectorShuffle(SDValue Op, SDValue V1, BlendMask.push_back(-1); } } - SDValue V1Blend = DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask); - SDValue V2Blend = DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask); + SDValue V1Blend = + DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask); + SDValue V2Blend = + DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask); return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask); }; SDValue Lo = HalfBlend(LoMask); @@ -9193,6 +9625,96 @@ static SDValue splitAndLower256BitVectorShuffle(SDValue Op, SDValue V1, return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi); } +/// \brief Lower a vector shuffle crossing multiple 128-bit lanes as +/// a permutation and blend of those lanes. +/// +/// This essentially blends the out-of-lane inputs to each lane into the lane +/// from a permuted copy of the vector. This lowering strategy results in four +/// instructions in the worst case for a single-input cross lane shuffle which +/// is lower than any other fully general cross-lane shuffle strategy I'm aware +/// of. Special cases for each particular shuffle pattern should be handled +/// prior to trying this lowering. +static SDValue lowerVectorShuffleAsLanePermuteAndBlend(SDLoc DL, MVT VT, + SDValue V1, SDValue V2, + ArrayRef Mask, + SelectionDAG &DAG) { + // FIXME: This should probably be generalized for 512-bit vectors as well. + assert(VT.getSizeInBits() == 256 && "Only for 256-bit vector shuffles!"); + int LaneSize = Mask.size() / 2; + + // If there are only inputs from one 128-bit lane, splitting will in fact be + // less expensive. 
The flags track wether the given lane contains an element + // that crosses to another lane. + bool LaneCrossing[2] = {false, false}; + for (int i = 0, Size = Mask.size(); i < Size; ++i) + if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize) + LaneCrossing[(Mask[i] % Size) / LaneSize] = true; + if (!LaneCrossing[0] || !LaneCrossing[1]) + return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); + + if (isSingleInputShuffleMask(Mask)) { + SmallVector FlippedBlendMask; + for (int i = 0, Size = Mask.size(); i < Size; ++i) + FlippedBlendMask.push_back( + Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize) + ? Mask[i] + : Mask[i] % LaneSize + + (i / LaneSize) * LaneSize + Size)); + + // Flip the vector, and blend the results which should now be in-lane. The + // VPERM2X128 mask uses the low 2 bits for the low source and bits 4 and + // 5 for the high source. The value 3 selects the high half of source 2 and + // the value 2 selects the low half of source 2. We only use source 2 to + // allow folding it into a memory operand. + unsigned PERMMask = 3 | 2 << 4; + SDValue Flipped = DAG.getNode(X86ISD::VPERM2X128, DL, VT, DAG.getUNDEF(VT), + V1, DAG.getConstant(PERMMask, MVT::i8)); + return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask); + } + + // This now reduces to two single-input shuffles of V1 and V2 which at worst + // will be handled by the above logic and a blend of the results, much like + // other patterns in AVX. + return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG); +} + +/// \brief Handle lowering 2-lane 128-bit shuffles. +static SDValue lowerV2X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef Mask, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + // Blends are faster and handle all the non-lane-crossing cases. + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask, + Subtarget, DAG)) + return Blend; + + MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), + VT.getVectorNumElements() / 2); + // Check for patterns which can be matched with a single insert of a 128-bit + // subvector. + if (isShuffleEquivalent(Mask, 0, 1, 0, 1) || + isShuffleEquivalent(Mask, 0, 1, 4, 5)) { + SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1, + DAG.getIntPtrConstant(0)); + SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, + Mask[2] < 4 ? V1 : V2, DAG.getIntPtrConstant(0)); + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV); + } + if (isShuffleEquivalent(Mask, 0, 1, 6, 7)) { + SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1, + DAG.getIntPtrConstant(0)); + SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2, + DAG.getIntPtrConstant(2)); + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV); + } + + // Otherwise form a 128-bit permutation. + // FIXME: Detect zero-vector inputs and use the VPERM2X128 to zero that half. + unsigned PermMask = Mask[0] / 2 | (Mask[2] / 2) << 4; + return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2, + DAG.getConstant(PermMask, MVT::i8)); +} + /// \brief Handle lowering of 4-lane 64-bit floating point shuffles. 
/// /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2 @@ -9205,138 +9727,417 @@ static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!"); ShuffleVectorSDNode *SVOp = cast(Op); ArrayRef Mask = SVOp->getMask(); - assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); - - if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) - return splitAndLower256BitVectorShuffle(Op, V1, V2, Subtarget, DAG); - - if (isSingleInputShuffleMask(Mask)) { - // Non-half-crossing single input shuffles can be lowerid with an - // interleaved permutation. - unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) | - ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3); - return DAG.getNode(X86ISD::VPERMILP, DL, MVT::v4f64, V1, - DAG.getConstant(VPERMILPMask, MVT::i8)); - } - - // X86 has dedicated unpack instructions that can handle specific blend - // operations: UNPCKH and UNPCKL. - if (isShuffleEquivalent(Mask, 0, 4, 2, 6)) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f64, V1, V2); - if (isShuffleEquivalent(Mask, 1, 5, 3, 7)) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V1, V2); - if (isShuffleEquivalent(Mask, 4, 0, 6, 2)) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f64, V2, V1); - if (isShuffleEquivalent(Mask, 5, 1, 7, 3)) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V2, V1); + assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); + + SmallVector WidenedMask; + if (canWidenShuffleElements(Mask, WidenedMask)) + return lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask, Subtarget, + DAG); + + if (isSingleInputShuffleMask(Mask)) { + // Check for being able to broadcast a single element. + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4f64, DL, V1, + Mask, Subtarget, DAG)) + return Broadcast; + + if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) { + // Non-half-crossing single input shuffles can be lowerid with an + // interleaved permutation. + unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) | + ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3); + return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1, + DAG.getConstant(VPERMILPMask, MVT::i8)); + } + + // With AVX2 we have direct support for this permutation. + if (Subtarget->hasAVX2()) + return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1, + getV4X86ShuffleImm8ForMask(Mask, DAG)); + + // Otherwise, fall back. + return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask, + DAG); + } + + // X86 has dedicated unpack instructions that can handle specific blend + // operations: UNPCKH and UNPCKL. + if (isShuffleEquivalent(Mask, 0, 4, 2, 6)) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f64, V1, V2); + if (isShuffleEquivalent(Mask, 1, 5, 3, 7)) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V1, V2); + + // If we have a single input to the zero element, insert that into V1 if we + // can do so cheaply. + int NumV2Elements = + std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }); + if (NumV2Elements == 1 && Mask[0] >= 4) + if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( + MVT::v4f64, DL, V1, V2, Mask, Subtarget, DAG)) + return Insertion; + + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask, + Subtarget, DAG)) + return Blend; + + // Check if the blend happens to exactly fit that of SHUFPD. 
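  // Worked example (mask invented for this comment): <0, 5, 2, 7> keeps the
  // even elements of V1 and takes the odd elements from V2; it matches the
  // first pattern below and yields SHUFPD with immediate 0b1010.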
+ if ((Mask[0] == -1 || Mask[0] < 2) && + (Mask[1] == -1 || (Mask[1] >= 4 && Mask[1] < 6)) && + (Mask[2] == -1 || (Mask[2] >= 2 && Mask[2] < 4)) && + (Mask[3] == -1 || Mask[3] >= 6)) { + unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 5) << 1) | + ((Mask[2] == 3) << 2) | ((Mask[3] == 7) << 3); + return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f64, V1, V2, + DAG.getConstant(SHUFPDMask, MVT::i8)); + } + if ((Mask[0] == -1 || (Mask[0] >= 4 && Mask[0] < 6)) && + (Mask[1] == -1 || Mask[1] < 2) && + (Mask[2] == -1 || Mask[2] >= 6) && + (Mask[3] == -1 || (Mask[3] >= 2 && Mask[3] < 4))) { + unsigned SHUFPDMask = (Mask[0] == 5) | ((Mask[1] == 1) << 1) | + ((Mask[2] == 7) << 2) | ((Mask[3] == 3) << 3); + return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f64, V2, V1, + DAG.getConstant(SHUFPDMask, MVT::i8)); + } + + // Otherwise fall back on generic blend lowering. + return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2, + Mask, DAG); +} + +/// \brief Handle lowering of 4-lane 64-bit integer shuffles. +/// +/// This routine is only called when we have AVX2 and thus a reasonable +/// instruction set for v4i64 shuffling.. +static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast(Op); + ArrayRef Mask = SVOp->getMask(); + assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); + assert(Subtarget->hasAVX2() && "We can only lower v4i64 with AVX2!"); + + SmallVector WidenedMask; + if (canWidenShuffleElements(Mask, WidenedMask)) + return lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask, Subtarget, + DAG); + + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask, + Subtarget, DAG)) + return Blend; + + // Check for being able to broadcast a single element. + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4i64, DL, V1, + Mask, Subtarget, DAG)) + return Broadcast; + + // When the shuffle is mirrored between the 128-bit lanes of the unit, we can + // use lower latency instructions that will operate on both 128-bit lanes. + SmallVector RepeatedMask; + if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) { + if (isSingleInputShuffleMask(Mask)) { + int PSHUFDMask[] = {-1, -1, -1, -1}; + for (int i = 0; i < 2; ++i) + if (RepeatedMask[i] >= 0) { + PSHUFDMask[2 * i] = 2 * RepeatedMask[i]; + PSHUFDMask[2 * i + 1] = 2 * RepeatedMask[i] + 1; + } + return DAG.getNode( + ISD::BITCAST, DL, MVT::v4i64, + DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, + DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, V1), + getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG))); + } + + // Use dedicated unpack instructions for masks that match their pattern. + if (isShuffleEquivalent(Mask, 0, 4, 2, 6)) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V1, V2); + if (isShuffleEquivalent(Mask, 1, 5, 3, 7)) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V1, V2); + } + + // AVX2 provides a direct instruction for permuting a single input across + // lanes. + if (isSingleInputShuffleMask(Mask)) + return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1, + getV4X86ShuffleImm8ForMask(Mask, DAG)); + + // Otherwise fall back on generic blend lowering. + return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2, + Mask, DAG); +} + +/// \brief Handle lowering of 8-lane 32-bit floating point shuffles. 
+/// +/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2 +/// isn't available. +static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast(Op); + ArrayRef Mask = SVOp->getMask(); + assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); + + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask, + Subtarget, DAG)) + return Blend; + + // Check for being able to broadcast a single element. + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8f32, DL, V1, + Mask, Subtarget, DAG)) + return Broadcast; + + // If the shuffle mask is repeated in each 128-bit lane, we have many more + // options to efficiently lower the shuffle. + SmallVector RepeatedMask; + if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) { + assert(RepeatedMask.size() == 4 && + "Repeated masks must be half the mask width!"); + if (isSingleInputShuffleMask(Mask)) + return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1, + getV4X86ShuffleImm8ForMask(RepeatedMask, DAG)); + + // Use dedicated unpack instructions for masks that match their pattern. + if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 4, 12, 5, 13)) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f32, V1, V2); + if (isShuffleEquivalent(Mask, 2, 10, 3, 11, 6, 14, 7, 15)) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f32, V1, V2); + + // Otherwise, fall back to a SHUFPS sequence. Here it is important that we + // have already handled any direct blends. We also need to squash the + // repeated mask into a simulated v4f32 mask. + for (int i = 0; i < 4; ++i) + if (RepeatedMask[i] >= 8) + RepeatedMask[i] -= 4; + return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG); + } + + // If we have a single input shuffle with different shuffle patterns in the + // two 128-bit lanes use the variable mask to VPERMILPS. + if (isSingleInputShuffleMask(Mask)) { + SDValue VPermMask[8]; + for (int i = 0; i < 8; ++i) + VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32) + : DAG.getConstant(Mask[i], MVT::i32); + if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) + return DAG.getNode( + X86ISD::VPERMILPV, DL, MVT::v8f32, V1, + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask)); + + if (Subtarget->hasAVX2()) + return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, + DAG.getNode(ISD::BITCAST, DL, MVT::v8f32, + DAG.getNode(ISD::BUILD_VECTOR, DL, + MVT::v8i32, VPermMask)), + V1); + + // Otherwise, fall back. + return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask, + DAG); + } + + // Otherwise fall back on generic blend lowering. + return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2, + Mask, DAG); +} + +/// \brief Handle lowering of 8-lane 32-bit integer shuffles. +/// +/// This routine is only called when we have AVX2 and thus a reasonable +/// instruction set for v8i32 shuffling.. 
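
// Aside (illustrative sketch, not part of this patch): what "repeated in each
// 128-bit lane" means for the v8f32 path above and the v8i32 path below.
// isLaneRepeatedV8 is a hypothetical, simplified single-input model of
// is128BitLaneRepeatedShuffleMask (the real helper also handles the second
// input and sentinel values); -1 means undef.
#include <array>
static bool isLaneRepeatedV8(const std::array<int, 8> &Mask,
                             std::array<int, 4> &Repeated) {
  Repeated.fill(-1);
  for (int i = 0; i < 8; ++i) {
    if (Mask[i] < 0)
      continue;                     // undef matches any pattern
    if (Mask[i] / 4 != i / 4)
      return false;                 // element would cross a 128-bit lane
    int InLane = Mask[i] % 4;
    if (Repeated[i % 4] < 0)
      Repeated[i % 4] = InLane;     // first lane fixes the pattern
    else if (Repeated[i % 4] != InLane)
      return false;                 // the two lanes disagree
  }
  return true;
}
// <1,0,3,2, 5,4,7,6> repeats as <1,0,3,2>, so a single in-lane VPERMILPS or
// PSHUFD immediate covers both lanes; <1,0,3,2, 4,5,6,7> does not repeat
// (lane 1 is an identity) and falls through to the later variable-mask or
// decomposed paths.
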
+static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast(Op); + ArrayRef Mask = SVOp->getMask(); + assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); + assert(Subtarget->hasAVX2() && "We can only lower v8i32 with AVX2!"); - if (SDValue Blend = - lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask, DAG)) + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask, + Subtarget, DAG)) return Blend; - // Check if the blend happens to exactly fit that of SHUFPD. - if (Mask[0] < 4 && (Mask[1] == -1 || Mask[1] >= 4) && - Mask[2] < 4 && (Mask[3] == -1 || Mask[3] >= 4)) { - unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 5) << 1) | - ((Mask[2] == 3) << 2) | ((Mask[3] == 7) << 3); - return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f64, V1, V2, - DAG.getConstant(SHUFPDMask, MVT::i8)); - } - if ((Mask[0] == -1 || Mask[0] >= 4) && Mask[1] < 4 && - (Mask[2] == -1 || Mask[2] >= 4) && Mask[3] < 4) { - unsigned SHUFPDMask = (Mask[0] == 5) | ((Mask[1] == 1) << 1) | - ((Mask[2] == 7) << 2) | ((Mask[3] == 3) << 3); - return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f64, V2, V1, - DAG.getConstant(SHUFPDMask, MVT::i8)); - } + // Check for being able to broadcast a single element. + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8i32, DL, V1, + Mask, Subtarget, DAG)) + return Broadcast; - // Shuffle the input elements into the desired positions in V1 and V2 and - // blend them together. - int V1Mask[] = {-1, -1, -1, -1}; - int V2Mask[] = {-1, -1, -1, -1}; - for (int i = 0; i < 4; ++i) - if (Mask[i] >= 0 && Mask[i] < 4) - V1Mask[i] = Mask[i]; - else if (Mask[i] >= 4) - V2Mask[i] = Mask[i] - 4; + // If the shuffle mask is repeated in each 128-bit lane we can use more + // efficient instructions that mirror the shuffles across the two 128-bit + // lanes. + SmallVector RepeatedMask; + if (is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask)) { + assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!"); + if (isSingleInputShuffleMask(Mask)) + return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1, + getV4X86ShuffleImm8ForMask(RepeatedMask, DAG)); - V1 = DAG.getVectorShuffle(MVT::v4f64, DL, V1, DAG.getUNDEF(MVT::v4f64), V1Mask); - V2 = DAG.getVectorShuffle(MVT::v4f64, DL, V2, DAG.getUNDEF(MVT::v4f64), V2Mask); + // Use dedicated unpack instructions for masks that match their pattern. + if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 4, 12, 5, 13)) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i32, V1, V2); + if (isShuffleEquivalent(Mask, 2, 10, 3, 11, 6, 14, 7, 15)) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i32, V1, V2); + } - unsigned BlendMask = 0; - for (int i = 0; i < 4; ++i) - if (Mask[i] >= 4) - BlendMask |= 1 << i; + // If the shuffle patterns aren't repeated but it is a single input, directly + // generate a cross-lane VPERMD instruction. + if (isSingleInputShuffleMask(Mask)) { + SDValue VPermMask[8]; + for (int i = 0; i < 8; ++i) + VPermMask[i] = Mask[i] < 0 ? 
DAG.getUNDEF(MVT::i32) + : DAG.getConstant(Mask[i], MVT::i32); + return DAG.getNode( + X86ISD::VPERMV, DL, MVT::v8i32, + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask), V1); + } - return DAG.getNode(X86ISD::BLENDI, DL, MVT::v4f64, V1, V2, - DAG.getConstant(BlendMask, MVT::i8)); + // Otherwise fall back on generic blend lowering. + return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2, + Mask, DAG); } -/// \brief Handle lowering of 4-lane 64-bit integer shuffles. +/// \brief Handle lowering of 16-lane 16-bit integer shuffles. /// -/// Largely delegates to common code when we have AVX2 and to the floating-point -/// code when we only have AVX. -static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, - const X86Subtarget *Subtarget, - SelectionDAG &DAG) { +/// This routine is only called when we have AVX2 and thus a reasonable +/// instruction set for v16i16 shuffling.. +static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { SDLoc DL(Op); - assert(Op.getSimpleValueType() == MVT::v4i64 && "Bad shuffle type!"); - assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!"); - assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!"); + assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!"); ShuffleVectorSDNode *SVOp = cast(Op); ArrayRef Mask = SVOp->getMask(); - assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); + assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); + assert(Subtarget->hasAVX2() && "We can only lower v16i16 with AVX2!"); + + // Check for being able to broadcast a single element. + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v16i16, DL, V1, + Mask, Subtarget, DAG)) + return Broadcast; + + // There are no generalized cross-lane shuffle operations available on i16 + // element types. + if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) + return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2, + Mask, DAG); - // FIXME: If we have AVX2, we should delegate to generic code as crossing - // shuffles aren't a problem and FP and int have the same patterns. + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask, + Subtarget, DAG)) + return Blend; + + // Use dedicated unpack instructions for masks that match their pattern. + if (isShuffleEquivalent(Mask, + // First 128-bit lane: + 0, 16, 1, 17, 2, 18, 3, 19, + // Second 128-bit lane: + 8, 24, 9, 25, 10, 26, 11, 27)) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i16, V1, V2); + if (isShuffleEquivalent(Mask, + // First 128-bit lane: + 4, 20, 5, 21, 6, 22, 7, 23, + // Second 128-bit lane: + 12, 28, 13, 29, 14, 30, 15, 31)) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i16, V1, V2); + + if (isSingleInputShuffleMask(Mask)) { + SDValue PSHUFBMask[32]; + for (int i = 0; i < 16; ++i) { + if (Mask[i] == -1) { + PSHUFBMask[2 * i] = PSHUFBMask[2 * i + 1] = DAG.getUNDEF(MVT::i8); + continue; + } - if (is128BitLaneCrossingShuffleMask(MVT::v4i64, Mask)) - return splitAndLower256BitVectorShuffle(Op, V1, V2, Subtarget, DAG); + int M = i < 8 ? 
Mask[i] : Mask[i] - 8; + assert(M >= 0 && M < 8 && "Invalid single-input mask!"); + PSHUFBMask[2 * i] = DAG.getConstant(2 * M, MVT::i8); + PSHUFBMask[2 * i + 1] = DAG.getConstant(2 * M + 1, MVT::i8); + } + return DAG.getNode( + ISD::BITCAST, DL, MVT::v16i16, + DAG.getNode( + X86ISD::PSHUFB, DL, MVT::v32i8, + DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V1), + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask))); + } - // AVX1 doesn't provide any facilities for v4i64 shuffles, bitcast and - // delegate to floating point code. - V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v4f64, V1); - V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v4f64, V2); - return DAG.getNode(ISD::BITCAST, DL, MVT::v4i64, - lowerV4F64VectorShuffle(Op, V1, V2, Subtarget, DAG)); + // Otherwise fall back on generic blend lowering. + return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i16, V1, V2, + Mask, DAG); } -/// \brief Handle lowering of 8-lane 32-bit floating point shuffles. +/// \brief Handle lowering of 32-lane 8-bit integer shuffles. /// -/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2 -/// isn't available. -static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, +/// This routine is only called when we have AVX2 and thus a reasonable +/// instruction set for v32i8 shuffling.. +static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc DL(Op); - assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!"); - assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!"); + assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!"); ShuffleVectorSDNode *SVOp = cast(Op); ArrayRef Mask = SVOp->getMask(); - assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); + assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!"); + assert(Subtarget->hasAVX2() && "We can only lower v32i8 with AVX2!"); - if (is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask) || - isSingleInputShuffleMask(Mask)) - return splitAndLower256BitVectorShuffle(Op, V1, V2, Subtarget, DAG); + // Check for being able to broadcast a single element. + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v32i8, DL, V1, + Mask, Subtarget, DAG)) + return Broadcast; - // Shuffle the input elements into the desired positions in V1 and V2 and - // blend them together. - int V1Mask[] = {-1, -1, -1, -1, -1, -1, -1, -1}; - int V2Mask[] = {-1, -1, -1, -1, -1, -1, -1, -1}; - unsigned BlendMask = 0; - for (int i = 0; i < 8; ++i) - if (Mask[i] >= 0 && Mask[i] < 8) { - V1Mask[i] = Mask[i]; - } else if (Mask[i] >= 8) { - V2Mask[i] = Mask[i] - 8; - BlendMask |= 1 << i; - } + // There are no generalized cross-lane shuffle operations available on i8 + // element types. + if (is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) + return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, + Mask, DAG); + + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask, + Subtarget, DAG)) + return Blend; + + // Use dedicated unpack instructions for masks that match their pattern. + // Note that these are repeated 128-bit lane unpacks, not unpacks across all + // 256-bit lanes. 
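
// Aside (illustrative sketch, not part of this patch): AVX2 byte unpacks work
// per 128-bit lane. For v32i8, UNPCKL interleaves bytes 0..7 of each lane of A
// with bytes 0..7 of the same lane of B, and UNPCKH does bytes 8..15. The
// hypothetical helper below builds the index pattern that the
// isShuffleEquivalent() checks just below are matching (indices 0..31 = A,
// 32..63 = B).
#include <array>
static std::array<int, 32> unpackLoV32I8Mask() {
  std::array<int, 32> M{};
  for (int Lane = 0; Lane < 2; ++Lane)          // two independent 128-bit lanes
    for (int i = 0; i < 8; ++i) {               // low 8 bytes of each lane
      M[Lane * 16 + 2 * i + 0] = Lane * 16 + i;       // byte from A
      M[Lane * 16 + 2 * i + 1] = 32 + Lane * 16 + i;  // byte from B
    }
  return M;
}
// This yields <0,32,1,33,...,7,39, 16,48,17,49,...,23,55>, i.e. exactly the
// first pattern tested below; swapping the "low 8 bytes" for bytes 8..15 gives
// the UNPCKH pattern.
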
+ if (isShuffleEquivalent( + Mask, + // First 128-bit lane: + 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, + // Second 128-bit lane: + 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55)) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v32i8, V1, V2); + if (isShuffleEquivalent( + Mask, + // First 128-bit lane: + 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47, + // Second 128-bit lane: + 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63)) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v32i8, V1, V2); + + if (isSingleInputShuffleMask(Mask)) { + SDValue PSHUFBMask[32]; + for (int i = 0; i < 32; ++i) + PSHUFBMask[i] = + Mask[i] < 0 + ? DAG.getUNDEF(MVT::i8) + : DAG.getConstant(Mask[i] < 16 ? Mask[i] : Mask[i] - 16, MVT::i8); - V1 = DAG.getVectorShuffle(MVT::v8f32, DL, V1, DAG.getUNDEF(MVT::v8f32), V1Mask); - V2 = DAG.getVectorShuffle(MVT::v8f32, DL, V2, DAG.getUNDEF(MVT::v8f32), V2Mask); + return DAG.getNode( + X86ISD::PSHUFB, DL, MVT::v32i8, V1, + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask)); + } - return DAG.getNode(X86ISD::BLENDI, DL, MVT::v8f32, V1, V2, - DAG.getConstant(BlendMask, MVT::i8)); + // Otherwise fall back on generic blend lowering. + return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v32i8, V1, V2, + Mask, DAG); } /// \brief High-level routine to lower various 256-bit x86 vector shuffles. @@ -9347,6 +10148,30 @@ static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2, MVT VT, const X86Subtarget *Subtarget, SelectionDAG &DAG) { + SDLoc DL(Op); + ShuffleVectorSDNode *SVOp = cast(Op); + ArrayRef Mask = SVOp->getMask(); + + // There is a really nice hard cut-over between AVX1 and AVX2 that means we can + // check for those subtargets here and avoid much of the subtarget querying in + // the per-vector-type lowering routines. With AVX1 we have essentially *zero* + // ability to manipulate a 256-bit vector with integer types. Since we'll use + // floating point types there eventually, just immediately cast everything to + // a float and operate entirely in that domain. + if (VT.isInteger() && !Subtarget->hasAVX2()) { + int ElementBits = VT.getScalarSizeInBits(); + if (ElementBits < 32) + // No floating point type available, decompose into 128-bit vectors. + return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); + + MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits), + VT.getVectorNumElements()); + V1 = DAG.getNode(ISD::BITCAST, DL, FpVT, V1); + V2 = DAG.getNode(ISD::BITCAST, DL, FpVT, V2); + return DAG.getNode(ISD::BITCAST, DL, VT, + DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask)); + } + switch (VT.SimpleTy) { case MVT::v4f64: return lowerV4F64VectorShuffle(Op, V1, V2, Subtarget, DAG); @@ -9355,29 +10180,156 @@ static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2, case MVT::v8f32: return lowerV8F32VectorShuffle(Op, V1, V2, Subtarget, DAG); case MVT::v8i32: + return lowerV8I32VectorShuffle(Op, V1, V2, Subtarget, DAG); case MVT::v16i16: + return lowerV16I16VectorShuffle(Op, V1, V2, Subtarget, DAG); case MVT::v32i8: - // Fall back to the basic pattern of extracting the high half and forming - // a 4-way blend. - // FIXME: Add targeted lowering for each type that can document rationale - // for delegating to this when necessary. 
- return splitAndLower256BitVectorShuffle(Op, V1, V2, Subtarget, DAG); + return lowerV32I8VectorShuffle(Op, V1, V2, Subtarget, DAG); default: llvm_unreachable("Not a valid 256-bit x86 vector type!"); } } -/// \brief Tiny helper function to test whether a shuffle mask could be -/// simplified by widening the elements being shuffled. -static bool canWidenShuffleElements(ArrayRef Mask) { - for (int i = 0, Size = Mask.size(); i < Size; i += 2) - if ((Mask[i] != -1 && Mask[i] % 2 != 0) || - (Mask[i + 1] != -1 && (Mask[i + 1] % 2 != 1 || - (Mask[i] != -1 && Mask[i] + 1 != Mask[i + 1])))) - return false; +/// \brief Handle lowering of 8-lane 64-bit floating point shuffles. +static SDValue lowerV8F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast(Op); + ArrayRef Mask = SVOp->getMask(); + assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); - return true; + // FIXME: Implement direct support for this type! + return splitAndLowerVectorShuffle(DL, MVT::v8f64, V1, V2, Mask, DAG); +} + +/// \brief Handle lowering of 16-lane 32-bit floating point shuffles. +static SDValue lowerV16F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast(Op); + ArrayRef Mask = SVOp->getMask(); + assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); + + // FIXME: Implement direct support for this type! + return splitAndLowerVectorShuffle(DL, MVT::v16f32, V1, V2, Mask, DAG); +} + +/// \brief Handle lowering of 8-lane 64-bit integer shuffles. +static SDValue lowerV8I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast(Op); + ArrayRef Mask = SVOp->getMask(); + assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); + + // FIXME: Implement direct support for this type! + return splitAndLowerVectorShuffle(DL, MVT::v8i64, V1, V2, Mask, DAG); +} + +/// \brief Handle lowering of 16-lane 32-bit integer shuffles. +static SDValue lowerV16I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast(Op); + ArrayRef Mask = SVOp->getMask(); + assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); + + // FIXME: Implement direct support for this type! + return splitAndLowerVectorShuffle(DL, MVT::v16i32, V1, V2, Mask, DAG); +} + +/// \brief Handle lowering of 32-lane 16-bit integer shuffles. 
+static SDValue lowerV32I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+                                        const X86Subtarget *Subtarget,
+                                        SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
+  assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
+  ShuffleVectorSDNode *SVOp = cast(Op);
+  ArrayRef Mask = SVOp->getMask();
+  assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
+  assert(Subtarget->hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
+
+  // FIXME: Implement direct support for this type!
+  return splitAndLowerVectorShuffle(DL, MVT::v32i16, V1, V2, Mask, DAG);
+}
+
+/// \brief Handle lowering of 64-lane 8-bit integer shuffles.
+static SDValue lowerV64I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+                                       const X86Subtarget *Subtarget,
+                                       SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
+  assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
+  ShuffleVectorSDNode *SVOp = cast(Op);
+  ArrayRef Mask = SVOp->getMask();
+  assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
+  assert(Subtarget->hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
+
+  // FIXME: Implement direct support for this type!
+  return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
+}
+
+/// \brief High-level routine to lower various 512-bit x86 vector shuffles.
+///
+/// This routine either breaks down the specific type of a 512-bit x86 vector
+/// shuffle or splits it into two 256-bit shuffles and fuses the results back
+/// together based on the available instructions.
+static SDValue lower512BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+                                        MVT VT, const X86Subtarget *Subtarget,
+                                        SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  ShuffleVectorSDNode *SVOp = cast(Op);
+  ArrayRef Mask = SVOp->getMask();
+  assert(Subtarget->hasAVX512() &&
+         "Cannot lower 512-bit vectors w/ basic ISA!");
+
+  // Check for being able to broadcast a single element.
+  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(VT.SimpleTy, DL, V1,
+                                                        Mask, Subtarget, DAG))
+    return Broadcast;
+
+  // Dispatch to each element type for lowering. If we don't have support for
+  // specific element type shuffles at 512 bits, immediately split them and
+  // lower them. Each lowering routine of a given type is allowed to assume that
+  // the requisite ISA extensions for that element type are available.
+  switch (VT.SimpleTy) {
+  case MVT::v8f64:
+    return lowerV8F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
+  case MVT::v16f32:
+    return lowerV16F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
+  case MVT::v8i64:
+    return lowerV8I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
+  case MVT::v16i32:
+    return lowerV16I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
+  case MVT::v32i16:
+    if (Subtarget->hasBWI())
+      return lowerV32I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
+    break;
+  case MVT::v64i8:
+    if (Subtarget->hasBWI())
+      return lowerV64I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
+    break;
+
+  default:
+    llvm_unreachable("Not a valid 512-bit x86 vector type!");
+  }
+
+  // Otherwise fall back on splitting.
+  return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
+}
 /// \brief Top-level lowering for x86 vector shuffles.
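
// Aside (illustrative sketch, not part of this patch): the mask property that
// the lowerVectorShuffleAsBroadcast calls throughout this section test for,
// modeled as a hypothetical standalone helper. Every defined element must ask
// for the same input element; the real routine additionally checks the
// subtarget and whether the element comes from a register or a load when
// choosing a VBROADCAST/VPBROADCAST form.
#include <vector>
static bool isBroadcastMask(const std::vector<int> &Mask, int &BroadcastIdx) {
  BroadcastIdx = -1;
  for (int M : Mask) {
    if (M < 0)
      continue;                        // undef lanes are compatible
    if (BroadcastIdx >= 0 && M != BroadcastIdx)
      return false;                    // two different elements requested
    BroadcastIdx = M;
  }
  return BroadcastIdx >= 0;            // need at least one defined element
}
// e.g. <3,3,3,3,3,3,3,3> broadcasts element 3, and <3,-1,3,3,3,3,3,3> does too.
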
@@ -9423,24 +10375,25 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget, return DAG.getVectorShuffle(VT, dl, V1, V2, NewMask); } - // For integer vector shuffles, try to collapse them into a shuffle of fewer - // lanes but wider integers. We cap this to not form integers larger than i64 - // but it might be interesting to form i128 integers to handle flipping the - // low and high halves of AVX 256-bit vectors. - if (VT.isInteger() && VT.getScalarSizeInBits() < 64 && - canWidenShuffleElements(Mask)) { - SmallVector NewMask; - for (int i = 0, Size = Mask.size(); i < Size; i += 2) - NewMask.push_back(Mask[i] != -1 - ? Mask[i] / 2 - : (Mask[i + 1] != -1 ? Mask[i + 1] / 2 : -1)); - MVT NewVT = - MVT::getVectorVT(MVT::getIntegerVT(VT.getScalarSizeInBits() * 2), - VT.getVectorNumElements() / 2); - V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1); - V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2); - return DAG.getNode(ISD::BITCAST, dl, VT, - DAG.getVectorShuffle(NewVT, dl, V1, V2, NewMask)); + // Try to collapse shuffles into using a vector type with fewer elements but + // wider element types. We cap this to not form integers or floating point + // elements wider than 64 bits, but it might be interesting to form i128 + // integers to handle flipping the low and high halves of AVX 256-bit vectors. + SmallVector WidenedMask; + if (VT.getScalarSizeInBits() < 64 && + canWidenShuffleElements(Mask, WidenedMask)) { + MVT NewEltVT = VT.isFloatingPoint() + ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2) + : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2); + MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2); + // Make sure that the new vector type is legal. For example, v2f64 isn't + // legal on SSE1. + if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) { + V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1); + V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2); + return DAG.getNode(ISD::BITCAST, dl, VT, + DAG.getVectorShuffle(NewVT, dl, V1, V2, WidenedMask)); + } } int NumV1Elements = 0, NumUndefElements = 0, NumV2Elements = 0; @@ -9459,7 +10412,9 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget, return DAG.getCommutedVectorShuffle(*SVOp); // When the number of V1 and V2 elements are the same, try to minimize the - // number of uses of V2 in the low half of the vector. + // number of uses of V2 in the low half of the vector. When that is tied, + // ensure that the sum of indices for V1 is equal to or lower than the sum + // indices for V2. if (NumV1Elements == NumV2Elements) { int LowV1Elements = 0, LowV2Elements = 0; for (int M : SVOp->getMask().slice(0, NumElements / 2)) @@ -9467,8 +10422,18 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget, ++LowV2Elements; else if (M >= 0) ++LowV1Elements; - if (LowV2Elements > LowV1Elements) + if (LowV2Elements > LowV1Elements) { return DAG.getCommutedVectorShuffle(*SVOp); + } else if (LowV2Elements == LowV1Elements) { + int SumV1Indices = 0, SumV2Indices = 0; + for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i) + if (SVOp->getMask()[i] >= NumElements) + SumV2Indices += i; + else if (SVOp->getMask()[i] >= 0) + SumV1Indices += i; + if (SumV2Indices < SumV1Indices) + return DAG.getCommutedVectorShuffle(*SVOp); + } } // For each vector width, delegate to a specialized lowering routine. 
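
// Aside (illustrative sketch, not part of this patch): the element-widening
// collapse performed by canWidenShuffleElements in the hunk above. A mask over
// N elements can be rewritten over N/2 elements of twice the width when each
// even/odd pair addresses a naturally aligned pair of source elements, e.g.
// for v8i16 reinterpreted as v4i32, <2,3,0,1,10,11,8,9> becomes <1,0,5,4>.
// widenShuffleMask is a hypothetical simplified version (sentinel -1 = undef;
// it ignores the zero sentinel the real helper may also handle).
#include <vector>
static bool widenShuffleMask(const std::vector<int> &Mask,
                             std::vector<int> &Widened) {
  Widened.clear();
  for (size_t i = 0; i + 1 < Mask.size(); i += 2) {
    int Lo = Mask[i], Hi = Mask[i + 1];
    if (Lo < 0 && Hi < 0) {
      Widened.push_back(-1);           // both halves undef -> undef wide element
    } else if (Lo >= 0 && Lo % 2 == 0 && (Hi < 0 || Hi == Lo + 1)) {
      Widened.push_back(Lo / 2);       // aligned pair starting at an even index
    } else if (Lo < 0 && Hi >= 0 && Hi % 2 == 1) {
      Widened.push_back(Hi / 2);       // low half undef, high half pins the pair
    } else {
      return false;                    // pair does not form one wider element
    }
  }
  return true;
}
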
@@ -9478,6 +10443,11 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget, if (VT.getSizeInBits() == 256) return lower256BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG); + // Force AVX-512 vectors to be scalarized for now. + // FIXME: Implement AVX-512 support! + if (VT.getSizeInBits() == 512) + return lower512BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG); + llvm_unreachable("Unimplemented!"); } @@ -10648,37 +11618,6 @@ static SDValue LowerVectorIntExtend(SDValue Op, const X86Subtarget *Subtarget, if (!DAG.getTargetLoweringInfo().isTypeLegal(NVT)) return SDValue(); - // Simplify the operand as it's prepared to be fed into shuffle. - unsigned SignificantBits = NVT.getSizeInBits() >> Shift; - if (V1.getOpcode() == ISD::BITCAST && - V1.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR && - V1.getOperand(0).getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT && - V1.getOperand(0).getOperand(0) - .getSimpleValueType().getSizeInBits() == SignificantBits) { - // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x) - SDValue V = V1.getOperand(0).getOperand(0).getOperand(0); - ConstantSDNode *CIdx = - dyn_cast(V1.getOperand(0).getOperand(0).getOperand(1)); - // If it's foldable, i.e. normal load with single use, we will let code - // selection to fold it. Otherwise, we will short the conversion sequence. - if (CIdx && CIdx->getZExtValue() == 0 && - (!ISD::isNormalLoad(V.getNode()) || !V.hasOneUse())) { - MVT FullVT = V.getSimpleValueType(); - MVT V1VT = V1.getSimpleValueType(); - if (FullVT.getSizeInBits() > V1VT.getSizeInBits()) { - // The "ext_vec_elt" node is wider than the result node. - // In this case we should extract subvector from V. - // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast (extract_subvector x)). - unsigned Ratio = FullVT.getSizeInBits() / V1VT.getSizeInBits(); - MVT SubVecVT = MVT::getVectorVT(FullVT.getVectorElementType(), - FullVT.getVectorNumElements()/Ratio); - V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVecVT, V, - DAG.getIntPtrConstant(0)); - } - V1 = DAG.getNode(ISD::BITCAST, DL, V1VT, V); - } - } - return DAG.getNode(ISD::BITCAST, DL, VT, DAG.getNode(X86ISD::VZEXT, DL, NVT, V1)); } @@ -10828,7 +11767,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG); if (HasFp256 && (VT == MVT::v4f32 || VT == MVT::v2f64)) - return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1, TargetMask, + return getTargetShuffleNode(X86ISD::VPERMILPI, dl, VT, V1, TargetMask, DAG); return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V1, @@ -11010,7 +11949,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { if ((HasInt256 && VT == MVT::v8i32) || VT == MVT::v16i32) return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, getShuffleSHUFImmediate(SVOp), DAG); - return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1, + return getTargetShuffleNode(X86ISD::VPERMILPI, dl, VT, V1, getShuffleSHUFImmediate(SVOp), DAG); } @@ -11129,9 +12068,10 @@ static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector, return true; } -// Try to lower a vselect node into a simple blend instruction. -static SDValue LowerVSELECTtoBlend(SDValue Op, const X86Subtarget *Subtarget, - SelectionDAG &DAG) { +/// \brief Try to lower a VSELECT instruction to an immediate-controlled blend +/// instruction. 
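
// Aside (illustrative sketch, not part of this patch): the idea behind
// lowerVSELECTtoBLENDI below. When the VSELECT condition is a build_vector of
// constants, the per-lane choice collapses to a single bit per lane, which is
// the immediate form BLENDPS/BLENDPD/PBLENDW take; BUILD_VECTORtoBlendMask
// above performs roughly this extraction on the real DAG constants. A minimal
// standalone model (the polarity of the bit in the final instruction is left
// aside here):
#include <array>
static unsigned blendImmFromConstCond(const std::array<bool, 8> &Cond) {
  unsigned Imm = 0;
  for (unsigned i = 0; i < Cond.size(); ++i)
    if (Cond[i])
      Imm |= 1u << i;                  // one immediate bit per vector element
  return Imm;
}
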
+static SDValue lowerVSELECTtoBLENDI(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { SDValue Cond = Op.getOperand(0); SDValue LHS = Op.getOperand(1); SDValue RHS = Op.getOperand(2); @@ -11179,8 +12119,8 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const { ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) && ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode())) return SDValue(); - - SDValue BlendOp = LowerVSELECTtoBlend(Op, Subtarget, DAG); + + SDValue BlendOp = lowerVSELECTtoBLENDI(Op, Subtarget, DAG); if (BlendOp.getNode()) return BlendOp; @@ -12859,12 +13799,24 @@ static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) { In, DAG.getUNDEF(SVT))); } -// The only differences between FABS and FNEG are the mask and the logic op. +/// The only differences between FABS and FNEG are the mask and the logic op. +/// FNEG also has a folding opportunity for FNEG(FABS(x)). static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) { assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) && "Wrong opcode for lowering FABS or FNEG."); bool IsFABS = (Op.getOpcode() == ISD::FABS); + + // If this is a FABS and it has an FNEG user, bail out to fold the combination + // into an FNABS. We'll lower the FABS after that if it is still in use. + if (IsFABS) + for (SDNode *User : Op->uses()) + if (User->getOpcode() == ISD::FNEG) + return Op; + + SDValue Op0 = Op.getOperand(0); + bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS); + SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); // Assume scalar op for initialization; update for vector if needed. @@ -12900,15 +13852,19 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) { // For a vector, cast operands to a vector type, perform the logic op, // and cast the result back to the original value type. MVT VecVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64); - SDValue Op0Casted = DAG.getNode(ISD::BITCAST, dl, VecVT, Op.getOperand(0)); SDValue MaskCasted = DAG.getNode(ISD::BITCAST, dl, VecVT, Mask); - unsigned LogicOp = IsFABS ? ISD::AND : ISD::XOR; + SDValue Operand = IsFNABS ? + DAG.getNode(ISD::BITCAST, dl, VecVT, Op0.getOperand(0)) : + DAG.getNode(ISD::BITCAST, dl, VecVT, Op0); + unsigned BitOp = IsFABS ? ISD::AND : IsFNABS ? ISD::OR : ISD::XOR; return DAG.getNode(ISD::BITCAST, dl, VT, - DAG.getNode(LogicOp, dl, VecVT, Op0Casted, MaskCasted)); + DAG.getNode(BitOp, dl, VecVT, Operand, MaskCasted)); } + // If not vector, then scalar. - unsigned LogicOp = IsFABS ? X86ISD::FAND : X86ISD::FXOR; - return DAG.getNode(LogicOp, dl, VT, Op.getOperand(0), Mask); + unsigned BitOp = IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR; + SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0; + return DAG.getNode(BitOp, dl, VT, Operand, Mask); } static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { @@ -13002,8 +13958,7 @@ static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, VT)); } -// LowerVectorAllZeroTest - Check whether an OR'd tree is PTEST-able. -// +// Check whether an OR'd tree is PTEST-able. static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree."); @@ -13411,6 +14366,37 @@ SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp, return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl); } +/// The minimum architected relative accuracy is 2^-12. 
We need one +/// Newton-Raphson step to have a good float result (24 bits of precision). +SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op, + DAGCombinerInfo &DCI, + unsigned &RefinementSteps, + bool &UseOneConstNR) const { + // FIXME: We should use instruction latency models to calculate the cost of + // each potential sequence, but this is very hard to do reliably because + // at least Intel's Core* chips have variable timing based on the number of + // significant digits in the divisor and/or sqrt operand. + if (!Subtarget->useSqrtEst()) + return SDValue(); + + EVT VT = Op.getValueType(); + + // SSE1 has rsqrtss and rsqrtps. + // TODO: Add support for AVX512 (v16f32). + // It is likely not profitable to do this for f64 because a double-precision + // rsqrt estimate with refinement on x86 prior to FMA requires at least 16 + // instructions: convert to single, rsqrtss, convert back to double, refine + // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA + // along with FMA, this could be a throughput win. + if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) || + (Subtarget->hasAVX() && VT == MVT::v8f32)) { + RefinementSteps = 1; + UseOneConstNR = false; + return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op); + } + return SDValue(); +} + static bool isAllOnes(SDValue V) { ConstantSDNode *C = dyn_cast(V); return C && C->isAllOnesValue(); @@ -14195,13 +15181,32 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops); } -static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, SelectionDAG &DAG) { +static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { MVT VT = Op->getSimpleValueType(0); SDValue In = Op->getOperand(0); MVT InVT = In.getSimpleValueType(); + MVT VTElt = VT.getVectorElementType(); + MVT InVTElt = InVT.getVectorElementType(); SDLoc dl(Op); + // SKX processor + if ((InVTElt == MVT::i1) && + (((Subtarget->hasBWI() && Subtarget->hasVLX() && + VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() <= 16)) || + + ((Subtarget->hasBWI() && VT.is512BitVector() && + VTElt.getSizeInBits() <= 16)) || + + ((Subtarget->hasDQI() && Subtarget->hasVLX() && + VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() >= 32)) || + + ((Subtarget->hasDQI() && VT.is512BitVector() && + VTElt.getSizeInBits() >= 32)))) + return DAG.getNode(X86ISD::VSEXT, dl, VT, In); + unsigned int NumElts = VT.getVectorNumElements(); + if (NumElts != 8 && NumElts != 16) return SDValue(); @@ -14234,7 +15239,7 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget, SDLoc dl(Op); if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1) - return LowerSIGN_EXTEND_AVX512(Op, DAG); + return LowerSIGN_EXTEND_AVX512(Op, Subtarget, DAG); if ((VT != MVT::v4i64 || InVT != MVT::v4i32) && (VT != MVT::v8i32 || InVT != MVT::v8i16) && @@ -14844,7 +15849,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, EVT VT = Op.getNode()->getValueType(0); bool Is64Bit = Subtarget->is64Bit(); - EVT SPTy = Is64Bit ? MVT::i64 : MVT::i32; + EVT SPTy = getPointerTy(); if (SplitStack) { MachineRegisterInfo &MRI = MF.getRegInfo(); @@ -14862,7 +15867,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, } const TargetRegisterClass *AddrRegClass = - getRegClassFor(Subtarget->is64Bit() ? 
MVT::i64:MVT::i32); + getRegClassFor(getPointerTy()); unsigned Vreg = MRI.createVirtualRegister(AddrRegClass); Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size); SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain, @@ -14871,7 +15876,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, return DAG.getMergeValues(Ops1, dl); } else { SDValue Flag; - unsigned Reg = (Subtarget->is64Bit() ? X86::RAX : X86::EAX); + const unsigned Reg = (Subtarget->isTarget64BitLP64() ? X86::RAX : X86::EAX); Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag); Flag = Chain.getValue(1); @@ -15153,19 +16158,39 @@ static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT, return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt); } -/// \brief Return (vselect \p Mask, \p Op, \p PreservedSrc) along with the +/// \brief Return (and \p Op, \p Mask) for compare instructions or +/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the /// necessary casting for \p Mask when lowering masking intrinsics. static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, SelectionDAG &DAG) { EVT VT = Op.getValueType(); EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, VT.getVectorNumElements()); + EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + Mask.getValueType().getSizeInBits()); SDLoc dl(Op); assert(MaskVT.isSimple() && "invalid mask type"); - return DAG.getNode(ISD::VSELECT, dl, VT, - DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask), - Op, PreservedSrc); + + if (isAllOnes(Mask)) + return Op; + + // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements + // are extracted by EXTRACT_SUBVECTOR. + SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, + DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask), + DAG.getIntPtrConstant(0)); + + switch (Op.getOpcode()) { + default: break; + case X86ISD::PCMPEQM: + case X86ISD::PCMPGTM: + case X86ISD::CMPM: + case X86ISD::CMPMU: + return DAG.getNode(ISD::AND, dl, VT, Op, VMask); + } + + return DAG.getNode(ISD::VSELECT, dl, VT, VMask, Op, PreservedSrc); } static unsigned getOpcodeForFMAIntrinsic(unsigned IntNo) { @@ -15231,6 +16256,39 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { case INTR_TYPE_3OP: return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + case CMP_MASK: + case CMP_MASK_CC: { + // Comparison intrinsics with masks. + // Example of transformation: + // (i8 (int_x86_avx512_mask_pcmpeq_q_128 + // (v2i64 %a), (v2i64 %b), (i8 %mask))) -> + // (i8 (bitcast + // (v8i1 (insert_subvector undef, + // (v2i1 (and (PCMPEQM %a, %b), + // (extract_subvector + // (v8i1 (bitcast %mask)), 0))), 0)))) + EVT VT = Op.getOperand(1).getValueType(); + EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + VT.getVectorNumElements()); + SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 
4 : 3); + EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + Mask.getValueType().getSizeInBits()); + SDValue Cmp; + if (IntrData->Type == CMP_MASK_CC) { + Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1), + Op.getOperand(2), Op.getOperand(3)); + } else { + assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!"); + Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1), + Op.getOperand(2)); + } + SDValue CmpMask = getVectorMaskingNode(Cmp, Mask, + DAG.getTargetConstant(0, MaskVT), DAG); + SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT, + DAG.getUNDEF(BitcastVT), CmpMask, + DAG.getIntPtrConstant(0)); + return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res); + } case COMI: { // Comparison intrinsics ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1; SDValue LHS = Op.getOperand(1); @@ -17159,10 +18217,15 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { Cond = X86::COND_B; break; case ISD::SMULO: - BaseOp = X86ISD::SMUL; + BaseOp = N->getValueType(0) == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL; Cond = X86::COND_O; break; case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs + if (N->getValueType(0) == MVT::i8) { + BaseOp = X86ISD::UMUL8; + Cond = X86::COND_O; + break; + } SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0), MVT::i32); SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS); @@ -17271,8 +18334,11 @@ bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { return needsCmpXchgNb(SI->getValueOperand()->getType()); } -bool X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *SI) const { - return false; // FIXME, currently these are expanded separately in this file. +// Note: this turns large loads into lock cmpxchg8b/16b. +// FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b. +bool X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { + auto PTy = cast(LI->getPointerOperand()->getType()); + return needsCmpXchgNb(PTy->getElementType()); } bool X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { @@ -17312,6 +18378,74 @@ bool X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { } } +static bool hasMFENCE(const X86Subtarget& Subtarget) { + // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for + // no-sse2). There isn't any reason to disable it if the target processor + // supports it. + return Subtarget.hasSSE2() || Subtarget.is64Bit(); +} + +LoadInst * +X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { + const X86Subtarget &Subtarget = + getTargetMachine().getSubtarget(); + unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32; + const Type *MemType = AI->getType(); + // Accesses larger than the native width are turned into cmpxchg/libcalls, so + // there is no benefit in turning such RMWs into loads, and it is actually + // harmful as it introduces a mfence. + if (MemType->getPrimitiveSizeInBits() > NativeWidth) + return nullptr; + + auto Builder = IRBuilder<>(AI); + Module *M = Builder.GetInsertBlock()->getParent()->getParent(); + auto SynchScope = AI->getSynchScope(); + // We must restrict the ordering to avoid generating loads with Release or + // ReleaseAcquire orderings. + auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering()); + auto Ptr = AI->getPointerOperand(); + + // Before the load we need a fence. 
Here is an example lifted from + // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence + // is required: + // Thread 0: + // x.store(1, relaxed); + // r1 = y.fetch_add(0, release); + // Thread 1: + // y.fetch_add(42, acquire); + // r2 = x.load(relaxed); + // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is + // lowered to just a load without a fence. A mfence flushes the store buffer, + // making the optimization clearly correct. + // FIXME: it is required if isAtLeastRelease(Order) but it is not clear + // otherwise, we might be able to be more agressive on relaxed idempotent + // rmw. In practice, they do not look useful, so we don't try to be + // especially clever. + if (SynchScope == SingleThread) { + // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at + // the IR level, so we must wrap it in an intrinsic. + return nullptr; + } else if (hasMFENCE(Subtarget)) { + Function *MFence = llvm::Intrinsic::getDeclaration(M, + Intrinsic::x86_sse2_mfence); + Builder.CreateCall(MFence); + } else { + // FIXME: it might make sense to use a locked operation here but on a + // different cache-line to prevent cache-line bouncing. In practice it + // is probably a small win, and x86 processors without mfence are rare + // enough that we do not bother. + return nullptr; + } + + // Finally we can emit the atomic load. + LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr, + AI->getType()->getPrimitiveSizeInBits()); + Loaded->setAtomic(Order, SynchScope); + AI->replaceAllUsesWith(Loaded); + AI->eraseFromParent(); + return Loaded; +} + static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc dl(Op); @@ -17323,10 +18457,7 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget, // The only fence that needs an instruction is a sequentially-consistent // cross-thread fence. if (FenceOrdering == SequentiallyConsistent && FenceScope == CrossThread) { - // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for - // no-sse2). There isn't any reason to disable it if the target processor - // supports it. - if (Subtarget->hasSSE2() || Subtarget->is64Bit()) + if (hasMFENCE(*Subtarget)) return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); SDValue Chain = Op.getOperand(0); @@ -17649,29 +18780,6 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { } } -static void ReplaceATOMIC_LOAD(SDNode *Node, - SmallVectorImpl &Results, - SelectionDAG &DAG) { - SDLoc dl(Node); - EVT VT = cast(Node)->getMemoryVT(); - - // Convert wide load -> cmpxchg8b/cmpxchg16b - // FIXME: On 32-bit, load -> fild or movq would be more efficient - // (The only way to get a 16-byte load is cmpxchg16b) - // FIXME: 16-byte ATOMIC_CMP_SWAP isn't actually hooked up at the moment. - SDValue Zero = DAG.getConstant(0, VT); - SDVTList VTs = DAG.getVTList(VT, MVT::i1, MVT::Other); - SDValue Swap = - DAG.getAtomicCmpSwap(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, dl, VT, VTs, - Node->getOperand(0), Node->getOperand(1), Zero, Zero, - cast(Node)->getMemOperand(), - cast(Node)->getOrdering(), - cast(Node)->getOrdering(), - cast(Node)->getSynchScope()); - Results.push_back(Swap.getValue(0)); - Results.push_back(Swap.getValue(2)); -} - /// ReplaceNodeResults - Replace a node with an illegal result type /// with a new node built out of custom code. 
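
// Aside (illustrative sketch, not part of this patch): the store-buffering
// example quoted in lowerIdempotentRMWIntoFencedLoad above, written out as a
// runnable C++11 program. It shows why the patch emits an mfence before
// turning an idempotent atomicrmw into a plain atomic load.
#include <atomic>
#include <thread>

std::atomic<int> x{0}, y{0};
int r1, r2;

void thread0() {
  x.store(1, std::memory_order_relaxed);
  r1 = y.fetch_add(0, std::memory_order_release); // idempotent RMW
}
void thread1() {
  y.fetch_add(42, std::memory_order_acquire);
  r2 = x.load(std::memory_order_relaxed);
}
int main() {
  std::thread t0(thread0), t1(thread1);
  t0.join();
  t1.join();
  // The C++ memory model forbids r1 == 0 && r2 == 0: if thread0's RMW reads 0,
  // it precedes thread1's RMW in y's modification order, so thread1's acquire
  // RMW reads thread0's release write, and the store to x is then guaranteed
  // visible to thread1's load. Lowering the fetch_add(0) to a bare load would
  // let the store to x linger in the store buffer past the load of y on x86;
  // the mfence emitted by the patch rules that out.
  return 0;
}
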
void X86TargetLowering::ReplaceNodeResults(SDNode *N, @@ -17830,12 +18938,10 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, case ISD::ATOMIC_LOAD_MAX: case ISD::ATOMIC_LOAD_UMIN: case ISD::ATOMIC_LOAD_UMAX: + case ISD::ATOMIC_LOAD: { // Delegate to generic TypeLegalization. Situations we can really handle // should have already been dealt with by AtomicExpandPass.cpp. break; - case ISD::ATOMIC_LOAD: { - ReplaceATOMIC_LOAD(N, Results, DAG); - return; } case ISD::BITCAST: { assert(Subtarget->hasSSE2() && "Requires at least SSE2!"); @@ -17918,7 +19024,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::PSHUFB: return "X86ISD::PSHUFB"; case X86ISD::ANDNP: return "X86ISD::ANDNP"; case X86ISD::PSIGN: return "X86ISD::PSIGN"; - case X86ISD::BLENDV: return "X86ISD::BLENDV"; case X86ISD::BLENDI: return "X86ISD::BLENDI"; case X86ISD::SUBUS: return "X86ISD::SUBUS"; case X86ISD::HADD: return "X86ISD::HADD"; @@ -18010,7 +19115,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST"; case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM"; case X86ISD::VEXTRACT: return "X86ISD::VEXTRACT"; - case X86ISD::VPERMILP: return "X86ISD::VPERMILP"; + case X86ISD::VPERMILPI: return "X86ISD::VPERMILPI"; case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128"; case X86ISD::VPERMV: return "X86ISD::VPERMV"; case X86ISD::VPERMV3: return "X86ISD::VPERMV3"; @@ -18877,8 +19982,8 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, } MachineBasicBlock * -X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB, - bool Is64Bit) const { +X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, + MachineBasicBlock *BB) const { MachineFunction *MF = BB->getParent(); const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); DebugLoc DL = MI->getDebugLoc(); @@ -18886,8 +19991,11 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB, assert(MF->shouldSplitStack()); - unsigned TlsReg = Is64Bit ? X86::FS : X86::GS; - unsigned TlsOffset = Is64Bit ? 0x70 : 0x30; + const bool Is64Bit = Subtarget->is64Bit(); + const bool IsLP64 = Subtarget->isTarget64BitLP64(); + + const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS; + const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30; // BB: // ... [Till the alloca] @@ -18911,14 +20019,14 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB, MachineRegisterInfo &MRI = MF->getRegInfo(); const TargetRegisterClass *AddrRegClass = - getRegClassFor(Is64Bit ? MVT::i64:MVT::i32); + getRegClassFor(getPointerTy()); unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass), bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass), tmpSPVReg = MRI.createVirtualRegister(AddrRegClass), SPLimitVReg = MRI.createVirtualRegister(AddrRegClass), sizeVReg = MI->getOperand(1).getReg(), - physSPReg = Is64Bit ? X86::RSP : X86::ESP; + physSPReg = IsLP64 || Subtarget->isTargetNaCl64() ? X86::RSP : X86::ESP; MachineFunction::iterator MBBIter = BB; ++MBBIter; @@ -18934,9 +20042,9 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB, // Add code to the main basic block to check if the stack limit has been hit, // and if so, jump to mallocMBB otherwise to bumpMBB. BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg); - BuildMI(BB, DL, TII->get(Is64Bit ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg) + BuildMI(BB, DL, TII->get(IsLP64 ? 
X86::SUB64rr:X86::SUB32rr), SPLimitVReg) .addReg(tmpSPVReg).addReg(sizeVReg); - BuildMI(BB, DL, TII->get(Is64Bit ? X86::CMP64mr:X86::CMP32mr)) + BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr)) .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg) .addReg(SPLimitVReg); BuildMI(BB, DL, TII->get(X86::JG_4)).addMBB(mallocMBB); @@ -18954,7 +20062,7 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB, .getSubtargetImpl() ->getRegisterInfo() ->getCallPreservedMask(CallingConv::C); - if (Is64Bit) { + if (IsLP64) { BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI) .addReg(sizeVReg); BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32)) @@ -18962,6 +20070,14 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB, .addRegMask(RegMask) .addReg(X86::RDI, RegState::Implicit) .addReg(X86::RAX, RegState::ImplicitDefine); + } else if (Is64Bit) { + BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI) + .addReg(sizeVReg); + BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32)) + .addExternalSymbol("__morestack_allocate_stack_space") + .addRegMask(RegMask) + .addReg(X86::EDI, RegState::Implicit) + .addReg(X86::EAX, RegState::ImplicitDefine); } else { BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg) .addImm(12); @@ -18977,7 +20093,7 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB, .addImm(16); BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg) - .addReg(Is64Bit ? X86::RAX : X86::EAX); + .addReg(IsLP64 ? X86::RAX : X86::EAX); BuildMI(mallocMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB); // Set up the CFG correctly. @@ -19411,9 +20527,8 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case X86::WIN_ALLOCA: return EmitLoweredWinAlloca(MI, BB); case X86::SEG_ALLOCA_32: - return EmitLoweredSegAlloca(MI, BB, false); case X86::SEG_ALLOCA_64: - return EmitLoweredSegAlloca(MI, BB, true); + return EmitLoweredSegAlloca(MI, BB); case X86::TLSCall_32: case X86::TLSCall_64: return EmitLoweredTLSCall(MI, BB); @@ -19975,6 +21090,10 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef Mask, assert(Mask.size() <= 16 && "Can't shuffle elements smaller than bytes!"); int Ratio = 16 / Mask.size(); for (unsigned i = 0; i < 16; ++i) { + if (Mask[i / Ratio] == SM_SentinelUndef) { + PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8)); + continue; + } int M = Mask[i / Ratio] != SM_SentinelZero ? Ratio * Mask[i / Ratio] + i % Ratio : 255; @@ -20085,17 +21204,18 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root, // for this order is that we are recursing up the operation chain. for (int i = 0, e = std::max(OpMask.size(), RootMask.size()); i < e; ++i) { int RootIdx = i / RootRatio; - if (RootMask[RootIdx] == SM_SentinelZero) { - // This is a zero-ed lane, we're done. - Mask.push_back(SM_SentinelZero); + if (RootMask[RootIdx] < 0) { + // This is a zero or undef lane, we're done. + Mask.push_back(RootMask[RootIdx]); continue; } int RootMaskedIdx = RootMask[RootIdx] * RootRatio + i % RootRatio; int OpIdx = RootMaskedIdx / OpRatio; - if (OpMask[OpIdx] == SM_SentinelZero) { - // The incoming lanes are zero, it doesn't matter which ones we are using. - Mask.push_back(SM_SentinelZero); + if (OpMask[OpIdx] < 0) { + // The incoming lanes are zero or undef, it doesn't matter which ones we + // are using. 
+ Mask.push_back(OpMask[OpIdx]); continue; } @@ -20133,10 +21253,10 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root, // elements, and shrink them to the half-width mask. It does this in a loop // so it will reduce the size of the mask to the minimal width mask which // performs an equivalent shuffle. - while (Mask.size() > 1 && canWidenShuffleElements(Mask)) { - for (int i = 0, e = Mask.size() / 2; i < e; ++i) - Mask[i] = Mask[2 * i] / 2; - Mask.resize(Mask.size() / 2); + SmallVector WidenedMask; + while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) { + Mask = std::move(WidenedMask); + WidenedMask.clear(); } return combineX86ShuffleChain(Op, Root, Mask, Depth, HasPSHUFB, DAG, DCI, @@ -20407,12 +21527,13 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG, return SDValue(); // We combined away this shuffle, so we're done. // See if this reduces to a PSHUFD which is no more expensive and can - // combine with more operations. - if (canWidenShuffleElements(Mask)) { - int DMask[] = {-1, -1, -1, -1}; + // combine with more operations. Note that it has to at least flip the + // dwords as otherwise it would have been removed as a no-op. + if (Mask[0] == 2 && Mask[1] == 3 && Mask[2] == 0 && Mask[3] == 1) { + int DMask[] = {0, 1, 2, 3}; int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2; - DMask[DOffset + 0] = DOffset + Mask[0] / 2; - DMask[DOffset + 1] = DOffset + Mask[2] / 2; + DMask[DOffset + 0] = DOffset + 1; + DMask[DOffset + 1] = DOffset + 0; V = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V); DCI.AddToWorklist(V.getNode()); V = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V, @@ -20654,7 +21775,7 @@ static SDValue PerformTruncateCombine(SDNode *N, SelectionDAG &DAG, /// XFormVExtractWithShuffleIntoLoad - Check if a vector extract from a target /// specific shuffle of a load can be folded into a single element load. /// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but -/// shuffles have been customed lowered so we need to handle those here. +/// shuffles have been custom lowered so we need to handle those here. static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { if (DCI.isBeforeLegalizeOps()) @@ -20666,18 +21787,20 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, if (!isa(EltNo)) return SDValue(); - EVT VT = InVec.getValueType(); + EVT OriginalVT = InVec.getValueType(); if (InVec.getOpcode() == ISD::BITCAST) { // Don't duplicate a load with other uses. if (!InVec.hasOneUse()) return SDValue(); EVT BCVT = InVec.getOperand(0).getValueType(); - if (BCVT.getVectorNumElements() != VT.getVectorNumElements()) + if (BCVT.getVectorNumElements() != OriginalVT.getVectorNumElements()) return SDValue(); InVec = InVec.getOperand(0); } + EVT CurrentVT = InVec.getValueType(); + if (!isTargetShuffle(InVec.getOpcode())) return SDValue(); @@ -20687,12 +21810,12 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, SmallVector ShuffleMask; bool UnaryShuffle; - if (!getTargetShuffleMask(InVec.getNode(), VT.getSimpleVT(), ShuffleMask, - UnaryShuffle)) + if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), + ShuffleMask, UnaryShuffle)) return SDValue(); // Select the input vector, guarding against out of range extract vector. 
- unsigned NumElems = VT.getVectorNumElements(); + unsigned NumElems = CurrentVT.getVectorNumElements(); int Elt = cast(EltNo)->getZExtValue(); int Idx = (Elt > (int)NumElems) ? -1 : ShuffleMask[Elt]; SDValue LdNode = (Idx < (int)NumElems) ? InVec.getOperand(0) @@ -20734,11 +21857,12 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, SDLoc dl(N); // Create shuffle node taking into account the case that its a unary shuffle - SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(VT) : InVec.getOperand(1); - Shuffle = DAG.getVectorShuffle(InVec.getValueType(), dl, + SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT) + : InVec.getOperand(1); + Shuffle = DAG.getVectorShuffle(CurrentVT, dl, InVec.getOperand(0), Shuffle, &ShuffleMask[0]); - Shuffle = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle); + Shuffle = DAG.getNode(ISD::BITCAST, dl, OriginalVT, Shuffle); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle, EltNo); } @@ -21511,7 +22635,12 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(), DCI.isBeforeLegalizeOps()); if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) || - TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne, TLO)) + (TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne, + TLO) && + // Don't optimize vector of constants. Those are handled by + // the generic code and all the bits must be properly set for + // the generic optimizer. + !ISD::isBuildVectorOfConstantSDNodes(TLO.New.getNode()))) DCI.CommitTargetLoweringOpt(TLO); } @@ -23617,18 +24746,68 @@ static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG, /// performVZEXTCombine - Performs build vector combines static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const X86Subtarget *Subtarget) { + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + SDLoc DL(N); + MVT VT = N->getSimpleValueType(0); + SDValue Op = N->getOperand(0); + MVT OpVT = Op.getSimpleValueType(); + MVT OpEltVT = OpVT.getVectorElementType(); + unsigned InputBits = OpEltVT.getSizeInBits() * VT.getVectorNumElements(); + // (vzext (bitcast (vzext (x)) -> (vzext x) - SDValue In = N->getOperand(0); - while (In.getOpcode() == ISD::BITCAST) - In = In.getOperand(0); + SDValue V = Op; + while (V.getOpcode() == ISD::BITCAST) + V = V.getOperand(0); - if (In.getOpcode() != X86ISD::VZEXT) - return SDValue(); + if (V != Op && V.getOpcode() == X86ISD::VZEXT) { + MVT InnerVT = V.getSimpleValueType(); + MVT InnerEltVT = InnerVT.getVectorElementType(); + + // If the element sizes match exactly, we can just do one larger vzext. This + // is always an exact type match as vzext operates on integer types. + if (OpEltVT == InnerEltVT) { + assert(OpVT == InnerVT && "Types must match for vzext!"); + return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0)); + } + + // The only other way we can combine them is if only a single element of the + // inner vzext is used in the input to the outer vzext. + if (InnerEltVT.getSizeInBits() < InputBits) + return SDValue(); + + // In this case, the inner vzext is completely dead because we're going to + // only look at bits inside of the low element. Just do the outer vzext on + // a bitcast of the input to the inner. + return DAG.getNode(X86ISD::VZEXT, DL, VT, + DAG.getNode(ISD::BITCAST, DL, OpVT, V)); + } + + // Check if we can bypass extracting and re-inserting an element of an input + // vector. 
Essentialy: + // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x) + if (V.getOpcode() == ISD::SCALAR_TO_VECTOR && + V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT && + V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) { + SDValue ExtractedV = V.getOperand(0); + SDValue OrigV = ExtractedV.getOperand(0); + if (auto *ExtractIdx = dyn_cast(ExtractedV.getOperand(1))) + if (ExtractIdx->getZExtValue() == 0) { + MVT OrigVT = OrigV.getSimpleValueType(); + // Extract a subvector if necessary... + if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) { + int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits(); + OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(), + OrigVT.getVectorNumElements() / Ratio); + OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV, + DAG.getIntPtrConstant(0)); + } + Op = DAG.getNode(ISD::BITCAST, DL, OpVT, OrigV); + return DAG.getNode(X86ISD::VZEXT, DL, VT, Op); + } + } - return DAG.getNode(X86ISD::VZEXT, SDLoc(N), N->getValueType(0), - In.getOperand(0)); + return SDValue(); } SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, @@ -23686,7 +24865,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::PSHUFLW: case X86ISD::MOVSS: case X86ISD::MOVSD: - case X86ISD::VPERMILP: + case X86ISD::VPERMILPI: case X86ISD::VPERM2X128: case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget); case ISD::FMA: return PerformFMACombine(N, DAG, Subtarget);
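
The EmitLoweredSegAlloca hunks above stop threading an Is64Bit flag through the call and instead derive the ABI from the subtarget, so the segmented-stack limit lives in a different TLS slot per ABI: %fs:0x70 for LP64, %fs:0x40 for x32, %gs:0x30 for classic 32-bit. Below is a rough standalone sketch of just that slot selection; the enum, struct, and function names are illustrative stand-ins, not LLVM API, and only the register/offset values are taken from the diff.

#include <cstdio>

// ABI-dependent location of the segmented-stack limit word, as encoded in the
// EmitLoweredSegAlloca hunk: LP64 -> %fs:0x70, x32 -> %fs:0x40, ILP32 -> %gs:0x30.
enum class X86ABI { ILP32, X32, LP64 };

struct TlsSlot {
  const char *SegmentReg; // segment register that bases the thread-local block
  unsigned Offset;        // offset of the stack-limit field within that block
};

static TlsSlot getStackLimitSlot(X86ABI ABI) {
  switch (ABI) {
  case X86ABI::LP64:  return {"fs", 0x70}; // 64-bit pointers
  case X86ABI::X32:   return {"fs", 0x40}; // 32-bit pointers on a 64-bit core
  case X86ABI::ILP32: return {"gs", 0x30}; // classic 32-bit
  }
  return {"", 0};
}

int main() {
  // The lowered block compares the adjusted stack pointer against the value
  // stored at this segment:offset slot and branches to the allocation path
  // when the new stack pointer would fall below the limit.
  TlsSlot S = getStackLimitSlot(X86ABI::X32);
  std::printf("stack limit word at %%%s:0x%x\n", S.SegmentReg, S.Offset);
  return 0;
}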
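The combineX86ShuffleChain hunk teaches the PSHUFB fallback to emit an undef byte for SM_SentinelUndef lanes instead of forcing a concrete selector. The following self-contained sketch mirrors that byte-mask expansion; the function name and the -1/-2 sentinel constants are stand-ins for the DAG nodes and for SM_SentinelUndef/SM_SentinelZero used in the patch.

#include <cstdio>
#include <vector>

static const int SentinelUndef = -1; // stand-in for SM_SentinelUndef
static const int SentinelZero  = -2; // stand-in for SM_SentinelZero

// Expand an element-level shuffle mask over a 128-bit vector into the 16 byte
// selectors PSHUFB consumes, mirroring the loop in combineX86ShuffleChain.
static std::vector<int> expandToPSHUFBMask(const std::vector<int> &Mask) {
  std::vector<int> ByteMask;
  int Ratio = 16 / static_cast<int>(Mask.size()); // bytes per mask element
  for (int i = 0; i < 16; ++i) {
    int M = Mask[i / Ratio];
    if (M == SentinelUndef)
      ByteMask.push_back(-1);  // undef lane: any byte works (patch emits UNDEF i8)
    else if (M == SentinelZero)
      ByteMask.push_back(255); // bit 7 set makes PSHUFB zero the byte (patch uses 255)
    else
      ByteMask.push_back(Ratio * M + i % Ratio); // i%Ratio-th byte of element M
  }
  return ByteMask;
}

int main() {
  // A 4-element mask <1, zero, 3, undef> expands to:
  // 4 5 6 7  255 255 255 255  12 13 14 15  -1 -1 -1 -1
  for (int B : expandToPSHUFBMask({1, SentinelZero, 3, SentinelUndef}))
    std::printf("%d ", B);
  std::printf("\n");
  return 0;
}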
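The combineX86ShufflesRecursively hunk replaces the old in-place halving loop with repeated calls to canWidenShuffleElements so that undef and zero sentinels survive widening. The sketch below is loosely modelled on that behaviour rather than on the real helper: widenShuffleMask and its merge rules are illustrative, and the -1/-2 sentinels again stand in for SM_SentinelUndef/SM_SentinelZero.

#include <cstdio>
#include <vector>

static const int SentinelUndef = -1; // stand-in for SM_SentinelUndef
static const int SentinelZero  = -2; // stand-in for SM_SentinelZero

// Try to express a mask over N elements as one over N/2 elements that are each
// twice as wide. Returns false when some pair of lanes cannot be merged.
static bool widenShuffleMask(const std::vector<int> &Mask,
                             std::vector<int> &Widened) {
  Widened.clear();
  if (Mask.size() % 2 != 0)
    return false;
  for (size_t i = 0; i < Mask.size(); i += 2) {
    int Lo = Mask[i], Hi = Mask[i + 1];
    if (Lo == SentinelZero && Hi == SentinelZero)
      Widened.push_back(SentinelZero);   // both halves zero -> wide zero
    else if (Lo == SentinelUndef && Hi == SentinelUndef)
      Widened.push_back(SentinelUndef);  // both halves undef -> wide undef
    else if (Lo >= 0 && Lo % 2 == 0 && Hi == Lo + 1)
      Widened.push_back(Lo / 2);         // consecutive halves of one wide element
    else if (Lo == SentinelUndef && Hi >= 0 && Hi % 2 == 1)
      Widened.push_back(Hi / 2);         // undef low half, defined high half
    else if (Hi == SentinelUndef && Lo >= 0 && Lo % 2 == 0)
      Widened.push_back(Lo / 2);         // defined low half, undef high half
    else
      return false;                      // pair straddles wide elements
  }
  return true;
}

int main() {
  // A v8i16-style mask <2,3,u,u,6,7,z,z> widens once to <1,u,3,z> and then
  // stops, because lane 1 is not the low half of any wider element.
  std::vector<int> Mask = {2, 3, SentinelUndef, SentinelUndef,
                           6, 7, SentinelZero, SentinelZero};
  std::vector<int> Widened;
  while (Mask.size() > 1 && widenShuffleMask(Mask, Widened))
    Mask = Widened;                      // keep shrinking, as the patch does
  for (int M : Mask)
    std::printf("%d ", M);
  std::printf("\n");                     // prints: 1 -1 3 -2
  return 0;
}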