UI != UE; ++UI) {
if (UI->getOpcode() != X86ISD::RET_FLAG)
return false;
+ // If we are returning more than one value, we can definitely
+ // not make a tail call; see PR19530.
+ if (UI->getNumOperands() > 4)
+ return false;
+ if (UI->getNumOperands() == 4 &&
+ UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
+ return false;
HasRet = true;
}
CCInfo.AllocateStack(32, 8);
CCInfo.AnalyzeFormalArguments(Ins, CC_X86);
+ CCInfo.AlignStack(Is64Bit ? 8 : 4);
unsigned LastVal = ~0U;
SDValue ArgValue;
return true;
}
+// Hide this symbol with an anonymous namespace instead of 'static' so that MSVC
+// 2013 will allow us to use it as a non-type template parameter.
+namespace {
+
+/// \brief Implementation of the \c isShuffleEquivalent variadic functor.
+///
+/// Matches if sizes agree and each mask element is -1 or equals its argument.
+bool isShuffleEquivalentImpl(ArrayRef<int> Mask, ArrayRef<const int *> Args) {
+ if (Mask.size() != Args.size())
+ return false; // Pattern and mask must have the same width.
+ for (int i = 0, e = Mask.size(); i < e; ++i) {
+ assert(*Args[i] >= 0 && "Arguments must be positive integers!"); // 0 is OK.
+ assert(*Args[i] < (int)Args.size() * 2 &&
+ "Argument outside the range of possible shuffle inputs!");
+ if (Mask[i] != -1 && Mask[i] != *Args[i]) // -1 matches any argument.
+ return false;
+ }
+ return true;
+}
+
+} // namespace
+
+/// \brief Checks whether a shuffle mask is equivalent to an explicit list of
+/// arguments.
+///
+/// This is a fast way to test a shuffle mask against a fixed pattern:
+///
+/// if (isShuffleEquivalent(Mask, 3, 2, 1, 0)) { ... }
+///
+/// It returns true if the mask is exactly as wide as the argument list, and
+/// each element of the mask is either -1 (signifying undef) or the value given
+/// in that position's argument. Arguments are asserted to be in [0, 2 * count).
+static const VariadicFunction1<
+ bool, ArrayRef<int>, int, isShuffleEquivalentImpl> isShuffleEquivalent = {};
+
/// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
///
/// This helper function produces an 8-bit shuffle immediate corresponding to
assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
assert(Mask[1] >= 2 && "Non-canonicalized blend!");
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (isShuffleEquivalent(Mask, 0, 2))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2f64, V1, V2);
+ if (isShuffleEquivalent(Mask, 1, 3))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2f64, V1, V2);
+
unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V2,
DAG.getConstant(SHUFPDMask, MVT::i8));
getV4X86ShuffleImm8ForMask(WidenedMask, DAG)));
}
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (isShuffleEquivalent(Mask, 0, 2))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, V1, V2);
+ if (isShuffleEquivalent(Mask, 1, 3))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2i64, V1, V2);
+
// We implement this with SHUFPD which is pretty lame because it will likely
// incur 2 cycles of stall for integer vectors on Nehalem and older chips.
// However, all the alternatives are still more cycles and newer chips don't
return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
getV4X86ShuffleImm8ForMask(Mask, DAG));
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (isShuffleEquivalent(Mask, 0, 4, 1, 5))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V1, V2);
+ if (isShuffleEquivalent(Mask, 2, 6, 3, 7))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V1, V2);
+
if (NumV2Elements == 1) {
int V2Index =
std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) -
return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
getV4X86ShuffleImm8ForMask(Mask, DAG));
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (isShuffleEquivalent(Mask, 0, 4, 1, 5))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V1, V2);
+ if (isShuffleEquivalent(Mask, 2, 6, 3, 7))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V1, V2);
+
// We implement this with SHUFPS because it can blend from two vectors.
// Because we're going to eventually use SHUFPS, we use SHUFPS even to build
// up the inputs, bypassing domain shift penalties that we would incur if we
}
}
+static bool isHalfCrossingShuffleMask(ArrayRef<int> Mask) { // Any half crossing?
+ int Size = Mask.size(); // M % Size is an element's position within its input.
+ for (int M : Mask.slice(0, Size / 2)) // Low half of the result.
+ if (M >= 0 && (M % Size) >= Size / 2) // Sourced from an input's high half.
+ return true;
+ for (int M : Mask.slice(Size / 2, Size / 2)) // High half of the result.
+ if (M >= 0 && (M % Size) < Size / 2) // Sourced from an input's low half.
+ return true;
+ return false; // Negative mask entries never count as crossing.
+}
+
+/// \brief Generic routine to split a 256-bit vector shuffle into 128-bit
+/// shuffles.
+///
+/// There is a severely limited set of shuffles available in AVX1 for 256-bit
+/// vectors resulting in routinely needing to split the shuffle into two 128-bit
+/// shuffles. This can be done generically for any 256-bit vector shuffle and so
+/// we encode the logic here for specific shuffle lowering routines to bail to
+/// when they exhaust the features available to more directly handle the shuffle.
+static SDValue splitAndLower256BitVectorShuffle(SDValue Op, SDValue V1,
+ SDValue V2,
+ const X86Subtarget *Subtarget, // Not used by this routine.
+ SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ MVT VT = Op.getSimpleValueType();
+ assert(VT.getSizeInBits() == 256 && "Only for 256-bit vector shuffles!");
+ assert(V1.getSimpleValueType() == VT && "Bad operand type!");
+ assert(V2.getSimpleValueType() == VT && "Bad operand type!");
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+ ArrayRef<int> Mask = SVOp->getMask();
+
+ ArrayRef<int> LoMask = Mask.slice(0, Mask.size()/2);
+ ArrayRef<int> HiMask = Mask.slice(Mask.size()/2);
+
+ int NumElements = VT.getVectorNumElements();
+ int SplitNumElements = NumElements / 2;
+ MVT ScalarVT = VT.getScalarType();
+ MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);
+
+ SDValue LoV1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V1,
+ DAG.getIntPtrConstant(0));
+ SDValue HiV1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V1,
+ DAG.getIntPtrConstant(SplitNumElements));
+ SDValue LoV2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V2,
+ DAG.getIntPtrConstant(0));
+ SDValue HiV2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V2,
+ DAG.getIntPtrConstant(SplitNumElements));
+
+ // Build each result half by shuffling within V1, within V2, then blending.
+ auto HalfBlend = [&](ArrayRef<int> HalfMask) {
+ SmallVector<int, 16> V1BlendMask, V2BlendMask, BlendMask;
+ for (int i = 0; i < SplitNumElements; ++i) {
+ int M = HalfMask[i];
+ if (M >= NumElements) { // Element comes from V2.
+ V2BlendMask.push_back(M - NumElements); // Rebase to index LoV2:HiV2.
+ V1BlendMask.push_back(-1);
+ BlendMask.push_back(SplitNumElements + i); // Take lane i of V2Blend.
+ } else if (M >= 0) { // Element comes from V1.
+ V2BlendMask.push_back(-1);
+ V1BlendMask.push_back(M); // Already an index into LoV1:HiV1.
+ BlendMask.push_back(i); // Take lane i of V1Blend.
+ } else { // Negative entry: leave all three masks at -1 for this lane.
+ V2BlendMask.push_back(-1);
+ V1BlendMask.push_back(-1);
+ BlendMask.push_back(-1);
+ }
+ }
+ SDValue V1Blend = DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
+ SDValue V2Blend = DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
+ return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
+ };
+ SDValue Lo = HalfBlend(LoMask);
+ SDValue Hi = HalfBlend(HiMask);
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
+}
+
+/// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
+///
+/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
+/// isn't available.
+static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+ ArrayRef<int> Mask = SVOp->getMask();
+ assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
+
+ // FIXME: If we have AVX2, we should delegate to generic code as crossing
+ // shuffles aren't a problem and FP and int have the same patterns.
+
+ // FIXME: We can handle these more cleverly than splitting for v4f64.
+ if (isHalfCrossingShuffleMask(Mask))
+ return splitAndLower256BitVectorShuffle(Op, V1, V2, Subtarget, DAG);
+
+ if (isSingleInputShuffleMask(Mask)) {
+ // Non-half-crossing single input shuffles can be lowered with an
+ // interleaved permutation.
+ unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
+ ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
+ return DAG.getNode(X86ISD::VPERMILP, DL, MVT::v4f64, V1,
+ DAG.getConstant(VPERMILPMask, MVT::i8));
+ }
+
+ // X86 has dedicated unpack instructions that can handle specific blend
+ // operations: UNPCKH and UNPCKL.
+ if (isShuffleEquivalent(Mask, 0, 4, 2, 6))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f64, V1, V2);
+ if (isShuffleEquivalent(Mask, 1, 5, 3, 7))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V1, V2);
+ // FIXME: It would be nice to find a way to get canonicalization to commute
+ // these patterns.
+ if (isShuffleEquivalent(Mask, 4, 0, 6, 2))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f64, V2, V1);
+ if (isShuffleEquivalent(Mask, 5, 1, 7, 3))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V2, V1);
+
+ // Check if the blend happens to exactly fit that of SHUFPD.
+ if (Mask[0] < 4 && (Mask[1] == -1 || Mask[1] >= 4) &&
+ Mask[2] < 4 && (Mask[3] == -1 || Mask[3] >= 4)) {
+ unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 5) << 1) |
+ ((Mask[2] == 3) << 2) | ((Mask[3] == 7) << 3);
+ return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f64, V1, V2,
+ DAG.getConstant(SHUFPDMask, MVT::i8));
+ }
+ if ((Mask[0] == -1 || Mask[0] >= 4) && Mask[1] < 4 && // Same, inputs swapped.
+ (Mask[2] == -1 || Mask[2] >= 4) && Mask[3] < 4) {
+ unsigned SHUFPDMask = (Mask[0] == 5) | ((Mask[1] == 1) << 1) |
+ ((Mask[2] == 7) << 2) | ((Mask[3] == 3) << 3);
+ return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f64, V2, V1,
+ DAG.getConstant(SHUFPDMask, MVT::i8));
+ }
+
+ // Shuffle the input elements into the desired positions in V1 and V2 and
+ // blend them together.
+ int V1Mask[] = {-1, -1, -1, -1};
+ int V2Mask[] = {-1, -1, -1, -1};
+ for (int i = 0; i < 4; ++i)
+ if (Mask[i] >= 0 && Mask[i] < 4) // Element taken from V1.
+ V1Mask[i] = Mask[i];
+ else if (Mask[i] >= 4) // Element taken from V2.
+ V2Mask[i] = Mask[i] - 4; // Rebase to V2's own element numbering.
+
+ V1 = DAG.getVectorShuffle(MVT::v4f64, DL, V1, DAG.getUNDEF(MVT::v4f64), V1Mask);
+ V2 = DAG.getVectorShuffle(MVT::v4f64, DL, V2, DAG.getUNDEF(MVT::v4f64), V2Mask);
+
+ unsigned BlendMask = 0;
+ for (int i = 0; i < 4; ++i)
+ if (Mask[i] >= 4)
+ BlendMask |= 1 << i; // Bit i marks result element i as V2-sourced.
+
+ return DAG.getNode(X86ISD::BLENDI, DL, MVT::v4f64, V1, V2,
+ DAG.getConstant(BlendMask, MVT::i8));
+}
+
+/// \brief Handle lowering of 4-lane 64-bit integer shuffles.
+///
+/// Half-crossing masks are split into 128-bit shuffles; everything else is
+/// bitcast to v4f64 and delegated to the floating-point lowering.
+static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ assert(Op.getSimpleValueType() == MVT::v4i64 && "Bad shuffle type!");
+ assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+ ArrayRef<int> Mask = SVOp->getMask();
+ assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
+
+ // FIXME: If we have AVX2, we should delegate to generic code as crossing
+ // shuffles aren't a problem and FP and int have the same patterns.
+
+ if (isHalfCrossingShuffleMask(Mask))
+ return splitAndLower256BitVectorShuffle(Op, V1, V2, Subtarget, DAG);
+
+ // AVX1 doesn't provide any facilities for v4i64 shuffles, bitcast and
+ // delegate to floating point code.
+ V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v4f64, V1);
+ V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v4f64, V2);
+ return DAG.getNode(ISD::BITCAST, DL, MVT::v4i64, // Cast result back to v4i64.
+ lowerV4F64VectorShuffle(Op, V1, V2, Subtarget, DAG));
+}
+
+/// \brief High-level routine to lower various 256-bit x86 vector shuffles.
+///
+/// This routine dispatches on \p VT: v4f64 and v4i64 have dedicated lowering
+/// routines, while every other 256-bit type is split into two 128-bit shuffles
+/// whose results are fused back together.
+static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+ MVT VT, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ switch (VT.SimpleTy) {
+ case MVT::v4f64:
+ return lowerV4F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
+ case MVT::v4i64:
+ return lowerV4I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
+ case MVT::v8i32:
+ case MVT::v8f32:
+ case MVT::v16i16:
+ case MVT::v32i8:
+ // Fall back to the basic pattern of extracting the high half and forming
+ // a 4-way blend.
+ // FIXME: Add targeted lowering for each type that can document rationale
+ // for delegating to this when necessary.
+ return splitAndLower256BitVectorShuffle(Op, V1, V2, Subtarget, DAG);
+
+ default:
+ llvm_unreachable("Not a valid 256-bit x86 vector type!");
+ }
+}
+
/// \brief Tiny helper function to test whether a shuffle mask could be
/// simplified by widening the elements being shuffled.
static bool canWidenShuffleElements(ArrayRef<int> Mask) {
if (VT.getSizeInBits() == 128)
return lower128BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
+ if (VT.getSizeInBits() == 256)
+ return lower256BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
+
llvm_unreachable("Unimplemented!");
}
}
SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
+ // A vselect where all conditions and data are constants can be optimized into
+ // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
+ if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) &&
+ ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) &&
+ ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
+ return SDValue();
+
SDValue BlendOp = LowerVSELECTtoBlend(Op, Subtarget, DAG);
if (BlendOp.getNode())
return BlendOp;
if (VT == MVT::i1) {
assert((InVT.isInteger() && (InVT.getSizeInBits() <= 64)) &&
"Invalid scalar TRUNCATE operation");
- if (InVT == MVT::i32)
+ if (InVT.getSizeInBits() >= 32)
return SDValue();
- if (InVT.getSizeInBits() == 64)
- In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::i32, In);
- else if (InVT.getSizeInBits() < 32)
- In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In);
+ In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In);
return DAG.getNode(ISD::TRUNCATE, DL, VT, In);
}
assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
}
+/// \brief Return (vselect \p Mask, \p Op, \p PreservedSrc) along with the
+/// necessary casting for \p Mask when lowering masking intrinsics.
+static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
+ SDValue PreservedSrc, SelectionDAG &DAG) {
+ EVT VT = Op.getValueType();
+ EVT MaskVT = EVT::getVectorVT(*DAG.getContext(),
+ MVT::i1, VT.getVectorNumElements()); // One i1 per element of Op.
+ SDLoc dl(Op);
+
+ assert(MaskVT.isSimple() && "invalid mask type");
+ return DAG.getNode(ISD::VSELECT, dl, VT,
+ DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask), // Mask as an i1 vector.
+ Op, PreservedSrc);
+}
+
+static unsigned getOpcodeForFMAIntrinsic(unsigned IntNo) { // Intrinsic -> X86ISD.
+ switch (IntNo) {
+ default: llvm_unreachable("Impossible intrinsic"); // Only FMA IDs are valid.
+ case Intrinsic::x86_fma_vfmadd_ps:
+ case Intrinsic::x86_fma_vfmadd_pd:
+ case Intrinsic::x86_fma_vfmadd_ps_256:
+ case Intrinsic::x86_fma_vfmadd_pd_256:
+ case Intrinsic::x86_fma_mask_vfmadd_ps_512:
+ case Intrinsic::x86_fma_mask_vfmadd_pd_512:
+ return X86ISD::FMADD;
+ case Intrinsic::x86_fma_vfmsub_ps:
+ case Intrinsic::x86_fma_vfmsub_pd:
+ case Intrinsic::x86_fma_vfmsub_ps_256:
+ case Intrinsic::x86_fma_vfmsub_pd_256:
+ case Intrinsic::x86_fma_mask_vfmsub_ps_512:
+ case Intrinsic::x86_fma_mask_vfmsub_pd_512:
+ return X86ISD::FMSUB;
+ case Intrinsic::x86_fma_vfnmadd_ps:
+ case Intrinsic::x86_fma_vfnmadd_pd:
+ case Intrinsic::x86_fma_vfnmadd_ps_256:
+ case Intrinsic::x86_fma_vfnmadd_pd_256:
+ case Intrinsic::x86_fma_mask_vfnmadd_ps_512:
+ case Intrinsic::x86_fma_mask_vfnmadd_pd_512:
+ return X86ISD::FNMADD;
+ case Intrinsic::x86_fma_vfnmsub_ps:
+ case Intrinsic::x86_fma_vfnmsub_pd:
+ case Intrinsic::x86_fma_vfnmsub_ps_256:
+ case Intrinsic::x86_fma_vfnmsub_pd_256:
+ case Intrinsic::x86_fma_mask_vfnmsub_ps_512:
+ case Intrinsic::x86_fma_mask_vfnmsub_pd_512:
+ return X86ISD::FNMSUB;
+ case Intrinsic::x86_fma_vfmaddsub_ps:
+ case Intrinsic::x86_fma_vfmaddsub_pd:
+ case Intrinsic::x86_fma_vfmaddsub_ps_256:
+ case Intrinsic::x86_fma_vfmaddsub_pd_256:
+ case Intrinsic::x86_fma_mask_vfmaddsub_ps_512:
+ case Intrinsic::x86_fma_mask_vfmaddsub_pd_512:
+ return X86ISD::FMADDSUB;
+ case Intrinsic::x86_fma_vfmsubadd_ps:
+ case Intrinsic::x86_fma_vfmsubadd_pd:
+ case Intrinsic::x86_fma_vfmsubadd_ps_256:
+ case Intrinsic::x86_fma_vfmsubadd_pd_256:
+ case Intrinsic::x86_fma_mask_vfmsubadd_ps_512:
+ case Intrinsic::x86_fma_mask_vfmsubadd_pd_512:
+ return X86ISD::FMSUBADD;
+ }
+}
+
static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
SDLoc dl(Op);
unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
return DAG.getNode(ISD::FSQRT, dl, Op.getValueType(), Op.getOperand(1));
case Intrinsic::x86_avx512_mask_valign_q_512:
- case Intrinsic::x86_avx512_mask_valign_d_512: {
- EVT VT = Op.getValueType();
- EVT MaskVT = EVT::getVectorVT(*DAG.getContext(),
- MVT::i1, VT.getVectorNumElements());
- assert(MaskVT.isSimple() && "invalid valign mask type");
+ case Intrinsic::x86_avx512_mask_valign_d_512:
// Vector source operands are swapped.
- return DAG.getNode(ISD::VSELECT, dl, VT,
- DAG.getNode(ISD::BITCAST, dl, MaskVT,
- Op.getOperand(5)),
- DAG.getNode(X86ISD::VALIGN, dl, VT,
- Op.getOperand(2), Op.getOperand(1),
- Op.getOperand(3)),
- Op.getOperand(4));
- }
+ return getVectorMaskingNode(DAG.getNode(X86ISD::VALIGN, dl,
+ Op.getValueType(), Op.getOperand(2),
+ Op.getOperand(1),
+ Op.getOperand(3)),
+ Op.getOperand(5), Op.getOperand(4), DAG);
// ptest and testp intrinsics. The intrinsic these come from are designed to
// return an integer value, not just an instruction so lower it to the ptest
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
return DAG.getNode(Opcode, dl, VTs, NewOps);
}
+
+ case Intrinsic::x86_fma_mask_vfmadd_ps_512:
+ case Intrinsic::x86_fma_mask_vfmadd_pd_512:
+ case Intrinsic::x86_fma_mask_vfmsub_ps_512:
+ case Intrinsic::x86_fma_mask_vfmsub_pd_512:
+ case Intrinsic::x86_fma_mask_vfnmadd_ps_512:
+ case Intrinsic::x86_fma_mask_vfnmadd_pd_512:
+ case Intrinsic::x86_fma_mask_vfnmsub_ps_512:
+ case Intrinsic::x86_fma_mask_vfnmsub_pd_512:
+ case Intrinsic::x86_fma_mask_vfmaddsub_ps_512:
+ case Intrinsic::x86_fma_mask_vfmaddsub_pd_512:
+ case Intrinsic::x86_fma_mask_vfmsubadd_ps_512:
+ case Intrinsic::x86_fma_mask_vfmsubadd_pd_512: {
+ auto *SAE = cast<ConstantSDNode>(Op.getOperand(5));
+ if (SAE->getZExtValue() == X86::STATIC_ROUNDING::CUR_DIRECTION)
+ return getVectorMaskingNode(DAG.getNode(getOpcodeForFMAIntrinsic(IntNo),
+ dl, Op.getValueType(),
+ Op.getOperand(1),
+ Op.getOperand(2),
+ Op.getOperand(3)),
+ Op.getOperand(4), Op.getOperand(1), DAG);
+ else
+ return SDValue();
+ }
+
case Intrinsic::x86_fma_vfmadd_ps:
case Intrinsic::x86_fma_vfmadd_pd:
case Intrinsic::x86_fma_vfmsub_ps:
case Intrinsic::x86_fma_vfmaddsub_pd_256:
case Intrinsic::x86_fma_vfmsubadd_ps_256:
case Intrinsic::x86_fma_vfmsubadd_pd_256:
- case Intrinsic::x86_fma_vfmadd_ps_512:
- case Intrinsic::x86_fma_vfmadd_pd_512:
- case Intrinsic::x86_fma_vfmsub_ps_512:
- case Intrinsic::x86_fma_vfmsub_pd_512:
- case Intrinsic::x86_fma_vfnmadd_ps_512:
- case Intrinsic::x86_fma_vfnmadd_pd_512:
- case Intrinsic::x86_fma_vfnmsub_ps_512:
- case Intrinsic::x86_fma_vfnmsub_pd_512:
- case Intrinsic::x86_fma_vfmaddsub_ps_512:
- case Intrinsic::x86_fma_vfmaddsub_pd_512:
- case Intrinsic::x86_fma_vfmsubadd_ps_512:
- case Intrinsic::x86_fma_vfmsubadd_pd_512: {
- unsigned Opc;
- switch (IntNo) {
- default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
- case Intrinsic::x86_fma_vfmadd_ps:
- case Intrinsic::x86_fma_vfmadd_pd:
- case Intrinsic::x86_fma_vfmadd_ps_256:
- case Intrinsic::x86_fma_vfmadd_pd_256:
- case Intrinsic::x86_fma_vfmadd_ps_512:
- case Intrinsic::x86_fma_vfmadd_pd_512:
- Opc = X86ISD::FMADD;
- break;
- case Intrinsic::x86_fma_vfmsub_ps:
- case Intrinsic::x86_fma_vfmsub_pd:
- case Intrinsic::x86_fma_vfmsub_ps_256:
- case Intrinsic::x86_fma_vfmsub_pd_256:
- case Intrinsic::x86_fma_vfmsub_ps_512:
- case Intrinsic::x86_fma_vfmsub_pd_512:
- Opc = X86ISD::FMSUB;
- break;
- case Intrinsic::x86_fma_vfnmadd_ps:
- case Intrinsic::x86_fma_vfnmadd_pd:
- case Intrinsic::x86_fma_vfnmadd_ps_256:
- case Intrinsic::x86_fma_vfnmadd_pd_256:
- case Intrinsic::x86_fma_vfnmadd_ps_512:
- case Intrinsic::x86_fma_vfnmadd_pd_512:
- Opc = X86ISD::FNMADD;
- break;
- case Intrinsic::x86_fma_vfnmsub_ps:
- case Intrinsic::x86_fma_vfnmsub_pd:
- case Intrinsic::x86_fma_vfnmsub_ps_256:
- case Intrinsic::x86_fma_vfnmsub_pd_256:
- case Intrinsic::x86_fma_vfnmsub_ps_512:
- case Intrinsic::x86_fma_vfnmsub_pd_512:
- Opc = X86ISD::FNMSUB;
- break;
- case Intrinsic::x86_fma_vfmaddsub_ps:
- case Intrinsic::x86_fma_vfmaddsub_pd:
- case Intrinsic::x86_fma_vfmaddsub_ps_256:
- case Intrinsic::x86_fma_vfmaddsub_pd_256:
- case Intrinsic::x86_fma_vfmaddsub_ps_512:
- case Intrinsic::x86_fma_vfmaddsub_pd_512:
- Opc = X86ISD::FMADDSUB;
- break;
- case Intrinsic::x86_fma_vfmsubadd_ps:
- case Intrinsic::x86_fma_vfmsubadd_pd:
- case Intrinsic::x86_fma_vfmsubadd_ps_256:
- case Intrinsic::x86_fma_vfmsubadd_pd_256:
- case Intrinsic::x86_fma_vfmsubadd_ps_512:
- case Intrinsic::x86_fma_vfmsubadd_pd_512:
- Opc = X86ISD::FMSUBADD;
- break;
- }
-
- return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
- Op.getOperand(2), Op.getOperand(3));
- }
+ return DAG.getNode(getOpcodeForFMAIntrinsic(IntNo), dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
}
}
} else {
const int HighMask[] = {1, 5, 3, 7};
Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
- const int LowMask[] = {1, 4, 2, 6};
+ const int LowMask[] = {0, 4, 2, 6};
Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
}
assert(Mask.size() <= 16 && "Can't shuffle elements smaller than bytes!");
int Ratio = 16 / Mask.size();
for (unsigned i = 0; i < 16; ++i) {
- int M = Ratio * Mask[i / Ratio] + i % Ratio;
+ int M = Mask[i / Ratio] != SM_SentinelZero
+ ? Ratio * Mask[i / Ratio] + i % Ratio
+ : 255;
PSHUFBMask.push_back(DAG.getConstant(M, MVT::i8));
}
Op = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Input);
/// combine-ordering. To fix this, we should do the redundant instruction
/// combining in this recursive walk.
static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
- ArrayRef<int> IncomingMask, int Depth,
- bool HasPSHUFB, SelectionDAG &DAG,
+ ArrayRef<int> RootMask,
+ int Depth, bool HasPSHUFB,
+ SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget *Subtarget) {
// Bound the depth of our recursive combine because this is ultimately
assert(VT.getVectorNumElements() == OpMask.size() &&
"Different mask size from vector size!");
+ assert(((RootMask.size() > OpMask.size() &&
+ RootMask.size() % OpMask.size() == 0) ||
+ (OpMask.size() > RootMask.size() &&
+ OpMask.size() % RootMask.size() == 0) ||
+ OpMask.size() == RootMask.size()) &&
+ "The smaller number of elements must divide the larger.");
+ int RootRatio = std::max<int>(1, OpMask.size() / RootMask.size());
+ int OpRatio = std::max<int>(1, RootMask.size() / OpMask.size());
+ assert(((RootRatio == 1 && OpRatio == 1) ||
+ (RootRatio == 1) != (OpRatio == 1)) &&
+ "Must not have a ratio for both incoming and op masks!");
SmallVector<int, 16> Mask;
- Mask.reserve(std::max(OpMask.size(), IncomingMask.size()));
-
- // Merge this shuffle operation's mask into our accumulated mask. This is
- // a bit tricky as the shuffle may have a different size from the root.
- if (OpMask.size() == IncomingMask.size()) {
- for (int M : IncomingMask)
- Mask.push_back(OpMask[M]);
- } else if (OpMask.size() < IncomingMask.size()) {
- assert(IncomingMask.size() % OpMask.size() == 0 &&
- "The smaller number of elements must divide the larger.");
- int Ratio = IncomingMask.size() / OpMask.size();
- for (int M : IncomingMask)
- Mask.push_back(Ratio * OpMask[M / Ratio] + M % Ratio);
- } else {
- assert(OpMask.size() > IncomingMask.size() && "All other cases handled!");
- assert(OpMask.size() % IncomingMask.size() == 0 &&
- "The smaller number of elements must divide the larger.");
- int Ratio = OpMask.size() / IncomingMask.size();
- for (int i = 0, e = OpMask.size(); i < e; ++i)
- Mask.push_back(OpMask[Ratio * IncomingMask[i / Ratio] + i % Ratio]);
+ Mask.reserve(std::max(OpMask.size(), RootMask.size()));
+
+ // Merge this shuffle operation's mask into our accumulated mask. Note that
+ // this shuffle's mask will be the first applied to the input, followed by the
+ // root mask to get us all the way to the root value arrangement. The reason
+ // for this order is that we are recursing up the operation chain.
+ for (int i = 0, e = std::max(OpMask.size(), RootMask.size()); i < e; ++i) {
+ int RootIdx = i / RootRatio;
+ if (RootMask[RootIdx] == SM_SentinelZero) {
+ // This is a zero-ed lane, we're done.
+ Mask.push_back(SM_SentinelZero);
+ continue;
+ }
+
+ int RootMaskedIdx = RootMask[RootIdx] * RootRatio + i % RootRatio;
+ int OpIdx = RootMaskedIdx / OpRatio;
+ if (OpMask[OpIdx] == SM_SentinelZero) {
+ // The incoming lanes are zero, it doesn't matter which ones we are using.
+ Mask.push_back(SM_SentinelZero);
+ continue;
+ }
+
+ // Ok, we have non-zero lanes, map them through.
+ Mask.push_back(OpMask[OpIdx] * OpRatio +
+ RootMaskedIdx % OpRatio);
}
// See if we can recurse into the operand to combine more things.
// elements, and shrink them to the half-width mask. It does this in a loop
// so it will reduce the size of the mask to the minimal width mask which
// performs an equivalent shuffle.
- while (Mask.size() > 1) {
- SmallVector<int, 16> NewMask;
- for (int i = 0, e = Mask.size()/2; i < e; ++i) {
- if (Mask[2*i] % 2 != 0 || Mask[2*i] != Mask[2*i + 1] + 1) {
- NewMask.clear();
- break;
- }
- NewMask.push_back(Mask[2*i] / 2);
- }
- if (NewMask.empty())
- break;
- Mask.swap(NewMask);
+ while (Mask.size() > 1 && canWidenShuffleElements(Mask)) {
+ for (int i = 0, e = Mask.size() / 2; i < e; ++i)
+ Mask[i] = Mask[2 * i] / 2;
+ Mask.resize(Mask.size() / 2);
}
return combineX86ShuffleChain(Op, Root, Mask, Depth, HasPSHUFB, DAG, DCI,
if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
return SDValue();
+ // A vselect where all conditions and data are constants can be optimized into
+ // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
+ if (ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
+ ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
+ return SDValue();
+
unsigned MaskValue = 0;
if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
return SDValue();