cl::desc("Enable an experimental vector shuffle lowering code path."),
cl::Hidden);
+static cl::opt<int> ReciprocalEstimateRefinementSteps(
+ "x86-recip-refinement-steps", cl::init(1),
+ cl::desc("Specify the number of Newton-Raphson iterations applied to the "
+ "result of the hardware reciprocal estimate instruction."),
+ cl::NotHidden);
+
// Forward declarations.
static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
SDValue V2);
return Insert256BitVector(V, V2, NumElems/2, DAG, dl);
}
-static TargetLoweringObjectFile *createTLOF(const Triple &TT) {
- if (TT.isOSBinFormatMachO()) {
- if (TT.getArch() == Triple::x86_64)
- return new X86_64MachoTargetObjectFile();
- return new TargetLoweringObjectFileMachO();
- }
-
- if (TT.isOSLinux())
- return new X86LinuxTargetObjectFile();
- if (TT.isOSBinFormatELF())
- return new TargetLoweringObjectFileELF();
- if (TT.isKnownWindowsMSVCEnvironment())
- return new X86WindowsTargetObjectFile();
- if (TT.isOSBinFormatCOFF())
- return new TargetLoweringObjectFileCOFF();
- llvm_unreachable("unknown subtarget type");
-}
-
// FIXME: This should stop caching the target machine as soon as
// we can remove resetOperationActions et al.
X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM)
- : TargetLowering(TM, createTLOF(Triple(TM.getTargetTriple()))) {
+ : TargetLowering(TM) {
Subtarget = &TM.getSubtarget<X86Subtarget>();
X86ScalarSSEf64 = Subtarget->hasSSE2();
X86ScalarSSEf32 = Subtarget->hasSSE1();
setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
// Bypass expensive divides on Atom when compiling with O2
- if (Subtarget->hasSlowDivide() && TM.getOptLevel() >= CodeGenOpt::Default) {
- addBypassSlowDiv(32, 8);
- if (Subtarget->is64Bit())
+ if (TM.getOptLevel() >= CodeGenOpt::Default) {
+ if (Subtarget->hasSlowDivide32())
+ addBypassSlowDiv(32, 8);
+ if (Subtarget->hasSlowDivide64() && Subtarget->is64Bit())
addBypassSlowDiv(64, 16);
}
setOperationAction(ISD::FLOG10, MVT::f80, Expand);
setOperationAction(ISD::FEXP, MVT::f80, Expand);
setOperationAction(ISD::FEXP2, MVT::f80, Expand);
+ setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
+ setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
// First set operation action for all vector types to either promote
// (for widening) or expand (for scalarization). Then we will selectively
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
}
if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) {
setOperationAction(ISD::VSELECT, MVT::v16i16, Custom);
setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
+
+ // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
+ // when we have a 256bit-wide blend with immediate.
+ setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
} else {
setOperationAction(ISD::ADD, MVT::v4i64, Custom);
setOperationAction(ISD::ADD, MVT::v8i32, Custom);
// Extract subvector is special because the value type
// (result) is 128-bit but the source is 256-bit wide.
- if (VT.is128BitVector())
+ if (VT.is128BitVector()) {
+ if (VT.getScalarSizeInBits() >= 32) {
+ setOperationAction(ISD::MLOAD, VT, Custom);
+ setOperationAction(ISD::MSTORE, VT, Custom);
+ }
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
-
+ }
// Do not attempt to custom lower other non-256-bit vectors
if (!VT.is256BitVector())
continue;
+ if (VT.getScalarSizeInBits() >= 32) {
+ setOperationAction(ISD::MLOAD, VT, Legal);
+ setOperationAction(ISD::MSTORE, VT, Legal);
+ }
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v8i1, Custom);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v16i1, Custom);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v16i8, Promote);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v16i16, Promote);
setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
unsigned EltSize = VT.getVectorElementType().getSizeInBits();
// Extract subvector is special because the value type
// (result) is 256/128-bit but the source is 512-bit wide.
- if (VT.is128BitVector() || VT.is256BitVector())
+ if (VT.is128BitVector() || VT.is256BitVector()) {
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
-
+ if ( EltSize >= 32) {
+ setOperationAction(ISD::MLOAD, VT, Legal);
+ setOperationAction(ISD::MSTORE, VT, Legal);
+ }
+ }
if (VT.getVectorElementType() == MVT::i1)
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
+ setOperationAction(ISD::MLOAD, VT, Legal);
+ setOperationAction(ISD::MSTORE, VT, Legal);
}
}
for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
setOperationAction(ISD::UMULO, VT, Custom);
}
- // There are no 8-bit 3-address imul/mul instructions
- setOperationAction(ISD::SMULO, MVT::i8, Expand);
- setOperationAction(ISD::UMULO, MVT::i8, Expand);
if (!Subtarget->is64Bit()) {
// These libcalls are not available in 32-bit.
}
/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
-/// is suitable for input to PSHUFD or PSHUFW. That is, it doesn't reference
-/// the second operand.
-static bool isPSHUFDMask(ArrayRef<int> Mask, MVT VT) {
- if (VT == MVT::v4f32 || VT == MVT::v4i32 )
- return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4);
- if (VT == MVT::v2f64 || VT == MVT::v2i64)
- return (Mask[0] < 2 && Mask[1] < 2);
- return false;
+/// is suitable for input to PSHUFD. That is, it references only one operand:
+/// the first by default, or the second when TestSecondOperand is true.
+static bool isPSHUFDMask(ArrayRef<int> Mask, MVT VT,
+ bool TestSecondOperand = false) {
+ if (VT != MVT::v4f32 && VT != MVT::v4i32 &&
+ VT != MVT::v2f64 && VT != MVT::v2i64)
+ return false;
+
+ unsigned NumElems = VT.getVectorNumElements();
+ unsigned Lo = TestSecondOperand ? NumElems : 0;
+ unsigned Hi = Lo + NumElems;
+
+ for (unsigned i = 0; i < NumElems; ++i)
+ if (!isUndefOrInRange(Mask[i], (int)Lo, (int)Hi))
+ return false;
+
+ return true;
}
/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
SDValue Vec;
if (VT.is128BitVector()) { // SSE
if (Subtarget->hasSSE2()) { // SSE2
- SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
+ SDValue Cst = DAG.getConstant(0, MVT::i32);
Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
} else { // SSE1
- SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
+ SDValue Cst = DAG.getConstantFP(+0.0, MVT::f32);
Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
}
} else if (VT.is256BitVector()) { // AVX
if (Subtarget->hasInt256()) { // AVX2
- SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
+ SDValue Cst = DAG.getConstant(0, MVT::i32);
SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
} else {
// 256-bit logic and arithmetic instructions in AVX are all
// floating-point, no support for integer ops. Emit fp zeroed vectors.
- SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
+ SDValue Cst = DAG.getConstantFP(+0.0, MVT::f32);
SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops);
}
} else if (VT.is512BitVector()) { // AVX-512
- SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
+ SDValue Cst = DAG.getConstant(0, MVT::i32);
SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops);
} else if (VT.getScalarType() == MVT::i1) {
assert(VT.getVectorNumElements() <= 16 && "Unexpected vector type");
- SDValue Cst = DAG.getTargetConstant(0, MVT::i1);
+ SDValue Cst = DAG.getConstant(0, MVT::i1);
SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
} else
SDLoc dl) {
assert(VT.isVector() && "Expected a vector type");
- SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
+ SDValue Cst = DAG.getConstant(~0U, MVT::i32);
SDValue Vec;
if (VT.is256BitVector()) {
if (HasInt256) { // AVX2
}
/// LowerBuildVectorv4x32 - Custom lower build_vector of v4i32 or v4f32.
-static SDValue LowerBuildVectorv4x32(SDValue Op, unsigned NumElems,
- unsigned NonZeros, unsigned NumNonZero,
- unsigned NumZero, SelectionDAG &DAG,
+static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
const X86Subtarget *Subtarget,
const TargetLowering &TLI) {
- // We know there's at least one non-zero element
- unsigned FirstNonZeroIdx = 0;
- SDValue FirstNonZero = Op->getOperand(FirstNonZeroIdx);
- while (FirstNonZero.getOpcode() == ISD::UNDEF ||
- X86::isZeroNode(FirstNonZero)) {
- ++FirstNonZeroIdx;
- FirstNonZero = Op->getOperand(FirstNonZeroIdx);
+ // Find all zeroable elements.
+ bool Zeroable[4];
+ for (int i=0; i < 4; ++i) {
+ SDValue Elt = Op->getOperand(i);
+ Zeroable[i] = (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt));
+ }
+ assert(std::count_if(&Zeroable[0], &Zeroable[4],
+ [](bool M) { return !M; }) > 1 &&
+ "We expect at least two non-zero elements!");
+
+ // We only know how to deal with build_vector nodes where elements are either
+ // zeroable or extract_vector_elt with constant index.
+ SDValue FirstNonZero;
+ unsigned FirstNonZeroIdx;
+ for (unsigned i=0; i < 4; ++i) {
+ if (Zeroable[i])
+ continue;
+ SDValue Elt = Op->getOperand(i);
+ if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ !isa<ConstantSDNode>(Elt.getOperand(1)))
+ return SDValue();
+ // Make sure that this node is extracting from a 128-bit vector.
+ MVT VT = Elt.getOperand(0).getSimpleValueType();
+ if (!VT.is128BitVector())
+ return SDValue();
+ if (!FirstNonZero.getNode()) {
+ FirstNonZero = Elt;
+ FirstNonZeroIdx = i;
+ }
}
- if (FirstNonZero.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
- !isa<ConstantSDNode>(FirstNonZero.getOperand(1)))
- return SDValue();
+ assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
+ SDValue V1 = FirstNonZero.getOperand(0);
+ MVT VT = V1.getSimpleValueType();
- SDValue V = FirstNonZero.getOperand(0);
- MVT VVT = V.getSimpleValueType();
- if (!Subtarget->hasSSE41() || (VVT != MVT::v4f32 && VVT != MVT::v4i32))
- return SDValue();
+ // See if this build_vector can be lowered as a blend with zero.
+ SDValue Elt;
+ unsigned EltMaskIdx, EltIdx;
+ int Mask[4];
+ for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
+ if (Zeroable[EltIdx]) {
+ // The zero vector will be on the right hand side.
+ Mask[EltIdx] = EltIdx+4;
+ continue;
+ }
- unsigned FirstNonZeroDst =
- cast<ConstantSDNode>(FirstNonZero.getOperand(1))->getZExtValue();
- unsigned CorrectIdx = FirstNonZeroDst == FirstNonZeroIdx;
- unsigned IncorrectIdx = CorrectIdx ? -1U : FirstNonZeroIdx;
- unsigned IncorrectDst = CorrectIdx ? -1U : FirstNonZeroDst;
+ Elt = Op->getOperand(EltIdx);
+ // By construction, Elt is an EXTRACT_VECTOR_ELT with constant index.
+ EltMaskIdx = cast<ConstantSDNode>(Elt.getOperand(1))->getZExtValue();
+ if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
+ break;
+ Mask[EltIdx] = EltIdx;
+ }
- for (unsigned Idx = FirstNonZeroIdx + 1; Idx < NumElems; ++Idx) {
- SDValue Elem = Op.getOperand(Idx);
- if (Elem.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elem))
- continue;
+ if (EltIdx == 4) {
+ // Let the shuffle legalizer deal with blend operations.
+ SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
+ if (V1.getSimpleValueType() != VT)
+ V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), VT, V1);
+ return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, &Mask[0]);
+ }
- // TODO: What else can be here? Deal with it.
- if (Elem.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
- return SDValue();
+ // See if we can lower this build_vector to an INSERTPS.
+ if (!Subtarget->hasSSE41())
+ return SDValue();
- // TODO: Some optimizations are still possible here
- // ex: Getting one element from a vector, and the rest from another.
- if (Elem.getOperand(0) != V)
- return SDValue();
+ SDValue V2 = Elt.getOperand(0);
+ if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
+ V1 = SDValue();
- unsigned Dst = cast<ConstantSDNode>(Elem.getOperand(1))->getZExtValue();
- if (Dst == Idx)
- ++CorrectIdx;
- else if (IncorrectIdx == -1U) {
- IncorrectIdx = Idx;
- IncorrectDst = Dst;
- } else
- // There was already one element with an incorrect index.
- // We can't optimize this case to an insertps.
- return SDValue();
+ bool CanFold = true;
+ for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
+ if (Zeroable[i])
+ continue;
+
+ SDValue Current = Op->getOperand(i);
+ SDValue SrcVector = Current->getOperand(0);
+ if (!V1.getNode())
+ V1 = SrcVector;
+ CanFold = SrcVector == V1 &&
+ cast<ConstantSDNode>(Current.getOperand(1))->getZExtValue() == i;
}
- if (NumNonZero == CorrectIdx || NumNonZero == CorrectIdx + 1) {
- SDLoc dl(Op);
- EVT VT = Op.getSimpleValueType();
- unsigned ElementMoveMask = 0;
- if (IncorrectIdx == -1U)
- ElementMoveMask = FirstNonZeroIdx << 6 | FirstNonZeroIdx << 4;
- else
- ElementMoveMask = IncorrectDst << 6 | IncorrectIdx << 4;
+ if (!CanFold)
+ return SDValue();
- SDValue InsertpsMask =
- DAG.getIntPtrConstant(ElementMoveMask | (~NonZeros & 0xf));
- return DAG.getNode(X86ISD::INSERTPS, dl, VT, V, V, InsertpsMask);
- }
+ assert(V1.getNode() && "Expected at least two non-zero elements!");
+ if (V1.getSimpleValueType() != MVT::v4f32)
+ V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), MVT::v4f32, V1);
+ if (V2.getSimpleValueType() != MVT::v4f32)
+ V2 = DAG.getNode(ISD::BITCAST, SDLoc(V2), MVT::v4f32, V2);
- return SDValue();
+ // Ok, we can emit an INSERTPS instruction.
+ unsigned ZMask = 0;
+ for (int i = 0; i < 4; ++i)
+ if (Zeroable[i])
+ ZMask |= 1 << i;
+
+ unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
+ assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
+ SDValue Result = DAG.getNode(X86ISD::INSERTPS, SDLoc(Op), MVT::v4f32, V1, V2,
+ DAG.getIntPtrConstant(InsertPSMask));
+ return DAG.getNode(ISD::BITCAST, SDLoc(Op), VT, Result);
}
/// getVShift - Return a vector logical shift node.
// If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
if (EVTBits == 32 && NumElems == 4) {
- SDValue V = LowerBuildVectorv4x32(Op, NumElems, NonZeros, NumNonZero,
- NumZero, DAG, Subtarget, *this);
+ SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget, *this);
if (V.getNode())
return V;
}
/// \brief Try to lower a vector shuffle as a byte rotation.
///
-/// We have a generic PALIGNR instruction in x86 that will do an arbitrary
-/// byte-rotation of the concatenation of two vectors. This routine will
-/// try to generically lower a vector shuffle through such an instruction. It
-/// does not check for the availability of PALIGNR-based lowerings, only the
-/// applicability of this strategy to the given mask. This matches shuffle
-/// vectors that look like:
+/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
+/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
+/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
+/// try to generically lower a vector shuffle through such a pattern. It
+/// does not check for the profitability of lowering either as PALIGNR or
+/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
+/// This matches shuffle vectors that look like:
///
/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
///
static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1,
SDValue V2,
ArrayRef<int> Mask,
+ const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
else if (!Hi)
Hi = Lo;
- // Cast the inputs to v16i8 to match PALIGNR.
- Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Lo);
- Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Hi);
-
assert(VT.getSizeInBits() == 128 &&
"Rotate-based lowering only supports 128-bit lowering!");
assert(Mask.size() <= 16 &&
"Can shuffle at most 16 bytes in a 128-bit vector!");
+
// The actual rotate instruction rotates bytes, so we need to scale the
// rotation based on how many bytes are in the vector.
int Scale = 16 / Mask.size();
+ // SSSE3 targets can use the palignr instruction
+ if (Subtarget->hasSSSE3()) {
+ // Cast the inputs to v16i8 to match PALIGNR.
+ Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Lo);
+ Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Hi);
+
+ return DAG.getNode(ISD::BITCAST, DL, VT,
+ DAG.getNode(X86ISD::PALIGNR, DL, MVT::v16i8, Hi, Lo,
+ DAG.getConstant(Rotation * Scale, MVT::i8)));
+ }
+
+ // Default SSE2 implementation
+ int LoByteShift = 16 - Rotation * Scale;
+ int HiByteShift = Rotation * Scale;
+
+ // Cast the inputs to v2i64 to match PSLLDQ/PSRLDQ.
+ Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Lo);
+ Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Hi);
+
+ SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, Lo,
+ DAG.getConstant(8 * LoByteShift, MVT::i8));
+ SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, Hi,
+ DAG.getConstant(8 * HiByteShift, MVT::i8));
return DAG.getNode(ISD::BITCAST, DL, VT,
- DAG.getNode(X86ISD::PALIGNR, DL, MVT::v16i8, Hi, Lo,
- DAG.getConstant(Rotation * Scale, MVT::i8)));
+ DAG.getNode(ISD::OR, DL, MVT::v2i64, LoShift, HiShift));
}
/// \brief Compute whether each element of a shuffle is zeroable.
return Zeroable;
}
+/// \brief Try to lower a vector shuffle as a byte shift (shifts in zeros).
+///
+/// Attempts to match a shuffle mask against the PSRLDQ and PSLLDQ SSE2
+/// byte-shift instructions. The mask must consist of a shifted sequential
+/// shuffle from one of the input vectors and zeroable elements for the
+/// remaining 'shifted in' elements.
+/// Returns the shifted input bitcast back to VT, or a null SDValue otherwise.
+/// Note that this only handles 128-bit vector widths currently.
+static SDValue lowerVectorShuffleAsByteShift(SDLoc DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
+ assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
+ // Per the helper's doc, flags each shuffle element that is zeroable.
+ SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+ // Scale converts a shift counted in mask elements into bytes (16-byte vector).
+ int Size = Mask.size();
+ int Scale = 16 / Size;
+ // True when each defined Mask[i] in [StartIndex, EndIndex) is i + Base + MaskOffset.
+ auto isSequential = [](int Base, int StartIndex, int EndIndex, int MaskOffset,
+ ArrayRef<int> Mask) {
+ for (int i = StartIndex; i < EndIndex; i++) {
+ if (Mask[i] < 0)
+ continue;
+ if (i + Base != Mask[i] - MaskOffset)
+ return false;
+ }
+ return true;
+ };
+ // Try each element shift in increasing order; the first match is returned.
+ for (int Shift = 1; Shift < Size; Shift++) {
+ int ByteShift = Shift * Scale;
+
+ // PSRLDQ : (little-endian) right byte shift
+ // [ 5, 6, 7, zz, zz, zz, zz, zz]
+ // [ -1, 5, 6, 7, zz, zz, zz, zz]
+ // [ 1, 2, -1, -1, -1, -1, zz, zz]
+ bool ZeroableRight = true;
+ for (int i = Size - Shift; i < Size; i++) {
+ ZeroableRight &= Zeroable[i];
+ }
+
+ if (ZeroableRight) {
+ bool ValidShiftRight1 = isSequential(Shift, 0, Size - Shift, 0, Mask);
+ bool ValidShiftRight2 = isSequential(Shift, 0, Size - Shift, Size, Mask);
+
+ if (ValidShiftRight1 || ValidShiftRight2) {
+ // Cast the inputs to v2i64 to match PSRLDQ.
+ SDValue &TargetV = ValidShiftRight1 ? V1 : V2; // V1 wins if both match
+ SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TargetV);
+ SDValue Shifted = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, V,
+ DAG.getConstant(ByteShift * 8, MVT::i8)); // bytes scaled to bits
+ return DAG.getNode(ISD::BITCAST, DL, VT, Shifted);
+ }
+ }
+
+ // PSLLDQ : (little-endian) left byte shift
+ // [ zz, 0, 1, 2, 3, 4, 5, 6]
+ // [ zz, zz, -1, -1, 2, 3, 4, -1]
+ // [ zz, zz, zz, zz, zz, zz, -1, 1]
+ bool ZeroableLeft = true;
+ for (int i = 0; i < Shift; i++) {
+ ZeroableLeft &= Zeroable[i];
+ }
+
+ if (ZeroableLeft) {
+ bool ValidShiftLeft1 = isSequential(-Shift, Shift, Size, 0, Mask);
+ bool ValidShiftLeft2 = isSequential(-Shift, Shift, Size, Size, Mask);
+
+ if (ValidShiftLeft1 || ValidShiftLeft2) {
+ // Cast the inputs to v2i64 to match PSLLDQ.
+ SDValue &TargetV = ValidShiftLeft1 ? V1 : V2; // V1 wins if both match
+ SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TargetV);
+ SDValue Shifted = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, V,
+ DAG.getConstant(ByteShift * 8, MVT::i8)); // bytes scaled to bits
+ return DAG.getNode(ISD::BITCAST, DL, VT, Shifted);
+ }
+ }
+ }
+ // Neither direction matched for any shift amount.
+ return SDValue();
+}
+
/// \brief Lower a vector shuffle as a zero or any extension.
///
/// Given a specific number of elements, element bit width, and extension
"a sorted mask where the broadcast "
"comes from V1.");
- // Check if this is a broadcast of a scalar. We special case lowering for
- // scalars so that we can more effectively fold with loads.
+ // Go up the chain of (vector) values to try and find a scalar load that
+ // we can combine with the broadcast.
+ for (;;) {
+ switch (V.getOpcode()) {
+ case ISD::CONCAT_VECTORS: {
+ int OperandSize = Mask.size() / V.getNumOperands();
+ V = V.getOperand(BroadcastIdx / OperandSize);
+ BroadcastIdx %= OperandSize;
+ continue;
+ }
+
+ case ISD::INSERT_SUBVECTOR: {
+ SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
+ auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
+ if (!ConstantIdx)
+ break;
+
+ int BeginIdx = (int)ConstantIdx->getZExtValue();
+ int EndIdx =
+ BeginIdx + (int)VInner.getValueType().getVectorNumElements();
+ if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) {
+ BroadcastIdx -= BeginIdx;
+ V = VInner;
+ } else {
+ V = VOuter;
+ }
+ continue;
+ }
+ }
+ break;
+ }
+
+ // Check if this is a broadcast of a scalar. We special case lowering
+ // for scalars so that we can more effectively fold with loads.
if (V.getOpcode() == ISD::BUILD_VECTOR ||
- (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
+ (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
V = V.getOperand(BroadcastIdx);
// If the scalar isn't a load we can't broadcast from it in AVX1, only with
Subtarget, DAG))
return Blend;
- // Try to use rotation instructions if available.
+ // Try to use byte shift instructions.
+ if (SDValue Shift = lowerVectorShuffleAsByteShift(
+ DL, MVT::v2i64, V1, V2, Mask, DAG))
+ return Shift;
+
+ // Try to use byte rotation instructions.
+ // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
if (Subtarget->hasSSSE3())
if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
- DL, MVT::v2i64, V1, V2, Mask, DAG))
+ DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
return Rotate;
// We implement this with SHUFPD which is pretty lame because it will likely
Subtarget, DAG))
return Blend;
- // Try to use rotation instructions if available.
+ // Try to use byte shift instructions.
+ if (SDValue Shift = lowerVectorShuffleAsByteShift(
+ DL, MVT::v4i32, V1, V2, Mask, DAG))
+ return Shift;
+
+ // Try to use byte rotation instructions.
+ // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
if (Subtarget->hasSSSE3())
if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
- DL, MVT::v4i32, V1, V2, Mask, DAG))
+ DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
return Rotate;
// We implement this with SHUFPS because it can blend from two vectors.
if (isShuffleEquivalent(Mask, 4, 4, 5, 5, 6, 6, 7, 7))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V, V);
- // Try to use rotation instructions if available.
- if (Subtarget->hasSSSE3())
- if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
- DL, MVT::v8i16, V, V, Mask, DAG))
- return Rotate;
+ // Try to use byte shift instructions.
+ if (SDValue Shift = lowerVectorShuffleAsByteShift(
+ DL, MVT::v8i16, V, V, Mask, DAG))
+ return Shift;
+
+ // Try to use byte rotation instructions.
+ if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
+ DL, MVT::v8i16, V, V, Mask, Subtarget, DAG))
+ return Rotate;
// Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
// such inputs we can swap two of the dwords across the half mark and end up
Mask, Subtarget, DAG))
return V;
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 2, 10, 3, 11))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V1, V2);
+ if (isShuffleEquivalent(Mask, 4, 12, 5, 13, 6, 14, 7, 15))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V1, V2);
+
if (Subtarget->hasSSE41())
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
Subtarget, DAG))
return Blend;
- // Try to use rotation instructions if available.
- if (Subtarget->hasSSSE3())
- if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
- DL, MVT::v8i16, V1, V2, Mask, DAG))
- return Rotate;
+ // Try to use byte shift instructions.
+ if (SDValue Shift = lowerVectorShuffleAsByteShift(
+ DL, MVT::v8i16, V1, V2, Mask, DAG))
+ return Shift;
+
+ // Try to use byte rotation instructions.
+ if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
+ DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
+ return Rotate;
if (NumV1Inputs + NumV2Inputs <= 4)
return lowerV8I16BasicBlendVectorShuffle(DL, V1, V2, Mask, Subtarget, DAG);
ArrayRef<int> OrigMask = SVOp->getMask();
assert(OrigMask.size() == 16 && "Unexpected mask size for v16 shuffle!");
- // Try to use rotation instructions if available.
- if (Subtarget->hasSSSE3())
- if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
- DL, MVT::v16i8, V1, V2, OrigMask, DAG))
- return Rotate;
+ // Try to use byte shift instructions.
+ if (SDValue Shift = lowerVectorShuffleAsByteShift(
+ DL, MVT::v16i8, V1, V2, OrigMask, DAG))
+ return Shift;
+
+ // Try to use byte rotation instructions.
+ if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
+ DL, MVT::v16i8, V1, V2, OrigMask, Subtarget, DAG))
+ return Rotate;
// Try to use a zext lowering.
if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
// Now create two 4-way blends of these half-width vectors.
auto HalfBlend = [&](ArrayRef<int> HalfMask) {
+ bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
SmallVector<int, 32> V1BlendMask, V2BlendMask, BlendMask;
for (int i = 0; i < SplitNumElements; ++i) {
int M = HalfMask[i];
if (M >= NumElements) {
+ if (M >= NumElements + SplitNumElements)
+ UseHiV2 = true;
+ else
+ UseLoV2 = true;
V2BlendMask.push_back(M - NumElements);
V1BlendMask.push_back(-1);
BlendMask.push_back(SplitNumElements + i);
} else if (M >= 0) {
+ if (M >= SplitNumElements)
+ UseHiV1 = true;
+ else
+ UseLoV1 = true;
V2BlendMask.push_back(-1);
V1BlendMask.push_back(M);
BlendMask.push_back(i);
BlendMask.push_back(-1);
}
}
- SDValue V1Blend =
+
+ // Because the lowering happens after all combining takes place, we need to
+ // manually combine these blend masks as much as possible so that we create
+ // a minimal number of high-level vector shuffle nodes.
+
+ // First try just blending the halves of V1 or V2.
+ if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
+ return DAG.getUNDEF(SplitVT);
+ if (!UseLoV2 && !UseHiV2)
+ return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
+ if (!UseLoV1 && !UseHiV1)
+ return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
+
+ SDValue V1Blend, V2Blend;
+ if (UseLoV1 && UseHiV1) {
+ V1Blend =
DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
- SDValue V2Blend =
+ } else {
+ // We only use half of V1 so map the usage down into the final blend mask.
+ V1Blend = UseLoV1 ? LoV1 : HiV1;
+ for (int i = 0; i < SplitNumElements; ++i)
+ if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
+ BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
+ }
+ if (UseLoV2 && UseHiV2) {
+ V2Blend =
DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
+ } else {
+ // We only use half of V2 so map the usage down into the final blend mask.
+ V2Blend = UseLoV2 ? LoV2 : HiV2;
+ for (int i = 0; i < SplitNumElements; ++i)
+ if (BlendMask[i] >= SplitNumElements)
+ BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
+ }
return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
};
SDValue Lo = HalfBlend(LoMask);
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
}
+/// \brief Either split a vector in halves or decompose the shuffles and the
+/// blend.
+///
+/// This is provided as a good fallback for many lowerings of non-single-input
+/// shuffles with more than one 128-bit lane. In those cases, we want to select
+/// between splitting the shuffle into 128-bit components and stitching those
+/// back together vs. extracting the single-input shuffles and blending those
+/// results. Returns whatever the selected lowering helper returns.
+static SDValue lowerVectorShuffleAsSplitOrBlend(SDLoc DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
+ assert(!isSingleInputShuffleMask(Mask) && "This routine must not be used to "
+ "lower single-input shuffles as it "
+ "could then recurse on itself.");
+ int Size = Mask.size();
+
+ // If this can be modeled as a broadcast of two elements followed by a blend,
+ // prefer that lowering. This is especially important because broadcasts can
+ // often fold with memory operands.
+ auto DoBothBroadcast = [&] { // true if each input supplies at most one distinct element
+ int V1BroadcastIdx = -1, V2BroadcastIdx = -1; // -1: nothing seen yet
+ for (int M : Mask)
+ if (M >= Size) { // M selects from V2
+ if (V2BroadcastIdx == -1)
+ V2BroadcastIdx = M - Size;
+ else if (M - Size != V2BroadcastIdx)
+ return false;
+ } else if (M >= 0) { // M selects from V1; negative M is ignored
+ if (V1BroadcastIdx == -1)
+ V1BroadcastIdx = M;
+ else if (M != V1BroadcastIdx)
+ return false;
+ }
+ return true;
+ };
+ if (DoBothBroadcast())
+ return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
+ DAG);
+
+ // If the inputs all stem from a single 128-bit lane of each input, then we
+ // split them rather than blending because the split will decompose to
+ // unusually few instructions.
+ int LaneCount = VT.getSizeInBits() / 128; // number of 128-bit lanes in VT
+ int LaneSize = Size / LaneCount; // mask elements per lane
+ SmallBitVector LaneInputs[2]; // [input][lane]: set when that lane is referenced
+ LaneInputs[0].resize(LaneCount, false);
+ LaneInputs[1].resize(LaneCount, false);
+ for (int i = 0; i < Size; ++i)
+ if (Mask[i] >= 0)
+ LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
+ if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
+ return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
+
+ // Otherwise, just fall back to decomposed shuffles and a blend. This requires
+ // that the decomposed single-input shuffles don't end up here.
+ return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
+}
+
/// \brief Lower a vector shuffle crossing multiple 128-bit lanes as
/// a permutation and blend of those lanes.
///
DAG.getConstant(PermMask, MVT::i8));
}
+/// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then
+/// shuffling each lane.
+///
+/// This will only succeed when the result of fixing the 128-bit lanes results
+/// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in
+/// each 128-bit lane. This handles many cases where we can quickly blend away
+/// the lane crosses early and then use simpler shuffles within each lane.
+///
+/// FIXME: It might be worthwhile at some point to support this without
+/// requiring the 128-bit lane-relative shuffles to be repeating, but currently
+/// in x86 only floating point has interesting non-repeating shuffles, and even
+/// those are still *marginally* more expensive.
+static SDValue lowerVectorShuffleByMerging128BitLanes(
+ SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+ const X86Subtarget *Subtarget, SelectionDAG &DAG) { // NOTE(review): Subtarget is not referenced in this body.
+ assert(!isSingleInputShuffleMask(Mask) &&
+ "This is only useful with multiple inputs.");
+
+ int Size = Mask.size();
+ int LaneSize = 128 / VT.getScalarSizeInBits(); // Elements per 128-bit lane.
+ int NumLanes = Size / LaneSize;
+ assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.");
+
+ // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also
+ // check whether the in-128-bit lane shuffles share a repeating pattern.
+ SmallVector<int, 4> Lanes;
+ Lanes.resize(NumLanes, -1); // Source lane feeding each result lane; -1 = none seen yet.
+ SmallVector<int, 4> InLaneMask;
+ InLaneMask.resize(LaneSize, -1); // In-lane source offset per lane position; -1 = unconstrained.
+ for (int i = 0; i < Size; ++i) {
+ if (Mask[i] < 0)
+ continue;
+
+ int j = i / LaneSize; // Result lane that element i lives in.
+
+ if (Lanes[j] < 0) {
+ // First entry we've seen for this lane.
+ Lanes[j] = Mask[i] / LaneSize; // Values >= NumLanes name a lane of V2.
+ } else if (Lanes[j] != Mask[i] / LaneSize) {
+ // This doesn't match the lane selected previously!
+ return SDValue();
+ }
+
+ // Check that within each lane we have a consistent shuffle mask.
+ int k = i % LaneSize; // Position of element i within its lane.
+ if (InLaneMask[k] < 0) {
+ InLaneMask[k] = Mask[i] % LaneSize;
+ } else if (InLaneMask[k] != Mask[i] % LaneSize) {
+ // This doesn't fit a repeating in-lane mask.
+ return SDValue();
+ }
+ }
+
+ // First shuffle the lanes into place.
+ MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
+ VT.getSizeInBits() / 64);
+ SmallVector<int, 8> LaneMask;
+ LaneMask.resize(NumLanes * 2, -1); // Two 64-bit elements per 128-bit lane.
+ for (int i = 0; i < NumLanes; ++i)
+ if (Lanes[i] >= 0) {
+ LaneMask[2 * i + 0] = 2*Lanes[i] + 0;
+ LaneMask[2 * i + 1] = 2*Lanes[i] + 1;
+ }
+
+ V1 = DAG.getNode(ISD::BITCAST, DL, LaneVT, V1);
+ V2 = DAG.getNode(ISD::BITCAST, DL, LaneVT, V2);
+ SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);
+
+ // Cast it back to the type we actually want.
+ LaneShuffle = DAG.getNode(ISD::BITCAST, DL, VT, LaneShuffle);
+
+ // Now do a simple single-input shuffle that isn't lane crossing.
+ SmallVector<int, 8> NewMask;
+ NewMask.resize(Size, -1);
+ for (int i = 0; i < Size; ++i)
+ if (Mask[i] >= 0)
+ NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize; // Same lane as i, source's in-lane offset.
+ assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&
+ "Must not introduce lane crosses at this point!");
+
+ return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
+}
+
+/// \brief Check that a shuffle leaves every element it takes from one input
+/// (0 or 1) in that element's original position.
+///
+/// Returns true when no element drawn from \p Input has to move, i.e. that
+/// input needs no permutation to satisfy \p Mask. Undef (negative) entries and
+/// entries drawn from the other input are ignored.
+static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
+  assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
+  const int NumElts = Mask.size();
+
+  // M / NumElts selects the input and M % NumElts the element within it.
+  int Slot = 0;
+  for (int M : Mask) {
+    const bool FromInput = M >= 0 && M / NumElts == Input;
+    if (FromInput && M % NumElts != Slot)
+      return false;
+    ++Slot;
+  }
+
+  return true;
+}
+
/// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
///
/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
DAG.getConstant(SHUFPDMask, MVT::i8));
}
- // Otherwise fall back on generic blend lowering.
- return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
- Mask, DAG);
+ // Try to simplify this by merging 128-bit lanes to enable a lane-based
+ // shuffle. However, if we have AVX2 and either input is already in place,
+ // we will be able to shuffle the other input, even across lanes, in a single
+ // instruction, so skip this pattern.
+ if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
+ isShuffleMaskInputInPlace(1, Mask))))
+ if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
+ DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
+ return Result;
+
+ // If we have AVX2 then we always want to lower with a blend because at v4 we
+ // can fully permute the elements.
+ if (Subtarget->hasAVX2())
+ return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
+ Mask, DAG);
+
+ // Otherwise fall back on generic lowering.
+ return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);
}
/// \brief Handle lowering of 4-lane 64-bit integer shuffles.
return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
getV4X86ShuffleImm8ForMask(Mask, DAG));
+ // Try to simplify this by merging 128-bit lanes to enable a lane-based
+ // shuffle. However, if we have AVX2 and either input is already in place,
+ // we will be able to shuffle the other input, even across lanes, in a single
+ // instruction, so skip this pattern.
+ if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
+ isShuffleMaskInputInPlace(1, Mask))))
+ if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
+ DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
+ return Result;
+
// Otherwise fall back on generic blend lowering.
return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
Mask, DAG);
DAG);
}
- // Otherwise fall back on generic blend lowering.
- return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
- Mask, DAG);
+ // Try to simplify this by merging 128-bit lanes to enable a lane-based
+ // shuffle.
+ if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
+ DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
+ return Result;
+
+ // If we have AVX2 then we always want to lower with a blend because at v8 we
+ // can fully permute the elements.
+ if (Subtarget->hasAVX2())
+ return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
+ Mask, DAG);
+
+ // Otherwise fall back on generic lowering.
+ return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);
}
/// \brief Handle lowering of 8-lane 32-bit integer shuffles.
DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask), V1);
}
+ // Try to simplify this by merging 128-bit lanes to enable a lane-based
+ // shuffle.
+ if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
+ DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
+ return Result;
+
// Otherwise fall back on generic blend lowering.
return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
Mask, DAG);
Mask, Subtarget, DAG))
return Broadcast;
- // There are no generalized cross-lane shuffle operations available on i16
- // element types.
- if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
- return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
- Mask, DAG);
-
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
Subtarget, DAG))
return Blend;
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i16, V1, V2);
if (isSingleInputShuffleMask(Mask)) {
+ // There are no generalized cross-lane shuffle operations available on i16
+ // element types.
+ if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
+ return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
+ Mask, DAG);
+
SDValue PSHUFBMask[32];
for (int i = 0; i < 16; ++i) {
if (Mask[i] == -1) {
DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask)));
}
- // Otherwise fall back on generic blend lowering.
- return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i16, V1, V2,
- Mask, DAG);
+ // Try to simplify this by merging 128-bit lanes to enable a lane-based
+ // shuffle.
+ if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
+ DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
+ return Result;
+
+ // Otherwise fall back on generic lowering.
+ return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG);
}
/// \brief Handle lowering of 32-lane 8-bit integer shuffles.
Mask, Subtarget, DAG))
return Broadcast;
- // There are no generalized cross-lane shuffle operations available on i8
- // element types.
- if (is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
- return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2,
- Mask, DAG);
-
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
Subtarget, DAG))
return Blend;
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v32i8, V1, V2);
if (isSingleInputShuffleMask(Mask)) {
+ // There are no generalized cross-lane shuffle operations available on i8
+ // element types.
+ if (is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
+ return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2,
+ Mask, DAG);
+
SDValue PSHUFBMask[32];
for (int i = 0; i < 32; ++i)
PSHUFBMask[i] =
DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask));
}
- // Otherwise fall back on generic blend lowering.
- return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v32i8, V1, V2,
- Mask, DAG);
+ // Try to simplify this by merging 128-bit lanes to enable a lane-based
+ // shuffle.
+ if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
+ DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
+ return Result;
+
+ // Otherwise fall back on generic lowering.
+ return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG);
}
/// \brief High-level routine to lower various 256-bit x86 vector shuffles.
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
- assert(Subtarget->hasDQI() && "We can only lower v8i64 with AVX-512-DQI");
// FIXME: Implement direct support for this type!
return splitAndLowerVectorShuffle(DL, MVT::v8i64, V1, V2, Mask, DAG);
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
- assert(Subtarget->hasDQI() && "We can only lower v16i32 with AVX-512-DQI!");
// FIXME: Implement direct support for this type!
return splitAndLowerVectorShuffle(DL, MVT::v16i32, V1, V2, Mask, DAG);
assert(Subtarget->hasAVX512() &&
"Cannot lower 512-bit vectors w/ basic ISA!");
+ // Check for being able to broadcast a single element.
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(VT.SimpleTy, DL, V1,
+ Mask, Subtarget, DAG))
+ return Broadcast;
+
// Dispatch to each element type for lowering. If we don't have supprot for
// specific element type shuffles at 512 bits, immediately split them and
// lower them. Each lowering routine of a given type is allowed to assume that
case MVT::v16f32:
return lowerV16F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
case MVT::v8i64:
- if (Subtarget->hasDQI())
- return lowerV8I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
- break;
+ return lowerV8I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
case MVT::v16i32:
- if (Subtarget->hasDQI())
- return lowerV16I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
- break;
+ return lowerV16I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
case MVT::v32i16:
if (Subtarget->hasBWI())
return lowerV32I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
// When the number of V1 and V2 elements are the same, try to minimize the
// number of uses of V2 in the low half of the vector. When that is tied,
// ensure that the sum of indices for V1 is equal to or lower than the sum
- // indices for V2.
+ // indices for V2. When those are equal, try to ensure that the number of odd
+ // indices for V1 is lower than the number of odd indices for V2.
if (NumV1Elements == NumV2Elements) {
int LowV1Elements = 0, LowV2Elements = 0;
for (int M : SVOp->getMask().slice(0, NumElements / 2))
SumV2Indices += i;
else if (SVOp->getMask()[i] >= 0)
SumV1Indices += i;
- if (SumV2Indices < SumV1Indices)
+ if (SumV2Indices < SumV1Indices) {
return DAG.getCommutedVectorShuffle(*SVOp);
+ } else if (SumV2Indices == SumV1Indices) {
+ int NumV1OddIndices = 0, NumV2OddIndices = 0;
+ for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i)
+ if (SVOp->getMask()[i] >= NumElements)
+ NumV2OddIndices += i % 2;
+ else if (SVOp->getMask()[i] >= 0)
+ NumV1OddIndices += i % 2;
+ if (NumV2OddIndices < NumV1OddIndices)
+ return DAG.getCommutedVectorShuffle(*SVOp);
+ }
}
}
// TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
MFI->setAdjustsStack(true);
+ MFI->setHasCalls(true);
SDValue Flag = Chain.getValue(1);
return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
SelectionDAG &DAG) const {
MVT SrcVT = Op.getOperand(0).getSimpleValueType();
+ SDLoc dl(Op);
- if (SrcVT.isVector())
+ if (SrcVT.isVector()) {
+ if (SrcVT.getVectorElementType() == MVT::i1) {
+ MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
+ return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
+ DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT,
+ Op.getOperand(0)));
+ }
return SDValue();
-
+ }
+
assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
"Unknown SINT_TO_FP to lower!");
return Op;
}
- SDLoc dl(Op);
unsigned Size = SrcVT.getSizeInBits()/8;
MachineFunction &MF = DAG.getMachineFunction();
int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false);
return Sub;
}
+/// \brief Lower a v4i32 or v8i32 UINT_TO_FP whose result is the f32 vector
+/// with the same element count.
+///
+/// Each 32-bit input is split into 16-bit halves, and each half is planted in
+/// the mantissa of a float with a known exponent: the low half under
+/// 0x4b000000 (2^23) and the high half under 0x53000000 (2^39). Adding
+/// -(2^39 + 2^23) to the high float and then adding the low float cancels both
+/// biases and leaves hi * 2^16 + lo.
+///
+/// Returns an empty SDValue when the result type is anything other than the
+/// matching f32 vector (e.g. v4f64).
+static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
+                                     const X86Subtarget &Subtarget) {
+  // The algorithm is the following:
+  // #ifdef __SSE4_1__
+  //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
+  //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
+  //                                 (uint4) 0x53000000, 0xaa);
+  // #else
+  //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
+  //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
+  // #endif
+  //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
+  //     return (float4) lo + fhi;
+
+  SDLoc DL(Op);
+  SDValue V = Op->getOperand(0);
+  EVT VecIntVT = V.getValueType();
+  bool Is128 = VecIntVT == MVT::v4i32;
+  EVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
+  // If we convert to something else than the supported type, e.g., to v4f64,
+  // abort early.
+  if (VecFloatVT != Op->getValueType(0))
+    return SDValue();
+
+  unsigned NumElts = VecIntVT.getVectorNumElements();
+  assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
+         "Unsupported custom type");
+  assert(NumElts <= 8 && "The size of the constant array must be fixed");
+
+  // In the #ifdef/#else code, we have in common:
+  // - The vector of constants:
+  // -- 0x4b000000
+  // -- 0x53000000
+  // - A shift:
+  // -- v >> 16
+
+  // Every splat below is an eight-entry array of which only the first NumElts
+  // entries are used, so one code path serves both vector widths.
+
+  // Create the splat vector for 0x4b000000.
+  SDValue CstLow = DAG.getConstant(0x4b000000, MVT::i32);
+  SDValue CstLowArray[] = {CstLow, CstLow, CstLow, CstLow,
+                           CstLow, CstLow, CstLow, CstLow};
+  SDValue VecCstLow = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT,
+                                  makeArrayRef(&CstLowArray[0], NumElts));
+  // Create the splat vector for 0x53000000.
+  SDValue CstHigh = DAG.getConstant(0x53000000, MVT::i32);
+  SDValue CstHighArray[] = {CstHigh, CstHigh, CstHigh, CstHigh,
+                            CstHigh, CstHigh, CstHigh, CstHigh};
+  SDValue VecCstHigh = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT,
+                                   makeArrayRef(&CstHighArray[0], NumElts));
+
+  // Create the right shift.
+  SDValue CstShift = DAG.getConstant(16, MVT::i32);
+  SDValue CstShiftArray[] = {CstShift, CstShift, CstShift, CstShift,
+                             CstShift, CstShift, CstShift, CstShift};
+  SDValue VecCstShift = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT,
+                                    makeArrayRef(&CstShiftArray[0], NumElts));
+  SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
+
+  SDValue Low, High;
+  if (Subtarget.hasSSE41()) {
+    EVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
+    //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
+    SDValue VecCstLowBitcast =
+        DAG.getNode(ISD::BITCAST, DL, VecI16VT, VecCstLow);
+    SDValue VecBitcast = DAG.getNode(ISD::BITCAST, DL, VecI16VT, V);
+    // Low will be bitcasted right away, so do not bother bitcasting back to its
+    // original type.
+    Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
+                      VecCstLowBitcast, DAG.getConstant(0xaa, MVT::i32));
+    //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
+    //                                 (uint4) 0x53000000, 0xaa);
+    SDValue VecCstHighBitcast =
+        DAG.getNode(ISD::BITCAST, DL, VecI16VT, VecCstHigh);
+    SDValue VecShiftBitcast =
+        DAG.getNode(ISD::BITCAST, DL, VecI16VT, HighShift);
+    // High will be bitcasted right away, so do not bother bitcasting back to
+    // its original type.
+    High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
+                       VecCstHighBitcast, DAG.getConstant(0xaa, MVT::i32));
+  } else {
+    // Size the 0xffff splat by NumElts like the other constants, so the
+    // BUILD_VECTOR operand count always matches VecIntVT.
+    SDValue CstMask = DAG.getConstant(0xffff, MVT::i32);
+    SDValue CstMaskArray[] = {CstMask, CstMask, CstMask, CstMask,
+                              CstMask, CstMask, CstMask, CstMask};
+    SDValue VecCstMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT,
+                                     makeArrayRef(&CstMaskArray[0], NumElts));
+    //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
+    SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
+    Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
+
+    //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
+    High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
+  }
+
+  // Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
+  SDValue CstFAdd = DAG.getConstantFP(
+      APFloat(APFloat::IEEEsingle, APInt(32, 0xD3000080)), MVT::f32);
+  SDValue CstFAddArray[] = {CstFAdd, CstFAdd, CstFAdd, CstFAdd,
+                            CstFAdd, CstFAdd, CstFAdd, CstFAdd};
+  SDValue VecCstFAdd = DAG.getNode(ISD::BUILD_VECTOR, DL, VecFloatVT,
+                                   makeArrayRef(&CstFAddArray[0], NumElts));
+
+  //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
+  SDValue HighBitcast = DAG.getNode(ISD::BITCAST, DL, VecFloatVT, High);
+  SDValue FHigh =
+      DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd);
+  //     return (float4) lo + fhi;
+  SDValue LowBitcast = DAG.getNode(ISD::BITCAST, DL, VecFloatVT, Low);
+  return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
+}
+
SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
SelectionDAG &DAG) const {
SDValue N0 = Op.getOperand(0);
MVT SVT = N0.getSimpleValueType();
SDLoc dl(Op);
- assert((SVT == MVT::v4i8 || SVT == MVT::v4i16 ||
- SVT == MVT::v8i8 || SVT == MVT::v8i16) &&
- "Custom UINT_TO_FP is not supported!");
-
- MVT NVT = MVT::getVectorVT(MVT::i32, SVT.getVectorNumElements());
- return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
- DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0));
+ switch (SVT.SimpleTy) { // Dispatch on the source (integer) vector type.
+ default:
+ llvm_unreachable("Custom UINT_TO_FP is not supported!");
+ case MVT::v4i8:
+ case MVT::v4i16:
+ case MVT::v8i8:
+ case MVT::v8i16: { // Narrow elements: zero-extend to i32, then convert as signed.
+ MVT NVT = MVT::getVectorVT(MVT::i32, SVT.getVectorNumElements());
+ return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
+ DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0));
+ }
+ case MVT::v4i32:
+ case MVT::v8i32: // Full-width i32 elements go through the dedicated expansion.
+ return lowerUINT_TO_FP_vXi32(Op, DAG, *Subtarget);
+ }
+ llvm_unreachable(nullptr); // Every case above either returns or aborts.
}
SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
}
+/// Produce a hardware reciprocal-square-root estimate of \p Op, or an empty
+/// SDValue when the subtarget has estimates disabled or the type has no
+/// estimate instruction.
+///
+/// The minimum architected relative accuracy is 2^-12, so one Newton-Raphson
+/// step is requested to get a good float result (24 bits of precision). On
+/// success \p RefinementSteps is set to 1 and \p UseOneConstNR to false; both
+/// are left untouched otherwise.
+SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op,
+                                            DAGCombinerInfo &DCI,
+                                            unsigned &RefinementSteps,
+                                            bool &UseOneConstNR) const {
+  // FIXME: We should use instruction latency models to calculate the cost of
+  // each potential sequence, but this is very hard to do reliably because
+  // at least Intel's Core* chips have variable timing based on the number of
+  // significant digits in the divisor and/or sqrt operand.
+  if (!Subtarget->useSqrtEst())
+    return SDValue();
+
+  EVT VT = Op.getValueType();
+
+  // SSE1 has rsqrtss and rsqrtps.
+  // TODO: Add support for AVX512 (v16f32).
+  // It is likely not profitable to do this for f64 because a double-precision
+  // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
+  // instructions: convert to single, rsqrtss, convert back to double, refine
+  // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
+  // along with FMA, this could be a throughput win.
+  const bool IsScalarOr128Bit = VT == MVT::f32 || VT == MVT::v4f32;
+  const bool Is256Bit = VT == MVT::v8f32;
+  const bool HasEstimateInsn = (IsScalarOr128Bit && Subtarget->hasSSE1()) ||
+                               (Is256Bit && Subtarget->hasAVX());
+  if (!HasEstimateInsn)
+    return SDValue();
+
+  RefinementSteps = 1;
+  UseOneConstNR = false;
+  return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
+}
+
+/// Produce a hardware reciprocal estimate of \p Op, or an empty SDValue when
+/// the subtarget has estimates disabled or the type has no estimate
+/// instruction.
+///
+/// The minimum architected relative accuracy is 2^-12. We need one
+/// Newton-Raphson step to have a good float result (24 bits of precision).
+/// On success \p RefinementSteps receives the step count requested through the
+/// x86-recip-refinement-steps option, with negative requests treated as zero;
+/// it is left untouched otherwise.
+SDValue X86TargetLowering::getRecipEstimate(SDValue Op,
+                                            DAGCombinerInfo &DCI,
+                                            unsigned &RefinementSteps) const {
+  // FIXME: We should use instruction latency models to calculate the cost of
+  // each potential sequence, but this is very hard to do reliably because
+  // at least Intel's Core* chips have variable timing based on the number of
+  // significant digits in the divisor.
+  if (!Subtarget->useReciprocalEst())
+    return SDValue();
+
+  EVT VT = Op.getValueType();
+
+  // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
+  // TODO: Add support for AVX512 (v16f32).
+  // It is likely not profitable to do this for f64 because a double-precision
+  // reciprocal estimate with refinement on x86 prior to FMA requires
+  // 15 instructions: convert to single, rcpss, convert back to double, refine
+  // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
+  // along with FMA, this could be a throughput win.
+  if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
+      (Subtarget->hasAVX() && VT == MVT::v8f32)) {
+    // The option is a signed int but the out-parameter is unsigned. Clamp so
+    // that a negative command-line value cannot wrap around into an enormous
+    // iteration count.
+    int RequestedSteps = ReciprocalEstimateRefinementSteps;
+    RefinementSteps = RequestedSteps < 0 ? 0 : RequestedSteps;
+    return DCI.DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
+  }
+  return SDValue();
+}
+
static bool isAllOnes(SDValue V) {
ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
return C && C->isAllOnesValue();
if (NumElts != 8 && NumElts != 16)
return SDValue();
- if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1)
+ if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) {
+ if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT)
+ return DAG.getNode(In.getOpcode(), dl, VT, In.getOperand(0));
return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
+ }
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
assert (InVT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
/// necessary casting for \p Mask when lowering masking intrinsics.
static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
- SDValue PreservedSrc, SelectionDAG &DAG) {
+ SDValue PreservedSrc,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
EVT VT = Op.getValueType();
EVT MaskVT = EVT::getVectorVT(*DAG.getContext(),
MVT::i1, VT.getVectorNumElements());
case X86ISD::CMPMU:
return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
}
-
+ if (PreservedSrc.getOpcode() == ISD::UNDEF)
+ PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
return DAG.getNode(ISD::VSELECT, dl, VT, VMask, Op, PreservedSrc);
}
}
}
-static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
+static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
SDLoc dl(Op);
unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
-
+ EVT VT = Op.getValueType();
const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
if (IntrData) {
switch(IntrData->Type) {
case INTR_TYPE_3OP:
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
Op.getOperand(2), Op.getOperand(3));
+ case INTR_TYPE_1OP_MASK_RM: {
+ SDValue Src = Op.getOperand(1);
+ SDValue Src0 = Op.getOperand(2);
+ SDValue Mask = Op.getOperand(3);
+ SDValue RoundingMode = Op.getOperand(4);
+ return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
+ RoundingMode),
+ Mask, Src0, Subtarget, DAG);
+ }
+ case INTR_TYPE_2OP_MASK: {
+ return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Op.getOperand(1),
+ Op.getOperand(2)),
+ Op.getOperand(4), Op.getOperand(3), Subtarget, DAG);
+ }
case CMP_MASK:
case CMP_MASK_CC: {
// Comparison intrinsics with masks.
Op.getOperand(2));
}
SDValue CmpMask = getVectorMaskingNode(Cmp, Mask,
- DAG.getTargetConstant(0, MaskVT), DAG);
+ DAG.getTargetConstant(0, MaskVT),
+ Subtarget, DAG);
SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
DAG.getUNDEF(BitcastVT), CmpMask,
DAG.getIntPtrConstant(0));
case VSHIFT:
return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
Op.getOperand(1), Op.getOperand(2), DAG);
+ case VSHIFT_MASK:
+ return getVectorMaskingNode(getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
+ Op.getOperand(1), Op.getOperand(2), DAG),
+ Op.getOperand(4), Op.getOperand(3), Subtarget, DAG);
default:
break;
}
Op.getValueType(), Op.getOperand(2),
Op.getOperand(1),
Op.getOperand(3)),
- Op.getOperand(5), Op.getOperand(4), DAG);
+ Op.getOperand(5), Op.getOperand(4),
+ Subtarget, DAG);
// ptest and testp intrinsics. The intrinsic these come from are designed to
// return an integer value, not just an instruction so lower it to the ptest
Op.getOperand(1),
Op.getOperand(2),
Op.getOperand(3)),
- Op.getOperand(4), Op.getOperand(1), DAG);
+ Op.getOperand(4), Op.getOperand(1),
+ Subtarget, DAG);
else
return SDValue();
}
Cond = X86::COND_B;
break;
case ISD::SMULO:
- BaseOp = X86ISD::SMUL;
+ BaseOp = N->getValueType(0) == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL;
Cond = X86::COND_O;
break;
case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
+ if (N->getValueType(0) == MVT::i8) {
+ BaseOp = X86ISD::UMUL8;
+ Cond = X86::COND_O;
+ break;
+ }
SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
MVT::i32);
SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);
SDValue Callee = DAG.getExternalSymbol(LibcallName, TLI.getPointerTy());
Type *RetTy = isF64
- ? (Type*)StructType::get(ArgTy, ArgTy, NULL)
+ ? (Type*)StructType::get(ArgTy, ArgTy, nullptr)
: (Type*)VectorType::get(ArgTy, 4);
TargetLowering::CallLoweringInfo CLI(DAG);
case ISD::VASTART: return LowerVASTART(Op, DAG);
case ISD::VAARG: return LowerVAARG(Op, DAG);
case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
- case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+ case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, Subtarget, DAG);
case ISD::INTRINSIC_VOID:
case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
case X86ISD::ANDNP: return "X86ISD::ANDNP";
case X86ISD::PSIGN: return "X86ISD::PSIGN";
case X86ISD::BLENDI: return "X86ISD::BLENDI";
+ case X86ISD::SHRUNKBLEND: return "X86ISD::SHRUNKBLEND";
case X86ISD::SUBUS: return "X86ISD::SUBUS";
case X86ISD::HADD: return "X86ISD::HADD";
case X86ISD::HSUB: return "X86ISD::HSUB";
case X86ISD::SBB: return "X86ISD::SBB";
case X86ISD::SMUL: return "X86ISD::SMUL";
case X86ISD::UMUL: return "X86ISD::UMUL";
+ case X86ISD::SMUL8: return "X86ISD::SMUL8";
+ case X86ISD::UMUL8: return "X86ISD::UMUL8";
+ case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG";
+ case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG";
case X86ISD::INC: return "X86ISD::INC";
case X86ISD::DEC: return "X86ISD::DEC";
case X86ISD::OR: return "X86ISD::OR";
return (SVT.getVectorNumElements() == 2 ||
ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
isMOVLMask(M, SVT) ||
+ isCommutedMOVLMask(M, SVT) ||
isMOVHLPSMask(M, SVT) ||
isSHUFPMask(M, SVT) ||
+ isSHUFPMask(M, SVT, /* Commuted */ true) ||
isPSHUFDMask(M, SVT) ||
+ isPSHUFDMask(M, SVT, /* SecondOperand */ true) ||
isPSHUFHWMask(M, SVT, Subtarget->hasInt256()) ||
isPSHUFLWMask(M, SVT, Subtarget->hasInt256()) ||
isPALIGNRMask(M, SVT, Subtarget) ||
isUNPCKHMask(M, SVT, Subtarget->hasInt256()) ||
isUNPCKL_v_undef_Mask(M, SVT, Subtarget->hasInt256()) ||
isUNPCKH_v_undef_Mask(M, SVT, Subtarget->hasInt256()) ||
- isBlendMask(M, SVT, Subtarget->hasSSE41(), Subtarget->hasInt256()));
+ isBlendMask(M, SVT, Subtarget->hasSSE41(), Subtarget->hasInt256()) ||
+ (Subtarget->hasSSE41() && isINSERTPSMask(M, SVT)));
}
bool
return (isMOVLMask(Mask, SVT) ||
isCommutedMOVLMask(Mask, SVT, true) ||
isSHUFPMask(Mask, SVT) ||
- isSHUFPMask(Mask, SVT, /* Commuted */ true));
+ isSHUFPMask(Mask, SVT, /* Commuted */ true) ||
+ isBlendMask(Mask, SVT, Subtarget->hasSSE41(),
+ Subtarget->hasInt256()));
}
return false;
}
.addReg(X86::RAX);
}
} else {
- const char *StackProbeSymbol =
- Subtarget->isTargetKnownWindowsMSVC() ? "_chkstk" : "_alloca";
+ const char *StackProbeSymbol = (Subtarget->isTargetKnownWindowsMSVC() ||
+ Subtarget->isTargetWindowsItanium())
+ ? "_chkstk"
+ : "_alloca";
BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32))
.addExternalSymbol(StackProbeSymbol)
case X86::VFNMSUBPSr213r: NewFMAOpc = X86::VFNMSUBPSr231r; break;
case X86::VFNMSUBSDr213r: NewFMAOpc = X86::VFNMSUBSDr231r; break;
case X86::VFNMSUBSSr213r: NewFMAOpc = X86::VFNMSUBSSr231r; break;
+ case X86::VFMADDSUBPDr213r: NewFMAOpc = X86::VFMADDSUBPDr231r; break;
+ case X86::VFMADDSUBPSr213r: NewFMAOpc = X86::VFMADDSUBPSr231r; break;
+ case X86::VFMSUBADDPDr213r: NewFMAOpc = X86::VFMSUBADDPDr231r; break;
+ case X86::VFMSUBADDPSr213r: NewFMAOpc = X86::VFMSUBADDPSr231r; break;
+
case X86::VFMADDPDr213rY: NewFMAOpc = X86::VFMADDPDr231rY; break;
case X86::VFMADDPSr213rY: NewFMAOpc = X86::VFMADDPSr231rY; break;
case X86::VFMSUBPDr213rY: NewFMAOpc = X86::VFMSUBPDr231rY; break;
case X86::VFNMADDPSr213rY: NewFMAOpc = X86::VFNMADDPSr231rY; break;
case X86::VFNMSUBPDr213rY: NewFMAOpc = X86::VFNMSUBPDr231rY; break;
case X86::VFNMSUBPSr213rY: NewFMAOpc = X86::VFNMSUBPSr231rY; break;
+ case X86::VFMADDSUBPDr213rY: NewFMAOpc = X86::VFMADDSUBPDr231rY; break;
+ case X86::VFMADDSUBPSr213rY: NewFMAOpc = X86::VFMADDSUBPSr231rY; break;
+ case X86::VFMSUBADDPDr213rY: NewFMAOpc = X86::VFMSUBADDPDr231rY; break;
+ case X86::VFMSUBADDPSr213rY: NewFMAOpc = X86::VFMSUBADDPSr231rY; break;
default: llvm_unreachable("Unrecognized FMA variant.");
}
case X86::VFNMSUBPSr213r:
case X86::VFNMSUBSDr213r:
case X86::VFNMSUBSSr213r:
+ case X86::VFMADDSUBPDr213r:
+ case X86::VFMADDSUBPSr213r:
+ case X86::VFMSUBADDPDr213r:
+ case X86::VFMSUBADDPSr213r:
case X86::VFMADDPDr213rY:
case X86::VFMADDPSr213rY:
case X86::VFMSUBPDr213rY:
case X86::VFNMADDPSr213rY:
case X86::VFNMSUBPDr213rY:
case X86::VFNMSUBPSr213rY:
+ case X86::VFMADDSUBPDr213rY:
+ case X86::VFMADDSUBPSr213rY:
+ case X86::VFMSUBADDPDr213rY:
+ case X86::VFMSUBADDPSr213rY:
return emitFMA3Instr(MI, BB);
}
}
/// XFormVExtractWithShuffleIntoLoad - Check if a vector extract from a target
/// specific shuffle of a load can be folded into a single element load.
/// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
-/// shuffles have been customed lowered so we need to handle those here.
+/// shuffles have been custom lowered so we need to handle those here.
static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
if (DCI.isBeforeLegalizeOps())
if (!isa<ConstantSDNode>(EltNo))
return SDValue();
- EVT VT = InVec.getValueType();
+ EVT OriginalVT = InVec.getValueType();
if (InVec.getOpcode() == ISD::BITCAST) {
// Don't duplicate a load with other uses.
if (!InVec.hasOneUse())
return SDValue();
EVT BCVT = InVec.getOperand(0).getValueType();
- if (BCVT.getVectorNumElements() != VT.getVectorNumElements())
+ if (BCVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
return SDValue();
InVec = InVec.getOperand(0);
}
+ EVT CurrentVT = InVec.getValueType();
+
if (!isTargetShuffle(InVec.getOpcode()))
return SDValue();
SmallVector<int, 16> ShuffleMask;
bool UnaryShuffle;
- if (!getTargetShuffleMask(InVec.getNode(), VT.getSimpleVT(), ShuffleMask,
- UnaryShuffle))
+ if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(),
+ ShuffleMask, UnaryShuffle))
return SDValue();
// Select the input vector, guarding against out of range extract vector.
- unsigned NumElems = VT.getVectorNumElements();
+ unsigned NumElems = CurrentVT.getVectorNumElements();
int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
int Idx = (Elt > (int)NumElems) ? -1 : ShuffleMask[Elt];
SDValue LdNode = (Idx < (int)NumElems) ? InVec.getOperand(0)
SDLoc dl(N);
// Create shuffle node taking into account the case that its a unary shuffle
- SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(VT) : InVec.getOperand(1);
- Shuffle = DAG.getVectorShuffle(InVec.getValueType(), dl,
+ SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT)
+ : InVec.getOperand(1);
+ Shuffle = DAG.getVectorShuffle(CurrentVT, dl,
InVec.getOperand(0), Shuffle,
&ShuffleMask[0]);
- Shuffle = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
+ Shuffle = DAG.getNode(ISD::BITCAST, dl, OriginalVT, Shuffle);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
EltNo);
}
}
static SDValue
-TransformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG,
+transformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG,
const X86Subtarget *Subtarget) {
SDLoc dl(N);
SDValue Cond = N->getOperand(0);
Cond = CondSrc->getOperand(0);
}
- MVT VT = N->getSimpleValueType(0);
- MVT EltVT = VT.getVectorElementType();
- unsigned NumElems = VT.getVectorNumElements();
- // There is no blend with immediate in AVX-512.
- if (VT.is512BitVector())
- return SDValue();
-
- if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
- return SDValue();
- if (!Subtarget->hasInt256() && VT == MVT::v16i16)
- return SDValue();
-
if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
return SDValue();
if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
return SDValue();
+ MVT VT = N->getSimpleValueType(0);
+ unsigned NumElems = VT.getVectorNumElements();
SmallVector<int, 8> ShuffleMask(NumElems, -1);
for (unsigned i = 0; i < NumElems; ++i) {
// Be sure we emit undef where we can.
ShuffleMask[i] = i + NumElems * ((MaskValue >> i) & 1);
}
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (!TLI.isShuffleMaskLegal(ShuffleMask, VT))
+ return SDValue();
return DAG.getVectorShuffle(VT, dl, LHS, RHS, &ShuffleMask[0]);
}
return DAG.getNode(Opc, DL, VT, LHS, RHS);
}
- // Simplify vector selection if the selector will be produced by CMPP*/PCMP*.
- if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
- // Check if SETCC has already been promoted
- TLI.getSetCCResultType(*DAG.getContext(), VT) == CondVT &&
- // Check that condition value type matches vselect operand type
- CondVT == VT) {
-
+ // Simplify vector selection if condition value type matches vselect
+ // operand type
+ if (N->getOpcode() == ISD::VSELECT && CondVT == VT) {
assert(Cond.getValueType().isVector() &&
"vector select expects a vector selector!");
bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
- if (!TValIsAllOnes && !FValIsAllZeros) {
- // Try invert the condition if true value is not all 1s and false value
- // is not all 0s.
+ // Try to invert the condition if the true value is not all 1s and the
+ // false value is not all 0s.
+ if (!TValIsAllOnes && !FValIsAllZeros &&
+ // Check if the selector will be produced by CMPP*/PCMP*
+ Cond.getOpcode() == ISD::SETCC &&
+ // Check if SETCC has already been promoted
+ TLI.getSetCCResultType(*DAG.getContext(), VT) == CondVT) {
bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
}
}
- // Try to fold this VSELECT into a MOVSS/MOVSD
- if (N->getOpcode() == ISD::VSELECT &&
- Cond.getOpcode() == ISD::BUILD_VECTOR && !DCI.isBeforeLegalize()) {
- if (VT == MVT::v4i32 || VT == MVT::v4f32 ||
- (Subtarget->hasSSE2() && (VT == MVT::v2i64 || VT == MVT::v2f64))) {
- bool CanFold = false;
- unsigned NumElems = Cond.getNumOperands();
- SDValue A = LHS;
- SDValue B = RHS;
-
- if (isZero(Cond.getOperand(0))) {
- CanFold = true;
-
- // fold (vselect <0,-1,-1,-1>, A, B) -> (movss A, B)
- // fold (vselect <0,-1> -> (movsd A, B)
- for (unsigned i = 1, e = NumElems; i != e && CanFold; ++i)
- CanFold = isAllOnes(Cond.getOperand(i));
- } else if (isAllOnes(Cond.getOperand(0))) {
- CanFold = true;
- std::swap(A, B);
-
- // fold (vselect <-1,0,0,0>, A, B) -> (movss B, A)
- // fold (vselect <-1,0> -> (movsd B, A)
- for (unsigned i = 1, e = NumElems; i != e && CanFold; ++i)
- CanFold = isZero(Cond.getOperand(i));
- }
-
- if (CanFold) {
- if (VT == MVT::v4i32 || VT == MVT::v4f32)
- return getTargetShuffleNode(X86ISD::MOVSS, DL, VT, A, B, DAG);
- return getTargetShuffleNode(X86ISD::MOVSD, DL, VT, A, B, DAG);
- }
-
- if (Subtarget->hasSSE2() && (VT == MVT::v4i32 || VT == MVT::v4f32)) {
- // fold (v4i32: vselect <0,0,-1,-1>, A, B) ->
- // (v4i32 (bitcast (movsd (v2i64 (bitcast A)),
- // (v2i64 (bitcast B)))))
- //
- // fold (v4f32: vselect <0,0,-1,-1>, A, B) ->
- // (v4f32 (bitcast (movsd (v2f64 (bitcast A)),
- // (v2f64 (bitcast B)))))
- //
- // fold (v4i32: vselect <-1,-1,0,0>, A, B) ->
- // (v4i32 (bitcast (movsd (v2i64 (bitcast B)),
- // (v2i64 (bitcast A)))))
- //
- // fold (v4f32: vselect <-1,-1,0,0>, A, B) ->
- // (v4f32 (bitcast (movsd (v2f64 (bitcast B)),
- // (v2f64 (bitcast A)))))
-
- CanFold = (isZero(Cond.getOperand(0)) &&
- isZero(Cond.getOperand(1)) &&
- isAllOnes(Cond.getOperand(2)) &&
- isAllOnes(Cond.getOperand(3)));
-
- if (!CanFold && isAllOnes(Cond.getOperand(0)) &&
- isAllOnes(Cond.getOperand(1)) &&
- isZero(Cond.getOperand(2)) &&
- isZero(Cond.getOperand(3))) {
- CanFold = true;
- std::swap(LHS, RHS);
- }
-
- if (CanFold) {
- EVT NVT = (VT == MVT::v4i32) ? MVT::v2i64 : MVT::v2f64;
- SDValue NewA = DAG.getNode(ISD::BITCAST, DL, NVT, LHS);
- SDValue NewB = DAG.getNode(ISD::BITCAST, DL, NVT, RHS);
- SDValue Select = getTargetShuffleNode(X86ISD::MOVSD, DL, NVT, NewA,
- NewB, DAG);
- return DAG.getNode(ISD::BITCAST, DL, VT, Select);
- }
- }
- }
- }
-
// If we know that this node is legal then we know that it is going to be
// matched by one of the SSE/AVX BLEND instructions. These instructions only
// depend on the highest bit in each word. Try to use SimplifyDemandedBits
// build_vector of constants. This will be taken care in a later
// condition.
(TLI.isOperationLegalOrCustom(ISD::VSELECT, VT) && VT != MVT::v16i16 &&
- VT != MVT::v8i16)) {
+ VT != MVT::v8i16) &&
+ // Don't optimize vector of constants. Those are handled by
+ // the generic code and all the bits must be properly set for
+ // the generic optimizer.
+ !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits();
// Don't optimize vector selects that map to mask-registers.
if (BitWidth == 1)
return SDValue();
- // Check all uses of that condition operand to check whether it will be
- // consumed by non-BLEND instructions, which may depend on all bits are set
- // properly.
- for (SDNode::use_iterator I = Cond->use_begin(),
- E = Cond->use_end(); I != E; ++I)
- if (I->getOpcode() != ISD::VSELECT)
- // TODO: Add other opcodes eventually lowered into BLEND.
- return SDValue();
-
assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
DCI.isBeforeLegalizeOps());
if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) ||
- TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne, TLO))
- DCI.CommitTargetLoweringOpt(TLO);
+ TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne,
+ TLO)) {
+ // If we changed the computation somewhere in the DAG, this change
+ // will affect all users of Cond.
+ // Make sure it is fine and update all the nodes so that we do not
+ // use the generic VSELECT anymore. Otherwise, we may perform
+ // wrong optimizations, as we have broken the actual expectation
+ // for the vector boolean values.
+ if (Cond != TLO.Old) {
+ // Check all uses of the condition operand to see whether it will be
+ // consumed by non-BLEND instructions, which may depend on all bits
+ // being set properly.
+ for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
+ I != E; ++I)
+ if (I->getOpcode() != ISD::VSELECT)
+ // TODO: Add other opcodes eventually lowered into BLEND.
+ return SDValue();
+
+ // Update all the users of the condition, before committing the change,
+ // so that the VSELECT optimizations that expect the correct vector
+ // boolean value will not be triggered.
+ for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
+ I != E; ++I)
+ DAG.ReplaceAllUsesOfValueWith(
+ SDValue(*I, 0),
+ DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(*I), I->getValueType(0),
+ Cond, I->getOperand(1), I->getOperand(2)));
+ DCI.CommitTargetLoweringOpt(TLO);
+ return SDValue();
+ }
+ // At this point, only Cond is changed. Change the condition
+ // just for N to keep the opportunity to optimize all other
+ // users their own way.
+ DAG.ReplaceAllUsesOfValueWith(
+ SDValue(N, 0),
+ DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(N), N->getValueType(0),
+ TLO.New, N->getOperand(1), N->getOperand(2)));
+ return SDValue();
+ }
}
// We should generate an X86ISD::BLENDI from a vselect if its argument
// Iff we find this pattern and the build_vectors are built from
// constants, we translate the vselect into a shuffle_vector that we
// know will be matched by LowerVECTOR_SHUFFLEtoBlend.
- if (N->getOpcode() == ISD::VSELECT && !DCI.isBeforeLegalize()) {
- SDValue Shuffle = TransformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget);
+ if ((N->getOpcode() == ISD::VSELECT ||
+ N->getOpcode() == X86ISD::SHRUNKBLEND) &&
+ !DCI.isBeforeLegalize()) {
+ SDValue Shuffle = transformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget);
if (Shuffle.getNode())
return Shuffle;
}
SDLoc dl(Ld);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- // On Sandybridge unaligned 256bit loads are inefficient.
+ // For chips with slow 32-byte unaligned loads, break the 32-byte operation
+ // into two 16-byte operations.
ISD::LoadExtType Ext = Ld->getExtensionType();
unsigned Alignment = Ld->getAlignment();
bool IsAligned = Alignment == 0 || Alignment >= MemVT.getSizeInBits()/8;
- if (RegVT.is256BitVector() && !Subtarget->hasInt256() &&
+ if (RegVT.is256BitVector() && Subtarget->isUnalignedMem32Slow() &&
!DCI.isBeforeLegalizeOps() && !IsAligned && Ext == ISD::NON_EXTLOAD) {
unsigned NumElems = RegVT.getVectorNumElements();
if (NumElems < 2)
SDValue StoredVal = St->getOperand(1);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- // If we are saving a concatenation of two XMM registers, perform two stores.
- // On Sandy Bridge, 256-bit memory operations are executed by two
- // 128-bit ports. However, on Haswell it is better to issue a single 256-bit
- // memory operation.
+ // If we are saving a concatenation of two XMM registers and 32-byte stores
+ // are slow, such as on Sandy Bridge, perform two 16-byte stores.
unsigned Alignment = St->getAlignment();
bool IsAligned = Alignment == 0 || Alignment >= VT.getSizeInBits()/8;
- if (VT.is256BitVector() && !Subtarget->hasInt256() &&
+ if (VT.is256BitVector() && Subtarget->isUnalignedMem32Slow() &&
StVT == VT && !IsAligned) {
unsigned NumElems = VT.getVectorNumElements();
if (NumElems < 2)
static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget *Subtarget) {
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+
+ // (i8,i32 sext (sdivrem (i8 x, i8 y))) ->
+ // (i8,i32 (sdivrem_sext_hreg (i8 x, i8 y)))
+ // This exposes the sext to the sdivrem lowering, so that it directly extends
+ // from AH (which we otherwise need to do contortions to access).
+ if (N0.getOpcode() == ISD::SDIVREM && N0.getResNo() == 1 &&
+ N0.getValueType() == MVT::i8 && VT == MVT::i32) {
+ SDLoc dl(N);
+ SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
+ SDValue R = DAG.getNode(X86ISD::SDIVREM8_SEXT_HREG, dl, NodeTys,
+ N0.getOperand(0), N0.getOperand(1));
+ DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
+ return R.getValue(1);
+ }
+
if (!DCI.isBeforeLegalizeOps())
return SDValue();
if (!Subtarget->hasFp256())
return SDValue();
- EVT VT = N->getValueType(0);
if (VT.isVector() && VT.getSizeInBits() == 256) {
SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget);
if (R.getNode())
return R;
}
+ // (i8,i32 zext (udivrem (i8 x, i8 y))) ->
+ // (i8,i32 (udivrem_zext_hreg (i8 x, i8 y)))
+ // This exposes the zext to the udivrem lowering, so that it directly extends
+ // from AH (which we otherwise need to do contortions to access).
+ if (N0.getOpcode() == ISD::UDIVREM &&
+ N0.getResNo() == 1 && N0.getValueType() == MVT::i8 &&
+ (VT == MVT::i32 || VT == MVT::i64)) {
+ SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
+ SDValue R = DAG.getNode(X86ISD::UDIVREM8_ZEXT_HREG, dl, NodeTys,
+ N0.getOperand(0), N0.getOperand(1));
+ DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
+ return R.getValue(1);
+ }
+
return SDValue();
}
case ISD::EXTRACT_VECTOR_ELT:
return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, DCI);
case ISD::VSELECT:
- case ISD::SELECT: return PerformSELECTCombine(N, DAG, DCI, Subtarget);
+ case ISD::SELECT:
+ case X86ISD::SHRUNKBLEND:
+ return PerformSELECTCombine(N, DAG, DCI, Subtarget);
case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI, Subtarget);
case ISD::ADD: return PerformAddCombine(N, DAG, Subtarget);
case ISD::SUB: return PerformSubCombine(N, DAG, Subtarget);