#include "X86ISelLowering.h"
#include "Utils/X86ShuffleDecode.h"
#include "X86CallingConv.h"
+#include "X86FrameLowering.h"
#include "X86InstrBuilder.h"
#include "X86MachineFunctionInfo.h"
#include "X86TargetMachine.h"
return Insert256BitVector(V, V2, NumElems/2, DAG, dl);
}
-// FIXME: This should stop caching the target machine as soon as
-// we can remove resetOperationActions et al.
-X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM)
- : TargetLowering(TM) {
- Subtarget = &TM.getSubtarget<X86Subtarget>();
+X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
+ const X86Subtarget &STI)
+ : TargetLowering(TM), Subtarget(&STI) {
X86ScalarSSEf64 = Subtarget->hasSSE2();
X86ScalarSSEf32 = Subtarget->hasSSE1();
TD = getDataLayout();
- resetOperationActions();
-}
-
-void X86TargetLowering::resetOperationActions() {
- const TargetMachine &TM = getTargetMachine();
- static bool FirstTimeThrough = true;
-
- // If none of the target options have changed, then we don't need to reset the
- // operation actions.
- if (!FirstTimeThrough && TO == TM.Options) return;
-
- if (!FirstTimeThrough) {
- // Reinitialize the actions.
- initActions();
- FirstTimeThrough = false;
- }
-
- TO = TM.Options;
-
// Set up the TargetLowering object.
static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };
setSchedulingPreference(Sched::ILP);
else
setSchedulingPreference(Sched::RegPressure);
- const X86RegisterInfo *RegInfo =
- TM.getSubtarget<X86Subtarget>().getRegisterInfo();
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
// Bypass expensive divides on Atom when compiling with O2.
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
}
+ // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, MVT::v8i8, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
+
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i16, MVT::v8i8, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
+
// i8 and i16 vectors are custom because the source register and source
// source memory operand types are not the same width. f32 vectors are
// custom since the immediate controlling the insert encodes additional
// Custom CTPOP always performs better on natively supported v8i32
setOperationAction(ISD::CTPOP, MVT::v8i32, Custom);
+
+ // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32, MVT::v8i8, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i8, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
+
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32, MVT::v8i8, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i8, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
} else {
setOperationAction(ISD::ADD, MVT::v4i64, Custom);
setOperationAction(ISD::ADD, MVT::v8i32, Custom);
setTargetDAGCombine(ISD::FMA);
setTargetDAGCombine(ISD::SUB);
setTargetDAGCombine(ISD::LOAD);
+ setTargetDAGCombine(ISD::MLOAD);
setTargetDAGCombine(ISD::STORE);
+ setTargetDAGCombine(ISD::MSTORE);
setTargetDAGCombine(ISD::ZERO_EXTEND);
setTargetDAGCombine(ISD::ANY_EXTEND);
setTargetDAGCombine(ISD::SIGN_EXTEND);
setTargetDAGCombine(ISD::SETCC);
setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
setTargetDAGCombine(ISD::BUILD_VECTOR);
- if (Subtarget->is64Bit())
- setTargetDAGCombine(ISD::MUL);
+ setTargetDAGCombine(ISD::MUL);
setTargetDAGCombine(ISD::XOR);
computeRegisterProperties();
}
if (IsWin64) {
- const TargetFrameLowering &TFI = *MF.getSubtarget().getFrameLowering();
+ const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
// Get to the caller-allocated home save location. Add 8 to account
// for the return address.
int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
// Walk the register/memloc assignments, inserting copies/loads. In the case
// of tail call optimization arguments are handle later.
- const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
- DAG.getSubtarget().getRegisterInfo());
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
// Skip inalloca arguments, they have already been written.
ISD::ArgFlagsTy Flags = Outs[i].Flags;
Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
OpFlags);
- } else if (Subtarget->isTarget64BitILP32() && Callee->getValueType(0) == MVT::i32) {
+ } else if (Subtarget->isTarget64BitILP32() &&
+ Callee->getValueType(0) == MVT::i32) {
// Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
}
RegsToPass[i].second.getValueType()));
// Add a register mask operand representing the call-preserved registers.
- const TargetRegisterInfo *TRI = DAG.getSubtarget().getRegisterInfo();
+ const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
assert(Mask && "Missing call preserved mask for calling convention");
Ops.push_back(DAG.getRegisterMask(Mask));
unsigned
X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
SelectionDAG& DAG) const {
- MachineFunction &MF = DAG.getMachineFunction();
- const TargetMachine &TM = MF.getTarget();
- const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
- TM.getSubtargetImpl()->getRegisterInfo());
- const TargetFrameLowering &TFI = *TM.getSubtargetImpl()->getFrameLowering();
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
+ const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
unsigned StackAlignment = TFI.getStackAlignment();
uint64_t AlignMask = StackAlignment - 1;
int64_t Offset = StackSize;
return false;
} else {
unsigned Opcode = Def->getOpcode();
- if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) &&
+ if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
+ Opcode == X86::LEA64_32r) &&
Def->getOperand(1).isFI()) {
FI = Def->getOperand(1).getIndex();
Bytes = Flags.getByValSize();
// Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
// emit a special epilogue.
- const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
- DAG.getSubtarget().getRegisterInfo());
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
if (RegInfo->needsStackRealignment(MF))
return false;
// the caller's fixed stack objects.
MachineFrameInfo *MFI = MF.getFrameInfo();
const MachineRegisterInfo *MRI = &MF.getRegInfo();
- const X86InstrInfo *TII =
- static_cast<const X86InstrInfo *>(DAG.getSubtarget().getInstrInfo());
+ const X86InstrInfo *TII = Subtarget->getInstrInfo();
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
SDValue Arg = OutVals[i];
SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
- const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
- DAG.getSubtarget().getRegisterInfo());
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
int ReturnAddrIndex = FuncInfo->getRAIndex();
return false;
if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) {
- // FIXME: Support AVX-512 here.
- Type *Ty = C->getType();
- if (!Ty->isVectorTy() || (Ty->getVectorNumElements() != 16 &&
- Ty->getVectorNumElements() != 32))
- return false;
-
DecodePSHUFBMask(C, Mask);
break;
}
IsUnary = true;
break;
case X86ISD::MOVSS:
- case X86ISD::MOVSD: {
- // The index 0 always comes from the first element of the second source,
- // this is why MOVSS and MOVSD are used in the first place. The other
- // elements come from the other positions of the first source vector
- Mask.push_back(NumElems);
- for (unsigned i = 1; i != NumElems; ++i) {
- Mask.push_back(i);
- }
+ case X86ISD::MOVSD:
+ DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask);
break;
- }
case X86ISD::VPERM2X128:
ImmN = N->getOperand(N->getNumOperands()-1);
DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
break;
case X86ISD::MOVSLDUP:
DecodeMOVSLDUPMask(VT, Mask);
+ IsUnary = true;
break;
case X86ISD::MOVSHDUP:
DecodeMOVSHDUPMask(VT, Mask);
+ IsUnary = true;
break;
case X86ISD::MOVDDUP:
+ DecodeMOVDDUPMask(VT, Mask);
+ IsUnary = true;
+ break;
case X86ISD::MOVLHPD:
case X86ISD::MOVLPD:
case X86ISD::MOVLPS:
return DAG.getNode(ISD::BITCAST, SDLoc(Op), VT, Result);
}
-/// getVShift - Return a vector logical shift node.
-///
+/// Return a vector logical shift node.
static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
unsigned NumBits, SelectionDAG &DAG,
const TargetLowering &TLI, SDLoc dl) {
assert(VT.is128BitVector() && "Unknown type for VShift");
- EVT ShVT = MVT::v2i64;
+ MVT ShVT = MVT::v2i64;
unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp);
+ MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(SrcOp.getValueType());
+ SDValue ShiftVal = DAG.getConstant(NumBits, ScalarShiftTy);
return DAG.getNode(ISD::BITCAST, dl, VT,
- DAG.getNode(Opc, dl, ShVT, SrcOp,
- DAG.getConstant(NumBits,
- TLI.getScalarShiftAmountTy(SrcOp.getValueType()))));
+ DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
}
static SDValue
return SDValue();
}
-/// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a
-/// vector of type 'VT', see if the elements can be replaced by a single large
-/// load which has the same value as a build_vector whose operands are 'elts'.
+/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
+/// elements can be replaced by a single large load which has the same value as
+/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
///
/// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
///
/// FIXME: we'd also like to handle the case where the last elements are zero
/// rather than undef via VZEXT_LOAD, but we do not detect that case today.
/// There's even a handy isZeroNode for that purpose.
-static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
+static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
SDLoc &DL, SelectionDAG &DAG,
bool isAfterLegalize) {
- EVT EltVT = VT.getVectorElementType();
unsigned NumElems = Elts.size();
LoadSDNode *LDBase = nullptr;
// non-consecutive, bail out.
for (unsigned i = 0; i < NumElems; ++i) {
SDValue Elt = Elts[i];
-
+ // Look through a bitcast.
+ if (Elt.getNode() && Elt.getOpcode() == ISD::BITCAST)
+ Elt = Elt.getOperand(0);
if (!Elt.getNode() ||
(Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
return SDValue();
continue;
LoadSDNode *LD = cast<LoadSDNode>(Elt);
- if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i))
+ EVT LdVT = Elt.getValueType();
+ // Each loaded element must be the correct fractional portion of the
+ // requested vector load.
+ if (LdVT.getSizeInBits() != VT.getSizeInBits() / NumElems)
+ return SDValue();
+ if (!DAG.isConsecutiveLoad(LD, LDBase, LdVT.getSizeInBits() / 8, i))
return SDValue();
LastLoadedElt = i;
}
// load of the entire vector width starting at the base pointer. If we found
// consecutive loads for the low half, generate a vzext_load node.
if (LastLoadedElt == NumElems - 1) {
+ assert(LDBase && "Did not find base load for merging consecutive loads");
+ EVT EltVT = LDBase->getValueType(0);
+ // Ensure that the input vector size for the merged loads matches the
+ // cumulative size of the input elements.
+ if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
+ return SDValue();
if (isAfterLegalize &&
!DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, VT))
//TODO: The code below fires only for for loading the low v2i32 / v2f32
//of a v4i32 / v4f32. It's probably worth generalizing.
+ EVT EltVT = VT.getVectorElementType();
if (NumElems == 4 && LastLoadedElt == 1 && (EltVT.getSizeInBits() == 32) &&
DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
return Sh;
// For SSE 4.1, use insertps to put the high elements into the low element.
- if (getSubtarget()->hasSSE41()) {
+ if (Subtarget->hasSSE41()) {
SDValue Result;
if (Op.getOperand(0).getOpcode() != ISD::UNDEF)
Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
if (Mask[i] == -1)
continue; // Valid anywhere but doesn't tell us anything.
if (i % Scale != 0) {
- // Each of the extend elements needs to be zeroable.
+ // Each of the extended elements need to be zeroable.
if (!Zeroable[i])
return SDValue();
- // We no lorger are in the anyext case.
+ // We no longer are in the anyext case.
AnyExt = false;
continue;
}
return SDValue(); // Flip-flopping inputs.
if (Mask[i] % NumElements != i / Scale)
- return SDValue(); // Non-consecutive strided elemenst.
+ return SDValue(); // Non-consecutive strided elements.
}
// If we fail to find an input, we have a zero-shuffle which should always
// many elements.
for (; NumExtElements < NumElements; NumExtElements *= 2) {
assert(NumElements % NumExtElements == 0 &&
- "The input vector size must be divisble by the extended size.");
+ "The input vector size must be divisible by the extended size.");
if (SDValue V = Lower(NumElements / NumExtElements))
return V;
}
return DAG.getNode(X86ISD::VBROADCAST, DL, VT, V);
}
+// Check for whether we can use INSERTPS to perform the shuffle. We only use
+// INSERTPS when the V1 elements are already in the correct locations
+// because otherwise we can just always use two SHUFPS instructions which
+// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
+// perform INSERTPS if a single V1 element is out of place and all V2
+// elements are zeroable.
+static SDValue lowerVectorShuffleAsInsertPS(SDValue Op, SDValue V1, SDValue V2,
+ ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
+ assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!");
+ assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
+ assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
+
+ SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+
+ unsigned ZMask = 0;
+ int V1DstIndex = -1;
+ int V2DstIndex = -1;
+ bool V1UsedInPlace = false;
+
+ for (int i = 0; i < 4; i++) {
+ // Synthesize a zero mask from the zeroable elements (includes undefs).
+ if (Zeroable[i]) {
+ ZMask |= 1 << i;
+ continue;
+ }
+
+ // Flag if we use any V1 inputs in place.
+ if (i == Mask[i]) {
+ V1UsedInPlace = true;
+ continue;
+ }
+
+ // We can only insert a single non-zeroable element.
+ if (V1DstIndex != -1 || V2DstIndex != -1)
+ return SDValue();
+
+ if (Mask[i] < 4) {
+ // V1 input out of place for insertion.
+ V1DstIndex = i;
+ } else {
+ // V2 input for insertion.
+ V2DstIndex = i;
+ }
+ }
+
+ // Don't bother if we have no (non-zeroable) element for insertion.
+ if (V1DstIndex == -1 && V2DstIndex == -1)
+ return SDValue();
+
+ // Determine element insertion src/dst indices. The src index is from the
+ // start of the inserted vector, not the start of the concatenated vector.
+ unsigned V2SrcIndex = 0;
+ if (V1DstIndex != -1) {
+ // If we have a V1 input out of place, we use V1 as the V2 element insertion
+ // and don't use the original V2 at all.
+ V2SrcIndex = Mask[V1DstIndex];
+ V2DstIndex = V1DstIndex;
+ V2 = V1;
+ } else {
+ V2SrcIndex = Mask[V2DstIndex] - 4;
+ }
+
+ // If no V1 inputs are used in place, then the result is created only from
+ // the zero mask and the V2 insertion - so remove V1 dependency.
+ if (!V1UsedInPlace)
+ V1 = DAG.getUNDEF(MVT::v4f32);
+
+ unsigned InsertPSMask = V2SrcIndex << 6 | V2DstIndex << 4 | ZMask;
+ assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
+
+ // Insert the V2 element into the desired position.
+ SDLoc DL(Op);
+ return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
+ DAG.getConstant(InsertPSMask, MVT::i8));
+}
+
/// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
///
/// This is the basis function for the 2-lane 64-bit shuffles as we have full
assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
if (isSingleInputShuffleMask(Mask)) {
+ // Use low duplicate instructions for masks that match their pattern.
+ if (Subtarget->hasSSE3())
+ if (isShuffleEquivalent(Mask, 0, 0))
+ return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, V1);
+
// Straight shuffle of a single input vector. Simulate this by using the
// single input as both of the "inputs" to this instruction..
unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
Mask, Subtarget, DAG))
return Broadcast;
+ // Use even/odd duplicate instructions for masks that match their pattern.
+ if (Subtarget->hasSSE3()) {
+ if (isShuffleEquivalent(Mask, 0, 0, 2, 2))
+ return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
+ if (isShuffleEquivalent(Mask, 1, 1, 3, 3))
+ return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
+ }
+
if (Subtarget->hasAVX()) {
// If we have AVX, we can use VPERMILPS which will allow folding a load
// into the shuffle.
Mask, Subtarget, DAG))
return V;
- if (Subtarget->hasSSE41())
+ if (Subtarget->hasSSE41()) {
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
Subtarget, DAG))
return Blend;
- // Check for whether we can use INSERTPS to perform the blend. We only use
- // INSERTPS when the V1 elements are already in the correct locations
- // because otherwise we can just always use two SHUFPS instructions which
- // are much smaller to encode than a SHUFPS and an INSERTPS.
- if (NumV2Elements == 1 && Subtarget->hasSSE41()) {
- int V2Index =
- std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) -
- Mask.begin();
-
- // When using INSERTPS we can zero any lane of the destination. Collect
- // the zero inputs into a mask and drop them from the lanes of V1 which
- // actually need to be present as inputs to the INSERTPS.
- SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
-
- // Synthesize a shuffle mask for the non-zero and non-v2 inputs.
- bool InsertNeedsShuffle = false;
- unsigned ZMask = 0;
- for (int i = 0; i < 4; ++i)
- if (i != V2Index) {
- if (Zeroable[i]) {
- ZMask |= 1 << i;
- } else if (Mask[i] != i) {
- InsertNeedsShuffle = true;
- break;
- }
- }
-
- // We don't want to use INSERTPS or other insertion techniques if it will
- // require shuffling anyways.
- if (!InsertNeedsShuffle) {
- // If all of V1 is zeroable, replace it with undef.
- if ((ZMask | 1 << V2Index) == 0xF)
- V1 = DAG.getUNDEF(MVT::v4f32);
-
- unsigned InsertPSMask = (Mask[V2Index] - 4) << 6 | V2Index << 4 | ZMask;
- assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
-
- // Insert the V2 element into the desired position.
- return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
- DAG.getConstant(InsertPSMask, MVT::i8));
- }
+ // Use INSERTPS if we can complete the shuffle efficiently.
+ if (SDValue V = lowerVectorShuffleAsInsertPS(Op, V1, V2, Mask, DAG))
+ return V;
}
// Otherwise fall back to a SHUFPS lowering strategy.
V2InUse |= (ZeroMask != V2Idx);
}
}
- assert((V1InUse || V2InUse) && "Shuffling to a zeroable vector");
if (V1InUse)
V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V1,
return V1; // Single inputs are easy.
if (V2InUse)
return V2; // Single inputs are easy.
+ // Shuffling to a zeroable vector.
+ return getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
}
// There are special ways we can lower some single-element blends.
Mask, Subtarget, DAG))
return Broadcast;
+ // Use low duplicate instructions for masks that match their pattern.
+ if (isShuffleEquivalent(Mask, 0, 0, 2, 2))
+ return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
+
if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
// Non-half-crossing single input shuffles can be lowerid with an
// interleaved permutation.
if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
assert(RepeatedMask.size() == 4 &&
"Repeated masks must be half the mask width!");
+
+ // Use even/odd duplicate instructions for masks that match their pattern.
+ if (isShuffleEquivalent(Mask, 0, 0, 2, 2, 4, 4, 6, 6))
+ return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
+ if (isShuffleEquivalent(Mask, 1, 1, 3, 3, 5, 5, 7, 7))
+ return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
+
if (isSingleInputShuffleMask(Mask))
return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
getV4X86ShuffleImm8ForMask(RepeatedMask, DAG));
ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
+ // X86 has dedicated unpack instructions that can handle specific blend
+ // operations: UNPCKH and UNPCKL.
+ if (isShuffleEquivalent(Mask, 0, 8, 2, 10, 4, 12, 6, 14))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f64, V1, V2);
+ if (isShuffleEquivalent(Mask, 1, 9, 3, 11, 5, 13, 7, 15))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f64, V1, V2);
+
// FIXME: Implement direct support for this type!
return splitAndLowerVectorShuffle(DL, MVT::v8f64, V1, V2, Mask, DAG);
}
ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (isShuffleEquivalent(Mask,
+ 0, 16, 1, 17, 4, 20, 5, 21,
+ 8, 24, 9, 25, 12, 28, 13, 29))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16f32, V1, V2);
+ if (isShuffleEquivalent(Mask,
+ 2, 18, 3, 19, 6, 22, 7, 23,
+ 10, 26, 11, 27, 14, 30, 15, 31))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16f32, V1, V2);
+
// FIXME: Implement direct support for this type!
return splitAndLowerVectorShuffle(DL, MVT::v16f32, V1, V2, Mask, DAG);
}
ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
+ // X86 has dedicated unpack instructions that can handle specific blend
+ // operations: UNPCKH and UNPCKL.
+ if (isShuffleEquivalent(Mask, 0, 8, 2, 10, 4, 12, 6, 14))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i64, V1, V2);
+ if (isShuffleEquivalent(Mask, 1, 9, 3, 11, 5, 13, 7, 15))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i64, V1, V2);
+
// FIXME: Implement direct support for this type!
return splitAndLowerVectorShuffle(DL, MVT::v8i64, V1, V2, Mask, DAG);
}
ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (isShuffleEquivalent(Mask,
+ 0, 16, 1, 17, 4, 20, 5, 21,
+ 8, 24, 9, 25, 12, 28, 13, 29))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i32, V1, V2);
+ if (isShuffleEquivalent(Mask,
+ 2, 18, 3, 19, 6, 22, 7, 23,
+ 10, 26, 11, 27, 14, 30, 15, 31))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i32, V1, V2);
+
// FIXME: Implement direct support for this type!
return splitAndLowerVectorShuffle(DL, MVT::v16i32, V1, V2, Mask, DAG);
}
MVT EltVT = Op.getSimpleValueType();
assert((EltVT == MVT::i1) && "Unexpected operands in ExtractBitFromMaskVector");
+ assert((VecVT.getVectorNumElements() <= 16 || Subtarget->hasBWI()) &&
+ "Unexpected vector type in ExtractBitFromMaskVector");
// variable index can't be handled in mask registers,
// extend vector to VR512
unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
const TargetRegisterClass* rc = getRegClassFor(VecVT);
+ if (!Subtarget->hasDQI() && (VecVT.getVectorNumElements() <= 8))
+ rc = getRegClassFor(MVT::v16i1);
unsigned MaxSift = rc->getSize()*8 - 1;
Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
DAG.getConstant(MaxSift - IdxVal, MVT::i8));
// the upper bits of a vector.
static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
- if (Subtarget->hasFp256()) {
- SDLoc dl(Op.getNode());
- SDValue Vec = Op.getNode()->getOperand(0);
- SDValue SubVec = Op.getNode()->getOperand(1);
- SDValue Idx = Op.getNode()->getOperand(2);
-
- if ((Op.getNode()->getSimpleValueType(0).is256BitVector() ||
- Op.getNode()->getSimpleValueType(0).is512BitVector()) &&
- SubVec.getNode()->getSimpleValueType(0).is128BitVector() &&
- isa<ConstantSDNode>(Idx)) {
- unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
- return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
- }
+ if (!Subtarget->hasAVX())
+ return SDValue();
- if (Op.getNode()->getSimpleValueType(0).is512BitVector() &&
- SubVec.getNode()->getSimpleValueType(0).is256BitVector() &&
- isa<ConstantSDNode>(Idx)) {
- unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
- return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl);
+ SDLoc dl(Op);
+ SDValue Vec = Op.getOperand(0);
+ SDValue SubVec = Op.getOperand(1);
+ SDValue Idx = Op.getOperand(2);
+
+ if (!isa<ConstantSDNode>(Idx))
+ return SDValue();
+
+ unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+ MVT OpVT = Op.getSimpleValueType();
+ MVT SubVecVT = SubVec.getSimpleValueType();
+
+ // Fold two 16-byte subvector loads into one 32-byte load:
+ // (insert_subvector (insert_subvector undef, (load addr), 0),
+ // (load addr + 16), Elts/2)
+ // --> load32 addr
+ if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
+ Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
+ OpVT.is256BitVector() && SubVecVT.is128BitVector() &&
+ !Subtarget->isUnalignedMem32Slow()) {
+ SDValue SubVec2 = Vec.getOperand(1);
+ if (auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2))) {
+ if (Idx2->getZExtValue() == 0) {
+ SDValue Ops[] = { SubVec2, SubVec };
+ SDValue LD = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false);
+ if (LD.getNode())
+ return LD;
+ }
}
}
+
+ if ((OpVT.is256BitVector() || OpVT.is512BitVector()) &&
+ SubVecVT.is128BitVector())
+ return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
+
+ if (OpVT.is512BitVector() && SubVecVT.is256BitVector())
+ return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl);
+
return SDValue();
}
SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
Chain = SP.getValue(1);
unsigned Align = cast<ConstantSDNode>(Tmp3)->getZExtValue();
- const TargetFrameLowering &TFI = *DAG.getSubtarget().getFrameLowering();
+ const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
unsigned StackAlign = TFI.getStackAlignment();
Tmp1 = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
if (Align > StackAlign)
Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag);
- const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
- DAG.getSubtarget().getRegisterInfo());
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
unsigned SPReg = RegInfo->getStackRegister();
SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
Chain = SP.getValue(1);
}
const X86Subtarget &Subtarget =
- DAG.getTarget().getSubtarget<X86Subtarget>();
+ static_cast<const X86Subtarget &>(DAG.getSubtarget());
if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
// Let the shuffle legalizer expand this shift amount node.
return DAG.getNode(X86ISD::SELECT, dl, VT, IMask, Op, PreservedSrc);
}
-static unsigned getOpcodeForFMAIntrinsic(unsigned IntNo) {
- switch (IntNo) {
- default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
- case Intrinsic::x86_fma_vfmadd_ps:
- case Intrinsic::x86_fma_vfmadd_pd:
- case Intrinsic::x86_fma_vfmadd_ps_256:
- case Intrinsic::x86_fma_vfmadd_pd_256:
- case Intrinsic::x86_fma_mask_vfmadd_ps_512:
- case Intrinsic::x86_fma_mask_vfmadd_pd_512:
- return X86ISD::FMADD;
- case Intrinsic::x86_fma_vfmsub_ps:
- case Intrinsic::x86_fma_vfmsub_pd:
- case Intrinsic::x86_fma_vfmsub_ps_256:
- case Intrinsic::x86_fma_vfmsub_pd_256:
- case Intrinsic::x86_fma_mask_vfmsub_ps_512:
- case Intrinsic::x86_fma_mask_vfmsub_pd_512:
- return X86ISD::FMSUB;
- case Intrinsic::x86_fma_vfnmadd_ps:
- case Intrinsic::x86_fma_vfnmadd_pd:
- case Intrinsic::x86_fma_vfnmadd_ps_256:
- case Intrinsic::x86_fma_vfnmadd_pd_256:
- case Intrinsic::x86_fma_mask_vfnmadd_ps_512:
- case Intrinsic::x86_fma_mask_vfnmadd_pd_512:
- return X86ISD::FNMADD;
- case Intrinsic::x86_fma_vfnmsub_ps:
- case Intrinsic::x86_fma_vfnmsub_pd:
- case Intrinsic::x86_fma_vfnmsub_ps_256:
- case Intrinsic::x86_fma_vfnmsub_pd_256:
- case Intrinsic::x86_fma_mask_vfnmsub_ps_512:
- case Intrinsic::x86_fma_mask_vfnmsub_pd_512:
- return X86ISD::FNMSUB;
- case Intrinsic::x86_fma_vfmaddsub_ps:
- case Intrinsic::x86_fma_vfmaddsub_pd:
- case Intrinsic::x86_fma_vfmaddsub_ps_256:
- case Intrinsic::x86_fma_vfmaddsub_pd_256:
- case Intrinsic::x86_fma_mask_vfmaddsub_ps_512:
- case Intrinsic::x86_fma_mask_vfmaddsub_pd_512:
- return X86ISD::FMADDSUB;
- case Intrinsic::x86_fma_vfmsubadd_ps:
- case Intrinsic::x86_fma_vfmsubadd_pd:
- case Intrinsic::x86_fma_vfmsubadd_ps_256:
- case Intrinsic::x86_fma_vfmsubadd_pd_256:
- case Intrinsic::x86_fma_mask_vfmsubadd_ps_512:
- case Intrinsic::x86_fma_mask_vfmsubadd_pd_512:
- return X86ISD::FMSUBADD;
- }
-}
-
static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
SDLoc dl(Op);
Mask, Src0, Subtarget, DAG);
}
case INTR_TYPE_2OP_MASK: {
- return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Op.getOperand(1),
+ SDValue Mask = Op.getOperand(4);
+ SDValue PassThru = Op.getOperand(3);
+ unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
+ if (IntrWithRoundingModeOpcode != 0) {
+ unsigned Round = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue();
+ if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) {
+ return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
+ dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2),
+ Op.getOperand(3), Op.getOperand(5)),
+ Mask, PassThru, Subtarget, DAG);
+ }
+ }
+ return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
+ Op.getOperand(1),
Op.getOperand(2)),
- Op.getOperand(4), Op.getOperand(3), Subtarget, DAG);
+ Mask, PassThru, Subtarget, DAG);
+ }
+ case FMA_OP_MASK: {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue Src3 = Op.getOperand(3);
+ SDValue Mask = Op.getOperand(4);
+ unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
+ if (IntrWithRoundingModeOpcode != 0) {
+ SDValue Rnd = Op.getOperand(5);
+ if (cast<ConstantSDNode>(Rnd)->getZExtValue() !=
+ X86::STATIC_ROUNDING::CUR_DIRECTION)
+ return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
+ dl, Op.getValueType(),
+ Src1, Src2, Src3, Rnd),
+ Mask, Src1, Subtarget, DAG);
+ }
+ return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
+ dl, Op.getValueType(),
+ Src1, Src2, Src3),
+ Mask, Src1, Subtarget, DAG);
}
case CMP_MASK:
case CMP_MASK_CC: {
return DAG.getNode(IntrData->Opc0, dl, VT, VMask, Op.getOperand(1),
Op.getOperand(2));
}
- case FMA_OP_MASK:
- {
- return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
- dl, Op.getValueType(),
- Op.getOperand(1),
- Op.getOperand(2),
- Op.getOperand(3)),
- Op.getOperand(4), Op.getOperand(1),
- Subtarget, DAG);
- }
default:
break;
}
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
return DAG.getNode(Opcode, dl, VTs, NewOps);
}
-
- case Intrinsic::x86_fma_mask_vfmadd_ps_512:
- case Intrinsic::x86_fma_mask_vfmadd_pd_512:
- case Intrinsic::x86_fma_mask_vfmsub_ps_512:
- case Intrinsic::x86_fma_mask_vfmsub_pd_512:
- case Intrinsic::x86_fma_mask_vfnmadd_ps_512:
- case Intrinsic::x86_fma_mask_vfnmadd_pd_512:
- case Intrinsic::x86_fma_mask_vfnmsub_ps_512:
- case Intrinsic::x86_fma_mask_vfnmsub_pd_512:
- case Intrinsic::x86_fma_mask_vfmaddsub_ps_512:
- case Intrinsic::x86_fma_mask_vfmaddsub_pd_512:
- case Intrinsic::x86_fma_mask_vfmsubadd_ps_512:
- case Intrinsic::x86_fma_mask_vfmsubadd_pd_512: {
- auto *SAE = cast<ConstantSDNode>(Op.getOperand(5));
- if (SAE->getZExtValue() == X86::STATIC_ROUNDING::CUR_DIRECTION)
- return getVectorMaskingNode(DAG.getNode(getOpcodeForFMAIntrinsic(IntNo),
- dl, Op.getValueType(),
- Op.getOperand(1),
- Op.getOperand(2),
- Op.getOperand(3)),
- Op.getOperand(4), Op.getOperand(1),
- Subtarget, DAG);
- else
- return SDValue();
- }
-
- case Intrinsic::x86_fma_vfmadd_ps:
- case Intrinsic::x86_fma_vfmadd_pd:
- case Intrinsic::x86_fma_vfmsub_ps:
- case Intrinsic::x86_fma_vfmsub_pd:
- case Intrinsic::x86_fma_vfnmadd_ps:
- case Intrinsic::x86_fma_vfnmadd_pd:
- case Intrinsic::x86_fma_vfnmsub_ps:
- case Intrinsic::x86_fma_vfnmsub_pd:
- case Intrinsic::x86_fma_vfmaddsub_ps:
- case Intrinsic::x86_fma_vfmaddsub_pd:
- case Intrinsic::x86_fma_vfmsubadd_ps:
- case Intrinsic::x86_fma_vfmsubadd_pd:
- case Intrinsic::x86_fma_vfmadd_ps_256:
- case Intrinsic::x86_fma_vfmadd_pd_256:
- case Intrinsic::x86_fma_vfmsub_ps_256:
- case Intrinsic::x86_fma_vfmsub_pd_256:
- case Intrinsic::x86_fma_vfnmadd_ps_256:
- case Intrinsic::x86_fma_vfnmadd_pd_256:
- case Intrinsic::x86_fma_vfnmsub_ps_256:
- case Intrinsic::x86_fma_vfnmsub_pd_256:
- case Intrinsic::x86_fma_vfmaddsub_ps_256:
- case Intrinsic::x86_fma_vfmaddsub_pd_256:
- case Intrinsic::x86_fma_vfmsubadd_ps_256:
- case Intrinsic::x86_fma_vfmsubadd_pd_256:
- return DAG.getNode(getOpcodeForFMAIntrinsic(IntNo), dl, Op.getValueType(),
- Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
}
}
if (Depth > 0) {
SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
- const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
- DAG.getSubtarget().getRegisterInfo());
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), PtrVT);
return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
DAG.getNode(ISD::ADD, dl, PtrVT,
EVT VT = Op.getValueType();
SDLoc dl(Op); // FIXME probably not meaningful
unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
- const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
- DAG.getSubtarget().getRegisterInfo());
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
unsigned FrameReg = RegInfo->getPtrSizedFrameRegister(
DAG.getMachineFunction());
assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
SelectionDAG &DAG) const {
- const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
- DAG.getSubtarget().getRegisterInfo());
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize());
}
SDLoc dl (Op);
EVT PtrVT = getPointerTy();
- const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
- DAG.getSubtarget().getRegisterInfo());
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
(FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
SDLoc dl (Op);
const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
- const TargetRegisterInfo *TRI = DAG.getSubtarget().getRegisterInfo();
+ const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
if (Subtarget->is64Bit()) {
SDValue OutChains[6];
*/
MachineFunction &MF = DAG.getMachineFunction();
- const TargetMachine &TM = MF.getTarget();
- const TargetFrameLowering &TFI = *TM.getSubtargetImpl()->getFrameLowering();
+ const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
unsigned StackAlignment = TFI.getStackAlignment();
MVT VT = Op.getSimpleValueType();
SDLoc DL(Op);
/// Used to know whether to use cmpxchg8/16b when expanding atomic operations
/// (otherwise we leave them alone to become __sync_fetch_and_... calls).
bool X86TargetLowering::needsCmpXchgNb(const Type *MemType) const {
- const X86Subtarget &Subtarget =
- getTargetMachine().getSubtarget<X86Subtarget>();
unsigned OpWidth = MemType->getPrimitiveSizeInBits();
if (OpWidth == 64)
- return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
+ return !Subtarget->is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
else if (OpWidth == 128)
- return Subtarget.hasCmpxchg16b();
+ return Subtarget->hasCmpxchg16b();
else
return false;
}
}
bool X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
- const X86Subtarget &Subtarget =
- getTargetMachine().getSubtarget<X86Subtarget>();
- unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
+ unsigned NativeWidth = Subtarget->is64Bit() ? 64 : 32;
const Type *MemType = AI->getType();
// If the operand is too big, we must see if cmpxchg8/16b is available
LoadInst *
X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
- const X86Subtarget &Subtarget =
- getTargetMachine().getSubtarget<X86Subtarget>();
- unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
+ unsigned NativeWidth = Subtarget->is64Bit() ? 64 : 32;
const Type *MemType = AI->getType();
// Accesses larger than the native width are turned into cmpxchg/libcalls, so
// there is no benefit in turning such RMWs into loads, and it is actually
// FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
// the IR level, so we must wrap it in an intrinsic.
return nullptr;
- } else if (hasMFENCE(Subtarget)) {
+ } else if (hasMFENCE(*Subtarget)) {
Function *MFence = llvm::Intrinsic::getDeclaration(M,
Intrinsic::x86_sse2_mfence);
Builder.CreateCall(MFence);
switch (N->getOpcode()) {
default:
llvm_unreachable("Do not know how to custom type legalize this operation!");
+ // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
+ case X86ISD::FMINC:
+ case X86ISD::FMIN:
+ case X86ISD::FMAXC:
+ case X86ISD::FMAX: {
+ EVT VT = N->getValueType(0);
+ if (VT != MVT::v2f32)
+ llvm_unreachable("Unexpected type (!= v2f32) on FMIN/FMAX.");
+ SDValue UNDEF = DAG.getUNDEF(VT);
+ SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
+ N->getOperand(0), UNDEF);
+ SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
+ N->getOperand(1), UNDEF);
+ Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
+ return;
+ }
case ISD::SIGN_EXTEND_INREG:
case ISD::ADDC:
case ISD::ADDE:
case X86ISD::COMPRESS: return "X86ISD::COMPRESS";
case X86ISD::EXPAND: return "X86ISD::EXPAND";
case X86ISD::SELECT: return "X86ISD::SELECT";
+ case X86ISD::ADDSUB: return "X86ISD::ADDSUB";
+ case X86ISD::RCP28: return "X86ISD::RCP28";
+ case X86ISD::RSQRT28: return "X86ISD::RSQRT28";
}
}
return BB;
}
-static MachineBasicBlock * EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB,
- const TargetInstrInfo *TII,
- const X86Subtarget* Subtarget) {
+static MachineBasicBlock *EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB,
+ const X86Subtarget *Subtarget) {
DebugLoc dl = MI->getDebugLoc();
-
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
// Address into RAX/EAX, other two args into ECX, EDX.
unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r;
unsigned MemReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
}
MachineBasicBlock *
-X86TargetLowering::EmitVAARG64WithCustomInserter(
- MachineInstr *MI,
- MachineBasicBlock *MBB) const {
+X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr *MI,
+ MachineBasicBlock *MBB) const {
// Emit va_arg instruction on X86-64.
// Operands to this pseudo-instruction:
MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
// Machine Information
- const TargetInstrInfo *TII = MBB->getParent()->getSubtarget().getInstrInfo();
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
XMMSaveMBB->addSuccessor(EndMBB);
// Now add the instructions.
- const TargetInstrInfo *TII = MBB->getParent()->getSubtarget().getInstrInfo();
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
DebugLoc DL = MI->getDebugLoc();
unsigned CountReg = MI->getOperand(0).getReg();
MachineBasicBlock *
X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
MachineBasicBlock *BB) const {
- const TargetInstrInfo *TII = BB->getParent()->getSubtarget().getInstrInfo();
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
DebugLoc DL = MI->getDebugLoc();
// To "insert" a SELECT_CC instruction, we actually have to insert the
// If the EFLAGS register isn't dead in the terminator, then claim that it's
// live into the sink and copy blocks.
- const TargetRegisterInfo *TRI =
- BB->getParent()->getSubtarget().getRegisterInfo();
+ const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
if (!MI->killsRegister(X86::EFLAGS) &&
!checkAndUpdateEFLAGSKill(MI, BB, TRI)) {
copy0MBB->addLiveIn(X86::EFLAGS);
X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI,
MachineBasicBlock *BB) const {
MachineFunction *MF = BB->getParent();
- const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
DebugLoc DL = MI->getDebugLoc();
const BasicBlock *LLVM_BB = BB->getBasicBlock();
BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
// Calls into a routine in libgcc to allocate more space from the heap.
- const uint32_t *RegMask = MF->getTarget()
- .getSubtargetImpl()
- ->getRegisterInfo()
- ->getCallPreservedMask(CallingConv::C);
+ const uint32_t *RegMask =
+ Subtarget->getRegisterInfo()->getCallPreservedMask(CallingConv::C);
if (IsLP64) {
BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
.addReg(sizeVReg);
MachineBasicBlock *
X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI,
MachineBasicBlock *BB) const {
- const TargetInstrInfo *TII = BB->getParent()->getSubtarget().getInstrInfo();
DebugLoc DL = MI->getDebugLoc();
assert(!Subtarget->isTargetMachO());
- // The lowering is pretty easy: we're just emitting the call to _alloca. The
- // non-trivial part is impdef of ESP.
-
- if (Subtarget->isTargetWin64()) {
- if (Subtarget->isTargetCygMing()) {
- // ___chkstk(Mingw64):
- // Clobbers R10, R11, RAX and EFLAGS.
- // Updates RSP.
- BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA))
- .addExternalSymbol("___chkstk")
- .addReg(X86::RAX, RegState::Implicit)
- .addReg(X86::RSP, RegState::Implicit)
- .addReg(X86::RAX, RegState::Define | RegState::Implicit)
- .addReg(X86::RSP, RegState::Define | RegState::Implicit)
- .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
- } else {
- // __chkstk(MSVCRT): does not update stack pointer.
- // Clobbers R10, R11 and EFLAGS.
- BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA))
- .addExternalSymbol("__chkstk")
- .addReg(X86::RAX, RegState::Implicit)
- .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
- // RAX has the offset to be subtracted from RSP.
- BuildMI(*BB, MI, DL, TII->get(X86::SUB64rr), X86::RSP)
- .addReg(X86::RSP)
- .addReg(X86::RAX);
- }
- } else {
- const char *StackProbeSymbol = (Subtarget->isTargetKnownWindowsMSVC() ||
- Subtarget->isTargetWindowsItanium())
- ? "_chkstk"
- : "_alloca";
-
- BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32))
- .addExternalSymbol(StackProbeSymbol)
- .addReg(X86::EAX, RegState::Implicit)
- .addReg(X86::ESP, RegState::Implicit)
- .addReg(X86::EAX, RegState::Define | RegState::Implicit)
- .addReg(X86::ESP, RegState::Define | RegState::Implicit)
- .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
- }
+ X86FrameLowering::emitStackProbeCall(*BB->getParent(), *BB, MI, DL);
MI->eraseFromParent(); // The pseudo instruction is gone now.
return BB;
// or EAX and doing an indirect call. The return value will then
// be in the normal return register.
MachineFunction *F = BB->getParent();
- const X86InstrInfo *TII =
- static_cast<const X86InstrInfo *>(F->getSubtarget().getInstrInfo());
+ const X86InstrInfo *TII = Subtarget->getInstrInfo();
DebugLoc DL = MI->getDebugLoc();
assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?");
// Get a register mask for the lowered call.
// FIXME: The 32-bit calls have non-standard calling conventions. Use a
// proper register mask.
- const uint32_t *RegMask = F->getTarget()
- .getSubtargetImpl()
- ->getRegisterInfo()
- ->getCallPreservedMask(CallingConv::C);
+ const uint32_t *RegMask =
+ Subtarget->getRegisterInfo()->getCallPreservedMask(CallingConv::C);
if (Subtarget->is64Bit()) {
MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
TII->get(X86::MOV64rm), X86::RDI)
MachineBasicBlock *MBB) const {
DebugLoc DL = MI->getDebugLoc();
MachineFunction *MF = MBB->getParent();
- const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
MachineRegisterInfo &MRI = MF->getRegInfo();
const BasicBlock *BB = MBB->getBasicBlock();
MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
.addMBB(restoreMBB);
- const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
- MF->getSubtarget().getRegisterInfo());
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
MIB.addRegMask(RegInfo->getNoPreservedMask());
thisMBB->addSuccessor(mainMBB);
thisMBB->addSuccessor(restoreMBB);
// restoreMBB:
if (RegInfo->hasBasePointer(*MF)) {
- const X86Subtarget &STI = MF->getTarget().getSubtarget<X86Subtarget>();
- const bool Uses64BitFramePtr = STI.isTarget64BitLP64() || STI.isTargetNaCl64();
+ const bool Uses64BitFramePtr =
+ Subtarget->isTarget64BitLP64() || Subtarget->isTargetNaCl64();
X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
X86FI->setRestoreBasePointer(MF);
unsigned FramePtr = RegInfo->getFrameRegister(*MF);
MachineBasicBlock *MBB) const {
DebugLoc DL = MI->getDebugLoc();
MachineFunction *MF = MBB->getParent();
- const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
MachineRegisterInfo &MRI = MF->getRegInfo();
// Memory Reference
(PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
unsigned Tmp = MRI.createVirtualRegister(RC);
// Since FP is only updated here but NOT referenced, it's treated as GPR.
- const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
- MF->getSubtarget().getRegisterInfo());
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
unsigned SP = RegInfo->getStackRegister();
default: llvm_unreachable("Unrecognized FMA variant.");
}
- const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+ const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
MachineInstrBuilder MIB =
BuildMI(MF, MI->getDebugLoc(), TII.get(NewFMAOpc))
.addOperand(MI->getOperand(0))
case X86::TAILJMPd64:
case X86::TAILJMPr64:
case X86::TAILJMPm64:
+ case X86::TAILJMPd64_REX:
+ case X86::TAILJMPr64_REX:
+ case X86::TAILJMPm64_REX:
llvm_unreachable("TAILJMP64 would not be touched here.");
case X86::TCRETURNdi64:
case X86::TCRETURNri64:
case X86::FP80_TO_INT32_IN_MEM:
case X86::FP80_TO_INT64_IN_MEM: {
MachineFunction *F = BB->getParent();
- const TargetInstrInfo *TII = F->getSubtarget().getInstrInfo();
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
DebugLoc DL = MI->getDebugLoc();
// Change the floating point control register to use "round towards zero"
case X86::VPCMPESTRM128MEM:
assert(Subtarget->hasSSE42() &&
"Target must have SSE4.2 or AVX features enabled");
- return EmitPCMPSTRM(MI, BB, BB->getParent()->getSubtarget().getInstrInfo());
+ return EmitPCMPSTRM(MI, BB, Subtarget->getInstrInfo());
// String/text processing lowering.
case X86::PCMPISTRIREG:
case X86::VPCMPESTRIMEM:
assert(Subtarget->hasSSE42() &&
"Target must have SSE4.2 or AVX features enabled");
- return EmitPCMPSTRI(MI, BB, BB->getParent()->getSubtarget().getInstrInfo());
+ return EmitPCMPSTRI(MI, BB, Subtarget->getInstrInfo());
// Thread synchronization.
case X86::MONITOR:
- return EmitMonitor(MI, BB, BB->getParent()->getSubtarget().getInstrInfo(),
- Subtarget);
+ return EmitMonitor(MI, BB, Subtarget);
// xbegin
case X86::XBEGIN:
- return EmitXBegin(MI, BB, BB->getParent()->getSubtarget().getInstrInfo());
+ return EmitXBegin(MI, BB, Subtarget->getInstrInfo());
case X86::VASTART_SAVE_XMM_REGS:
return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
: InVec.getOperand(1);
// If inputs to shuffle are the same for both ops, then allow 2 uses
- unsigned AllowedUses = InVec.getOperand(0) == InVec.getOperand(1) ? 2 : 1;
+ unsigned AllowedUses = InVec.getNumOperands() > 1 &&
+ InVec.getOperand(0) == InVec.getOperand(1) ? 2 : 1;
if (LdNode.getOpcode() == ISD::BITCAST) {
// Don't duplicate a load with other uses.
SDValue InputVector = N->getOperand(0);
- // Detect whether we are trying to convert from mmx to i32 and the bitcast
- // from mmx to v2i32 has a single usage.
- if (InputVector.getNode()->getOpcode() == llvm::ISD::BITCAST &&
- InputVector.getNode()->getOperand(0).getValueType() == MVT::x86mmx &&
- InputVector.hasOneUse() && N->getValueType(0) == MVT::i32)
- return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector),
- N->getValueType(0),
- InputVector.getNode()->getOperand(0));
+ // Detect mmx to i32 conversion through a v2i32 elt extract.
+ if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
+ N->getValueType(0) == MVT::i32 &&
+ InputVector.getValueType() == MVT::v2i32) {
+
+ // The bitcast source is a direct mmx result.
+ SDValue MMXSrc = InputVector.getNode()->getOperand(0);
+ if (MMXSrc.getValueType() == MVT::x86mmx)
+ return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector),
+ N->getValueType(0),
+ InputVector.getNode()->getOperand(0));
+
+ // The mmx is indirect: (i64 extract_elt (v1i64 bitcast (x86mmx ...))).
+ SDValue MMXSrcOp = MMXSrc.getOperand(0);
+ if (MMXSrc.getOpcode() == ISD::EXTRACT_VECTOR_ELT && MMXSrc.hasOneUse() &&
+ MMXSrc.getValueType() == MVT::i64 && MMXSrcOp.hasOneUse() &&
+ MMXSrcOp.getOpcode() == ISD::BITCAST &&
+ MMXSrcOp.getValueType() == MVT::v1i64 &&
+ MMXSrcOp.getOperand(0).getValueType() == MVT::x86mmx)
+ return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector),
+ N->getValueType(0),
+ MMXSrcOp.getOperand(0));
+ }
// Only operate on vectors of 4 elements, where the alternative shuffling
// gets to be more expensive.
// instructions match the semantics of the common C idiom x<y?x:y but not
// x<=y?x:y, because of how they handle negative zero (which can be
// ignored in unsafe-math mode).
+ // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
- VT != MVT::f80 && TLI.isTypeLegal(VT) &&
+ VT != MVT::f80 && (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
(Subtarget->hasSSE2() ||
(Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) {
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
return SDValue();
EVT VT = N->getValueType(0);
- if (VT != MVT::i64)
+ if (VT != MVT::i64 && VT != MVT::i32)
return SDValue();
ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
return SDValue();
}
+/// PerformMLOADCombine - Resolve extending loads
+static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget *Subtarget) {
+ MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);
+ if (Mld->getExtensionType() != ISD::SEXTLOAD)
+ return SDValue();
+
+ EVT VT = Mld->getValueType(0);
+ unsigned NumElems = VT.getVectorNumElements();
+ EVT LdVT = Mld->getMemoryVT();
+ SDLoc dl(Mld);
+
+ assert(LdVT != VT && "Cannot extend to the same type");
+ unsigned ToSz = VT.getVectorElementType().getSizeInBits();
+ unsigned FromSz = LdVT.getVectorElementType().getSizeInBits();
+ // From, To sizes and ElemCount must be pow of two
+ assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
+ "Unexpected size for extending masked load");
+
+ unsigned SizeRatio = ToSz / FromSz;
+ assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());
+
+ // Create a type on which we perform the shuffle
+ EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
+ LdVT.getScalarType(), NumElems*SizeRatio);
+ assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
+
+ // Convert Src0 value
+ SDValue WideSrc0 = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mld->getSrc0());
+ if (Mld->getSrc0().getOpcode() != ISD::UNDEF) {
+ SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
+ for (unsigned i = 0; i != NumElems; ++i)
+ ShuffleVec[i] = i * SizeRatio;
+
+ // Can't shuffle using an illegal type.
+ assert (DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT)
+ && "WideVecVT should be legal");
+ WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0,
+ DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
+ }
+ // Prepare the new mask
+ SDValue NewMask;
+ SDValue Mask = Mld->getMask();
+ if (Mask.getValueType() == VT) {
+ // Mask and original value have the same type
+ NewMask = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mask);
+ SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
+ for (unsigned i = 0; i != NumElems; ++i)
+ ShuffleVec[i] = i * SizeRatio;
+ for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
+ ShuffleVec[i] = NumElems*SizeRatio;
+ NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
+ DAG.getConstant(0, WideVecVT),
+ &ShuffleVec[0]);
+ }
+ else {
+ assert(Mask.getValueType().getVectorElementType() == MVT::i1);
+ unsigned WidenNumElts = NumElems*SizeRatio;
+ unsigned MaskNumElts = VT.getVectorNumElements();
+ EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+ WidenNumElts);
+
+ unsigned NumConcat = WidenNumElts / MaskNumElts;
+ SmallVector<SDValue, 16> Ops(NumConcat);
+ SDValue ZeroVal = DAG.getConstant(0, Mask.getValueType());
+ Ops[0] = Mask;
+ for (unsigned i = 1; i != NumConcat; ++i)
+ Ops[i] = ZeroVal;
+
+ NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
+ }
+
+ SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
+ Mld->getBasePtr(), NewMask, WideSrc0,
+ Mld->getMemoryVT(), Mld->getMemOperand(),
+ ISD::NON_EXTLOAD);
+ SDValue NewVec = DAG.getNode(X86ISD::VSEXT, dl, VT, WideLd);
+ return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
+
+}
+/// PerformMSTORECombine - Resolve truncating stores
+static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
+ if (!Mst->isTruncatingStore())
+ return SDValue();
+
+ EVT VT = Mst->getValue().getValueType();
+ unsigned NumElems = VT.getVectorNumElements();
+ EVT StVT = Mst->getMemoryVT();
+ SDLoc dl(Mst);
+
+ assert(StVT != VT && "Cannot truncate to the same type");
+ unsigned FromSz = VT.getVectorElementType().getSizeInBits();
+ unsigned ToSz = StVT.getVectorElementType().getSizeInBits();
+
+ // From, To sizes and ElemCount must be pow of two
+ assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
+ "Unexpected size for truncating masked store");
+ // We are going to use the original vector elt for storing.
+ // Accumulated smaller vector elements must be a multiple of the store size.
+ assert (((NumElems * FromSz) % ToSz) == 0 &&
+ "Unexpected ratio for truncating masked store");
+
+ unsigned SizeRatio = FromSz / ToSz;
+ assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
+
+ // Create a type on which we perform the shuffle
+ EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
+ StVT.getScalarType(), NumElems*SizeRatio);
+
+ assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
+
+ SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mst->getValue());
+ SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
+ for (unsigned i = 0; i != NumElems; ++i)
+ ShuffleVec[i] = i * SizeRatio;
+
+ // Can't shuffle using an illegal type.
+ assert (DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT)
+ && "WideVecVT should be legal");
+
+ SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
+ DAG.getUNDEF(WideVecVT),
+ &ShuffleVec[0]);
+
+ SDValue NewMask;
+ SDValue Mask = Mst->getMask();
+ if (Mask.getValueType() == VT) {
+ // Mask and original value have the same type
+ NewMask = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mask);
+ for (unsigned i = 0; i != NumElems; ++i)
+ ShuffleVec[i] = i * SizeRatio;
+ for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
+ ShuffleVec[i] = NumElems*SizeRatio;
+ NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
+ DAG.getConstant(0, WideVecVT),
+ &ShuffleVec[0]);
+ }
+ else {
+ assert(Mask.getValueType().getVectorElementType() == MVT::i1);
+ unsigned WidenNumElts = NumElems*SizeRatio;
+ unsigned MaskNumElts = VT.getVectorNumElements();
+ EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+ WidenNumElts);
+
+ unsigned NumConcat = WidenNumElts / MaskNumElts;
+ SmallVector<SDValue, 16> Ops(NumConcat);
+ SDValue ZeroVal = DAG.getConstant(0, Mask.getValueType());
+ Ops[0] = Mask;
+ for (unsigned i = 1; i != NumConcat; ++i)
+ Ops[i] = ZeroVal;
+
+ NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
+ }
+
+ return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal, Mst->getBasePtr(),
+ NewMask, StVT, Mst->getMemOperand(), false);
+}
/// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
const X86Subtarget *Subtarget) {
}
static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
- const X86TargetLowering *XTLI) {
+ const X86Subtarget *Subtarget) {
// First try to optimize away the conversion entirely when it's
// conditionally from a constant. Vectors only.
SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG);
EVT VT = Ld->getValueType(0);
if (!Ld->isVolatile() && !N->getValueType(0).isVector() &&
ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
- !XTLI->getSubtarget()->is64Bit() &&
- VT == MVT::i64) {
- SDValue FILDChain = XTLI->BuildFILD(SDValue(N, 0), Ld->getValueType(0),
- Ld->getChain(), Op0, DAG);
+ !Subtarget->is64Bit() && VT == MVT::i64) {
+ SDValue FILDChain = Subtarget->getTargetLowering()->BuildFILD(
+ SDValue(N, 0), Ld->getValueType(0), Ld->getChain(), Op0, DAG);
DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
return FILDChain;
}
case ISD::OR: return PerformOrCombine(N, DAG, DCI, Subtarget);
case ISD::XOR: return PerformXorCombine(N, DAG, DCI, Subtarget);
case ISD::LOAD: return PerformLOADCombine(N, DAG, DCI, Subtarget);
+ case ISD::MLOAD: return PerformMLOADCombine(N, DAG, DCI, Subtarget);
case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget);
- case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, this);
+ case ISD::MSTORE: return PerformMSTORECombine(N, DAG, Subtarget);
+ case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, Subtarget);
case ISD::FADD: return PerformFADDCombine(N, DAG, Subtarget);
case ISD::FSUB: return PerformFSUBCombine(N, DAG, Subtarget);
case X86ISD::FXOR:
case ISD::FMA: return PerformFMACombine(N, DAG, Subtarget);
case ISD::INTRINSIC_WO_CHAIN:
return PerformINTRINSIC_WO_CHAINCombine(N, DAG, Subtarget);
- case X86ISD::INSERTPS:
- return PerformINSERTPSCombine(N, DAG, Subtarget);
+ case X86ISD::INSERTPS: {
+ if (getTargetMachine().getOptLevel() > CodeGenOpt::None)
+ return PerformINSERTPSCombine(N, DAG, Subtarget);
+ break;
+ }
case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DAG, Subtarget);
}
}
}
return;
+ case 'L':
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
+ (Subtarget->is64Bit() && C->getZExtValue() == 0xffffffff)) {
+ Result = DAG.getTargetConstant(C->getSExtValue(), Op.getValueType());
+ break;
+ }
+ }
+ return;
+ case 'M':
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (C->getZExtValue() <= 3) {
+ Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
+ break;
+ }
+ }
+ return;
case 'N':
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
if (C->getZExtValue() <= 255) {
}
}
return;
+ case 'O':
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (C->getZExtValue() <= 127) {
+ Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
+ break;
+ }
+ }
+ return;
case 'e': {
// 32-bit signed value
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {