#include "X86ISelLowering.h"
#include "Utils/X86ShuffleDecode.h"
#include "X86CallingConv.h"
+#include "X86FrameLowering.h"
#include "X86InstrBuilder.h"
#include "X86MachineFunctionInfo.h"
#include "X86TargetMachine.h"
cl::desc("Enable an experimental vector shuffle lowering code path."),
cl::Hidden);
+static cl::opt<bool> ExperimentalVectorShuffleLegality(
+ "x86-experimental-vector-shuffle-legality", cl::init(false),
+ cl::desc("Enable experimental shuffle legality based on the experimental "
+ "shuffle lowering. Should only be used with the experimental "
+ "shuffle lowering."),
+ cl::Hidden);
+
static cl::opt<int> ReciprocalEstimateRefinementSteps(
"x86-recip-refinement-steps", cl::init(1),
cl::desc("Specify the number of Newton-Raphson iterations applied to the "
return Insert256BitVector(V, V2, NumElems/2, DAG, dl);
}
-// FIXME: This should stop caching the target machine as soon as
-// we can remove resetOperationActions et al.
-X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM)
- : TargetLowering(TM) {
- Subtarget = &TM.getSubtarget<X86Subtarget>();
+X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
+ const X86Subtarget &STI)
+ : TargetLowering(TM), Subtarget(&STI) {
X86ScalarSSEf64 = Subtarget->hasSSE2();
X86ScalarSSEf32 = Subtarget->hasSSE1();
TD = getDataLayout();
- resetOperationActions();
-}
-
-void X86TargetLowering::resetOperationActions() {
- const TargetMachine &TM = getTargetMachine();
- static bool FirstTimeThrough = true;
-
- // If none of the target options have changed, then we don't need to reset the
- // operation actions.
- if (!FirstTimeThrough && TO == TM.Options) return;
-
- if (!FirstTimeThrough) {
- // Reinitialize the actions.
- initActions();
- FirstTimeThrough = false;
- }
-
- TO = TM.Options;
-
// Set up the TargetLowering object.
static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };
setSchedulingPreference(Sched::ILP);
else
setSchedulingPreference(Sched::RegPressure);
- const X86RegisterInfo *RegInfo =
- TM.getSubtarget<X86Subtarget>().getRegisterInfo();
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
// Bypass expensive divides on Atom when compiling with O2.
if (Subtarget->is64Bit())
addRegisterClass(MVT::i64, &X86::GR64RegClass);
- setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
+ for (MVT VT : MVT::integer_valuetypes())
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
// We don't accept any truncstore of integer registers.
setTruncStoreAction(MVT::i64, MVT::i32, Expand);
setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
setTruncStoreAction(MVT::f32, MVT::f16, Expand);
setTruncStoreAction(MVT::f64, MVT::f16, Expand);
setTruncStoreAction(MVT::f80, MVT::f16, Expand);
// First set operation action for all vector types to either promote
// (for widening) or expand (for scalarization). Then we will selectively
// turn on ones that can be effectively codegen'd.
- for (int i = MVT::FIRST_VECTOR_VALUETYPE;
- i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
- MVT VT = (MVT::SimpleValueType)i;
+ for (MVT VT : MVT::vector_valuetypes()) {
setOperationAction(ISD::ADD , VT, Expand);
setOperationAction(ISD::SUB , VT, Expand);
setOperationAction(ISD::FADD, VT, Expand);
setOperationAction(ISD::ANY_EXTEND, VT, Expand);
setOperationAction(ISD::VSELECT, VT, Expand);
setOperationAction(ISD::SELECT_CC, VT, Expand);
- for (int InnerVT = MVT::FIRST_VECTOR_VALUETYPE;
- InnerVT <= MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
- setTruncStoreAction(VT,
- (MVT::SimpleValueType)InnerVT, Expand);
- setLoadExtAction(ISD::SEXTLOAD, VT, Expand);
- setLoadExtAction(ISD::ZEXTLOAD, VT, Expand);
+ for (MVT InnerVT : MVT::vector_valuetypes()) {
+ setTruncStoreAction(InnerVT, VT, Expand);
+
+ setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
- // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like types,
- // we have to deal with them whether we ask for Expansion or not. Setting
- // Expand causes its own optimisation problems though, so leave them legal.
- if (VT.getVectorElementType() == MVT::i1)
- setLoadExtAction(ISD::EXTLOAD, VT, Expand);
+ // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
+ // types, we have to deal with them whether we ask for Expansion or not.
+ // Setting Expand causes its own optimisation problems though, so leave
+ // them legal.
+ if (VT.getVectorElementType() == MVT::i1)
+ setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
+ }
}
// FIXME: In order to prevent SSE instructions being expanded to MMX ones
// memory vector types which we can load as a scalar (or sequence of
// scalars) and extend in-register to a legal 128-bit vector type. For sext
// loads these must work with a single scalar load.
- setLoadExtAction(ISD::SEXTLOAD, MVT::v4i8, Custom);
- setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, Custom);
- setLoadExtAction(ISD::SEXTLOAD, MVT::v8i8, Custom);
- setLoadExtAction(ISD::EXTLOAD, MVT::v2i8, Custom);
- setLoadExtAction(ISD::EXTLOAD, MVT::v2i16, Custom);
- setLoadExtAction(ISD::EXTLOAD, MVT::v2i32, Custom);
- setLoadExtAction(ISD::EXTLOAD, MVT::v4i8, Custom);
- setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, Custom);
- setLoadExtAction(ISD::EXTLOAD, MVT::v8i8, Custom);
+ for (MVT VT : MVT::integer_vector_valuetypes()) {
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
+ }
setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);
- setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, Legal);
+ for (MVT VT : MVT::fp_vector_valuetypes())
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);
setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
// SSE41 brings specific instructions for doing vector sign extend even in
// cases where we don't have SRA.
- setLoadExtAction(ISD::SEXTLOAD, MVT::v2i8, Custom);
- setLoadExtAction(ISD::SEXTLOAD, MVT::v2i16, Custom);
- setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, Custom);
+ for (MVT VT : MVT::integer_vector_valuetypes()) {
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
+ }
+
+ // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, MVT::v8i8, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
+
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i16, MVT::v8i8, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
// i8 and i16 vectors are custom because the source register and source
// source memory operand types are not the same width. f32 vectors are
setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);
- setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, Legal);
+ for (MVT VT : MVT::fp_vector_valuetypes())
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);
setOperationAction(ISD::SRL, MVT::v16i16, Custom);
setOperationAction(ISD::SRL, MVT::v32i8, Custom);
// Custom CTPOP always performs better on natively supported v8i32
setOperationAction(ISD::CTPOP, MVT::v8i32, Custom);
+
+ // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32, MVT::v8i8, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i8, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
+
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32, MVT::v8i8, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i8, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
} else {
setOperationAction(ISD::ADD, MVT::v4i64, Custom);
setOperationAction(ISD::ADD, MVT::v8i32, Custom);
setOperationAction(ISD::SRA, MVT::v8i32, Custom);
// Custom lower several nodes for 256-bit types.
- for (int i = MVT::FIRST_VECTOR_VALUETYPE;
- i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
- MVT VT = (MVT::SimpleValueType)i;
-
+ for (MVT VT : MVT::vector_valuetypes()) {
if (VT.getScalarSizeInBits() >= 32) {
setOperationAction(ISD::MLOAD, VT, Legal);
setOperationAction(ISD::MSTORE, VT, Legal);
addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
+ for (MVT VT : MVT::fp_vector_valuetypes())
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);
+
setOperationAction(ISD::BR_CC, MVT::i1, Expand);
setOperationAction(ISD::SETCC, MVT::i1, Custom);
setOperationAction(ISD::XOR, MVT::i1, Legal);
setOperationAction(ISD::OR, MVT::i1, Legal);
setOperationAction(ISD::AND, MVT::i1, Legal);
- setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, Legal);
setOperationAction(ISD::LOAD, MVT::v16f32, Legal);
setOperationAction(ISD::LOAD, MVT::v8f64, Legal);
setOperationAction(ISD::LOAD, MVT::v8i64, Legal);
}
// Custom lower several nodes.
- for (int i = MVT::FIRST_VECTOR_VALUETYPE;
- i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
- MVT VT = (MVT::SimpleValueType)i;
-
+ for (MVT VT : MVT::vector_valuetypes()) {
unsigned EltSize = VT.getVectorElementType().getSizeInBits();
// Extract subvector is special because the value type
// (result) is 256/128-bit but the source is 512-bit wide.
for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
MVT VT = (MVT::SimpleValueType)i;
- // Do not attempt to promote non-256-bit vectors.
+ // Do not attempt to promote non-512-bit vectors.
if (!VT.is512BitVector())
continue;
const unsigned EltSize = VT.getVectorElementType().getSizeInBits();
- // Do not attempt to promote non-256-bit vectors.
+ // Do not attempt to promote non-512-bit vectors.
if (!VT.is512BitVector())
continue;
// SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion
// of this type with custom code.
- for (int VT = MVT::FIRST_VECTOR_VALUETYPE;
- VT != MVT::LAST_VECTOR_VALUETYPE; VT++) {
- setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,
- Custom);
- }
+ for (MVT VT : MVT::vector_valuetypes())
+ setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Custom);
// We want to custom lower some of our intrinsics.
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
// We have target-specific dag combine patterns for the following nodes:
setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
+ setTargetDAGCombine(ISD::BITCAST);
setTargetDAGCombine(ISD::VSELECT);
setTargetDAGCombine(ISD::SELECT);
setTargetDAGCombine(ISD::SHL);
setTargetDAGCombine(ISD::FMA);
setTargetDAGCombine(ISD::SUB);
setTargetDAGCombine(ISD::LOAD);
+ setTargetDAGCombine(ISD::MLOAD);
setTargetDAGCombine(ISD::STORE);
+ setTargetDAGCombine(ISD::MSTORE);
setTargetDAGCombine(ISD::ZERO_EXTEND);
setTargetDAGCombine(ISD::ANY_EXTEND);
setTargetDAGCombine(ISD::SIGN_EXTEND);
setTargetDAGCombine(ISD::SETCC);
setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
setTargetDAGCombine(ISD::BUILD_VECTOR);
- if (Subtarget->is64Bit())
- setTargetDAGCombine(ISD::MUL);
+ setTargetDAGCombine(ISD::MUL);
setTargetDAGCombine(ISD::XOR);
computeRegisterProperties();
// Win32 requires us to put the sret argument to %eax as well.
// We saved the argument into a virtual register in the entry block,
// so now we copy the value out and into %rax/%eax.
- if (DAG.getMachineFunction().getFunction()->hasStructRetAttr() &&
- (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC())) {
- MachineFunction &MF = DAG.getMachineFunction();
- X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
- unsigned Reg = FuncInfo->getSRetReturnReg();
- assert(Reg &&
- "SRetReturnReg should have been set in LowerFormalArguments().");
- SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());
+ //
+ // Checking Function.hasStructRetAttr() here is insufficient because the IR
+ // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
+ // false, then an sret argument may be implicitly inserted in the SelDAG. In
+ // either case FuncInfo->setSRetReturnReg() will have been called.
+ if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
+ assert((Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC()) &&
+ "No need for an sret register");
+ SDValue Val = DAG.getCopyFromReg(Chain, dl, SRetReg, getPointerTy());
unsigned RetValReg
= (Subtarget->is64Bit() && !Subtarget->isTarget64BitILP32()) ?
}
// Figure out if XMM registers are in use.
- bool HaveXMMArgs = Is64Bit && !IsWin64;
- bool NoImplicitFloatOps = Fn->getAttributes().hasAttribute(
- AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
- assert(!(MF.getTarget().Options.UseSoftFloat && NoImplicitFloatOps) &&
+ assert(!(MF.getTarget().Options.UseSoftFloat &&
+ Fn->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
+ Attribute::NoImplicitFloat)) &&
"SSE register cannot be used when SSE is disabled!");
- if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps ||
- !Subtarget->hasSSE1())
- HaveXMMArgs = false;
// 64-bit calling conventions support varargs and register parameters, so we
// have to do extra work to spill them in the prologue.
}
if (IsWin64) {
- const TargetFrameLowering &TFI = *MF.getSubtarget().getFrameLowering();
+ const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
// Get to the caller-allocated home save location. Add 8 to account
// for the return address.
int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
// Walk the register/memloc assignments, inserting copies/loads. In the case
// of tail call optimization arguments are handle later.
- const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
- DAG.getSubtarget().getRegisterInfo());
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
// Skip inalloca arguments, they have already been written.
ISD::ArgFlagsTy Flags = Outs[i].Flags;
// through a register, since the call instruction's 32-bit
// pc-relative offset may not be large enough to hold the whole
// address.
- } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+ } else if (Callee->getOpcode() == ISD::GlobalAddress) {
// If the callee is a GlobalAddress node (quite common, every direct call
// is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
// it.
+ GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);
// We should use extra load for direct calls to dllimported functions in
// non-JIT mode.
Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
OpFlags);
- } else if (Subtarget->isTarget64BitILP32() && Callee->getValueType(0) == MVT::i32) {
+ } else if (Subtarget->isTarget64BitILP32() &&
+ Callee->getValueType(0) == MVT::i32) {
// Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
}
RegsToPass[i].second.getValueType()));
// Add a register mask operand representing the call-preserved registers.
- const TargetRegisterInfo *TRI = DAG.getSubtarget().getRegisterInfo();
+ const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
assert(Mask && "Missing call preserved mask for calling convention");
Ops.push_back(DAG.getRegisterMask(Mask));
unsigned
X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
SelectionDAG& DAG) const {
- MachineFunction &MF = DAG.getMachineFunction();
- const TargetMachine &TM = MF.getTarget();
- const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
- TM.getSubtargetImpl()->getRegisterInfo());
- const TargetFrameLowering &TFI = *TM.getSubtargetImpl()->getFrameLowering();
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
+ const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
unsigned StackAlignment = TFI.getStackAlignment();
uint64_t AlignMask = StackAlignment - 1;
int64_t Offset = StackSize;
return false;
} else {
unsigned Opcode = Def->getOpcode();
- if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) &&
+ if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
+ Opcode == X86::LEA64_32r) &&
Def->getOperand(1).isFI()) {
FI = Def->getOperand(1).getIndex();
Bytes = Flags.getByValSize();
// Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
// emit a special epilogue.
- const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
- DAG.getSubtarget().getRegisterInfo());
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
if (RegInfo->needsStackRealignment(MF))
return false;
// the caller's fixed stack objects.
MachineFrameInfo *MFI = MF.getFrameInfo();
const MachineRegisterInfo *MRI = &MF.getRegInfo();
- const X86InstrInfo *TII =
- static_cast<const X86InstrInfo *>(DAG.getSubtarget().getInstrInfo());
+ const X86InstrInfo *TII = Subtarget->getInstrInfo();
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
SDValue Arg = OutVals[i];
SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
- const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
- DAG.getSubtarget().getRegisterInfo());
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
int ReturnAddrIndex = FuncInfo->getRAIndex();
return false;
}
+bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
+ ISD::LoadExtType ExtTy,
+ EVT NewVT) const {
+ // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
+ // relocation target a movq or addq instruction: don't let the load shrink.
+ SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
+ if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
+ if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
+ return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
+ return true;
+}
+
/// \brief Returns true if it is beneficial to convert a load of a constant
/// to just the constant itself.
bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
return true;
}
-bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT,
+bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT,
unsigned Index) const {
if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
return false;
return (Index == 0 || Index == ResVT.getVectorNumElements());
}
+bool X86TargetLowering::isCheapToSpeculateCttz() const {
+ // Speculate cttz only if we can directly use TZCNT.
+ return Subtarget->hasBMI();
+}
+
+bool X86TargetLowering::isCheapToSpeculateCtlz() const {
+ // Speculate ctlz only if we can directly use LZCNT.
+ return Subtarget->hasLZCNT();
+}
+
/// isUndefOrInRange - Return true if Val is undef or if its value falls within
/// the specified range (L, H].
static bool isUndefOrInRange(int Val, int Low, int Hi) {
/// isSequentialOrUndefInRange - Return true if every element in Mask, beginning
/// from position Pos and ending in Pos+Size, falls within the specified
-/// sequential range (L, L+Pos]. or is undef.
+/// sequential range (Low, Low+Size]. or is undef.
static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
unsigned Pos, unsigned Size, int Low) {
for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
return false;
if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) {
- // FIXME: Support AVX-512 here.
- Type *Ty = C->getType();
- if (!Ty->isVectorTy() || (Ty->getVectorNumElements() != 16 &&
- Ty->getVectorNumElements() != 32))
- return false;
-
DecodePSHUFBMask(C, Mask);
+ if (Mask.empty())
+ return false;
break;
}
IsUnary = true;
break;
case X86ISD::MOVSS:
- case X86ISD::MOVSD: {
- // The index 0 always comes from the first element of the second source,
- // this is why MOVSS and MOVSD are used in the first place. The other
- // elements come from the other positions of the first source vector
- Mask.push_back(NumElems);
- for (unsigned i = 1; i != NumElems; ++i) {
- Mask.push_back(i);
- }
+ case X86ISD::MOVSD:
+ DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask);
break;
- }
case X86ISD::VPERM2X128:
ImmN = N->getOperand(N->getNumOperands()-1);
DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
break;
case X86ISD::MOVSLDUP:
DecodeMOVSLDUPMask(VT, Mask);
+ IsUnary = true;
break;
case X86ISD::MOVSHDUP:
DecodeMOVSHDUPMask(VT, Mask);
+ IsUnary = true;
break;
case X86ISD::MOVDDUP:
+ DecodeMOVDDUPMask(VT, Mask);
+ IsUnary = true;
+ break;
case X86ISD::MOVLHPD:
case X86ISD::MOVLPD:
case X86ISD::MOVLPS:
return DAG.getNode(ISD::BITCAST, SDLoc(Op), VT, Result);
}
-/// getVShift - Return a vector logical shift node.
-///
+/// Return a vector logical shift node.
static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
unsigned NumBits, SelectionDAG &DAG,
const TargetLowering &TLI, SDLoc dl) {
assert(VT.is128BitVector() && "Unknown type for VShift");
- EVT ShVT = MVT::v2i64;
+ MVT ShVT = MVT::v2i64;
unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp);
+ MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(SrcOp.getValueType());
+ SDValue ShiftVal = DAG.getConstant(NumBits, ScalarShiftTy);
return DAG.getNode(ISD::BITCAST, dl, VT,
- DAG.getNode(Opc, dl, ShVT, SrcOp,
- DAG.getConstant(NumBits,
- TLI.getScalarShiftAmountTy(SrcOp.getValueType()))));
+ DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
}
static SDValue
return SDValue();
}
-/// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a
-/// vector of type 'VT', see if the elements can be replaced by a single large
-/// load which has the same value as a build_vector whose operands are 'elts'.
+/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
+/// elements can be replaced by a single large load which has the same value as
+/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
///
/// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
///
/// FIXME: we'd also like to handle the case where the last elements are zero
/// rather than undef via VZEXT_LOAD, but we do not detect that case today.
/// There's even a handy isZeroNode for that purpose.
-static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
+static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
SDLoc &DL, SelectionDAG &DAG,
bool isAfterLegalize) {
- EVT EltVT = VT.getVectorElementType();
unsigned NumElems = Elts.size();
LoadSDNode *LDBase = nullptr;
// non-consecutive, bail out.
for (unsigned i = 0; i < NumElems; ++i) {
SDValue Elt = Elts[i];
-
+ // Look through a bitcast.
+ if (Elt.getNode() && Elt.getOpcode() == ISD::BITCAST)
+ Elt = Elt.getOperand(0);
if (!Elt.getNode() ||
(Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
return SDValue();
continue;
LoadSDNode *LD = cast<LoadSDNode>(Elt);
- if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i))
+ EVT LdVT = Elt.getValueType();
+ // Each loaded element must be the correct fractional portion of the
+ // requested vector load.
+ if (LdVT.getSizeInBits() != VT.getSizeInBits() / NumElems)
+ return SDValue();
+ if (!DAG.isConsecutiveLoad(LD, LDBase, LdVT.getSizeInBits() / 8, i))
return SDValue();
LastLoadedElt = i;
}
// load of the entire vector width starting at the base pointer. If we found
// consecutive loads for the low half, generate a vzext_load node.
if (LastLoadedElt == NumElems - 1) {
+ assert(LDBase && "Did not find base load for merging consecutive loads");
+ EVT EltVT = LDBase->getValueType(0);
+ // Ensure that the input vector size for the merged loads matches the
+ // cumulative size of the input elements.
+ if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
+ return SDValue();
if (isAfterLegalize &&
!DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, VT))
return NewLd;
}
-
+
//TODO: The code below fires only for for loading the low v2i32 / v2f32
//of a v4i32 / v4f32. It's probably worth generalizing.
+ EVT EltVT = VT.getVectorElementType();
if (NumElems == 4 && LastLoadedElt == 1 && (EltVT.getSizeInBits() == 32) &&
DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
// Check for a build vector of consecutive loads.
if (SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false))
return LD;
-
+
EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
// Build both the lower and upper subvector.
return Sh;
// For SSE 4.1, use insertps to put the high elements into the low element.
- if (getSubtarget()->hasSSE41()) {
+ if (Subtarget->hasSSE41()) {
SDValue Result;
if (Op.getOperand(0).getOpcode() != ISD::UNDEF)
Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
return Zeroable;
}
+/// \brief Try to emit a bitmask instruction for a shuffle.
+///
+/// This handles cases where we can model a blend exactly as a bitmask due to
+/// one of the inputs being zeroable.
+static SDValue lowerVectorShuffleAsBitMask(SDLoc DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
+ MVT EltVT = VT.getScalarType();
+ int NumEltBits = EltVT.getSizeInBits();
+ MVT IntEltVT = MVT::getIntegerVT(NumEltBits);
+ SDValue Zero = DAG.getConstant(0, IntEltVT);
+ SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), IntEltVT);
+ if (EltVT.isFloatingPoint()) {
+ Zero = DAG.getNode(ISD::BITCAST, DL, EltVT, Zero);
+ AllOnes = DAG.getNode(ISD::BITCAST, DL, EltVT, AllOnes);
+ }
+ SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
+ SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+ SDValue V;
+ for (int i = 0, Size = Mask.size(); i < Size; ++i) {
+ if (Zeroable[i])
+ continue;
+ if (Mask[i] % Size != i)
+ return SDValue(); // Not a blend.
+ if (!V)
+ V = Mask[i] < Size ? V1 : V2;
+ else if (V != (Mask[i] < Size ? V1 : V2))
+ return SDValue(); // Can only let one input through the mask.
+
+ VMaskOps[i] = AllOnes;
+ }
+ if (!V)
+ return SDValue(); // No non-zeroable elements!
+
+ SDValue VMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, VMaskOps);
+ V = DAG.getNode(VT.isFloatingPoint()
+ ? (unsigned) X86ISD::FAND : (unsigned) ISD::AND,
+ DL, VT, V, VMask);
+ return V;
+}
+
/// \brief Try to lower a vector shuffle as a byte shift (shifts in zeros).
///
/// Attempts to match a shuffle mask against the PSRLDQ and PSLLDQ SSE2
int Size = Mask.size();
int Scale = 16 / Size;
- auto isSequential = [](int Base, int StartIndex, int EndIndex, int MaskOffset,
- ArrayRef<int> Mask) {
- for (int i = StartIndex; i < EndIndex; i++) {
- if (Mask[i] < 0)
- continue;
- if (i + Base != Mask[i] - MaskOffset)
- return false;
- }
- return true;
- };
-
for (int Shift = 1; Shift < Size; Shift++) {
int ByteShift = Shift * Scale;
}
if (ZeroableRight) {
- bool ValidShiftRight1 = isSequential(Shift, 0, Size - Shift, 0, Mask);
- bool ValidShiftRight2 = isSequential(Shift, 0, Size - Shift, Size, Mask);
+ bool ValidShiftRight1 =
+ isSequentialOrUndefInRange(Mask, 0, Size - Shift, Shift);
+ bool ValidShiftRight2 =
+ isSequentialOrUndefInRange(Mask, 0, Size - Shift, Size + Shift);
if (ValidShiftRight1 || ValidShiftRight2) {
// Cast the inputs to v2i64 to match PSRLDQ.
}
if (ZeroableLeft) {
- bool ValidShiftLeft1 = isSequential(-Shift, Shift, Size, 0, Mask);
- bool ValidShiftLeft2 = isSequential(-Shift, Shift, Size, Size, Mask);
+ bool ValidShiftLeft1 =
+ isSequentialOrUndefInRange(Mask, Shift, Size - Shift, 0);
+ bool ValidShiftLeft2 =
+ isSequentialOrUndefInRange(Mask, Shift, Size - Shift, Size);
if (ValidShiftLeft1 || ValidShiftLeft2) {
// Cast the inputs to v2i64 to match PSLLDQ.
return SDValue();
}
+/// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros).
+///
+/// Attempts to match a shuffle mask against the PSRL(W/D/Q) and PSLL(W/D/Q)
+/// SSE2 and AVX2 logical bit-shift instructions. The function matches
+/// elements from one of the input vectors shuffled to the left or right
+/// with zeroable elements 'shifted in'.
+static SDValue lowerVectorShuffleAsBitShift(SDLoc DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
+ SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+
+ int Size = Mask.size();
+ assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
+
+ // PSRL : (little-endian) right bit shift.
+ // [ 1, zz, 3, zz]
+ // [ -1, -1, 7, zz]
+ // PSHL : (little-endian) left bit shift.
+ // [ zz, 0, zz, 2 ]
+ // [ -1, 4, zz, -1 ]
+ auto MatchBitShift = [&](int Shift, int Scale) -> SDValue {
+ MVT ShiftSVT = MVT::getIntegerVT(VT.getScalarSizeInBits() * Scale);
+ MVT ShiftVT = MVT::getVectorVT(ShiftSVT, Size / Scale);
+ assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
+ "Illegal integer vector type");
+
+ bool MatchLeft = true, MatchRight = true;
+ for (int i = 0; i != Size; i += Scale) {
+ for (int j = 0; j != Shift; j++) {
+ MatchLeft &= Zeroable[i + j];
+ }
+ for (int j = Scale - Shift; j != Scale; j++) {
+ MatchRight &= Zeroable[i + j];
+ }
+ }
+ if (!(MatchLeft || MatchRight))
+ return SDValue();
+
+ bool MatchV1 = true, MatchV2 = true;
+ for (int i = 0; i != Size; i += Scale) {
+ unsigned Pos = MatchLeft ? i + Shift : i;
+ unsigned Low = MatchLeft ? i : i + Shift;
+ unsigned Len = Scale - Shift;
+ MatchV1 &= isSequentialOrUndefInRange(Mask, Pos, Len, Low);
+ MatchV2 &= isSequentialOrUndefInRange(Mask, Pos, Len, Low + Size);
+ }
+ if (!(MatchV1 || MatchV2))
+ return SDValue();
+
+ // Cast the inputs to ShiftVT to match VSRLI/VSHLI and back again.
+ unsigned OpCode = MatchLeft ? X86ISD::VSHLI : X86ISD::VSRLI;
+ int ShiftAmt = Shift * VT.getScalarSizeInBits();
+ SDValue V = MatchV1 ? V1 : V2;
+ V = DAG.getNode(ISD::BITCAST, DL, ShiftVT, V);
+ V = DAG.getNode(OpCode, DL, ShiftVT, V, DAG.getConstant(ShiftAmt, MVT::i8));
+ return DAG.getNode(ISD::BITCAST, DL, VT, V);
+ };
+
+ // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
+ // keep doubling the size of the integer elements up to that. We can
+ // then shift the elements of the integer vector by whole multiples of
+ // their width within the elements of the larger integer vector. Test each
+ // multiple to see if we can find a match with the moved element indices
+ // and that the shifted in elements are all zeroable.
+ for (int Scale = 2; Scale * VT.getScalarSizeInBits() <= 64; Scale *= 2)
+ for (int Shift = 1; Shift != Scale; Shift++)
+ if (SDValue BitShift = MatchBitShift(Shift, Scale))
+ return BitShift;
+
+ // no match
+ return SDValue();
+}
+
/// \brief Lower a vector shuffle as a zero or any extension.
///
/// Given a specific number of elements, element bit width, and extension
/// stride, produce either a zero or any extension based on the available
/// features of the subtarget.
static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
- SDLoc DL, MVT VT, int NumElements, int Scale, bool AnyExt, SDValue InputV,
+ SDLoc DL, MVT VT, int Scale, bool AnyExt, SDValue InputV,
const X86Subtarget *Subtarget, SelectionDAG &DAG) {
assert(Scale > 1 && "Need a scale to extend.");
- int EltBits = VT.getSizeInBits() / NumElements;
+ int NumElements = VT.getVectorNumElements();
+ int EltBits = VT.getScalarSizeInBits();
assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
"Only 8, 16, and 32 bit elements can be extended.");
assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
// Found a valid zext mask! Try various lowering strategies based on the
// input type and available ISA extensions.
if (Subtarget->hasSSE41()) {
- MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
NumElements / Scale);
- InputV = DAG.getNode(ISD::BITCAST, DL, InputVT, InputV);
return DAG.getNode(ISD::BITCAST, DL, VT,
DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV));
}
return DAG.getNode(ISD::BITCAST, DL, VT, InputV);
}
-/// \brief Try to lower a vector shuffle as a zero extension on any micrarch.
+/// \brief Try to lower a vector shuffle as a zero extension on any microarch.
///
/// This routine will try to do everything in its power to cleverly lower
/// a shuffle which happens to match the pattern of a zero extend. It doesn't
SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
int Bits = VT.getSizeInBits();
- int NumElements = Mask.size();
+ int NumElements = VT.getVectorNumElements();
+ assert(VT.getScalarSizeInBits() <= 32 &&
+ "Exceeds 32-bit integer zero extension limit");
+ assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
// Define a helper function to check a particular ext-scale and lower to it if
// valid.
if (Mask[i] == -1)
continue; // Valid anywhere but doesn't tell us anything.
if (i % Scale != 0) {
- // Each of the extend elements needs to be zeroable.
+ // Each of the extended elements need to be zeroable.
if (!Zeroable[i])
return SDValue();
- // We no lorger are in the anyext case.
+ // We no longer are in the anyext case.
AnyExt = false;
continue;
}
return SDValue(); // Flip-flopping inputs.
if (Mask[i] % NumElements != i / Scale)
- return SDValue(); // Non-consecutive strided elemenst.
+ return SDValue(); // Non-consecutive strided elements.
}
// If we fail to find an input, we have a zero-shuffle which should always
return SDValue();
return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
- DL, VT, NumElements, Scale, AnyExt, InputV, Subtarget, DAG);
+ DL, VT, Scale, AnyExt, InputV, Subtarget, DAG);
};
// The widest scale possible for extending is to a 64-bit integer.
// many elements.
for (; NumExtElements < NumElements; NumExtElements *= 2) {
assert(NumElements % NumExtElements == 0 &&
- "The input vector size must be divisble by the extended size.");
+ "The input vector size must be divisible by the extended size.");
if (SDValue V = Lower(NumElements / NumExtElements))
return V;
}
+ // General extends failed, but 128-bit vectors may be able to use MOVQ.
+ if (Bits != 128)
+ return SDValue();
+
+ // Returns one of the source operands if the shuffle can be reduced to a
+ // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
+ auto CanZExtLowHalf = [&]() {
+ for (int i = NumElements / 2; i != NumElements; i++)
+ if (!Zeroable[i])
+ return SDValue();
+ if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
+ return V1;
+ if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
+ return V2;
+ return SDValue();
+ };
+
+ if (SDValue V = CanZExtLowHalf()) {
+ V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, V);
+ V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
+ return DAG.getNode(ISD::BITCAST, DL, VT, V);
+ }
+
// No viable ext lowering found.
return SDValue();
}
return DAG.getNode(X86ISD::VBROADCAST, DL, VT, V);
}
+// Check for whether we can use INSERTPS to perform the shuffle. We only use
+// INSERTPS when the V1 elements are already in the correct locations
+// because otherwise we can just always use two SHUFPS instructions which
+// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
+// perform INSERTPS if a single V1 element is out of place and all V2
+// elements are zeroable.
+static SDValue lowerVectorShuffleAsInsertPS(SDValue Op, SDValue V1, SDValue V2,
+ ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
+ assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!");
+ assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
+ assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
+
+ SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+
+ unsigned ZMask = 0;
+ int V1DstIndex = -1;
+ int V2DstIndex = -1;
+ bool V1UsedInPlace = false;
+
+ for (int i = 0; i < 4; i++) {
+ // Synthesize a zero mask from the zeroable elements (includes undefs).
+ if (Zeroable[i]) {
+ ZMask |= 1 << i;
+ continue;
+ }
+
+ // Flag if we use any V1 inputs in place.
+ if (i == Mask[i]) {
+ V1UsedInPlace = true;
+ continue;
+ }
+
+ // We can only insert a single non-zeroable element.
+ if (V1DstIndex != -1 || V2DstIndex != -1)
+ return SDValue();
+
+ if (Mask[i] < 4) {
+ // V1 input out of place for insertion.
+ V1DstIndex = i;
+ } else {
+ // V2 input for insertion.
+ V2DstIndex = i;
+ }
+ }
+
+ // Don't bother if we have no (non-zeroable) element for insertion.
+ if (V1DstIndex == -1 && V2DstIndex == -1)
+ return SDValue();
+
+ // Determine element insertion src/dst indices. The src index is from the
+ // start of the inserted vector, not the start of the concatenated vector.
+ unsigned V2SrcIndex = 0;
+ if (V1DstIndex != -1) {
+ // If we have a V1 input out of place, we use V1 as the V2 element insertion
+ // and don't use the original V2 at all.
+ V2SrcIndex = Mask[V1DstIndex];
+ V2DstIndex = V1DstIndex;
+ V2 = V1;
+ } else {
+ V2SrcIndex = Mask[V2DstIndex] - 4;
+ }
+
+ // If no V1 inputs are used in place, then the result is created only from
+ // the zero mask and the V2 insertion - so remove V1 dependency.
+ if (!V1UsedInPlace)
+ V1 = DAG.getUNDEF(MVT::v4f32);
+
+ unsigned InsertPSMask = V2SrcIndex << 6 | V2DstIndex << 4 | ZMask;
+ assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
+
+ // Insert the V2 element into the desired position.
+ SDLoc DL(Op);
+ return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
+ DAG.getConstant(InsertPSMask, MVT::i8));
+}
+
/// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
///
/// This is the basis function for the 2-lane 64-bit shuffles as we have full
assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
if (isSingleInputShuffleMask(Mask)) {
+ // Use low duplicate instructions for masks that match their pattern.
+ if (Subtarget->hasSSE3())
+ if (isShuffleEquivalent(Mask, 0, 0))
+ return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, V1);
+
// Straight shuffle of a single input vector. Simulate this by using the
// single input as both of the "inputs" to this instruction..
unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
Mask, Subtarget, DAG))
return Broadcast;
+ // Use even/odd duplicate instructions for masks that match their pattern.
+ if (Subtarget->hasSSE3()) {
+ if (isShuffleEquivalent(Mask, 0, 0, 2, 2))
+ return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
+ if (isShuffleEquivalent(Mask, 1, 1, 3, 3))
+ return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
+ }
+
if (Subtarget->hasAVX()) {
// If we have AVX, we can use VPERMILPS which will allow folding a load
// into the shuffle.
Mask, Subtarget, DAG))
return V;
- if (Subtarget->hasSSE41())
+ if (Subtarget->hasSSE41()) {
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
Subtarget, DAG))
return Blend;
- // Check for whether we can use INSERTPS to perform the blend. We only use
- // INSERTPS when the V1 elements are already in the correct locations
- // because otherwise we can just always use two SHUFPS instructions which
- // are much smaller to encode than a SHUFPS and an INSERTPS.
- if (NumV2Elements == 1 && Subtarget->hasSSE41()) {
- int V2Index =
- std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) -
- Mask.begin();
-
- // When using INSERTPS we can zero any lane of the destination. Collect
- // the zero inputs into a mask and drop them from the lanes of V1 which
- // actually need to be present as inputs to the INSERTPS.
- SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
-
- // Synthesize a shuffle mask for the non-zero and non-v2 inputs.
- bool InsertNeedsShuffle = false;
- unsigned ZMask = 0;
- for (int i = 0; i < 4; ++i)
- if (i != V2Index) {
- if (Zeroable[i]) {
- ZMask |= 1 << i;
- } else if (Mask[i] != i) {
- InsertNeedsShuffle = true;
- break;
- }
- }
-
- // We don't want to use INSERTPS or other insertion techniques if it will
- // require shuffling anyways.
- if (!InsertNeedsShuffle) {
- // If all of V1 is zeroable, replace it with undef.
- if ((ZMask | 1 << V2Index) == 0xF)
- V1 = DAG.getUNDEF(MVT::v4f32);
-
- unsigned InsertPSMask = (Mask[V2Index] - 4) << 6 | V2Index << 4 | ZMask;
- assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
-
- // Insert the V2 element into the desired position.
- return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
- DAG.getConstant(InsertPSMask, MVT::i8));
- }
+ // Use INSERTPS if we can complete the shuffle efficiently.
+ if (SDValue V = lowerVectorShuffleAsInsertPS(Op, V1, V2, Mask, DAG))
+ return V;
}
// Otherwise fall back to a SHUFPS lowering strategy.
getV4X86ShuffleImm8ForMask(Mask, DAG));
}
+ // Try to use bit shift instructions.
+ if (SDValue Shift = lowerVectorShuffleAsBitShift(
+ DL, MVT::v4i32, V1, V2, Mask, DAG))
+ return Shift;
+
// Try to use byte shift instructions.
if (SDValue Shift = lowerVectorShuffleAsByteShift(
DL, MVT::v4i32, V1, V2, Mask, DAG))
Mask, Subtarget, DAG))
return V;
+ if (Subtarget->hasSSE41())
+ if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
+ Subtarget, DAG))
+ return Blend;
+
+ if (SDValue Masked =
+ lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask, DAG))
+ return Masked;
+
// Use dedicated unpack instructions for masks that match their pattern.
if (isShuffleEquivalent(Mask, 0, 4, 1, 5))
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V1, V2);
if (isShuffleEquivalent(Mask, 2, 6, 3, 7))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V1, V2);
- if (Subtarget->hasSSE41())
- if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
- Subtarget, DAG))
- return Blend;
-
// Try to use byte rotation instructions.
// Its more profitable for pre-SSSE3 to use shuffles/unpacks.
if (Subtarget->hasSSSE3())
Mask, Subtarget, DAG))
return Broadcast;
+ // Try to use bit shift instructions.
+ if (SDValue Shift = lowerVectorShuffleAsBitShift(
+ DL, MVT::v8i16, V, V, Mask, DAG))
+ return Shift;
+
// Try to use byte shift instructions.
if (SDValue Shift = lowerVectorShuffleAsByteShift(
DL, MVT::v8i16, V, V, Mask, DAG))
assert(NumV1Inputs > 0 && "All single-input shuffles should be canonicalized "
"to be V1-input shuffles.");
+ // Try to use bit shift instructions.
+ if (SDValue Shift = lowerVectorShuffleAsBitShift(
+ DL, MVT::v8i16, V1, V2, Mask, DAG))
+ return Shift;
+
// Try to use byte shift instructions.
if (SDValue Shift = lowerVectorShuffleAsByteShift(
DL, MVT::v8i16, V1, V2, Mask, DAG))
Mask, Subtarget, DAG))
return V;
+ if (Subtarget->hasSSE41())
+ if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
+ Subtarget, DAG))
+ return Blend;
+
+ if (SDValue Masked =
+ lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask, DAG))
+ return Masked;
+
// Use dedicated unpack instructions for masks that match their pattern.
if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 2, 10, 3, 11))
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V1, V2);
if (isShuffleEquivalent(Mask, 4, 12, 5, 13, 6, 14, 7, 15))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V1, V2);
- if (Subtarget->hasSSE41())
- if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
- Subtarget, DAG))
- return Blend;
-
// Try to use byte rotation instructions.
if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
ArrayRef<int> OrigMask = SVOp->getMask();
assert(OrigMask.size() == 16 && "Unexpected mask size for v16 shuffle!");
+ // Try to use bit shift instructions.
+ if (SDValue Shift = lowerVectorShuffleAsBitShift(
+ DL, MVT::v16i8, V1, V2, OrigMask, DAG))
+ return Shift;
+
// Try to use byte shift instructions.
if (SDValue Shift = lowerVectorShuffleAsByteShift(
DL, MVT::v16i8, V1, V2, OrigMask, DAG))
if (Subtarget->hasSSSE3()) {
SDValue V1Mask[16];
SDValue V2Mask[16];
- for (int i = 0; i < 16; ++i)
+ bool V1InUse = false;
+ bool V2InUse = false;
+ SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+
+ for (int i = 0; i < 16; ++i) {
if (Mask[i] == -1) {
V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
} else {
- V1Mask[i] = DAG.getConstant(Mask[i] < 16 ? Mask[i] : 0x80, MVT::i8);
- V2Mask[i] =
- DAG.getConstant(Mask[i] < 16 ? 0x80 : Mask[i] - 16, MVT::i8);
+ const int ZeroMask = 0x80;
+ int V1Idx = (Mask[i] < 16 ? Mask[i] : ZeroMask);
+ int V2Idx = (Mask[i] < 16 ? ZeroMask : Mask[i] - 16);
+ if (Zeroable[i])
+ V1Idx = V2Idx = ZeroMask;
+ V1Mask[i] = DAG.getConstant(V1Idx, MVT::i8);
+ V2Mask[i] = DAG.getConstant(V2Idx, MVT::i8);
+ V1InUse |= (ZeroMask != V1Idx);
+ V2InUse |= (ZeroMask != V2Idx);
}
- V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V1,
- DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V1Mask));
- if (isSingleInputShuffleMask(Mask))
- return V1; // Single inputs are easy.
+ }
+
+ if (V1InUse)
+ V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V1,
+ DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V1Mask));
+ if (V2InUse)
+ V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V2,
+ DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V2Mask));
- // Otherwise, blend the two.
- V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V2,
- DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V2Mask));
- return DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
+ // If we need shuffled inputs from both, blend the two.
+ if (V1InUse && V2InUse)
+ return DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
+ if (V1InUse)
+ return V1; // Single inputs are easy.
+ if (V2InUse)
+ return V2; // Single inputs are easy.
+ // Shuffling to a zeroable vector.
+ return getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
}
// There are special ways we can lower some single-element blends.
Mask, Subtarget, DAG))
return Broadcast;
+ // Use low duplicate instructions for masks that match their pattern.
+ if (isShuffleEquivalent(Mask, 0, 0, 2, 2))
+ return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
+
if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
// Non-half-crossing single input shuffles can be lowerid with an
// interleaved permutation.
if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
assert(RepeatedMask.size() == 4 &&
"Repeated masks must be half the mask width!");
+
+ // Use even/odd duplicate instructions for masks that match their pattern.
+ if (isShuffleEquivalent(Mask, 0, 0, 2, 2, 4, 4, 6, 6))
+ return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
+ if (isShuffleEquivalent(Mask, 1, 1, 3, 3, 5, 5, 7, 7))
+ return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
+
if (isSingleInputShuffleMask(Mask))
return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
getV4X86ShuffleImm8ForMask(RepeatedMask, DAG));
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
assert(Subtarget->hasAVX2() && "We can only lower v8i32 with AVX2!");
+ // Whenever we can lower this as a zext, that instruction is strictly faster
+ // than any alternative. It also allows us to fold memory operands into the
+ // shuffle in many cases.
+ if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2,
+ Mask, Subtarget, DAG))
+ return ZExt;
+
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
Subtarget, DAG))
return Blend;
DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask), V1);
}
+ // Try to use bit shift instructions.
+ if (SDValue Shift = lowerVectorShuffleAsBitShift(
+ DL, MVT::v8i32, V1, V2, Mask, DAG))
+ return Shift;
+
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle.
if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
assert(Subtarget->hasAVX2() && "We can only lower v16i16 with AVX2!");
+ // Whenever we can lower this as a zext, that instruction is strictly faster
+ // than any alternative. It also allows us to fold memory operands into the
+ // shuffle in many cases.
+ if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v16i16, V1, V2,
+ Mask, Subtarget, DAG))
+ return ZExt;
+
// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v16i16, DL, V1,
Mask, Subtarget, DAG))
DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask)));
}
+ // Try to use bit shift instructions.
+ if (SDValue Shift = lowerVectorShuffleAsBitShift(
+ DL, MVT::v16i16, V1, V2, Mask, DAG))
+ return Shift;
+
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle.
if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
assert(Subtarget->hasAVX2() && "We can only lower v32i8 with AVX2!");
+ // Whenever we can lower this as a zext, that instruction is strictly faster
+ // than any alternative. It also allows us to fold memory operands into the
+ // shuffle in many cases.
+ if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2,
+ Mask, Subtarget, DAG))
+ return ZExt;
+
// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v32i8, DL, V1,
Mask, Subtarget, DAG))
DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask));
}
+ // Try to use bit shift instructions.
+ if (SDValue Shift = lowerVectorShuffleAsBitShift(
+ DL, MVT::v32i8, V1, V2, Mask, DAG))
+ return Shift;
+
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle.
if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
+ // X86 has dedicated unpack instructions that can handle specific blend
+ // operations: UNPCKH and UNPCKL.
+ if (isShuffleEquivalent(Mask, 0, 8, 2, 10, 4, 12, 6, 14))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f64, V1, V2);
+ if (isShuffleEquivalent(Mask, 1, 9, 3, 11, 5, 13, 7, 15))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f64, V1, V2);
+
// FIXME: Implement direct support for this type!
return splitAndLowerVectorShuffle(DL, MVT::v8f64, V1, V2, Mask, DAG);
}
ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (isShuffleEquivalent(Mask,
+ 0, 16, 1, 17, 4, 20, 5, 21,
+ 8, 24, 9, 25, 12, 28, 13, 29))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16f32, V1, V2);
+ if (isShuffleEquivalent(Mask,
+ 2, 18, 3, 19, 6, 22, 7, 23,
+ 10, 26, 11, 27, 14, 30, 15, 31))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16f32, V1, V2);
+
// FIXME: Implement direct support for this type!
return splitAndLowerVectorShuffle(DL, MVT::v16f32, V1, V2, Mask, DAG);
}
ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
+ // X86 has dedicated unpack instructions that can handle specific blend
+ // operations: UNPCKH and UNPCKL.
+ if (isShuffleEquivalent(Mask, 0, 8, 2, 10, 4, 12, 6, 14))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i64, V1, V2);
+ if (isShuffleEquivalent(Mask, 1, 9, 3, 11, 5, 13, 7, 15))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i64, V1, V2);
+
// FIXME: Implement direct support for this type!
return splitAndLowerVectorShuffle(DL, MVT::v8i64, V1, V2, Mask, DAG);
}
ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (isShuffleEquivalent(Mask,
+ 0, 16, 1, 17, 4, 20, 5, 21,
+ 8, 24, 9, 25, 12, 28, 13, 29))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i32, V1, V2);
+ if (isShuffleEquivalent(Mask,
+ 2, 18, 3, 19, 6, 22, 7, 23,
+ 10, 26, 11, 27, 14, 30, 15, 31))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i32, V1, V2);
+
// FIXME: Implement direct support for this type!
return splitAndLowerVectorShuffle(DL, MVT::v16i32, V1, V2, Mask, DAG);
}
unsigned NumLanes = (NumElems - 1) / 8 + 1;
unsigned NumElemsInLane = NumElems / NumLanes;
- // Blend for v16i16 should be symetric for the both lanes.
+ // Blend for v16i16 should be symmetric for both lanes.
for (unsigned i = 0; i < NumElemsInLane; ++i) {
int SndLaneEltIdx = (NumLanes == 2) ? MaskVals[i + NumElemsInLane] : -1;
SDValue getMOVDDup(SDValue &Op, SDLoc &dl, SDValue V1, SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
- // Canonizalize to v2f64.
+ // Canonicalize to v2f64.
V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1);
return DAG.getNode(ISD::BITCAST, dl, VT,
getTargetShuffleNode(X86ISD::MOVDDUP, dl, MVT::v2f64,
if (HasSSE2 && VT == MVT::v2f64)
return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG);
- // v4f32 or v4i32: canonizalized to v4f32 (which is legal for SSE1)
+ // v4f32 or v4i32: canonicalize to v4f32 (which is legal for SSE1)
return DAG.getNode(ISD::BITCAST, dl, VT,
getTargetShuffleNode(X86ISD::MOVLHPS, dl, MVT::v4f32,
DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V1),
DAG);
unsigned MaskValue;
- if (isBlendMask(M, VT, Subtarget->hasSSE41(), Subtarget->hasInt256(),
- &MaskValue))
+ if (isBlendMask(M, VT, Subtarget->hasSSE41(), HasInt256, &MaskValue))
return LowerVECTOR_SHUFFLEtoBlend(SVOp, MaskValue, Subtarget, DAG);
if (isSHUFPMask(M, VT))
return NewOp;
}
- if (VT == MVT::v16i16 && Subtarget->hasInt256()) {
+ if (VT == MVT::v16i16 && HasInt256) {
SDValue NewOp = LowerVECTOR_SHUFFLEv16i16(Op, DAG);
if (NewOp.getNode())
return NewOp;
MVT EltVT = Op.getSimpleValueType();
assert((EltVT == MVT::i1) && "Unexpected operands in ExtractBitFromMaskVector");
+ assert((VecVT.getVectorNumElements() <= 16 || Subtarget->hasBWI()) &&
+ "Unexpected vector type in ExtractBitFromMaskVector");
// variable index can't be handled in mask registers,
// extend vector to VR512
unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
const TargetRegisterClass* rc = getRegClassFor(VecVT);
+ if (!Subtarget->hasDQI() && (VecVT.getVectorNumElements() <= 8))
+ rc = getRegClassFor(MVT::v16i1);
unsigned MaxSift = rc->getSize()*8 - 1;
Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
DAG.getConstant(MaxSift - IdxVal, MVT::i8));
// the upper bits of a vector.
static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
- if (Subtarget->hasFp256()) {
- SDLoc dl(Op.getNode());
- SDValue Vec = Op.getNode()->getOperand(0);
- SDValue SubVec = Op.getNode()->getOperand(1);
- SDValue Idx = Op.getNode()->getOperand(2);
-
- if ((Op.getNode()->getSimpleValueType(0).is256BitVector() ||
- Op.getNode()->getSimpleValueType(0).is512BitVector()) &&
- SubVec.getNode()->getSimpleValueType(0).is128BitVector() &&
- isa<ConstantSDNode>(Idx)) {
- unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
- return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
- }
+ if (!Subtarget->hasAVX())
+ return SDValue();
- if (Op.getNode()->getSimpleValueType(0).is512BitVector() &&
- SubVec.getNode()->getSimpleValueType(0).is256BitVector() &&
- isa<ConstantSDNode>(Idx)) {
- unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
- return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl);
+ SDLoc dl(Op);
+ SDValue Vec = Op.getOperand(0);
+ SDValue SubVec = Op.getOperand(1);
+ SDValue Idx = Op.getOperand(2);
+
+ if (!isa<ConstantSDNode>(Idx))
+ return SDValue();
+
+ unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+ MVT OpVT = Op.getSimpleValueType();
+ MVT SubVecVT = SubVec.getSimpleValueType();
+
+ // Fold two 16-byte subvector loads into one 32-byte load:
+ // (insert_subvector (insert_subvector undef, (load addr), 0),
+ // (load addr + 16), Elts/2)
+ // --> load32 addr
+ if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
+ Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
+ OpVT.is256BitVector() && SubVecVT.is128BitVector() &&
+ !Subtarget->isUnalignedMem32Slow()) {
+ SDValue SubVec2 = Vec.getOperand(1);
+ if (auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2))) {
+ if (Idx2->getZExtValue() == 0) {
+ SDValue Ops[] = { SubVec2, SubVec };
+ SDValue LD = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false);
+ if (LD.getNode())
+ return LD;
+ }
}
}
+
+ if ((OpVT.is256BitVector() || OpVT.is512BitVector()) &&
+ SubVecVT.is128BitVector())
+ return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
+
+ if (OpVT.is512BitVector() && SubVecVT.is256BitVector())
+ return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl);
+
return SDValue();
}
SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1);
// Next, clear the sign bit from the first operand (magnitude).
- CV[0] = ConstantFP::get(
- *Context, APFloat(Sem, APInt::getLowBitsSet(SizeInBits, SizeInBits - 1)));
+ // If it's a constant, we can clear it here.
+ if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Op0)) {
+ APFloat APF = Op0CN->getValueAPF();
+ // If the magnitude is a positive zero, the sign bit alone is enough.
+ if (APF.isPosZero())
+ return SignBit;
+ APF.clearSign();
+ CV[0] = ConstantFP::get(*Context, APF);
+ } else {
+ CV[0] = ConstantFP::get(
+ *Context,
+ APFloat(Sem, APInt::getLowBitsSet(SizeInBits, SizeInBits - 1)));
+ }
C = ConstantVector::get(CV);
CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(), 16);
- SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
- MachinePointerInfo::getConstantPool(),
- false, false, false, 16);
- SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2);
+ SDValue Val = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
+ MachinePointerInfo::getConstantPool(),
+ false, false, false, 16);
+ // If the magnitude operand wasn't a constant, we need to AND out the sign.
+ if (!isa<ConstantFPSDNode>(Op0))
+ Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Val);
// OR the magnitude value with the sign bit.
return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit);
/// equivalent.
SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl,
SelectionDAG &DAG) const {
- if (Op.getValueType() == MVT::i1)
- // KORTEST instruction should be selected
- return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
- DAG.getConstant(0, Op.getValueType()));
-
+ if (Op.getValueType() == MVT::i1) {
+ SDValue ExtOp = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, Op);
+ return DAG.getNode(X86ISD::CMP, dl, MVT::i32, ExtOp,
+ DAG.getConstant(0, MVT::i8));
+ }
// CF and OF aren't always set the way we want. Determine which
// of these we need.
bool NeedCF = false;
// may emit an illegal shuffle but the expansion is still better than scalar
// code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
// we'll emit a shuffle and a arithmetic shift.
+// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
// TODO: It is possible to support ZExt by zeroing the undef values during
// the shuffle phase or after the shuffle.
static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget,
// Attempt to load the original value using scalar loads.
// Find the largest scalar type that divides the total loaded size.
MVT SclrLoadTy = MVT::i8;
- for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
- tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
- MVT Tp = (MVT::SimpleValueType)tp;
+ for (MVT Tp : MVT::integer_valuetypes()) {
if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
SclrLoadTy = Tp;
}
SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
Chain = SP.getValue(1);
unsigned Align = cast<ConstantSDNode>(Tmp3)->getZExtValue();
- const TargetFrameLowering &TFI = *DAG.getSubtarget().getFrameLowering();
+ const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
unsigned StackAlign = TFI.getStackAlignment();
Tmp1 = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
if (Align > StackAlign)
Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag);
- const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
- DAG.getSubtarget().getRegisterInfo());
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
unsigned SPReg = RegInfo->getStackRegister();
SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
Chain = SP.getValue(1);
}
const X86Subtarget &Subtarget =
- DAG.getTarget().getSubtarget<X86Subtarget>();
+ static_cast<const X86Subtarget &>(DAG.getSubtarget());
if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
// Let the shuffle legalizer expand this shift amount node.
/// The mask is comming as MVT::i8 and it should be truncated
/// to MVT::i1 while lowering masking intrinsics.
/// The main difference between ScalarMaskingNode and VectorMaskingNode is using
-/// "X86select" instead of "vselect". We just can't create the "vselect" node for
+/// "X86select" instead of "vselect". We just can't create the "vselect" node for
/// a scalar instruction.
static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
SDValue PreservedSrc,
return DAG.getNode(X86ISD::SELECT, dl, VT, IMask, Op, PreservedSrc);
}
-static unsigned getOpcodeForFMAIntrinsic(unsigned IntNo) {
- switch (IntNo) {
- default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
- case Intrinsic::x86_fma_vfmadd_ps:
- case Intrinsic::x86_fma_vfmadd_pd:
- case Intrinsic::x86_fma_vfmadd_ps_256:
- case Intrinsic::x86_fma_vfmadd_pd_256:
- case Intrinsic::x86_fma_mask_vfmadd_ps_512:
- case Intrinsic::x86_fma_mask_vfmadd_pd_512:
- return X86ISD::FMADD;
- case Intrinsic::x86_fma_vfmsub_ps:
- case Intrinsic::x86_fma_vfmsub_pd:
- case Intrinsic::x86_fma_vfmsub_ps_256:
- case Intrinsic::x86_fma_vfmsub_pd_256:
- case Intrinsic::x86_fma_mask_vfmsub_ps_512:
- case Intrinsic::x86_fma_mask_vfmsub_pd_512:
- return X86ISD::FMSUB;
- case Intrinsic::x86_fma_vfnmadd_ps:
- case Intrinsic::x86_fma_vfnmadd_pd:
- case Intrinsic::x86_fma_vfnmadd_ps_256:
- case Intrinsic::x86_fma_vfnmadd_pd_256:
- case Intrinsic::x86_fma_mask_vfnmadd_ps_512:
- case Intrinsic::x86_fma_mask_vfnmadd_pd_512:
- return X86ISD::FNMADD;
- case Intrinsic::x86_fma_vfnmsub_ps:
- case Intrinsic::x86_fma_vfnmsub_pd:
- case Intrinsic::x86_fma_vfnmsub_ps_256:
- case Intrinsic::x86_fma_vfnmsub_pd_256:
- case Intrinsic::x86_fma_mask_vfnmsub_ps_512:
- case Intrinsic::x86_fma_mask_vfnmsub_pd_512:
- return X86ISD::FNMSUB;
- case Intrinsic::x86_fma_vfmaddsub_ps:
- case Intrinsic::x86_fma_vfmaddsub_pd:
- case Intrinsic::x86_fma_vfmaddsub_ps_256:
- case Intrinsic::x86_fma_vfmaddsub_pd_256:
- case Intrinsic::x86_fma_mask_vfmaddsub_ps_512:
- case Intrinsic::x86_fma_mask_vfmaddsub_pd_512:
- return X86ISD::FMADDSUB;
- case Intrinsic::x86_fma_vfmsubadd_ps:
- case Intrinsic::x86_fma_vfmsubadd_pd:
- case Intrinsic::x86_fma_vfmsubadd_ps_256:
- case Intrinsic::x86_fma_vfmsubadd_pd_256:
- case Intrinsic::x86_fma_mask_vfmsubadd_ps_512:
- case Intrinsic::x86_fma_mask_vfmsubadd_pd_512:
- return X86ISD::FMSUBADD;
- }
-}
-
static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
SDLoc dl(Op);
Mask, Src0, Subtarget, DAG);
}
case INTR_TYPE_2OP_MASK: {
- return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Op.getOperand(1),
+ SDValue Mask = Op.getOperand(4);
+ SDValue PassThru = Op.getOperand(3);
+ unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
+ if (IntrWithRoundingModeOpcode != 0) {
+ unsigned Round = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue();
+ if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) {
+ return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
+ dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2),
+ Op.getOperand(3), Op.getOperand(5)),
+ Mask, PassThru, Subtarget, DAG);
+ }
+ }
+ return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
+ Op.getOperand(1),
Op.getOperand(2)),
- Op.getOperand(4), Op.getOperand(3), Subtarget, DAG);
+ Mask, PassThru, Subtarget, DAG);
+ }
+ case FMA_OP_MASK: {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue Src3 = Op.getOperand(3);
+ SDValue Mask = Op.getOperand(4);
+ unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
+ if (IntrWithRoundingModeOpcode != 0) {
+ SDValue Rnd = Op.getOperand(5);
+ if (cast<ConstantSDNode>(Rnd)->getZExtValue() !=
+ X86::STATIC_ROUNDING::CUR_DIRECTION)
+ return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
+ dl, Op.getValueType(),
+ Src1, Src2, Src3, Rnd),
+ Mask, Src1, Subtarget, DAG);
+ }
+ return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
+ dl, Op.getValueType(),
+ Src1, Src2, Src3),
+ Mask, Src1, Subtarget, DAG);
}
case CMP_MASK:
case CMP_MASK_CC: {
return DAG.getNode(IntrData->Opc0, dl, VT, VMask, Op.getOperand(1),
Op.getOperand(2));
}
- case FMA_OP_MASK:
- {
- return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
- dl, Op.getValueType(),
- Op.getOperand(1),
- Op.getOperand(2),
- Op.getOperand(3)),
- Op.getOperand(4), Op.getOperand(1),
- Subtarget, DAG);
- }
default:
break;
}
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
return DAG.getNode(Opcode, dl, VTs, NewOps);
}
-
- case Intrinsic::x86_fma_mask_vfmadd_ps_512:
- case Intrinsic::x86_fma_mask_vfmadd_pd_512:
- case Intrinsic::x86_fma_mask_vfmsub_ps_512:
- case Intrinsic::x86_fma_mask_vfmsub_pd_512:
- case Intrinsic::x86_fma_mask_vfnmadd_ps_512:
- case Intrinsic::x86_fma_mask_vfnmadd_pd_512:
- case Intrinsic::x86_fma_mask_vfnmsub_ps_512:
- case Intrinsic::x86_fma_mask_vfnmsub_pd_512:
- case Intrinsic::x86_fma_mask_vfmaddsub_ps_512:
- case Intrinsic::x86_fma_mask_vfmaddsub_pd_512:
- case Intrinsic::x86_fma_mask_vfmsubadd_ps_512:
- case Intrinsic::x86_fma_mask_vfmsubadd_pd_512: {
- auto *SAE = cast<ConstantSDNode>(Op.getOperand(5));
- if (SAE->getZExtValue() == X86::STATIC_ROUNDING::CUR_DIRECTION)
- return getVectorMaskingNode(DAG.getNode(getOpcodeForFMAIntrinsic(IntNo),
- dl, Op.getValueType(),
- Op.getOperand(1),
- Op.getOperand(2),
- Op.getOperand(3)),
- Op.getOperand(4), Op.getOperand(1),
- Subtarget, DAG);
- else
- return SDValue();
- }
-
- case Intrinsic::x86_fma_vfmadd_ps:
- case Intrinsic::x86_fma_vfmadd_pd:
- case Intrinsic::x86_fma_vfmsub_ps:
- case Intrinsic::x86_fma_vfmsub_pd:
- case Intrinsic::x86_fma_vfnmadd_ps:
- case Intrinsic::x86_fma_vfnmadd_pd:
- case Intrinsic::x86_fma_vfnmsub_ps:
- case Intrinsic::x86_fma_vfnmsub_pd:
- case Intrinsic::x86_fma_vfmaddsub_ps:
- case Intrinsic::x86_fma_vfmaddsub_pd:
- case Intrinsic::x86_fma_vfmsubadd_ps:
- case Intrinsic::x86_fma_vfmsubadd_pd:
- case Intrinsic::x86_fma_vfmadd_ps_256:
- case Intrinsic::x86_fma_vfmadd_pd_256:
- case Intrinsic::x86_fma_vfmsub_ps_256:
- case Intrinsic::x86_fma_vfmsub_pd_256:
- case Intrinsic::x86_fma_vfnmadd_ps_256:
- case Intrinsic::x86_fma_vfnmadd_pd_256:
- case Intrinsic::x86_fma_vfnmsub_ps_256:
- case Intrinsic::x86_fma_vfnmsub_pd_256:
- case Intrinsic::x86_fma_vfmaddsub_ps_256:
- case Intrinsic::x86_fma_vfmaddsub_pd_256:
- case Intrinsic::x86_fma_vfmsubadd_ps_256:
- case Intrinsic::x86_fma_vfmsubadd_pd_256:
- return DAG.getNode(getOpcodeForFMAIntrinsic(IntNo), dl, Op.getValueType(),
- Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
}
}
if (Depth > 0) {
SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
- const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
- DAG.getSubtarget().getRegisterInfo());
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), PtrVT);
return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
DAG.getNode(ISD::ADD, dl, PtrVT,
}
SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
- MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
+ EVT VT = Op.getValueType();
+
MFI->setFrameAddressIsTaken(true);
- EVT VT = Op.getValueType();
+ if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
+ // Depth > 0 makes no sense on targets which use Windows unwind codes. It
+ // is not possible to crawl up the stack without looking at the unwind codes
+ // simultaneously.
+ int FrameAddrIndex = FuncInfo->getFAIndex();
+ if (!FrameAddrIndex) {
+ // Set up a frame object for the return address.
+ unsigned SlotSize = RegInfo->getSlotSize();
+ FrameAddrIndex = MF.getFrameInfo()->CreateFixedObject(
+ SlotSize, /*Offset=*/INT64_MIN, /*IsImmutable=*/false);
+ FuncInfo->setFAIndex(FrameAddrIndex);
+ }
+ return DAG.getFrameIndex(FrameAddrIndex, VT);
+ }
+
+ unsigned FrameReg =
+ RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
SDLoc dl(Op); // FIXME probably not meaningful
unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
- const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
- DAG.getSubtarget().getRegisterInfo());
- unsigned FrameReg = RegInfo->getPtrSizedFrameRegister(
- DAG.getMachineFunction());
assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
(FrameReg == X86::EBP && VT == MVT::i32)) &&
"Invalid Frame Register!");
SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
SelectionDAG &DAG) const {
- const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
- DAG.getSubtarget().getRegisterInfo());
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize());
}
SDLoc dl (Op);
EVT PtrVT = getPointerTy();
- const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
- DAG.getSubtarget().getRegisterInfo());
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
(FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
SDLoc dl (Op);
const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
- const TargetRegisterInfo *TRI = DAG.getSubtarget().getRegisterInfo();
+ const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
if (Subtarget->is64Bit()) {
SDValue OutChains[6];
*/
MachineFunction &MF = DAG.getMachineFunction();
- const TargetMachine &TM = MF.getTarget();
- const TargetFrameLowering &TFI = *TM.getSubtargetImpl()->getFrameLowering();
+ const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
unsigned StackAlignment = TFI.getStackAlignment();
MVT VT = Op.getSimpleValueType();
SDLoc DL(Op);
/// Used to know whether to use cmpxchg8/16b when expanding atomic operations
/// (otherwise we leave them alone to become __sync_fetch_and_... calls).
bool X86TargetLowering::needsCmpXchgNb(const Type *MemType) const {
- const X86Subtarget &Subtarget =
- getTargetMachine().getSubtarget<X86Subtarget>();
unsigned OpWidth = MemType->getPrimitiveSizeInBits();
if (OpWidth == 64)
- return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
+ return !Subtarget->is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
else if (OpWidth == 128)
- return Subtarget.hasCmpxchg16b();
+ return Subtarget->hasCmpxchg16b();
else
return false;
}
}
bool X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
- const X86Subtarget &Subtarget =
- getTargetMachine().getSubtarget<X86Subtarget>();
- unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
+ unsigned NativeWidth = Subtarget->is64Bit() ? 64 : 32;
const Type *MemType = AI->getType();
// If the operand is too big, we must see if cmpxchg8/16b is available
LoadInst *
X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
- const X86Subtarget &Subtarget =
- getTargetMachine().getSubtarget<X86Subtarget>();
- unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
+ unsigned NativeWidth = Subtarget->is64Bit() ? 64 : 32;
const Type *MemType = AI->getType();
// Accesses larger than the native width are turned into cmpxchg/libcalls, so
// there is no benefit in turning such RMWs into loads, and it is actually
// FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
// the IR level, so we must wrap it in an intrinsic.
return nullptr;
- } else if (hasMFENCE(Subtarget)) {
+ } else if (hasMFENCE(*Subtarget)) {
Function *MFence = llvm::Intrinsic::getDeclaration(M,
Intrinsic::x86_sse2_mfence);
Builder.CreateCall(MFence);
switch (N->getOpcode()) {
default:
llvm_unreachable("Do not know how to custom type legalize this operation!");
+ // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
+ case X86ISD::FMINC:
+ case X86ISD::FMIN:
+ case X86ISD::FMAXC:
+ case X86ISD::FMAX: {
+ EVT VT = N->getValueType(0);
+ if (VT != MVT::v2f32)
+ llvm_unreachable("Unexpected type (!= v2f32) on FMIN/FMAX.");
+ SDValue UNDEF = DAG.getUNDEF(VT);
+ SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
+ N->getOperand(0), UNDEF);
+ SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
+ N->getOperand(1), UNDEF);
+ Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
+ return;
+ }
case ISD::SIGN_EXTEND_INREG:
case ISD::ADDC:
case ISD::ADDE:
case X86ISD::COMPRESS: return "X86ISD::COMPRESS";
case X86ISD::EXPAND: return "X86ISD::EXPAND";
case X86ISD::SELECT: return "X86ISD::SELECT";
+ case X86ISD::ADDSUB: return "X86ISD::ADDSUB";
+ case X86ISD::RCP28: return "X86ISD::RCP28";
+ case X86ISD::RSQRT28: return "X86ISD::RSQRT28";
}
}
return false;
}
+bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; }
+
bool
X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
if (!(Subtarget->hasFMA() || Subtarget->hasFMA4()))
if (VT.getSizeInBits() == 64)
return false;
+ // This is an experimental legality test that is tailored to match the
+ // legality test of the experimental lowering more closely. They are gated
+ // separately to ease testing of performance differences.
+ if (ExperimentalVectorShuffleLegality)
+ // We only care that the types being shuffled are legal. The lowering can
+ // handle any possible shuffle mask that results.
+ return isTypeLegal(SVT);
+
// If this is a single-input shuffle with no 128 bit lane crossings we can
// lower it into pshufb.
if ((SVT.is128BitVector() && Subtarget->hasSSSE3()) ||
return false;
MVT SVT = VT.getSimpleVT();
+
+ // This is an experimental legality test that is tailored to match the
+ // legality test of the experimental lowering more closely. They are gated
+ // separately to ease testing of performance differences.
+ if (ExperimentalVectorShuffleLegality)
+ // The new vector shuffle lowering is very good at managing zero-inputs.
+ return isShuffleMaskLegal(Mask, VT);
+
unsigned NumElts = SVT.getVectorNumElements();
// FIXME: This collection of masks seems suspect.
if (NumElts == 2)
return BB;
}
-static MachineBasicBlock * EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB,
- const TargetInstrInfo *TII,
- const X86Subtarget* Subtarget) {
+static MachineBasicBlock *EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB,
+ const X86Subtarget *Subtarget) {
DebugLoc dl = MI->getDebugLoc();
-
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
// Address into RAX/EAX, other two args into ECX, EDX.
unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r;
unsigned MemReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
}
MachineBasicBlock *
-X86TargetLowering::EmitVAARG64WithCustomInserter(
- MachineInstr *MI,
- MachineBasicBlock *MBB) const {
+X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr *MI,
+ MachineBasicBlock *MBB) const {
// Emit va_arg instruction on X86-64.
// Operands to this pseudo-instruction:
MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
// Machine Information
- const TargetInstrInfo *TII = MBB->getParent()->getSubtarget().getInstrInfo();
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
.setMemRefs(MMOBegin, MMOEnd);
// Jump to endMBB
- BuildMI(offsetMBB, DL, TII->get(X86::JMP_4))
+ BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
.addMBB(endMBB);
}
XMMSaveMBB->addSuccessor(EndMBB);
// Now add the instructions.
- const TargetInstrInfo *TII = MBB->getParent()->getSubtarget().getInstrInfo();
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
DebugLoc DL = MI->getDebugLoc();
unsigned CountReg = MI->getOperand(0).getReg();
if (!Subtarget->isTargetWin64()) {
// If %al is 0, branch around the XMM save block.
BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
- BuildMI(MBB, DL, TII->get(X86::JE_4)).addMBB(EndMBB);
+ BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
MBB->addSuccessor(EndMBB);
}
MachineBasicBlock *
X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
MachineBasicBlock *BB) const {
- const TargetInstrInfo *TII = BB->getParent()->getSubtarget().getInstrInfo();
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
DebugLoc DL = MI->getDebugLoc();
// To "insert" a SELECT_CC instruction, we actually have to insert the
// If the EFLAGS register isn't dead in the terminator, then claim that it's
// live into the sink and copy blocks.
- const TargetRegisterInfo *TRI =
- BB->getParent()->getSubtarget().getRegisterInfo();
+ const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
if (!MI->killsRegister(X86::EFLAGS) &&
!checkAndUpdateEFLAGSKill(MI, BB, TRI)) {
copy0MBB->addLiveIn(X86::EFLAGS);
X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI,
MachineBasicBlock *BB) const {
MachineFunction *MF = BB->getParent();
- const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
DebugLoc DL = MI->getDebugLoc();
const BasicBlock *LLVM_BB = BB->getBasicBlock();
BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
.addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
.addReg(SPLimitVReg);
- BuildMI(BB, DL, TII->get(X86::JG_4)).addMBB(mallocMBB);
+ BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB);
// bumpMBB simply decreases the stack pointer, since we know the current
// stacklet has enough space.
.addReg(SPLimitVReg);
BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
.addReg(SPLimitVReg);
- BuildMI(bumpMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB);
+ BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
// Calls into a routine in libgcc to allocate more space from the heap.
- const uint32_t *RegMask = MF->getTarget()
- .getSubtargetImpl()
- ->getRegisterInfo()
- ->getCallPreservedMask(CallingConv::C);
+ const uint32_t *RegMask =
+ Subtarget->getRegisterInfo()->getCallPreservedMask(CallingConv::C);
if (IsLP64) {
BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
.addReg(sizeVReg);
BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
.addReg(IsLP64 ? X86::RAX : X86::EAX);
- BuildMI(mallocMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB);
+ BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
// Set up the CFG correctly.
BB->addSuccessor(bumpMBB);
MachineBasicBlock *
X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI,
MachineBasicBlock *BB) const {
- const TargetInstrInfo *TII = BB->getParent()->getSubtarget().getInstrInfo();
DebugLoc DL = MI->getDebugLoc();
assert(!Subtarget->isTargetMachO());
- // The lowering is pretty easy: we're just emitting the call to _alloca. The
- // non-trivial part is impdef of ESP.
-
- if (Subtarget->isTargetWin64()) {
- if (Subtarget->isTargetCygMing()) {
- // ___chkstk(Mingw64):
- // Clobbers R10, R11, RAX and EFLAGS.
- // Updates RSP.
- BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA))
- .addExternalSymbol("___chkstk")
- .addReg(X86::RAX, RegState::Implicit)
- .addReg(X86::RSP, RegState::Implicit)
- .addReg(X86::RAX, RegState::Define | RegState::Implicit)
- .addReg(X86::RSP, RegState::Define | RegState::Implicit)
- .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
- } else {
- // __chkstk(MSVCRT): does not update stack pointer.
- // Clobbers R10, R11 and EFLAGS.
- BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA))
- .addExternalSymbol("__chkstk")
- .addReg(X86::RAX, RegState::Implicit)
- .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
- // RAX has the offset to be subtracted from RSP.
- BuildMI(*BB, MI, DL, TII->get(X86::SUB64rr), X86::RSP)
- .addReg(X86::RSP)
- .addReg(X86::RAX);
- }
- } else {
- const char *StackProbeSymbol = (Subtarget->isTargetKnownWindowsMSVC() ||
- Subtarget->isTargetWindowsItanium())
- ? "_chkstk"
- : "_alloca";
-
- BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32))
- .addExternalSymbol(StackProbeSymbol)
- .addReg(X86::EAX, RegState::Implicit)
- .addReg(X86::ESP, RegState::Implicit)
- .addReg(X86::EAX, RegState::Define | RegState::Implicit)
- .addReg(X86::ESP, RegState::Define | RegState::Implicit)
- .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
- }
+ X86FrameLowering::emitStackProbeCall(*BB->getParent(), *BB, MI, DL);
MI->eraseFromParent(); // The pseudo instruction is gone now.
return BB;
// or EAX and doing an indirect call. The return value will then
// be in the normal return register.
MachineFunction *F = BB->getParent();
- const X86InstrInfo *TII =
- static_cast<const X86InstrInfo *>(F->getSubtarget().getInstrInfo());
+ const X86InstrInfo *TII = Subtarget->getInstrInfo();
DebugLoc DL = MI->getDebugLoc();
assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?");
// Get a register mask for the lowered call.
// FIXME: The 32-bit calls have non-standard calling conventions. Use a
// proper register mask.
- const uint32_t *RegMask = F->getTarget()
- .getSubtargetImpl()
- ->getRegisterInfo()
- ->getCallPreservedMask(CallingConv::C);
+ const uint32_t *RegMask =
+ Subtarget->getRegisterInfo()->getCallPreservedMask(CallingConv::C);
if (Subtarget->is64Bit()) {
MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
TII->get(X86::MOV64rm), X86::RDI)
MachineBasicBlock *MBB) const {
DebugLoc DL = MI->getDebugLoc();
MachineFunction *MF = MBB->getParent();
- const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
MachineRegisterInfo &MRI = MF->getRegInfo();
const BasicBlock *BB = MBB->getBasicBlock();
MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
.addMBB(restoreMBB);
- const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
- MF->getSubtarget().getRegisterInfo());
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
MIB.addRegMask(RegInfo->getNoPreservedMask());
thisMBB->addSuccessor(mainMBB);
thisMBB->addSuccessor(restoreMBB);
// restoreMBB:
if (RegInfo->hasBasePointer(*MF)) {
- const X86Subtarget &STI = MF->getTarget().getSubtarget<X86Subtarget>();
- const bool Uses64BitFramePtr = STI.isTarget64BitLP64() || STI.isTargetNaCl64();
+ const bool Uses64BitFramePtr =
+ Subtarget->isTarget64BitLP64() || Subtarget->isTargetNaCl64();
X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
X86FI->setRestoreBasePointer(MF);
unsigned FramePtr = RegInfo->getFrameRegister(*MF);
.setMIFlag(MachineInstr::FrameSetup);
}
BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
- BuildMI(restoreMBB, DL, TII->get(X86::JMP_4)).addMBB(sinkMBB);
+ BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
restoreMBB->addSuccessor(sinkMBB);
MI->eraseFromParent();
MachineBasicBlock *MBB) const {
DebugLoc DL = MI->getDebugLoc();
MachineFunction *MF = MBB->getParent();
- const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
MachineRegisterInfo &MRI = MF->getRegInfo();
// Memory Reference
(PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
unsigned Tmp = MRI.createVirtualRegister(RC);
// Since FP is only updated here but NOT referenced, it's treated as GPR.
- const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
- MF->getSubtarget().getRegisterInfo());
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
unsigned SP = RegInfo->getStackRegister();
default: llvm_unreachable("Unrecognized FMA variant.");
}
- const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+ const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
MachineInstrBuilder MIB =
BuildMI(MF, MI->getDebugLoc(), TII.get(NewFMAOpc))
.addOperand(MI->getOperand(0))
case X86::TAILJMPd64:
case X86::TAILJMPr64:
case X86::TAILJMPm64:
+ case X86::TAILJMPd64_REX:
+ case X86::TAILJMPr64_REX:
+ case X86::TAILJMPm64_REX:
llvm_unreachable("TAILJMP64 would not be touched here.");
case X86::TCRETURNdi64:
case X86::TCRETURNri64:
case X86::FP80_TO_INT32_IN_MEM:
case X86::FP80_TO_INT64_IN_MEM: {
MachineFunction *F = BB->getParent();
- const TargetInstrInfo *TII = F->getSubtarget().getInstrInfo();
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
DebugLoc DL = MI->getDebugLoc();
// Change the floating point control register to use "round towards zero"
case X86::VPCMPESTRM128MEM:
assert(Subtarget->hasSSE42() &&
"Target must have SSE4.2 or AVX features enabled");
- return EmitPCMPSTRM(MI, BB, BB->getParent()->getSubtarget().getInstrInfo());
+ return EmitPCMPSTRM(MI, BB, Subtarget->getInstrInfo());
// String/text processing lowering.
case X86::PCMPISTRIREG:
case X86::VPCMPESTRIMEM:
assert(Subtarget->hasSSE42() &&
"Target must have SSE4.2 or AVX features enabled");
- return EmitPCMPSTRI(MI, BB, BB->getParent()->getSubtarget().getInstrInfo());
+ return EmitPCMPSTRI(MI, BB, Subtarget->getInstrInfo());
// Thread synchronization.
case X86::MONITOR:
- return EmitMonitor(MI, BB, BB->getParent()->getSubtarget().getInstrInfo(),
- Subtarget);
+ return EmitMonitor(MI, BB, Subtarget);
// xbegin
case X86::XBEGIN:
- return EmitXBegin(MI, BB, BB->getParent()->getSubtarget().getInstrInfo());
+ return EmitXBegin(MI, BB, Subtarget->getInstrInfo());
case X86::VASTART_SAVE_XMM_REGS:
return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
: InVec.getOperand(1);
// If inputs to shuffle are the same for both ops, then allow 2 uses
- unsigned AllowedUses = InVec.getOperand(0) == InVec.getOperand(1) ? 2 : 1;
+ unsigned AllowedUses = InVec.getNumOperands() > 1 &&
+ InVec.getOperand(0) == InVec.getOperand(1) ? 2 : 1;
if (LdNode.getOpcode() == ISD::BITCAST) {
// Don't duplicate a load with other uses.
EltNo);
}
+/// \brief Detect bitcasts between i32 to x86mmx low word. Since MMX types are
+/// special and don't usually play with other vector types, it's better to
+/// handle them early to be sure we emit efficient code by avoiding
+/// store-load conversions.
+static SDValue PerformBITCASTCombine(SDNode *N, SelectionDAG &DAG) {
+ if (N->getValueType(0) != MVT::x86mmx ||
+ N->getOperand(0)->getOpcode() != ISD::BUILD_VECTOR ||
+ N->getOperand(0)->getValueType(0) != MVT::v2i32)
+ return SDValue();
+
+ SDValue V = N->getOperand(0);
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(V.getOperand(1));
+ if (C && C->getZExtValue() == 0 && V.getOperand(0).getValueType() == MVT::i32)
+ return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(V.getOperand(0)),
+ N->getValueType(0), V.getOperand(0));
+
+ return SDValue();
+}
+
/// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
/// generation and convert it from being a bunch of shuffles and extracts
/// into a somewhat faster sequence. For i686, the best sequence is apparently
SDValue InputVector = N->getOperand(0);
- // Detect whether we are trying to convert from mmx to i32 and the bitcast
- // from mmx to v2i32 has a single usage.
- if (InputVector.getNode()->getOpcode() == llvm::ISD::BITCAST &&
- InputVector.getNode()->getOperand(0).getValueType() == MVT::x86mmx &&
- InputVector.hasOneUse() && N->getValueType(0) == MVT::i32)
- return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector),
- N->getValueType(0),
- InputVector.getNode()->getOperand(0));
+ // Detect mmx to i32 conversion through a v2i32 elt extract.
+ if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
+ N->getValueType(0) == MVT::i32 &&
+ InputVector.getValueType() == MVT::v2i32) {
+
+ // The bitcast source is a direct mmx result.
+ SDValue MMXSrc = InputVector.getNode()->getOperand(0);
+ if (MMXSrc.getValueType() == MVT::x86mmx)
+ return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector),
+ N->getValueType(0),
+ InputVector.getNode()->getOperand(0));
+
+ // The mmx is indirect: (i64 extract_elt (v1i64 bitcast (x86mmx ...))).
+ SDValue MMXSrcOp = MMXSrc.getOperand(0);
+ if (MMXSrc.getOpcode() == ISD::EXTRACT_VECTOR_ELT && MMXSrc.hasOneUse() &&
+ MMXSrc.getValueType() == MVT::i64 && MMXSrcOp.hasOneUse() &&
+ MMXSrcOp.getOpcode() == ISD::BITCAST &&
+ MMXSrcOp.getValueType() == MVT::v1i64 &&
+ MMXSrcOp.getOperand(0).getValueType() == MVT::x86mmx)
+ return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector),
+ N->getValueType(0),
+ MMXSrcOp.getOperand(0));
+ }
// Only operate on vectors of 4 elements, where the alternative shuffling
// gets to be more expensive.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDValue Vals[4];
SDLoc dl(InputVector);
-
+
if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) {
SDValue Cst = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, InputVector);
EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy();
SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
DAG.getConstant(1, VecIdxTy));
- SDValue ShAmt = DAG.getConstant(32,
+ SDValue ShAmt = DAG.getConstant(32,
DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64));
Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf);
Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
// instructions match the semantics of the common C idiom x<y?x:y but not
// x<=y?x:y, because of how they handle negative zero (which can be
// ignored in unsafe-math mode).
+ // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
- VT != MVT::f80 && TLI.isTypeLegal(VT) &&
+ VT != MVT::f80 && (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
(Subtarget->hasSSE2() ||
(Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) {
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
return SDValue();
EVT VT = N->getValueType(0);
- if (VT != MVT::i64)
+ if (VT != MVT::i64 && VT != MVT::i32)
return SDValue();
ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
uint64_t Mask = MaskNode->getZExtValue();
uint64_t Shift = ShiftNode->getZExtValue();
if (isMask_64(Mask)) {
- uint64_t MaskSize = CountPopulation_64(Mask);
+ uint64_t MaskSize = countPopulation(Mask);
if (Shift + MaskSize <= VT.getSizeInBits())
return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0),
DAG.getConstant(Shift | (MaskSize << 8), VT));
return SDValue();
}
+/// PerformMLOADCombine - Resolve extending loads
+static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget *Subtarget) {
+ MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);
+ if (Mld->getExtensionType() != ISD::SEXTLOAD)
+ return SDValue();
+
+ EVT VT = Mld->getValueType(0);
+ unsigned NumElems = VT.getVectorNumElements();
+ EVT LdVT = Mld->getMemoryVT();
+ SDLoc dl(Mld);
+
+ assert(LdVT != VT && "Cannot extend to the same type");
+ unsigned ToSz = VT.getVectorElementType().getSizeInBits();
+ unsigned FromSz = LdVT.getVectorElementType().getSizeInBits();
+ // From, To sizes and ElemCount must be pow of two
+ assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
+ "Unexpected size for extending masked load");
+
+ unsigned SizeRatio = ToSz / FromSz;
+ assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());
+
+ // Create a type on which we perform the shuffle
+ EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
+ LdVT.getScalarType(), NumElems*SizeRatio);
+ assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
+
+ // Convert Src0 value
+ SDValue WideSrc0 = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mld->getSrc0());
+ if (Mld->getSrc0().getOpcode() != ISD::UNDEF) {
+ SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
+ for (unsigned i = 0; i != NumElems; ++i)
+ ShuffleVec[i] = i * SizeRatio;
+
+ // Can't shuffle using an illegal type.
+ assert (DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT)
+ && "WideVecVT should be legal");
+ WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0,
+ DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
+ }
+ // Prepare the new mask
+ SDValue NewMask;
+ SDValue Mask = Mld->getMask();
+ if (Mask.getValueType() == VT) {
+ // Mask and original value have the same type
+ NewMask = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mask);
+ SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
+ for (unsigned i = 0; i != NumElems; ++i)
+ ShuffleVec[i] = i * SizeRatio;
+ for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
+ ShuffleVec[i] = NumElems*SizeRatio;
+ NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
+ DAG.getConstant(0, WideVecVT),
+ &ShuffleVec[0]);
+ }
+ else {
+ assert(Mask.getValueType().getVectorElementType() == MVT::i1);
+ unsigned WidenNumElts = NumElems*SizeRatio;
+ unsigned MaskNumElts = VT.getVectorNumElements();
+ EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+ WidenNumElts);
+
+ unsigned NumConcat = WidenNumElts / MaskNumElts;
+ SmallVector<SDValue, 16> Ops(NumConcat);
+ SDValue ZeroVal = DAG.getConstant(0, Mask.getValueType());
+ Ops[0] = Mask;
+ for (unsigned i = 1; i != NumConcat; ++i)
+ Ops[i] = ZeroVal;
+
+ NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
+ }
+
+ SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
+ Mld->getBasePtr(), NewMask, WideSrc0,
+ Mld->getMemoryVT(), Mld->getMemOperand(),
+ ISD::NON_EXTLOAD);
+ SDValue NewVec = DAG.getNode(X86ISD::VSEXT, dl, VT, WideLd);
+ return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
+
+}
+/// PerformMSTORECombine - Resolve truncating stores
+static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
+ if (!Mst->isTruncatingStore())
+ return SDValue();
+
+ EVT VT = Mst->getValue().getValueType();
+ unsigned NumElems = VT.getVectorNumElements();
+ EVT StVT = Mst->getMemoryVT();
+ SDLoc dl(Mst);
+
+ assert(StVT != VT && "Cannot truncate to the same type");
+ unsigned FromSz = VT.getVectorElementType().getSizeInBits();
+ unsigned ToSz = StVT.getVectorElementType().getSizeInBits();
+
+ // From, To sizes and ElemCount must be pow of two
+ assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
+ "Unexpected size for truncating masked store");
+ // We are going to use the original vector elt for storing.
+ // Accumulated smaller vector elements must be a multiple of the store size.
+ assert (((NumElems * FromSz) % ToSz) == 0 &&
+ "Unexpected ratio for truncating masked store");
+
+ unsigned SizeRatio = FromSz / ToSz;
+ assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
+
+ // Create a type on which we perform the shuffle
+ EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
+ StVT.getScalarType(), NumElems*SizeRatio);
+
+ assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
+
+ SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mst->getValue());
+ SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
+ for (unsigned i = 0; i != NumElems; ++i)
+ ShuffleVec[i] = i * SizeRatio;
+
+ // Can't shuffle using an illegal type.
+ assert (DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT)
+ && "WideVecVT should be legal");
+
+ SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
+ DAG.getUNDEF(WideVecVT),
+ &ShuffleVec[0]);
+
+ SDValue NewMask;
+ SDValue Mask = Mst->getMask();
+ if (Mask.getValueType() == VT) {
+ // Mask and original value have the same type
+ NewMask = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mask);
+ for (unsigned i = 0; i != NumElems; ++i)
+ ShuffleVec[i] = i * SizeRatio;
+ for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
+ ShuffleVec[i] = NumElems*SizeRatio;
+ NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
+ DAG.getConstant(0, WideVecVT),
+ &ShuffleVec[0]);
+ }
+ else {
+ assert(Mask.getValueType().getVectorElementType() == MVT::i1);
+ unsigned WidenNumElts = NumElems*SizeRatio;
+ unsigned MaskNumElts = VT.getVectorNumElements();
+ EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+ WidenNumElts);
+
+ unsigned NumConcat = WidenNumElts / MaskNumElts;
+ SmallVector<SDValue, 16> Ops(NumConcat);
+ SDValue ZeroVal = DAG.getConstant(0, Mask.getValueType());
+ Ops[0] = Mask;
+ for (unsigned i = 1; i != NumConcat; ++i)
+ Ops[i] = ZeroVal;
+
+ NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
+ }
+
+ return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal, Mst->getBasePtr(),
+ NewMask, StVT, Mst->getMemOperand(), false);
+}
/// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
const X86Subtarget *Subtarget) {
// Find the largest store unit
MVT StoreType = MVT::i8;
- for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
- tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
- MVT Tp = (MVT::SimpleValueType)tp;
+ for (MVT Tp : MVT::integer_valuetypes()) {
if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
StoreType = Tp;
}
/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
+
// F[X]OR(0.0, x) -> x
- // F[X]OR(x, 0.0) -> x
if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
if (C->getValueAPF().isPosZero())
return N->getOperand(1);
+
+ // F[X]OR(x, 0.0) -> x
if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
if (C->getValueAPF().isPosZero())
return N->getOperand(0);
/// Do target-specific dag combines on X86ISD::FAND nodes.
static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
// FAND(0.0, x) -> 0.0
- // FAND(x, 0.0) -> 0.0
if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
if (C->getValueAPF().isPosZero())
return N->getOperand(0);
+
+ // FAND(x, 0.0) -> 0.0
if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
if (C->getValueAPF().isPosZero())
return N->getOperand(1);
+
return SDValue();
}
/// Do target-specific dag combines on X86ISD::FANDN nodes
static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG) {
- // FANDN(x, 0.0) -> 0.0
// FANDN(0.0, x) -> x
if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
if (C->getValueAPF().isPosZero())
return N->getOperand(1);
+
+ // FANDN(x, 0.0) -> 0.0
if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
if (C->getValueAPF().isPosZero())
return N->getOperand(1);
+
return SDValue();
}
}
static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
- const X86TargetLowering *XTLI) {
+ const X86Subtarget *Subtarget) {
// First try to optimize away the conversion entirely when it's
// conditionally from a constant. Vectors only.
SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG);
EVT VT = Ld->getValueType(0);
if (!Ld->isVolatile() && !N->getValueType(0).isVector() &&
ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
- !XTLI->getSubtarget()->is64Bit() &&
- VT == MVT::i64) {
- SDValue FILDChain = XTLI->BuildFILD(SDValue(N, 0), Ld->getValueType(0),
- Ld->getChain(), Op0, DAG);
+ !Subtarget->is64Bit() && VT == MVT::i64) {
+ SDValue FILDChain = Subtarget->getTargetLowering()->BuildFILD(
+ SDValue(N, 0), Ld->getValueType(0), Ld->getChain(), Op0, DAG);
DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
return FILDChain;
}
case ISD::SELECT:
case X86ISD::SHRUNKBLEND:
return PerformSELECTCombine(N, DAG, DCI, Subtarget);
+ case ISD::BITCAST: return PerformBITCASTCombine(N, DAG);
case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI, Subtarget);
case ISD::ADD: return PerformAddCombine(N, DAG, Subtarget);
case ISD::SUB: return PerformSubCombine(N, DAG, Subtarget);
case ISD::OR: return PerformOrCombine(N, DAG, DCI, Subtarget);
case ISD::XOR: return PerformXorCombine(N, DAG, DCI, Subtarget);
case ISD::LOAD: return PerformLOADCombine(N, DAG, DCI, Subtarget);
+ case ISD::MLOAD: return PerformMLOADCombine(N, DAG, DCI, Subtarget);
case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget);
- case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, this);
+ case ISD::MSTORE: return PerformMSTORECombine(N, DAG, Subtarget);
+ case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, Subtarget);
case ISD::FADD: return PerformFADDCombine(N, DAG, Subtarget);
case ISD::FSUB: return PerformFSUBCombine(N, DAG, Subtarget);
case X86ISD::FXOR:
case ISD::FMA: return PerformFMACombine(N, DAG, Subtarget);
case ISD::INTRINSIC_WO_CHAIN:
return PerformINTRINSIC_WO_CHAINCombine(N, DAG, Subtarget);
- case X86ISD::INSERTPS:
- return PerformINSERTPSCombine(N, DAG, Subtarget);
+ case X86ISD::INSERTPS: {
+ if (getTargetMachine().getOptLevel() > CodeGenOpt::None)
+ return PerformINSERTPSCombine(N, DAG, Subtarget);
+ break;
+ }
case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DAG, Subtarget);
}
}
}
return;
+ case 'L':
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
+ (Subtarget->is64Bit() && C->getZExtValue() == 0xffffffff)) {
+ Result = DAG.getTargetConstant(C->getSExtValue(), Op.getValueType());
+ break;
+ }
+ }
+ return;
+ case 'M':
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (C->getZExtValue() <= 3) {
+ Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
+ break;
+ }
+ }
+ return;
case 'N':
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
if (C->getZExtValue() <= 255) {
}
}
return;
+ case 'O':
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (C->getZExtValue() <= 127) {
+ Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
+ break;
+ }
+ }
+ return;
case 'e': {
// 32-bit signed value
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {