#include "X86ISelLowering.h"
#include "Utils/X86ShuffleDecode.h"
#include "X86CallingConv.h"
+#include "X86FrameLowering.h"
#include "X86InstrBuilder.h"
#include "X86MachineFunctionInfo.h"
#include "X86TargetMachine.h"
cl::desc("Enable an experimental vector shuffle lowering code path."),
cl::Hidden);
+static cl::opt<bool> ExperimentalVectorShuffleLegality(
+ "x86-experimental-vector-shuffle-legality", cl::init(false),
+ cl::desc("Enable experimental shuffle legality based on the experimental "
+ "shuffle lowering. Should only be used with the experimental "
+ "shuffle lowering."),
+ cl::Hidden);
+
static cl::opt<int> ReciprocalEstimateRefinementSteps(
"x86-recip-refinement-steps", cl::init(1),
cl::desc("Specify the number of Newton-Raphson iterations applied to the "
return Insert256BitVector(V, V2, NumElems/2, DAG, dl);
}
-// FIXME: This should stop caching the target machine as soon as
-// we can remove resetOperationActions et al.
-X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM)
- : TargetLowering(TM) {
- Subtarget = &TM.getSubtarget<X86Subtarget>();
+X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
+ const X86Subtarget &STI)
+ : TargetLowering(TM), Subtarget(&STI) {
X86ScalarSSEf64 = Subtarget->hasSSE2();
X86ScalarSSEf32 = Subtarget->hasSSE1();
TD = getDataLayout();
- resetOperationActions();
-}
-
-void X86TargetLowering::resetOperationActions() {
- const TargetMachine &TM = getTargetMachine();
- static bool FirstTimeThrough = true;
-
- // If none of the target options have changed, then we don't need to reset the
- // operation actions.
- if (!FirstTimeThrough && TO == TM.Options) return;
-
- if (!FirstTimeThrough) {
- // Reinitialize the actions.
- initActions();
- FirstTimeThrough = false;
- }
-
- TO = TM.Options;
-
// Set up the TargetLowering object.
static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };
setSchedulingPreference(Sched::ILP);
else
setSchedulingPreference(Sched::RegPressure);
- const X86RegisterInfo *RegInfo =
- TM.getSubtarget<X86Subtarget>().getRegisterInfo();
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
// Bypass expensive divides on Atom when compiling with O2.
if (Subtarget->is64Bit())
addRegisterClass(MVT::i64, &X86::GR64RegClass);
- setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
+ for (MVT VT : MVT::integer_valuetypes())
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
// We don't accept any truncstore of integer registers.
setTruncStoreAction(MVT::i64, MVT::i32, Expand);
setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
setTruncStoreAction(MVT::f32, MVT::f16, Expand);
setTruncStoreAction(MVT::f64, MVT::f16, Expand);
setTruncStoreAction(MVT::f80, MVT::f16, Expand);
// First set operation action for all vector types to either promote
// (for widening) or expand (for scalarization). Then we will selectively
// turn on ones that can be effectively codegen'd.
- for (int i = MVT::FIRST_VECTOR_VALUETYPE;
- i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
- MVT VT = (MVT::SimpleValueType)i;
+ for (MVT VT : MVT::vector_valuetypes()) {
setOperationAction(ISD::ADD , VT, Expand);
setOperationAction(ISD::SUB , VT, Expand);
setOperationAction(ISD::FADD, VT, Expand);
setOperationAction(ISD::ANY_EXTEND, VT, Expand);
setOperationAction(ISD::VSELECT, VT, Expand);
setOperationAction(ISD::SELECT_CC, VT, Expand);
- for (int InnerVT = MVT::FIRST_VECTOR_VALUETYPE;
- InnerVT <= MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
- setTruncStoreAction(VT,
- (MVT::SimpleValueType)InnerVT, Expand);
- setLoadExtAction(ISD::SEXTLOAD, VT, Expand);
- setLoadExtAction(ISD::ZEXTLOAD, VT, Expand);
+ for (MVT InnerVT : MVT::vector_valuetypes()) {
+ setTruncStoreAction(InnerVT, VT, Expand);
- // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like types,
- // we have to deal with them whether we ask for Expansion or not. Setting
- // Expand causes its own optimisation problems though, so leave them legal.
- if (VT.getVectorElementType() == MVT::i1)
- setLoadExtAction(ISD::EXTLOAD, VT, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
+
+      // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
+      // types; we have to deal with them whether we ask for Expansion or
+      // not. Setting Expand causes its own optimisation problems though, so
+      // leave them legal.
+ if (VT.getVectorElementType() == MVT::i1)
+ setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
+ }
}
// FIXME: In order to prevent SSE instructions being expanded to MMX ones
setOperationAction(ISD::LOAD, MVT::v4f32, Legal);
setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
+ setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
continue;
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ setOperationAction(ISD::VSELECT, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
}
// memory vector types which we can load as a scalar (or sequence of
// scalars) and extend in-register to a legal 128-bit vector type. For sext
// loads these must work with a single scalar load.
- setLoadExtAction(ISD::SEXTLOAD, MVT::v4i8, Custom);
- setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, Custom);
- setLoadExtAction(ISD::SEXTLOAD, MVT::v8i8, Custom);
- setLoadExtAction(ISD::EXTLOAD, MVT::v2i8, Custom);
- setLoadExtAction(ISD::EXTLOAD, MVT::v2i16, Custom);
- setLoadExtAction(ISD::EXTLOAD, MVT::v2i32, Custom);
- setLoadExtAction(ISD::EXTLOAD, MVT::v4i8, Custom);
- setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, Custom);
- setLoadExtAction(ISD::EXTLOAD, MVT::v8i8, Custom);
+ for (MVT VT : MVT::integer_vector_valuetypes()) {
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
+ }
setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom);
+ setOperationAction(ISD::VSELECT, MVT::v2f64, Custom);
+ setOperationAction(ISD::VSELECT, MVT::v2i64, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);
- setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, Legal);
+ for (MVT VT : MVT::fp_vector_valuetypes())
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);
setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
// FIXME: Do we need to handle scalar-to-vector here?
setOperationAction(ISD::MUL, MVT::v4i32, Legal);
- setOperationAction(ISD::VSELECT, MVT::v2f64, Custom);
- setOperationAction(ISD::VSELECT, MVT::v2i64, Custom);
- setOperationAction(ISD::VSELECT, MVT::v4i32, Custom);
- setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
- setOperationAction(ISD::VSELECT, MVT::v8i16, Custom);
- // There is no BLENDI for byte vectors. We don't need to custom lower
- // some vselects for now.
+ // We directly match byte blends in the backend as they match the VSELECT
+ // condition form.
setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
// SSE41 brings specific instructions for doing vector sign extend even in
// cases where we don't have SRA.
- setLoadExtAction(ISD::SEXTLOAD, MVT::v2i8, Custom);
- setLoadExtAction(ISD::SEXTLOAD, MVT::v2i16, Custom);
- setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, Custom);
+ for (MVT VT : MVT::integer_vector_valuetypes()) {
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
+ }
+
+ // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
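+  // For example, PMOVSXBW sign-extends eight loaded i8 elements to v8i16.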
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, MVT::v8i8, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
+
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i16, MVT::v8i8, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
// i8 and i16 vectors are custom because the source register and
// source memory operand types are not the same width. f32 vectors are
setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);
- setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, Legal);
+ for (MVT VT : MVT::fp_vector_valuetypes())
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);
setOperationAction(ISD::SRL, MVT::v16i16, Custom);
setOperationAction(ISD::SRL, MVT::v32i8, Custom);
setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
- setOperationAction(ISD::VSELECT, MVT::v4f64, Custom);
- setOperationAction(ISD::VSELECT, MVT::v4i64, Custom);
- setOperationAction(ISD::VSELECT, MVT::v8i32, Custom);
- setOperationAction(ISD::VSELECT, MVT::v8f32, Custom);
-
setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);
setOperationAction(ISD::MULHU, MVT::v16i16, Legal);
setOperationAction(ISD::MULHS, MVT::v16i16, Legal);
- setOperationAction(ISD::VSELECT, MVT::v16i16, Custom);
- setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
-
// The custom lowering for UINT_TO_FP for v8i32 becomes interesting
// when we have a 256bit-wide blend with immediate.
setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
// Custom CTPOP always performs better on natively supported v8i32
setOperationAction(ISD::CTPOP, MVT::v8i32, Custom);
+
+ // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
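+  // For example, VPMOVZXBW zero-extends sixteen loaded i8 elements to v16i16.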
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32, MVT::v8i8, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i8, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
+
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32, MVT::v8i8, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i8, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
} else {
setOperationAction(ISD::ADD, MVT::v4i64, Custom);
setOperationAction(ISD::ADD, MVT::v8i32, Custom);
setOperationAction(ISD::SRA, MVT::v8i32, Custom);
// Custom lower several nodes for 256-bit types.
- for (int i = MVT::FIRST_VECTOR_VALUETYPE;
- i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
- MVT VT = (MVT::SimpleValueType)i;
-
+ for (MVT VT : MVT::vector_valuetypes()) {
if (VT.getScalarSizeInBits() >= 32) {
setOperationAction(ISD::MLOAD, VT, Legal);
setOperationAction(ISD::MSTORE, VT, Legal);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ setOperationAction(ISD::VSELECT, VT, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
}
+ if (Subtarget->hasInt256())
+ setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
+
// Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
for (int i = MVT::v32i8; i != MVT::v4i64; ++i) {
MVT VT = (MVT::SimpleValueType)i;
addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
+ for (MVT VT : MVT::fp_vector_valuetypes())
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);
+
setOperationAction(ISD::BR_CC, MVT::i1, Expand);
setOperationAction(ISD::SETCC, MVT::i1, Custom);
setOperationAction(ISD::XOR, MVT::i1, Legal);
setOperationAction(ISD::OR, MVT::i1, Legal);
setOperationAction(ISD::AND, MVT::i1, Legal);
- setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, Legal);
setOperationAction(ISD::LOAD, MVT::v16f32, Legal);
setOperationAction(ISD::LOAD, MVT::v8f64, Legal);
setOperationAction(ISD::LOAD, MVT::v8i64, Legal);
setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);
+ setOperationAction(ISD::FFLOOR, MVT::v16f32, Legal);
+ setOperationAction(ISD::FFLOOR, MVT::v8f64, Legal);
+ setOperationAction(ISD::FCEIL, MVT::v16f32, Legal);
+ setOperationAction(ISD::FCEIL, MVT::v8f64, Legal);
+ setOperationAction(ISD::FTRUNC, MVT::v16f32, Legal);
+ setOperationAction(ISD::FTRUNC, MVT::v8f64, Legal);
+ setOperationAction(ISD::FRINT, MVT::v16f32, Legal);
+ setOperationAction(ISD::FRINT, MVT::v8f64, Legal);
+ setOperationAction(ISD::FNEARBYINT, MVT::v16f32, Legal);
+ setOperationAction(ISD::FNEARBYINT, MVT::v8f64, Legal);
+
setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom);
}
// Custom lower several nodes.
- for (int i = MVT::FIRST_VECTOR_VALUETYPE;
- i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
- MVT VT = (MVT::SimpleValueType)i;
-
+ for (MVT VT : MVT::vector_valuetypes()) {
unsigned EltSize = VT.getVectorElementType().getSizeInBits();
// Extract subvector is special because the value type
// (result) is 256/128-bit but the source is 512-bit wide.
for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
MVT VT = (MVT::SimpleValueType)i;
- // Do not attempt to promote non-256-bit vectors.
+ // Do not attempt to promote non-512-bit vectors.
if (!VT.is512BitVector())
continue;
const unsigned EltSize = VT.getVectorElementType().getSizeInBits();
- // Do not attempt to promote non-256-bit vectors.
+ // Do not attempt to promote non-512-bit vectors.
if (!VT.is512BitVector())
continue;
// SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion
// of this type with custom code.
- for (int VT = MVT::FIRST_VECTOR_VALUETYPE;
- VT != MVT::LAST_VECTOR_VALUETYPE; VT++) {
- setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,
- Custom);
- }
+ for (MVT VT : MVT::vector_valuetypes())
+ setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Custom);
// We want to custom lower some of our intrinsics.
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
// We have target-specific dag combine patterns for the following nodes:
setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
+ setTargetDAGCombine(ISD::BITCAST);
setTargetDAGCombine(ISD::VSELECT);
setTargetDAGCombine(ISD::SELECT);
setTargetDAGCombine(ISD::SHL);
setTargetDAGCombine(ISD::FMA);
setTargetDAGCombine(ISD::SUB);
setTargetDAGCombine(ISD::LOAD);
+ setTargetDAGCombine(ISD::MLOAD);
setTargetDAGCombine(ISD::STORE);
+ setTargetDAGCombine(ISD::MSTORE);
setTargetDAGCombine(ISD::ZERO_EXTEND);
setTargetDAGCombine(ISD::ANY_EXTEND);
setTargetDAGCombine(ISD::SIGN_EXTEND);
setTargetDAGCombine(ISD::SETCC);
setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
setTargetDAGCombine(ISD::BUILD_VECTOR);
- if (Subtarget->is64Bit())
- setTargetDAGCombine(ISD::MUL);
+ setTargetDAGCombine(ISD::MUL);
setTargetDAGCombine(ISD::XOR);
computeRegisterProperties();
MachineFunction &MF) const {
const Function *F = MF.getFunction();
if ((!IsMemset || ZeroMemset) &&
- !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
- Attribute::NoImplicitFloat)) {
+ !F->hasFnAttribute(Attribute::NoImplicitFloat)) {
if (Size >= 16 &&
(Subtarget->isUnalignedMemAccessFast() ||
((DstAlign == 0 || DstAlign >= 16) &&
// Win32 requires us to put the sret argument to %eax as well.
// We saved the argument into a virtual register in the entry block,
// so now we copy the value out and into %rax/%eax.
- if (DAG.getMachineFunction().getFunction()->hasStructRetAttr() &&
- (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC())) {
- MachineFunction &MF = DAG.getMachineFunction();
- X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
- unsigned Reg = FuncInfo->getSRetReturnReg();
- assert(Reg &&
- "SRetReturnReg should have been set in LowerFormalArguments().");
- SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());
+ //
+ // Checking Function.hasStructRetAttr() here is insufficient because the IR
+ // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
+ // false, then an sret argument may be implicitly inserted in the SelDAG. In
+ // either case FuncInfo->setSRetReturnReg() will have been called.
+ if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
+ assert((Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC()) &&
+ "No need for an sret register");
+ SDValue Val = DAG.getCopyFromReg(Chain, dl, SRetReg, getPointerTy());
unsigned RetValReg
= (Subtarget->is64Bit() && !Subtarget->isTarget64BitILP32()) ?
}
const Function *Fn = MF.getFunction();
- bool NoImplicitFloatOps = Fn->getAttributes().
- hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
+ bool NoImplicitFloatOps = Fn->hasFnAttribute(Attribute::NoImplicitFloat);
assert(!(MF.getTarget().Options.UseSoftFloat && NoImplicitFloatOps) &&
"SSE register cannot be used when SSE is disabled!");
if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps ||
}
// Figure out if XMM registers are in use.
- bool HaveXMMArgs = Is64Bit && !IsWin64;
- bool NoImplicitFloatOps = Fn->getAttributes().hasAttribute(
- AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
- assert(!(MF.getTarget().Options.UseSoftFloat && NoImplicitFloatOps) &&
+ assert(!(MF.getTarget().Options.UseSoftFloat &&
+ Fn->hasFnAttribute(Attribute::NoImplicitFloat)) &&
"SSE register cannot be used when SSE is disabled!");
- if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps ||
- !Subtarget->hasSSE1())
- HaveXMMArgs = false;
// 64-bit calling conventions support varargs and register parameters, so we
// have to do extra work to spill them in the prologue.
}
if (IsWin64) {
- const TargetFrameLowering &TFI = *MF.getSubtarget().getFrameLowering();
+ const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
// Get to the caller-allocated home save location. Add 8 to account
// for the return address.
int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
// Walk the register/memloc assignments, inserting copies/loads. In the case
// of tail call optimization arguments are handle later.
- const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
- DAG.getSubtarget().getRegisterInfo());
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
// Skip inalloca arguments, they have already been written.
ISD::ArgFlagsTy Flags = Outs[i].Flags;
// through a register, since the call instruction's 32-bit
// pc-relative offset may not be large enough to hold the whole
// address.
- } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+ } else if (Callee->getOpcode() == ISD::GlobalAddress) {
// If the callee is a GlobalAddress node (quite common, every direct call
// is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
// it.
+ GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);
// We should use extra load for direct calls to dllimported functions in
// non-JIT mode.
// unless we're building with the leopard linker or later, which
// automatically synthesizes these stubs.
OpFlags = X86II::MO_DARWIN_STUB;
- } else if (Subtarget->isPICStyleRIPRel() &&
- isa<Function>(GV) &&
- cast<Function>(GV)->getAttributes().
- hasAttribute(AttributeSet::FunctionIndex,
- Attribute::NonLazyBind)) {
+ } else if (Subtarget->isPICStyleRIPRel() && isa<Function>(GV) &&
+ cast<Function>(GV)->hasFnAttribute(Attribute::NonLazyBind)) {
// If the function is marked as non-lazy, generate an indirect call
// which loads from the GOT directly. This avoids runtime overhead
// at the cost of eager binding (and one extra byte of encoding).
Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
OpFlags);
- } else if (Subtarget->isTarget64BitILP32() && Callee->getValueType(0) == MVT::i32) {
+ } else if (Subtarget->isTarget64BitILP32() &&
+ Callee->getValueType(0) == MVT::i32) {
// Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
}
RegsToPass[i].second.getValueType()));
// Add a register mask operand representing the call-preserved registers.
- const TargetRegisterInfo *TRI = DAG.getSubtarget().getRegisterInfo();
+ const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
assert(Mask && "Missing call preserved mask for calling convention");
Ops.push_back(DAG.getRegisterMask(Mask));
unsigned
X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
SelectionDAG& DAG) const {
- MachineFunction &MF = DAG.getMachineFunction();
- const TargetMachine &TM = MF.getTarget();
- const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
- TM.getSubtargetImpl()->getRegisterInfo());
- const TargetFrameLowering &TFI = *TM.getSubtargetImpl()->getFrameLowering();
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
+ const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
unsigned StackAlignment = TFI.getStackAlignment();
uint64_t AlignMask = StackAlignment - 1;
int64_t Offset = StackSize;
return false;
} else {
unsigned Opcode = Def->getOpcode();
- if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) &&
+ if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
+ Opcode == X86::LEA64_32r) &&
Def->getOperand(1).isFI()) {
FI = Def->getOperand(1).getIndex();
Bytes = Flags.getByValSize();
// Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
// emit a special epilogue.
- const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
- DAG.getSubtarget().getRegisterInfo());
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
if (RegInfo->needsStackRealignment(MF))
return false;
// the caller's fixed stack objects.
MachineFrameInfo *MFI = MF.getFrameInfo();
const MachineRegisterInfo *MRI = &MF.getRegInfo();
- const X86InstrInfo *TII =
- static_cast<const X86InstrInfo *>(DAG.getSubtarget().getInstrInfo());
+ const X86InstrInfo *TII = Subtarget->getInstrInfo();
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
SDValue Arg = OutVals[i];
SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
- const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
- DAG.getSubtarget().getRegisterInfo());
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
int ReturnAddrIndex = FuncInfo->getRAIndex();
return false;
}
+bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
+ ISD::LoadExtType ExtTy,
+ EVT NewVT) const {
+ // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
+ // relocation target a movq or addq instruction: don't let the load shrink.
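+  // For example, the initial-exec sequence "movq x@GOTTPOFF(%rip), %rax"
+  // must keep its full 64-bit width so the linker can relax it.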
+ SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
+ if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
+ if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
+ return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
+ return true;
+}
+
/// \brief Returns true if it is beneficial to convert a load of a constant
/// to just the constant itself.
bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
return true;
}
-bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT,
+bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT,
unsigned Index) const {
if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
return false;
return (Index == 0 || Index == ResVT.getVectorNumElements());
}
+bool X86TargetLowering::isCheapToSpeculateCttz() const {
+ // Speculate cttz only if we can directly use TZCNT.
+ return Subtarget->hasBMI();
+}
+
+bool X86TargetLowering::isCheapToSpeculateCtlz() const {
+ // Speculate ctlz only if we can directly use LZCNT.
+ return Subtarget->hasLZCNT();
+}
+
/// isUndefOrInRange - Return true if Val is undef or if its value falls within
/// the specified range (L, H].
static bool isUndefOrInRange(int Val, int Low, int Hi) {
/// isSequentialOrUndefInRange - Return true if every element in Mask, beginning
/// from position Pos and ending in Pos+Size, falls within the specified
-/// sequential range (L, L+Pos]. or is undef.
+/// sequential range [Low, Low+Size), or is undef.
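+/// For example, Mask <2, 4, -1, 6> with Pos == 1, Size == 3 and Low == 4
+/// returns true (elements 1..3 are 4, undef, 6).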
static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
unsigned Pos, unsigned Size, int Low) {
for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
return false;
unsigned EltSize = VT.getVectorElementType().getSizeInBits();
- bool symetricMaskRequired =
+ bool symmetricMaskRequired =
(VT.getSizeInBits() >= 256) && (EltSize == 32);
// VSHUFPSY divides the resulting vector into 4 chunks.
// For VSHUFPSY, the mask of the second half must be the same as the
// first but with the appropriate offsets. This works in the same way as
// VPERMILPS works with masks.
- if (!symetricMaskRequired || Idx < 0)
+ if (!symmetricMaskRequired || Idx < 0)
continue;
if (MaskVal[i] < 0) {
MaskVal[i] = Idx - l;
return (FstHalf | (SndHalf << 4));
}
-// Symetric in-lane mask. Each lane has 4 elements (for imm8)
+// Symmetric in-lane mask. Each lane has 4 elements (for imm8)
static bool isPermImmMask(ArrayRef<int> Mask, MVT VT, unsigned& Imm8) {
unsigned EltSize = VT.getVectorElementType().getSizeInBits();
if (EltSize < 32)
unsigned EltSize = VT.getVectorElementType().getSizeInBits();
if (VT.getSizeInBits() < 256 || EltSize < 32)
return false;
- bool symetricMaskRequired = (EltSize == 32);
+ bool symmetricMaskRequired = (EltSize == 32);
unsigned NumElts = VT.getVectorNumElements();
unsigned NumLanes = VT.getSizeInBits()/128;
for (unsigned i = 0; i != LaneSize; ++i) {
if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
return false;
- if (symetricMaskRequired) {
+ if (symmetricMaskRequired) {
if (ExpectedMaskVal[i] < 0 && Mask[i+l] >= 0) {
ExpectedMaskVal[i] = Mask[i+l] - l;
continue;
return false;
if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) {
- // FIXME: Support AVX-512 here.
- Type *Ty = C->getType();
- if (!Ty->isVectorTy() || (Ty->getVectorNumElements() != 16 &&
- Ty->getVectorNumElements() != 32))
- return false;
-
DecodePSHUFBMask(C, Mask);
+ if (Mask.empty())
+ return false;
break;
}
IsUnary = true;
break;
case X86ISD::MOVSS:
- case X86ISD::MOVSD: {
- // The index 0 always comes from the first element of the second source,
- // this is why MOVSS and MOVSD are used in the first place. The other
- // elements come from the other positions of the first source vector
- Mask.push_back(NumElems);
- for (unsigned i = 1; i != NumElems; ++i) {
- Mask.push_back(i);
- }
+ case X86ISD::MOVSD:
+ DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask);
break;
- }
case X86ISD::VPERM2X128:
ImmN = N->getOperand(N->getNumOperands()-1);
DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
break;
case X86ISD::MOVSLDUP:
DecodeMOVSLDUPMask(VT, Mask);
+ IsUnary = true;
break;
case X86ISD::MOVSHDUP:
DecodeMOVSHDUPMask(VT, Mask);
+ IsUnary = true;
break;
case X86ISD::MOVDDUP:
+ DecodeMOVDDUPMask(VT, Mask);
+ IsUnary = true;
+ break;
case X86ISD::MOVLHPD:
case X86ISD::MOVLPD:
case X86ISD::MOVLPS:
const X86Subtarget *Subtarget,
const TargetLowering &TLI) {
// Find all zeroable elements.
- bool Zeroable[4];
+ std::bitset<4> Zeroable;
for (int i=0; i < 4; ++i) {
SDValue Elt = Op->getOperand(i);
Zeroable[i] = (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt));
}
- assert(std::count_if(&Zeroable[0], &Zeroable[4],
- [](bool M) { return !M; }) > 1 &&
+ assert(Zeroable.size() - Zeroable.count() > 1 &&
"We expect at least two non-zero elements!");
// We only know how to deal with build_vector nodes where elements are either
V2 = DAG.getNode(ISD::BITCAST, SDLoc(V2), MVT::v4f32, V2);
// Ok, we can emit an INSERTPS instruction.
- unsigned ZMask = 0;
- for (int i = 0; i < 4; ++i)
- if (Zeroable[i])
- ZMask |= 1 << i;
+ unsigned ZMask = Zeroable.to_ulong();
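+  // The INSERTPS immediate encodes the source element in bits [7:6], the
+  // destination slot in bits [5:4], and the zero mask in bits [3:0].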
unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
return DAG.getNode(ISD::BITCAST, SDLoc(Op), VT, Result);
}
-/// getVShift - Return a vector logical shift node.
-///
+/// Return a vector logical shift node.
static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
unsigned NumBits, SelectionDAG &DAG,
const TargetLowering &TLI, SDLoc dl) {
assert(VT.is128BitVector() && "Unknown type for VShift");
- EVT ShVT = MVT::v2i64;
+ MVT ShVT = MVT::v2i64;
unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp);
+ MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(SrcOp.getValueType());
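+  // The VSHLDQ / VSRLDQ immediates count bytes, so convert bits to bytes.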
+ assert(NumBits % 8 == 0 && "Only support byte sized shifts");
+ SDValue ShiftVal = DAG.getConstant(NumBits/8, ScalarShiftTy);
return DAG.getNode(ISD::BITCAST, dl, VT,
- DAG.getNode(Opc, dl, ShVT, SrcOp,
- DAG.getConstant(NumBits,
- TLI.getScalarShiftAmountTy(SrcOp.getValueType()))));
+ DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
}
static SDValue
LD->getPointerInfo().getWithOffset(StartOffset),
false, false, false, 0);
- SmallVector<int, 8> Mask;
- for (unsigned i = 0; i != NumElems; ++i)
- Mask.push_back(EltNo);
+ SmallVector<int, 8> Mask(NumElems, EltNo);
return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]);
}
return SDValue();
}
-/// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a
-/// vector of type 'VT', see if the elements can be replaced by a single large
-/// load which has the same value as a build_vector whose operands are 'elts'.
+/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
+/// elements can be replaced by a single large load which has the same value as
+/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
///
/// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
///
/// FIXME: we'd also like to handle the case where the last elements are zero
/// rather than undef via VZEXT_LOAD, but we do not detect that case today.
/// There's even a handy isZeroNode for that purpose.
-static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
+static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
SDLoc &DL, SelectionDAG &DAG,
bool isAfterLegalize) {
- EVT EltVT = VT.getVectorElementType();
unsigned NumElems = Elts.size();
LoadSDNode *LDBase = nullptr;
// non-consecutive, bail out.
for (unsigned i = 0; i < NumElems; ++i) {
SDValue Elt = Elts[i];
-
+ // Look through a bitcast.
+ if (Elt.getNode() && Elt.getOpcode() == ISD::BITCAST)
+ Elt = Elt.getOperand(0);
if (!Elt.getNode() ||
(Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
return SDValue();
continue;
LoadSDNode *LD = cast<LoadSDNode>(Elt);
- if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i))
+ EVT LdVT = Elt.getValueType();
+ // Each loaded element must be the correct fractional portion of the
+ // requested vector load.
+ if (LdVT.getSizeInBits() != VT.getSizeInBits() / NumElems)
+ return SDValue();
+ if (!DAG.isConsecutiveLoad(LD, LDBase, LdVT.getSizeInBits() / 8, i))
return SDValue();
LastLoadedElt = i;
}
// load of the entire vector width starting at the base pointer. If we found
// consecutive loads for the low half, generate a vzext_load node.
if (LastLoadedElt == NumElems - 1) {
+ assert(LDBase && "Did not find base load for merging consecutive loads");
+ EVT EltVT = LDBase->getValueType(0);
+ // Ensure that the input vector size for the merged loads matches the
+ // cumulative size of the input elements.
+ if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
+ return SDValue();
if (isAfterLegalize &&
!DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, VT))
return NewLd;
}
-
+
// TODO: The code below fires only for loading the low v2i32 / v2f32
//of a v4i32 / v4f32. It's probably worth generalizing.
+ EVT EltVT = VT.getVectorElementType();
if (NumElems == 4 && LastLoadedElt == 1 && (EltVT.getSizeInBits() == 32) &&
DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
// it may be detrimental to overall size. There needs to be a way to detect
// that condition to know if this is truly a size win.
const Function *F = DAG.getMachineFunction().getFunction();
- bool OptForSize = F->getAttributes().
- hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
+ bool OptForSize = F->hasFnAttribute(Attribute::OptimizeForSize);
// Handle broadcasting a single constant scalar from the constant pool
// into a vector.
bool AddFound = false;
bool SubFound = false;
- for (unsigned i = 0, e = NumElts; i != e; i++) {
+ for (unsigned i = 0, e = NumElts; i != e; ++i) {
SDValue Op = BV->getOperand(i);
// Skip 'undef' values.
// elements, otherwise build the individual 128-bit pieces and use
// shuffles to put them in place.
if (VT.is256BitVector() || VT.is512BitVector()) {
- SmallVector<SDValue, 64> V;
- for (unsigned i = 0; i != NumElems; ++i)
- V.push_back(Op.getOperand(i));
+ SmallVector<SDValue, 64> V(Op->op_begin(), Op->op_begin() + NumElems);
// Check for a build vector of consecutive loads.
if (SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false))
return LD;
-
+
EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
// Build both the lower and upper subvector.
return Sh;
// For SSE 4.1, use insertps to put the high elements into the low element.
- if (getSubtarget()->hasSSE41()) {
+ if (Subtarget->hasSSE41()) {
SDValue Result;
if (Op.getOperand(0).getOpcode() != ISD::UNDEF)
Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
return true;
}
-// Hide this symbol with an anonymous namespace instead of 'static' so that MSVC
-// 2013 will allow us to use it as a non-type template parameter.
-namespace {
-
-/// \brief Implementation of the \c isShuffleEquivalent variadic functor.
-///
-/// See its documentation for details.
-bool isShuffleEquivalentImpl(ArrayRef<int> Mask, ArrayRef<const int *> Args) {
- if (Mask.size() != Args.size())
- return false;
- for (int i = 0, e = Mask.size(); i < e; ++i) {
- assert(*Args[i] >= 0 && "Arguments must be positive integers!");
- if (Mask[i] != -1 && Mask[i] != *Args[i])
+/// \brief Base case helper for testing a single mask element.
+static bool isShuffleEquivalentImpl(SDValue V1, SDValue V2,
+ BuildVectorSDNode *BV1,
+ BuildVectorSDNode *BV2, ArrayRef<int> Mask,
+ int i, int Arg) {
+ int Size = Mask.size();
+ if (Mask[i] != -1 && Mask[i] != Arg) {
+ auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
+ auto *ArgsBV = Arg < Size ? BV1 : BV2;
+ if (!MaskBV || !ArgsBV ||
+ MaskBV->getOperand(Mask[i] % Size) != ArgsBV->getOperand(Arg % Size))
return false;
}
return true;
}
-} // namespace
+/// \brief Recursive helper to peel off and test each mask element.
+template <typename... Ts>
+static bool isShuffleEquivalentImpl(SDValue V1, SDValue V2,
+ BuildVectorSDNode *BV1,
+ BuildVectorSDNode *BV2, ArrayRef<int> Mask,
+ int i, int Arg, Ts... Args) {
+ if (!isShuffleEquivalentImpl(V1, V2, BV1, BV2, Mask, i, Arg))
+ return false;
+
+ return isShuffleEquivalentImpl(V1, V2, BV1, BV2, Mask, i + 1, Args...);
+}
/// \brief Checks whether a shuffle mask is equivalent to an explicit list of
/// arguments.
/// It returns true if the mask is exactly as wide as the argument list, and
/// each element of the mask is either -1 (signifying undef) or the value given
/// in the argument.
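+/// For example, isShuffleEquivalent(V1, V2, Mask, 0, 4, 1, 5) tests Mask
+/// against the interleaving produced by an unpcklps of the two inputs.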
-static const VariadicFunction1<
- bool, ArrayRef<int>, int, isShuffleEquivalentImpl> isShuffleEquivalent = {};
+template <typename... Ts>
+static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
+ Ts... Args) {
+ if (Mask.size() != sizeof...(Args))
+ return false;
+
+ // If the values are build vectors, we can look through them to find
+ // equivalent inputs that make the shuffles equivalent.
+ auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
+ auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);
+
+ // Recursively peel off arguments and test them against the mask.
+ return isShuffleEquivalentImpl(V1, V2, BV1, BV2, Mask, 0, Args...);
+}
/// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
///
return DAG.getConstant(Imm, MVT::i8);
}
+/// \brief Try to emit a blend instruction for a shuffle using bit math.
+///
+/// This is used as a fallback approach when first class blend instructions are
+/// unavailable. Currently it is only suitable for integer vectors, but could
+/// be generalized for floating point vectors if desirable.
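+/// For example, the v4i32 blend mask <0, 5, 2, 7> becomes
+/// (V1 & <-1, 0, -1, 0>) | (V2 & ~<-1, 0, -1, 0>).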
+static SDValue lowerVectorShuffleAsBitBlend(SDLoc DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
+ assert(VT.isInteger() && "Only supports integer vector types!");
+ MVT EltVT = VT.getScalarType();
+ int NumEltBits = EltVT.getSizeInBits();
+ SDValue Zero = DAG.getConstant(0, EltVT);
+ SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), EltVT);
+ SmallVector<SDValue, 16> MaskOps;
+ for (int i = 0, Size = Mask.size(); i < Size; ++i) {
+ if (Mask[i] != -1 && Mask[i] != i && Mask[i] != i + Size)
+ return SDValue(); // Shuffled input!
+ MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
+ }
+
+ SDValue V1Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, MaskOps);
+ V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
+  // Bitcast through an i64 vector so we can use X86ISD::ANDNP on the mask.
+ MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
+ V2 = DAG.getNode(ISD::BITCAST, DL, VT,
+ DAG.getNode(X86ISD::ANDNP, DL, MaskVT,
+ DAG.getNode(ISD::BITCAST, DL, MaskVT, V1Mask),
+ DAG.getNode(ISD::BITCAST, DL, MaskVT, V2)));
+ return DAG.getNode(ISD::OR, DL, VT, V1, V2);
+}
+
/// \brief Try to emit a blend instruction for a shuffle.
///
/// This doesn't do any checks for the availability of instructions for blending
SDValue V2, ArrayRef<int> Mask,
const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
-
unsigned BlendMask = 0;
for (int i = 0, Size = Mask.size(); i < Size; ++i) {
if (Mask[i] >= Size) {
}
}
// FALLTHROUGH
+ case MVT::v16i8:
case MVT::v32i8: {
- assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
// Scale the blend by the number of bytes per element.
- int Scale = VT.getScalarSizeInBits() / 8;
- assert(Mask.size() * Scale == 32 && "Not a 256-bit vector!");
+ int Scale = VT.getScalarSizeInBits() / 8;
+
+ // This form of blend is always done on bytes. Compute the byte vector
+ // type.
+ MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
// Compute the VSELECT mask. Note that VSELECT is really confusing in the
// mix of LLVM's code generator and the x86 backend. We tell the code
// the LLVM model for boolean values in vector elements gets the relevant
// bit set, it is set backwards and over constrained relative to x86's
// actual model.
- SDValue VSELECTMask[32];
+ SmallVector<SDValue, 32> VSELECTMask;
for (int i = 0, Size = Mask.size(); i < Size; ++i)
for (int j = 0; j < Scale; ++j)
- VSELECTMask[Scale * i + j] =
+ VSELECTMask.push_back(
Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
- : DAG.getConstant(Mask[i] < Size ? -1 : 0, MVT::i8);
+ : DAG.getConstant(Mask[i] < Size ? -1 : 0, MVT::i8));
- V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V1);
- V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V2);
+ V1 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V1);
+ V2 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V2);
return DAG.getNode(
ISD::BITCAST, DL, VT,
- DAG.getNode(ISD::VSELECT, DL, MVT::v32i8,
- DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, VSELECTMask),
+ DAG.getNode(ISD::VSELECT, DL, BlendVT,
+ DAG.getNode(ISD::BUILD_VECTOR, DL, BlendVT, VSELECTMask),
V1, V2));
}
}
}
-/// \brief Generic routine to lower a shuffle and blend as a decomposed set of
-/// unblended shuffles followed by an unshuffled blend.
+/// \brief Try to lower as a blend of elements from two inputs followed by
+/// a single-input permutation.
+///
+/// This matches the pattern where we can blend elements from two inputs and
+/// then reduce the shuffle to a single-input permutation.
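+/// For example, the v4 shuffle mask <2, 4, 1, 7> becomes a blend with mask
+/// <4, 1, 2, 7> followed by a single-input permute with mask <2, 0, 1, 3>.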
+static SDValue lowerVectorShuffleAsBlendAndPermute(SDLoc DL, MVT VT, SDValue V1,
+ SDValue V2,
+ ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
+ // We build up the blend mask while checking whether a blend is a viable way
+ // to reduce the shuffle.
+ SmallVector<int, 32> BlendMask(Mask.size(), -1);
+ SmallVector<int, 32> PermuteMask(Mask.size(), -1);
+
+ for (int i = 0, Size = Mask.size(); i < Size; ++i) {
+ if (Mask[i] < 0)
+ continue;
+
+ assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
+
+ if (BlendMask[Mask[i] % Size] == -1)
+ BlendMask[Mask[i] % Size] = Mask[i];
+ else if (BlendMask[Mask[i] % Size] != Mask[i])
+ return SDValue(); // Can't blend in the needed input!
+
+ PermuteMask[i] = Mask[i] % Size;
+ }
+
+ SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
+ return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
+}
+
+/// \brief Generic routine to decompose a shuffle and blend into independent
+/// blends and permutes.
///
/// This matches the extremely common pattern for handling combined
/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
-/// operations.
+/// operations. It will try to pick the best arrangement of shuffles and
+/// blends.
static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(SDLoc DL, MVT VT,
SDValue V1,
SDValue V2,
BlendMask[i] = i + Size;
}
+ // Try to lower with the simpler initial blend strategy unless one of the
+ // input shuffles would be a no-op. We prefer to shuffle inputs as the
+ // shuffle may be able to fold with a load or other benefit. However, when
+ // we'll have to do 2x as many shuffles in order to achieve this, blending
+ // first is a better strategy.
+ if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask))
+ if (SDValue BlendPerm =
+ lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG))
+ return BlendPerm;
+
V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
/// elements, and takes the low elements as the result. Note that while this is
/// specified as a *right shift* because x86 is little-endian, it is a *left
/// rotate* of the vector lanes.
-///
-/// Note that this only handles 128-bit vector widths currently.
static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1,
SDValue V2,
ArrayRef<int> Mask,
SelectionDAG &DAG) {
assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
+ int NumElts = Mask.size();
+ int NumLanes = VT.getSizeInBits() / 128;
+ int NumLaneElts = NumElts / NumLanes;
+
// We need to detect various ways of spelling a rotation:
// [11, 12, 13, 14, 15, 0, 1, 2]
// [-1, 12, 13, 14, -1, -1, 1, -1]
// [-1, 4, 5, 6, -1, -1, -1, -1]
int Rotation = 0;
SDValue Lo, Hi;
- for (int i = 0, Size = Mask.size(); i < Size; ++i) {
- if (Mask[i] == -1)
- continue;
- assert(Mask[i] >= 0 && "Only -1 is a valid negative mask element!");
+ for (int l = 0; l < NumElts; l += NumLaneElts) {
+ for (int i = 0; i < NumLaneElts; ++i) {
+ if (Mask[l + i] == -1)
+ continue;
+ assert(Mask[l + i] >= 0 && "Only -1 is a valid negative mask element!");
- // Based on the mod-Size value of this mask element determine where
- // a rotated vector would have started.
- int StartIdx = i - (Mask[i] % Size);
- if (StartIdx == 0)
- // The identity rotation isn't interesting, stop.
- return SDValue();
+ // Get the mod-Size index and lane correct it.
+ int LaneIdx = (Mask[l + i] % NumElts) - l;
+ // Make sure it was in this lane.
+ if (LaneIdx < 0 || LaneIdx >= NumLaneElts)
+ return SDValue();
- // If we found the tail of a vector the rotation must be the missing
- // front. If we found the head of a vector, it must be how much of the head.
- int CandidateRotation = StartIdx < 0 ? -StartIdx : Size - StartIdx;
+ // Determine where a rotated vector would have started.
+ int StartIdx = i - LaneIdx;
+ if (StartIdx == 0)
+ // The identity rotation isn't interesting, stop.
+ return SDValue();
- if (Rotation == 0)
- Rotation = CandidateRotation;
- else if (Rotation != CandidateRotation)
- // The rotations don't match, so we can't match this mask.
- return SDValue();
+ // If we found the tail of a vector the rotation must be the missing
+ // front. If we found the head of a vector, it must be how much of the
+ // head.
+ int CandidateRotation = StartIdx < 0 ? -StartIdx : NumLaneElts - StartIdx;
- // Compute which value this mask is pointing at.
- SDValue MaskV = Mask[i] < Size ? V1 : V2;
-
- // Compute which of the two target values this index should be assigned to.
- // This reflects whether the high elements are remaining or the low elements
- // are remaining.
- SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
-
- // Either set up this value if we've not encountered it before, or check
- // that it remains consistent.
- if (!TargetV)
- TargetV = MaskV;
- else if (TargetV != MaskV)
- // This may be a rotation, but it pulls from the inputs in some
- // unsupported interleaving.
- return SDValue();
+ if (Rotation == 0)
+ Rotation = CandidateRotation;
+ else if (Rotation != CandidateRotation)
+ // The rotations don't match, so we can't match this mask.
+ return SDValue();
+
+ // Compute which value this mask is pointing at.
+ SDValue MaskV = Mask[l + i] < NumElts ? V1 : V2;
+
+ // Compute which of the two target values this index should be assigned
+ // to. This reflects whether the high elements are remaining or the low
+ // elements are remaining.
+ SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
+
+ // Either set up this value if we've not encountered it before, or check
+ // that it remains consistent.
+ if (!TargetV)
+ TargetV = MaskV;
+ else if (TargetV != MaskV)
+ // This may be a rotation, but it pulls from the inputs in some
+ // unsupported interleaving.
+ return SDValue();
+ }
}
// Check that we successfully analyzed the mask, and normalize the results.
else if (!Hi)
Hi = Lo;
- assert(VT.getSizeInBits() == 128 &&
- "Rotate-based lowering only supports 128-bit lowering!");
- assert(Mask.size() <= 16 &&
- "Can shuffle at most 16 bytes in a 128-bit vector!");
-
// The actual rotate instruction rotates bytes, so we need to scale the
- // rotation based on how many bytes are in the vector.
- int Scale = 16 / Mask.size();
+ // rotation based on how many bytes are in the vector lane.
+ int Scale = 16 / NumLaneElts;
- // SSSE3 targets can use the palignr instruction
+ // SSSE3 targets can use the palignr instruction.
if (Subtarget->hasSSSE3()) {
- // Cast the inputs to v16i8 to match PALIGNR.
- Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Lo);
- Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Hi);
+ // Cast the inputs to i8 vector of correct length to match PALIGNR.
+ MVT AlignVT = MVT::getVectorVT(MVT::i8, 16 * NumLanes);
+ Lo = DAG.getNode(ISD::BITCAST, DL, AlignVT, Lo);
+ Hi = DAG.getNode(ISD::BITCAST, DL, AlignVT, Hi);
return DAG.getNode(ISD::BITCAST, DL, VT,
- DAG.getNode(X86ISD::PALIGNR, DL, MVT::v16i8, Hi, Lo,
+ DAG.getNode(X86ISD::PALIGNR, DL, AlignVT, Hi, Lo,
DAG.getConstant(Rotation * Scale, MVT::i8)));
}
+ assert(VT.getSizeInBits() == 128 &&
+ "Rotate-based lowering only supports 128-bit lowering!");
+ assert(Mask.size() <= 16 &&
+ "Can shuffle at most 16 bytes in a 128-bit vector!");
+
// Default SSE2 implementation
int LoByteShift = 16 - Rotation * Scale;
int HiByteShift = Rotation * Scale;
Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Hi);
SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, Lo,
- DAG.getConstant(8 * LoByteShift, MVT::i8));
+ DAG.getConstant(LoByteShift, MVT::i8));
SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, Hi,
- DAG.getConstant(8 * HiByteShift, MVT::i8));
+ DAG.getConstant(HiByteShift, MVT::i8));
return DAG.getNode(ISD::BITCAST, DL, VT,
DAG.getNode(ISD::OR, DL, MVT::v2i64, LoShift, HiShift));
}
SDValue V1, SDValue V2) {
SmallBitVector Zeroable(Mask.size(), false);
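+  // Look through bitcasts so that all-zeros build vectors are recognized.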
+ while (V1.getOpcode() == ISD::BITCAST)
+ V1 = V1->getOperand(0);
+ while (V2.getOpcode() == ISD::BITCAST)
+ V2 = V2->getOperand(0);
+
bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
continue;
}
- // If this is an index into a build_vector node, dig out the input value and
- // use it.
+ // If this is an index into a build_vector node (which has the same number
+ // of elements), dig out the input value and use it.
SDValue V = M < Size ? V1 : V2;
- if (V.getOpcode() != ISD::BUILD_VECTOR)
+ if (V.getOpcode() != ISD::BUILD_VECTOR || Size != (int)V.getNumOperands())
continue;
SDValue Input = V.getOperand(M % Size);
return Zeroable;
}
-/// \brief Try to lower a vector shuffle as a byte shift (shifts in zeros).
+/// \brief Try to emit a bitmask instruction for a shuffle.
///
-/// Attempts to match a shuffle mask against the PSRLDQ and PSLLDQ SSE2
-/// byte-shift instructions. The mask must consist of a shifted sequential
-/// shuffle from one of the input vectors and zeroable elements for the
-/// remaining 'shifted in' elements.
-///
-/// Note that this only handles 128-bit vector widths currently.
-static SDValue lowerVectorShuffleAsByteShift(SDLoc DL, MVT VT, SDValue V1,
- SDValue V2, ArrayRef<int> Mask,
- SelectionDAG &DAG) {
- assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
+/// This handles cases where we can model a blend exactly as a bitmask due to
+/// one of the inputs being zeroable.
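+/// For example, when V2 is all zeros, the v4i32 mask <0, 5, 2, 7> lowers to
+/// an AND of V1 with the constant vector <-1, 0, -1, 0>.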
+static SDValue lowerVectorShuffleAsBitMask(SDLoc DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
+ MVT EltVT = VT.getScalarType();
+ int NumEltBits = EltVT.getSizeInBits();
+ MVT IntEltVT = MVT::getIntegerVT(NumEltBits);
+ SDValue Zero = DAG.getConstant(0, IntEltVT);
+  SDValue AllOnes =
+      DAG.getConstant(APInt::getAllOnesValue(NumEltBits), IntEltVT);
+ if (EltVT.isFloatingPoint()) {
+ Zero = DAG.getNode(ISD::BITCAST, DL, EltVT, Zero);
+ AllOnes = DAG.getNode(ISD::BITCAST, DL, EltVT, AllOnes);
+ }
+ SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
+ SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+ SDValue V;
+ for (int i = 0, Size = Mask.size(); i < Size; ++i) {
+ if (Zeroable[i])
+ continue;
+ if (Mask[i] % Size != i)
+ return SDValue(); // Not a blend.
+ if (!V)
+ V = Mask[i] < Size ? V1 : V2;
+ else if (V != (Mask[i] < Size ? V1 : V2))
+ return SDValue(); // Can only let one input through the mask.
+
+ VMaskOps[i] = AllOnes;
+ }
+ if (!V)
+ return SDValue(); // No non-zeroable elements!
+
+ SDValue VMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, VMaskOps);
+ V = DAG.getNode(VT.isFloatingPoint()
+ ? (unsigned) X86ISD::FAND : (unsigned) ISD::AND,
+ DL, VT, V, VMask);
+ return V;
+}
+/// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros).
+///
+/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
+/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
+/// matches elements from one of the input vectors shuffled to the left or
+/// right with zeroable elements 'shifted in'. It handles both the strictly
+/// bit-wise element shifts and the byte shift across an entire 128-bit double
+/// quad word lane.
+///
+/// PSLL : (little-endian) left bit shift.
+/// [ zz, 0, zz, 2 ]
+/// [ -1, 4, zz, -1 ]
+/// PSRL : (little-endian) right bit shift.
+/// [ 1, zz, 3, zz]
+/// [ -1, -1, 7, zz]
+/// PSLLDQ : (little-endian) left byte shift
+/// [ zz, 0, 1, 2, 3, 4, 5, 6]
+/// [ zz, zz, -1, -1, 2, 3, 4, -1]
+/// [ zz, zz, zz, zz, zz, zz, -1, 1]
+/// PSRLDQ : (little-endian) right byte shift
+/// [ 5, 6, 7, zz, zz, zz, zz, zz]
+/// [ -1, 5, 6, 7, zz, zz, zz, zz]
+/// [ 1, 2, -1, -1, -1, -1, zz, zz]
+static SDValue lowerVectorShuffleAsShift(SDLoc DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
int Size = Mask.size();
- int Scale = 16 / Size;
+ assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
+
+ auto CheckZeros = [&](int Shift, int Scale, bool Left) {
+ for (int i = 0; i < Size; i += Scale)
+ for (int j = 0; j < Shift; ++j)
+ if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
+ return false;
- auto isSequential = [](int Base, int StartIndex, int EndIndex, int MaskOffset,
- ArrayRef<int> Mask) {
- for (int i = StartIndex; i < EndIndex; i++) {
- if (Mask[i] < 0)
- continue;
- if (i + Base != Mask[i] - MaskOffset)
- return false;
- }
return true;
};
- for (int Shift = 1; Shift < Size; Shift++) {
- int ByteShift = Shift * Scale;
-
- // PSRLDQ : (little-endian) right byte shift
- // [ 5, 6, 7, zz, zz, zz, zz, zz]
- // [ -1, 5, 6, 7, zz, zz, zz, zz]
- // [ 1, 2, -1, -1, -1, -1, zz, zz]
- bool ZeroableRight = true;
- for (int i = Size - Shift; i < Size; i++) {
- ZeroableRight &= Zeroable[i];
- }
-
- if (ZeroableRight) {
- bool ValidShiftRight1 = isSequential(Shift, 0, Size - Shift, 0, Mask);
- bool ValidShiftRight2 = isSequential(Shift, 0, Size - Shift, Size, Mask);
-
- if (ValidShiftRight1 || ValidShiftRight2) {
- // Cast the inputs to v2i64 to match PSRLDQ.
- SDValue &TargetV = ValidShiftRight1 ? V1 : V2;
- SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TargetV);
- SDValue Shifted = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, V,
- DAG.getConstant(ByteShift * 8, MVT::i8));
- return DAG.getNode(ISD::BITCAST, DL, VT, Shifted);
- }
+ auto MatchShift = [&](int Shift, int Scale, bool Left, SDValue V) {
+ for (int i = 0; i != Size; i += Scale) {
+ unsigned Pos = Left ? i + Shift : i;
+ unsigned Low = Left ? i : i + Shift;
+ unsigned Len = Scale - Shift;
+ if (!isSequentialOrUndefInRange(Mask, Pos, Len,
+ Low + (V == V1 ? 0 : Size)))
+ return SDValue();
}
- // PSLLDQ : (little-endian) left byte shift
- // [ zz, 0, 1, 2, 3, 4, 5, 6]
- // [ zz, zz, -1, -1, 2, 3, 4, -1]
- // [ zz, zz, zz, zz, zz, zz, -1, 1]
- bool ZeroableLeft = true;
- for (int i = 0; i < Shift; i++) {
- ZeroableLeft &= Zeroable[i];
- }
+ int ShiftEltBits = VT.getScalarSizeInBits() * Scale;
+ bool ByteShift = ShiftEltBits > 64;
+ unsigned OpCode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
+ : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
+ int ShiftAmt = Shift * VT.getScalarSizeInBits() / (ByteShift ? 8 : 1);
- if (ZeroableLeft) {
- bool ValidShiftLeft1 = isSequential(-Shift, Shift, Size, 0, Mask);
- bool ValidShiftLeft2 = isSequential(-Shift, Shift, Size, Size, Mask);
+ // Normalize the scale for byte shifts to still produce an i64 element
+ // type.
+ Scale = ByteShift ? Scale / 2 : Scale;
- if (ValidShiftLeft1 || ValidShiftLeft2) {
- // Cast the inputs to v2i64 to match PSLLDQ.
- SDValue &TargetV = ValidShiftLeft1 ? V1 : V2;
- SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TargetV);
- SDValue Shifted = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, V,
- DAG.getConstant(ByteShift * 8, MVT::i8));
- return DAG.getNode(ISD::BITCAST, DL, VT, Shifted);
- }
- }
- }
+ // We need to round trip through the appropriate type for the shift.
+ MVT ShiftSVT = MVT::getIntegerVT(VT.getScalarSizeInBits() * Scale);
+ MVT ShiftVT = MVT::getVectorVT(ShiftSVT, Size / Scale);
+ assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
+ "Illegal integer vector type");
+ V = DAG.getNode(ISD::BITCAST, DL, ShiftVT, V);
+
+ V = DAG.getNode(OpCode, DL, ShiftVT, V, DAG.getConstant(ShiftAmt, MVT::i8));
+ return DAG.getNode(ISD::BITCAST, DL, VT, V);
+ };
+ // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
+ // keep doubling the size of the integer elements up to that. We can
+ // then shift the elements of the integer vector by whole multiples of
+ // their width within the elements of the larger integer vector. Test each
+ // multiple to see if we can find a match with the moved element indices
+ // and that the shifted-in elements are all zeroable.
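+ // For example (illustrative): the v8i16 mask [zz, 0, 1, 2, 3, 4, 5, 6]
+ // matches at Scale == 8, Shift == 1 and lowers to VSHLDQ (PSLLDQ) by 2
+ // bytes, while [1, zz, 3, zz, 5, zz, 7, zz] matches at Scale == 2,
+ // Shift == 1 and lowers to VSRLI (PSRLD) by 16 bits within each i32
+ // element.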
+ for (int Scale = 2; Scale * VT.getScalarSizeInBits() <= 128; Scale *= 2)
+ for (int Shift = 1; Shift != Scale; ++Shift)
+ for (bool Left : {true, false})
+ if (CheckZeros(Shift, Scale, Left))
+ for (SDValue V : {V1, V2})
+ if (SDValue Match = MatchShift(Shift, Scale, Left, V))
+ return Match;
+
+ // No match.
return SDValue();
}
/// stride, produce either a zero or any extension based on the available
/// features of the subtarget.
static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
- SDLoc DL, MVT VT, int NumElements, int Scale, bool AnyExt, SDValue InputV,
+ SDLoc DL, MVT VT, int Scale, bool AnyExt, SDValue InputV,
const X86Subtarget *Subtarget, SelectionDAG &DAG) {
assert(Scale > 1 && "Need a scale to extend.");
- int EltBits = VT.getSizeInBits() / NumElements;
+ int NumElements = VT.getVectorNumElements();
+ int EltBits = VT.getScalarSizeInBits();
assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
"Only 8, 16, and 32 bit elements can be extended.");
assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
// Found a valid zext mask! Try various lowering strategies based on the
// input type and available ISA extensions.
if (Subtarget->hasSSE41()) {
- MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
NumElements / Scale);
- InputV = DAG.getNode(ISD::BITCAST, DL, InputVT, InputV);
return DAG.getNode(ISD::BITCAST, DL, VT,
DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV));
}
return DAG.getNode(ISD::BITCAST, DL, VT, InputV);
}
-/// \brief Try to lower a vector shuffle as a zero extension on any micrarch.
+/// \brief Try to lower a vector shuffle as a zero extension on any microarch.
///
/// This routine will try to do everything in its power to cleverly lower
/// a shuffle which happens to match the pattern of a zero extend. It doesn't
SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
int Bits = VT.getSizeInBits();
- int NumElements = Mask.size();
+ int NumElements = VT.getVectorNumElements();
+ assert(VT.getScalarSizeInBits() <= 32 &&
+ "Exceeds 32-bit integer zero extension limit");
+ assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
// Define a helper function to check a particular ext-scale and lower to it if
// valid.
if (Mask[i] == -1)
continue; // Valid anywhere but doesn't tell us anything.
if (i % Scale != 0) {
- // Each of the extend elements needs to be zeroable.
+ // Each of the extended elements needs to be zeroable.
if (!Zeroable[i])
return SDValue();
- // We no lorger are in the anyext case.
+ // We no longer are in the anyext case.
AnyExt = false;
continue;
}
return SDValue(); // Flip-flopping inputs.
if (Mask[i] % NumElements != i / Scale)
- return SDValue(); // Non-consecutive strided elemenst.
+ return SDValue(); // Non-consecutive strided elements.
}
// If we fail to find an input, we have a zero-shuffle which should always
return SDValue();
return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
- DL, VT, NumElements, Scale, AnyExt, InputV, Subtarget, DAG);
+ DL, VT, Scale, AnyExt, InputV, Subtarget, DAG);
};
// The widest scale possible for extending is to a 64-bit integer.
// many elements.
for (; NumExtElements < NumElements; NumExtElements *= 2) {
assert(NumElements % NumExtElements == 0 &&
- "The input vector size must be divisble by the extended size.");
+ "The input vector size must be divisible by the extended size.");
if (SDValue V = Lower(NumElements / NumExtElements))
return V;
}
+ // General extends failed, but 128-bit vectors may be able to use MOVQ.
+ if (Bits != 128)
+ return SDValue();
+
+ // Returns one of the source operands if the shuffle can be reduced to a
+ // MOVQ, copying the lower 64 bits and zeroing the upper 64 bits.
+ auto CanZExtLowHalf = [&]() {
+ for (int i = NumElements / 2; i != NumElements; ++i)
+ if (!Zeroable[i])
+ return SDValue();
+ if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
+ return V1;
+ if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
+ return V2;
+ return SDValue();
+ };
+
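+ // E.g. (illustrative) the v4i32 mask [0, 1, zz, zz] copies the low 64 bits
+ // of V1 and zeroes the upper 64 bits, which is exactly MOVQ's behaviour.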
+ if (SDValue V = CanZExtLowHalf()) {
+ V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, V);
+ V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
+ return DAG.getNode(ISD::BITCAST, DL, VT, V);
+ }
+
// No viable ext lowering found.
return SDValue();
}
ExtVT, V1, V2);
}
+ // This lowering only works for the low element with floating point vectors.
+ if (VT.isFloatingPoint() && V2Index != 0)
+ return SDValue();
+
V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
if (ExtVT != VT)
V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2);
V2 = DAG.getNode(
X86ISD::VSHLDQ, DL, MVT::v2i64, V2,
DAG.getConstant(
- V2Index * EltVT.getSizeInBits(),
+ V2Index * EltVT.getSizeInBits() / 8,
DAG.getTargetLoweringInfo().getScalarShiftAmountTy(MVT::v2i64)));
V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2);
}
return DAG.getNode(X86ISD::VBROADCAST, DL, VT, V);
}
+// Check for whether we can use INSERTPS to perform the shuffle. We only use
+// INSERTPS when the V1 elements are already in the correct locations
+// because otherwise we can just always use two SHUFPS instructions which
+// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
+// perform INSERTPS if a single V1 element is out of place and all V2
+// elements are zeroable.
+static SDValue lowerVectorShuffleAsInsertPS(SDValue Op, SDValue V1, SDValue V2,
+ ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
+ assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!");
+ assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
+ assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
+
+ SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+
+ unsigned ZMask = 0;
+ int V1DstIndex = -1;
+ int V2DstIndex = -1;
+ bool V1UsedInPlace = false;
+
+ for (int i = 0; i < 4; ++i) {
+ // Synthesize a zero mask from the zeroable elements (includes undefs).
+ if (Zeroable[i]) {
+ ZMask |= 1 << i;
+ continue;
+ }
+
+ // Flag if we use any V1 inputs in place.
+ if (i == Mask[i]) {
+ V1UsedInPlace = true;
+ continue;
+ }
+
+ // We can only insert a single non-zeroable element.
+ if (V1DstIndex != -1 || V2DstIndex != -1)
+ return SDValue();
+
+ if (Mask[i] < 4) {
+ // V1 input out of place for insertion.
+ V1DstIndex = i;
+ } else {
+ // V2 input for insertion.
+ V2DstIndex = i;
+ }
+ }
+
+ // Don't bother if we have no (non-zeroable) element for insertion.
+ if (V1DstIndex == -1 && V2DstIndex == -1)
+ return SDValue();
+
+ // Determine element insertion src/dst indices. The src index is from the
+ // start of the inserted vector, not the start of the concatenated vector.
+ unsigned V2SrcIndex = 0;
+ if (V1DstIndex != -1) {
+ // If we have a V1 input out of place, we use V1 as the V2 element insertion
+ // and don't use the original V2 at all.
+ V2SrcIndex = Mask[V1DstIndex];
+ V2DstIndex = V1DstIndex;
+ V2 = V1;
+ } else {
+ V2SrcIndex = Mask[V2DstIndex] - 4;
+ }
+
+ // If no V1 inputs are used in place, then the result is created only from
+ // the zero mask and the V2 insertion - so remove V1 dependency.
+ if (!V1UsedInPlace)
+ V1 = DAG.getUNDEF(MVT::v4f32);
+
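+ // The INSERTPS immediate encodes the source element in bits [7:6], the
+ // destination element in bits [5:4] and the zero mask in bits [3:0]. For
+ // example (illustrative): the mask [0, 5, 2, zz] gives V2SrcIndex == 1,
+ // V2DstIndex == 1 and ZMask == 0b1000, i.e. an immediate of 0x58.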
+ unsigned InsertPSMask = V2SrcIndex << 6 | V2DstIndex << 4 | ZMask;
+ assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
+
+ // Insert the V2 element into the desired position.
+ SDLoc DL(Op);
+ return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
+ DAG.getConstant(InsertPSMask, MVT::i8));
+}
+
+/// \brief Try to lower a shuffle as a permute of the inputs followed by an
+/// UNPCK instruction.
+///
+/// This specifically targets cases where we end up alternating between the
+/// two inputs, and so can permute them into something that feeds a single
+/// UNPCK instruction. Note that this routine only targets integer vectors
+/// because for floating point vectors we have a generalized SHUFPS lowering
+/// strategy that handles everything that doesn't *exactly* match an unpack,
+/// making this clever lowering unnecessary.
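+///
+/// For example (illustrative): the v4i32 mask [0, 6, 1, 7] is not itself an
+/// unpack, but pre-shuffling V2 by [2, 3, -1, -1] turns it into a single
+/// UNPCKL of the permuted inputs.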
+static SDValue lowerVectorShuffleAsUnpack(MVT VT, SDLoc DL, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
+ assert(!VT.isFloatingPoint() &&
+ "This routine only supports integer vectors.");
+ assert(!isSingleInputShuffleMask(Mask) &&
+ "This routine should only be used when blending two inputs.");
+ assert(Mask.size() >= 2 && "Single element masks are invalid.");
+
+ int Size = Mask.size();
+
+ int NumLoInputs = std::count_if(Mask.begin(), Mask.end(), [Size](int M) {
+ return M >= 0 && M % Size < Size / 2;
+ });
+ int NumHiInputs = std::count_if(
+ Mask.begin(), Mask.end(), [Size](int M) { return M % Size >= Size / 2; });
+
+ bool UnpackLo = NumLoInputs >= NumHiInputs;
+
+ auto TryUnpack = [&](MVT UnpackVT, int Scale) {
+ SmallVector<int, 32> V1Mask(Mask.size(), -1);
+ SmallVector<int, 32> V2Mask(Mask.size(), -1);
+
+ for (int i = 0; i < Size; ++i) {
+ if (Mask[i] < 0)
+ continue;
+
+ // Each element of the unpack contains Scale elements from this mask.
+ int UnpackIdx = i / Scale;
+
+ // We only handle the case where V1 feeds the first slots of the unpack.
+ // We rely on canonicalization to ensure this is the case.
+ if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
+ return SDValue();
+
+ // Setup the mask for this input. The indexing is tricky as we have to
+ // handle the unpack stride.
+ SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
+ VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
+ Mask[i] % Size;
+ }
+
+ // Shuffle the inputs into place.
+ V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
+ V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
+
+ // Cast the inputs to the type we will use to unpack them.
+ V1 = DAG.getNode(ISD::BITCAST, DL, UnpackVT, V1);
+ V2 = DAG.getNode(ISD::BITCAST, DL, UnpackVT, V2);
+
+ // Unpack the inputs and cast the result back to the desired type.
+ return DAG.getNode(ISD::BITCAST, DL, VT,
+ DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH,
+ DL, UnpackVT, V1, V2));
+ };
+
+ // We try each unpack from the largest to the smallest to try and find one
+ // that fits this mask.
+ int OrigNumElements = VT.getVectorNumElements();
+ int OrigScalarSize = VT.getScalarSizeInBits();
+ for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2) {
+ int Scale = ScalarSize / OrigScalarSize;
+ int NumElements = OrigNumElements / Scale;
+ MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), NumElements);
+ if (SDValue Unpack = TryUnpack(UnpackVT, Scale))
+ return Unpack;
+ }
+
+ return SDValue();
+}
+
/// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
///
/// This is the basis function for the 2-lane 64-bit shuffles as we have full
assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
if (isSingleInputShuffleMask(Mask)) {
+ // Use low duplicate instructions for masks that match their pattern.
+ if (Subtarget->hasSSE3())
+ if (isShuffleEquivalent(V1, V2, Mask, 0, 0))
+ return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, V1);
+
// Straight shuffle of a single input vector. Simulate this by using the
// single input as both of the "inputs" to this instruction..
unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
assert(Mask[1] >= 2 && "Non-canonicalized blend!");
- // Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(Mask, 0, 2))
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2f64, V1, V2);
- if (isShuffleEquivalent(Mask, 1, 3))
- return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2f64, V1, V2);
-
// If we have a single input, insert that into V1 if we can do so cheaply.
if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
// Try to use one of the special instruction patterns to handle two common
// blend patterns if a zero-blend above didn't work.
- if (isShuffleEquivalent(Mask, 0, 3) || isShuffleEquivalent(Mask, 1, 3))
+ if (isShuffleEquivalent(V1, V2, Mask, 0, 3) ||
+ isShuffleEquivalent(V1, V2, Mask, 1, 3))
if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
// We can either use a special instruction to load over the low double or
// to move just the low double.
Subtarget, DAG))
return Blend;
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (isShuffleEquivalent(V1, V2, Mask, 0, 2))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2f64, V1, V2);
+ if (isShuffleEquivalent(V1, V2, Mask, 1, 3))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2f64, V1, V2);
+
unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V2,
DAG.getConstant(SHUFPDMask, MVT::i8));
DAG.getNode(X86ISD::PSHUFD, SDLoc(Op), MVT::v4i32, V1,
getV4X86ShuffleImm8ForMask(WidenedMask, DAG)));
}
+ assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
+ assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
+ assert(Mask[0] < 2 && "We sort V1 to be the first input.");
+ assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
+
+ // If we have a blend of two PACKUS operations and the blend aligns with the
+ // low and high halves, we can just merge the PACKUS operations. This is
+ // particularly important as it lets us merge shuffles that this routine
+ // itself creates.
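+ // E.g. (illustrative) blending lane 0 of PACKUS(A, B) with lane 1 of
+ // PACKUS(C, D) (the v2i64 mask [0, 3]) folds to the single PACKUS(A, D).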
+ auto GetPackNode = [](SDValue V) {
+ while (V.getOpcode() == ISD::BITCAST)
+ V = V.getOperand(0);
- // Try to use byte shift instructions.
- if (SDValue Shift = lowerVectorShuffleAsByteShift(
- DL, MVT::v2i64, V1, V2, Mask, DAG))
+ return V.getOpcode() == X86ISD::PACKUS ? V : SDValue();
+ };
+ if (SDValue V1Pack = GetPackNode(V1))
+ if (SDValue V2Pack = GetPackNode(V2))
+ return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
+ DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8,
+ Mask[0] == 0 ? V1Pack.getOperand(0)
+ : V1Pack.getOperand(1),
+ Mask[1] == 2 ? V2Pack.getOperand(0)
+ : V2Pack.getOperand(1)));
+
+ // Try to use shift instructions.
+ if (SDValue Shift =
+ lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, DAG))
return Shift;
- // If we have a single input from V2 insert that into V1 if we can do so
- // cheaply.
- if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
- if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
- MVT::v2i64, DL, V1, V2, Mask, Subtarget, DAG))
- return Insertion;
- // Try inverting the insertion since for v2 masks it is easy to do and we
- // can't reliably sort the mask one way or the other.
- int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
- Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
- if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
- MVT::v2i64, DL, V2, V1, InverseMask, Subtarget, DAG))
- return Insertion;
- }
+ // When loading a scalar and then shuffling it into a vector we can often do
+ // the insertion cheaply.
+ if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
+ MVT::v2i64, DL, V1, V2, Mask, Subtarget, DAG))
+ return Insertion;
+ // Try inverting the insertion since for v2 masks it is easy to do and we
+ // can't reliably sort the mask one way or the other.
+ int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
+ if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
+ MVT::v2i64, DL, V2, V1, InverseMask, Subtarget, DAG))
+ return Insertion;
+
+ // We have different paths for blend lowering, but they all must use the
+ // *exact* same predicate.
+ bool IsBlendSupported = Subtarget->hasSSE41();
+ if (IsBlendSupported)
+ if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
+ Subtarget, DAG))
+ return Blend;
// Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(Mask, 0, 2))
+ if (isShuffleEquivalent(V1, V2, Mask, 0, 2))
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, V1, V2);
- if (isShuffleEquivalent(Mask, 1, 3))
+ if (isShuffleEquivalent(V1, V2, Mask, 1, 3))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2i64, V1, V2);
- if (Subtarget->hasSSE41())
- if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
- Subtarget, DAG))
- return Blend;
-
// Try to use byte rotation instructions.
// It's more profitable for pre-SSSE3 to use shuffles/unpacks.
if (Subtarget->hasSSSE3())
DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
return Rotate;
+ // If we have direct support for blends, we should lower by decomposing into
+ // a permute. That will be faster than the domain cross.
+ if (IsBlendSupported)
+ return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2,
+ Mask, DAG);
+
// We implement this with SHUFPD which is pretty lame because it will likely
// incur 2 cycles of stall for integer vectors on Nehalem and older chips.
// However, all the alternatives are still more cycles and newer chips don't
DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
}
+/// \brief Test whether this can be lowered with a single SHUFPS instruction.
+///
+/// This is used to disable more specialized lowerings when the shufps lowering
+/// will happen to be efficient.
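+///
+/// For example (illustrative): [0, 2, 4, 6] qualifies since each half reads
+/// from a single source, while [0, 4, 1, 5] does not since both halves mix
+/// the two sources.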
+static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
+ // This routine only handles 128-bit shufps.
+ assert(Mask.size() == 4 && "Unsupported mask size!");
+
+ // To lower with a single SHUFPS we need to have the low half and high half
+ // each requiring a single input.
+ if (Mask[0] != -1 && Mask[1] != -1 && (Mask[0] < 4) != (Mask[1] < 4))
+ return false;
+ if (Mask[2] != -1 && Mask[3] != -1 && (Mask[2] < 4) != (Mask[3] < 4))
+ return false;
+
+ return true;
+}
+
/// \brief Lower a vector shuffle using the SHUFPS instruction.
///
/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
Mask, Subtarget, DAG))
return Broadcast;
+ // Use even/odd duplicate instructions for masks that match their pattern.
+ if (Subtarget->hasSSE3()) {
+ if (isShuffleEquivalent(V1, V2, Mask, 0, 0, 2, 2))
+ return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
+ if (isShuffleEquivalent(V1, V2, Mask, 1, 1, 3, 3))
+ return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
+ }
+
if (Subtarget->hasAVX()) {
// If we have AVX, we can use VPERMILPS which will allow folding a load
// into the shuffle.
getV4X86ShuffleImm8ForMask(Mask, DAG));
}
- // Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(Mask, 0, 4, 1, 5))
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V1, V2);
- if (isShuffleEquivalent(Mask, 2, 6, 3, 7))
- return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V1, V2);
-
// There are special ways we can lower some single-element blends. However, we
// have custom ways we can lower more complex single-element blends below that
// we defer to if both this and BLENDPS fail to match, so restrict this to
Mask, Subtarget, DAG))
return V;
- if (Subtarget->hasSSE41())
+ if (Subtarget->hasSSE41()) {
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
Subtarget, DAG))
return Blend;
- // Check for whether we can use INSERTPS to perform the blend. We only use
- // INSERTPS when the V1 elements are already in the correct locations
- // because otherwise we can just always use two SHUFPS instructions which
- // are much smaller to encode than a SHUFPS and an INSERTPS.
- if (NumV2Elements == 1 && Subtarget->hasSSE41()) {
- int V2Index =
- std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) -
- Mask.begin();
-
- // When using INSERTPS we can zero any lane of the destination. Collect
- // the zero inputs into a mask and drop them from the lanes of V1 which
- // actually need to be present as inputs to the INSERTPS.
- SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
-
- // Synthesize a shuffle mask for the non-zero and non-v2 inputs.
- bool InsertNeedsShuffle = false;
- unsigned ZMask = 0;
- for (int i = 0; i < 4; ++i)
- if (i != V2Index) {
- if (Zeroable[i]) {
- ZMask |= 1 << i;
- } else if (Mask[i] != i) {
- InsertNeedsShuffle = true;
- break;
- }
- }
-
- // We don't want to use INSERTPS or other insertion techniques if it will
- // require shuffling anyways.
- if (!InsertNeedsShuffle) {
- // If all of V1 is zeroable, replace it with undef.
- if ((ZMask | 1 << V2Index) == 0xF)
- V1 = DAG.getUNDEF(MVT::v4f32);
-
- unsigned InsertPSMask = (Mask[V2Index] - 4) << 6 | V2Index << 4 | ZMask;
- assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
+ // Use INSERTPS if we can complete the shuffle efficiently.
+ if (SDValue V = lowerVectorShuffleAsInsertPS(Op, V1, V2, Mask, DAG))
+ return V;
- // Insert the V2 element into the desired position.
- return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
- DAG.getConstant(InsertPSMask, MVT::i8));
- }
+ if (!isSingleSHUFPSMask(Mask))
+ if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(
+ DL, MVT::v4f32, V1, V2, Mask, DAG))
+ return BlendPerm;
}
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (isShuffleEquivalent(V1, V2, Mask, 0, 4, 1, 5))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V1, V2);
+ if (isShuffleEquivalent(V1, V2, Mask, 2, 6, 3, 7))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V1, V2);
+ if (isShuffleEquivalent(V1, V2, Mask, 4, 0, 5, 1))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V2, V1);
+ if (isShuffleEquivalent(V1, V2, Mask, 6, 2, 7, 3))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V2, V1);
+
// Otherwise fall back to a SHUFPS lowering strategy.
return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
}
// so prevents folding a load into this instruction or making a copy.
const int UnpackLoMask[] = {0, 0, 1, 1};
const int UnpackHiMask[] = {2, 2, 3, 3};
- if (isShuffleEquivalent(Mask, 0, 0, 1, 1))
+ if (isShuffleEquivalent(V1, V2, Mask, 0, 0, 1, 1))
Mask = UnpackLoMask;
- else if (isShuffleEquivalent(Mask, 2, 2, 3, 3))
+ else if (isShuffleEquivalent(V1, V2, Mask, 2, 2, 3, 3))
Mask = UnpackHiMask;
return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
getV4X86ShuffleImm8ForMask(Mask, DAG));
}
- // Try to use byte shift instructions.
- if (SDValue Shift = lowerVectorShuffleAsByteShift(
- DL, MVT::v4i32, V1, V2, Mask, DAG))
+ // Try to use shift instructions.
+ if (SDValue Shift =
+ lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, DAG))
return Shift;
// There are special ways we can lower some single-element blends.
Mask, Subtarget, DAG))
return V;
- // Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(Mask, 0, 4, 1, 5))
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V1, V2);
- if (isShuffleEquivalent(Mask, 2, 6, 3, 7))
- return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V1, V2);
-
- if (Subtarget->hasSSE41())
+ // We have different paths for blend lowering, but they all must use the
+ // *exact* same predicate.
+ bool IsBlendSupported = Subtarget->hasSSE41();
+ if (IsBlendSupported)
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
Subtarget, DAG))
return Blend;
+ if (SDValue Masked =
+ lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask, DAG))
+ return Masked;
+
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (isShuffleEquivalent(V1, V2, Mask, 0, 4, 1, 5))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V1, V2);
+ if (isShuffleEquivalent(V1, V2, Mask, 2, 6, 3, 7))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V1, V2);
+ if (isShuffleEquivalent(V1, V2, Mask, 4, 0, 5, 1))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V2, V1);
+ if (isShuffleEquivalent(V1, V2, Mask, 6, 2, 7, 3))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V2, V1);
+
// Try to use byte rotation instructions.
// It's more profitable for pre-SSSE3 to use shuffles/unpacks.
if (Subtarget->hasSSSE3())
DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
return Rotate;
+ // If we have direct support for blends, we should lower by decomposing into
+ // a permute. That will be faster than the domain cross.
+ if (IsBlendSupported)
+ return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2,
+ Mask, DAG);
+
+ // Try to lower by permuting the inputs into an unpack instruction.
+ if (SDValue Unpack =
+ lowerVectorShuffleAsUnpack(MVT::v4i32, DL, V1, V2, Mask, DAG))
+ return Unpack;
+
// We implement this with SHUFPS because it can blend from two vectors.
// Because we're going to eventually use SHUFPS, we use SHUFPS even to build
// up the inputs, bypassing domain shift penalties that we would incur if we
Mask, Subtarget, DAG))
return Broadcast;
- // Try to use byte shift instructions.
- if (SDValue Shift = lowerVectorShuffleAsByteShift(
- DL, MVT::v8i16, V, V, Mask, DAG))
+ // Try to use shift instructions.
+ if (SDValue Shift =
+ lowerVectorShuffleAsShift(DL, MVT::v8i16, V, V, Mask, DAG))
return Shift;
// Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(Mask, 0, 0, 1, 1, 2, 2, 3, 3))
+ if (isShuffleEquivalent(V, V, Mask, 0, 0, 1, 1, 2, 2, 3, 3))
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V, V);
- if (isShuffleEquivalent(Mask, 4, 4, 5, 5, 6, 6, 7, 7))
+ if (isShuffleEquivalent(V, V, Mask, 4, 4, 5, 5, 6, 6, 7, 7))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V, V);
// Try to use byte rotation instructions.
DAG.getUNDEF(MVT::v8i16), Mask);
}
+/// \brief Helper to form a PSHUFB-based shuffle+blend.
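+///
+/// Both inputs are byte-shuffled with PSHUFB, using the 0x80 index to zero
+/// any byte that the other input supplies, and the results are merged with a
+/// bitwise OR. E.g. (illustrative) the v8i16 mask [0, 9, 2, 11, 4, 13, 6, 15]
+/// becomes two 16-byte PSHUFB masks and a single OR.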
+static SDValue lowerVectorShuffleAsPSHUFB(SDLoc DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ SelectionDAG &DAG, bool &V1InUse,
+ bool &V2InUse) {
+ SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+ SDValue V1Mask[16];
+ SDValue V2Mask[16];
+ V1InUse = false;
+ V2InUse = false;
+
+ int Size = Mask.size();
+ int Scale = 16 / Size;
+ for (int i = 0; i < 16; ++i) {
+ if (Mask[i / Scale] == -1) {
+ V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
+ } else {
+ const int ZeroMask = 0x80;
+ int V1Idx = Mask[i / Scale] < Size ? Mask[i / Scale] * Scale + i % Scale
+ : ZeroMask;
+ int V2Idx = Mask[i / Scale] < Size
+ ? ZeroMask
+ : (Mask[i / Scale] - Size) * Scale + i % Scale;
+ if (Zeroable[i / Scale])
+ V1Idx = V2Idx = ZeroMask;
+ V1Mask[i] = DAG.getConstant(V1Idx, MVT::i8);
+ V2Mask[i] = DAG.getConstant(V2Idx, MVT::i8);
+ V1InUse |= (ZeroMask != V1Idx);
+ V2InUse |= (ZeroMask != V2Idx);
+ }
+ }
+
+ if (V1InUse)
+ V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
+ DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, V1),
+ DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V1Mask));
+ if (V2InUse)
+ V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
+ DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, V2),
+ DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V2Mask));
+
+ // If we need shuffled inputs from both, blend the two.
+ SDValue V;
+ if (V1InUse && V2InUse)
+ V = DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
+ else
+ V = V1InUse ? V1 : V2;
+
+ // Cast the result back to the correct type.
+ return DAG.getNode(ISD::BITCAST, DL, VT, V);
+}
+
/// \brief Generic lowering of 8-lane i16 shuffles.
///
/// This handles both single-input shuffles and combined shuffle/blends with
assert(NumV1Inputs > 0 && "All single-input shuffles should be canonicalized "
"to be V1-input shuffles.");
- // Try to use byte shift instructions.
- if (SDValue Shift = lowerVectorShuffleAsByteShift(
- DL, MVT::v8i16, V1, V2, Mask, DAG))
+ // Try to use shift instructions.
+ if (SDValue Shift =
+ lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, DAG))
return Shift;
// There are special ways we can lower some single-element blends.
Mask, Subtarget, DAG))
return V;
- // Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 2, 10, 3, 11))
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V1, V2);
- if (isShuffleEquivalent(Mask, 4, 12, 5, 13, 6, 14, 7, 15))
- return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V1, V2);
-
- if (Subtarget->hasSSE41())
+ // We have different paths for blend lowering, but they all must use the
+ // *exact* same predicate.
+ bool IsBlendSupported = Subtarget->hasSSE41();
+ if (IsBlendSupported)
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
Subtarget, DAG))
return Blend;
+ if (SDValue Masked =
+ lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask, DAG))
+ return Masked;
+
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (isShuffleEquivalent(V1, V2, Mask, 0, 8, 1, 9, 2, 10, 3, 11))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V1, V2);
+ if (isShuffleEquivalent(V1, V2, Mask, 4, 12, 5, 13, 6, 14, 7, 15))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V1, V2);
+
// Try to use byte rotation instructions.
if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
return Rotate;
+ if (SDValue BitBlend =
+ lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
+ return BitBlend;
+
if (NumV1Inputs + NumV2Inputs <= 4)
return lowerV8I16BasicBlendVectorShuffle(DL, V1, V2, Mask, Subtarget, DAG);
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, Evens, Odds);
}
- int LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
- int HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
+ // Try to lower by permuting the inputs into an unpack instruction unless we
+ // have direct support for blending.
+ if (!IsBlendSupported) {
+ if (SDValue Unpack =
+ lowerVectorShuffleAsUnpack(MVT::v8i16, DL, V1, V2, Mask, DAG))
+ return Unpack;
- for (int i = 0; i < 4; ++i) {
- LoBlendMask[i] = Mask[i];
- HiBlendMask[i] = Mask[i + 4];
+ // If we can use PSHUFB, that will be better as it can both shuffle and set
+ // up an efficient blend.
+ if (Subtarget->hasSSSE3()) {
+ bool V1InUse, V2InUse;
+ return lowerVectorShuffleAsPSHUFB(DL, MVT::v8i16, V1, V2, Mask, DAG,
+ V1InUse, V2InUse);
+ }
}
- SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, LoBlendMask);
- SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, HiBlendMask);
- LoV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, LoV);
- HiV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, HiV);
-
- return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
- DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, LoV, HiV));
+ // We can always bit-blend if we have to so the fallback strategy is to
+ // decompose into single-input permutes and blends.
+ return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
+ Mask, DAG);
}
/// \brief Check whether a compaction lowering can be done by dropping even
assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
- ArrayRef<int> OrigMask = SVOp->getMask();
- assert(OrigMask.size() == 16 && "Unexpected mask size for v16 shuffle!");
+ ArrayRef<int> Mask = SVOp->getMask();
+ assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
- // Try to use byte shift instructions.
- if (SDValue Shift = lowerVectorShuffleAsByteShift(
- DL, MVT::v16i8, V1, V2, OrigMask, DAG))
+ // Try to use shift instructions.
+ if (SDValue Shift =
+ lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, DAG))
return Shift;
// Try to use byte rotation instructions.
if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
- DL, MVT::v16i8, V1, V2, OrigMask, Subtarget, DAG))
+ DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
return Rotate;
// Try to use a zext lowering.
if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
- DL, MVT::v16i8, V1, V2, OrigMask, Subtarget, DAG))
+ DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
return ZExt;
- int MaskStorage[16] = {
- OrigMask[0], OrigMask[1], OrigMask[2], OrigMask[3],
- OrigMask[4], OrigMask[5], OrigMask[6], OrigMask[7],
- OrigMask[8], OrigMask[9], OrigMask[10], OrigMask[11],
- OrigMask[12], OrigMask[13], OrigMask[14], OrigMask[15]};
- MutableArrayRef<int> Mask(MaskStorage);
- MutableArrayRef<int> LoMask = Mask.slice(0, 8);
- MutableArrayRef<int> HiMask = Mask.slice(8, 8);
-
int NumV2Elements =
std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 16; });
return V;
}
- // Check whether an interleaving lowering is likely to be more efficient.
- // This isn't perfect but it is a strong heuristic that tends to work well on
- // the kinds of shuffles that show up in practice.
- //
- // FIXME: We need to handle other interleaving widths (i16, i32, ...).
- if (shouldLowerAsInterleaving(Mask)) {
- int NumLoHalf = std::count_if(Mask.begin(), Mask.end(), [](int M) {
- return (M >= 0 && M < 8) || (M >= 16 && M < 24);
- });
- int NumHiHalf = std::count_if(Mask.begin(), Mask.end(), [](int M) {
- return (M >= 8 && M < 16) || M >= 24;
- });
- int EMask[16] = {-1, -1, -1, -1, -1, -1, -1, -1,
- -1, -1, -1, -1, -1, -1, -1, -1};
- int OMask[16] = {-1, -1, -1, -1, -1, -1, -1, -1,
- -1, -1, -1, -1, -1, -1, -1, -1};
- bool UnpackLo = NumLoHalf >= NumHiHalf;
- MutableArrayRef<int> TargetEMask(UnpackLo ? EMask : EMask + 8, 8);
- MutableArrayRef<int> TargetOMask(UnpackLo ? OMask : OMask + 8, 8);
- for (int i = 0; i < 8; ++i) {
- TargetEMask[i] = Mask[2 * i];
- TargetOMask[i] = Mask[2 * i + 1];
- }
-
- SDValue Evens = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, EMask);
- SDValue Odds = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, OMask);
-
- return DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
- MVT::v16i8, Evens, Odds);
- }
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (isShuffleEquivalent(V1, V2, Mask,
+ 0, 16, 1, 17, 2, 18, 3, 19,
+ 4, 20, 5, 21, 6, 22, 7, 23))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V1, V2);
+ if (isShuffleEquivalent(V1, V2, Mask,
+ 8, 24, 9, 25, 10, 26, 11, 27,
+ 12, 28, 13, 29, 14, 30, 15, 31))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V1, V2);
// Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
// with PSHUFB. It is important to do this before we attempt to generate any
// interleavings with direct instructions supporting them. We currently don't
// handle those well here.
if (Subtarget->hasSSSE3()) {
- SDValue V1Mask[16];
- SDValue V2Mask[16];
- for (int i = 0; i < 16; ++i)
- if (Mask[i] == -1) {
- V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
- } else {
- V1Mask[i] = DAG.getConstant(Mask[i] < 16 ? Mask[i] : 0x80, MVT::i8);
- V2Mask[i] =
- DAG.getConstant(Mask[i] < 16 ? 0x80 : Mask[i] - 16, MVT::i8);
- }
- V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V1,
- DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V1Mask));
- if (isSingleInputShuffleMask(Mask))
- return V1; // Single inputs are easy.
+ bool V1InUse = false;
+ bool V2InUse = false;
- // Otherwise, blend the two.
- V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V2,
- DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V2Mask));
- return DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
+ SDValue PSHUFB = lowerVectorShuffleAsPSHUFB(DL, MVT::v16i8, V1, V2, Mask,
+ DAG, V1InUse, V2InUse);
+
+ // If both V1 and V2 are in use and we can use a direct blend or an unpack,
+ // do so. This avoids using them to handle blends-with-zero which is
+ // important as a single pshufb is significantly faster for that.
+ if (V1InUse && V2InUse) {
+ if (Subtarget->hasSSE41())
+ if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i8, V1, V2,
+ Mask, Subtarget, DAG))
+ return Blend;
+
+ // We can use an unpack to do the blending rather than an or in some
+ // cases. Even though the or may be (very slightly) more efficient, we
+ // prefer this lowering because there are common cases where part of
+ // the complexity of the shuffles goes away when we do the final blend as
+ // an unpack.
+ // FIXME: It might be worth trying to detect if the unpack-feeding
+ // shuffles will both be pshufb, in which case we shouldn't bother with
+ // this.
+ if (SDValue Unpack =
+ lowerVectorShuffleAsUnpack(MVT::v16i8, DL, V1, V2, Mask, DAG))
+ return Unpack;
+ }
+
+ return PSHUFB;
}
// There are special ways we can lower some single-element blends.
Mask, Subtarget, DAG))
return V;
+ if (SDValue BitBlend =
+ lowerVectorShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
+ return BitBlend;
+
// Check whether a compaction lowering can be done. This handles shuffles
// which take every Nth element for some even N. See the helper function for
// details.
return Result;
}
- int V1LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
- int V1HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
- int V2LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
- int V2HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
+ // Handle multi-input cases by blending single-input shuffles.
+ if (NumV2Elements > 0)
+ return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2,
+ Mask, DAG);
- auto buildBlendMasks = [](MutableArrayRef<int> HalfMask,
- MutableArrayRef<int> V1HalfBlendMask,
- MutableArrayRef<int> V2HalfBlendMask) {
- for (int i = 0; i < 8; ++i)
- if (HalfMask[i] >= 0 && HalfMask[i] < 16) {
- V1HalfBlendMask[i] = HalfMask[i];
- HalfMask[i] = i;
- } else if (HalfMask[i] >= 16) {
- V2HalfBlendMask[i] = HalfMask[i] - 16;
- HalfMask[i] = i + 8;
- }
- };
- buildBlendMasks(LoMask, V1LoBlendMask, V2LoBlendMask);
- buildBlendMasks(HiMask, V1HiBlendMask, V2HiBlendMask);
+ // The fallback path for single-input shuffles widens this into two v8i16
+ // vectors with unpacks, shuffles those, and then pulls them back together
+ // with a pack.
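+ // E.g. (illustrative) each byte of V becomes an i16 lane (either by masking
+ // with 0x00FF or by unpacking against zero), the low and high halves of the
+ // mask are lowered as v8i16 shuffles, and PACKUS truncates the two results
+ // back into a single v16i8 vector.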
+ SDValue V = V1;
- SDValue Zero = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
+ int LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
+ int HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
+ for (int i = 0; i < 16; ++i)
+ if (Mask[i] >= 0)
+ (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
- auto buildLoAndHiV8s = [&](SDValue V, MutableArrayRef<int> LoBlendMask,
- MutableArrayRef<int> HiBlendMask) {
- SDValue V1, V2;
- // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
- // them out and avoid using UNPCK{L,H} to extract the elements of V as
- // i16s.
- if (std::none_of(LoBlendMask.begin(), LoBlendMask.end(),
- [](int M) { return M >= 0 && M % 2 == 1; }) &&
- std::none_of(HiBlendMask.begin(), HiBlendMask.end(),
- [](int M) { return M >= 0 && M % 2 == 1; })) {
- // Use a mask to drop the high bytes.
- V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V);
- V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, V1,
- DAG.getConstant(0x00FF, MVT::v8i16));
-
- // This will be a single vector shuffle instead of a blend so nuke V2.
- V2 = DAG.getUNDEF(MVT::v8i16);
-
- // Squash the masks to point directly into V1.
- for (int &M : LoBlendMask)
- if (M >= 0)
- M /= 2;
- for (int &M : HiBlendMask)
- if (M >= 0)
- M /= 2;
- } else {
- // Otherwise just unpack the low half of V into V1 and the high half into
- // V2 so that we can blend them as i16s.
- V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
- DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
- V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
- DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
- }
+ SDValue Zero = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
- SDValue BlendedLo = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, LoBlendMask);
- SDValue BlendedHi = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, HiBlendMask);
- return std::make_pair(BlendedLo, BlendedHi);
- };
- SDValue V1Lo, V1Hi, V2Lo, V2Hi;
- std::tie(V1Lo, V1Hi) = buildLoAndHiV8s(V1, V1LoBlendMask, V1HiBlendMask);
- std::tie(V2Lo, V2Hi) = buildLoAndHiV8s(V2, V2LoBlendMask, V2HiBlendMask);
+ SDValue VLoHalf, VHiHalf;
+ // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
+ // them out and avoid using UNPCK{L,H} to extract the elements of V as
+ // i16s.
+ if (std::none_of(std::begin(LoBlendMask), std::end(LoBlendMask),
+ [](int M) { return M >= 0 && M % 2 == 1; }) &&
+ std::none_of(std::begin(HiBlendMask), std::end(HiBlendMask),
+ [](int M) { return M >= 0 && M % 2 == 1; })) {
+ // Use a mask to drop the high bytes.
+ VLoHalf = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V);
+ VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
+ DAG.getConstant(0x00FF, MVT::v8i16));
+
+ // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
+ VHiHalf = DAG.getUNDEF(MVT::v8i16);
+
+ // Squash the masks to point directly into VLoHalf.
+ for (int &M : LoBlendMask)
+ if (M >= 0)
+ M /= 2;
+ for (int &M : HiBlendMask)
+ if (M >= 0)
+ M /= 2;
+ } else {
+ // Otherwise just unpack the low half of V into VLoHalf and the high half into
+ // VHiHalf so that we can blend them as i16s.
+ VLoHalf = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
+ DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
+ VHiHalf = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
+ DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
+ }
- SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, V1Lo, V2Lo, LoMask);
- SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, V1Hi, V2Hi, HiMask);
+ SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf,
+ LoBlendMask);
+ SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf,
+ HiBlendMask);
return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
}
return true;
}
-/// \brief Generic routine to split ector shuffle into half-sized shuffles.
+/// \brief Generic routine to split vector shuffle into half-sized shuffles.
///
/// This routine just extracts two subvectors, shuffles them independently, and
/// then concatenates them back together. This should work effectively with all
MVT ScalarVT = VT.getScalarType();
MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);
- SDValue LoV1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V1,
- DAG.getIntPtrConstant(0));
- SDValue HiV1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V1,
- DAG.getIntPtrConstant(SplitNumElements));
- SDValue LoV2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V2,
- DAG.getIntPtrConstant(0));
- SDValue HiV2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V2,
- DAG.getIntPtrConstant(SplitNumElements));
+ // Rather than splitting build-vectors, just build two narrower build
+ // vectors. This helps shuffling with splats and zeros.
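+ // E.g. (illustrative) splitting a v8i32 constant build_vector this way
+ // yields two v4i32 build_vectors that later lowering can still recognize
+ // as splats or zeros, rather than opaque EXTRACT_SUBVECTOR nodes.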
+ auto SplitVector = [&](SDValue V) {
+ while (V.getOpcode() == ISD::BITCAST)
+ V = V->getOperand(0);
+
+ MVT OrigVT = V.getSimpleValueType();
+ int OrigNumElements = OrigVT.getVectorNumElements();
+ int OrigSplitNumElements = OrigNumElements / 2;
+ MVT OrigScalarVT = OrigVT.getScalarType();
+ MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2);
+
+ SDValue LoV, HiV;
+
+ auto *BV = dyn_cast<BuildVectorSDNode>(V);
+ if (!BV) {
+ LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
+ DAG.getIntPtrConstant(0));
+ HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
+ DAG.getIntPtrConstant(OrigSplitNumElements));
+ } else {
+ SmallVector<SDValue, 16> LoOps, HiOps;
+ for (int i = 0; i < OrigSplitNumElements; ++i) {
+ LoOps.push_back(BV->getOperand(i));
+ HiOps.push_back(BV->getOperand(i + OrigSplitNumElements));
+ }
+ LoV = DAG.getNode(ISD::BUILD_VECTOR, DL, OrigSplitVT, LoOps);
+ HiV = DAG.getNode(ISD::BUILD_VECTOR, DL, OrigSplitVT, HiOps);
+ }
+ return std::make_pair(DAG.getNode(ISD::BITCAST, DL, SplitVT, LoV),
+ DAG.getNode(ISD::BITCAST, DL, SplitVT, HiV));
+ };
+
+ SDValue LoV1, HiV1, LoV2, HiV2;
+ std::tie(LoV1, HiV1) = SplitVector(V1);
+ std::tie(LoV2, HiV2) = SplitVector(V2);
// Now create two 4-way blends of these half-width vectors.
auto HalfBlend = [&](ArrayRef<int> HalfMask) {
VT.getVectorNumElements() / 2);
// Check for patterns which can be matched with a single insert of a 128-bit
// subvector.
- if (isShuffleEquivalent(Mask, 0, 1, 0, 1) ||
- isShuffleEquivalent(Mask, 0, 1, 4, 5)) {
+ if (isShuffleEquivalent(V1, V2, Mask, 0, 1, 0, 1) ||
+ isShuffleEquivalent(V1, V2, Mask, 0, 1, 4, 5)) {
SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
DAG.getIntPtrConstant(0));
SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
Mask[2] < 4 ? V1 : V2, DAG.getIntPtrConstant(0));
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
}
- if (isShuffleEquivalent(Mask, 0, 1, 6, 7)) {
+ if (isShuffleEquivalent(V1, V2, Mask, 0, 1, 6, 7)) {
SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
DAG.getIntPtrConstant(0));
SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
Mask, Subtarget, DAG))
return Broadcast;
+ // Use low duplicate instructions for masks that match their pattern.
+ if (isShuffleEquivalent(V1, V2, Mask, 0, 0, 2, 2))
+ return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
+
if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
// Non-half-crossing single input shuffles can be lowered with an
// interleaved permutation.
// X86 has dedicated unpack instructions that can handle specific blend
// operations: UNPCKH and UNPCKL.
- if (isShuffleEquivalent(Mask, 0, 4, 2, 6))
+ if (isShuffleEquivalent(V1, V2, Mask, 0, 4, 2, 6))
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f64, V1, V2);
- if (isShuffleEquivalent(Mask, 1, 5, 3, 7))
+ if (isShuffleEquivalent(V1, V2, Mask, 1, 5, 3, 7))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V1, V2);
+ if (isShuffleEquivalent(V1, V2, Mask, 4, 0, 6, 2))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f64, V2, V1);
+ if (isShuffleEquivalent(V1, V2, Mask, 5, 1, 7, 3))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V2, V1);
// If we have a single input to the zero element, insert that into V1 if we
// can do so cheaply.
DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, V1),
getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
}
-
- // Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(Mask, 0, 4, 2, 6))
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V1, V2);
- if (isShuffleEquivalent(Mask, 1, 5, 3, 7))
- return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V1, V2);
}
// AVX2 provides a direct instruction for permuting a single input across
return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
getV4X86ShuffleImm8ForMask(Mask, DAG));
+ // Try to use shift instructions.
+ if (SDValue Shift =
+ lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, DAG))
+ return Shift;
+
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (isShuffleEquivalent(V1, V2, Mask, 0, 4, 2, 6))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V1, V2);
+ if (isShuffleEquivalent(V1, V2, Mask, 1, 5, 3, 7))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V1, V2);
+ if (isShuffleEquivalent(V1, V2, Mask, 4, 0, 6, 2))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V2, V1);
+ if (isShuffleEquivalent(V1, V2, Mask, 5, 1, 7, 3))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V2, V1);
+
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle. However, if we have AVX2 and either inputs are already in place,
// we will be able to shuffle even across lanes the other input in a single
if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
assert(RepeatedMask.size() == 4 &&
"Repeated masks must be half the mask width!");
+
+ // Use even/odd duplicate instructions for masks that match their pattern.
+ if (isShuffleEquivalent(V1, V2, Mask, 0, 0, 2, 2, 4, 4, 6, 6))
+ return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
+ if (isShuffleEquivalent(V1, V2, Mask, 1, 1, 3, 3, 5, 5, 7, 7))
+ return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
+
if (isSingleInputShuffleMask(Mask))
return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
getV4X86ShuffleImm8ForMask(RepeatedMask, DAG));
// Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 4, 12, 5, 13))
+ if (isShuffleEquivalent(V1, V2, Mask, 0, 8, 1, 9, 4, 12, 5, 13))
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f32, V1, V2);
- if (isShuffleEquivalent(Mask, 2, 10, 3, 11, 6, 14, 7, 15))
+ if (isShuffleEquivalent(V1, V2, Mask, 2, 10, 3, 11, 6, 14, 7, 15))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f32, V1, V2);
+ if (isShuffleEquivalent(V1, V2, Mask, 8, 0, 9, 1, 12, 4, 13, 5))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f32, V2, V1);
+ if (isShuffleEquivalent(V1, V2, Mask, 10, 2, 11, 3, 14, 6, 15, 7))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f32, V2, V1);
// Otherwise, fall back to a SHUFPS sequence. Here it is important that we
// have already handled any direct blends. We also need to squash the
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
assert(Subtarget->hasAVX2() && "We can only lower v8i32 with AVX2!");
+ // Whenever we can lower this as a zext, that instruction is strictly faster
+ // than any alternative. It also allows us to fold memory operands into the
+ // shuffle in many cases.
+ if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2,
+ Mask, Subtarget, DAG))
+ return ZExt;
+
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
Subtarget, DAG))
return Blend;
getV4X86ShuffleImm8ForMask(RepeatedMask, DAG));
// Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 4, 12, 5, 13))
+ if (isShuffleEquivalent(V1, V2, Mask, 0, 8, 1, 9, 4, 12, 5, 13))
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i32, V1, V2);
- if (isShuffleEquivalent(Mask, 2, 10, 3, 11, 6, 14, 7, 15))
+ if (isShuffleEquivalent(V1, V2, Mask, 2, 10, 3, 11, 6, 14, 7, 15))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i32, V1, V2);
+ if (isShuffleEquivalent(V1, V2, Mask, 8, 0, 9, 1, 12, 4, 13, 5))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i32, V2, V1);
+ if (isShuffleEquivalent(V1, V2, Mask, 10, 2, 11, 3, 14, 6, 15, 7))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i32, V2, V1);
}
+ // Try to use shift instructions.
+ if (SDValue Shift =
+ lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, DAG))
+ return Shift;
+
+ if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
+ DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
+ return Rotate;
+
// If the shuffle patterns aren't repeated but it is a single input, directly
// generate a cross-lane VPERMD instruction.
if (isSingleInputShuffleMask(Mask)) {
assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
assert(Subtarget->hasAVX2() && "We can only lower v16i16 with AVX2!");
+ // Whenever we can lower this as a zext, that instruction is strictly faster
+ // than any alternative. It also allows us to fold memory operands into the
+ // shuffle in many cases.
+ if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v16i16, V1, V2,
+ Mask, Subtarget, DAG))
+ return ZExt;
+
// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v16i16, DL, V1,
Mask, Subtarget, DAG))
return Blend;
// Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(Mask,
+ if (isShuffleEquivalent(V1, V2, Mask,
// First 128-bit lane:
0, 16, 1, 17, 2, 18, 3, 19,
// Second 128-bit lane:
8, 24, 9, 25, 10, 26, 11, 27))
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i16, V1, V2);
- if (isShuffleEquivalent(Mask,
+ if (isShuffleEquivalent(V1, V2, Mask,
// First 128-bit lane:
4, 20, 5, 21, 6, 22, 7, 23,
// Second 128-bit lane:
12, 28, 13, 29, 14, 30, 15, 31))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i16, V1, V2);
+ // Try to use shift instructions.
+ if (SDValue Shift =
+ lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, DAG))
+ return Shift;
+
+ // Try to use byte rotation instructions.
+ if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
+ DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
+ return Rotate;
+
if (isSingleInputShuffleMask(Mask)) {
// There are no generalized cross-lane shuffle operations available on i16
// element types.
assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
assert(Subtarget->hasAVX2() && "We can only lower v32i8 with AVX2!");
+ // Whenever we can lower this as a zext, that instruction is strictly faster
+ // than any alternative. It also allows us to fold memory operands into the
+ // shuffle in many cases.
+ if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2,
+ Mask, Subtarget, DAG))
+ return ZExt;
+
// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v32i8, DL, V1,
Mask, Subtarget, DAG))
// Note that these are repeated 128-bit lane unpacks, not unpacks across all
// 256-bit lanes.
if (isShuffleEquivalent(
- Mask,
+ V1, V2, Mask,
// First 128-bit lane:
0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39,
// Second 128-bit lane:
16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55))
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v32i8, V1, V2);
if (isShuffleEquivalent(
- Mask,
+ V1, V2, Mask,
// First 128-bit lane:
8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47,
// Second 128-bit lane:
24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v32i8, V1, V2);
+ // Try to use shift instructions.
+ if (SDValue Shift =
+ lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, DAG))
+ return Shift;
+
+ // Try to use byte rotation instructions.
+ if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
+ DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
+ return Rotate;
+
if (isSingleInputShuffleMask(Mask)) {
// There are no generalized cross-lane shuffle operations available on i8
// element types.
ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
+ // X86 has dedicated unpack instructions that can handle specific blend
+ // operations: UNPCKH and UNPCKL.
+ if (isShuffleEquivalent(V1, V2, Mask, 0, 8, 2, 10, 4, 12, 6, 14))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f64, V1, V2);
+ if (isShuffleEquivalent(V1, V2, Mask, 1, 9, 3, 11, 5, 13, 7, 15))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f64, V1, V2);
+
// FIXME: Implement direct support for this type!
return splitAndLowerVectorShuffle(DL, MVT::v8f64, V1, V2, Mask, DAG);
}
ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (isShuffleEquivalent(V1, V2, Mask,
+ 0, 16, 1, 17, 4, 20, 5, 21,
+ 8, 24, 9, 25, 12, 28, 13, 29))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16f32, V1, V2);
+ if (isShuffleEquivalent(V1, V2, Mask,
+ 2, 18, 3, 19, 6, 22, 7, 23,
+ 10, 26, 11, 27, 14, 30, 15, 31))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16f32, V1, V2);
+
// FIXME: Implement direct support for this type!
return splitAndLowerVectorShuffle(DL, MVT::v16f32, V1, V2, Mask, DAG);
}
ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
+ // X86 has dedicated unpack instructions that can handle specific blend
+ // operations: UNPCKH and UNPCKL.
+ if (isShuffleEquivalent(V1, V2, Mask, 0, 8, 2, 10, 4, 12, 6, 14))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i64, V1, V2);
+ if (isShuffleEquivalent(V1, V2, Mask, 1, 9, 3, 11, 5, 13, 7, 15))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i64, V1, V2);
+
// FIXME: Implement direct support for this type!
return splitAndLowerVectorShuffle(DL, MVT::v8i64, V1, V2, Mask, DAG);
}
ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (isShuffleEquivalent(V1, V2, Mask,
+ 0, 16, 1, 17, 4, 20, 5, 21,
+ 8, 24, 9, 25, 12, 28, 13, 29))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i32, V1, V2);
+ if (isShuffleEquivalent(V1, V2, Mask,
+ 2, 18, 3, 19, 6, 22, 7, 23,
+ 10, 26, 11, 27, 14, 30, 15, 31))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i32, V1, V2);
+
// FIXME: Implement direct support for this type!
return splitAndLowerVectorShuffle(DL, MVT::v16i32, V1, V2, Mask, DAG);
}
return DAG.getVectorShuffle(VT, dl, V1, V2, NewMask);
}
+ // We actually see shuffles that are entirely re-arrangements of a set of
+ // zero inputs. This mostly happens while decomposing complex shuffles into
+ // simple ones. Directly lower these as a buildvector of zeros.
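+ // E.g. (illustrative) a decomposition step can leave a shuffle whose every
+ // lane selects from a known-zero input lane; the result is simply zero.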
+ SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+ if (Zeroable.all())
+ return getZeroVector(VT, Subtarget, DAG, dl);
+
// Try to collapse shuffles into using a vector type with fewer elements but
// wider element types. We cap this to not form integers or floating point
// elements wider than 64 bits, but it might be interesting to form i128
unsigned NumLanes = (NumElems - 1) / 8 + 1;
unsigned NumElemsInLane = NumElems / NumLanes;
- // Blend for v16i16 should be symetric for the both lanes.
+ // Blend for v16i16 should be symmetric for both lanes.
for (unsigned i = 0; i < NumElemsInLane; ++i) {
int SndLaneEltIdx = (NumLanes == 2) ? MaskVals[i + NumElemsInLane] : -1;
SDValue getMOVDDup(SDValue &Op, SDLoc &dl, SDValue V1, SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
- // Canonizalize to v2f64.
+ // Canonicalize to v2f64.
V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1);
return DAG.getNode(ISD::BITCAST, dl, VT,
getTargetShuffleNode(X86ISD::MOVDDUP, dl, MVT::v2f64,
if (HasSSE2 && VT == MVT::v2f64)
return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG);
- // v4f32 or v4i32: canonizalized to v4f32 (which is legal for SSE1)
+ // v4f32 or v4i32: canonicalize to v4f32 (which is legal for SSE1)
return DAG.getNode(ISD::BITCAST, dl, VT,
getTargetShuffleNode(X86ISD::MOVLHPS, dl, MVT::v4f32,
DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V1),
bool HasFp256 = Subtarget->hasFp256();
bool HasInt256 = Subtarget->hasInt256();
MachineFunction &MF = DAG.getMachineFunction();
- bool OptForSize = MF.getFunction()->getAttributes().
- hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
+ bool OptForSize =
+ MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize);
// Check if we should use the experimental vector shuffle lowering. If so,
// delegate completely to that code path.
DAG);
unsigned MaskValue;
- if (isBlendMask(M, VT, Subtarget->hasSSE41(), Subtarget->hasInt256(),
- &MaskValue))
+ if (isBlendMask(M, VT, Subtarget->hasSSE41(), HasInt256, &MaskValue))
return LowerVECTOR_SHUFFLEtoBlend(SVOp, MaskValue, Subtarget, DAG);
if (isSHUFPMask(M, VT))
return NewOp;
}
- if (VT == MVT::v16i16 && Subtarget->hasInt256()) {
+ if (VT == MVT::v16i16 && HasInt256) {
SDValue NewOp = LowerVECTOR_SHUFFLEv16i16(Op, DAG);
if (NewOp.getNode())
return NewOp;
return true;
}
-/// \brief Try to lower a VSELECT instruction to an immediate-controlled blend
-/// instruction.
-static SDValue lowerVSELECTtoBLENDI(SDValue Op, const X86Subtarget *Subtarget,
- SelectionDAG &DAG) {
+/// \brief Try to lower a VSELECT instruction to a vector shuffle.
+static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
SDValue Cond = Op.getOperand(0);
SDValue LHS = Op.getOperand(1);
SDValue RHS = Op.getOperand(2);
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
- MVT EltVT = VT.getVectorElementType();
- unsigned NumElems = VT.getVectorNumElements();
-
- // There is no blend with immediate in AVX-512.
- if (VT.is512BitVector())
- return SDValue();
-
- if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
- return SDValue();
- if (!Subtarget->hasInt256() && VT == MVT::v16i16)
- return SDValue();
if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
return SDValue();
+ auto *CondBV = cast<BuildVectorSDNode>(Cond);
- // Check the mask for BLEND and build the value.
- unsigned MaskValue = 0;
- if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
- return SDValue();
-
- // Convert i32 vectors to floating point if it is not AVX2.
- // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors.
- MVT BlendVT = VT;
- if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
- BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()),
- NumElems);
- LHS = DAG.getNode(ISD::BITCAST, dl, VT, LHS);
- RHS = DAG.getNode(ISD::BITCAST, dl, VT, RHS);
+ // Only non-legal VSELECTs reach this lowering, convert those into generic
+ // shuffles and re-use the shuffle lowering path for blends.
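+ // A true (all-ones) condition element selects the corresponding LHS
+ // element, a zero element selects the RHS element, and a non-constant
+ // element becomes undef; e.g. a v4i32 condition of <-1,0,-1,0> produces
+ // the shuffle mask <0,5,2,7>.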
+ SmallVector<int, 32> Mask;
+ for (int i = 0, Size = VT.getVectorNumElements(); i < Size; ++i) {
+ SDValue CondElt = CondBV->getOperand(i);
+ Mask.push_back(
+ isa<ConstantSDNode>(CondElt) ? i + (isZero(CondElt) ? Size : 0) : -1);
}
-
- SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, LHS, RHS,
- DAG.getConstant(MaskValue, MVT::i32));
- return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
+ return DAG.getVectorShuffle(VT, dl, LHS, RHS, Mask);
}
SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
return SDValue();
- SDValue BlendOp = lowerVSELECTtoBLENDI(Op, Subtarget, DAG);
+ // Try to lower this to a blend-style vector shuffle. This can handle all
+ // constant condition cases.
+ SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG);
if (BlendOp.getNode())
return BlendOp;
+ // Variable blends are only legal from SSE4.1 onward.
+ if (!Subtarget->hasSSE41())
+ return SDValue();
+
// Some types for vselect were previously set to Expand, not Legal or
// Custom. Return an empty SDValue so we fall through to Expand, after
// the Custom lowering phase.
MVT EltVT = Op.getSimpleValueType();
assert((EltVT == MVT::i1) && "Unexpected operands in ExtractBitFromMaskVector");
+ assert((VecVT.getVectorNumElements() <= 16 || Subtarget->hasBWI()) &&
+ "Unexpected vector type in ExtractBitFromMaskVector");
// A variable index can't be handled in mask registers,
// so extend the vector to VR512.
unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
const TargetRegisterClass* rc = getRegClassFor(VecVT);
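+ // Without DQI there are no shifts for 8-bit mask registers, so perform
+ // the shift below in a 16-bit mask register class instead.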
+ if (!Subtarget->hasDQI() && (VecVT.getVectorNumElements() <= 8))
+ rc = getRegClassFor(MVT::v16i1);
unsigned MaxSift = rc->getSize()*8 - 1;
Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
DAG.getConstant(MaxSift - IdxVal, MVT::i8));
// the upper bits of a vector.
static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
- if (Subtarget->hasFp256()) {
- SDLoc dl(Op.getNode());
- SDValue Vec = Op.getNode()->getOperand(0);
- SDValue SubVec = Op.getNode()->getOperand(1);
- SDValue Idx = Op.getNode()->getOperand(2);
-
- if ((Op.getNode()->getSimpleValueType(0).is256BitVector() ||
- Op.getNode()->getSimpleValueType(0).is512BitVector()) &&
- SubVec.getNode()->getSimpleValueType(0).is128BitVector() &&
- isa<ConstantSDNode>(Idx)) {
- unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
- return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
- }
+ if (!Subtarget->hasAVX())
+ return SDValue();
- if (Op.getNode()->getSimpleValueType(0).is512BitVector() &&
- SubVec.getNode()->getSimpleValueType(0).is256BitVector() &&
- isa<ConstantSDNode>(Idx)) {
- unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
- return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl);
+ SDLoc dl(Op);
+ SDValue Vec = Op.getOperand(0);
+ SDValue SubVec = Op.getOperand(1);
+ SDValue Idx = Op.getOperand(2);
+
+ if (!isa<ConstantSDNode>(Idx))
+ return SDValue();
+
+ unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+ MVT OpVT = Op.getSimpleValueType();
+ MVT SubVecVT = SubVec.getSimpleValueType();
+
+ // Fold two 16-byte subvector loads into one 32-byte load:
+ // (insert_subvector (insert_subvector undef, (load addr), 0),
+ // (load addr + 16), Elts/2)
+ // --> load32 addr
+ if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
+ Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
+ OpVT.is256BitVector() && SubVecVT.is128BitVector() &&
+ !Subtarget->isUnalignedMem32Slow()) {
+ SDValue SubVec2 = Vec.getOperand(1);
+ if (auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2))) {
+ if (Idx2->getZExtValue() == 0) {
+ SDValue Ops[] = { SubVec2, SubVec };
+ SDValue LD = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false);
+ if (LD.getNode())
+ return LD;
+ }
}
}
+
+ if ((OpVT.is256BitVector() || OpVT.is512BitVector()) &&
+ SubVecVT.is128BitVector())
+ return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
+
+ if (OpVT.is512BitVector() && SubVecVT.is256BitVector())
+ return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl);
+
return SDValue();
}
SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1);
// Next, clear the sign bit from the first operand (magnitude).
- CV[0] = ConstantFP::get(
- *Context, APFloat(Sem, APInt::getLowBitsSet(SizeInBits, SizeInBits - 1)));
+ // If it's a constant, we can clear it here.
+ if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Op0)) {
+ APFloat APF = Op0CN->getValueAPF();
+ // If the magnitude is a positive zero, the sign bit alone is enough.
+ if (APF.isPosZero())
+ return SignBit;
+ APF.clearSign();
+ CV[0] = ConstantFP::get(*Context, APF);
+ } else {
+ CV[0] = ConstantFP::get(
+ *Context,
+ APFloat(Sem, APInt::getLowBitsSet(SizeInBits, SizeInBits - 1)));
+ }
C = ConstantVector::get(CV);
CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(), 16);
- SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
- MachinePointerInfo::getConstantPool(),
- false, false, false, 16);
- SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2);
+ SDValue Val = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
+ MachinePointerInfo::getConstantPool(),
+ false, false, false, 16);
+ // If the magnitude operand wasn't a constant, we need to AND out the sign.
+ if (!isa<ConstantFPSDNode>(Op0))
+ Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Val);
// OR the magnitude value with the sign bit.
return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit);
/// equivalent.
SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl,
SelectionDAG &DAG) const {
- if (Op.getValueType() == MVT::i1)
- // KORTEST instruction should be selected
- return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
- DAG.getConstant(0, Op.getValueType()));
-
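+ // Widen an i1 value to i8 so a plain integer compare against zero can be
+ // used below.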
+ if (Op.getValueType() == MVT::i1) {
+ SDValue ExtOp = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, Op);
+ return DAG.getNode(X86ISD::CMP, dl, MVT::i32, ExtOp,
+ DAG.getConstant(0, MVT::i8));
+ }
// CF and OF aren't always set the way we want. Determine which
// of these we need.
bool NeedCF = false;
DAG.getConstant(0, Op.getValueType()));
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
- SmallVector<SDValue, 4> Ops;
- for (unsigned i = 0; i != NumOperands; ++i)
- Ops.push_back(Op.getOperand(i));
+ SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
DAG.ReplaceAllUsesWith(Op, New);
// if we're optimizing for size, however, as that'll allow better folding
// of memory operations.
if (Op0.getValueType() != MVT::i32 && Op0.getValueType() != MVT::i64 &&
- !DAG.getMachineFunction().getFunction()->getAttributes().hasAttribute(
- AttributeSet::FunctionIndex, Attribute::MinSize) &&
+ !DAG.getMachineFunction().getFunction()->hasFnAttribute(
+ Attribute::MinSize) &&
!Subtarget->isAtom()) {
unsigned ExtendOp =
isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
// may emit an illegal shuffle but the expansion is still better than scalar
// code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
// we'll emit a shuffle and an arithmetic shift.
+// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
// TODO: It is possible to support ZExt by zeroing the undef values during
// the shuffle phase or after the shuffle.
static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget,
// Attempt to load the original value using scalar loads.
// Find the largest scalar type that divides the total loaded size.
MVT SclrLoadTy = MVT::i8;
- for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
- tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
- MVT Tp = (MVT::SimpleValueType)tp;
+ for (MVT Tp : MVT::integer_valuetypes()) {
if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
SclrLoadTy = Tp;
}
SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
Chain = SP.getValue(1);
unsigned Align = cast<ConstantSDNode>(Tmp3)->getZExtValue();
- const TargetFrameLowering &TFI = *DAG.getSubtarget().getFrameLowering();
+ const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
unsigned StackAlign = TFI.getStackAlignment();
Tmp1 = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
if (Align > StackAlign)
Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag);
- const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
- DAG.getSubtarget().getRegisterInfo());
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
unsigned SPReg = RegInfo->getStackRegister();
SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
Chain = SP.getValue(1);
if (ArgMode == 2) {
// Sanity Check: Make sure using fp_offset makes sense.
assert(!DAG.getTarget().Options.UseSoftFloat &&
- !(DAG.getMachineFunction()
- .getFunction()->getAttributes()
- .hasAttribute(AttributeSet::FunctionIndex,
- Attribute::NoImplicitFloat)) &&
+ !(DAG.getMachineFunction().getFunction()->hasFnAttribute(
+ Attribute::NoImplicitFloat)) &&
Subtarget->hasSSE1());
}
}
const X86Subtarget &Subtarget =
- DAG.getTarget().getSubtarget<X86Subtarget>();
+ static_cast<const X86Subtarget &>(DAG.getSubtarget());
if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
// Let the shuffle legalizer expand this shift amount node.
/// The mask comes in as MVT::i8 and should be truncated
/// to MVT::i1 while lowering masking intrinsics.
/// The main difference between ScalarMaskingNode and VectorMaskingNode is using
-/// "X86select" instead of "vselect". We just can't create the "vselect" node for
+/// "X86select" instead of "vselect". We just can't create the "vselect" node for
/// a scalar instruction.
static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
SDValue PreservedSrc,
return DAG.getNode(X86ISD::SELECT, dl, VT, IMask, Op, PreservedSrc);
}
-static unsigned getOpcodeForFMAIntrinsic(unsigned IntNo) {
- switch (IntNo) {
- default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
- case Intrinsic::x86_fma_vfmadd_ps:
- case Intrinsic::x86_fma_vfmadd_pd:
- case Intrinsic::x86_fma_vfmadd_ps_256:
- case Intrinsic::x86_fma_vfmadd_pd_256:
- case Intrinsic::x86_fma_mask_vfmadd_ps_512:
- case Intrinsic::x86_fma_mask_vfmadd_pd_512:
- return X86ISD::FMADD;
- case Intrinsic::x86_fma_vfmsub_ps:
- case Intrinsic::x86_fma_vfmsub_pd:
- case Intrinsic::x86_fma_vfmsub_ps_256:
- case Intrinsic::x86_fma_vfmsub_pd_256:
- case Intrinsic::x86_fma_mask_vfmsub_ps_512:
- case Intrinsic::x86_fma_mask_vfmsub_pd_512:
- return X86ISD::FMSUB;
- case Intrinsic::x86_fma_vfnmadd_ps:
- case Intrinsic::x86_fma_vfnmadd_pd:
- case Intrinsic::x86_fma_vfnmadd_ps_256:
- case Intrinsic::x86_fma_vfnmadd_pd_256:
- case Intrinsic::x86_fma_mask_vfnmadd_ps_512:
- case Intrinsic::x86_fma_mask_vfnmadd_pd_512:
- return X86ISD::FNMADD;
- case Intrinsic::x86_fma_vfnmsub_ps:
- case Intrinsic::x86_fma_vfnmsub_pd:
- case Intrinsic::x86_fma_vfnmsub_ps_256:
- case Intrinsic::x86_fma_vfnmsub_pd_256:
- case Intrinsic::x86_fma_mask_vfnmsub_ps_512:
- case Intrinsic::x86_fma_mask_vfnmsub_pd_512:
- return X86ISD::FNMSUB;
- case Intrinsic::x86_fma_vfmaddsub_ps:
- case Intrinsic::x86_fma_vfmaddsub_pd:
- case Intrinsic::x86_fma_vfmaddsub_ps_256:
- case Intrinsic::x86_fma_vfmaddsub_pd_256:
- case Intrinsic::x86_fma_mask_vfmaddsub_ps_512:
- case Intrinsic::x86_fma_mask_vfmaddsub_pd_512:
- return X86ISD::FMADDSUB;
- case Intrinsic::x86_fma_vfmsubadd_ps:
- case Intrinsic::x86_fma_vfmsubadd_pd:
- case Intrinsic::x86_fma_vfmsubadd_ps_256:
- case Intrinsic::x86_fma_vfmsubadd_pd_256:
- case Intrinsic::x86_fma_mask_vfmsubadd_ps_512:
- case Intrinsic::x86_fma_mask_vfmsubadd_pd_512:
- return X86ISD::FMSUBADD;
- }
-}
-
static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
SDLoc dl(Op);
SDValue Src2 = Op.getOperand(2);
SDValue Src0 = Op.getOperand(3);
SDValue Mask = Op.getOperand(4);
- SDValue RoundingMode = Op.getOperand(5);
+ // There are 2 kinds of intrinsics in this group:
+ // (1) With suppress-all-exceptions (sae) - 6 operands
+ // (2) With rounding mode and sae - 7 operands.
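+ // In both forms operands 1-4 are (src1, src2, src0, mask), so operand 5
+ // is either the sae value or the rounding mode.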
+ if (Op.getNumOperands() == 6) {
+ SDValue Sae = Op.getOperand(5);
+ return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
+ Sae),
+ Mask, Src0, Subtarget, DAG);
+ }
+ assert(Op.getNumOperands() == 7 && "Unexpected intrinsic form");
+ SDValue RoundingMode = Op.getOperand(5);
+ SDValue Sae = Op.getOperand(6);
return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
- RoundingMode),
+ RoundingMode, Sae),
Mask, Src0, Subtarget, DAG);
}
case INTR_TYPE_2OP_MASK: {
- return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Op.getOperand(1),
- Op.getOperand(2)),
- Op.getOperand(4), Op.getOperand(3), Subtarget, DAG);
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue PassThru = Op.getOperand(3);
+ SDValue Mask = Op.getOperand(4);
+ // We specify 2 possible opcodes for intrinsics with rounding modes.
+ // First, we check if the intrinsic may have non-default rounding mode,
+ // (IntrData->Opc1 != 0), then we check the rounding mode operand.
+ unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
+ if (IntrWithRoundingModeOpcode != 0) {
+ SDValue Rnd = Op.getOperand(5);
+ unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
+ if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) {
+ return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
+ dl, Op.getValueType(),
+ Src1, Src2, Rnd),
+ Mask, PassThru, Subtarget, DAG);
+ }
+ }
+ return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
+ Src1,Src2),
+ Mask, PassThru, Subtarget, DAG);
+ }
+ case FMA_OP_MASK: {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue Src3 = Op.getOperand(3);
+ SDValue Mask = Op.getOperand(4);
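+ // For the masked forms, lanes that are masked off keep the value of Src1,
+ // so Src1 doubles as the pass-through operand below.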
+ // We specify 2 possible opcodes for intrinsics with rounding modes.
+ // First, we check if the intrinsic may have non-default rounding mode,
+ // (IntrData->Opc1 != 0), then we check the rounding mode operand.
+ unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
+ if (IntrWithRoundingModeOpcode != 0) {
+ SDValue Rnd = Op.getOperand(5);
+ if (cast<ConstantSDNode>(Rnd)->getZExtValue() !=
+ X86::STATIC_ROUNDING::CUR_DIRECTION)
+ return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
+ dl, Op.getValueType(),
+ Src1, Src2, Src3, Rnd),
+ Mask, Src1, Subtarget, DAG);
+ }
+ return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
+ dl, Op.getValueType(),
+ Src1, Src2, Src3),
+ Mask, Src1, Subtarget, DAG);
}
case CMP_MASK:
case CMP_MASK_CC: {
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
return DAG.getNode(Opcode, dl, VTs, NewOps);
}
-
- case Intrinsic::x86_fma_mask_vfmadd_ps_512:
- case Intrinsic::x86_fma_mask_vfmadd_pd_512:
- case Intrinsic::x86_fma_mask_vfmsub_ps_512:
- case Intrinsic::x86_fma_mask_vfmsub_pd_512:
- case Intrinsic::x86_fma_mask_vfnmadd_ps_512:
- case Intrinsic::x86_fma_mask_vfnmadd_pd_512:
- case Intrinsic::x86_fma_mask_vfnmsub_ps_512:
- case Intrinsic::x86_fma_mask_vfnmsub_pd_512:
- case Intrinsic::x86_fma_mask_vfmaddsub_ps_512:
- case Intrinsic::x86_fma_mask_vfmaddsub_pd_512:
- case Intrinsic::x86_fma_mask_vfmsubadd_ps_512:
- case Intrinsic::x86_fma_mask_vfmsubadd_pd_512: {
- auto *SAE = cast<ConstantSDNode>(Op.getOperand(5));
- if (SAE->getZExtValue() == X86::STATIC_ROUNDING::CUR_DIRECTION)
- return getVectorMaskingNode(DAG.getNode(getOpcodeForFMAIntrinsic(IntNo),
- dl, Op.getValueType(),
- Op.getOperand(1),
- Op.getOperand(2),
- Op.getOperand(3)),
- Op.getOperand(4), Op.getOperand(1),
- Subtarget, DAG);
- else
- return SDValue();
- }
-
- case Intrinsic::x86_fma_vfmadd_ps:
- case Intrinsic::x86_fma_vfmadd_pd:
- case Intrinsic::x86_fma_vfmsub_ps:
- case Intrinsic::x86_fma_vfmsub_pd:
- case Intrinsic::x86_fma_vfnmadd_ps:
- case Intrinsic::x86_fma_vfnmadd_pd:
- case Intrinsic::x86_fma_vfnmsub_ps:
- case Intrinsic::x86_fma_vfnmsub_pd:
- case Intrinsic::x86_fma_vfmaddsub_ps:
- case Intrinsic::x86_fma_vfmaddsub_pd:
- case Intrinsic::x86_fma_vfmsubadd_ps:
- case Intrinsic::x86_fma_vfmsubadd_pd:
- case Intrinsic::x86_fma_vfmadd_ps_256:
- case Intrinsic::x86_fma_vfmadd_pd_256:
- case Intrinsic::x86_fma_vfmsub_ps_256:
- case Intrinsic::x86_fma_vfmsub_pd_256:
- case Intrinsic::x86_fma_vfnmadd_ps_256:
- case Intrinsic::x86_fma_vfnmadd_pd_256:
- case Intrinsic::x86_fma_vfnmsub_ps_256:
- case Intrinsic::x86_fma_vfnmsub_pd_256:
- case Intrinsic::x86_fma_vfmaddsub_ps_256:
- case Intrinsic::x86_fma_vfmaddsub_pd_256:
- case Intrinsic::x86_fma_vfmsubadd_ps_256:
- case Intrinsic::x86_fma_vfmsubadd_pd_256:
- return DAG.getNode(getOpcodeForFMAIntrinsic(IntNo), dl, Op.getValueType(),
- Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
}
}
if (Depth > 0) {
SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
- const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
- DAG.getSubtarget().getRegisterInfo());
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), PtrVT);
return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
DAG.getNode(ISD::ADD, dl, PtrVT,
}
SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
- MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
+ EVT VT = Op.getValueType();
+
MFI->setFrameAddressIsTaken(true);
- EVT VT = Op.getValueType();
+ if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
+ // Depth > 0 makes no sense on targets which use Windows unwind codes; it
+ // is not possible to walk up the stack without consulting the unwind codes
+ // at each frame.
+ int FrameAddrIndex = FuncInfo->getFAIndex();
+ if (!FrameAddrIndex) {
+ // Set up a frame object for the return address.
+ unsigned SlotSize = RegInfo->getSlotSize();
+ FrameAddrIndex = MF.getFrameInfo()->CreateFixedObject(
+ SlotSize, /*Offset=*/INT64_MIN, /*IsImmutable=*/false);
+ FuncInfo->setFAIndex(FrameAddrIndex);
+ }
+ return DAG.getFrameIndex(FrameAddrIndex, VT);
+ }
+
+ unsigned FrameReg =
+ RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
SDLoc dl(Op); // FIXME probably not meaningful
unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
- const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
- DAG.getSubtarget().getRegisterInfo());
- unsigned FrameReg = RegInfo->getPtrSizedFrameRegister(
- DAG.getMachineFunction());
assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
(FrameReg == X86::EBP && VT == MVT::i32)) &&
"Invalid Frame Register!");
SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
SelectionDAG &DAG) const {
- const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
- DAG.getSubtarget().getRegisterInfo());
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize());
}
SDLoc dl (Op);
EVT PtrVT = getPointerTy();
- const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
- DAG.getSubtarget().getRegisterInfo());
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
(FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
SDLoc dl (Op);
const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
- const TargetRegisterInfo *TRI = DAG.getSubtarget().getRegisterInfo();
+ const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
if (Subtarget->is64Bit()) {
SDValue OutChains[6];
*/
MachineFunction &MF = DAG.getMachineFunction();
- const TargetMachine &TM = MF.getTarget();
- const TargetFrameLowering &TFI = *TM.getSubtargetImpl()->getFrameLowering();
+ const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
unsigned StackAlignment = TFI.getStackAlignment();
MVT VT = Op.getSimpleValueType();
SDLoc DL(Op);
/// Used to know whether to use cmpxchg8/16b when expanding atomic operations
/// (otherwise we leave them alone to become __sync_fetch_and_... calls).
bool X86TargetLowering::needsCmpXchgNb(const Type *MemType) const {
- const X86Subtarget &Subtarget =
- getTargetMachine().getSubtarget<X86Subtarget>();
unsigned OpWidth = MemType->getPrimitiveSizeInBits();
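// e.g. a 64-bit cmpxchg on a 32-bit target requires cmpxchg8b, and a
// 128-bit cmpxchg on x86-64 requires cmpxchg16b.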
if (OpWidth == 64)
- return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
+ return !Subtarget->is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
else if (OpWidth == 128)
- return Subtarget.hasCmpxchg16b();
+ return Subtarget->hasCmpxchg16b();
else
return false;
}
}
bool X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
- const X86Subtarget &Subtarget =
- getTargetMachine().getSubtarget<X86Subtarget>();
- unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
+ unsigned NativeWidth = Subtarget->is64Bit() ? 64 : 32;
const Type *MemType = AI->getType();
// If the operand is too big, we must see if cmpxchg8/16b is available
LoadInst *
X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
- const X86Subtarget &Subtarget =
- getTargetMachine().getSubtarget<X86Subtarget>();
- unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
+ unsigned NativeWidth = Subtarget->is64Bit() ? 64 : 32;
const Type *MemType = AI->getType();
// Accesses larger than the native width are turned into cmpxchg/libcalls, so
// there is no benefit in turning such RMWs into loads, and it is actually
// FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
// the IR level, so we must wrap it in an intrinsic.
return nullptr;
- } else if (hasMFENCE(Subtarget)) {
+ } else if (hasMFENCE(*Subtarget)) {
Function *MFence = llvm::Intrinsic::getDeclaration(M,
Intrinsic::x86_sse2_mfence);
Builder.CreateCall(MFence);
DAG.getIntPtrConstant(i)));
// Explicitly mark the extra elements as Undef.
- SDValue Undef = DAG.getUNDEF(SVT);
- for (unsigned i = NumElts, e = NumElts * 2; i != e; ++i)
- Elts.push_back(Undef);
+ Elts.append(NumElts, DAG.getUNDEF(SVT));
EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Elts);
switch (N->getOpcode()) {
default:
llvm_unreachable("Do not know how to custom type legalize this operation!");
+ // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
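+ // Each operand is widened by concatenating it with undef, e.g.
+ // (v2f32 fmin X, Y) -> (v4f32 fmin (concat X, undef), (concat Y, undef)).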
+ case X86ISD::FMINC:
+ case X86ISD::FMIN:
+ case X86ISD::FMAXC:
+ case X86ISD::FMAX: {
+ EVT VT = N->getValueType(0);
+ if (VT != MVT::v2f32)
+ llvm_unreachable("Unexpected type (!= v2f32) on FMIN/FMAX.");
+ SDValue UNDEF = DAG.getUNDEF(VT);
+ SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
+ N->getOperand(0), UNDEF);
+ SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
+ N->getOperand(1), UNDEF);
+ Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
+ return;
+ }
case ISD::SIGN_EXTEND_INREG:
case ISD::ADDC:
case ISD::ADDE:
case X86ISD::XTEST: return "X86ISD::XTEST";
case X86ISD::COMPRESS: return "X86ISD::COMPRESS";
case X86ISD::EXPAND: return "X86ISD::EXPAND";
+ case X86ISD::SELECT: return "X86ISD::SELECT";
+ case X86ISD::ADDSUB: return "X86ISD::ADDSUB";
+ case X86ISD::RCP28: return "X86ISD::RCP28";
+ case X86ISD::RSQRT28: return "X86ISD::RSQRT28";
+ case X86ISD::FADD_RND: return "X86ISD::FADD_RND";
+ case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND";
+ case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND";
+ case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND";
}
}
return false;
}
+bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; }
+
bool
X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
if (!(Subtarget->hasFMA() || Subtarget->hasFMA4()))
if (VT.getSizeInBits() == 64)
return false;
+ // This is an experimental legality test that is tailored to match the
+ // legality test of the experimental lowering more closely. They are gated
+ // separately to ease testing of performance differences.
+ if (ExperimentalVectorShuffleLegality)
+ // We only care that the types being shuffled are legal. The lowering can
+ // handle any possible shuffle mask that results.
+ return isTypeLegal(SVT);
+
// If this is a single-input shuffle with no 128 bit lane crossings we can
// lower it into pshufb.
if ((SVT.is128BitVector() && Subtarget->hasSSSE3()) ||
return false;
MVT SVT = VT.getSimpleVT();
+
+ // This is an experimental legality test that is tailored to match the
+ // legality test of the experimental lowering more closely. They are gated
+ // separately to ease testing of performance differences.
+ if (ExperimentalVectorShuffleLegality)
+ // The new vector shuffle lowering is very good at managing zero-inputs.
+ return isShuffleMaskLegal(Mask, VT);
+
unsigned NumElts = SVT.getVectorNumElements();
// FIXME: This collection of masks seems suspect.
if (NumElts == 2)
return BB;
}
-static MachineBasicBlock * EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB,
- const TargetInstrInfo *TII,
- const X86Subtarget* Subtarget) {
+static MachineBasicBlock *EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB,
+ const X86Subtarget *Subtarget) {
DebugLoc dl = MI->getDebugLoc();
-
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
// Address into RAX/EAX, other two args into ECX, EDX.
unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r;
unsigned MemReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
}
MachineBasicBlock *
-X86TargetLowering::EmitVAARG64WithCustomInserter(
- MachineInstr *MI,
- MachineBasicBlock *MBB) const {
+X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr *MI,
+ MachineBasicBlock *MBB) const {
// Emit va_arg instruction on X86-64.
// Operands to this pseudo-instruction:
MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
// Machine Information
- const TargetInstrInfo *TII = MBB->getParent()->getSubtarget().getInstrInfo();
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
.setMemRefs(MMOBegin, MMOEnd);
// Jump to endMBB
- BuildMI(offsetMBB, DL, TII->get(X86::JMP_4))
+ BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
.addMBB(endMBB);
}
XMMSaveMBB->addSuccessor(EndMBB);
// Now add the instructions.
- const TargetInstrInfo *TII = MBB->getParent()->getSubtarget().getInstrInfo();
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
DebugLoc DL = MI->getDebugLoc();
unsigned CountReg = MI->getOperand(0).getReg();
if (!Subtarget->isTargetWin64()) {
// If %al is 0, branch around the XMM save block.
BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
- BuildMI(MBB, DL, TII->get(X86::JE_4)).addMBB(EndMBB);
+ BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
MBB->addSuccessor(EndMBB);
}
MachineBasicBlock *
X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
MachineBasicBlock *BB) const {
- const TargetInstrInfo *TII = BB->getParent()->getSubtarget().getInstrInfo();
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
DebugLoc DL = MI->getDebugLoc();
// To "insert" a SELECT_CC instruction, we actually have to insert the
// If the EFLAGS register isn't dead in the terminator, then claim that it's
// live into the sink and copy blocks.
- const TargetRegisterInfo *TRI =
- BB->getParent()->getSubtarget().getRegisterInfo();
+ const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
if (!MI->killsRegister(X86::EFLAGS) &&
!checkAndUpdateEFLAGSKill(MI, BB, TRI)) {
copy0MBB->addLiveIn(X86::EFLAGS);
X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI,
MachineBasicBlock *BB) const {
MachineFunction *MF = BB->getParent();
- const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
DebugLoc DL = MI->getDebugLoc();
const BasicBlock *LLVM_BB = BB->getBasicBlock();
BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
.addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
.addReg(SPLimitVReg);
- BuildMI(BB, DL, TII->get(X86::JG_4)).addMBB(mallocMBB);
+ BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB);
// bumpMBB simply decreases the stack pointer, since we know the current
// stacklet has enough space.
.addReg(SPLimitVReg);
BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
.addReg(SPLimitVReg);
- BuildMI(bumpMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB);
+ BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
// Calls into a routine in libgcc to allocate more space from the heap.
- const uint32_t *RegMask = MF->getTarget()
- .getSubtargetImpl()
- ->getRegisterInfo()
- ->getCallPreservedMask(CallingConv::C);
+ const uint32_t *RegMask =
+ Subtarget->getRegisterInfo()->getCallPreservedMask(CallingConv::C);
if (IsLP64) {
BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
.addReg(sizeVReg);
BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
.addReg(IsLP64 ? X86::RAX : X86::EAX);
- BuildMI(mallocMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB);
+ BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
// Set up the CFG correctly.
BB->addSuccessor(bumpMBB);
MachineBasicBlock *
X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI,
MachineBasicBlock *BB) const {
- const TargetInstrInfo *TII = BB->getParent()->getSubtarget().getInstrInfo();
DebugLoc DL = MI->getDebugLoc();
assert(!Subtarget->isTargetMachO());
- // The lowering is pretty easy: we're just emitting the call to _alloca. The
- // non-trivial part is impdef of ESP.
-
- if (Subtarget->isTargetWin64()) {
- if (Subtarget->isTargetCygMing()) {
- // ___chkstk(Mingw64):
- // Clobbers R10, R11, RAX and EFLAGS.
- // Updates RSP.
- BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA))
- .addExternalSymbol("___chkstk")
- .addReg(X86::RAX, RegState::Implicit)
- .addReg(X86::RSP, RegState::Implicit)
- .addReg(X86::RAX, RegState::Define | RegState::Implicit)
- .addReg(X86::RSP, RegState::Define | RegState::Implicit)
- .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
- } else {
- // __chkstk(MSVCRT): does not update stack pointer.
- // Clobbers R10, R11 and EFLAGS.
- BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA))
- .addExternalSymbol("__chkstk")
- .addReg(X86::RAX, RegState::Implicit)
- .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
- // RAX has the offset to be subtracted from RSP.
- BuildMI(*BB, MI, DL, TII->get(X86::SUB64rr), X86::RSP)
- .addReg(X86::RSP)
- .addReg(X86::RAX);
- }
- } else {
- const char *StackProbeSymbol = (Subtarget->isTargetKnownWindowsMSVC() ||
- Subtarget->isTargetWindowsItanium())
- ? "_chkstk"
- : "_alloca";
-
- BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32))
- .addExternalSymbol(StackProbeSymbol)
- .addReg(X86::EAX, RegState::Implicit)
- .addReg(X86::ESP, RegState::Implicit)
- .addReg(X86::EAX, RegState::Define | RegState::Implicit)
- .addReg(X86::ESP, RegState::Define | RegState::Implicit)
- .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
- }
+ X86FrameLowering::emitStackProbeCall(*BB->getParent(), *BB, MI, DL);
MI->eraseFromParent(); // The pseudo instruction is gone now.
return BB;
// or EAX and doing an indirect call. The return value will then
// be in the normal return register.
MachineFunction *F = BB->getParent();
- const X86InstrInfo *TII =
- static_cast<const X86InstrInfo *>(F->getSubtarget().getInstrInfo());
+ const X86InstrInfo *TII = Subtarget->getInstrInfo();
DebugLoc DL = MI->getDebugLoc();
assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?");
// Get a register mask for the lowered call.
// FIXME: The 32-bit calls have non-standard calling conventions. Use a
// proper register mask.
- const uint32_t *RegMask = F->getTarget()
- .getSubtargetImpl()
- ->getRegisterInfo()
- ->getCallPreservedMask(CallingConv::C);
+ const uint32_t *RegMask =
+ Subtarget->getRegisterInfo()->getCallPreservedMask(CallingConv::C);
if (Subtarget->is64Bit()) {
MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
TII->get(X86::MOV64rm), X86::RDI)
MachineBasicBlock *MBB) const {
DebugLoc DL = MI->getDebugLoc();
MachineFunction *MF = MBB->getParent();
- const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
MachineRegisterInfo &MRI = MF->getRegInfo();
const BasicBlock *BB = MBB->getBasicBlock();
MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
.addMBB(restoreMBB);
- const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
- MF->getSubtarget().getRegisterInfo());
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
MIB.addRegMask(RegInfo->getNoPreservedMask());
thisMBB->addSuccessor(mainMBB);
thisMBB->addSuccessor(restoreMBB);
// restoreMBB:
if (RegInfo->hasBasePointer(*MF)) {
- const X86Subtarget &STI = MF->getTarget().getSubtarget<X86Subtarget>();
- const bool Uses64BitFramePtr = STI.isTarget64BitLP64() || STI.isTargetNaCl64();
+ const bool Uses64BitFramePtr =
+ Subtarget->isTarget64BitLP64() || Subtarget->isTargetNaCl64();
X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
X86FI->setRestoreBasePointer(MF);
unsigned FramePtr = RegInfo->getFrameRegister(*MF);
.setMIFlag(MachineInstr::FrameSetup);
}
BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
- BuildMI(restoreMBB, DL, TII->get(X86::JMP_4)).addMBB(sinkMBB);
+ BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
restoreMBB->addSuccessor(sinkMBB);
MI->eraseFromParent();
MachineBasicBlock *MBB) const {
DebugLoc DL = MI->getDebugLoc();
MachineFunction *MF = MBB->getParent();
- const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
MachineRegisterInfo &MRI = MF->getRegInfo();
// Memory Reference
(PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
unsigned Tmp = MRI.createVirtualRegister(RC);
// Since FP is only updated here but NOT referenced, it's treated as GPR.
- const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
- MF->getSubtarget().getRegisterInfo());
+ const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
unsigned SP = RegInfo->getStackRegister();
default: llvm_unreachable("Unrecognized FMA variant.");
}
- const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+ const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
MachineInstrBuilder MIB =
BuildMI(MF, MI->getDebugLoc(), TII.get(NewFMAOpc))
.addOperand(MI->getOperand(0))
case X86::TAILJMPd64:
case X86::TAILJMPr64:
case X86::TAILJMPm64:
+ case X86::TAILJMPd64_REX:
+ case X86::TAILJMPr64_REX:
+ case X86::TAILJMPm64_REX:
llvm_unreachable("TAILJMP64 would not be touched here.");
case X86::TCRETURNdi64:
case X86::TCRETURNri64:
case X86::FP80_TO_INT32_IN_MEM:
case X86::FP80_TO_INT64_IN_MEM: {
MachineFunction *F = BB->getParent();
- const TargetInstrInfo *TII = F->getSubtarget().getInstrInfo();
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
DebugLoc DL = MI->getDebugLoc();
// Change the floating point control register to use "round towards zero"
case X86::VPCMPESTRM128MEM:
assert(Subtarget->hasSSE42() &&
"Target must have SSE4.2 or AVX features enabled");
- return EmitPCMPSTRM(MI, BB, BB->getParent()->getSubtarget().getInstrInfo());
+ return EmitPCMPSTRM(MI, BB, Subtarget->getInstrInfo());
// String/text processing lowering.
case X86::PCMPISTRIREG:
case X86::VPCMPESTRIMEM:
assert(Subtarget->hasSSE42() &&
"Target must have SSE4.2 or AVX features enabled");
- return EmitPCMPSTRI(MI, BB, BB->getParent()->getSubtarget().getInstrInfo());
+ return EmitPCMPSTRI(MI, BB, Subtarget->getInstrInfo());
// Thread synchronization.
case X86::MONITOR:
- return EmitMonitor(MI, BB, BB->getParent()->getSubtarget().getInstrInfo(),
- Subtarget);
+ return EmitMonitor(MI, BB, Subtarget);
// xbegin
case X86::XBEGIN:
- return EmitXBegin(MI, BB, BB->getParent()->getSubtarget().getInstrInfo());
+ return EmitXBegin(MI, BB, Subtarget->getInstrInfo());
case X86::VASTART_SAVE_XMM_REGS:
return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
// We're looking for blends between FADD and FSUB nodes. We insist on these
// nodes being lined up in a specific expected pattern.
- if (!(isShuffleEquivalent(Mask, 0, 3) ||
- isShuffleEquivalent(Mask, 0, 5, 2, 7) ||
- isShuffleEquivalent(Mask, 0, 9, 2, 11, 4, 13, 6, 15)))
+ if (!(isShuffleEquivalent(V1, V2, Mask, 0, 3) ||
+ isShuffleEquivalent(V1, V2, Mask, 0, 5, 2, 7) ||
+ isShuffleEquivalent(V1, V2, Mask, 0, 9, 2, 11, 4, 13, 6, 15)))
return SDValue();
// Only specific types are legal at this point, assert so we notice if and
: InVec.getOperand(1);
// If inputs to shuffle are the same for both ops, then allow 2 uses
- unsigned AllowedUses = InVec.getOperand(0) == InVec.getOperand(1) ? 2 : 1;
+ unsigned AllowedUses = InVec.getNumOperands() > 1 &&
+ InVec.getOperand(0) == InVec.getOperand(1) ? 2 : 1;
if (LdNode.getOpcode() == ISD::BITCAST) {
// Don't duplicate a load with other uses.
EltNo);
}
+/// \brief Detect bitcasts from i32 to the x86mmx low word. Since MMX types are
+/// special and don't usually play with other vector types, it's better to
+/// handle them early to be sure we emit efficient code by avoiding
+/// store-load conversions.
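+/// e.g. (x86mmx (bitcast (v2i32 build_vector X, 0))), with X of type i32,
+/// becomes a single MMX_MOVW2D node rather than a store-load pair.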
+static SDValue PerformBITCASTCombine(SDNode *N, SelectionDAG &DAG) {
+ if (N->getValueType(0) != MVT::x86mmx ||
+ N->getOperand(0)->getOpcode() != ISD::BUILD_VECTOR ||
+ N->getOperand(0)->getValueType(0) != MVT::v2i32)
+ return SDValue();
+
+ SDValue V = N->getOperand(0);
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(V.getOperand(1));
+ if (C && C->getZExtValue() == 0 && V.getOperand(0).getValueType() == MVT::i32)
+ return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(V.getOperand(0)),
+ N->getValueType(0), V.getOperand(0));
+
+ return SDValue();
+}
+
/// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
/// generation and convert it from being a bunch of shuffles and extracts
/// into a somewhat faster sequence. For i686, the best sequence is apparently
SDValue InputVector = N->getOperand(0);
- // Detect whether we are trying to convert from mmx to i32 and the bitcast
- // from mmx to v2i32 has a single usage.
- if (InputVector.getNode()->getOpcode() == llvm::ISD::BITCAST &&
- InputVector.getNode()->getOperand(0).getValueType() == MVT::x86mmx &&
- InputVector.hasOneUse() && N->getValueType(0) == MVT::i32)
- return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector),
- N->getValueType(0),
- InputVector.getNode()->getOperand(0));
+ // Detect mmx to i32 conversion through a v2i32 elt extract.
+ if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
+ N->getValueType(0) == MVT::i32 &&
+ InputVector.getValueType() == MVT::v2i32) {
+
+ // The bitcast source is a direct mmx result.
+ SDValue MMXSrc = InputVector.getNode()->getOperand(0);
+ if (MMXSrc.getValueType() == MVT::x86mmx)
+ return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector),
+ N->getValueType(0),
+ InputVector.getNode()->getOperand(0));
+
+ // The mmx is indirect: (i64 extract_elt (v1i64 bitcast (x86mmx ...))).
+ SDValue MMXSrcOp = MMXSrc.getOperand(0);
+ if (MMXSrc.getOpcode() == ISD::EXTRACT_VECTOR_ELT && MMXSrc.hasOneUse() &&
+ MMXSrc.getValueType() == MVT::i64 && MMXSrcOp.hasOneUse() &&
+ MMXSrcOp.getOpcode() == ISD::BITCAST &&
+ MMXSrcOp.getValueType() == MVT::v1i64 &&
+ MMXSrcOp.getOperand(0).getValueType() == MVT::x86mmx)
+ return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector),
+ N->getValueType(0),
+ MMXSrcOp.getOperand(0));
+ }
// Only operate on vectors of 4 elements, where the alternative shuffling
// gets to be more expensive.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDValue Vals[4];
SDLoc dl(InputVector);
-
+
if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) {
SDValue Cst = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, InputVector);
EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy();
SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
DAG.getConstant(1, VecIdxTy));
- SDValue ShAmt = DAG.getConstant(32,
+ SDValue ShAmt = DAG.getConstant(32,
DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64));
Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf);
Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
// instructions match the semantics of the common C idiom x<y?x:y but not
// x<=y?x:y, because of how they handle negative zero (which can be
// ignored in unsafe-math mode).
+ // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
- VT != MVT::f80 && TLI.isTypeLegal(VT) &&
+ VT != MVT::f80 && (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
(Subtarget->hasSSE2() ||
(Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) {
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
// to simplify previous instructions.
if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
!DCI.isBeforeLegalize() &&
- // We explicitly check against v8i16 and v16i16 because, although
- // they're marked as Custom, they might only be legal when Cond is a
- // build_vector of constants. This will be taken care in a later
- // condition.
- (TLI.isOperationLegalOrCustom(ISD::VSELECT, VT) && VT != MVT::v16i16 &&
- VT != MVT::v8i16) &&
+ // We explicitly check against SSE4.1, v8i16 and v16i16 because, although
+ // vselect nodes may be marked as Custom, they might only be legal when
+ // Cond is a build_vector of constants. This will be taken care in
+ // a later condition.
+ (TLI.isOperationLegalOrCustom(ISD::VSELECT, VT) &&
+ Subtarget->hasSSE41() && VT != MVT::v16i16 && VT != MVT::v8i16) &&
// Don't optimize vector of constants. Those are handled by
// the generic code and all the bits must be properly set for
// the generic optimizer.
return SDValue();
EVT VT = N->getValueType(0);
- if (VT != MVT::i64)
+ if (VT != MVT::i64 && VT != MVT::i32)
return SDValue();
ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
}
}
+static SDValue VectorZextCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget *Subtarget) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDLoc DL(N);
+
+ // A vector zext_in_reg may be represented as a shuffle,
+ // feeding into a bitcast (this represents anyext) feeding into
+ // an and with a mask.
+ // We'd like to try to combine that into a shuffle with zero
+ // plus a bitcast, removing the and.
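+ // e.g. zext_in_reg of the low v4i8 lanes of X may appear as
+ // (v4i32 (and (bitcast (v16i8 shuffle <0,u,u,u,1,u,u,u,...> X, undef)),
+ // splat(0xFF))).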
+ if (N0.getOpcode() != ISD::BITCAST ||
+ N0.getOperand(0).getOpcode() != ISD::VECTOR_SHUFFLE)
+ return SDValue();
+
+ // The other side of the AND should be a splat of 2^C - 1, where C
+ // is the number of bits in the source type, i.e. a mask keeping only the
+ // low source bits of each element.
+ if (N1.getOpcode() == ISD::BITCAST)
+ N1 = N1.getOperand(0);
+ if (N1.getOpcode() != ISD::BUILD_VECTOR)
+ return SDValue();
+ BuildVectorSDNode *Vector = cast<BuildVectorSDNode>(N1);
+
+ ShuffleVectorSDNode *Shuffle = cast<ShuffleVectorSDNode>(N0.getOperand(0));
+ EVT SrcType = Shuffle->getValueType(0);
+
+ // We expect a single-source shuffle
+ if (Shuffle->getOperand(1)->getOpcode() != ISD::UNDEF)
+ return SDValue();
+
+ unsigned SrcSize = SrcType.getScalarSizeInBits();
+
+ APInt SplatValue, SplatUndef;
+ unsigned SplatBitSize;
+ bool HasAnyUndefs;
+ if (!Vector->isConstantSplat(SplatValue, SplatUndef,
+ SplatBitSize, HasAnyUndefs))
+ return SDValue();
+
+ unsigned ResSize = N1.getValueType().getScalarSizeInBits();
+ // Make sure the splat matches the mask we expect
+ if (SplatBitSize > ResSize ||
+ (SplatValue + 1).exactLogBase2() != (int)SrcSize)
+ return SDValue();
+
+ // Make sure the input and output sizes make sense.
+ if (SrcSize >= ResSize || ResSize % SrcSize)
+ return SDValue();
+
+ // We expect a shuffle of the form <0, u, u, u, 1, u, u, u...>
+ // The number of u's between consecutive values depends on the ratio between
+ // the source and dest types.
+ unsigned ZextRatio = ResSize / SrcSize;
+ bool IsZext = true;
+ for (unsigned i = 0; i < SrcType.getVectorNumElements(); ++i) {
+ if (i % ZextRatio) {
+ if (Shuffle->getMaskElt(i) >= 0) {
+ // Expected undef
+ IsZext = false;
+ break;
+ }
+ } else {
+ if (Shuffle->getMaskElt(i) != (int)(i / ZextRatio)) {
+ // Expected element number
+ IsZext = false;
+ break;
+ }
+ }
+ }
+
+ if (!IsZext)
+ return SDValue();
+
+ // Ok, perform the transformation - replace the shuffle with
+ // a shuffle of the form <0, k, k, k, 1, k, k, k> with zero
+ // (instead of undef) where the k elements come from the zero vector.
+ SmallVector<int, 8> Mask;
+ unsigned NumElems = SrcType.getVectorNumElements();
+ for (unsigned i = 0; i < NumElems; ++i)
+ if (i % ZextRatio)
+ Mask.push_back(NumElems);
+ else
+ Mask.push_back(i / ZextRatio);
+
+ SDValue NewShuffle = DAG.getVectorShuffle(Shuffle->getValueType(0), DL,
+ Shuffle->getOperand(0), DAG.getConstant(0, SrcType), Mask);
+ return DAG.getNode(ISD::BITCAST, DL, N0.getValueType(), NewShuffle);
+}
+
static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget *Subtarget) {
- EVT VT = N->getValueType(0);
if (DCI.isBeforeLegalizeOps())
return SDValue();
+ SDValue Zext = VectorZextCombine(N, DAG, DCI, Subtarget);
+ if (Zext.getNode())
+ return Zext;
+
SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
if (R.getNode())
return R;
+ EVT VT = N->getValueType(0);
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDLoc DL(N);
+
// Create BEXTR instructions
// BEXTR is ((X >> imm) & (2**size-1))
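+ // e.g. ((X >> 4) & 0xFFF) extracts a 12-bit field starting at bit 4, which
+ // maps to BEXTR with the control value (4 | (12 << 8)).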
if (VT == MVT::i32 || VT == MVT::i64) {
- SDValue N0 = N->getOperand(0);
- SDValue N1 = N->getOperand(1);
- SDLoc DL(N);
-
// Check for BEXTR.
if ((Subtarget->hasBMI() || Subtarget->hasTBM()) &&
(N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::SRL)) {
uint64_t Mask = MaskNode->getZExtValue();
uint64_t Shift = ShiftNode->getZExtValue();
if (isMask_64(Mask)) {
- uint64_t MaskSize = CountPopulation_64(Mask);
+ uint64_t MaskSize = countPopulation(Mask);
if (Shift + MaskSize <= VT.getSizeInBits())
return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0),
DAG.getConstant(Shift | (MaskSize << 8), VT));
if (VT != MVT::v2i64 && VT != MVT::v4i64)
return SDValue();
- SDValue N0 = N->getOperand(0);
- SDValue N1 = N->getOperand(1);
- SDLoc DL(N);
-
// Check LHS for vnot
if (N0.getOpcode() == ISD::XOR &&
//ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
// fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
MachineFunction &MF = DAG.getMachineFunction();
- bool OptForSize = MF.getFunction()->getAttributes().
- hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
+ bool OptForSize =
+ MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize);
// SHLD/SHRD instructions have lower register pressure, but on some
// platforms they have higher latency than the equivalent
return SDValue();
}
+/// PerformMLOADCombine - Resolve extending loads
+static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget *Subtarget) {
+ MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);
+ if (Mld->getExtensionType() != ISD::SEXTLOAD)
+ return SDValue();
+
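+ // Rebuild the extending load as a masked load of a wider vector with the
+ // memory element type (the mask is widened with zeros), then sign-extend
+ // the result, e.g. (v8i32 sextload v8i16) becomes
+ // (v8i32 (vsext (v16i16 masked_load))).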
+ EVT VT = Mld->getValueType(0);
+ unsigned NumElems = VT.getVectorNumElements();
+ EVT LdVT = Mld->getMemoryVT();
+ SDLoc dl(Mld);
+
+ assert(LdVT != VT && "Cannot extend to the same type");
+ unsigned ToSz = VT.getVectorElementType().getSizeInBits();
+ unsigned FromSz = LdVT.getVectorElementType().getSizeInBits();
+ // The From/To sizes and the element count must be powers of two.
+ assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
+ "Unexpected size for extending masked load");
+
+ unsigned SizeRatio = ToSz / FromSz;
+ assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());
+
+ // Create a type on which we perform the shuffle
+ EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
+ LdVT.getScalarType(), NumElems*SizeRatio);
+ assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
+
+ // Convert Src0 value
+ SDValue WideSrc0 = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mld->getSrc0());
+ if (Mld->getSrc0().getOpcode() != ISD::UNDEF) {
+ SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
+ for (unsigned i = 0; i != NumElems; ++i)
+ ShuffleVec[i] = i * SizeRatio;
+
+ // Can't shuffle using an illegal type.
+ assert (DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT)
+ && "WideVecVT should be legal");
+ WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0,
+ DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
+ }
+ // Prepare the new mask
+ SDValue NewMask;
+ SDValue Mask = Mld->getMask();
+ if (Mask.getValueType() == VT) {
+ // Mask and original value have the same type
+ NewMask = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mask);
+ SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
+ for (unsigned i = 0; i != NumElems; ++i)
+ ShuffleVec[i] = i * SizeRatio;
+ for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
+ ShuffleVec[i] = NumElems*SizeRatio;
+ NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
+ DAG.getConstant(0, WideVecVT),
+ &ShuffleVec[0]);
+ } else {
+ assert(Mask.getValueType().getVectorElementType() == MVT::i1);
+ unsigned WidenNumElts = NumElems*SizeRatio;
+ unsigned MaskNumElts = VT.getVectorNumElements();
+ EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+ WidenNumElts);
+
+ unsigned NumConcat = WidenNumElts / MaskNumElts;
+ SmallVector<SDValue, 16> Ops(NumConcat);
+ SDValue ZeroVal = DAG.getConstant(0, Mask.getValueType());
+ Ops[0] = Mask;
+ for (unsigned i = 1; i != NumConcat; ++i)
+ Ops[i] = ZeroVal;
+
+ NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
+ }
+
+ SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
+ Mld->getBasePtr(), NewMask, WideSrc0,
+ Mld->getMemoryVT(), Mld->getMemOperand(),
+ ISD::NON_EXTLOAD);
+ SDValue NewVec = DAG.getNode(X86ISD::VSEXT, dl, VT, WideLd);
+ return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
+}
+
+/// PerformMSTORECombine - Resolve truncating stores
+static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
+ if (!Mst->isTruncatingStore())
+ return SDValue();
+
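+ // Shuffle the truncated elements into the low lanes of a wider vector of
+ // the memory element type, so the masked store itself no longer needs to
+ // truncate, e.g. a v8i32->v8i16 truncating store becomes a v16i16 shuffle
+ // <0,2,4,...> followed by a plain masked store of the low lanes.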
+ EVT VT = Mst->getValue().getValueType();
+ unsigned NumElems = VT.getVectorNumElements();
+ EVT StVT = Mst->getMemoryVT();
+ SDLoc dl(Mst);
+
+ assert(StVT != VT && "Cannot truncate to the same type");
+ unsigned FromSz = VT.getVectorElementType().getSizeInBits();
+ unsigned ToSz = StVT.getVectorElementType().getSizeInBits();
+
+ // The From/To sizes and the element count must be powers of two.
+ assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
+ "Unexpected size for truncating masked store");
+ // We are going to use the original vector elt for storing.
+ // Accumulated smaller vector elements must be a multiple of the store size.
+ assert (((NumElems * FromSz) % ToSz) == 0 &&
+ "Unexpected ratio for truncating masked store");
+
+ unsigned SizeRatio = FromSz / ToSz;
+ assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
+
+ // Create a type on which we perform the shuffle
+ EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
+ StVT.getScalarType(), NumElems*SizeRatio);
+
+ assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
+
+ SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mst->getValue());
+ SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
+ for (unsigned i = 0; i != NumElems; ++i)
+ ShuffleVec[i] = i * SizeRatio;
+
+ // Can't shuffle using an illegal type.
+ assert (DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT)
+ && "WideVecVT should be legal");
+
+ SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
+ DAG.getUNDEF(WideVecVT),
+ &ShuffleVec[0]);
+
+ SDValue NewMask;
+ SDValue Mask = Mst->getMask();
+ if (Mask.getValueType() == VT) {
+ // Mask and original value have the same type
+ NewMask = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mask);
+ for (unsigned i = 0; i != NumElems; ++i)
+ ShuffleVec[i] = i * SizeRatio;
+ for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
+ ShuffleVec[i] = NumElems*SizeRatio;
+ NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
+ DAG.getConstant(0, WideVecVT),
+ &ShuffleVec[0]);
+ } else {
+ assert(Mask.getValueType().getVectorElementType() == MVT::i1);
+ unsigned WidenNumElts = NumElems*SizeRatio;
+ unsigned MaskNumElts = VT.getVectorNumElements();
+ EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+ WidenNumElts);
+
+ unsigned NumConcat = WidenNumElts / MaskNumElts;
+ SmallVector<SDValue, 16> Ops(NumConcat);
+ SDValue ZeroVal = DAG.getConstant(0, Mask.getValueType());
+ Ops[0] = Mask;
+ for (unsigned i = 1; i != NumConcat; ++i)
+ Ops[i] = ZeroVal;
+
+ NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
+ }
+
+ return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal, Mst->getBasePtr(),
+ NewMask, StVT, Mst->getMemOperand(), false);
+}
/// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
const X86Subtarget *Subtarget) {
// Find the largest store unit
MVT StoreType = MVT::i8;
- for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
- tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
- MVT Tp = (MVT::SimpleValueType)tp;
+ for (MVT Tp : MVT::integer_valuetypes()) {
if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
StoreType = Tp;
}
return SDValue();
const Function *F = DAG.getMachineFunction().getFunction();
- bool NoImplicitFloatOps = F->getAttributes().
- hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
+ bool NoImplicitFloatOps = F->hasFnAttribute(Attribute::NoImplicitFloat);
bool F64IsLegal = !DAG.getTarget().Options.UseSoftFloat && !NoImplicitFloatOps
&& Subtarget->hasSSE2();
if ((VT.isVector() ||
/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
+
// F[X]OR(0.0, x) -> x
- // F[X]OR(x, 0.0) -> x
if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
if (C->getValueAPF().isPosZero())
return N->getOperand(1);
+
+ // F[X]OR(x, 0.0) -> x
if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
if (C->getValueAPF().isPosZero())
return N->getOperand(0);
/// Do target-specific dag combines on X86ISD::FAND nodes.
static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
// FAND(0.0, x) -> 0.0
- // FAND(x, 0.0) -> 0.0
if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
if (C->getValueAPF().isPosZero())
return N->getOperand(0);
+
+ // FAND(x, 0.0) -> 0.0
if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
if (C->getValueAPF().isPosZero())
return N->getOperand(1);
+
return SDValue();
}
/// Do target-specific dag combines on X86ISD::FANDN nodes.
static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG) {
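+ // X86ISD::FANDN computes (~Op0 & Op1), so a zero first operand selects
+ // Op1, while a zero second operand yields +0.0.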
- // FANDN(x, 0.0) -> 0.0
// FANDN(0.0, x) -> x
if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
if (C->getValueAPF().isPosZero())
return N->getOperand(1);
+
+ // FANDN(x, 0.0) -> 0.0
if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
if (C->getValueAPF().isPosZero())
return N->getOperand(1);
+
return SDValue();
}
}
static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
- const X86TargetLowering *XTLI) {
+ const X86Subtarget *Subtarget) {
// First try to optimize away the conversion entirely when it's
// conditionally from a constant. Vectors only.
SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG);
EVT VT = Ld->getValueType(0);
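+ // On 32-bit targets, turn an i64 sint_to_fp of a one-use, non-volatile
+ // scalar load into an x87 FILD that reads the integer straight from memory.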
if (!Ld->isVolatile() && !N->getValueType(0).isVector() &&
ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
- !XTLI->getSubtarget()->is64Bit() &&
- VT == MVT::i64) {
- SDValue FILDChain = XTLI->BuildFILD(SDValue(N, 0), Ld->getValueType(0),
- Ld->getChain(), Op0, DAG);
+ !Subtarget->is64Bit() && VT == MVT::i64) {
+ SDValue FILDChain = Subtarget->getTargetLowering()->BuildFILD(
+ SDValue(N, 0), Ld->getValueType(0), Ld->getChain(), Op0, DAG);
DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
return FILDChain;
}
case ISD::SELECT:
case X86ISD::SHRUNKBLEND:
return PerformSELECTCombine(N, DAG, DCI, Subtarget);
+ case ISD::BITCAST: return PerformBITCASTCombine(N, DAG);
case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI, Subtarget);
case ISD::ADD: return PerformAddCombine(N, DAG, Subtarget);
case ISD::SUB: return PerformSubCombine(N, DAG, Subtarget);
case ISD::OR: return PerformOrCombine(N, DAG, DCI, Subtarget);
case ISD::XOR: return PerformXorCombine(N, DAG, DCI, Subtarget);
case ISD::LOAD: return PerformLOADCombine(N, DAG, DCI, Subtarget);
+ case ISD::MLOAD: return PerformMLOADCombine(N, DAG, DCI, Subtarget);
case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget);
- case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, this);
+ case ISD::MSTORE: return PerformMSTORECombine(N, DAG, Subtarget);
+ case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, Subtarget);
case ISD::FADD: return PerformFADDCombine(N, DAG, Subtarget);
case ISD::FSUB: return PerformFSUBCombine(N, DAG, Subtarget);
case X86ISD::FXOR:
case ISD::FMA: return PerformFMACombine(N, DAG, Subtarget);
case ISD::INTRINSIC_WO_CHAIN:
return PerformINTRINSIC_WO_CHAINCombine(N, DAG, Subtarget);
- case X86ISD::INSERTPS:
- return PerformINSERTPSCombine(N, DAG, Subtarget);
+ case X86ISD::INSERTPS: {
+ if (getTargetMachine().getOptLevel() > CodeGenOpt::None)
+ return PerformINSERTPSCombine(N, DAG, Subtarget);
+ break;
+ }
case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DAG, Subtarget);
}
}
}
return;
+ case 'L':
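+ // 'L' accepts the zero-extension masks 0xff and 0xffff, plus 0xffffffff
+ // in 64-bit mode.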
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
+ (Subtarget->is64Bit() && C->getZExtValue() == 0xffffffff)) {
+ Result = DAG.getTargetConstant(C->getSExtValue(), Op.getValueType());
+ break;
+ }
+ }
+ return;
+ case 'M':
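+ // 'M' accepts the constants 0-3 (shift counts expressible as an LEA scale).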
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (C->getZExtValue() <= 3) {
+ Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
+ break;
+ }
+ }
+ return;
case 'N':
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
if (C->getZExtValue() <= 255) {
}
}
return;
+ case 'O':
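+ // 'O' accepts integer constants in the range 0 to 127.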
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (C->getZExtValue() <= 127) {
+ Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
+ break;
+ }
+ }
+ return;
case 'e': {
// 32-bit signed value
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {