MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize());
// Set up the TargetLowering object.
- static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };
// X86 is weird. It always uses i8 for shift amounts and setcc results.
setBooleanContents(ZeroOrOneBooleanContent);
// (low) operations are left as Legal, as there are single-result
// instructions for this in x86. Using the two-result multiply instructions
// when both high and low results are needed must be arranged by dagcombine.
- for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
- MVT VT = IntVTs[i];
+ for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
setOperationAction(ISD::MULHS, VT, Expand);
setOperationAction(ISD::MULHU, VT, Expand);
setOperationAction(ISD::SDIV, VT, Expand);
setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);
// Expand certain atomics
- for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
- MVT VT = IntVTs[i];
+ for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
// ISD::CTTZ_ZERO_UNDEF v2i64 - scalarization is faster.
// Custom lower build_vector, vector_shuffle, and extract_vector_elt.
- for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
- MVT VT = (MVT::SimpleValueType)i;
- // Do not attempt to custom lower non-power-of-2 vectors
- if (!isPowerOf2_32(VT.getVectorNumElements()))
- continue;
- // Do not attempt to custom lower non-128-bit vectors
- if (!VT.is128BitVector())
- continue;
+ for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Custom);
}
// Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
- for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
- MVT VT = (MVT::SimpleValueType)i;
-
- // Do not attempt to promote non-128-bit vectors
- if (!VT.is128BitVector())
- continue;
-
+ for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
setOperationAction(ISD::AND, VT, Promote);
AddPromotedToType (ISD::AND, VT, MVT::v2i64);
setOperationAction(ISD::OR, VT, Promote);
setOperationAction(ISD::SRA, MVT::v4i32, Custom);
}
+ if (Subtarget->hasXOP()) {
+ setOperationAction(ISD::ROTL, MVT::v16i8, Custom);
+ setOperationAction(ISD::ROTL, MVT::v8i16, Custom);
+ setOperationAction(ISD::ROTL, MVT::v4i32, Custom);
+ setOperationAction(ISD::ROTL, MVT::v2i64, Custom);
+ setOperationAction(ISD::ROTL, MVT::v32i8, Custom);
+ setOperationAction(ISD::ROTL, MVT::v16i16, Custom);
+ setOperationAction(ISD::ROTL, MVT::v8i32, Custom);
+ setOperationAction(ISD::ROTL, MVT::v4i64, Custom);
+ }
+
if (!Subtarget->useSoftFloat() && Subtarget->hasFp256()) {
addRegisterClass(MVT::v32i8, &X86::VR256RegClass);
addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
if (Subtarget->hasInt256())
setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
-
// Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
- for (int i = MVT::v32i8; i != MVT::v4i64; ++i) {
- MVT VT = (MVT::SimpleValueType)i;
-
- // Do not attempt to promote non-256-bit vectors
- if (!VT.is256BitVector())
- continue;
-
+ for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
setOperationAction(ISD::AND, VT, Promote);
AddPromotedToType (ISD::AND, VT, MVT::v4i64);
setOperationAction(ISD::OR, VT, Promote);
setOperationAction(ISD::BR_CC, MVT::i1, Expand);
setOperationAction(ISD::SETCC, MVT::i1, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
setOperationAction(ISD::XOR, MVT::i1, Legal);
setOperationAction(ISD::OR, MVT::i1, Legal);
setOperationAction(ISD::AND, MVT::i1, Legal);
setOperationAction(ISD::XOR, MVT::v16i32, Legal);
if (Subtarget->hasCDI()) {
- setOperationAction(ISD::CTLZ, MVT::v8i64, Legal);
+ setOperationAction(ISD::CTLZ, MVT::v8i64, Legal);
setOperationAction(ISD::CTLZ, MVT::v16i32, Legal);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i64, Legal);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i64, Legal);
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v16i32, Legal);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i64, Custom);
+ setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
+ setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
+ setOperationAction(ISD::CTLZ, MVT::v16i16, Custom);
+ setOperationAction(ISD::CTLZ, MVT::v32i8, Custom);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i16, Custom);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v16i8, Custom);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v16i16, Custom);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v32i8, Custom);
+
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i64, Custom);
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i32, Custom);
- }
- if (Subtarget->hasVLX() && Subtarget->hasCDI()) {
- setOperationAction(ISD::CTLZ, MVT::v4i64, Legal);
- setOperationAction(ISD::CTLZ, MVT::v8i32, Legal);
- setOperationAction(ISD::CTLZ, MVT::v2i64, Legal);
- setOperationAction(ISD::CTLZ, MVT::v4i32, Legal);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i64, Legal);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i32, Legal);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v2i64, Legal);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i32, Legal);
-
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i64, Custom);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i32, Custom);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom);
- }
+
+ if (Subtarget->hasVLX()) {
+ setOperationAction(ISD::CTLZ, MVT::v4i64, Legal);
+ setOperationAction(ISD::CTLZ, MVT::v8i32, Legal);
+ setOperationAction(ISD::CTLZ, MVT::v2i64, Legal);
+ setOperationAction(ISD::CTLZ, MVT::v4i32, Legal);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i64, Legal);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i32, Legal);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v2i64, Legal);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i32, Legal);
+
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i64, Custom);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i32, Custom);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom);
+ } else {
+ setOperationAction(ISD::CTLZ, MVT::v4i64, Custom);
+ setOperationAction(ISD::CTLZ, MVT::v8i32, Custom);
+ setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
+ setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i64, Custom);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i32, Custom);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v2i64, Custom);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i32, Custom);
+ }
+ } // Subtarget->hasCDI()
+
if (Subtarget->hasDQI()) {
setOperationAction(ISD::MUL, MVT::v2i64, Legal);
setOperationAction(ISD::MUL, MVT::v4i64, Legal);
setOperationAction(ISD::MSTORE, VT, Legal);
}
}
- for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
- MVT VT = (MVT::SimpleValueType)i;
-
- // Do not attempt to promote non-512-bit vectors.
- if (!VT.is512BitVector())
- continue;
-
+ for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) {
setOperationAction(ISD::SELECT, VT, Promote);
AddPromotedToType (ISD::SELECT, VT, MVT::v8i64);
}
setOperationAction(ISD::MULHU, MVT::v32i16, Legal);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Legal);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Legal);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i16, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom);
setOperationAction(ISD::SELECT, MVT::v32i1, Custom);
setOperationAction(ISD::SELECT, MVT::v64i1, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i1, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i1, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom);
setOperationAction(ISD::VSELECT, MVT::v32i16, Legal);
setOperationAction(ISD::VSELECT, MVT::v64i8, Legal);
setOperationAction(ISD::TRUNCATE, MVT::v32i1, Custom);
if (Subtarget->hasVLX())
setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
- for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
- const MVT VT = (MVT::SimpleValueType)i;
-
- const unsigned EltSize = VT.getVectorElementType().getSizeInBits();
-
- // Do not attempt to promote non-512-bit vectors.
- if (!VT.is512BitVector())
- continue;
+ if (Subtarget->hasCDI()) {
+ setOperationAction(ISD::CTLZ, MVT::v32i16, Custom);
+ setOperationAction(ISD::CTLZ, MVT::v64i8, Custom);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v32i16, Custom);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v64i8, Custom);
+ }
- if (EltSize < 32) {
- setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
- setOperationAction(ISD::VSELECT, VT, Legal);
- }
+ for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ setOperationAction(ISD::VSELECT, VT, Legal);
}
}
// FIXME: We really should do custom legalization for addition and
// subtraction on x86-32 once PR3203 is fixed. We really can't do much better
// than generic legalization for 64-bit multiplication-with-overflow, though.
- for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) {
+ for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
+ if (VT == MVT::i64 && !Subtarget->is64Bit())
+ continue;
// Add/Sub/Mul with overflow operations are custom lowered.
- MVT VT = IntVTs[i];
setOperationAction(ISD::SADDO, VT, Custom);
setOperationAction(ISD::UADDO, VT, Custom);
setOperationAction(ISD::SSUBO, VT, Custom);
setOperationAction(ISD::UMULO, VT, Custom);
}
-
if (!Subtarget->is64Bit()) {
// These libcalls are not available in 32-bit.
setLibcallName(RTLIB::SHL_I128, nullptr);
MaxStoresPerMemmoveOptSize = 4;
setPrefLoopAlignment(4); // 2^4 bytes.
- // Predictable cmov don't hurt on atom because it's in-order.
+ // A predictable cmov does not hurt on an in-order CPU.
+ // FIXME: Use a CPU attribute to trigger this, not a CPU model.
PredictableSelectIsExpensive = !Subtarget->isAtom();
EnableExtLdPromotion = true;
setPrefFunctionAlignment(4); // 2^4 bytes.
if (!VT.isVector())
return Subtarget->hasAVX512() ? MVT::i1: MVT::i8;
- const unsigned NumElts = VT.getVectorNumElements();
- const EVT EltVT = VT.getVectorElementType();
- if (VT.is512BitVector()) {
- if (Subtarget->hasAVX512())
- if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
- EltVT == MVT::f32 || EltVT == MVT::f64)
- switch(NumElts) {
- case 8: return MVT::v8i1;
- case 16: return MVT::v16i1;
- }
- if (Subtarget->hasBWI())
- if (EltVT == MVT::i8 || EltVT == MVT::i16)
- switch(NumElts) {
- case 32: return MVT::v32i1;
- case 64: return MVT::v64i1;
- }
- }
+ if (VT.isSimple()) {
+ MVT VVT = VT.getSimpleVT();
+ const unsigned NumElts = VVT.getVectorNumElements();
+ const MVT EltVT = VVT.getVectorElementType();
+ if (VVT.is512BitVector()) {
+ if (Subtarget->hasAVX512())
+ if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
+ EltVT == MVT::f32 || EltVT == MVT::f64)
+ switch(NumElts) {
+ case 8: return MVT::v8i1;
+ case 16: return MVT::v16i1;
+ }
+ if (Subtarget->hasBWI())
+ if (EltVT == MVT::i8 || EltVT == MVT::i16)
+ switch(NumElts) {
+ case 32: return MVT::v32i1;
+ case 64: return MVT::v64i1;
+ }
+ }
- if (VT.is256BitVector() || VT.is128BitVector()) {
- if (Subtarget->hasVLX())
- if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
- EltVT == MVT::f32 || EltVT == MVT::f64)
- switch(NumElts) {
- case 2: return MVT::v2i1;
- case 4: return MVT::v4i1;
- case 8: return MVT::v8i1;
- }
- if (Subtarget->hasBWI() && Subtarget->hasVLX())
- if (EltVT == MVT::i8 || EltVT == MVT::i16)
- switch(NumElts) {
- case 8: return MVT::v8i1;
- case 16: return MVT::v16i1;
- case 32: return MVT::v32i1;
- }
+ if (VVT.is256BitVector() || VVT.is128BitVector()) {
+ if (Subtarget->hasVLX())
+ if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
+ EltVT == MVT::f32 || EltVT == MVT::f64)
+ switch(NumElts) {
+ case 2: return MVT::v2i1;
+ case 4: return MVT::v4i1;
+ case 8: return MVT::v8i1;
+ }
+ if (Subtarget->hasBWI() && Subtarget->hasVLX())
+ if (EltVT == MVT::i8 || EltVT == MVT::i16)
+ switch(NumElts) {
+ case 8: return MVT::v8i1;
+ case 16: return MVT::v16i1;
+ case 32: return MVT::v32i1;
+ }
+ }
}
return VT.changeVectorElementTypeToInteger();
return true;
}
+Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
+ if (!Subtarget->isTargetAndroid())
+ return TargetLowering::getSafeStackPointerLocation(IRB);
+
+ // Android provides a fixed TLS slot for the SafeStack pointer. See the
+ // definition of TLS_SLOT_SAFESTACK in
+ // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
+ unsigned AddressSpace, Offset;
+ if (Subtarget->is64Bit()) {
+ // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs:
+ Offset = 0x48;
+ if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
+ AddressSpace = 256;
+ else
+ AddressSpace = 257;
+ } else {
+ // %gs:0x24 on i386
+ Offset = 0x24;
+ AddressSpace = 256;
+ }
+
+ return ConstantExpr::getIntToPtr(
+ ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
+ Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
+}
+
bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
unsigned DestAS) const {
assert(SrcAS != DestAS && "Expected different address spaces!");
#include "X86GenCallingConv.inc"
-bool
-X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv,
- MachineFunction &MF, bool isVarArg,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- LLVMContext &Context) const {
+bool X86TargetLowering::CanLowerReturn(
+ CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
return CCInfo.CheckReturn(Outs, RetCC_X86);
MachinePointerInfo(), MachinePointerInfo());
}
-/// Return true if the calling convention is one that
-/// supports tail call optimization.
-static bool IsTailCallConvention(CallingConv::ID CC) {
+/// Return true if the calling convention is one that we can guarantee TCO for.
+static bool canGuaranteeTCO(CallingConv::ID CC) {
return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
- CC == CallingConv::HiPE);
+ CC == CallingConv::HiPE || CC == CallingConv::HHVM);
}
-/// \brief Return true if the calling convention is a C calling convention.
-static bool IsCCallConvention(CallingConv::ID CC) {
- return (CC == CallingConv::C || CC == CallingConv::X86_64_Win64 ||
- CC == CallingConv::X86_64_SysV);
+/// Return true if we might ever do TCO for calls with this calling convention.
+static bool mayTailCallThisCC(CallingConv::ID CC) {
+ switch (CC) {
+ // C calling conventions:
+ case CallingConv::C:
+ case CallingConv::X86_64_Win64:
+ case CallingConv::X86_64_SysV:
+ // Callee pop conventions:
+ case CallingConv::X86_ThisCall:
+ case CallingConv::X86_StdCall:
+ case CallingConv::X86_VectorCall:
+ case CallingConv::X86_FastCall:
+ return true;
+ default:
+ return canGuaranteeTCO(CC);
+ }
+}
+
+/// Return true if the function is being made into a tailcall target by
+/// changing its ABI.
+static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
+ return GuaranteedTailCallOpt && canGuaranteeTCO(CC);
}
bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
CallSite CS(CI);
CallingConv::ID CalleeCC = CS.getCallingConv();
- if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
+ if (!mayTailCallThisCC(CalleeCC))
return false;
return true;
}
-/// Return true if the function is being made into
-/// a tailcall target by changing its ABI.
-static bool FuncIsMadeTailCallSafe(CallingConv::ID CC,
- bool GuaranteedTailCallOpt) {
- return GuaranteedTailCallOpt && IsTailCallConvention(CC);
-}
-
SDValue
X86TargetLowering::LowerMemArgument(SDValue Chain,
CallingConv::ID CallConv,
unsigned i) const {
// Create the nodes corresponding to a load from this parameter slot.
ISD::ArgFlagsTy Flags = Ins[i].Flags;
- bool AlwaysUseMutable = FuncIsMadeTailCallSafe(
+ bool AlwaysUseMutable = shouldGuaranteeTCO(
CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
EVT ValVT;
return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
}
-SDValue
-X86TargetLowering::LowerFormalArguments(SDValue Chain,
- CallingConv::ID CallConv,
- bool isVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- SDLoc dl,
- SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals)
- const {
+SDValue X86TargetLowering::LowerFormalArguments(
+ SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const {
MachineFunction &MF = DAG.getMachineFunction();
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
bool Is64Bit = Subtarget->is64Bit();
bool IsWin64 = Subtarget->isCallingConvWin64(CallConv);
- assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
+ assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
"Var args not supported with calling convention fastcc, ghc or hipe");
// Assign locations to all of the incoming arguments.
unsigned StackSize = CCInfo.getNextStackOffset();
// Align stack specially for tail calls.
- if (FuncIsMadeTailCallSafe(CallConv,
- MF.getTarget().Options.GuaranteedTailCallOpt))
+ if (shouldGuaranteeTCO(CallConv,
+ MF.getTarget().Options.GuaranteedTailCallOpt))
StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
// If the function takes variable number of arguments, make a frame index for
}
MachineModuleInfo &MMI = MF.getMMI();
- const Function *WinEHParent = nullptr;
- if (MMI.hasWinEHFuncInfo(Fn))
- WinEHParent = MMI.getWinEHParent(Fn);
- bool IsWinEHParent = WinEHParent && WinEHParent == Fn;
// Figure out if XMM registers are in use.
assert(!(Subtarget->useSoftFloat() &&
} else {
FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
// If this is an sret function, the return should pop the hidden pointer.
- if (!Is64Bit && !IsTailCallConvention(CallConv) &&
+ if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
!Subtarget->getTargetTriple().isOSMSVCRT() &&
argsAreStructReturn(Ins) == StackStructReturn)
FuncInfo->setBytesToPopOnReturn(4);
FuncInfo->setArgumentStackSize(StackSize);
- if (IsWinEHParent) {
+ if (MMI.hasWinEHFuncInfo(Fn)) {
if (Is64Bit) {
int UnwindHelpFI = MFI->CreateStackObject(8, 8, /*isSS=*/false);
SDValue StackSlot = DAG.getFrameIndex(UnwindHelpFI, MVT::i64);
++NumTailCalls;
}
- assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
+ assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
"Var args not supported with calling convention fastcc, ghc or hipe");
// Analyze operands of the call, assigning locations to each operand.
CCInfo.AnalyzeCallOperands(Outs, CC_X86);
// Get a count of how many bytes are to be pushed on the stack.
- unsigned NumBytes = CCInfo.getNextStackOffset();
+ unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
if (IsSibcall)
// This is a sibcall. The memory operands are available in caller's
// own caller's stack.
NumBytes = 0;
else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
- IsTailCallConvention(CallConv))
+ canGuaranteeTCO(CallConv))
NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
int FPDiff = 0;
const uint32_t *Mask = RegInfo->getCallPreservedMask(MF, CallConv);
assert(Mask && "Missing call preserved mask for calling convention");
- // If this is an invoke in a 32-bit function using an MSVC personality, assume
- // the function clobbers all registers. If an exception is thrown, the runtime
- // will not restore CSRs.
+ // If this is an invoke in a 32-bit function using a funclet-based
+ // personality, assume the function clobbers all registers. If an exception
+ // is thrown, the runtime will not restore CSRs.
// FIXME: Model this more precisely so that we can register allocate across
// the normal edge and spill and fill across the exceptional edge.
if (!Is64Bit && CLI.CS && CLI.CS->isInvoke()) {
CallerFn->hasPersonalityFn()
? classifyEHPersonality(CallerFn->getPersonalityFn())
: EHPersonality::Unknown;
- if (isMSVCEHPersonality(Pers))
+ if (isFuncletEHPersonality(Pers))
Mask = RegInfo->getNoPreservedMask();
}
if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
DAG.getTarget().Options.GuaranteedTailCallOpt))
NumBytesForCalleeToPop = NumBytes; // Callee pops everything
- else if (!Is64Bit && !IsTailCallConvention(CallConv) &&
+ else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
!Subtarget->getTargetTriple().isOSMSVCRT() &&
SR == StackStructReturn)
// If this is a call to a struct-return function, the callee
/// Check whether the call is eligible for tail call optimization. Targets
/// that want to do tail call optimization should implement this function.
-bool
-X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
- CallingConv::ID CalleeCC,
- bool isVarArg,
- bool isCalleeStructRet,
- bool isCallerStructRet,
- Type *RetTy,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- SelectionDAG &DAG) const {
- if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
+bool X86TargetLowering::IsEligibleForTailCallOptimization(
+ SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
+ bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
+ if (!mayTailCallThisCC(CalleeCC))
return false;
// If -tailcallopt is specified, make fastcc functions tail-callable.
- const MachineFunction &MF = DAG.getMachineFunction();
+ MachineFunction &MF = DAG.getMachineFunction();
const Function *CallerF = MF.getFunction();
// If the function return type is x86_fp80 and the callee return type is not,
return false;
if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
- if (IsTailCallConvention(CalleeCC) && CCMatch)
+ if (canGuaranteeTCO(CalleeCC) && CCMatch)
return true;
return false;
}
if (isCalleeStructRet || isCallerStructRet)
return false;
- // An stdcall/thiscall caller is expected to clean up its arguments; the
- // callee isn't going to do that.
- // FIXME: this is more restrictive than needed. We could produce a tailcall
- // when the stack adjustment matches. For example, with a thiscall that takes
- // only one argument.
- if (!CCMatch && (CallerCC == CallingConv::X86_StdCall ||
- CallerCC == CallingConv::X86_ThisCall))
- return false;
-
// Do not sibcall optimize vararg calls unless all arguments are passed via
// registers.
if (isVarArg && !Outs.empty()) {
-
// Optimizing for varargs on Win64 is unlikely to be safe without
// additional testing.
if (IsCalleeWin64 || IsCallerWin64)
}
}
+ unsigned StackArgsSize = 0;
+
// If the callee takes no arguments then go on to check the results of the
// call.
if (!Outs.empty()) {
CCInfo.AllocateStack(32, 8);
CCInfo.AnalyzeCallOperands(Outs, CC_X86);
- if (CCInfo.getNextStackOffset()) {
- MachineFunction &MF = DAG.getMachineFunction();
- if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn())
- return false;
+ StackArgsSize = CCInfo.getNextStackOffset();
+ if (CCInfo.getNextStackOffset()) {
// Check if the arguments are already laid out in the right way as
// the caller's fixed stack objects.
MachineFrameInfo *MFI = MF.getFrameInfo();
}
}
+ bool CalleeWillPop =
+ X86::isCalleePop(CalleeCC, Subtarget->is64Bit(), isVarArg,
+ MF.getTarget().Options.GuaranteedTailCallOpt);
+
+ if (unsigned BytesToPop =
+ MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
+ // If we have bytes to pop, the callee must pop them.
+ bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
+ if (!CalleePopMatches)
+ return false;
+ } else if (CalleeWillPop && StackArgsSize > 0) {
+ // If we don't have bytes to pop, make sure the callee doesn't pop any.
+ return false;
+ }
+
return true;
}
/// Determines whether the callee is required to pop its own arguments.
/// Callee pop is necessary to support tail calls.
bool X86::isCalleePop(CallingConv::ID CallingConv,
- bool is64Bit, bool IsVarArg, bool TailCallOpt) {
+ bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
+ // If GuaranteeTCO is true, we force some calls to be callee pop so that we
+ // can guarantee TCO.
+ if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
+ return true;
+
switch (CallingConv) {
default:
return false;
case CallingConv::X86_StdCall:
case CallingConv::X86_FastCall:
case CallingConv::X86_ThisCall:
+ case CallingConv::X86_VectorCall:
return !is64Bit;
- case CallingConv::Fast:
- case CallingConv::GHC:
- case CallingConv::HiPE:
- if (IsVarArg)
- return false;
- return TailCallOpt;
}
}
return getInsertVINSERTImmediate(N, 256);
}
-/// Returns true if Elt is a constant integer zero
+/// Returns true if V is a constant integer zero.
static bool isZero(SDValue V) {
ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
return C && C->isNullValue();
return false;
}
+// Build a vector of constants
+// Use an UNDEF node if MaskElt == -1.
+// Spilt 64-bit constants in the 32-bit mode.
+static SDValue getConstVector(ArrayRef<int> Values, EVT VT,
+ SelectionDAG &DAG,
+ SDLoc dl, bool IsMask = false) {
+
+ SmallVector<SDValue, 32> Ops;
+ bool Split = false;
+
+ EVT ConstVecVT = VT;
+ unsigned NumElts = VT.getVectorNumElements();
+ bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
+ if (!In64BitMode && VT.getScalarType() == MVT::i64) {
+ ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
+ Split = true;
+ }
+
+ EVT EltVT = ConstVecVT.getScalarType();
+ for (unsigned i = 0; i < NumElts; ++i) {
+ bool IsUndef = Values[i] < 0 && IsMask;
+ SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
+ DAG.getConstant(Values[i], dl, EltVT);
+ Ops.push_back(OpNode);
+ if (Split)
+ Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
+ DAG.getConstant(0, dl, EltVT));
+ }
+ SDValue ConstsNode = DAG.getNode(ISD::BUILD_VECTOR, dl, ConstVecVT, Ops);
+ if (Split)
+ ConstsNode = DAG.getBitcast(VT, ConstsNode);
+ return ConstsNode;
+}
+
/// Returns a vector of specified type with all zero elements.
static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
SelectionDAG &DAG, SDLoc dl) {
// Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
+ assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
// This is the index of the first element of the vectorWidth-bit chunk
- // we want.
- unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / vectorWidth)
- * ElemsPerChunk);
+ // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
+ IdxVal &= ~(ElemsPerChunk - 1);
// If the input is a buildvector just emit a smaller one.
if (Vec.getOpcode() == ISD::BUILD_VECTOR)
return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
- makeArrayRef(Vec->op_begin() + NormalizedIdxVal,
- ElemsPerChunk));
+ makeArrayRef(Vec->op_begin() + IdxVal, ElemsPerChunk));
- SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal, dl);
+ SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
}
// Insert the relevant vectorWidth bits.
unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
+ assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
// This is the index of the first element of the vectorWidth-bit chunk
- // we want.
- unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/vectorWidth)
- * ElemsPerChunk);
+ // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
+ IdxVal &= ~(ElemsPerChunk - 1);
- SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal, dl);
+ SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
}
return LowerAVXCONCAT_VECTORS(Op, DAG);
}
-
//===----------------------------------------------------------------------===//
// Vector shuffle lowering
//
SDValue V1, SDValue V2,
SelectionDAG &DAG) {
int NumElts = VT.getVectorNumElements();
- bool Unpckl = true;
- bool Unpckh = true;
- bool UnpcklSwapped = true;
- bool UnpckhSwapped = true;
int NumEltsInLane = 128 / VT.getScalarSizeInBits();
+ SmallVector<int, 8> Unpckl;
+ SmallVector<int, 8> Unpckh;
for (int i = 0; i < NumElts; ++i) {
unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
-
int LoPos = (i % NumEltsInLane) / 2 + LaneStart + NumElts * (i % 2);
int HiPos = LoPos + NumEltsInLane / 2;
- int LoPosSwapped = (LoPos + NumElts) % (NumElts * 2);
- int HiPosSwapped = (HiPos + NumElts) % (NumElts * 2);
-
- if (Mask[i] == -1)
- continue;
- if (Mask[i] != LoPos)
- Unpckl = false;
- if (Mask[i] != HiPos)
- Unpckh = false;
- if (Mask[i] != LoPosSwapped)
- UnpcklSwapped = false;
- if (Mask[i] != HiPosSwapped)
- UnpckhSwapped = false;
- if (!Unpckl && !Unpckh && !UnpcklSwapped && !UnpckhSwapped)
- return SDValue();
+ Unpckl.push_back(LoPos);
+ Unpckh.push_back(HiPos);
}
- if (Unpckl)
+
+ if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
- if (Unpckh)
+ if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
- if (UnpcklSwapped)
+
+ // Commute and try again.
+ ShuffleVectorSDNode::commuteMask(Unpckl);
+ if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
- if (UnpckhSwapped)
+
+ ShuffleVectorSDNode::commuteMask(Unpckh);
+ if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
- llvm_unreachable("Unexpected result of UNPCK mask analysis");
return SDValue();
}
/// This doesn't do any checks for the availability of instructions for blending
/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
/// be matched in the backend with the type given. What it does check for is
-/// that the shuffle mask is in fact a blend.
+/// that the shuffle mask is a blend, or convertible into a blend with zero.
static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
- SDValue V2, ArrayRef<int> Mask,
+ SDValue V2, ArrayRef<int> Original,
const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
+ bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
+ bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
+ SmallVector<int, 8> Mask(Original.begin(), Original.end());
+ SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+ bool ForceV1Zero = false, ForceV2Zero = false;
+
+ // Attempt to generate the binary blend mask. If an input is zero then
+ // we can use any lane.
+ // TODO: generalize the zero matching to any scalar like isShuffleEquivalent.
unsigned BlendMask = 0;
for (int i = 0, Size = Mask.size(); i < Size; ++i) {
- if (Mask[i] >= Size) {
- if (Mask[i] != i + Size)
- return SDValue(); // Shuffled V2 input!
+ int M = Mask[i];
+ if (M < 0)
+ continue;
+ if (M == i)
+ continue;
+ if (M == i + Size) {
BlendMask |= 1u << i;
continue;
}
- if (Mask[i] >= 0 && Mask[i] != i)
- return SDValue(); // Shuffled V1 input!
+ if (Zeroable[i]) {
+ if (V1IsZero) {
+ ForceV1Zero = true;
+ Mask[i] = i;
+ continue;
+ }
+ if (V2IsZero) {
+ ForceV2Zero = true;
+ BlendMask |= 1u << i;
+ Mask[i] = i + Size;
+ continue;
+ }
+ }
+ return SDValue(); // Shuffled input!
}
+
+ // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
+ if (ForceV1Zero)
+ V1 = getZeroVector(VT, Subtarget, DAG, DL);
+ if (ForceV2Zero)
+ V2 = getZeroVector(VT, Subtarget, DAG, DL);
+
+ auto ScaleBlendMask = [](unsigned BlendMask, int Size, int Scale) {
+ unsigned ScaledMask = 0;
+ for (int i = 0; i != Size; ++i)
+ if (BlendMask & (1u << i))
+ for (int j = 0; j != Scale; ++j)
+ ScaledMask |= 1u << (i * Scale + j);
+ return ScaledMask;
+ };
+
switch (VT.SimpleTy) {
case MVT::v2f64:
case MVT::v4f32:
if (Subtarget->hasAVX2()) {
// Scale the blend by the number of 32-bit dwords per element.
int Scale = VT.getScalarSizeInBits() / 32;
- BlendMask = 0;
- for (int i = 0, Size = Mask.size(); i < Size; ++i)
- if (Mask[i] >= Size)
- for (int j = 0; j < Scale; ++j)
- BlendMask |= 1u << (i * Scale + j);
-
+ BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale);
MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
V1 = DAG.getBitcast(BlendVT, V1);
V2 = DAG.getBitcast(BlendVT, V2);
// For integer shuffles we need to expand the mask and cast the inputs to
// v8i16s prior to blending.
int Scale = 8 / VT.getVectorNumElements();
- BlendMask = 0;
- for (int i = 0, Size = Mask.size(); i < Size; ++i)
- if (Mask[i] >= Size)
- for (int j = 0; j < Scale; ++j)
- BlendMask |= 1u << (i * Scale + j);
-
+ BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale);
V1 = DAG.getBitcast(MVT::v8i16, V1);
V2 = DAG.getBitcast(MVT::v8i16, V2);
return DAG.getBitcast(VT,
// FALLTHROUGH
case MVT::v16i8:
case MVT::v32i8: {
- assert((VT.getSizeInBits() == 128 || Subtarget->hasAVX2()) &&
+ assert((VT.is128BitVector() || Subtarget->hasAVX2()) &&
"256-bit byte-blends require AVX2 support!");
// Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
DAG.getConstant(Rotation * Scale, DL, MVT::i8)));
}
- assert(VT.getSizeInBits() == 128 &&
+ assert(VT.is128BitVector() &&
"Rotate-based lowering only supports 128-bit lowering!");
assert(Mask.size() <= 16 &&
"Can shuffle at most 16 bytes in a 128-bit vector!");
// Determine the extraction length from the part of the
// lower half that isn't zeroable.
int Len = HalfSize;
- for (; Len >= 0; --Len)
+ for (; Len > 0; --Len)
if (!Zeroable[Len - 1])
break;
assert(Len > 0 && "Zeroable shuffle mask");
M = M % Size;
// All mask elements must be in the lower half.
- if (M > HalfSize)
+ if (M >= HalfSize)
return SDValue();
if (Idx < 0 || (Src == V && Idx == (M - i))) {
///
/// Given a specific number of elements, element bit width, and extension
/// stride, produce either a zero or any extension based on the available
-/// features of the subtarget.
+/// features of the subtarget. The extended elements are consecutive and
+/// begin and can start from an offseted element index in the input; to
+/// avoid excess shuffling the offset must either being in the bottom lane
+/// or at the start of a higher lane. All extended elements must be from
+/// the same lane.
static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
- SDLoc DL, MVT VT, int Scale, bool AnyExt, SDValue InputV,
+ SDLoc DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
ArrayRef<int> Mask, const X86Subtarget *Subtarget, SelectionDAG &DAG) {
assert(Scale > 1 && "Need a scale to extend.");
- int NumElements = VT.getVectorNumElements();
int EltBits = VT.getScalarSizeInBits();
+ int NumElements = VT.getVectorNumElements();
+ int NumEltsPerLane = 128 / EltBits;
+ int OffsetLane = Offset / NumEltsPerLane;
assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
"Only 8, 16, and 32 bit elements can be extended.");
assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
+ assert(0 <= Offset && "Extension offset must be positive.");
+ assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
+ "Extension offset must be in the first lane or start an upper lane.");
+
+ // Check that an index is in same lane as the base offset.
+ auto SafeOffset = [&](int Idx) {
+ return OffsetLane == (Idx / NumEltsPerLane);
+ };
+
+ // Shift along an input so that the offset base moves to the first element.
+ auto ShuffleOffset = [&](SDValue V) {
+ if (!Offset)
+ return V;
+
+ SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
+ for (int i = 0; i * Scale < NumElements; ++i) {
+ int SrcIdx = i + Offset;
+ ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
+ }
+ return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
+ };
// Found a valid zext mask! Try various lowering strategies based on the
// input type and available ISA extensions.
if (Subtarget->hasSSE41()) {
+ // Not worth offseting 128-bit vectors if scale == 2, a pattern using
+ // PUNPCK will catch this in a later shuffle match.
+ if (Offset && Scale == 2 && VT.is128BitVector())
+ return SDValue();
MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
NumElements / Scale);
- return DAG.getBitcast(VT, DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV));
+ InputV = DAG.getNode(X86ISD::VZEXT, DL, ExtVT, ShuffleOffset(InputV));
+ return DAG.getBitcast(VT, InputV);
}
+ assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
+
// For any extends we can cheat for larger element sizes and use shuffle
// instructions that can fold with a load and/or copy.
if (AnyExt && EltBits == 32) {
- int PSHUFDMask[4] = {0, -1, 1, -1};
+ int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
+ -1};
return DAG.getBitcast(
VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
DAG.getBitcast(MVT::v4i32, InputV),
getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
}
if (AnyExt && EltBits == 16 && Scale > 2) {
- int PSHUFDMask[4] = {0, -1, 0, -1};
+ int PSHUFDMask[4] = {Offset / 2, -1,
+ SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
DAG.getBitcast(MVT::v4i32, InputV),
getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
- int PSHUFHWMask[4] = {1, -1, -1, -1};
+ int PSHUFWMask[4] = {1, -1, -1, -1};
+ unsigned OddEvenOp = (Offset & 1 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW);
return DAG.getBitcast(
- VT, DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16,
+ VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
DAG.getBitcast(MVT::v8i16, InputV),
- getV4X86ShuffleImm8ForMask(PSHUFHWMask, DL, DAG)));
+ getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
}
// The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
// to 64-bits.
if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget->hasSSE4A()) {
assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
- assert(VT.getSizeInBits() == 128 && "Unexpected vector width!");
+ assert(VT.is128BitVector() && "Unexpected vector width!");
+ int LoIdx = Offset * EltBits;
SDValue Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
DAG.getConstant(EltBits, DL, MVT::i8),
- DAG.getConstant(0, DL, MVT::i8)));
- if (isUndefInRange(Mask, NumElements/2, NumElements/2))
+ DAG.getConstant(LoIdx, DL, MVT::i8)));
+
+ if (isUndefInRange(Mask, NumElements / 2, NumElements / 2) ||
+ !SafeOffset(Offset + 1))
return DAG.getNode(ISD::BITCAST, DL, VT, Lo);
- SDValue Hi =
- DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
- DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
- DAG.getConstant(EltBits, DL, MVT::i8),
- DAG.getConstant(EltBits, DL, MVT::i8)));
+ int HiIdx = (Offset + 1) * EltBits;
+ SDValue Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
+ DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
+ DAG.getConstant(EltBits, DL, MVT::i8),
+ DAG.getConstant(HiIdx, DL, MVT::i8)));
return DAG.getNode(ISD::BITCAST, DL, VT,
DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
}
if (Scale > 4 && EltBits == 8 && Subtarget->hasSSSE3()) {
assert(NumElements == 16 && "Unexpected byte vector width!");
SDValue PSHUFBMask[16];
- for (int i = 0; i < 16; ++i)
- PSHUFBMask[i] =
- DAG.getConstant((i % Scale == 0) ? i / Scale : 0x80, DL, MVT::i8);
+ for (int i = 0; i < 16; ++i) {
+ int Idx = Offset + (i / Scale);
+ PSHUFBMask[i] = DAG.getConstant(
+ (i % Scale == 0 && SafeOffset(Idx)) ? Idx : 0x80, DL, MVT::i8);
+ }
InputV = DAG.getBitcast(MVT::v16i8, InputV);
return DAG.getBitcast(VT,
DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
MVT::v16i8, PSHUFBMask)));
}
+ // If we are extending from an offset, ensure we start on a boundary that
+ // we can unpack from.
+ int AlignToUnpack = Offset % (NumElements / Scale);
+ if (AlignToUnpack) {
+ SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
+ for (int i = AlignToUnpack; i < NumElements; ++i)
+ ShMask[i - AlignToUnpack] = i;
+ InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
+ Offset -= AlignToUnpack;
+ }
+
// Otherwise emit a sequence of unpacks.
do {
+ unsigned UnpackLoHi = X86ISD::UNPCKL;
+ if (Offset >= (NumElements / 2)) {
+ UnpackLoHi = X86ISD::UNPCKH;
+ Offset -= (NumElements / 2);
+ }
+
MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
: getZeroVector(InputVT, Subtarget, DAG, DL);
InputV = DAG.getBitcast(InputVT, InputV);
- InputV = DAG.getNode(X86ISD::UNPCKL, DL, InputVT, InputV, Ext);
+ InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
Scale /= 2;
EltBits *= 2;
NumElements /= 2;
SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
int Bits = VT.getSizeInBits();
+ int NumLanes = Bits / 128;
int NumElements = VT.getVectorNumElements();
+ int NumEltsPerLane = NumElements / NumLanes;
assert(VT.getScalarSizeInBits() <= 32 &&
"Exceeds 32-bit integer zero extension limit");
assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
auto Lower = [&](int Scale) -> SDValue {
SDValue InputV;
bool AnyExt = true;
+ int Offset = 0;
+ int Matches = 0;
for (int i = 0; i < NumElements; ++i) {
- if (Mask[i] == -1)
+ int M = Mask[i];
+ if (M == -1)
continue; // Valid anywhere but doesn't tell us anything.
if (i % Scale != 0) {
// Each of the extended elements need to be zeroable.
// Each of the base elements needs to be consecutive indices into the
// same input vector.
- SDValue V = Mask[i] < NumElements ? V1 : V2;
- if (!InputV)
+ SDValue V = M < NumElements ? V1 : V2;
+ M = M % NumElements;
+ if (!InputV) {
InputV = V;
- else if (InputV != V)
+ Offset = M - (i / Scale);
+ } else if (InputV != V)
return SDValue(); // Flip-flopping inputs.
- if (Mask[i] % NumElements != i / Scale)
+ // Offset must start in the lowest 128-bit lane or at the start of an
+ // upper lane.
+ // FIXME: Is it ever worth allowing a negative base offset?
+ if (!((0 <= Offset && Offset < NumEltsPerLane) ||
+ (Offset % NumEltsPerLane) == 0))
+ return SDValue();
+
+ // If we are offsetting, all referenced entries must come from the same
+ // lane.
+ if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
+ return SDValue();
+
+ if ((M % NumElements) != (Offset + (i / Scale)))
return SDValue(); // Non-consecutive strided elements.
+ Matches++;
}
// If we fail to find an input, we have a zero-shuffle which should always
if (!InputV)
return SDValue();
+ // If we are offsetting, don't extend if we only match a single input, we
+ // can always do better by using a basic PSHUF or PUNPCK.
+ if (Offset != 0 && Matches < 2)
+ return SDValue();
+
return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
- DL, VT, Scale, AnyExt, InputV, Mask, Subtarget, DAG);
+ DL, VT, Scale, Offset, AnyExt, InputV, Mask, Subtarget, DAG);
};
// The widest scale possible for extending is to a 64-bit integer.
return Blend;
// Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(V1, V2, Mask, {0, 2}))
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2f64, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask, {1, 3}))
- return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2f64, V1, V2);
+ if (SDValue V =
+ lowerVectorShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
+ return V;
unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
return Blend;
// Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(V1, V2, Mask, {0, 2}))
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask, {1, 3}))
- return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2i64, V1, V2);
+ if (SDValue V =
+ lowerVectorShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
+ return V;
// Try to use byte rotation instructions.
// Its more profitable for pre-SSSE3 to use shuffles/unpacks.
}
// Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(V1, V2, Mask, {0, 4, 1, 5}))
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask, {2, 6, 3, 7}))
- return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask, {4, 0, 5, 1}))
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V2, V1);
- if (isShuffleEquivalent(V1, V2, Mask, {6, 2, 7, 3}))
- return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V2, V1);
+ if (SDValue V =
+ lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
+ return V;
// Otherwise fall back to a SHUFPS lowering strategy.
return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
return Masked;
// Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(V1, V2, Mask, {0, 4, 1, 5}))
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask, {2, 6, 3, 7}))
- return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask, {4, 0, 5, 1}))
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V2, V1);
- if (isShuffleEquivalent(V1, V2, Mask, {6, 2, 7, 3}))
- return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V2, V1);
+ if (SDValue V =
+ lowerVectorShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
+ return V;
// Try to use byte rotation instructions.
// Its more profitable for pre-SSSE3 to use shuffles/unpacks.
return Shift;
// Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(V1, V1, Mask, {0, 0, 1, 1, 2, 2, 3, 3}))
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V1, V1);
- if (isShuffleEquivalent(V1, V1, Mask, {4, 4, 5, 5, 6, 6, 7, 7}))
- return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V1, V1);
+ if (SDValue V =
+ lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
+ return V;
// Try to use byte rotation instructions.
if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1,
return Masked;
// Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 1, 9, 2, 10, 3, 11}))
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask, {4, 12, 5, 13, 6, 14, 7, 15}))
- return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V1, V2);
+ if (SDValue V =
+ lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
+ return V;
// Try to use byte rotation instructions.
if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
return Masked;
// Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(V1, V2, Mask, {// Low half.
- 0, 16, 1, 17, 2, 18, 3, 19,
- // High half.
- 4, 20, 5, 21, 6, 22, 7, 23}))
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask, {// Low half.
- 8, 24, 9, 25, 10, 26, 11, 27,
- // High half.
- 12, 28, 13, 29, 14, 30, 15, 31}))
- return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V1, V2);
+ if (SDValue V =
+ lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
+ return V;
// Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
// with PSHUFB. It is important to do this before we attempt to generate any
ArrayRef<int> Mask,
SelectionDAG &DAG) {
// FIXME: This should probably be generalized for 512-bit vectors as well.
- assert(VT.getSizeInBits() == 256 && "Only for 256-bit vector shuffles!");
+ assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
int LaneSize = Mask.size() / 2;
// If there are only inputs from one 128-bit lane, splitting will in fact be
DAG);
}
- // X86 has dedicated unpack instructions that can handle specific blend
- // operations: UNPCKH and UNPCKL.
- if (isShuffleEquivalent(V1, V2, Mask, {0, 4, 2, 6}))
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f64, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask, {1, 5, 3, 7}))
- return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask, {4, 0, 6, 2}))
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f64, V2, V1);
- if (isShuffleEquivalent(V1, V2, Mask, {5, 1, 7, 3}))
- return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V2, V1);
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (SDValue V =
+ lowerVectorShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
+ return V;
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
Subtarget, DAG))
return Shift;
// Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(V1, V2, Mask, {0, 4, 2, 6}))
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask, {1, 5, 3, 7}))
- return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask, {4, 0, 6, 2}))
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V2, V1);
- if (isShuffleEquivalent(V1, V2, Mask, {5, 1, 7, 3}))
- return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V2, V1);
+ if (SDValue V =
+ lowerVectorShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
+ return V;
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle. However, if we have AVX2 and either inputs are already in place,
getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
// Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 1, 9, 4, 12, 5, 13}))
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f32, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask, {2, 10, 3, 11, 6, 14, 7, 15}))
- return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f32, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask, {8, 0, 9, 1, 12, 4, 13, 5}))
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f32, V2, V1);
- if (isShuffleEquivalent(V1, V2, Mask, {10, 2, 11, 3, 14, 6, 15, 7}))
- return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f32, V2, V1);
+ if (SDValue V =
+ lowerVectorShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
+ return V;
// Otherwise, fall back to a SHUFPS sequence. Here it is important that we
// have already handled any direct blends. We also need to squash the
getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
// Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 1, 9, 4, 12, 5, 13}))
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i32, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask, {2, 10, 3, 11, 6, 14, 7, 15}))
- return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i32, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask, {8, 0, 9, 1, 12, 4, 13, 5}))
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i32, V2, V1);
- if (isShuffleEquivalent(V1, V2, Mask, {10, 2, 11, 3, 14, 6, 15, 7}))
- return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i32, V2, V1);
+ if (SDValue V =
+ lowerVectorShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
+ return V;
}
// Try to use shift instructions.
return Blend;
// Use dedicated unpack instructions for masks that match their pattern.
- if (isShuffleEquivalent(V1, V2, Mask,
- {// First 128-bit lane:
- 0, 16, 1, 17, 2, 18, 3, 19,
- // Second 128-bit lane:
- 8, 24, 9, 25, 10, 26, 11, 27}))
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i16, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask,
- {// First 128-bit lane:
- 4, 20, 5, 21, 6, 22, 7, 23,
- // Second 128-bit lane:
- 12, 28, 13, 29, 14, 30, 15, 31}))
- return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i16, V1, V2);
+ if (SDValue V =
+ lowerVectorShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
+ return V;
// Try to use shift instructions.
if (SDValue Shift =
return Blend;
// Use dedicated unpack instructions for masks that match their pattern.
- // Note that these are repeated 128-bit lane unpacks, not unpacks across all
- // 256-bit lanes.
- if (isShuffleEquivalent(
- V1, V2, Mask,
- {// First 128-bit lane:
- 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39,
- // Second 128-bit lane:
- 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55}))
- return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v32i8, V1, V2);
- if (isShuffleEquivalent(
- V1, V2, Mask,
- {// First 128-bit lane:
- 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47,
- // Second 128-bit lane:
- 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63}))
- return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v32i8, V1, V2);
+ if (SDValue V =
+ lowerVectorShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
+ return V;
// Try to use shift instructions.
if (SDValue Shift =
DL, VT, V1, V2, Mask, Subtarget, DAG))
return Insertion;
- // There is a really nice hard cut-over between AVX1 and AVX2 that means we can
- // check for those subtargets here and avoid much of the subtarget querying in
- // the per-vector-type lowering routines. With AVX1 we have essentially *zero*
- // ability to manipulate a 256-bit vector with integer types. Since we'll use
- // floating point types there eventually, just immediately cast everything to
- // a float and operate entirely in that domain.
+ // There is a really nice hard cut-over between AVX1 and AVX2 that means we
+ // can check for those subtargets here and avoid much of the subtarget
+ // querying in the per-vector-type lowering routines. With AVX1 we have
+ // essentially *zero* ability to manipulate a 256-bit vector with integer
+ // types. Since we'll use floating point types there eventually, just
+ // immediately cast everything to a float and operate entirely in that domain.
if (VT.isInteger() && !Subtarget->hasAVX2()) {
int ElementBits = VT.getScalarSizeInBits();
if (ElementBits < 32)
}
}
+/// \brief Try to lower a vector shuffle as a 128-bit shuffles.
+static SDValue lowerV4X128VectorShuffle(SDLoc DL, MVT VT,
+ ArrayRef<int> Mask,
+ SDValue V1, SDValue V2,
+ SelectionDAG &DAG) {
+ assert(VT.getScalarSizeInBits() == 64 &&
+ "Unexpected element type size for 128bit shuffle.");
+
+ // To handle 256 bit vector requires VLX and most probably
+ // function lowerV2X128VectorShuffle() is better solution.
+ assert(VT.is512BitVector() && "Unexpected vector size for 128bit shuffle.");
+
+ SmallVector<int, 4> WidenedMask;
+ if (!canWidenShuffleElements(Mask, WidenedMask))
+ return SDValue();
+
+ // Form a 128-bit permutation.
+ // Convert the 64-bit shuffle mask selection values into 128-bit selection
+ // bits defined by a vshuf64x2 instruction's immediate control byte.
+ unsigned PermMask = 0, Imm = 0;
+ unsigned ControlBitsNum = WidenedMask.size() / 2;
+
+ for (int i = 0, Size = WidenedMask.size(); i < Size; ++i) {
+ if (WidenedMask[i] == SM_SentinelZero)
+ return SDValue();
+
+ // Use first element in place of undef mask.
+ Imm = (WidenedMask[i] == SM_SentinelUndef) ? 0 : WidenedMask[i];
+ PermMask |= (Imm % WidenedMask.size()) << (i * ControlBitsNum);
+ }
+
+ return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
+ DAG.getConstant(PermMask, DL, MVT::i8));
+}
+
static SDValue lowerVectorShuffleWithPERMV(SDLoc DL, MVT VT,
ArrayRef<int> Mask, SDValue V1,
SDValue V2, SelectionDAG &DAG) {
MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements());
- SmallVector<SDValue, 32> VPermMask;
- for (unsigned i = 0; i < VT.getVectorNumElements(); ++i)
- VPermMask.push_back(Mask[i] < 0 ? DAG.getUNDEF(MaskEltVT) :
- DAG.getConstant(Mask[i], DL, MaskEltVT));
- SDValue MaskNode = DAG.getNode(ISD::BUILD_VECTOR, DL, MaskVecVT,
- VPermMask);
+ SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true);
if (isSingleInputShuffleMask(Mask))
return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1);
ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
+ if (SDValue Shuf128 =
+ lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, V1, V2, DAG))
+ return Shuf128;
+
if (SDValue Unpck =
lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
return Unpck;
/// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
static SDValue lowerV16F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
- const X86Subtarget *Subtarget,
- SelectionDAG &DAG) {
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
SDLoc DL(Op);
assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
+ if (SDValue Shuf128 =
+ lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG))
+ return Shuf128;
+
if (SDValue Unpck =
lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
return Unpck;
/// \brief Handle lowering of 16-lane 32-bit integer shuffles.
static SDValue lowerV16I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
- const X86Subtarget *Subtarget,
- SelectionDAG &DAG) {
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
SDLoc DL(Op);
assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
EVT ExtVT;
switch (VT.SimpleTy) {
default:
- assert(false && "Expected a vector of i1 elements");
- break;
+ llvm_unreachable("Expected a vector of i1 elements");
case MVT::v2i1:
ExtVT = MVT::v2i64;
break;
}
// For each vector width, delegate to a specialized lowering routine.
- if (VT.getSizeInBits() == 128)
+ if (VT.is128BitVector())
return lower128BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
- if (VT.getSizeInBits() == 256)
+ if (VT.is256BitVector())
return lower256BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
- if (VT.getSizeInBits() == 512)
+ if (VT.is512BitVector())
return lower512BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
if (Is1BitVector)
unsigned &MaskValue) {
MaskValue = 0;
unsigned NumElems = BuildVector->getNumOperands();
+
// There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
+ // We don't handle the >2 lanes case right now.
unsigned NumLanes = (NumElems - 1) / 8 + 1;
+ if (NumLanes > 2)
+ return false;
+
unsigned NumElemsInLane = NumElems / NumLanes;
// Blend for v16i16 should be symmetric for the both lanes.
if (isa<ConstantSDNode>(SndLaneEltCond))
Lane2Cond = !isZero(SndLaneEltCond);
+ unsigned LaneMask = 0;
if (Lane1Cond == Lane2Cond || Lane2Cond < 0)
// Lane1Cond != 0, means we want the first argument.
// Lane1Cond == 0, means we want the second argument.
// The encoding of this argument is 0 for the first argument, 1
// for the second. Therefore, invert the condition.
- MaskValue |= !Lane1Cond << i;
+ LaneMask = !Lane1Cond << i;
else if (Lane1Cond < 0)
- MaskValue |= !Lane2Cond << i;
+ LaneMask = !Lane2Cond << i;
else
return false;
+
+ MaskValue |= LaneMask;
+ if (NumLanes == 2)
+ MaskValue |= LaneMask << NumElemsInLane;
}
return true;
}
MVT EltVT = VecVT.getVectorElementType();
unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
+ assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
- //if (IdxVal >= NumElems/2)
- // IdxVal -= NumElems/2;
- IdxVal -= (IdxVal/ElemsPerChunk)*ElemsPerChunk;
+ // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
+ // this can be done with a mask.
+ IdxVal &= ElemsPerChunk - 1;
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
DAG.getConstant(IdxVal, dl, MVT::i32));
}
// Insert the element into the desired chunk.
unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
- unsigned IdxIn128 = IdxVal - (IdxVal / NumEltsIn128) * NumEltsIn128;
+ assert(isPowerOf2_32(NumEltsIn128));
+ // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
+ unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
DAG.getConstant(IdxIn128, dl, MVT::i32));
// float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
// return (float4) lo + fhi;
+ // We shouldn't use it when unsafe-fp-math is enabled though: we might later
+ // reassociate the two FADDs, and if we do that, the algorithm fails
+ // spectacularly (PR24512).
+ // FIXME: If we ever have some kind of Machine FMF, this should be marked
+ // as non-fast and always be enabled. Why isn't SDAG FMF enough? Because
+ // there's also the MachineCombiner reassociations happening on Machine IR.
+ if (DAG.getTarget().Options.UnsafeFPMath)
+ return SDValue();
+
SDLoc DL(Op);
SDValue V = Op->getOperand(0);
EVT VecIntVT = V.getValueType();
}
// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
-// is legal, or has an f16 source (which needs to be promoted to f32),
+// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
// just return an <SDValue(), SDValue()> pair.
// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
// to i16, i32 or i64, and we lower it to a legal sequence.
EVT TheVT = Op.getOperand(0).getValueType();
auto PtrVT = getPointerTy(DAG.getDataLayout());
- if (TheVT == MVT::f16)
- // We need to promote the f16 to f32 before using the lowering
- // in this routine.
+ if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
+ // f16 must be promoted before using the lowering in this routine.
+ // fp128 does not use this lowering.
return std::make_pair(SDValue(), SDValue());
-
- assert((TheVT == MVT::f32 ||
- TheVT == MVT::f64 ||
- TheVT == MVT::f80) &&
- "Unexpected FP operand type in FP_TO_INTHelper");
+ }
// If using FIST to compute an unsigned i64, we'll need some fixup
// to handle values above the maximum signed i64. A FIST is always
// for DAG type consistency we have to match the FP operand type.
APFloat Thresh(APFloat::IEEEsingle, APInt(32, 0x5f000000));
- APFloat::opStatus Status = APFloat::opOK;
+ LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
bool LosesInfo = false;
if (TheVT == MVT::f64)
// The rounding mode is irrelevant as the conversion should be exact.
DAG.getConstant(SSECC, dl, MVT::i8));
}
+ MVT VTOp0 = Op0.getSimpleValueType();
+ assert(VTOp0 == Op1.getSimpleValueType() &&
+ "Expected operands with same type!");
+ assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
+ "Invalid number of packed elements for source and destination!");
+
+ if (VT.is128BitVector() && VTOp0.is256BitVector()) {
+ // On non-AVX512 targets, a vector of MVT::i1 is promoted by the type
+ // legalizer to a wider vector type. In the case of 'vsetcc' nodes, the
+ // legalizer firstly checks if the first operand in input to the setcc has
+ // a legal type. If so, then it promotes the return type to that same type.
+ // Otherwise, the return type is promoted to the 'next legal type' which,
+ // for a vector of MVT::i1 is always a 128-bit integer vector type.
+ //
+ // We reach this code only if the following two conditions are met:
+ // 1. Both return type and operand type have been promoted to wider types
+ // by the type legalizer.
+ // 2. The original operand type has been promoted to a 256-bit vector.
+ //
+ // Note that condition 2. only applies for AVX targets.
+ SDValue NewOp = DAG.getSetCC(dl, VTOp0, Op0, Op1, SetCCOpcode);
+ return DAG.getZExtOrTrunc(NewOp, dl, VT);
+ }
+
+ // The non-AVX512 code below works under the assumption that source and
+ // destination types are the same.
+ assert((Subtarget->hasAVX512() || (VT == VTOp0)) &&
+ "Value types for source and destination must be the same!");
+
// Break 256-bit integer vector compare into smaller ones.
if (VT.is256BitVector() && !Subtarget->hasInt256())
return Lower256IntVSETCC(Op, DAG);
DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC));
}
+ // Lower using XOP integer comparisons.
+ if ((VT == MVT::v16i8 || VT == MVT::v8i16 ||
+ VT == MVT::v4i32 || VT == MVT::v2i64) && Subtarget->hasXOP()) {
+ // Translate compare code to XOP PCOM compare mode.
+ unsigned CmpMode = 0;
+ switch (SetCCOpcode) {
+ default: llvm_unreachable("Unexpected SETCC condition");
+ case ISD::SETULT:
+ case ISD::SETLT: CmpMode = 0x00; break;
+ case ISD::SETULE:
+ case ISD::SETLE: CmpMode = 0x01; break;
+ case ISD::SETUGT:
+ case ISD::SETGT: CmpMode = 0x02; break;
+ case ISD::SETUGE:
+ case ISD::SETGE: CmpMode = 0x03; break;
+ case ISD::SETEQ: CmpMode = 0x04; break;
+ case ISD::SETNE: CmpMode = 0x05; break;
+ }
+
+ // Are we comparing unsigned or signed integers?
+ unsigned Opc = ISD::isUnsignedIntSetCC(SetCCOpcode)
+ ? X86ISD::VPCOMU : X86ISD::VPCOM;
+
+ return DAG.getNode(Opc, dl, VT, Op0, Op1,
+ DAG.getConstant(CmpMode, dl, MVT::i8));
+ }
+
// We are handling one of the integer comparisons here. Since SSE only has
// GT and EQ comparisons for integer, swapping operands and multiple
// operations may be required for some comparisons.
// Since SSE has no unsigned integer comparisons, we need to flip the sign
// bits of the inputs before performing those operations.
if (FlipSigns) {
- EVT EltVT = VT.getVectorElementType();
+ MVT EltVT = VT.getVectorElementType();
SDValue SB = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), dl,
VT);
Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SB);
return Sext;
}
- // Otherwise we'll shuffle the small elements in the high bits of the
- // larger type and perform an arithmetic shift. If the shift is not legal
- // it's better to scalarize.
- assert(TLI.isOperationLegalOrCustom(ISD::SRA, RegVT) &&
- "We can't implement a sext load without an arithmetic right shift!");
-
- // Redistribute the loaded elements into the different locations.
- SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
- for (unsigned i = 0; i != NumElems; ++i)
- ShuffleVec[i * SizeRatio + SizeRatio - 1] = i;
-
- SDValue Shuff = DAG.getVectorShuffle(
- WideVecVT, dl, SlicedVec, DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
-
- Shuff = DAG.getBitcast(RegVT, Shuff);
-
- // Build the arithmetic shift.
- unsigned Amt = RegVT.getVectorElementType().getSizeInBits() -
- MemVT.getVectorElementType().getSizeInBits();
- Shuff =
- DAG.getNode(ISD::SRA, dl, RegVT, Shuff,
- DAG.getConstant(Amt, dl, RegVT));
+ // Otherwise we'll use SIGN_EXTEND_VECTOR_INREG to sign extend the lowest
+ // lanes.
+ assert(TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND_VECTOR_INREG, RegVT) &&
+ "We can't implement a sext load without SIGN_EXTEND_VECTOR_INREG!");
+ SDValue Shuff = DAG.getSignExtendVectorInReg(SlicedVec, dl, RegVT);
DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
return Shuff;
}
case X86ISD::CMPM:
case X86ISD::CMPMU:
return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
+ case X86ISD::VFPCLASS:
+ return DAG.getNode(ISD::OR, dl, VT, Op, VMask);
case X86ISD::VTRUNC:
case X86ISD::VTRUNCS:
case X86ISD::VTRUNCUS:
SDValue PreservedSrc,
const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
- if (isAllOnes(Mask))
- return Op;
+ if (isAllOnes(Mask))
+ return Op;
- EVT VT = Op.getValueType();
- SDLoc dl(Op);
- // The mask should be of type MVT::i1
- SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask);
+ EVT VT = Op.getValueType();
+ SDLoc dl(Op);
+ // The mask should be of type MVT::i1
+ SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask);
- if (PreservedSrc.getOpcode() == ISD::UNDEF)
- PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
- return DAG.getNode(X86ISD::SELECT, dl, VT, IMask, Op, PreservedSrc);
+ if (Op.getOpcode() == X86ISD::FSETCC)
+ return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
+ if (Op.getOpcode() == X86ISD::VFPCLASS)
+ return DAG.getNode(ISD::OR, dl, VT, Op, IMask);
+
+ if (PreservedSrc.getOpcode() == ISD::UNDEF)
+ PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
+ return DAG.getNode(X86ISD::SELECT, dl, VT, IMask, Op, PreservedSrc);
}
static int getSEHRegistrationNodeSize(const Function *Fn) {
return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
Mask, PassThru, Subtarget, DAG);
}
+ case INTR_TYPE_SCALAR_MASK: {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue passThru = Op.getOperand(3);
+ SDValue Mask = Op.getOperand(4);
+ return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2),
+ Mask, passThru, Subtarget, DAG);
+ }
case INTR_TYPE_SCALAR_MASK_RM: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
RoundingMode, Sae),
Mask, Src0, Subtarget, DAG);
}
- case INTR_TYPE_2OP_MASK: {
+ case INTR_TYPE_2OP_MASK:
+ case INTR_TYPE_2OP_IMM8_MASK: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue PassThru = Op.getOperand(3);
SDValue Mask = Op.getOperand(4);
+
+ if (IntrData->Type == INTR_TYPE_2OP_IMM8_MASK)
+ Src2 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src2);
+
// We specify 2 possible opcodes for intrinsics with rounding modes.
// First, we check if the intrinsic may have non-default rounding mode,
// (IntrData->Opc1 != 0), then we check the rounding mode operand.
SDValue Src2 = Op.getOperand(2);
SDValue PassThru = Op.getOperand(3);
SDValue Mask = Op.getOperand(4);
- // We specify 2 possible modes for intrinsics, with/without rounding modes.
+ // We specify 2 possible modes for intrinsics, with/without rounding
+ // modes.
// First, we check if the intrinsic have rounding mode (6 operands),
// if not, we set rounding mode to "current".
SDValue Rnd;
SDValue Imm = Op.getOperand(3);
SDValue PassThru = Op.getOperand(4);
SDValue Mask = Op.getOperand(5);
- // We specify 2 possible modes for intrinsics, with/without rounding modes.
+ // We specify 2 possible modes for intrinsics, with/without rounding
+ // modes.
// First, we check if the intrinsic have rounding mode (7 operands),
// if not, we set rounding mode to "current".
SDValue Rnd;
Src1, Src2, Src3),
Mask, PassThru, Subtarget, DAG);
}
+ case TERLOG_OP_MASK:
+ case TERLOG_OP_MASKZ: {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue Src3 = Op.getOperand(3);
+ SDValue Src4 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(4));
+ SDValue Mask = Op.getOperand(5);
+ EVT VT = Op.getValueType();
+ SDValue PassThru = Src1;
+ // Set PassThru element.
+ if (IntrData->Type == TERLOG_OP_MASKZ)
+ PassThru = getZeroVector(VT, Subtarget, DAG, dl);
+
+ return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
+ Src1, Src2, Src3, Src4),
+ Mask, PassThru, Subtarget, DAG);
+ }
+ case FPCLASS: {
+ // FPclass intrinsics with mask
+ SDValue Src1 = Op.getOperand(1);
+ EVT VT = Src1.getValueType();
+ EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+ VT.getVectorNumElements());
+ SDValue Imm = Op.getOperand(2);
+ SDValue Mask = Op.getOperand(3);
+ EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+ Mask.getValueType().getSizeInBits());
+ SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Imm);
+ SDValue FPclassMask = getVectorMaskingNode(FPclass, Mask,
+ DAG.getTargetConstant(0, dl, MaskVT),
+ Subtarget, DAG);
+ SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
+ DAG.getUNDEF(BitcastVT), FPclassMask,
+ DAG.getIntPtrConstant(0, dl));
+ return DAG.getBitcast(Op.getValueType(), Res);
+ }
+ case FPCLASSS: {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Imm = Op.getOperand(2);
+ SDValue Mask = Op.getOperand(3);
+ SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Imm);
+ SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask,
+ DAG.getTargetConstant(0, dl, MVT::i1), Subtarget, DAG);
+ return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i8, FPclassMask);
+ }
case CMP_MASK:
case CMP_MASK_CC: {
// Comparison intrinsics with masks.
DAG.getIntPtrConstant(0, dl));
return DAG.getBitcast(Op.getValueType(), Res);
}
+ case CMP_MASK_SCALAR_CC: {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(3));
+ SDValue Mask = Op.getOperand(4);
+
+ SDValue Cmp;
+ if (IntrData->Opc1 != 0) {
+ SDValue Rnd = Op.getOperand(5);
+ if (cast<ConstantSDNode>(Rnd)->getZExtValue() !=
+ X86::STATIC_ROUNDING::CUR_DIRECTION)
+ Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::i1, Src1, Src2, CC, Rnd);
+ }
+ //default rounding mode
+ if(!Cmp.getNode())
+ Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Src2, CC);
+
+ SDValue CmpMask = getScalarMaskingNode(Cmp, Mask,
+ DAG.getTargetConstant(0, dl,
+ MVT::i1),
+ Subtarget, DAG);
+
+ return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i8,
+ DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, CmpMask),
+ DAG.getValueType(MVT::i1));
+ }
case COMI: { // Comparison intrinsics
ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
SDValue LHS = Op.getOperand(1);
ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
}
-static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) {
+/// \brief Lower a vector CTLZ using native supported vector CTLZ instruction.
+//
+// 1. i32/i64 128/256-bit vector (native support require VLX) are expended
+// to 512-bit vector.
+// 2. i8/i16 vector implemented using dword LZCNT vector instruction
+// ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
+// split the vector, perform operation on it's Lo a Hi part and
+// concatenate the results.
+static SDValue LowerVectorCTLZ_AVX512(SDValue Op, SelectionDAG &DAG) {
+ SDLoc dl(Op);
+ MVT VT = Op.getSimpleValueType();
+ MVT EltVT = VT.getVectorElementType();
+ unsigned NumElems = VT.getVectorNumElements();
+
+ if (EltVT == MVT::i64 || EltVT == MVT::i32) {
+ // Extend to 512 bit vector.
+ assert((VT.is256BitVector() || VT.is128BitVector()) &&
+ "Unsupported value type for operation");
+
+ MVT NewVT = MVT::getVectorVT(EltVT, 512 / VT.getScalarSizeInBits());
+ SDValue Vec512 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NewVT,
+ DAG.getUNDEF(NewVT),
+ Op.getOperand(0),
+ DAG.getIntPtrConstant(0, dl));
+ SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Vec512);
+
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CtlzNode,
+ DAG.getIntPtrConstant(0, dl));
+ }
+
+ assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
+ "Unsupported element type");
+
+ if (16 < NumElems) {
+ // Split vector, it's Lo and Hi parts will be handled in next iteration.
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = DAG.SplitVector(Op.getOperand(0), dl);
+ MVT OutVT = MVT::getVectorVT(EltVT, NumElems/2);
+
+ Lo = DAG.getNode(Op.getOpcode(), dl, OutVT, Lo);
+ Hi = DAG.getNode(Op.getOpcode(), dl, OutVT, Hi);
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
+ }
+
+ MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
+
+ assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
+ "Unsupported value type for operation");
+
+ // Use native supported vector instruction vplzcntd.
+ Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
+ SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
+ SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
+ SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
+
+ return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
+}
+
+static SDValue LowerCTLZ(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
EVT OpVT = VT;
unsigned NumBits = VT.getSizeInBits();
SDLoc dl(Op);
+ if (VT.isVector() && Subtarget->hasAVX512())
+ return LowerVectorCTLZ_AVX512(Op, DAG);
+
Op = Op.getOperand(0);
if (VT == MVT::i8) {
// Zero extend to i32 since there is not an i8 bsr.
return Op;
}
-static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) {
+static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
EVT OpVT = VT;
unsigned NumBits = VT.getSizeInBits();
SDLoc dl(Op);
+ if (VT.isVector() && Subtarget->hasAVX512())
+ return LowerVectorCTLZ_AVX512(Op, DAG);
+
Op = Op.getOperand(0);
if (VT == MVT::i8) {
// Zero extend to i32 since there is not an i8 bsr.
// i64 SRA needs to be performed as partial shifts.
if ((VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) &&
- Op.getOpcode() == ISD::SRA)
+ Op.getOpcode() == ISD::SRA && !Subtarget->hasXOP())
return ArithmeticShiftRight64(ShiftAmt);
if (VT == MVT::v16i8 || (Subtarget->hasInt256() && VT == MVT::v32i8)) {
unsigned NumElts = VT.getVectorNumElements();
MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
- if (Op.getOpcode() == ISD::SHL) {
- // Simple i8 add case
- if (ShiftAmt == 1)
- return DAG.getNode(ISD::ADD, dl, VT, R, R);
+ // Simple i8 add case
+ if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1)
+ return DAG.getNode(ISD::ADD, dl, VT, R, R);
+
+ // ashr(R, 7) === cmp_slt(R, 0)
+ if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
+ SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
+ return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
+ }
+ // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
+ if (VT == MVT::v16i8 && Subtarget->hasXOP())
+ return SDValue();
+
+ if (Op.getOpcode() == ISD::SHL) {
// Make a large shift.
SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT,
R, ShiftAmt, DAG);
DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
}
if (Op.getOpcode() == ISD::SRA) {
- if (ShiftAmt == 7) {
- // ashr(R, 7) === cmp_slt(R, 0)
- SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
- return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
- }
-
// ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
SmallVector<SDValue, 32> V(NumElts,
}
// Special case in 32-bit mode, where i64 is expanded into high and low parts.
- if (!Subtarget->is64Bit() &&
+ if (!Subtarget->is64Bit() && !Subtarget->hasXOP() &&
(VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64))) {
// Peek through any splat that was introduced for i64 shift vectorization.
if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode())) {
SDValue BaseShAmt;
- EVT EltVT = VT.getVectorElementType();
+ MVT EltVT = VT.getVectorElementType();
if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) {
// Check if this build_vector node is doing a splat.
return V;
if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
- return V;
+ return V;
if (SupportedVectorVarShift(VT, Subtarget, Op.getOpcode()))
return Op;
+ // XOP has 128-bit variable logical/arithmetic shifts.
+ // +ve/-ve Amt = shift left/right.
+ if (Subtarget->hasXOP() &&
+ (VT == MVT::v2i64 || VT == MVT::v4i32 ||
+ VT == MVT::v8i16 || VT == MVT::v16i8)) {
+ if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SRA) {
+ SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
+ Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
+ }
+ if (Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL)
+ return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
+ if (Op.getOpcode() == ISD::SRA)
+ return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
+ }
+
// 2i64 vector logical shifts can efficiently avoid scalarization - do the
// shifts per-lane and then shuffle the partial results back together.
if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) {
return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
}
- if (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget->hasInt256())) {
+ if (VT == MVT::v16i8 ||
+ (VT == MVT::v32i8 && Subtarget->hasInt256() && !Subtarget->hasXOP())) {
MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
unsigned ShiftOpcode = Op->getOpcode();
DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt));
}
- if (Subtarget->hasInt256() && VT == MVT::v16i16) {
+ if (Subtarget->hasInt256() && !Subtarget->hasXOP() && VT == MVT::v16i16) {
MVT ExtVT = MVT::v8i32;
SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Amt, Z);
return SDValue();
}
+static SDValue LowerRotate(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ MVT VT = Op.getSimpleValueType();
+ SDLoc DL(Op);
+ SDValue R = Op.getOperand(0);
+ SDValue Amt = Op.getOperand(1);
+
+ assert(VT.isVector() && "Custom lowering only for vector rotates!");
+ assert(Subtarget->hasXOP() && "XOP support required for vector rotates!");
+ assert((Op.getOpcode() == ISD::ROTL) && "Only ROTL supported");
+
+ // XOP has 128-bit vector variable + immediate rotates.
+ // +ve/-ve Amt = rotate left/right.
+
+ // Split 256-bit integers.
+ if (VT.is256BitVector())
+ return Lower256IntArith(Op, DAG);
+
+ assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
+
+ // Attempt to rotate by immediate.
+ if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
+ if (auto *RotateConst = BVAmt->getConstantSplatNode()) {
+ uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue();
+ assert(RotateAmt < VT.getScalarSizeInBits() && "Rotation out of range");
+ return DAG.getNode(X86ISD::VPROTI, DL, VT, R,
+ DAG.getConstant(RotateAmt, DL, MVT::i8));
+ }
+ }
+
+ // Use general rotate by variable (per-element).
+ return DAG.getNode(X86ISD::VPROT, DL, VT, R, Amt);
+}
+
static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
// Lower the "add/sub/mul with overflow" instruction into a regular ins plus
// a "setcc" instruction that checks the overflow flag. The "brcond" lowering
SDValue InVec = Op->getOperand(0);
SDLoc dl(Op);
unsigned NumElts = SrcVT.getVectorNumElements();
- EVT SVT = SrcVT.getVectorElementType();
+ MVT SVT = SrcVT.getVectorElementType();
// Widen the vector in input in the case of MVT::v2i32.
// Example: from MVT::v2i32 to MVT::v4i32.
case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
- case ISD::CTLZ: return LowerCTLZ(Op, DAG);
- case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ_ZERO_UNDEF(Op, DAG);
+ case ISD::CTLZ: return LowerCTLZ(Op, Subtarget, DAG);
+ case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ_ZERO_UNDEF(Op, Subtarget, DAG);
case ISD::CTTZ:
case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, DAG);
case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
case ISD::UMUL_LOHI:
case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG);
+ case ISD::ROTL: return LowerRotate(Op, Subtarget, DAG);
case ISD::SRA:
case ISD::SRL:
case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
case X86ISD::VPERMV3: return "X86ISD::VPERMV3";
case X86ISD::VPERMIV3: return "X86ISD::VPERMIV3";
case X86ISD::VPERMI: return "X86ISD::VPERMI";
+ case X86ISD::VPTERNLOG: return "X86ISD::VPTERNLOG";
case X86ISD::VFIXUPIMM: return "X86ISD::VFIXUPIMM";
case X86ISD::VRANGE: return "X86ISD::VRANGE";
case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ";
case X86ISD::RDSEED: return "X86ISD::RDSEED";
case X86ISD::VPMADDUBSW: return "X86ISD::VPMADDUBSW";
case X86ISD::VPMADDWD: return "X86ISD::VPMADDWD";
+ case X86ISD::VPROT: return "X86ISD::VPROT";
+ case X86ISD::VPROTI: return "X86ISD::VPROTI";
+ case X86ISD::VPSHA: return "X86ISD::VPSHA";
+ case X86ISD::VPSHL: return "X86ISD::VPSHL";
+ case X86ISD::VPCOM: return "X86ISD::VPCOM";
+ case X86ISD::VPCOMU: return "X86ISD::VPCOMU";
case X86ISD::FMADD: return "X86ISD::FMADD";
case X86ISD::FMSUB: return "X86ISD::FMSUB";
case X86ISD::FNMADD: return "X86ISD::FNMADD";
case X86ISD::UINT_TO_FP_RND: return "X86ISD::UINT_TO_FP_RND";
case X86ISD::FP_TO_SINT_RND: return "X86ISD::FP_TO_SINT_RND";
case X86ISD::FP_TO_UINT_RND: return "X86ISD::FP_TO_UINT_RND";
+ case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS";
}
return nullptr;
}
DebugLoc DL = MI->getDebugLoc();
const BasicBlock *BB = MBB->getBasicBlock();
- MachineFunction::iterator I = MBB;
- ++I;
+ MachineFunction::iterator I = ++MBB->getIterator();
// For the v = xbegin(), we generate
//
offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
- MachineFunction::iterator MBBIter = MBB;
- ++MBBIter;
+ MachineFunction::iterator MBBIter = ++MBB->getIterator();
// Insert the new basic blocks
MF->insert(MBBIter, offsetMBB);
// stores were performed.
const BasicBlock *LLVM_BB = MBB->getBasicBlock();
MachineFunction *F = MBB->getParent();
- MachineFunction::iterator MBBIter = MBB;
- ++MBBIter;
+ MachineFunction::iterator MBBIter = ++MBB->getIterator();
MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
F->insert(MBBIter, XMMSaveMBB);
// destination vreg to set, the condition code register to branch on, the
// true/false values to select between, and a branch opcode to use.
const BasicBlock *LLVM_BB = BB->getBasicBlock();
- MachineFunction::iterator It = BB;
- ++It;
+ MachineFunction::iterator It = ++BB->getIterator();
// thisMBB:
// ...
const X86InstrInfo *TII = Subtarget->getInstrInfo();
DebugLoc DL = MI->getDebugLoc();
MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
- unsigned MSrc = MI->getOperand(0).getReg();
+ MachineOperand MSrc = MI->getOperand(0);
unsigned VSrc = MI->getOperand(5).getReg();
+ const MachineOperand &Disp = MI->getOperand(3);
+ MachineOperand ZeroDisp = MachineOperand::CreateImm(0);
+ bool hasDisp = Disp.isGlobal() || Disp.isImm();
+ if (hasDisp && MSrc.isReg())
+ MSrc.setIsKill(false);
MachineInstrBuilder MIM = BuildMI(*BB, MI, DL, TII->get(MOp))
- .addReg(/*Base=*/MSrc)
+ .addOperand(/*Base=*/MSrc)
.addImm(/*Scale=*/1)
.addReg(/*Index=*/0)
- .addImm(0)
+ .addDisp(hasDisp ? Disp : ZeroDisp, /*off=*/0)
.addReg(0);
MachineInstr *MIO = BuildMI(*BB, (MachineInstr *)MIM, DL, TII->get(FOp),
MRI.createVirtualRegister(MRI.getRegClass(VSrc)))
.addReg(VSrc)
- .addReg(/*Base=*/MSrc)
+ .addOperand(/*Base=*/MSrc)
.addImm(/*Scale=*/1)
.addReg(/*Index=*/0)
- .addImm(/*Disp=*/0)
+ .addDisp(hasDisp ? Disp : ZeroDisp, /*off=*/0)
.addReg(/*Segment=*/0);
MIM.addReg(MIO->getOperand(0).getReg(), RegState::Kill);
MI->eraseFromParent(); // The pseudo instruction is gone now.
sizeVReg = MI->getOperand(1).getReg(),
physSPReg = IsLP64 || Subtarget->isTargetNaCl64() ? X86::RSP : X86::ESP;
- MachineFunction::iterator MBBIter = BB;
- ++MBBIter;
+ MachineFunction::iterator MBBIter = ++BB->getIterator();
MF->insert(MBBIter, bumpMBB);
MF->insert(MBBIter, mallocMBB);
MachineRegisterInfo &MRI = MF->getRegInfo();
const BasicBlock *BB = MBB->getBasicBlock();
- MachineFunction::iterator I = MBB;
- ++I;
+ MachineFunction::iterator I = ++MBB->getIterator();
// Memory Reference
MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
// For v = setjmp(buf), we generate
//
// thisMBB:
- // buf[LabelOffset] = restoreMBB
+ // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
// SjLjSetup restoreMBB
//
// mainMBB:
MF->insert(I, mainMBB);
MF->insert(I, sinkMBB);
MF->push_back(restoreMBB);
+ restoreMBB->setHasAddressTaken();
MachineInstrBuilder MIB;
MVT RootVT = Root.getSimpleValueType();
SDLoc DL(Root);
- // Just remove no-op shuffle masks.
if (Mask.size() == 1) {
- DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Input),
- /*AddTo*/ true);
+ int Index = Mask[0];
+ assert((Index >= 0 || Index == SM_SentinelUndef ||
+ Index == SM_SentinelZero) &&
+ "Invalid shuffle index found!");
+
+ // We may end up with an accumulated mask of size 1 as a result of
+ // widening of shuffle operands (see function canWidenShuffleElements).
+ // If the only shuffle index is equal to SM_SentinelZero then propagate
+ // a zero vector. Otherwise, the combine shuffle mask is a no-op shuffle
+ // mask, and therefore the entire chain of shuffles can be folded away.
+ if (Index == SM_SentinelZero)
+ DCI.CombineTo(Root.getNode(), getZeroVector(RootVT, Subtarget, DAG, DL));
+ else
+ DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Input),
+ /*AddTo*/ true);
return true;
}
// doesn't preclude something switching to the shorter encoding post-RA.
//
// FIXME: Should teach these routines about AVX vector widths.
- if (FloatDomain && VT.getSizeInBits() == 128) {
+ if (FloatDomain && VT.is128BitVector()) {
if (Mask.equals({0, 0}) || Mask.equals({1, 1})) {
bool Lo = Mask.equals({0, 0});
unsigned Shuffle;
// We always canonicalize the 8 x i16 and 16 x i8 shuffles into their UNPCK
// variants as none of these have single-instruction variants that are
// superior to the UNPCK formulation.
- if (!FloatDomain && VT.getSizeInBits() == 128 &&
+ if (!FloatDomain && VT.is128BitVector() &&
(Mask.equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
Mask.equals({4, 4, 5, 5, 6, 6, 7, 7}) ||
Mask.equals({0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7}) ||
// See if we can recurse into the operand to combine more things.
switch (Op.getOpcode()) {
- case X86ISD::PSHUFB:
- HasPSHUFB = true;
- case X86ISD::PSHUFD:
- case X86ISD::PSHUFHW:
- case X86ISD::PSHUFLW:
- if (Op.getOperand(0).hasOneUse() &&
- combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
- HasPSHUFB, DAG, DCI, Subtarget))
- return true;
- break;
+ case X86ISD::PSHUFB:
+ HasPSHUFB = true;
+ case X86ISD::PSHUFD:
+ case X86ISD::PSHUFHW:
+ case X86ISD::PSHUFLW:
+ if (Op.getOperand(0).hasOneUse() &&
+ combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
+ HasPSHUFB, DAG, DCI, Subtarget))
+ return true;
+ break;
- case X86ISD::UNPCKL:
- case X86ISD::UNPCKH:
- assert(Op.getOperand(0) == Op.getOperand(1) && "We only combine unary shuffles!");
- // We can't check for single use, we have to check that this shuffle is the only user.
- if (Op->isOnlyUserOf(Op.getOperand(0).getNode()) &&
- combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
- HasPSHUFB, DAG, DCI, Subtarget))
- return true;
- break;
+ case X86ISD::UNPCKL:
+ case X86ISD::UNPCKH:
+ assert(Op.getOperand(0) == Op.getOperand(1) &&
+ "We only combine unary shuffles!");
+ // We can't check for single use, we have to check that this shuffle is the
+ // only user.
+ if (Op->isOnlyUserOf(Op.getOperand(0).getNode()) &&
+ combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
+ HasPSHUFB, DAG, DCI, Subtarget))
+ return true;
+ break;
}
// Minor canonicalization of the accumulated shuffle mask to make it easier
return V;
}
-/// \brief Search for a combinable shuffle across a chain ending in pshuflw or pshufhw.
+/// \brief Search for a combinable shuffle across a chain ending in pshuflw or
+/// pshufhw.
///
/// We walk up the chain, skipping shuffles of the other half and looking
/// through shuffles which switch halves trying to find a shuffle of the same
EltNo);
}
-/// \brief Detect bitcasts between i32 to x86mmx low word. Since MMX types are
-/// special and don't usually play with other vector types, it's better to
-/// handle them early to be sure we emit efficient code by avoiding
-/// store-load conversions.
-static SDValue PerformBITCASTCombine(SDNode *N, SelectionDAG &DAG) {
- if (N->getValueType(0) != MVT::x86mmx ||
- N->getOperand(0)->getOpcode() != ISD::BUILD_VECTOR ||
- N->getOperand(0)->getValueType(0) != MVT::v2i32)
- return SDValue();
+static SDValue PerformBITCASTCombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
- SDValue V = N->getOperand(0);
- ConstantSDNode *C = dyn_cast<ConstantSDNode>(V.getOperand(1));
- if (C && C->getZExtValue() == 0 && V.getOperand(0).getValueType() == MVT::i32)
- return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(V.getOperand(0)),
- N->getValueType(0), V.getOperand(0));
+ // Detect bitcasts between i32 to x86mmx low word. Since MMX types are
+ // special and don't usually play with other vector types, it's better to
+ // handle them early to be sure we emit efficient code by avoiding
+ // store-load conversions.
+ if (VT == MVT::x86mmx && N0.getOpcode() == ISD::BUILD_VECTOR &&
+ N0.getValueType() == MVT::v2i32 &&
+ isa<ConstantSDNode>(N0.getOperand(1))) {
+ SDValue N00 = N0->getOperand(0);
+ if (N0.getConstantOperandVal(1) == 0 && N00.getValueType() == MVT::i32)
+ return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(N00), VT, N00);
+ }
+
+ // Convert a bitcasted integer logic operation that has one bitcasted
+ // floating-point operand and one constant operand into a floating-point
+ // logic operation. This may create a load of the constant, but that is
+ // cheaper than materializing the constant in an integer register and
+ // transferring it to an SSE register or transferring the SSE operand to
+ // integer register and back.
+ unsigned FPOpcode;
+ switch (N0.getOpcode()) {
+ case ISD::AND: FPOpcode = X86ISD::FAND; break;
+ case ISD::OR: FPOpcode = X86ISD::FOR; break;
+ case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
+ default: return SDValue();
+ }
+ if (((Subtarget->hasSSE1() && VT == MVT::f32) ||
+ (Subtarget->hasSSE2() && VT == MVT::f64)) &&
+ isa<ConstantSDNode>(N0.getOperand(1)) &&
+ N0.getOperand(0).getOpcode() == ISD::BITCAST &&
+ N0.getOperand(0).getOperand(0).getValueType() == VT) {
+ SDValue N000 = N0.getOperand(0).getOperand(0);
+ SDValue FPConst = DAG.getBitcast(VT, N0.getOperand(1));
+ return DAG.getNode(FPOpcode, SDLoc(N0), VT, N000, FPConst);
+ }
return SDValue();
}
InputVector.getNode()->getOperand(0));
// The mmx is indirect: (i64 extract_elt (v1i64 bitcast (x86mmx ...))).
- SDValue MMXSrcOp = MMXSrc.getOperand(0);
if (MMXSrc.getOpcode() == ISD::EXTRACT_VECTOR_ELT && MMXSrc.hasOneUse() &&
- MMXSrc.getValueType() == MVT::i64 && MMXSrcOp.hasOneUse() &&
- MMXSrcOp.getOpcode() == ISD::BITCAST &&
- MMXSrcOp.getValueType() == MVT::v1i64 &&
- MMXSrcOp.getOperand(0).getValueType() == MVT::x86mmx)
- return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector),
- N->getValueType(0),
- MMXSrcOp.getOperand(0));
+ MMXSrc.getValueType() == MVT::i64) {
+ SDValue MMXSrcOp = MMXSrc.getOperand(0);
+ if (MMXSrcOp.hasOneUse() && MMXSrcOp.getOpcode() == ISD::BITCAST &&
+ MMXSrcOp.getValueType() == MVT::v1i64 &&
+ MMXSrcOp.getOperand(0).getValueType() == MVT::x86mmx)
+ return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector),
+ N->getValueType(0), MMXSrcOp.getOperand(0));
+ }
}
EVT VT = N->getValueType(0);
InputVector.getOpcode() == ISD::BITCAST &&
dyn_cast<ConstantSDNode>(InputVector.getOperand(0))) {
uint64_t ExtractedElt =
- cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+ cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
uint64_t InputValue =
- cast<ConstantSDNode>(InputVector.getOperand(0))->getZExtValue();
+ cast<ConstantSDNode>(InputVector.getOperand(0))->getZExtValue();
uint64_t Res = (InputValue >> ExtractedElt) & 1;
return DAG.getConstant(Res, dl, MVT::i1);
}
if (VT.getScalarType() == MVT::i16)
return SDValue();
// Dynamic blending was only available from SSE4.1 onward.
- if (VT.getSizeInBits() == 128 && !Subtarget->hasSSE41())
+ if (VT.is128BitVector() && !Subtarget->hasSSE41())
return SDValue();
// Byte blends are only available in AVX2
- if (VT.getSizeInBits() == 256 && VT.getScalarType() == MVT::i8 &&
- !Subtarget->hasAVX2())
+ if (VT == MVT::v32i8 && !Subtarget->hasAVX2())
return SDValue();
assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt))
if (auto *AmtSplat = AmtBV->getConstantSplatNode()) {
APInt ShiftAmt = AmtSplat->getAPIntValue();
- unsigned MaxAmount = VT.getVectorElementType().getSizeInBits();
+ unsigned MaxAmount =
+ VT.getSimpleVT().getVectorElementType().getSizeInBits();
// SSE2/AVX2 logical shifts always return a vector of 0s
// if the shift amount is bigger than or equal to
return DAG.getBitcast(N0.getValueType(), NewShuffle);
}
+/// If both input operands of a logic op are being cast from floating point
+/// types, try to convert this into a floating point logic node to avoid
+/// unnecessary moves from SSE to integer registers.
+static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ unsigned FPOpcode = ISD::DELETED_NODE;
+ if (N->getOpcode() == ISD::AND)
+ FPOpcode = X86ISD::FAND;
+ else if (N->getOpcode() == ISD::OR)
+ FPOpcode = X86ISD::FOR;
+ else if (N->getOpcode() == ISD::XOR)
+ FPOpcode = X86ISD::FXOR;
+
+ assert(FPOpcode != ISD::DELETED_NODE &&
+ "Unexpected input node for FP logic conversion");
+
+ EVT VT = N->getValueType(0);
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDLoc DL(N);
+ if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
+ ((Subtarget->hasSSE1() && VT == MVT::i32) ||
+ (Subtarget->hasSSE2() && VT == MVT::i64))) {
+ SDValue N00 = N0.getOperand(0);
+ SDValue N10 = N1.getOperand(0);
+ EVT N00Type = N00.getValueType();
+ EVT N10Type = N10.getValueType();
+ if (N00Type.isFloatingPoint() && N10Type.isFloatingPoint()) {
+ SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
+ return DAG.getBitcast(VT, FPLogic);
+ }
+ }
+ return SDValue();
+}
+
static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget *Subtarget) {
if (SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget))
return R;
+ if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
+ return FPLogic;
+
EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
if (SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget))
return R;
+ if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
+ return FPLogic;
+
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N->getValueType(0);
if (SDValue RV = performIntegerAbsCombine(N, DAG))
return RV;
+ if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
+ return FPLogic;
+
return SDValue();
}
ShuffleVec[i] = i * SizeRatio;
// Can't shuffle using an illegal type.
- assert (DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT)
- && "WideVecVT should be legal");
+ assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
+ "WideVecVT should be legal");
WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0,
DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
}
ISD::NON_EXTLOAD);
SDValue NewVec = DAG.getNode(X86ISD::VSEXT, dl, VT, WideLd);
return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
-
}
/// PerformMSTORECombine - Resolve truncating stores
static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG,
ShuffleVec[i] = i * SizeRatio;
// Can't shuffle using an illegal type.
- assert (DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT)
- && "WideVecVT should be legal");
+ assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
+ "WideVecVT should be legal");
SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
DAG.getUNDEF(WideVecVT),
return SDValue();
}
+/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
+/// Promoting a sign extension ahead of an 'add nsw' exposes opportunities
+/// to combine math ops, use an LEA, or use a complex addressing mode. This can
+/// eliminate extend, add, and shift instructions.
+static SDValue promoteSextBeforeAddNSW(SDNode *Sext, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ // TODO: This should be valid for other integer types.
+ EVT VT = Sext->getValueType(0);
+ if (VT != MVT::i64)
+ return SDValue();
+
+ // We need an 'add nsw' feeding into the 'sext'.
+ SDValue Add = Sext->getOperand(0);
+ if (Add.getOpcode() != ISD::ADD || !Add->getFlags()->hasNoSignedWrap())
+ return SDValue();
+
+ // Having a constant operand to the 'add' ensures that we are not increasing
+ // the instruction count because the constant is extended for free below.
+ // A constant operand can also become the displacement field of an LEA.
+ auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
+ if (!AddOp1)
+ return SDValue();
+
+ // Don't make the 'add' bigger if there's no hope of combining it with some
+ // other 'add' or 'shl' instruction.
+ // TODO: It may be profitable to generate simpler LEA instructions in place
+ // of single 'add' instructions, but the cost model for selecting an LEA
+ // currently has a high threshold.
+ bool HasLEAPotential = false;
+ for (auto *User : Sext->uses()) {
+ if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
+ HasLEAPotential = true;
+ break;
+ }
+ }
+ if (!HasLEAPotential)
+ return SDValue();
+
+ // Everything looks good, so pull the 'sext' ahead of the 'add'.
+ int64_t AddConstant = AddOp1->getSExtValue();
+ SDValue AddOp0 = Add.getOperand(0);
+ SDValue NewSext = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(Sext), VT, AddOp0);
+ SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);
+
+ // The wider add is guaranteed to not wrap because both operands are
+ // sign-extended.
+ SDNodeFlags Flags;
+ Flags.setNoSignedWrap(true);
+ return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewSext, NewConstant, &Flags);
+}
+
static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget *Subtarget) {
}
}
- if (!Subtarget->hasFp256())
- return SDValue();
-
- if (VT.isVector() && VT.getSizeInBits() == 256)
+ if (Subtarget->hasAVX() && VT.is256BitVector())
if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
return R;
+ if (SDValue NewAdd = promoteSextBeforeAddNSW(N, DAG, Subtarget))
+ return NewAdd;
+
return SDValue();
}
case ISD::SELECT:
case X86ISD::SHRUNKBLEND:
return PerformSELECTCombine(N, DAG, DCI, Subtarget);
- case ISD::BITCAST: return PerformBITCASTCombine(N, DAG);
+ case ISD::BITCAST: return PerformBITCASTCombine(N, DAG, Subtarget);
case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI, Subtarget);
case ISD::ADD: return PerformAddCombine(N, DAG, Subtarget);
case ISD::SUB: return PerformSubCombine(N, DAG, Subtarget);
Attribute::MinSize);
return OptSize && !VT.isVector();
}
+
+void X86TargetLowering::markInRegArguments(SelectionDAG &DAG,
+ TargetLowering::ArgListTy& Args) const {
+ // The MCU psABI requires some arguments to be passed in-register.
+ // For regular calls, the inreg arguments are marked by the front-end.
+ // However, for compiler generated library calls, we have to patch this
+ // up here.
+ if (!Subtarget->isTargetMCU() || !Args.size())
+ return;
+
+ unsigned FreeRegs = 3;
+ for (auto &Arg : Args) {
+ // For library functions, we do not expect any fancy types.
+ unsigned Size = DAG.getDataLayout().getTypeSizeInBits(Arg.Ty);
+ unsigned SizeInRegs = (Size + 31) / 32;
+ if (SizeInRegs > 2 || SizeInRegs > FreeRegs)
+ continue;
+
+ Arg.isInReg = true;
+ FreeRegs -= SizeInRegs;
+ if (!FreeRegs)
+ break;
+ }
+}