setOperationAction(ISD::BR_CC , MVT::f32, Expand);
setOperationAction(ISD::BR_CC , MVT::f64, Expand);
setOperationAction(ISD::BR_CC , MVT::f80, Expand);
+ setOperationAction(ISD::BR_CC , MVT::f128, Expand);
setOperationAction(ISD::BR_CC , MVT::i8, Expand);
setOperationAction(ISD::BR_CC , MVT::i16, Expand);
setOperationAction(ISD::BR_CC , MVT::i32, Expand);
setOperationAction(ISD::SELECT_CC , MVT::f32, Expand);
setOperationAction(ISD::SELECT_CC , MVT::f64, Expand);
setOperationAction(ISD::SELECT_CC , MVT::f80, Expand);
+ setOperationAction(ISD::SELECT_CC , MVT::f128, Expand);
setOperationAction(ISD::SELECT_CC , MVT::i8, Expand);
setOperationAction(ISD::SELECT_CC , MVT::i16, Expand);
setOperationAction(ISD::SELECT_CC , MVT::i32, Expand);
setOperationAction(ISD::SELECT , MVT::f32 , Custom);
setOperationAction(ISD::SELECT , MVT::f64 , Custom);
setOperationAction(ISD::SELECT , MVT::f80 , Custom);
+ setOperationAction(ISD::SELECT , MVT::f128 , Custom);
setOperationAction(ISD::SETCC , MVT::i8 , Custom);
setOperationAction(ISD::SETCC , MVT::i16 , Custom);
setOperationAction(ISD::SETCC , MVT::i32 , Custom);
setOperationAction(ISD::SETCC , MVT::f32 , Custom);
setOperationAction(ISD::SETCC , MVT::f64 , Custom);
setOperationAction(ISD::SETCC , MVT::f80 , Custom);
+ setOperationAction(ISD::SETCC , MVT::f128 , Custom);
setOperationAction(ISD::SETCCE , MVT::i8 , Custom);
setOperationAction(ISD::SETCCE , MVT::i16 , Custom);
setOperationAction(ISD::SETCCE , MVT::i32 , Custom);
setOperationAction(ISD::FMA, MVT::f64, Expand);
setOperationAction(ISD::FMA, MVT::f32, Expand);
- // Long double always uses X87.
+ // Long double always uses X87, except f128 in MMX.
if (!Subtarget->useSoftFloat()) {
+ if (Subtarget->is64Bit() && Subtarget->hasMMX()) {
+ addRegisterClass(MVT::f128, &X86::FR128RegClass);
+ ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat);
+ setOperationAction(ISD::FABS , MVT::f128, Custom);
+ setOperationAction(ISD::FNEG , MVT::f128, Custom);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
+ }
+
addRegisterClass(MVT::f80, &X86::RFP80RegClass);
setOperationAction(ISD::UNDEF, MVT::f80, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
setOperationAction(ISD::FDIV, MVT::v16f32, Legal);
setOperationAction(ISD::FSQRT, MVT::v16f32, Legal);
setOperationAction(ISD::FNEG, MVT::v16f32, Custom);
+ setOperationAction(ISD::FABS, MVT::v16f32, Custom);
setOperationAction(ISD::FADD, MVT::v8f64, Legal);
setOperationAction(ISD::FSUB, MVT::v8f64, Legal);
setOperationAction(ISD::FDIV, MVT::v8f64, Legal);
setOperationAction(ISD::FSQRT, MVT::v8f64, Legal);
setOperationAction(ISD::FNEG, MVT::v8f64, Custom);
+ setOperationAction(ISD::FABS, MVT::v8f64, Custom);
setOperationAction(ISD::FMA, MVT::v8f64, Legal);
setOperationAction(ISD::FMA, MVT::v16f32, Legal);
setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
+ } else {
+ setOperationAction(ISD::MLOAD, MVT::v8i32, Custom);
+ setOperationAction(ISD::MLOAD, MVT::v8f32, Custom);
+ setOperationAction(ISD::MSTORE, MVT::v8i32, Custom);
+ setOperationAction(ISD::MSTORE, MVT::v8f32, Custom);
}
setOperationAction(ISD::TRUNCATE, MVT::i1, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i1, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i1, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v8i1, Custom);
setOperationAction(ISD::OR, VT, Legal);
setOperationAction(ISD::XOR, VT, Legal);
}
- if (EltSize >= 32 && VT.getSizeInBits() <= 512) {
+ if ((VT.is128BitVector() || VT.is256BitVector()) && EltSize >= 32) {
setOperationAction(ISD::MGATHER, VT, Custom);
setOperationAction(ISD::MSCATTER, VT, Custom);
}
setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
setOperationAction(ISD::MLOAD, VT, Legal);
setOperationAction(ISD::MSTORE, VT, Legal);
+ setOperationAction(ISD::MGATHER, VT, Legal);
+ setOperationAction(ISD::MSCATTER, VT, Custom);
}
}
for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) {
for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Legal);
+ setOperationAction(ISD::SRL, VT, Custom);
+ setOperationAction(ISD::SHL, VT, Custom);
+ setOperationAction(ISD::SRA, VT, Custom);
+
+ setOperationAction(ISD::AND, VT, Promote);
+ AddPromotedToType (ISD::AND, VT, MVT::v8i64);
+ setOperationAction(ISD::OR, VT, Promote);
+ AddPromotedToType (ISD::OR, VT, MVT::v8i64);
+ setOperationAction(ISD::XOR, VT, Promote);
+ AddPromotedToType (ISD::XOR, VT, MVT::v8i64);
}
}
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
- if (!Subtarget->is64Bit())
+ if (!Subtarget->is64Bit()) {
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
+ }
// Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
// handle type legalization for these operations here.
setTargetDAGCombine(ISD::FSUB);
setTargetDAGCombine(ISD::FNEG);
setTargetDAGCombine(ISD::FMA);
+ setTargetDAGCombine(ISD::FMAXNUM);
setTargetDAGCombine(ISD::SUB);
setTargetDAGCombine(ISD::LOAD);
setTargetDAGCombine(ISD::MLOAD);
setTargetDAGCombine(ISD::BUILD_VECTOR);
setTargetDAGCombine(ISD::MUL);
setTargetDAGCombine(ISD::XOR);
+ setTargetDAGCombine(ISD::MSCATTER);
+ setTargetDAGCombine(ISD::MGATHER);
computeRegisterProperties(Subtarget->getRegisterInfo());
MachineFunction &MF = DAG.getMachineFunction();
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+ if (CallConv == CallingConv::X86_INTR && !Outs.empty())
+ report_fatal_error("X86 interrupts may not return any value");
+
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
CCInfo.AnalyzeReturn(Outs, RetCC_X86);
if (Flag.getNode())
RetOps.push_back(Flag);
- return DAG.getNode(X86ISD::RET_FLAG, dl, MVT::Other, RetOps);
+ X86ISD::NodeType opcode = X86ISD::RET_FLAG;
+ if (CallConv == CallingConv::X86_INTR)
+ opcode = X86ISD::IRET;
+ return DAG.getNode(opcode, dl, MVT::Other, RetOps);
}
bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
EVT CopyVT = VA.getLocVT();
// If this is x86-64, and we disabled SSE, we can't return FP values
- if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
+ if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) &&
((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
report_fatal_error("SSE register return with SSE disabled");
}
else
ValVT = VA.getValVT();
+ // Calculate the SP offset of an interrupt parameter, re-arranging the
+ // slot normally taken by the return address.
+ int Offset = 0;
+ if (CallConv == CallingConv::X86_INTR) {
+ const X86Subtarget& Subtarget =
+ static_cast<const X86Subtarget&>(DAG.getSubtarget());
+ // X86 interrupts may take one or two arguments.
+ // On the stack there will be no return address as in a regular call.
+ // The offset of the last argument needs to be set to -4/-8 bytes.
+ // The offset of the first argument (when there are two) should be 0 bytes.
+ Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1);
+ }
+
// FIXME: For now, all byval parameter objects are marked mutable. This can be
// changed with more analysis.
// In case of tail call optimization mark all arguments mutable. Since they
unsigned Bytes = Flags.getByValSize();
if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
+ // Adjust SP offset of interrupt parameter.
+ if (CallConv == CallingConv::X86_INTR) {
+ MFI->setObjectOffset(FI, Offset);
+ }
return DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
} else {
int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
VA.getLocMemOffset(), isImmutable);
+ // Adjust SP offset of interrupt parameter.
+ if (CallConv == CallingConv::X86_INTR) {
+ MFI->setObjectOffset(FI, Offset);
+ }
+
SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
SDValue Val = DAG.getLoad(
ValVT, dl, Chain, FIN,
assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
"Var args not supported with calling convention fastcc, ghc or hipe");
+ if (CallConv == CallingConv::X86_INTR) {
+ bool isLegal = Ins.size() == 1 ||
+ (Ins.size() == 2 && ((Is64Bit && Ins[1].VT == MVT::i64) ||
+ (!Is64Bit && Ins[1].VT == MVT::i32)));
+ if (!isLegal)
+ report_fatal_error("X86 interrupts may take one or two arguments");
+ }
+
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
RC = &X86::FR32RegClass;
else if (RegVT == MVT::f64)
RC = &X86::FR64RegClass;
+ else if (RegVT == MVT::f128)
+ RC = &X86::FR128RegClass;
else if (RegVT.is512BitVector())
RC = &X86::VR512RegClass;
else if (RegVT.is256BitVector())
if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
MF.getTarget().Options.GuaranteedTailCallOpt)) {
FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
+ } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
+ // X86 interrupts must pop the error code if present
+ FuncInfo->setBytesToPopOnReturn(Is64Bit ? 8 : 4);
} else {
FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
// If this is an sret function, the return should pop the hidden pointer.
X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls");
+ if (CallConv == CallingConv::X86_INTR)
+ report_fatal_error("X86 interrupts may not be called directly");
+
if (Attr.getValueAsString() == "true")
isTailCall = false;
if (MaskNode->getOpcode() == X86ISD::VBROADCAST) {
unsigned NumEltsInMask = MaskNode->getNumOperands();
MaskNode = MaskNode->getOperand(0);
- auto *CN = dyn_cast<ConstantSDNode>(MaskNode);
- if (CN) {
+ if (auto *CN = dyn_cast<ConstantSDNode>(MaskNode)) {
APInt MaskEltValue = CN->getAPIntValue();
for (unsigned i = 0; i < NumEltsInMask; ++i)
RawMask.push_back(MaskEltValue.getLoBits(MaskLoBits).getZExtValue());
if (!MaskCP || MaskCP->isMachineConstantPoolEntry())
return false;
- auto *C = dyn_cast<Constant>(MaskCP->getConstVal());
- if (C) {
+ if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) {
DecodeVPERMVMask(C, VT, Mask);
if (Mask.empty())
return false;
if (!MaskCP || MaskCP->isMachineConstantPoolEntry())
return false;
- auto *C = dyn_cast<Constant>(MaskCP->getConstVal());
- if (C) {
+ if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) {
DecodeVPERMV3Mask(C, VT, Mask);
if (Mask.empty())
return false;
/// For convenience, this code also bundles all of the subtarget feature set
/// filtering. While a little annoying to re-dispatch on type here, there isn't
/// a convenient way to factor it out.
+/// FIXME: This is very similar to LowerVectorBroadcast - can we merge them?
static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V,
ArrayRef<int> Mask,
const X86Subtarget *Subtarget,
// Only AVX2 has register broadcasts.
if (!Subtarget->hasAVX2() && !isShuffleFoldableLoad(V))
return SDValue();
+ } else if (MayFoldLoad(V) && !cast<LoadSDNode>(V)->isVolatile()) {
+ // If we are broadcasting a load that is only used by the shuffle
+ // then we can reduce the vector load to the broadcasted scalar load.
+ LoadSDNode *Ld = cast<LoadSDNode>(V);
+ SDValue BaseAddr = Ld->getOperand(1);
+ EVT AddrVT = BaseAddr.getValueType();
+ EVT SVT = VT.getScalarType();
+ unsigned Offset = BroadcastIdx * SVT.getStoreSize();
+ SDValue NewAddr = DAG.getNode(
+ ISD::ADD, DL, AddrVT, BaseAddr,
+ DAG.getConstant(Offset, DL, AddrVT));
+ V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
+ DAG.getMachineFunction().getMachineMemOperand(
+ Ld->getMemOperand(), Offset, SVT.getStoreSize()));
} else if (BroadcastIdx != 0 || !Subtarget->hasAVX2()) {
// We can't broadcast from a vector register without AVX2, and we can only
// broadcast from the zero-element of a vector register.
return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
}
+/// Lower shuffles where an entire half of a 256-bit vector is UNDEF.
+/// This allows for fast cases such as subvector extraction/insertion
+/// or shuffling smaller vector types which can lower more efficiently.
+static SDValue lowerVectorShuffleWithUndefHalf(SDLoc DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ assert(VT.getSizeInBits() == 256 && "Expected 256-bit vector");
+
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned HalfNumElts = NumElts / 2;
+ MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts);
+
+ bool UndefLower = isUndefInRange(Mask, 0, HalfNumElts);
+ bool UndefUpper = isUndefInRange(Mask, HalfNumElts, HalfNumElts);
+ // Nothing to do unless at least one half of the mask is entirely undef.
+ if (!UndefLower && !UndefUpper)
+ return SDValue();
+
+ // Upper half is undef and lower half is whole upper subvector.
+ // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
+ if (UndefUpper &&
+ isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
+ DAG.getIntPtrConstant(HalfNumElts, DL));
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
+ DAG.getIntPtrConstant(0, DL));
+ }
+
+ // Lower half is undef and upper half is whole lower subvector.
+ // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
+ if (UndefLower &&
+ isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
+ DAG.getIntPtrConstant(0, DL));
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
+ DAG.getIntPtrConstant(HalfNumElts, DL));
+ }
+
+ // AVX2 supports efficient immediate 64-bit element cross-lane shuffles.
+ // Returning SDValue() here defers those types to the regular lowering.
+ if (UndefLower && Subtarget->hasAVX2() &&
+ (VT == MVT::v4f64 || VT == MVT::v4i64))
+ return SDValue();
+
+ // If the shuffle only uses the lower halves of the input operands,
+ // then extract them and perform the 'half' shuffle at half width.
+ // e.g. vector_shuffle <X, X, X, X, u, u, u, u> or <X, X, u, u>
+ int HalfIdx1 = -1, HalfIdx2 = -1;
+ SmallVector<int, 8> HalfMask;
+ // Offset into the mask of the half that holds the defined elements.
+ unsigned Offset = UndefLower ? HalfNumElts : 0;
+ for (unsigned i = 0; i != HalfNumElts; ++i) {
+ int M = Mask[i + Offset];
+ if (M < 0) {
+ HalfMask.push_back(M);
+ continue;
+ }
+
+ // Determine which of the 4 half vectors this element is from.
+ // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
+ int HalfIdx = M / HalfNumElts;
+
+ // Only shuffle using the lower halves of the inputs.
+ // TODO: Investigate usefulness of shuffling with upper halves.
+ if (HalfIdx != 0 && HalfIdx != 2)
+ return SDValue();
+
+ // Determine the element index into its half vector source.
+ int HalfElt = M % HalfNumElts;
+
+ // We can shuffle with up to 2 half vectors, set the new 'half'
+ // shuffle mask accordingly.
+ if (-1 == HalfIdx1 || HalfIdx1 == HalfIdx) {
+ HalfMask.push_back(HalfElt);
+ HalfIdx1 = HalfIdx;
+ continue;
+ }
+ if (-1 == HalfIdx2 || HalfIdx2 == HalfIdx) {
+ HalfMask.push_back(HalfElt + HalfNumElts);
+ HalfIdx2 = HalfIdx;
+ continue;
+ }
+
+ // Too many half vectors referenced.
+ return SDValue();
+ }
+ assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
+
+ // Map a half-index (0..3, or -1 for undef) to its extracted half vector.
+ auto GetHalfVector = [&](int HalfIdx) {
+ if (HalfIdx < 0)
+ return DAG.getUNDEF(HalfVT);
+ SDValue V = (HalfIdx < 2 ? V1 : V2);
+ HalfIdx = (HalfIdx % 2) * HalfNumElts;
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
+ DAG.getIntPtrConstant(HalfIdx, DL));
+ };
+
+ // Shuffle the extracted halves at half width, then widen the result back
+ // to the original type with the other half left undef.
+ SDValue Half1 = GetHalfVector(HalfIdx1);
+ SDValue Half2 = GetHalfVector(HalfIdx2);
+ SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
+ DAG.getIntPtrConstant(Offset, DL));
+}
+
/// \brief Test whether the specified input (0 or 1) is in-place blended by the
/// given mask.
///
DL, VT, V1, V2, Mask, Subtarget, DAG))
return Insertion;
+ // Handle special cases where the lower or upper half is UNDEF.
+ if (SDValue V =
+ lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
+ return V;
+
// There is a really nice hard cut-over between AVX1 and AVX2 that means we
// can check for those subtargets here and avoid much of the subtarget
// querying in the per-vector-type lowering routines. With AVX1 we have
X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
+
+ // Cygwin uses emutls.
+ // FIXME: It may be EmulatedTLS-generic also for X86-Android.
+ if (Subtarget->isTargetWindowsCygwin())
+ return LowerToTLSEmulatedModel(GA, DAG);
+
const GlobalValue *GV = GA->getGlobal();
auto PtrVT = getPointerTy(DAG.getDataLayout());
&& InVT.getScalarSizeInBits() >= 32 &&
Subtarget->hasDQI() && Subtarget->hasVLX())
return Op; // legal, will go to VPMOVB2M, VPMOVQ2M
- }
+ }
if (VT.getVectorElementType() == MVT::i1) {
assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
+ bool IsF128 = (VT == MVT::f128);
+
// FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
// decide if we should generate a 16-byte constant mask when we only need 4 or
// 8 bytes for the scalar case.
LogicVT = VT;
EltVT = VT.getVectorElementType();
NumElts = VT.getVectorNumElements();
+ } else if (IsF128) {
+ // SSE instructions are used for optimized f128 logical operations.
+ LogicVT = MVT::f128;
+ EltVT = VT;
+ NumElts = 1;
} else {
// There are no scalar bitwise logical SSE/AVX instructions, so we
// generate a 16-byte vector constant and logic op even for the scalar case.
IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
- if (VT.isVector())
+ if (VT.isVector() || IsF128)
return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
// For the scalar case extend to a 128-bit vector, perform the logic op,
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
MVT SrcVT = Op1.getSimpleValueType();
+ bool IsF128 = (VT == MVT::f128);
// If second operand is smaller, extend it first.
if (SrcVT.bitsLT(VT)) {
// At this point the operands and the result should have the same
// type, and that won't be f80 since that is not custom lowered.
+ assert((VT == MVT::f64 || VT == MVT::f32 || IsF128) &&
+ "Unexpected type in LowerFCOPYSIGN");
const fltSemantics &Sem =
- VT == MVT::f64 ? APFloat::IEEEdouble : APFloat::IEEEsingle;
+ VT == MVT::f64 ? APFloat::IEEEdouble :
+ (IsF128 ? APFloat::IEEEquad : APFloat::IEEEsingle);
const unsigned SizeInBits = VT.getSizeInBits();
SmallVector<Constant *, 4> CV(
- VT == MVT::f64 ? 2 : 4,
+ VT == MVT::f64 ? 2 : (IsF128 ? 1 : 4),
ConstantFP::get(*Context, APFloat(Sem, APInt(SizeInBits, 0))));
// First, clear all bits but the sign bit from the second operand (sign).
// Perform all logic operations as 16-byte vectors because there are no
// scalar FP logic instructions in SSE. This allows load folding of the
// constants into the logic instructions.
- MVT LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
+ MVT LogicVT = (VT == MVT::f64) ? MVT::v2f64 : (IsF128 ? MVT::f128 : MVT::v4f32);
SDValue Mask1 =
DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
false, false, false, 16);
- Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op1);
+ if (!IsF128)
+ Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op1);
SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Op1, Mask1);
// Next, clear the sign bit from the first operand (magnitude).
APFloat APF = Op0CN->getValueAPF();
// If the magnitude is a positive zero, the sign bit alone is enough.
if (APF.isPosZero())
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, SignBit,
- DAG.getIntPtrConstant(0, dl));
+ return IsF128 ? SignBit :
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, SignBit,
+ DAG.getIntPtrConstant(0, dl));
APF.clearSign();
CV[0] = ConstantFP::get(*Context, APF);
} else {
false, false, false, 16);
// If the magnitude operand wasn't a constant, we need to AND out the sign.
if (!isa<ConstantFPSDNode>(Op0)) {
- Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op0);
+ if (!IsF128)
+ Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op0);
Val = DAG.getNode(X86ISD::FAND, dl, LogicVT, Op0, Val);
}
// OR the magnitude value with the sign bit.
Val = DAG.getNode(X86ISD::FOR, dl, LogicVT, Val, SignBit);
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, Val,
- DAG.getIntPtrConstant(0, dl));
+ return IsF128 ? Val :
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, Val,
+ DAG.getIntPtrConstant(0, dl));
}
static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
DAG.getConstant(8, dl, MVT::i8));
SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
+
+ // Some 64-bit targets lack SAHF support, but they do support FCOMI.
+ assert(Subtarget->hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?");
return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
}
return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
}
+/// \brief Return Mask with the necessary casting or extending
+/// for \p Mask according to \p MaskVT when lowering masking intrinsics
+static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG, SDLoc dl) {
+
+ if (MaskVT.bitsGT(Mask.getSimpleValueType())) {
+ // The scalar mask is narrower than MaskVT; widen it first.
+ Mask = DAG.getNode(ISD::ANY_EXTEND, dl,
+ MVT::getIntegerVT(MaskVT.getSizeInBits()), Mask);
+ }
+
+ if (Mask.getSimpleValueType() == MVT::i64 && Subtarget->is32Bit()) {
+ if (MaskVT == MVT::v64i1) {
+ assert(Subtarget->hasBWI() && "Expected AVX512BW target!");
+ // In 32-bit mode a bitcast of i64 is illegal; split the mask into two
+ // i32 halves, bitcast each to v32i1 and concatenate.
+ SDValue Lo, Hi;
+ Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
+ DAG.getConstant(0, dl, MVT::i32));
+ Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
+ DAG.getConstant(1, dl, MVT::i32));
+
+ Lo = DAG.getBitcast(MVT::v32i1, Lo);
+ Hi = DAG.getBitcast(MVT::v32i1, Hi);
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
+ } else {
+ // MaskVT requires fewer than 64 bits. Truncate the mask (should
+ // succeed in any case), and bitcast.
+ MVT TruncVT = MVT::getIntegerVT(MaskVT.getSizeInBits());
+ return DAG.getBitcast(MaskVT,
+ DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Mask));
+ }
+
+ } else {
+ MVT BitcastVT = MVT::getVectorVT(MVT::i1,
+ Mask.getSimpleValueType().getSizeInBits());
+ // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements
+ // are extracted by EXTRACT_SUBVECTOR.
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
+ DAG.getBitcast(BitcastVT, Mask),
+ DAG.getIntPtrConstant(0, dl));
+ }
+}
+
/// \brief Return (and \p Op, \p Mask) for compare instructions or
/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
/// necessary casting or extending for \p Mask when lowering masking intrinsics
static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
- SDValue PreservedSrc,
- const X86Subtarget *Subtarget,
- SelectionDAG &DAG) {
- MVT VT = Op.getSimpleValueType();
- MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
- SDValue VMask;
- unsigned OpcodeSelect = ISD::VSELECT;
- SDLoc dl(Op);
+ SDValue PreservedSrc,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ MVT VT = Op.getSimpleValueType();
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
+ unsigned OpcodeSelect = ISD::VSELECT;
+ SDLoc dl(Op);
- if (isAllOnesConstant(Mask))
- return Op;
+ // An all-ones mask selects every element of Op; no masking is needed.
+ if (isAllOnesConstant(Mask))
+ return Op;
- if (MaskVT.bitsGT(Mask.getSimpleValueType())) {
- MVT newMaskVT = MVT::getIntegerVT(MaskVT.getSizeInBits());
- VMask = DAG.getBitcast(MaskVT,
- DAG.getNode(ISD::ANY_EXTEND, dl, newMaskVT, Mask));
- } else {
- MVT BitcastVT = MVT::getVectorVT(MVT::i1,
- Mask.getSimpleValueType().getSizeInBits());
- // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements
- // are extracted by EXTRACT_SUBVECTOR.
- VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
- DAG.getBitcast(BitcastVT, Mask),
- DAG.getIntPtrConstant(0, dl))
- }
+ // Legalize the scalar mask to the v<N>i1 type via the shared helper.
+ SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
- switch (Op.getOpcode()) {
- default: break;
- case X86ISD::PCMPEQM:
- case X86ISD::PCMPGTM:
- case X86ISD::CMPM:
- case X86ISD::CMPMU:
- return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
- case X86ISD::VFPCLASS:
+ switch (Op.getOpcode()) {
+ default: break;
+ case X86ISD::PCMPEQM:
+ case X86ISD::PCMPGTM:
+ case X86ISD::CMPM:
+ case X86ISD::CMPMU:
+ return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
+ case X86ISD::VFPCLASS:
case X86ISD::VFPCLASSS:
- return DAG.getNode(ISD::OR, dl, VT, Op, VMask);
- case X86ISD::VTRUNC:
- case X86ISD::VTRUNCS:
- case X86ISD::VTRUNCUS:
- // We can't use ISD::VSELECT here because it is not always "Legal"
- // for the destination type. For example vpmovqb require only AVX512
- // and vselect that can operate on byte element type require BWI
- OpcodeSelect = X86ISD::SELECT;
- break;
- }
- if (PreservedSrc.getOpcode() == ISD::UNDEF)
- PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
- return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
+ return DAG.getNode(ISD::OR, dl, VT, Op, VMask);
+ case X86ISD::VTRUNC:
+ case X86ISD::VTRUNCS:
+ case X86ISD::VTRUNCUS:
+ // We can't use ISD::VSELECT here because it is not always "Legal"
+ // for the destination type. For example, vpmovqb requires only AVX512
+ // but a vselect that operates on byte elements requires BWI.
+ OpcodeSelect = X86ISD::SELECT;
+ break;
+ }
+ if (PreservedSrc.getOpcode() == ISD::UNDEF)
+ PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
+ return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
}
/// \brief Creates an SDNode for a predicated scalar operation.
case EHPersonality::MSVC_CXX: return 16;
default: break;
}
- report_fatal_error("can only recover FP for MSVC EH personality functions");
+ report_fatal_error(
+ "can only recover FP for 32-bit MSVC EH personality functions");
}
-/// When the 32-bit MSVC runtime transfers control to us, either to an outlined
+/// When the MSVC runtime transfers control to us, either to an outlined
/// function or when returning to a parent frame after catching an exception, we
/// recover the parent frame pointer by doing arithmetic on the incoming EBP.
/// Here's the math:
/// RegNodeBase = EntryEBP - RegNodeSize
-/// ParentFP = RegNodeBase - RegNodeFrameOffset
+/// ParentFP = RegNodeBase - ParentFrameOffset
/// Subtracting RegNodeSize takes us to the offset of the registration node, and
/// subtracting the offset (negative on x86) takes us back to the parent FP.
static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
if (!Fn->hasPersonalityFn())
return EntryEBP;
- int RegNodeSize = getSEHRegistrationNodeSize(Fn);
-
// Get an MCSymbol that will ultimately resolve to the frame offset of the EH
- // registration.
+ // registration, or the .set_setframe offset.
MCSymbol *OffsetSym =
MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
GlobalValue::getRealLinkageName(Fn->getName()));
SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
- SDValue RegNodeFrameOffset =
+ SDValue ParentFrameOffset =
DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
+ // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
+ // prologue to RBP in the parent function.
+ const X86Subtarget &Subtarget =
+ static_cast<const X86Subtarget &>(DAG.getSubtarget());
+ if (Subtarget.is64Bit())
+ return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
+
+ int RegNodeSize = getSEHRegistrationNodeSize(Fn);
// RegNodeBase = EntryEBP - RegNodeSize
- // ParentFP = RegNodeBase - RegNodeFrameOffset
+ // ParentFP = RegNodeBase - ParentFrameOffset
SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
DAG.getConstant(RegNodeSize, dl, PtrVT));
- return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, RegNodeFrameOffset);
+ return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
}
static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
SDValue Mask = Op.getOperand(3);
MVT VT = Op.getSimpleValueType();
MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
- MVT BitcastVT = MVT::getVectorVT(MVT::i1,
- Mask.getSimpleValueType().getSizeInBits());
- SDLoc dl(Op);
- SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
- DAG.getBitcast(BitcastVT, Mask),
- DAG.getIntPtrConstant(0, dl));
+ SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
return DAG.getNode(IntrData->Opc0, dl, VT, VMask, Op.getOperand(1),
Op.getOperand(2));
}
+ case KUNPCK: {
+ MVT VT = Op.getSimpleValueType();
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()/2);
+
+ SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
+ SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
+ // Arguments should be swapped.
+ SDValue Res = DAG.getNode(IntrData->Opc0, dl,
+ MVT::getVectorVT(MVT::i1, VT.getSizeInBits()),
+ Src2, Src1);
+ return DAG.getBitcast(VT, Res);
+ }
+ case CONVERT_MASK_TO_VEC: {
+ SDValue Mask = Op.getOperand(1);
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
+ SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
+ return DAG.getNode(IntrData->Opc0, dl, VT, VMask);
+ }
default:
break;
}
return DAG.getMergeValues(Results, DL);
}
-static SDValue LowerSEHRESTOREFRAME(SDValue Op, const X86Subtarget *Subtarget,
- SelectionDAG &DAG) {
- MachineFunction &MF = DAG.getMachineFunction();
- const Function *Fn = MF.getFunction();
- SDLoc dl(Op);
- SDValue Chain = Op.getOperand(0);
-
- assert(Subtarget->getFrameLowering()->hasFP(MF) &&
- "using llvm.x86.seh.restoreframe requires a frame pointer");
-
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- MVT VT = TLI.getPointerTy(DAG.getDataLayout());
-
- const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
- unsigned FrameReg =
- RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
- unsigned SPReg = RegInfo->getStackRegister();
- unsigned SlotSize = RegInfo->getSlotSize();
-
- // Get incoming EBP.
- SDValue IncomingEBP =
- DAG.getCopyFromReg(Chain, dl, FrameReg, VT);
-
- // SP is saved in the first field of every registration node, so load
- // [EBP-RegNodeSize] into SP.
- int RegNodeSize = getSEHRegistrationNodeSize(Fn);
- SDValue SPAddr = DAG.getNode(ISD::ADD, dl, VT, IncomingEBP,
- DAG.getConstant(-RegNodeSize, dl, VT));
- SDValue NewSP =
- DAG.getLoad(VT, dl, Chain, SPAddr, MachinePointerInfo(), false, false,
- false, VT.getScalarSizeInBits() / 8);
- Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP);
-
- if (!RegInfo->needsStackRealignment(MF)) {
- // Adjust EBP to point back to the original frame position.
- SDValue NewFP = recoverFramePointer(DAG, Fn, IncomingEBP);
- Chain = DAG.getCopyToReg(Chain, dl, FrameReg, NewFP);
- } else {
- assert(RegInfo->hasBasePointer(MF) &&
- "functions with Win32 EH must use frame or base pointer register");
-
- // Reload the base pointer (ESI) with the adjusted incoming EBP.
- SDValue NewBP = recoverFramePointer(DAG, Fn, IncomingEBP);
- Chain = DAG.getCopyToReg(Chain, dl, RegInfo->getBaseRegister(), NewBP);
-
- // Reload the spilled EBP value, now that the stack and base pointers are
- // set up.
- X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
- X86FI->setHasSEHFramePtrSave(true);
- int FI = MF.getFrameInfo()->CreateSpillStackObject(SlotSize, SlotSize);
- X86FI->setSEHFramePtrSaveIndex(FI);
- SDValue NewFP = DAG.getLoad(VT, dl, Chain, DAG.getFrameIndex(FI, VT),
- MachinePointerInfo(), false, false, false,
- VT.getScalarSizeInBits() / 8);
- Chain = DAG.getCopyToReg(NewFP, dl, FrameReg, NewFP);
- }
-
- return Chain;
-}
-
static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
MachineFunction &MF = DAG.getMachineFunction();
SDValue Chain = Op.getOperand(0);
const IntrinsicData* IntrData = getIntrinsicWithChain(IntNo);
if (!IntrData) {
- if (IntNo == llvm::Intrinsic::x86_seh_restoreframe)
- return LowerSEHRESTOREFRAME(Op, Subtarget, DAG);
- else if (IntNo == llvm::Intrinsic::x86_seh_ehregnode)
+ if (IntNo == llvm::Intrinsic::x86_seh_ehregnode)
return MarkEHRegistrationNode(Op, DAG);
return SDValue();
}
Op.getOpcode() == ISD::SRA && !Subtarget->hasXOP())
return ArithmeticShiftRight64(ShiftAmt);
- if (VT == MVT::v16i8 || (Subtarget->hasInt256() && VT == MVT::v32i8)) {
+ if (VT == MVT::v16i8 ||
+ (Subtarget->hasInt256() && VT == MVT::v32i8) ||
+ VT == MVT::v64i8) {
unsigned NumElts = VT.getVectorNumElements();
MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
R, ShiftAmt, DAG);
SHL = DAG.getBitcast(VT, SHL);
// Zero out the rightmost bits.
- SmallVector<SDValue, 32> V(
- NumElts, DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, MVT::i8));
return DAG.getNode(ISD::AND, dl, VT, SHL,
- DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
+ DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, VT));
}
if (Op.getOpcode() == ISD::SRL) {
// Make a large shift.
R, ShiftAmt, DAG);
SRL = DAG.getBitcast(VT, SRL);
// Zero out the leftmost bits.
- SmallVector<SDValue, 32> V(
- NumElts, DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, MVT::i8));
return DAG.getNode(ISD::AND, dl, VT, SRL,
- DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
+ DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT));
}
if (Op.getOpcode() == ISD::SRA) {
// ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
- SmallVector<SDValue, 32> V(NumElts,
- DAG.getConstant(128 >> ShiftAmt, dl,
- MVT::i8));
- SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V);
+
+ SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
return Res;
return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
}
+/// Widen a vector input to a vector of NVT. The
+/// input vector must have the same element type as NVT.
+/// If \p FillWithZeroes is true, the extra widened lanes are filled with
+/// zeroes; otherwise they are left undef. Returns \p InOp unchanged when it
+/// already has type NVT.
+static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
+                            bool FillWithZeroes = false) {
+  // Check if InOp already has the right width.
+  MVT InVT = InOp.getSimpleValueType();
+  if (InVT == NVT)
+    return InOp;
+
+  if (InOp.isUndef())
+    return DAG.getUNDEF(NVT);
+
+  assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
+         "input and widen element type must match");
+
+  unsigned InNumElts = InVT.getVectorNumElements();
+  unsigned WidenNumElts = NVT.getVectorNumElements();
+  assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
+         "Unexpected request for vector widening");
+
+  EVT EltVT = NVT.getVectorElementType();
+
+  SDLoc dl(InOp);
+  // Peephole: a concat whose upper half is undef (or all-zero when zero-fill
+  // was requested) can be replaced by its lower half before widening.
+  if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
+      InOp.getNumOperands() == 2) {
+    SDValue N1 = InOp.getOperand(1);
+    if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
+        N1.isUndef()) {
+      InOp = InOp.getOperand(0);
+      InVT = InOp.getSimpleValueType();
+      InNumElts = InVT.getVectorNumElements();
+    }
+  }
+  // Constant build vectors are widened by appending fill elements directly.
+  if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
+      ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
+    SmallVector<SDValue, 16> Ops;
+    for (unsigned i = 0; i < InNumElts; ++i)
+      Ops.push_back(InOp.getOperand(i));
+
+    SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
+      DAG.getUNDEF(EltVT);
+    for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
+      Ops.push_back(FillVal);
+    return DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, Ops);
+  }
+  // General case: insert InOp as the low subvector of a fill vector.
+  SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
+    DAG.getUNDEF(NVT);
+  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
+                     InOp, DAG.getIntPtrConstant(0, dl));
+}
+
static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget *Subtarget,
                             SelectionDAG &DAG) {
  assert(Subtarget->hasAVX512() &&
         "MGATHER/MSCATTER are supported on AVX-512 arch only");
+  // X86 scatter kills mask register, so its type should be added to
+  // the list of return values.
+  // If the "scatter" has 2 return values, it is already handled.
+  if (Op.getNode()->getNumValues() == 2)
+    return Op;
+
  MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
-  MVT VT = N->getValue().getSimpleValueType();
+  SDValue Src = N->getValue();
+  MVT VT = Src.getSimpleValueType();
  assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
  SDLoc dl(Op);
-  // X86 scatter kills mask register, so its type should be added to
-  // the list of return values
-  if (N->getNumValues() == 1) {
-    SDValue Index = N->getIndex();
-    if (!Subtarget->hasVLX() && !VT.is512BitVector() &&
-        !Index.getSimpleValueType().is512BitVector())
+  SDValue NewScatter;
+  SDValue Index = N->getIndex();
+  SDValue Mask = N->getMask();
+  SDValue Chain = N->getChain();
+  SDValue BasePtr = N->getBasePtr();
+  MVT MemVT = N->getMemoryVT().getSimpleVT();
+  MVT IndexVT = Index.getSimpleValueType();
+  MVT MaskVT = Mask.getSimpleValueType();
+
+  if (MemVT.getScalarSizeInBits() < VT.getScalarSizeInBits()) {
+    // The v2i32 value was promoted to v2i64.
+    // Now we "redo" the type legalizer's work and widen the original
+    // v2i32 value to v4i32. The original v2i32 is retrieved from v2i64
+    // with a shuffle.
+    assert((MemVT == MVT::v2i32 && VT == MVT::v2i64) &&
+           "Unexpected memory type");
+    int ShuffleMask[] = {0, 2, -1, -1};
+    Src = DAG.getVectorShuffle(MVT::v4i32, dl, DAG.getBitcast(MVT::v4i32, Src),
+                               DAG.getUNDEF(MVT::v4i32), ShuffleMask);
+    // Now we have 4 elements instead of 2.
+    // Expand the index.
+    MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), 4);
+    Index = ExtendToType(Index, NewIndexVT, DAG);
+
+    // Expand the mask with zeroes
+    // Mask may be <2 x i64> or <2 x i1> at this moment
+    assert((MaskVT == MVT::v2i1 || MaskVT == MVT::v2i64) &&
+           "Unexpected mask type");
+    MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), 4);
+    Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
+    VT = MVT::v4i32;
+  }
+
+  unsigned NumElts = VT.getVectorNumElements();
+  if (!Subtarget->hasVLX() && !VT.is512BitVector() &&
+      !Index.getSimpleValueType().is512BitVector()) {
+    // AVX512F supports only 512-bit vectors. Either the data or the index
+    // should be 512 bits wide. If both the index and data are currently
+    // 256-bit and the vector contains 8 elements, we just sign-extend the
+    // index.
+    if (IndexVT == MVT::v8i32)
+      // Just extend index
      Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
+    else {
+      // The minimal number of elts in scatter is 8
+      NumElts = 8;
+      // Index
+      MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
+      // Use original index here, do not modify the index twice
+      Index = ExtendToType(N->getIndex(), NewIndexVT, DAG);
+      if (IndexVT.getScalarType() == MVT::i32)
+        Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
+
+      // Mask
+      // At this point we have promoted mask operand
+      assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
+      MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
+      // Use the original mask here, do not modify the mask twice
+      Mask = ExtendToType(N->getMask(), ExtMaskVT, DAG, true);
+
+      // The value that should be stored
+      MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
+      Src = ExtendToType(Src, NewVT, DAG);
+    }
+  }
+  // If the mask is "wide" at this point - truncate it to i1 vector
+  MVT BitMaskVT = MVT::getVectorVT(MVT::i1, NumElts);
+  Mask = DAG.getNode(ISD::TRUNCATE, dl, BitMaskVT, Mask);
+
+  // The mask is killed by scatter, add it to the values
+  SDVTList VTs = DAG.getVTList(BitMaskVT, MVT::Other);
+  SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index};
+  NewScatter = DAG.getMaskedScatter(VTs, N->getMemoryVT(), dl, Ops,
+                                    N->getMemOperand());
+  // Rewire users of the original chain result to the new scatter's chain.
+  DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
+  return SDValue(NewScatter.getNode(), 0);
+}
+
+/// Lower a masked load (ISD::MLOAD). On AVX-512 without VLX, a sub-512-bit
+/// masked load with a v8i1 mask is widened to 512 bits, loaded, and the
+/// result is extracted back to the original width.
+static SDValue LowerMLOAD(SDValue Op, const X86Subtarget *Subtarget,
+                          SelectionDAG &DAG) {
-    SDVTList VTs = DAG.getVTList(N->getMask().getValueType(), MVT::Other);
-    SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
-                      N->getOperand(3), Index };
+  MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
+  MVT VT = Op.getSimpleValueType();
+  SDValue Mask = N->getMask();
+  SDLoc dl(Op);
-    SDValue NewScatter = DAG.getMaskedScatter(VTs, VT, dl, Ops, N->getMemOperand());
-    DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
-    return SDValue(NewScatter.getNode(), 0);
+  if (Subtarget->hasAVX512() && !Subtarget->hasVLX() &&
+      !VT.is512BitVector() && Mask.getValueType() == MVT::v8i1) {
+    // This operation is legal for targets with VLX, but without
+    // VLX the vector should be widened to 512 bit
+    unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
+    MVT WideDataVT = MVT::getVectorVT(VT.getScalarType(), NumEltsInWideVec);
+    MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
+    SDValue Src0 = N->getSrc0();
+    // Widen the pass-thru with undef lanes, but zero-fill the mask so the
+    // extra lanes stay inactive.
+    Src0 = ExtendToType(Src0, WideDataVT, DAG);
+    Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
+    SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(),
+                                        N->getBasePtr(), Mask, Src0,
+                                        N->getMemoryVT(), N->getMemOperand(),
+                                        N->getExtensionType());
+
+    SDValue Exract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
+                                 NewLoad.getValue(0),
+                                 DAG.getIntPtrConstant(0, dl));
+    SDValue RetOps[] = {Exract, NewLoad.getValue(1)};
+    return DAG.getMergeValues(RetOps, dl);
+  }
+  return Op;
+}
+
+/// Lower a masked store (ISD::MSTORE). On AVX-512 without VLX, a sub-512-bit
+/// masked store with a v8i1 mask is widened to 512 bits with a zero-filled
+/// mask, so the extra lanes never write memory.
+static SDValue LowerMSTORE(SDValue Op, const X86Subtarget *Subtarget,
+                           SelectionDAG &DAG) {
+  MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
+  SDValue DataToStore = N->getValue();
+  MVT VT = DataToStore.getSimpleValueType();
+  SDValue Mask = N->getMask();
+  SDLoc dl(Op);
+
+  if (Subtarget->hasAVX512() && !Subtarget->hasVLX() &&
+      !VT.is512BitVector() && Mask.getValueType() == MVT::v8i1) {
+    // This operation is legal for targets with VLX, but without
+    // VLX the vector should be widened to 512 bit
+    unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
+    MVT WideDataVT = MVT::getVectorVT(VT.getScalarType(), NumEltsInWideVec);
+    MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
+    DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
+    Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
+    return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
+                              Mask, N->getMemoryVT(), N->getMemOperand(),
+                              N->isTruncatingStore());
  }
  return Op;
}
"MGATHER/MSCATTER are supported on AVX-512 arch only");
MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
+ SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
+ SDValue Index = N->getIndex();
+ SDValue Mask = N->getMask();
+ SDValue Src0 = N->getValue();
+ MVT IndexVT = Index.getSimpleValueType();
+ MVT MaskVT = Mask.getSimpleValueType();
+
+ unsigned NumElts = VT.getVectorNumElements();
assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
- SDLoc dl(Op);
- SDValue Index = N->getIndex();
if (!Subtarget->hasVLX() && !VT.is512BitVector() &&
!Index.getSimpleValueType().is512BitVector()) {
- Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
- SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
- N->getOperand(3), Index };
- DAG.UpdateNodeOperands(N, Ops);
+    // AVX512F supports only 512-bit vectors. Either the data or the index
+    // should be 512 bits wide. If both the index and data are currently
+    // 256-bit and the vector contains 8 elements, we just sign-extend the
+    // index.
+ if (NumElts == 8) {
+ Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
+ SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
+ N->getOperand(3), Index };
+ DAG.UpdateNodeOperands(N, Ops);
+ return Op;
+ }
+
+ // Minimal number of elements in Gather
+ NumElts = 8;
+ // Index
+ MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
+ Index = ExtendToType(Index, NewIndexVT, DAG);
+ if (IndexVT.getScalarType() == MVT::i32)
+ Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
+
+ // Mask
+ MVT MaskBitVT = MVT::getVectorVT(MVT::i1, NumElts);
+ // At this point we have promoted mask operand
+ assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
+ MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
+ Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
+ Mask = DAG.getNode(ISD::TRUNCATE, dl, MaskBitVT, Mask);
+
+ // The pass-thru value
+ MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
+ Src0 = ExtendToType(Src0, NewVT, DAG);
+
+ SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
+ SDValue NewGather = DAG.getMaskedGather(DAG.getVTList(NewVT, MVT::Other),
+ N->getMemoryVT(), dl, Ops,
+ N->getMemOperand());
+ SDValue Exract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
+ NewGather.getValue(0),
+ DAG.getIntPtrConstant(0, dl));
+ SDValue RetOps[] = {Exract, NewGather.getValue(1)};
+ return DAG.getMergeValues(RetOps, dl);
}
return Op;
}
case ISD::UMAX:
case ISD::UMIN: return LowerMINMAX(Op, DAG);
case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
+ case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
+ case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
case ISD::GC_TRANSITION_START:
return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);
}
}
+ case ISD::INTRINSIC_WO_CHAIN: {
+ if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), Subtarget, DAG))
+ Results.push_back(V);
+ return;
+ }
case ISD::READCYCLECOUNTER: {
return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
Results);
case X86ISD::CMOV: return "X86ISD::CMOV";
case X86ISD::BRCOND: return "X86ISD::BRCOND";
case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG";
+ case X86ISD::IRET: return "X86ISD::IRET";
case X86ISD::REP_STOS: return "X86ISD::REP_STOS";
case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS";
case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg";
return EmitLoweredTLSCall(MI, BB);
case X86::CMOV_FR32:
case X86::CMOV_FR64:
+ case X86::CMOV_FR128:
case X86::CMOV_GR8:
case X86::CMOV_GR16:
case X86::CMOV_GR32:
return TargetLowering::isGAPlusOffset(N, GA, Offset);
}
-/// isShuffleHigh128VectorInsertLow - Checks whether the shuffle node is the
-/// same as extracting the high 128-bit part of 256-bit vector and then
-/// inserting the result into the low part of a new 256-bit vector
-static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) {
- EVT VT = SVOp->getValueType(0);
- unsigned NumElems = VT.getVectorNumElements();
-
- // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
- for (unsigned i = 0, j = NumElems/2; i != NumElems/2; ++i, ++j)
- if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
- SVOp->getMaskElt(j) >= 0)
- return false;
-
- return true;
-}
-
-/// isShuffleLow128VectorInsertHigh - Checks whether the shuffle node is the
-/// same as extracting the low 128-bit part of 256-bit vector and then
-/// inserting the result into the high part of a new 256-bit vector
-static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) {
- EVT VT = SVOp->getValueType(0);
- unsigned NumElems = VT.getVectorNumElements();
-
- // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
- for (unsigned i = NumElems/2, j = 0; i != NumElems; ++i, ++j)
- if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
- SVOp->getMaskElt(j) >= 0)
- return false;
-
- return true;
-}
-
/// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors.
+/// FIXME: This could be expanded to support 512 bit vectors as well.
static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget* Subtarget) {
return DCI.CombineTo(N, InsV);
}
- //===--------------------------------------------------------------------===//
- // Combine some shuffles into subvector extracts and inserts:
- //
-
- // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
- if (isShuffleHigh128VectorInsertLow(SVOp)) {
- SDValue V = Extract128BitVector(V1, NumElems/2, DAG, dl);
- SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, 0, DAG, dl);
- return DCI.CombineTo(N, InsV);
- }
-
- // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
- if (isShuffleLow128VectorInsertHigh(SVOp)) {
- SDValue V = Extract128BitVector(V1, 0, DAG, dl);
- SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, NumElems/2, DAG, dl);
- return DCI.CombineTo(N, InsV);
- }
-
return SDValue();
}
// ignored in unsafe-math mode).
// We also try to create v2f32 min/max nodes, which we later widen to v4f32.
if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
- VT != MVT::f80 && (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
+ VT != MVT::f80 && VT != MVT::f128 &&
+ (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
(Subtarget->hasSSE2() ||
(Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) {
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
MulAmt1 = 3;
MulAmt2 = MulAmt / 3;
}
+
+ SDLoc DL(N);
+ SDValue NewMul;
if (MulAmt2 &&
(isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
- SDLoc DL(N);
if (isPowerOf2_64(MulAmt2) &&
!(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
// is an add.
std::swap(MulAmt1, MulAmt2);
- SDValue NewMul;
if (isPowerOf2_64(MulAmt1))
NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
else
NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
DAG.getConstant(MulAmt2, DL, VT));
+ }
+ if (!NewMul) {
+ assert(MulAmt != 0 && MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX)
+ && "Both cases that could cause potential overflows should have "
+ "already been handled.");
+ if (isPowerOf2_64(MulAmt - 1))
+ // (mul x, 2^N + 1) => (add (shl x, N), x)
+ NewMul = DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
+ DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
+ DAG.getConstant(Log2_64(MulAmt - 1), DL,
+ MVT::i8)));
+
+ else if (isPowerOf2_64(MulAmt + 1))
+ // (mul x, 2^N - 1) => (sub (shl x, N), x)
+ NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getNode(ISD::SHL, DL, VT,
+ N->getOperand(0),
+ DAG.getConstant(Log2_64(MulAmt + 1),
+ DL, MVT::i8)), N->getOperand(0));
+ }
+
+ if (NewMul)
// Do not add new nodes to DAG combiner worklist.
DCI.CombineTo(N, NewMul, false);
- }
+
return SDValue();
}
return SDValue();
}
+/// Fold (ashr (shl x, C1), C2) into a sign_extend_inreg plus, if still
+/// needed, a cheaper shl/sra — exploiting that a sext is a MOV on x86.
+static SDValue PerformSRACombine(SDNode *N, SelectionDAG &DAG) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  EVT VT = N0.getValueType();
+  unsigned Size = VT.getSizeInBits();
+
+  // fold (ashr (shl, a, [56,48,32,24,16]), SarConst)
+  // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or
+  // into (lshr, (sext (a), SarConst - [56,48,32,24,16]))
+  // depending on sign of (SarConst - [56,48,32,24,16])
+
+  // sexts in X86 are MOVs. The MOVs have the same code size
+  // as above SHIFTs (only SHIFT on 1 has lower code size).
+  // However the MOVs have 2 advantages to a SHIFT:
+  // 1. MOVs can write to a register that differs from source
+  // 2. MOVs accept memory operands
+
+  // Only scalar integer (shl x, Cst1) feeding this sra with a constant amount,
+  // and the shl must have no other users (or we would duplicate work).
+  if (!VT.isInteger() || VT.isVector() || N1.getOpcode() != ISD::Constant ||
+      N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
+      N0.getOperand(1).getOpcode() != ISD::Constant)
+    return SDValue();
+
+  SDValue N00 = N0.getOperand(0);
+  SDValue N01 = N0.getOperand(1);
+  APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
+  APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
+  EVT CVT = N1.getValueType();
+
+  // A negative sra amount would be undefined behavior; bail out.
+  if (SarConst.isNegative())
+    return SDValue();
+
+  for (MVT SVT : MVT::integer_valuetypes()) {
+    unsigned ShiftSize = SVT.getSizeInBits();
+    // skipping types without corresponding sext/zext and
+    // ShlConst that is not one of [56,48,32,24,16]
+    if (ShiftSize < 8 || ShiftSize > 64 || ShlConst != Size - ShiftSize)
+      continue;
+    SDLoc DL(N);
+    SDValue NN =
+        DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
+    SarConst = SarConst - (Size - ShiftSize);
+    if (SarConst == 0)
+      return NN;
+    else if (SarConst.isNegative())
+      return DAG.getNode(ISD::SHL, DL, VT, NN,
+                         DAG.getConstant(-SarConst, DL, CVT));
+    else
+      return DAG.getNode(ISD::SRA, DL, VT, NN,
+                         DAG.getConstant(SarConst, DL, CVT));
+  }
+  return SDValue();
+}
+
/// \brief Returns a vector of 0s if the node in input is a vector logical
/// shift by a constant amount which is known to be bigger than or equal
/// to the vector element size in bits.
if (SDValue V = PerformSHLCombine(N, DAG))
return V;
+ if (N->getOpcode() == ISD::SRA)
+ if (SDValue V = PerformSRACombine(N, DAG))
+ return V;
+
// Try to fold this logical shift into a zero vector.
if (N->getOpcode() != ISD::SRA)
if (SDValue V = performShiftToAllZeros(N, DAG, Subtarget))
return SDValue();
}
-static SDValue PerformTRUNCATECombine(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget *Subtarget) {
- return detectAVGPattern(N->getOperand(0), N->getValueType(0), DAG, Subtarget,
- SDLoc(N));
-}
-
/// PerformLOADCombine - Do target-specific dag combines on LOAD nodes.
static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
return SDValue();
}
+/// Truncate a group of v4i32/v2i64 vectors into v16i8/v8i16 using
+/// X86ISD::PACKUS. The inputs are masked down to the output element width
+/// first so PACKUS's unsigned saturation never triggers.
+static SDValue
+combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG,
+                                  SmallVector<SDValue, 8> &Regs) {
+  assert(Regs.size() > 0 && (Regs[0].getValueType() == MVT::v4i32 ||
+                             Regs[0].getValueType() == MVT::v2i64));
+  EVT OutVT = N->getValueType(0);
+  EVT OutSVT = OutVT.getVectorElementType();
+  EVT InVT = Regs[0].getValueType();
+  EVT InSVT = InVT.getVectorElementType();
+  SDLoc DL(N);
+
+  // First, use mask to unset all bits that won't appear in the result.
+  assert((OutSVT == MVT::i8 || OutSVT == MVT::i16) &&
+         "OutSVT can only be either i8 or i16.");
+  SDValue MaskVal =
+      DAG.getConstant(OutSVT == MVT::i8 ? 0xFF : 0xFFFF, DL, InSVT);
+  SDValue MaskVec = DAG.getNode(
+      ISD::BUILD_VECTOR, DL, InVT,
+      SmallVector<SDValue, 8>(InVT.getVectorNumElements(), MaskVal));
+  for (auto &Reg : Regs)
+    Reg = DAG.getNode(ISD::AND, DL, InVT, MaskVec, Reg);
+
+  MVT UnpackedVT, PackedVT;
+  if (OutSVT == MVT::i8) {
+    UnpackedVT = MVT::v8i16;
+    PackedVT = MVT::v16i8;
+  } else {
+    UnpackedVT = MVT::v4i32;
+    PackedVT = MVT::v8i16;
+  }
+
+  // In each iteration, truncate the type by a half size.
+  auto RegNum = Regs.size();
+  for (unsigned j = 1, e = InSVT.getSizeInBits() / OutSVT.getSizeInBits();
+       j < e; j *= 2, RegNum /= 2) {
+    for (unsigned i = 0; i < RegNum; i++)
+      Regs[i] = DAG.getNode(ISD::BITCAST, DL, UnpackedVT, Regs[i]);
+    for (unsigned i = 0; i < RegNum / 2; i++)
+      Regs[i] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[i * 2],
+                            Regs[i * 2 + 1]);
+  }
+
+  // If the type of the result is v8i8, we need to do one more X86ISD::PACKUS,
+  // and then extract a subvector as the result since v8i8 is not a legal type.
+  if (OutVT == MVT::v8i8) {
+    Regs[0] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[0], Regs[0]);
+    Regs[0] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, Regs[0],
+                          DAG.getIntPtrConstant(0, DL));
+    return Regs[0];
+  } else if (RegNum > 1) {
+    Regs.resize(RegNum);
+    return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
+  } else
+    return Regs[0];
+}
+
+/// Truncate a group of v4i32 vectors into v8i16 using X86ISD::PACKSS. The
+/// shl+sra pair sign-fills the upper half of each element first, so PACKSS's
+/// signed saturation becomes a no-op.
+static SDValue
+combineVectorTruncationWithPACKSS(SDNode *N, SelectionDAG &DAG,
+                                  SmallVector<SDValue, 8> &Regs) {
+  assert(Regs.size() > 0 && Regs[0].getValueType() == MVT::v4i32);
+  EVT OutVT = N->getValueType(0);
+  SDLoc DL(N);
+
+  // Shift left by 16 bits, then arithmetic-shift right by 16 bits.
+  SDValue ShAmt = DAG.getConstant(16, DL, MVT::i32);
+  for (auto &Reg : Regs) {
+    Reg = getTargetVShiftNode(X86ISD::VSHLI, DL, MVT::v4i32, Reg, ShAmt, DAG);
+    Reg = getTargetVShiftNode(X86ISD::VSRAI, DL, MVT::v4i32, Reg, ShAmt, DAG);
+  }
+
+  // Pack adjacent pairs of v4i32 into v8i16.
+  for (unsigned i = 0, e = Regs.size() / 2; i < e; i++)
+    Regs[i] = DAG.getNode(X86ISD::PACKSS, DL, MVT::v8i16, Regs[i * 2],
+                          Regs[i * 2 + 1]);
+
+  if (Regs.size() > 2) {
+    Regs.resize(Regs.size() / 2);
+    return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
+  } else
+    return Regs[0];
+}
+
+/// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
+/// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
+/// legalization the truncation will be translated into a BUILD_VECTOR with each
+/// element that is extracted from a vector and then truncated, and it is
+/// difficult to do this optimization based on them.
+static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
+                                       const X86Subtarget *Subtarget) {
+  EVT OutVT = N->getValueType(0);
+  if (!OutVT.isVector())
+    return SDValue();
+
+  SDValue In = N->getOperand(0);
+  if (!In.getValueType().isSimple())
+    return SDValue();
+
+  EVT InVT = In.getValueType();
+  unsigned NumElems = OutVT.getVectorNumElements();
+
+  // TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on
+  // SSE2, and we need to take care of it specially.
+  // AVX512 provides vpmovdb.
+  if (!Subtarget->hasSSE2() || Subtarget->hasAVX2())
+    return SDValue();
+
+  EVT OutSVT = OutVT.getVectorElementType();
+  EVT InSVT = InVT.getVectorElementType();
+  // Only handle i32/i64 -> i8/i16 with a power-of-2 element count of >= 8.
+  if (!((InSVT == MVT::i32 || InSVT == MVT::i64) &&
+        (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
+        NumElems >= 8))
+    return SDValue();
+
+  // SSSE3's pshufb results in less instructions in the cases below.
+  if (Subtarget->hasSSSE3() && NumElems == 8 &&
+      ((OutSVT == MVT::i8 && InSVT != MVT::i64) ||
+       (InSVT == MVT::i32 && OutSVT == MVT::i16)))
+    return SDValue();
+
+  SDLoc DL(N);
+
+  // Split a long vector into vectors of legal type.
+  unsigned RegNum = InVT.getSizeInBits() / 128;
+  SmallVector<SDValue, 8> SubVec(RegNum);
+  if (InSVT == MVT::i32) {
+    for (unsigned i = 0; i < RegNum; i++)
+      SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
+                              DAG.getIntPtrConstant(i * 4, DL));
+  } else {
+    for (unsigned i = 0; i < RegNum; i++)
+      SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
+                              DAG.getIntPtrConstant(i * 2, DL));
+  }
+
+  // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
+  // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
+  // truncate 2 x v4i32 to v8i16.
+  if (Subtarget->hasSSE41() || OutSVT == MVT::i8)
+    return combineVectorTruncationWithPACKUS(N, DAG, SubVec);
+  else if (InSVT == MVT::i32)
+    return combineVectorTruncationWithPACKSS(N, DAG, SubVec);
+  else
+    return SDValue();
+}
+
+/// Do target-specific dag combines on ISD::TRUNCATE nodes: first try the
+/// PAVG pattern, then the PACKUS/PACKSS vector-truncation transform.
+static SDValue PerformTRUNCATECombine(SDNode *N, SelectionDAG &DAG,
+                                      const X86Subtarget *Subtarget) {
+  // Try to detect AVG pattern first.
+  SDValue Avg = detectAVGPattern(N->getOperand(0), N->getValueType(0), DAG,
+                                 Subtarget, SDLoc(N));
+  if (Avg.getNode())
+    return Avg;
+
+  return combineVectorTruncation(N, DAG, Subtarget);
+}
+
/// Do target-specific dag combines on floating point negations.
static SDValue PerformFNEGCombine(SDNode *N, SelectionDAG &DAG,
const X86Subtarget *Subtarget) {
// If we're negating a FMUL node on a target with FMA, then we can avoid the
// use of a constant by performing (-0 - A*B) instead.
- // FIXME: Check rounding control flags as well once it becomes available.
+ // FIXME: Check rounding control flags as well once it becomes available.
if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
Arg->getFlags()->hasNoSignedZeros() && Subtarget->hasAnyFMA()) {
SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
return SDValue();
}
+/// Rewrite a 512-bit X86ISD FP logic op (FOR/FXOR/FAND/FANDN) in the integer
+/// domain when the FP forms are unavailable on this subtarget.
+static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
+                                 const X86Subtarget *Subtarget) {
+  EVT VT = N->getValueType(0);
+  if (VT.is512BitVector() && !Subtarget->hasDQI()) {
+    // VXORPS, VORPS, VANDPS, VANDNPS are supported only under DQ extension.
+    // These logic operations may be executed in the integer domain.
+    SDLoc dl(N);
+    MVT IntScalar = MVT::getIntegerVT(VT.getScalarSizeInBits());
+    MVT IntVT = MVT::getVectorVT(IntScalar, VT.getVectorNumElements());
+
+    SDValue Op0 = DAG.getNode(ISD::BITCAST, dl, IntVT, N->getOperand(0));
+    SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, IntVT, N->getOperand(1));
+    unsigned IntOpcode = 0;
+    switch (N->getOpcode()) {
+      default: llvm_unreachable("Unexpected FP logic op");
+      case X86ISD::FOR: IntOpcode = ISD::OR; break;
+      case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
+      case X86ISD::FAND: IntOpcode = ISD::AND; break;
+      case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
+    }
+    SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
+    return DAG.getNode(ISD::BITCAST, dl, VT, IntOp);
+  }
+  return SDValue();
+}
/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG,
const X86Subtarget *Subtarget) {
if (C->getValueAPF().isPosZero())
return N->getOperand(0);
- EVT VT = N->getValueType(0);
- if (VT.is512BitVector() && !Subtarget->hasDQI()) {
- SDLoc dl(N);
- MVT IntScalar = MVT::getIntegerVT(VT.getScalarSizeInBits());
- MVT IntVT = MVT::getVectorVT(IntScalar, VT.getVectorNumElements());
-
- SDValue Op0 = DAG.getNode(ISD::BITCAST, dl, IntVT, N->getOperand(0));
- SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, IntVT, N->getOperand(1));
- unsigned IntOpcode = (N->getOpcode() == X86ISD::FOR) ? ISD::OR : ISD::XOR;
- SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
- return DAG.getNode(ISD::BITCAST, dl, VT, IntOp);
- }
- return SDValue();
+ return lowerX86FPLogicOp(N, DAG, Subtarget);
}
/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
N->getOperand(0), N->getOperand(1));
}
+/// Combine ISD::FMAXNUM into X86ISD::FMAX plus a select that restores the
+/// "ignore a NaN operand" semantics the SSE max instruction lacks.
+static SDValue performFMaxNumCombine(SDNode *N, SelectionDAG &DAG,
+                                     const X86Subtarget *Subtarget) {
+  // This takes at least 3 instructions, so favor a library call when
+  // minimizing code size.
+  if (DAG.getMachineFunction().getFunction()->optForMinSize())
+    return SDValue();
+
+  EVT VT = N->getValueType(0);
+
+  // TODO: Check for global or instruction-level "nnan". In that case, we
+  // should be able to lower to FMAX/FMIN alone.
+  // TODO: If an operand is already known to be a NaN or not a NaN, this
+  // should be an optional swap and FMAX/FMIN.
+  // TODO: Allow f64, vectors, and fminnum.
+
+  if (VT != MVT::f32 || !Subtarget->hasSSE1() || Subtarget->useSoftFloat())
+    return SDValue();
+
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+  SDLoc DL(N);
+  EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType(
+      DAG.getDataLayout(), *DAG.getContext(), VT);
+
+  // There are 4 possibilities involving NaN inputs, and these are the required
+  // outputs:
+  //                   Op1
+  //               Num     NaN
+  //            ----------------
+  //       Num  |  Max  |  Op0 |
+  // Op0        ----------------
+  //       NaN  |  Op1  |  NaN |
+  //            ----------------
+  //
+  // The SSE FP max/min instructions were not designed for this case, but rather
+  // to implement:
+  //   Max = Op1 > Op0 ? Op1 : Op0
+  //
+  // So they always return Op0 if either input is a NaN. However, we can still
+  // use those instructions for fmaxnum by selecting away a NaN input.
+
+  // If either operand is NaN, the 2nd source operand (Op0) is passed through.
+  SDValue Max = DAG.getNode(X86ISD::FMAX, DL, VT, Op1, Op0);
+  SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType , Op0, Op0, ISD::SETUO);
+
+  // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
+  // are NaN, the NaN value of Op1 is the result.
+  return DAG.getNode(ISD::SELECT, DL, VT, IsOp0Nan, Op1, Max);
+}
+
/// Do target-specific dag combines on X86ISD::FAND nodes.
-static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
+static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
// FAND(0.0, x) -> 0.0
if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
if (C->getValueAPF().isPosZero())
if (C->getValueAPF().isPosZero())
return N->getOperand(1);
- return SDValue();
+ return lowerX86FPLogicOp(N, DAG, Subtarget);
}
/// Do target-specific dag combines on X86ISD::FANDN nodes.
-static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG) {
+static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG,
+                                   const X86Subtarget *Subtarget) {
  // FANDN(0.0, x) -> x
  // FANDN computes ~Op0 & Op1; with Op0 == +0.0 (an all-zero bit pattern)
  // the complement is all ones, so the result is exactly Op1.
  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
    if (C->getValueAPF().isPosZero())
      return N->getOperand(1);
-  return SDValue();
+  // Otherwise give the f128 softening path a chance to lower the logic op.
+  return lowerX86FPLogicOp(N, DAG, Subtarget);
}
static SDValue PerformBTCombine(SDNode *N,
return SDValue();
}
+/// Strip a redundant SIGN_EXTEND_INREG from the mask operand (operand 2) of
+/// a masked gather/scatter node.
+static SDValue PerformGatherScatterCombine(SDNode *N, SelectionDAG &DAG) {
+  // Gather and Scatter instructions use k-registers for masks. The type of
+  // the masks is v*i1. So the mask will be truncated anyway.
+  // The SIGN_EXTEND_INREG may be dropped.
+  SDValue Mask = N->getOperand(2);
+  if (Mask.getOpcode() == ISD::SIGN_EXTEND_INREG) {
+    // Rebuild the full operand list with the un-extended mask spliced in.
+    SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
+    NewOps[2] = Mask.getOperand(0);
+    // NOTE(review): UpdateNodeOperands may return a pre-existing CSE'd node;
+    // the return value is ignored here -- confirm N remains the live node.
+    DAG.UpdateNodeOperands(N, NewOps);
+  }
+  // No replacement value: the node was modified in place (or left alone).
+  return SDValue();
+}
+
// Helper function of PerformSETCCCombine. It materializes "setb reg"
// as "sbb reg,reg", since the latter can be extended without a zext and
// produces an all-ones bit which is more useful than 0/1 in some cases.
case X86ISD::FOR: return PerformFORCombine(N, DAG, Subtarget);
case X86ISD::FMIN:
case X86ISD::FMAX: return PerformFMinFMaxCombine(N, DAG);
- case X86ISD::FAND: return PerformFANDCombine(N, DAG);
- case X86ISD::FANDN: return PerformFANDNCombine(N, DAG);
+ case ISD::FMAXNUM: return performFMaxNumCombine(N, DAG, Subtarget);
+ case X86ISD::FAND: return PerformFANDCombine(N, DAG, Subtarget);
+ case X86ISD::FANDN: return PerformFANDNCombine(N, DAG, Subtarget);
case X86ISD::BT: return PerformBTCombine(N, DAG, DCI);
case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG);
case ISD::ANY_EXTEND:
case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget);
case ISD::FMA: return PerformFMACombine(N, DAG, Subtarget);
case X86ISD::BLENDI: return PerformBLENDICombine(N, DAG);
+ case ISD::MGATHER:
+ case ISD::MSCATTER: return PerformGatherScatterCombine(N, DAG);
}
return SDValue();
case MVT::f64:
case MVT::i64:
return std::make_pair(0U, &X86::FR64RegClass);
+ // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
// Vector types.
case MVT::v16i8:
case MVT::v8i16:
if (Class == &X86::GR8RegClass || Class == &X86::GR16RegClass ||
Class == &X86::GR32RegClass || Class == &X86::GR64RegClass) {
unsigned Size = VT.getSizeInBits();
- MVT::SimpleValueType SimpleTy = Size == 1 || Size == 8 ? MVT::i8
- : Size == 16 ? MVT::i16
- : Size == 32 ? MVT::i32
- : Size == 64 ? MVT::i64
- : MVT::Other;
- unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, SimpleTy);
+ if (Size == 1) Size = 8;
+ unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
if (DestReg > 0) {
Res.first = DestReg;
- Res.second = SimpleTy == MVT::i8 ? &X86::GR8RegClass
- : SimpleTy == MVT::i16 ? &X86::GR16RegClass
- : SimpleTy == MVT::i32 ? &X86::GR32RegClass
+ Res.second = Size == 8 ? &X86::GR8RegClass
+ : Size == 16 ? &X86::GR16RegClass
+ : Size == 32 ? &X86::GR32RegClass
: &X86::GR64RegClass;
assert(Res.second->contains(Res.first) && "Register in register class");
} else {
// target independent register mapper will just pick the first match it can
// find, ignoring the required type.
+ // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
if (VT == MVT::f32 || VT == MVT::i32)
Res.second = &X86::FR32RegClass;
else if (VT == MVT::f64 || VT == MVT::i64)