setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
+ // Only provide customized ctpop vector bit twiddling for vector types we
+ // know to perform better than using the popcnt instructions on each vector
+ // element. If popcnt isn't supported, always provide the custom version.
+ if (!Subtarget->hasPOPCNT()) {
+ setOperationAction(ISD::CTPOP, MVT::v4i32, Custom);
+ setOperationAction(ISD::CTPOP, MVT::v2i64, Custom);
+ }
+
// Custom lower build_vector, vector_shuffle, and extract_vector_elt.
for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
MVT VT = (MVT::SimpleValueType)i;
// The custom lowering for UINT_TO_FP for v8i32 becomes interesting
// when we have a 256bit-wide blend with immediate.
setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
+
+ // Only provide customized ctpop vector bit twiddling for vector types we
+ // know to perform better than using the popcnt instructions on each
+ // vector element. If popcnt isn't supported, always provide the custom
+ // version.
+ if (!Subtarget->hasPOPCNT())
+ setOperationAction(ISD::CTPOP, MVT::v4i64, Custom);
+
+ // Custom CTPOP always performs better on natively supported v8i32
+ setOperationAction(ISD::CTPOP, MVT::v8i32, Custom);
} else {
setOperationAction(ISD::ADD, MVT::v4i64, Custom);
setOperationAction(ISD::ADD, MVT::v8i32, Custom);
MFI->CreateFixedObject(1, StackSize, true));
}
+ // Figure out if XMM registers are in use.
+ assert(!(MF.getTarget().Options.UseSoftFloat &&
+ Fn->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
+ Attribute::NoImplicitFloat)) &&
+ "SSE register cannot be used when SSE is disabled!");
+
// 64-bit calling conventions support varargs and register parameters, so we
- // have to do extra work to spill them in the prologue or forward them to
- // musttail calls.
- if (Is64Bit && isVarArg &&
- (MFI->hasVAStart() || MFI->hasMustTailInVarArgFunc())) {
+ // have to do extra work to spill them in the prologue.
+ if (Is64Bit && isVarArg && MFI->hasVAStart()) {
// Find the first unallocated argument registers.
ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
}
}
- // Store them to the va_list returned by va_start.
- if (MFI->hasVAStart()) {
- if (IsWin64) {
- const TargetFrameLowering &TFI = *MF.getSubtarget().getFrameLowering();
- // Get to the caller-allocated home save location. Add 8 to account
- // for the return address.
- int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
- FuncInfo->setRegSaveFrameIndex(
+ if (IsWin64) {
+ const TargetFrameLowering &TFI = *MF.getSubtarget().getFrameLowering();
+ // Get to the caller-allocated home save location. Add 8 to account
+ // for the return address.
+ int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
+ FuncInfo->setRegSaveFrameIndex(
MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
- // Fixup to set vararg frame on shadow area (4 x i64).
- if (NumIntRegs < 4)
- FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
- } else {
- // For X86-64, if there are vararg parameters that are passed via
- // registers, then we must store them to their spots on the stack so
- // they may be loaded by deferencing the result of va_next.
- FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
- FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
- FuncInfo->setRegSaveFrameIndex(MFI->CreateStackObject(
- ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
- }
-
- // Store the integer parameter registers.
- SmallVector<SDValue, 8> MemOps;
- SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
- getPointerTy());
- unsigned Offset = FuncInfo->getVarArgsGPOffset();
- for (SDValue Val : LiveGPRs) {
- SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
- DAG.getIntPtrConstant(Offset));
- SDValue Store =
- DAG.getStore(Val.getValue(1), dl, Val, FIN,
- MachinePointerInfo::getFixedStack(
- FuncInfo->getRegSaveFrameIndex(), Offset),
- false, false, 0);
- MemOps.push_back(Store);
- Offset += 8;
- }
-
- if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
- // Now store the XMM (fp + vector) parameter registers.
- SmallVector<SDValue, 12> SaveXMMOps;
- SaveXMMOps.push_back(Chain);
- SaveXMMOps.push_back(ALVal);
- SaveXMMOps.push_back(DAG.getIntPtrConstant(
- FuncInfo->getRegSaveFrameIndex()));
- SaveXMMOps.push_back(DAG.getIntPtrConstant(
- FuncInfo->getVarArgsFPOffset()));
- SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
- LiveXMMRegs.end());
- MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
- MVT::Other, SaveXMMOps));
- }
-
- if (!MemOps.empty())
- Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
+ // Fixup to set vararg frame on shadow area (4 x i64).
+ if (NumIntRegs < 4)
+ FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
} else {
- // Add all GPRs, al, and XMMs to the list of forwards. We will add then
- // to the liveout set on a musttail call.
- assert(MFI->hasMustTailInVarArgFunc());
- auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
- typedef X86MachineFunctionInfo::Forward Forward;
-
- for (unsigned I = 0, E = LiveGPRs.size(); I != E; ++I) {
- unsigned VReg =
- MF.getRegInfo().createVirtualRegister(&X86::GR64RegClass);
- Chain = DAG.getCopyToReg(Chain, dl, VReg, LiveGPRs[I]);
- Forwards.push_back(Forward(VReg, ArgGPRs[NumIntRegs + I], MVT::i64));
- }
-
- if (!ArgXMMs.empty()) {
- unsigned ALVReg =
- MF.getRegInfo().createVirtualRegister(&X86::GR8RegClass);
- Chain = DAG.getCopyToReg(Chain, dl, ALVReg, ALVal);
- Forwards.push_back(Forward(ALVReg, X86::AL, MVT::i8));
-
- for (unsigned I = 0, E = LiveXMMRegs.size(); I != E; ++I) {
- unsigned VReg =
- MF.getRegInfo().createVirtualRegister(&X86::VR128RegClass);
- Chain = DAG.getCopyToReg(Chain, dl, VReg, LiveXMMRegs[I]);
- Forwards.push_back(
- Forward(VReg, ArgXMMs[NumXMMRegs + I], MVT::v4f32));
- }
- }
+ // For X86-64, if there are vararg parameters that are passed via
+ // registers, then we must store them to their spots on the stack so
+ // they may be loaded by deferencing the result of va_next.
+ FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
+ FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
+ FuncInfo->setRegSaveFrameIndex(MFI->CreateStackObject(
+ ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
+ }
+
+ // Store the integer parameter registers.
+ SmallVector<SDValue, 8> MemOps;
+ SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
+ getPointerTy());
+ unsigned Offset = FuncInfo->getVarArgsGPOffset();
+ for (SDValue Val : LiveGPRs) {
+ SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
+ DAG.getIntPtrConstant(Offset));
+ SDValue Store =
+ DAG.getStore(Val.getValue(1), dl, Val, FIN,
+ MachinePointerInfo::getFixedStack(
+ FuncInfo->getRegSaveFrameIndex(), Offset),
+ false, false, 0);
+ MemOps.push_back(Store);
+ Offset += 8;
+ }
+
+ if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
+ // Now store the XMM (fp + vector) parameter registers.
+ SmallVector<SDValue, 12> SaveXMMOps;
+ SaveXMMOps.push_back(Chain);
+ SaveXMMOps.push_back(ALVal);
+ SaveXMMOps.push_back(DAG.getIntPtrConstant(
+ FuncInfo->getRegSaveFrameIndex()));
+ SaveXMMOps.push_back(DAG.getIntPtrConstant(
+ FuncInfo->getVarArgsFPOffset()));
+ SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
+ LiveXMMRegs.end());
+ MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
+ MVT::Other, SaveXMMOps));
+ }
+
+ if (!MemOps.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
+ }
+
+ if (isVarArg && MFI->hasMustTailInVarArgFunc()) {
+ // Find the largest legal vector type.
+ MVT VecVT = MVT::Other;
+ // FIXME: Only some x86_32 calling conventions support AVX512.
+ if (Subtarget->hasAVX512() &&
+ (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
+ CallConv == CallingConv::Intel_OCL_BI)))
+ VecVT = MVT::v16f32;
+ else if (Subtarget->hasAVX())
+ VecVT = MVT::v8f32;
+ else if (Subtarget->hasSSE2())
+ VecVT = MVT::v4f32;
+
+ // We forward some GPRs and some vector types.
+ SmallVector<MVT, 2> RegParmTypes;
+ MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
+ RegParmTypes.push_back(IntVT);
+ if (VecVT != MVT::Other)
+ RegParmTypes.push_back(VecVT);
+
+ // Compute the set of forwarded registers. The rest are scratch.
+ SmallVectorImpl<ForwardedRegister> &Forwards =
+ FuncInfo->getForwardedMustTailRegParms();
+ CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
+
+ // Conservatively forward AL on x86_64, since it might be used for varargs.
+ if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
+ unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
+ Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
+ }
+
+ // Copy all forwards from physical to virtual registers.
+ for (ForwardedRegister &F : Forwards) {
+ // FIXME: Can we use a less constrained schedule?
+ SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
+ F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
+ Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
}
}
DAG.getConstant(NumXMMRegs, MVT::i8)));
}
- if (Is64Bit && isVarArg && IsMustTail) {
+ if (isVarArg && IsMustTail) {
const auto &Forwards = X86Info->getForwardedMustTailRegParms();
for (const auto &F : Forwards) {
SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
return (Index == 0 || Index == ResVT.getVectorNumElements());
}
+bool X86TargetLowering::isCheapToSpeculateCttz() const {
+ // Speculate cttz only if we can directly use TZCNT.
+ return Subtarget->hasBMI();
+}
+
+bool X86TargetLowering::isCheapToSpeculateCtlz() const {
+ // Speculate ctlz only if we can directly use LZCNT.
+ return Subtarget->hasLZCNT();
+}
+
/// isUndefOrInRange - Return true if Val is undef or if its value falls within
/// the specified range (L, H].
static bool isUndefOrInRange(int Val, int Low, int Hi) {
SDValue NewLd = SDValue();
- if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16)
- NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
- LDBase->getPointerInfo(),
- LDBase->isVolatile(), LDBase->isNonTemporal(),
- LDBase->isInvariant(), 0);
NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
- LDBase->getPointerInfo(),
- LDBase->isVolatile(), LDBase->isNonTemporal(),
- LDBase->isInvariant(), LDBase->getAlignment());
+ LDBase->getPointerInfo(), LDBase->isVolatile(),
+ LDBase->isNonTemporal(), LDBase->isInvariant(),
+ LDBase->getAlignment());
if (LDBase->hasAnyUseOfValue(1)) {
SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
return DAG.getNode(IntrData->Opc0, dl, VT, VMask, DataToCompress,
PassThru);
}
+ case BLEND: {
+ SDValue Mask = Op.getOperand(3);
+ EVT VT = Op.getValueType();
+ EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+ VT.getVectorNumElements());
+ EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+ Mask.getValueType().getSizeInBits());
+ SDLoc dl(Op);
+ SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
+ DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
+ DAG.getIntPtrConstant(0));
+ return DAG.getNode(IntrData->Opc0, dl, VT, VMask, Op.getOperand(1),
+ Op.getOperand(2));
+ }
+ case FMA_OP_MASK:
+ {
+ return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
+ dl, Op.getValueType(),
+ Op.getOperand(1),
+ Op.getOperand(2),
+ Op.getOperand(3)),
+ Op.getOperand(4), Op.getOperand(1),
+ Subtarget, DAG);
+ }
default:
break;
}
return SDValue();
}
+static SDValue LowerCTPOP(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SDNode *Node = Op.getNode();
+ SDLoc dl(Node);
+
+ Op = Op.getOperand(0);
+ EVT VT = Op.getValueType();
+ assert((VT.is128BitVector() || VT.is256BitVector()) &&
+ "CTPOP lowering only implemented for 128/256-bit wide vector types");
+
+ unsigned NumElts = VT.getVectorNumElements();
+ EVT EltVT = VT.getVectorElementType();
+ unsigned Len = EltVT.getSizeInBits();
+
+ // This is the vectorized version of the "best" algorithm from
+ // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
+ // with a minor tweak to use a series of adds + shifts instead of vector
+ // multiplications. Implemented for the v2i64, v4i64, v4i32, v8i32 types:
+ //
+ // v2i64, v4i64, v4i32 => Only profitable w/ popcnt disabled
+ // v8i32 => Always profitable
+ //
+ // FIXME: There a couple of possible improvements:
+ //
+ // 1) Support for i8 and i16 vectors (needs measurements if popcnt enabled).
+ // 2) Use strategies from http://wm.ite.pl/articles/sse-popcount.html
+ //
+ assert(EltVT.isInteger() && (Len == 32 || Len == 64) && Len % 8 == 0 &&
+ "CTPOP not implemented for this vector element type.");
+
+ // X86 canonicalize ANDs to vXi64, generate the appropriate bitcasts to avoid
+ // extra legalization.
+ bool NeedsBitcast = EltVT == MVT::i32;
+ MVT BitcastVT = VT.is256BitVector() ? MVT::v4i64 : MVT::v2i64;
+
+ SDValue Cst55 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x55)), EltVT);
+ SDValue Cst33 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x33)), EltVT);
+ SDValue Cst0F = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x0F)), EltVT);
+
+ // v = v - ((v >> 1) & 0x55555555...)
+ SmallVector<SDValue, 8> Ones(NumElts, DAG.getConstant(1, EltVT));
+ SDValue OnesV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ones);
+ SDValue Srl = DAG.getNode(ISD::SRL, dl, VT, Op, OnesV);
+ if (NeedsBitcast)
+ Srl = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Srl);
+
+ SmallVector<SDValue, 8> Mask55(NumElts, Cst55);
+ SDValue M55 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask55);
+ if (NeedsBitcast)
+ M55 = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M55);
+
+ SDValue And = DAG.getNode(ISD::AND, dl, Srl.getValueType(), Srl, M55);
+ if (VT != And.getValueType())
+ And = DAG.getNode(ISD::BITCAST, dl, VT, And);
+ SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, Op, And);
+
+ // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
+ SmallVector<SDValue, 8> Mask33(NumElts, Cst33);
+ SDValue M33 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask33);
+ SmallVector<SDValue, 8> Twos(NumElts, DAG.getConstant(2, EltVT));
+ SDValue TwosV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Twos);
+
+ Srl = DAG.getNode(ISD::SRL, dl, VT, Sub, TwosV);
+ if (NeedsBitcast) {
+ Srl = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Srl);
+ M33 = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M33);
+ Sub = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Sub);
+ }
+
+ SDValue AndRHS = DAG.getNode(ISD::AND, dl, M33.getValueType(), Srl, M33);
+ SDValue AndLHS = DAG.getNode(ISD::AND, dl, M33.getValueType(), Sub, M33);
+ if (VT != AndRHS.getValueType()) {
+ AndRHS = DAG.getNode(ISD::BITCAST, dl, VT, AndRHS);
+ AndLHS = DAG.getNode(ISD::BITCAST, dl, VT, AndLHS);
+ }
+ SDValue Add = DAG.getNode(ISD::ADD, dl, VT, AndLHS, AndRHS);
+
+ // v = (v + (v >> 4)) & 0x0F0F0F0F...
+ SmallVector<SDValue, 8> Fours(NumElts, DAG.getConstant(4, EltVT));
+ SDValue FoursV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Fours);
+ Srl = DAG.getNode(ISD::SRL, dl, VT, Add, FoursV);
+ Add = DAG.getNode(ISD::ADD, dl, VT, Add, Srl);
+
+ SmallVector<SDValue, 8> Mask0F(NumElts, Cst0F);
+ SDValue M0F = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask0F);
+ if (NeedsBitcast) {
+ Add = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Add);
+ M0F = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M0F);
+ }
+ And = DAG.getNode(ISD::AND, dl, M0F.getValueType(), Add, M0F);
+ if (VT != And.getValueType())
+ And = DAG.getNode(ISD::BITCAST, dl, VT, And);
+
+ // The algorithm mentioned above uses:
+ // v = (v * 0x01010101...) >> (Len - 8)
+ //
+ // Change it to use vector adds + vector shifts which yield faster results on
+ // Haswell than using vector integer multiplication.
+ //
+ // For i32 elements:
+ // v = v + (v >> 8)
+ // v = v + (v >> 16)
+ //
+ // For i64 elements:
+ // v = v + (v >> 8)
+ // v = v + (v >> 16)
+ // v = v + (v >> 32)
+ //
+ Add = And;
+ SmallVector<SDValue, 8> Csts;
+ for (unsigned i = 8; i <= Len/2; i *= 2) {
+ Csts.assign(NumElts, DAG.getConstant(i, EltVT));
+ SDValue CstsV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Csts);
+ Srl = DAG.getNode(ISD::SRL, dl, VT, Add, CstsV);
+ Add = DAG.getNode(ISD::ADD, dl, VT, Add, Srl);
+ Csts.clear();
+ }
+
+ // The result is on the least significant 6-bits on i32 and 7-bits on i64.
+ SDValue Cst3F = DAG.getConstant(APInt(Len, Len == 32 ? 0x3F : 0x7F), EltVT);
+ SmallVector<SDValue, 8> Cst3FV(NumElts, Cst3F);
+ SDValue M3F = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Cst3FV);
+ if (NeedsBitcast) {
+ Add = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Add);
+ M3F = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M3F);
+ }
+ And = DAG.getNode(ISD::AND, dl, M3F.getValueType(), Add, M3F);
+ if (VT != And.getValueType())
+ And = DAG.getNode(ISD::BITCAST, dl, VT, And);
+
+ return And;
+}
+
static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) {
SDNode *Node = Op.getNode();
SDLoc dl(Node);
case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
return LowerCMP_SWAP(Op, Subtarget, DAG);
+ case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG);
case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op,DAG);
case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
case X86ISD::XTEST: return "X86ISD::XTEST";
case X86ISD::COMPRESS: return "X86ISD::COMPRESS";
case X86ISD::EXPAND: return "X86ISD::EXPAND";
+ case X86ISD::SELECT: return "X86ISD::SELECT";
}
}