static SDValue Insert128BitVector(SDValue Result, SDValue Vec,
unsigned IdxVal, SelectionDAG &DAG,
DebugLoc dl) {
+ // Inserting UNDEF is Result
+ if (Vec.getOpcode() == ISD::UNDEF)
+ return Result;
+
EVT VT = Vec.getValueType();
assert(VT.getSizeInBits() == 128 && "Unexpected vector size!");
* ElemsPerChunk);
SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32);
- Result = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec,
- VecIdx);
- return Result;
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec,
+ VecIdx);
}
/// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128
if (Subtarget->isTargetEnvMacho()) {
if (is64Bit)
- return new X8664_MachoTargetObjectFile();
+ return new X86_64MachoTargetObjectFile();
return new TargetLoweringObjectFileMachO();
}
+ if (Subtarget->isTargetLinux())
+ return new X86LinuxTargetObjectFile();
if (Subtarget->isTargetELF())
return new TargetLoweringObjectFileELF();
if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho())
setOperationAction(ISD::FSIN, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::FCOS, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::FREM, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::FMA, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::FPOWI, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::FSQRT, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::VSELECT, MVT::v8i32, Legal);
setOperationAction(ISD::VSELECT, MVT::v8f32, Legal);
+ if (Subtarget->hasFMA()) {
+ setOperationAction(ISD::FMA, MVT::v8f32, Custom);
+ setOperationAction(ISD::FMA, MVT::v4f64, Custom);
+ setOperationAction(ISD::FMA, MVT::v4f32, Custom);
+ setOperationAction(ISD::FMA, MVT::v2f64, Custom);
+ setOperationAction(ISD::FMA, MVT::f32, Custom);
+ setOperationAction(ISD::FMA, MVT::f64, Custom);
+ }
if (Subtarget->hasAVX2()) {
setOperationAction(ISD::ADD, MVT::v4i64, Legal);
setOperationAction(ISD::ADD, MVT::v8i32, Legal);
// We want to custom lower some of our intrinsics.
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
// Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
setTargetDAGCombine(ISD::ADD);
setTargetDAGCombine(ISD::FADD);
setTargetDAGCombine(ISD::FSUB);
+ setTargetDAGCombine(ISD::FMA);
setTargetDAGCombine(ISD::SUB);
setTargetDAGCombine(ISD::LOAD);
setTargetDAGCombine(ISD::STORE);
SDValue Val;
// If this is a call to a function that returns an fp value on the floating
- // point stack, we must guarantee the the value is popped from the stack, so
+ // point stack, we must guarantee the value is popped from the stack, so
// a CopyFromReg is not good enough - the copy instruction may be eliminated
// if the return value is not used. We use the FpPOP_RETVAL instruction
// instead.
/// CallIsStructReturn - Determines whether a call uses struct return
/// semantics.
-static bool CallIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
+enum StructReturnType {
+ NotStructReturn,
+ RegStructReturn,
+ StackStructReturn
+};
+static StructReturnType
+callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
if (Outs.empty())
- return false;
+ return NotStructReturn;
- return Outs[0].Flags.isSRet();
+ const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
+ if (!Flags.isSRet())
+ return NotStructReturn;
+ if (Flags.isInReg())
+ return RegStructReturn;
+ return StackStructReturn;
}
/// ArgsAreStructReturn - Determines whether a function uses struct
/// return semantics.
-static bool
-ArgsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
+static StructReturnType
+argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
if (Ins.empty())
- return false;
+ return NotStructReturn;
- return Ins[0].Flags.isSRet();
+ const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
+ if (!Flags.isSRet())
+ return NotStructReturn;
+ if (Flags.isInReg())
+ return RegStructReturn;
+ return StackStructReturn;
}
/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified
FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
// If this is an sret function, the return should pop the hidden pointer.
if (!Is64Bit && !IsTailCallConvention(CallConv) && !IsWindows &&
- ArgsAreStructReturn(Ins))
+ argsAreStructReturn(Ins) == StackStructReturn)
FuncInfo->setBytesToPopOnReturn(4);
}
bool Is64Bit = Subtarget->is64Bit();
bool IsWin64 = Subtarget->isTargetWin64();
bool IsWindows = Subtarget->isTargetWindows();
- bool IsStructRet = CallIsStructReturn(Outs);
+ StructReturnType SR = callIsStructReturn(Outs);
bool IsSibcall = false;
if (MF.getTarget().Options.DisableTailCalls)
if (isTailCall) {
// Check if it's really possible to do a tail call.
isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
- isVarArg, IsStructRet, MF.getFunction()->hasStructRetAttr(),
- Outs, OutVals, Ins, DAG);
+ isVarArg, SR != NotStructReturn,
+ MF.getFunction()->hasStructRetAttr(),
+ Outs, OutVals, Ins, DAG);
// Sibcalls are automatically detected tailcalls which do not require
// ABI changes.
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
&MemOpChains[0], MemOpChains.size());
- // Build a sequence of copy-to-reg nodes chained together with token chain
- // and flag operands which copy the outgoing args into registers.
- SDValue InFlag;
- // Tail call byval lowering might overwrite argument registers so in case of
- // tail call optimization the copies to registers are lowered later.
- if (!isTailCall)
- for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
- Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
- RegsToPass[i].second, InFlag);
- InFlag = Chain.getValue(1);
- }
-
if (Subtarget->isPICStyleGOT()) {
// ELF / PIC requires GOT in the EBX register before function calls via PLT
// GOT pointer.
if (!isTailCall) {
- Chain = DAG.getCopyToReg(Chain, dl, X86::EBX,
- DAG.getNode(X86ISD::GlobalBaseReg,
- DebugLoc(), getPointerTy()),
- InFlag);
- InFlag = Chain.getValue(1);
+ RegsToPass.push_back(std::make_pair(unsigned(X86::EBX),
+ DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy())));
} else {
// If we are tail calling and generating PIC/GOT style code load the
// address of the callee into ECX. The value in ecx is used as target of
assert((Subtarget->hasSSE1() || !NumXMMRegs)
&& "SSE registers cannot be used when SSE is disabled");
- Chain = DAG.getCopyToReg(Chain, dl, X86::AL,
- DAG.getConstant(NumXMMRegs, MVT::i8), InFlag);
- InFlag = Chain.getValue(1);
+ RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
+ DAG.getConstant(NumXMMRegs, MVT::i8)));
}
-
// For tail calls lower the arguments to the 'real' stack slot.
if (isTailCall) {
// Force all the incoming stack arguments to be loaded from the stack
SmallVector<SDValue, 8> MemOpChains2;
SDValue FIN;
int FI = 0;
- // Do not flag preceding copytoreg stuff together with the following stuff.
- InFlag = SDValue();
if (getTargetMachine().Options.GuaranteedTailCallOpt) {
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
&MemOpChains2[0], MemOpChains2.size());
- // Copy arguments to their registers.
- for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
- Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
- RegsToPass[i].second, InFlag);
- InFlag = Chain.getValue(1);
- }
- InFlag =SDValue();
-
// Store the return address to the appropriate stack slot.
Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, Is64Bit,
FPDiff, dl);
}
+ // Build a sequence of copy-to-reg nodes chained together with token chain
+ // and flag operands which copy the outgoing args into registers.
+ SDValue InFlag;
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+ Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+ RegsToPass[i].second, InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
if (getTargetMachine().getCodeModel() == CodeModel::Large) {
assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
// In the 64-bit large code model, we have to make all calls
Ops.push_back(DAG.getRegister(RegsToPass[i].first,
RegsToPass[i].second.getValueType()));
- // Add an implicit use GOT pointer in EBX.
- if (!isTailCall && Subtarget->isPICStyleGOT())
- Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy()));
-
- // Add an implicit use of AL for non-Windows x86 64-bit vararg functions.
- if (Is64Bit && isVarArg && !IsWin64)
- Ops.push_back(DAG.getRegister(X86::AL, MVT::i8));
-
// Add a register mask operand representing the call-preserved registers.
const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
getTargetMachine().Options.GuaranteedTailCallOpt))
NumBytesForCalleeToPush = NumBytes; // Callee pops everything
else if (!Is64Bit && !IsTailCallConvention(CallConv) && !IsWindows &&
- IsStructRet)
+ SR == StackStructReturn)
// If this is a call to a struct-return function, the callee
// pops the hidden struct pointer, so we have to push it back.
// This is common for Darwin/X86, Linux & Mingw32 targets.
return true;
}
+//
+// Some special combinations that can be optimized.
+//
+static
+SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp,
+ SelectionDAG &DAG) {
+ EVT VT = SVOp->getValueType(0);
+ DebugLoc dl = SVOp->getDebugLoc();
+
+ if (VT != MVT::v8i32 && VT != MVT::v8f32)
+ return SDValue();
+
+ ArrayRef<int> Mask = SVOp->getMask();
+
+ // These are the special masks that may be optimized.
+ static const int MaskToOptimizeEven[] = {0, 8, 2, 10, 4, 12, 6, 14};
+ static const int MaskToOptimizeOdd[] = {1, 9, 3, 11, 5, 13, 7, 15};
+ bool MatchEvenMask = true;
+ bool MatchOddMask = true;
+ for (int i=0; i<8; ++i) {
+ if (!isUndefOrEqual(Mask[i], MaskToOptimizeEven[i]))
+ MatchEvenMask = false;
+ if (!isUndefOrEqual(Mask[i], MaskToOptimizeOdd[i]))
+ MatchOddMask = false;
+ }
+ static const int CompactionMaskEven[] = {0, 2, -1, -1, 4, 6, -1, -1};
+ static const int CompactionMaskOdd [] = {1, 3, -1, -1, 5, 7, -1, -1};
+
+ const int *CompactionMask;
+ if (MatchEvenMask)
+ CompactionMask = CompactionMaskEven;
+ else if (MatchOddMask)
+ CompactionMask = CompactionMaskOdd;
+ else
+ return SDValue();
+
+ SDValue UndefNode = DAG.getNode(ISD::UNDEF, dl, VT);
+
+ SDValue Op0 = DAG.getVectorShuffle(VT, dl, SVOp->getOperand(0),
+ UndefNode, CompactionMask);
+ SDValue Op1 = DAG.getVectorShuffle(VT, dl, SVOp->getOperand(1),
+ UndefNode, CompactionMask);
+ static const int UnpackMask[] = {0, 8, 1, 9, 4, 12, 5, 13};
+ return DAG.getVectorShuffle(VT, dl, Op0, Op1, UnpackMask);
+}
+
/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to UNPCKL.
static bool isUNPCKLMask(ArrayRef<int> Mask, EVT VT,
SDValue Sc = Op.getOperand(0);
if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR &&
- Sc.getOpcode() != ISD::BUILD_VECTOR)
- return SDValue();
+ Sc.getOpcode() != ISD::BUILD_VECTOR) {
+
+ if (!Subtarget->hasAVX2())
+ return SDValue();
+
+ // Use the register form of the broadcast instruction available on AVX2.
+ if (VT.is256BitVector())
+ Sc = Extract128BitVector(Sc, 0, DAG, dl);
+ return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc);
+ }
Ld = Sc.getOperand(0);
ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
/// which could not be matched by any known target speficic shuffle
static SDValue
LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
+
+ SDValue NewOp = Compact8x32ShuffleNode(SVOp, DAG);
+ if (NewOp.getNode())
+ return NewOp;
+
EVT VT = SVOp->getValueType(0);
unsigned NumElems = VT.getVectorNumElements();
Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), PtrVT),
Offset);
- } else {
- Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
- MachinePointerInfo::getGOT(), false, false, false,
- 0);
}
+
+ Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
+ MachinePointerInfo::getGOT(), false, false, false,
+ 0);
}
// The address of the thread local variable is the add of the thread
const GlobalValue *GV = GA->getGlobal();
if (Subtarget->isTargetELF()) {
- // If GV is an alias then use the aliasee for determining
- // thread-localness.
- if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
- GV = GA->resolveAliasedGlobal(false);
-
TLSModel::Model model = getTargetMachine().getTLSModel(GV);
switch (model) {
assert(ShAmt.getValueType() == MVT::i32 && "ShAmt is not i32");
if (isa<ConstantSDNode>(ShAmt)) {
+ // Constant may be a TargetConstant. Use a regular constant.
+ uint32_t ShiftAmt = cast<ConstantSDNode>(ShAmt)->getZExtValue();
switch (Opc) {
default: llvm_unreachable("Unknown target vector shift node");
case X86ISD::VSHLI:
case X86ISD::VSRLI:
case X86ISD::VSRAI:
- return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
+ return DAG.getNode(Opc, dl, VT, SrcOp,
+ DAG.getConstant(ShiftAmt, MVT::i32));
}
}
ShOps[2] = DAG.getUNDEF(MVT::i32);
ShOps[3] = DAG.getUNDEF(MVT::i32);
ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, &ShOps[0], 4);
- ShAmt = DAG.getNode(ISD::BITCAST, dl, VT, ShAmt);
+
+ // The return type has to be a 128-bit type with the same element
+ // type as the input type.
+ MVT EltVT = VT.getVectorElementType().getSimpleVT();
+ EVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
+
+ ShAmt = DAG.getNode(ISD::BITCAST, dl, ShVT, ShAmt);
return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
}
DAG.getConstant(X86CC, MVT::i8), Cond);
return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
}
- // XOP comparison intrinsics
- case Intrinsic::x86_xop_vpcomltb:
- case Intrinsic::x86_xop_vpcomltw:
- case Intrinsic::x86_xop_vpcomltd:
- case Intrinsic::x86_xop_vpcomltq:
- case Intrinsic::x86_xop_vpcomltub:
- case Intrinsic::x86_xop_vpcomltuw:
- case Intrinsic::x86_xop_vpcomltud:
- case Intrinsic::x86_xop_vpcomltuq:
- case Intrinsic::x86_xop_vpcomleb:
- case Intrinsic::x86_xop_vpcomlew:
- case Intrinsic::x86_xop_vpcomled:
- case Intrinsic::x86_xop_vpcomleq:
- case Intrinsic::x86_xop_vpcomleub:
- case Intrinsic::x86_xop_vpcomleuw:
- case Intrinsic::x86_xop_vpcomleud:
- case Intrinsic::x86_xop_vpcomleuq:
- case Intrinsic::x86_xop_vpcomgtb:
- case Intrinsic::x86_xop_vpcomgtw:
- case Intrinsic::x86_xop_vpcomgtd:
- case Intrinsic::x86_xop_vpcomgtq:
- case Intrinsic::x86_xop_vpcomgtub:
- case Intrinsic::x86_xop_vpcomgtuw:
- case Intrinsic::x86_xop_vpcomgtud:
- case Intrinsic::x86_xop_vpcomgtuq:
- case Intrinsic::x86_xop_vpcomgeb:
- case Intrinsic::x86_xop_vpcomgew:
- case Intrinsic::x86_xop_vpcomged:
- case Intrinsic::x86_xop_vpcomgeq:
- case Intrinsic::x86_xop_vpcomgeub:
- case Intrinsic::x86_xop_vpcomgeuw:
- case Intrinsic::x86_xop_vpcomgeud:
- case Intrinsic::x86_xop_vpcomgeuq:
- case Intrinsic::x86_xop_vpcomeqb:
- case Intrinsic::x86_xop_vpcomeqw:
- case Intrinsic::x86_xop_vpcomeqd:
- case Intrinsic::x86_xop_vpcomeqq:
- case Intrinsic::x86_xop_vpcomequb:
- case Intrinsic::x86_xop_vpcomequw:
- case Intrinsic::x86_xop_vpcomequd:
- case Intrinsic::x86_xop_vpcomequq:
- case Intrinsic::x86_xop_vpcomneb:
- case Intrinsic::x86_xop_vpcomnew:
- case Intrinsic::x86_xop_vpcomned:
- case Intrinsic::x86_xop_vpcomneq:
- case Intrinsic::x86_xop_vpcomneub:
- case Intrinsic::x86_xop_vpcomneuw:
- case Intrinsic::x86_xop_vpcomneud:
- case Intrinsic::x86_xop_vpcomneuq:
- case Intrinsic::x86_xop_vpcomfalseb:
- case Intrinsic::x86_xop_vpcomfalsew:
- case Intrinsic::x86_xop_vpcomfalsed:
- case Intrinsic::x86_xop_vpcomfalseq:
- case Intrinsic::x86_xop_vpcomfalseub:
- case Intrinsic::x86_xop_vpcomfalseuw:
- case Intrinsic::x86_xop_vpcomfalseud:
- case Intrinsic::x86_xop_vpcomfalseuq:
- case Intrinsic::x86_xop_vpcomtrueb:
- case Intrinsic::x86_xop_vpcomtruew:
- case Intrinsic::x86_xop_vpcomtrued:
- case Intrinsic::x86_xop_vpcomtrueq:
- case Intrinsic::x86_xop_vpcomtrueub:
- case Intrinsic::x86_xop_vpcomtrueuw:
- case Intrinsic::x86_xop_vpcomtrueud:
- case Intrinsic::x86_xop_vpcomtrueuq: {
- unsigned CC = 0;
- unsigned Opc = 0;
-
- switch (IntNo) {
- default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
- case Intrinsic::x86_xop_vpcomltb:
- case Intrinsic::x86_xop_vpcomltw:
- case Intrinsic::x86_xop_vpcomltd:
- case Intrinsic::x86_xop_vpcomltq:
- CC = 0;
- Opc = X86ISD::VPCOM;
- break;
- case Intrinsic::x86_xop_vpcomltub:
- case Intrinsic::x86_xop_vpcomltuw:
- case Intrinsic::x86_xop_vpcomltud:
- case Intrinsic::x86_xop_vpcomltuq:
- CC = 0;
- Opc = X86ISD::VPCOMU;
- break;
- case Intrinsic::x86_xop_vpcomleb:
- case Intrinsic::x86_xop_vpcomlew:
- case Intrinsic::x86_xop_vpcomled:
- case Intrinsic::x86_xop_vpcomleq:
- CC = 1;
- Opc = X86ISD::VPCOM;
- break;
- case Intrinsic::x86_xop_vpcomleub:
- case Intrinsic::x86_xop_vpcomleuw:
- case Intrinsic::x86_xop_vpcomleud:
- case Intrinsic::x86_xop_vpcomleuq:
- CC = 1;
- Opc = X86ISD::VPCOMU;
- break;
- case Intrinsic::x86_xop_vpcomgtb:
- case Intrinsic::x86_xop_vpcomgtw:
- case Intrinsic::x86_xop_vpcomgtd:
- case Intrinsic::x86_xop_vpcomgtq:
- CC = 2;
- Opc = X86ISD::VPCOM;
- break;
- case Intrinsic::x86_xop_vpcomgtub:
- case Intrinsic::x86_xop_vpcomgtuw:
- case Intrinsic::x86_xop_vpcomgtud:
- case Intrinsic::x86_xop_vpcomgtuq:
- CC = 2;
- Opc = X86ISD::VPCOMU;
- break;
- case Intrinsic::x86_xop_vpcomgeb:
- case Intrinsic::x86_xop_vpcomgew:
- case Intrinsic::x86_xop_vpcomged:
- case Intrinsic::x86_xop_vpcomgeq:
- CC = 3;
- Opc = X86ISD::VPCOM;
- break;
- case Intrinsic::x86_xop_vpcomgeub:
- case Intrinsic::x86_xop_vpcomgeuw:
- case Intrinsic::x86_xop_vpcomgeud:
- case Intrinsic::x86_xop_vpcomgeuq:
- CC = 3;
- Opc = X86ISD::VPCOMU;
- break;
- case Intrinsic::x86_xop_vpcomeqb:
- case Intrinsic::x86_xop_vpcomeqw:
- case Intrinsic::x86_xop_vpcomeqd:
- case Intrinsic::x86_xop_vpcomeqq:
- CC = 4;
- Opc = X86ISD::VPCOM;
- break;
- case Intrinsic::x86_xop_vpcomequb:
- case Intrinsic::x86_xop_vpcomequw:
- case Intrinsic::x86_xop_vpcomequd:
- case Intrinsic::x86_xop_vpcomequq:
- CC = 4;
- Opc = X86ISD::VPCOMU;
- break;
- case Intrinsic::x86_xop_vpcomneb:
- case Intrinsic::x86_xop_vpcomnew:
- case Intrinsic::x86_xop_vpcomned:
- case Intrinsic::x86_xop_vpcomneq:
- CC = 5;
- Opc = X86ISD::VPCOM;
- break;
- case Intrinsic::x86_xop_vpcomneub:
- case Intrinsic::x86_xop_vpcomneuw:
- case Intrinsic::x86_xop_vpcomneud:
- case Intrinsic::x86_xop_vpcomneuq:
- CC = 5;
- Opc = X86ISD::VPCOMU;
- break;
- case Intrinsic::x86_xop_vpcomfalseb:
- case Intrinsic::x86_xop_vpcomfalsew:
- case Intrinsic::x86_xop_vpcomfalsed:
- case Intrinsic::x86_xop_vpcomfalseq:
- CC = 6;
- Opc = X86ISD::VPCOM;
- break;
- case Intrinsic::x86_xop_vpcomfalseub:
- case Intrinsic::x86_xop_vpcomfalseuw:
- case Intrinsic::x86_xop_vpcomfalseud:
- case Intrinsic::x86_xop_vpcomfalseuq:
- CC = 6;
- Opc = X86ISD::VPCOMU;
- break;
- case Intrinsic::x86_xop_vpcomtrueb:
- case Intrinsic::x86_xop_vpcomtruew:
- case Intrinsic::x86_xop_vpcomtrued:
- case Intrinsic::x86_xop_vpcomtrueq:
- CC = 7;
- Opc = X86ISD::VPCOM;
- break;
- case Intrinsic::x86_xop_vpcomtrueub:
- case Intrinsic::x86_xop_vpcomtrueuw:
- case Intrinsic::x86_xop_vpcomtrueud:
- case Intrinsic::x86_xop_vpcomtrueuq:
- CC = 7;
- Opc = X86ISD::VPCOMU;
- break;
- }
-
- SDValue LHS = Op.getOperand(1);
- SDValue RHS = Op.getOperand(2);
- return DAG.getNode(Opc, dl, Op.getValueType(), LHS, RHS,
- DAG.getConstant(CC, MVT::i8));
- }
-
// Arithmetic intrinsics.
case Intrinsic::x86_sse2_pmulu_dq:
case Intrinsic::x86_avx2_pmulu_dq:
}
}
+SDValue
+X86TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const {
+ DebugLoc dl = Op.getDebugLoc();
+ unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+ switch (IntNo) {
+ default: return SDValue(); // Don't custom lower most intrinsics.
+
+ // RDRAND intrinsics.
+ case Intrinsic::x86_rdrand_16:
+ case Intrinsic::x86_rdrand_32:
+ case Intrinsic::x86_rdrand_64: {
+ // Emit the node with the right value type.
+ SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other);
+ SDValue Result = DAG.getNode(X86ISD::RDRAND, dl, VTs, Op.getOperand(0));
+
+ // If the value returned by RDRAND was valid (CF=1), return 1. Otherwise
+ // return the value from Rand, which is always 0, casted to i32.
+ SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
+ DAG.getConstant(1, Op->getValueType(1)),
+ DAG.getConstant(X86::COND_B, MVT::i32),
+ SDValue(Result.getNode(), 1) };
+ SDValue isValid = DAG.getNode(X86ISD::CMOV, dl,
+ DAG.getVTList(Op->getValueType(1), MVT::Glue),
+ Ops, 4);
+
+ // Return { result, isValid, chain }.
+ return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
+ SDValue(Result.getNode(), 2));
+ }
+ }
+}
+
SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
SelectionDAG &DAG) const {
MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
}
SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
- MachineFunction &MF = DAG.getMachineFunction();
SDValue Chain = Op.getOperand(0);
SDValue Offset = Op.getOperand(1);
SDValue Handler = Op.getOperand(2);
Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(),
false, false, 0);
Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
- MF.getRegInfo().addLiveOut(StoreAddrReg);
return DAG.getNode(X86ISD::EH_RETURN, dl,
MVT::Other,
case ISD::VAARG: return LowerVAARG(Op, DAG);
case ISD::VACOPY: return LowerVACOPY(Op, DAG);
case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+ case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
case ISD::FRAME_TO_ARGS_OFFSET:
case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA";
case X86ISD::WIN_FTOL: return "X86ISD::WIN_FTOL";
case X86ISD::SAHF: return "X86ISD::SAHF";
+ case X86ISD::RDRAND: return "X86ISD::RDRAND";
+ case X86ISD::FMADD: return "X86ISD::FMADD";
+ case X86ISD::FMSUB: return "X86ISD::FMSUB";
+ case X86ISD::FNMADD: return "X86ISD::FNMADD";
+ case X86ISD::FNMSUB: return "X86ISD::FNMSUB";
+ case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB";
+ case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD";
}
}
return true;
}
+bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
+ return Imm == (int32_t)Imm;
+}
+
+bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
+ // Can also use sub to handle negated immediates.
+ return Imm == (int32_t)Imm;
+}
+
bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
if (!VT1.isInteger() || !VT2.isInteger())
return false;
BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
.addReg(sizeVReg);
BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
- .addExternalSymbol("__morestack_allocate_stack_space").addReg(X86::RDI)
+ .addExternalSymbol("__morestack_allocate_stack_space")
.addRegMask(RegMask)
+ .addReg(X86::RDI, RegState::Implicit)
.addReg(X86::RAX, RegState::ImplicitDefine);
} else {
BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
if (DCI.isBeforeLegalizeOps())
return SDValue();
- SDValue RV = performIntegerAbsCombine(N, DAG);
- if (RV.getNode())
- return RV;
+ if (Subtarget->hasCMov()) {
+ SDValue RV = performIntegerAbsCombine(N, DAG);
+ if (RV.getNode())
+ return RV;
+ }
// Try forming BMI if it is available.
if (!Subtarget->hasBMI())
/// PerformLOADCombine - Do target-specific dag combines on LOAD nodes.
static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget *Subtarget) {
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget *Subtarget) {
LoadSDNode *Ld = cast<LoadSDNode>(N);
EVT RegVT = Ld->getValueType(0);
EVT MemVT = Ld->getMemoryVT();
unsigned RegSz = RegVT.getSizeInBits();
unsigned MemSz = MemVT.getSizeInBits();
assert(RegSz > MemSz && "Register size must be greater than the mem size");
- // All sizes must be a power of two
- if (!isPowerOf2_32(RegSz * MemSz * NumElems)) return SDValue();
- // Attempt to load the original value using a single load op.
- // Find a scalar type which is equal to the loaded word size.
+ // All sizes must be a power of two.
+ if (!isPowerOf2_32(RegSz * MemSz * NumElems))
+ return SDValue();
+
+ // Attempt to load the original value using scalar loads.
+ // Find the largest scalar type that divides the total loaded size.
MVT SclrLoadTy = MVT::i8;
for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
MVT Tp = (MVT::SimpleValueType)tp;
- if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() == MemSz) {
+ if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
SclrLoadTy = Tp;
- break;
}
}
- // Proceed if a load word is found.
- if (SclrLoadTy.getSizeInBits() != MemSz) return SDValue();
+ // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
+ if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
+ (64 <= MemSz))
+ SclrLoadTy = MVT::f64;
+ // Calculate the number of scalar loads that we need to perform
+ // in order to load our vector from memory.
+ unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
+
+ // Represent our vector as a sequence of elements which are the
+ // largest scalar that we can load.
EVT LoadUnitVecVT = EVT::getVectorVT(*DAG.getContext(), SclrLoadTy,
RegSz/SclrLoadTy.getSizeInBits());
+ // Represent the data using the same element type that is stored in
+ // memory. In practice, we ''widen'' MemVT.
EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
RegSz/MemVT.getScalarType().getSizeInBits());
- // Can't shuffle using an illegal type.
- if (!TLI.isTypeLegal(WideVecVT)) return SDValue();
- // Perform a single load.
- SDValue ScalarLoad = DAG.getLoad(SclrLoadTy, dl, Ld->getChain(),
- Ld->getBasePtr(),
- Ld->getPointerInfo(), Ld->isVolatile(),
- Ld->isNonTemporal(), Ld->isInvariant(),
- Ld->getAlignment());
+ assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
+ "Invalid vector type");
+
+ // We can't shuffle using an illegal type.
+ if (!TLI.isTypeLegal(WideVecVT))
+ return SDValue();
- // Insert the word loaded into a vector.
- SDValue ScalarInVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
- LoadUnitVecVT, ScalarLoad);
+ SmallVector<SDValue, 8> Chains;
+ SDValue Ptr = Ld->getBasePtr();
+ SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits()/8,
+ TLI.getPointerTy());
+ SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
+
+ for (unsigned i = 0; i < NumLoads; ++i) {
+ // Perform a single load.
+ SDValue ScalarLoad = DAG.getLoad(SclrLoadTy, dl, Ld->getChain(),
+ Ptr, Ld->getPointerInfo(),
+ Ld->isVolatile(), Ld->isNonTemporal(),
+ Ld->isInvariant(), Ld->getAlignment());
+ Chains.push_back(ScalarLoad.getValue(1));
+ // Create the first element type using SCALAR_TO_VECTOR in order to avoid
+ // another round of DAGCombining.
+ if (i == 0)
+ Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
+ else
+ Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
+ ScalarLoad, DAG.getIntPtrConstant(i));
+
+ Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
+ }
+
+ SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0],
+ Chains.size());
// Bitcast the loaded value to a vector of the original element type, in
// the size of the target vector type.
- SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT,
- ScalarInVector);
+ SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Res);
unsigned SizeRatio = RegSz/MemSz;
// Redistribute the loaded elements into the different locations.
Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
// Replace the original load with the new sequence
// and return the new chain.
- DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Shuff);
- return SDValue(ScalarLoad.getNode(), 1);
+ return DCI.CombineTo(N, Shuff, TF, true);
}
return SDValue();
for (unsigned i = 0; i != NumElems; ++i)
ShuffleVec[i] = i * SizeRatio;
- // Can't shuffle using an illegal type
- if (!TLI.isTypeLegal(WideVecVT)) return SDValue();
+ // Can't shuffle using an illegal type.
+ if (!TLI.isTypeLegal(WideVecVT))
+ return SDValue();
SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
DAG.getUNDEF(WideVecVT),
for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
MVT Tp = (MVT::SimpleValueType)tp;
- if (TLI.isTypeLegal(Tp) && StoreType.getSizeInBits() < NumElems * ToSz)
+ if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
StoreType = Tp;
}
+ // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
+ if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
+ (64 <= NumElems * ToSz))
+ StoreType = MVT::f64;
+
// Bitcast the original vector into a vector of store-size units
EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
- StoreType, VT.getSizeInBits()/EVT(StoreType).getSizeInBits());
+ StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
SDValue ShuffWide = DAG.getNode(ISD::BITCAST, dl, StoreVecVT, Shuff);
SmallVector<SDValue, 8> Chains;
return SDValue();
}
+static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget* Subtarget) {
+ DebugLoc dl = N->getDebugLoc();
+ EVT VT = N->getValueType(0);
+
+ EVT ScalarVT = VT.getScalarType();
+ if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget->hasFMA())
+ return SDValue();
+
+ SDValue A = N->getOperand(0);
+ SDValue B = N->getOperand(1);
+ SDValue C = N->getOperand(2);
+
+ bool NegA = (A.getOpcode() == ISD::FNEG);
+ bool NegB = (B.getOpcode() == ISD::FNEG);
+ bool NegC = (C.getOpcode() == ISD::FNEG);
+
+ // Negative multiplication when NegA xor NegB
+ bool NegMul = (NegA != NegB);
+ if (NegA)
+ A = A.getOperand(0);
+ if (NegB)
+ B = B.getOperand(0);
+ if (NegC)
+ C = C.getOperand(0);
+
+ unsigned Opcode;
+ if (!NegMul)
+ Opcode = (!NegC)? X86ISD::FMADD : X86ISD::FMSUB;
+ else
+ Opcode = (!NegC)? X86ISD::FNMADD : X86ISD::FNMSUB;
+ return DAG.getNode(Opcode, dl, VT, A, B, C);
+}
+
static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget *Subtarget) {
case ISD::AND: return PerformAndCombine(N, DAG, DCI, Subtarget);
case ISD::OR: return PerformOrCombine(N, DAG, DCI, Subtarget);
case ISD::XOR: return PerformXorCombine(N, DAG, DCI, Subtarget);
- case ISD::LOAD: return PerformLOADCombine(N, DAG, Subtarget);
+ case ISD::LOAD: return PerformLOADCombine(N, DAG, DCI, Subtarget);
case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget);
case ISD::UINT_TO_FP: return PerformUINT_TO_FPCombine(N, DAG);
case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, this);
case X86ISD::VPERMILP:
case X86ISD::VPERM2X128:
case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget);
+ case ISD::FMA: return PerformFMACombine(N, DAG, Subtarget);
}
return SDValue();
// wrong class. This can happen with constraints like {xmm0} where the
// target independent register mapper will just pick the first match it can
// find, ignoring the required type.
- if (VT == MVT::f32)
+
+ if (VT == MVT::f32 || VT == MVT::i32)
Res.second = &X86::FR32RegClass;
- else if (VT == MVT::f64)
+ else if (VT == MVT::f64 || VT == MVT::i64)
Res.second = &X86::FR64RegClass;
else if (X86::VR128RegClass.hasType(VT))
Res.second = &X86::VR128RegClass;
+ else if (X86::VR256RegClass.hasType(VT))
+ Res.second = &X86::VR256RegClass;
}
return Res;