X-Git-Url: http://plrg.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTarget%2FX86%2FX86ISelLowering.cpp;h=834ef6c4233d6856456ca03e35cf4020a9c2f2cb;hb=3ea97550e361bc6ae23e9415abc7b0a34c540f53;hp=228244f06bfd9f5d5bace6ead7d67168da3d15d9;hpb=942619695f4bd77934c09a1cae0fb39ae59edac3;p=oota-llvm.git

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 228244f06bf..834ef6c4233 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -45,10 +45,12 @@
 #include "llvm/ADT/VectorExtras.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/Dwarf.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
 using namespace llvm;
+using namespace dwarf;
 
 STATISTIC(NumTailCalls, "Number of tail calls");
 
@@ -73,15 +75,16 @@ static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
   case X86Subtarget::isDarwin:
     if (TM.getSubtarget<X86Subtarget>().is64Bit())
       return new X8664_MachoTargetObjectFile();
-    return new X8632_MachoTargetObjectFile();
+    return new TargetLoweringObjectFileMachO();
   case X86Subtarget::isELF:
-    return new TargetLoweringObjectFileELF();
+    if (TM.getSubtarget<X86Subtarget>().is64Bit())
+      return new X8664_ELFTargetObjectFile(TM);
+    return new X8632_ELFTargetObjectFile(TM);
   case X86Subtarget::isMingw:
   case X86Subtarget::isCygwin:
   case X86Subtarget::isWindows:
     return new TargetLoweringObjectFileCOFF();
   }
-
 }
 
 X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
@@ -791,6 +794,9 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
   }
 
   // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
+  // FIXME: This produces lots of inefficiencies in isel since we then need
+  // to notice that most of our operands have been implicitly converted to
+  // v2i64.
   for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; i++) {
     MVT::SimpleValueType SVT = (MVT::SimpleValueType)i;
     EVT VT = SVT;
@@ -799,6 +805,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
     if (!VT.is128BitVector()) {
       continue;
     }
+
     setOperationAction(ISD::AND, SVT, Promote);
     AddPromotedToType (ISD::AND, SVT, MVT::v2i64);
     setOperationAction(ISD::OR,  SVT, Promote);
@@ -987,6 +994,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
 
   // We have target-specific dag combine patterns for the following nodes:
   setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
+  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
   setTargetDAGCombine(ISD::BUILD_VECTOR);
   setTargetDAGCombine(ISD::SELECT);
   setTargetDAGCombine(ISD::SHL);
@@ -1001,23 +1009,10 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
 
   computeRegisterProperties();
 
-  // Divide and reminder operations have no vector equivalent and can
-  // trap. Do a custom widening for these operations in which we never
-  // generate more divides/remainder than the original vector width.
-  for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
-       VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
-    if (!isTypeLegal((MVT::SimpleValueType)VT)) {
-      setOperationAction(ISD::SDIV, (MVT::SimpleValueType) VT, Custom);
-      setOperationAction(ISD::UDIV, (MVT::SimpleValueType) VT, Custom);
-      setOperationAction(ISD::SREM, (MVT::SimpleValueType) VT, Custom);
-      setOperationAction(ISD::UREM, (MVT::SimpleValueType) VT, Custom);
-    }
-  }
-
   // FIXME: These should be based on subtarget info. Plus, the values should
   // be smaller when we are optimizing for size.
   maxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
-  maxStoresPerMemcpy = 16; // For @llvm.memcpy -> sequence of stores
+  maxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
   maxStoresPerMemmove = 3; // For @llvm.memmove -> sequence of stores
   setPrefLoopAlignment(16);
   benefitFromCodePlacementOpt = true;
@@ -1075,23 +1070,37 @@ unsigned X86TargetLowering::getByValTypeAlignment(const Type *Ty) const {
 }
 
 /// getOptimalMemOpType - Returns the target specific optimal type for load
-/// and store operations as a result of memset, memcpy, and memmove
-/// lowering. It returns MVT::iAny if SelectionDAG should be responsible for
-/// determining it.
+/// and store operations as a result of memset, memcpy, and memmove lowering.
+/// If DstAlign is zero, the destination alignment can satisfy any constraint.
+/// Similarly, if SrcAlign is zero, there is no need to check it against the
+/// alignment requirement, probably because the source does not need to be
+/// loaded. It returns EVT::Other if SelectionDAG should determine the type.
 EVT
-X86TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned Align,
-                                       bool isSrcConst, bool isSrcStr,
+X86TargetLowering::getOptimalMemOpType(uint64_t Size,
+                                       unsigned DstAlign, unsigned SrcAlign,
+                                       bool SafeToUseFP,
                                        SelectionDAG &DAG) const {
   // FIXME: This turns off use of xmm stores for memset/memcpy on targets like
   // linux. This is because the stack realignment code can't handle certain
   // cases like PR2962. This should be removed when PR2962 is fixed.
   const Function *F = DAG.getMachineFunction().getFunction();
-  bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat);
-  if (!NoImplicitFloatOps && Subtarget->getStackAlignment() >= 16) {
-    if ((isSrcConst || isSrcStr) && Subtarget->hasSSE2() && Size >= 16)
-      return MVT::v4i32;
-    if ((isSrcConst || isSrcStr) && Subtarget->hasSSE1() && Size >= 16)
-      return MVT::v4f32;
+  if (!F->hasFnAttr(Attribute::NoImplicitFloat)) {
+    if (Size >= 16 &&
+        (Subtarget->isUnalignedMemAccessFast() ||
+         ((DstAlign == 0 || DstAlign >= 16) &&
+          (SrcAlign == 0 || SrcAlign >= 16))) &&
+        Subtarget->getStackAlignment() >= 16) {
+      if (Subtarget->hasSSE2())
+        return MVT::v4i32;
+      if (SafeToUseFP && Subtarget->hasSSE1())
+        return MVT::v4f32;
+    } else if (SafeToUseFP &&
+               Size >= 8 &&
+               !Subtarget->is64Bit() &&
+               Subtarget->getStackAlignment() >= 8 &&
+               Subtarget->hasSSE2())
+      return MVT::f64;
   }
   if (Subtarget->is64Bit() && Size >= 8)
     return MVT::i64;
@@ -1130,10 +1139,8 @@ X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
          Subtarget->isPICStyleGOT());
   // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
   // entries.
-
-  // FIXME: @GOTOFF should be a property of MCSymbolRefExpr not in the MCSymbol.
-  std::string Name = MBB->getSymbol(Ctx)->getName() + "@GOTOFF";
-  return MCSymbolRefExpr::Create(Ctx.GetOrCreateSymbol(StringRef(Name)), Ctx);
+  return MCSymbolRefExpr::Create(MBB->getSymbol(),
+                                 MCSymbolRefExpr::VK_GOTOFF, Ctx);
 }
 
 /// getPICJumpTableRelocaBase - Returns relocation base for the given PIC
@@ -1195,13 +1202,11 @@ X86TargetLowering::LowerReturn(SDValue Chain,
                  RVLocs, *DAG.getContext());
   CCInfo.AnalyzeReturn(Outs, RetCC_X86);
 
-  // If this is the first return lowered for this function, add the regs to the
-  // liveout set for the function.
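The rewritten getOptimalMemOpType above now receives separate DstAlign/SrcAlign values (zero meaning "unconstrained") plus a SafeToUseFP flag. A minimal standalone sketch of the selection order it encodes; the enum and the bool parameters below are illustrative stand-ins for the Subtarget queries, not LLVM API:

    #include <cstdint>
    #include <cstdio>

    // Illustrative stand-ins for the Subtarget queries used above.
    enum class MemOpTy { v4i32, v4f32, f64, i64, i32 };

    MemOpTy pickMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
                          bool SafeToUseFP, bool FastUnaligned, bool HasSSE1,
                          bool HasSSE2, bool Is64Bit, unsigned StackAlign) {
      bool AlignOK = FastUnaligned ||
                     ((DstAlign == 0 || DstAlign >= 16) &&
                      (SrcAlign == 0 || SrcAlign >= 16)); // 0 == unconstrained
      if (Size >= 16 && AlignOK && StackAlign >= 16) {
        if (HasSSE2)
          return MemOpTy::v4i32;            // 16-byte integer vector stores
        if (SafeToUseFP && HasSSE1)
          return MemOpTy::v4f32;            // FP fallback needs SafeToUseFP
      } else if (SafeToUseFP && Size >= 8 && !Is64Bit &&
                 StackAlign >= 8 && HasSSE2) {
        return MemOpTy::f64;                // 8-byte stores on x86-32
      }
      return (Is64Bit && Size >= 8) ? MemOpTy::i64 : MemOpTy::i32;
    }

    int main() {
      // A 32-byte memcpy with unknown alignments on a 64-bit SSE2 target
      // picks v4i32 (prints 0).
      std::printf("%d\n", (int)pickMemOpType(32, 0, 0, true, false,
                                             true, true, true, 16));
    }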
- if (DAG.getMachineFunction().getRegInfo().liveout_empty()) { - for (unsigned i = 0; i != RVLocs.size(); ++i) - if (RVLocs[i].isRegLoc()) - DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg()); - } + // Add the regs to the liveout set for the function. + MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); + for (unsigned i = 0; i != RVLocs.size(); ++i) + if (RVLocs[i].isRegLoc() && !MRI.isLiveOut(RVLocs[i].getLocReg())) + MRI.addLiveOut(RVLocs[i].getLocReg()); SDValue Flag; @@ -1254,7 +1259,7 @@ X86TargetLowering::LowerReturn(SDValue Chain, X86MachineFunctionInfo *FuncInfo = MF.getInfo(); unsigned Reg = FuncInfo->getSRetReturnReg(); if (!Reg) { - Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64)); + Reg = MRI.createVirtualRegister(getRegClassFor(MVT::i64)); FuncInfo->setSRetReturnReg(Reg); } SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy()); @@ -1263,7 +1268,7 @@ X86TargetLowering::LowerReturn(SDValue Chain, Flag = Chain.getValue(1); // RAX now acts like a return value. - MF.getRegInfo().addLiveOut(X86::RAX); + MRI.addLiveOut(X86::RAX); } RetOps[0] = Chain; // Update chain. @@ -1393,7 +1398,9 @@ bool X86TargetLowering::IsCalleePop(bool IsVarArg, CallingConv::ID CallingConv){ case CallingConv::X86_FastCall: return !Subtarget->is64Bit(); case CallingConv::Fast: - return PerformTailCallOpt; + return GuaranteedTailCallOpt; + case CallingConv::GHC: + return GuaranteedTailCallOpt; } } @@ -1401,7 +1408,9 @@ bool X86TargetLowering::IsCalleePop(bool IsVarArg, CallingConv::ID CallingConv){ /// given CallingConvention value. CCAssignFn *X86TargetLowering::CCAssignFnForNode(CallingConv::ID CC) const { if (Subtarget->is64Bit()) { - if (Subtarget->isTargetWin64()) + if (CC == CallingConv::GHC) + return CC_X86_64_GHC; + else if (Subtarget->isTargetWin64()) return CC_X86_Win64_C; else return CC_X86_64_C; @@ -1411,22 +1420,12 @@ CCAssignFn *X86TargetLowering::CCAssignFnForNode(CallingConv::ID CC) const { return CC_X86_32_FastCall; else if (CC == CallingConv::Fast) return CC_X86_32_FastCC; + else if (CC == CallingConv::GHC) + return CC_X86_32_GHC; else return CC_X86_32_C; } -/// NameDecorationForCallConv - Selects the appropriate decoration to -/// apply to a MachineFunction containing a given calling convention. -NameDecorationStyle -X86TargetLowering::NameDecorationForCallConv(CallingConv::ID CallConv) { - if (CallConv == CallingConv::X86_FastCall) - return FastCall; - else if (CallConv == CallingConv::X86_StdCall) - return StdCall; - return None; -} - - /// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified /// by "Src" to address "Dst" with size and alignment information specified by /// the specific parameter attribute. The copy will be passed as a byval @@ -1440,10 +1439,16 @@ CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, /*AlwaysInline=*/true, NULL, 0, NULL, 0); } +/// IsTailCallConvention - Return true if the calling convention is one that +/// supports tail call optimization. +static bool IsTailCallConvention(CallingConv::ID CC) { + return (CC == CallingConv::Fast || CC == CallingConv::GHC); +} + /// FuncIsMadeTailCallSafe - Return true if the function is being made into /// a tailcall target by changing its ABI. 
static bool FuncIsMadeTailCallSafe(CallingConv::ID CC) { - return PerformTailCallOpt && CC == CallingConv::Fast; + return GuaranteedTailCallOpt && IsTailCallConvention(CC); } SDValue @@ -1471,13 +1476,18 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, // changed with more analysis. // In case of tail call optimization mark all arguments mutable. Since they // could be overwritten by lowering of arguments in case of a tail call. - int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8, - VA.getLocMemOffset(), isImmutable, false); - SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); - if (Flags.isByVal()) - return FIN; - return DAG.getLoad(ValVT, dl, Chain, FIN, - PseudoSourceValue::getFixedStack(FI), 0); + if (Flags.isByVal()) { + int FI = MFI->CreateFixedObject(Flags.getByValSize(), + VA.getLocMemOffset(), isImmutable, false); + return DAG.getFrameIndex(FI, getPointerTy()); + } else { + int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8, + VA.getLocMemOffset(), isImmutable, false); + SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); + return DAG.getLoad(ValVT, dl, Chain, FIN, + PseudoSourceValue::getFixedStack(FI), 0, + false, false, 0); + } } SDValue @@ -1488,7 +1498,6 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, DebugLoc dl, SelectionDAG &DAG, SmallVectorImpl &InVals) { - MachineFunction &MF = DAG.getMachineFunction(); X86MachineFunctionInfo *FuncInfo = MF.getInfo(); @@ -1498,15 +1507,12 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, Fn->getName() == "main") FuncInfo->setForceFramePointer(true); - // Decorate the function name. - FuncInfo->setDecorationStyle(NameDecorationForCallConv(CallConv)); - MachineFrameInfo *MFI = MF.getFrameInfo(); bool Is64Bit = Subtarget->is64Bit(); bool IsWin64 = Subtarget->isTargetWin64(); - assert(!(isVarArg && CallConv == CallingConv::Fast) && - "Var args not supported with calling convention fastcc"); + assert(!(isVarArg && IsTailCallConvention(CallConv)) && + "Var args not supported with calling convention fastcc or ghc"); // Assign locations to all of the incoming arguments. SmallVector ArgLocs; @@ -1573,7 +1579,8 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, // If value is passed via pointer - do a load. if (VA.getLocInfo() == CCValAssign::Indirect) - ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, NULL, 0); + ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, NULL, 0, + false, false, 0); InVals.push_back(ArgValue); } @@ -1668,7 +1675,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, PseudoSourceValue::getFixedStack(RegSaveFrameIndex), - Offset); + Offset, false, false, 0); MemOps.push_back(Store); Offset += 8; } @@ -1708,7 +1715,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, } else { BytesToPopOnReturn = 0; // Callee pops nothing. // If this is an sret function, the return should pop the hidden pointer. 
- if (!Is64Bit && CallConv != CallingConv::Fast && ArgsAreStructReturn(Ins)) + if (!Is64Bit && !IsTailCallConvention(CallConv) && ArgsAreStructReturn(Ins)) BytesToPopOnReturn = 4; } @@ -1737,7 +1744,8 @@ X86TargetLowering::LowerMemOpCallTo(SDValue Chain, return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl); } return DAG.getStore(Chain, dl, Arg, PtrOff, - PseudoSourceValue::getStack(), LocMemOffset); + PseudoSourceValue::getStack(), LocMemOffset, + false, false, 0); } /// EmitTailCallLoadRetAddr - Emit a load of return address if tail call @@ -1747,14 +1755,12 @@ X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall, bool Is64Bit, int FPDiff, DebugLoc dl) { - if (!IsTailCall || FPDiff==0) return Chain; - // Adjust the Return address stack slot. EVT VT = getPointerTy(); OutRetAddr = getReturnAddressFrameIndex(DAG); // Load the "old" Return address. - OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, NULL, 0); + OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, NULL, 0, false, false, 0); return SDValue(OutRetAddr.getNode(), 1); } @@ -1769,16 +1775,17 @@ EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF, // Calculate the new stack slot for the return address. int SlotSize = Is64Bit ? 8 : 4; int NewReturnAddrFI = - MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, true,false); + MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, false, false); EVT VT = Is64Bit ? MVT::i64 : MVT::i32; SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT); Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx, - PseudoSourceValue::getFixedStack(NewReturnAddrFI), 0); + PseudoSourceValue::getFixedStack(NewReturnAddrFI), 0, + false, false, 0); return Chain; } SDValue -X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, const Type *RetTy, +X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg, bool &isTailCall, const SmallVectorImpl &Outs, @@ -1788,14 +1795,25 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, const Type *RetTy, MachineFunction &MF = DAG.getMachineFunction(); bool Is64Bit = Subtarget->is64Bit(); bool IsStructRet = CallIsStructReturn(Outs); + bool IsSibcall = false; - if (isTailCall) + if (isTailCall) { // Check if it's really possible to do a tail call. - isTailCall = IsEligibleForTailCallOptimization(Callee, RetTy, CallConv, - isVarArg, Outs, Ins, DAG); + isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, + isVarArg, IsStructRet, MF.getFunction()->hasStructRetAttr(), + Outs, Ins, DAG); - assert(!(isVarArg && CallConv == CallingConv::Fast) && - "Var args not supported with calling convention fastcc"); + // Sibcalls are automatically detected tailcalls which do not require + // ABI changes. + if (!GuaranteedTailCallOpt && isTailCall) + IsSibcall = true; + + if (isTailCall) + ++NumTailCalls; + } + + assert(!(isVarArg && IsTailCallConvention(CallConv)) && + "Var args not supported with calling convention fastcc or ghc"); // Analyze operands of the call, assigning locations to each operand. SmallVector ArgLocs; @@ -1805,17 +1823,15 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, const Type *RetTy, // Get a count of how many bytes are to be pushed on the stack. 
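The BytesToPopOnReturn = 4 path above encodes the x86-32 struct-return convention: the callee itself pops the hidden sret pointer slot, returning with ret $4. A small illustration of where that hidden pointer comes from, assuming a 32-bit x86 C++ target (the names are illustrative):

    // On 32-bit x86, returning a large struct really passes a hidden first
    // argument: a pointer to caller-allocated storage. Under the convention
    // handled above, the callee pops that pointer itself ("ret $4").
    struct Big { int v[8]; };

    Big makeBig() {              // lowered roughly as makeBig(Big *sret)
      Big b = {};
      b.v[0] = 42;
      return b;                  // writes through the hidden pointer, ret $4
    }

    int main() {
      Big b = makeBig();         // caller materializes &b as the hidden arg
      return b.v[0] - 42;
    }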
unsigned NumBytes = CCInfo.getNextStackOffset(); - if (FuncIsMadeTailCallSafe(CallConv)) - NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG); - else if (isTailCall && !PerformTailCallOpt) + if (IsSibcall) // This is a sibcall. The memory operands are available in caller's // own caller's stack. NumBytes = 0; + else if (GuaranteedTailCallOpt && IsTailCallConvention(CallConv)) + NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG); int FPDiff = 0; - if (isTailCall) { - ++NumTailCalls; - + if (isTailCall && !IsSibcall) { // Lower arguments at fp - stackoffset + fpdiff. unsigned NumBytesCallerPushed = MF.getInfo()->getBytesToPopOnReturn(); @@ -1827,12 +1843,14 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, const Type *RetTy, MF.getInfo()->setTCReturnAddrDelta(FPDiff); } - Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true)); + if (!IsSibcall) + Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true)); SDValue RetAddrFrIdx; // Load return adress for tail calls. - Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall, Is64Bit, - FPDiff, dl); + if (isTailCall && FPDiff) + Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall, + Is64Bit, FPDiff, dl); SmallVector, 8> RegsToPass; SmallVector MemOpChains; @@ -1874,7 +1892,8 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, const Type *RetTy, SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT()); int FI = cast(SpillSlot)->getIndex(); Chain = DAG.getStore(Chain, dl, Arg, SpillSlot, - PseudoSourceValue::getFixedStack(FI), 0); + PseudoSourceValue::getFixedStack(FI), 0, + false, false, 0); Arg = SpillSlot; break; } @@ -1882,15 +1901,12 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, const Type *RetTy, if (VA.isRegLoc()) { RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); - } else { - if (!isTailCall || (isTailCall && isByVal)) { - assert(VA.isMemLoc()); - if (StackPtr.getNode() == 0) - StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, getPointerTy()); - - MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, - dl, DAG, VA, Flags)); - } + } else if (!IsSibcall && (!isTailCall || isByVal)) { + assert(VA.isMemLoc()); + if (StackPtr.getNode() == 0) + StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, getPointerTy()); + MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, + dl, DAG, VA, Flags)); } } @@ -1910,7 +1926,6 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, const Type *RetTy, InFlag = Chain.getValue(1); } - if (Subtarget->isPICStyleGOT()) { // ELF / PIC requires GOT in the EBX register before function calls via PLT // GOT pointer. @@ -1980,7 +1995,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, const Type *RetTy, int FI = 0; // Do not flag preceeding copytoreg stuff together with the following stuff. InFlag = SDValue(); - if (PerformTailCallOpt) { + if (GuaranteedTailCallOpt) { for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; if (VA.isRegLoc()) @@ -2009,7 +2024,8 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, const Type *RetTy, // Store relative to framepointer. 
MemOpChains2.push_back( DAG.getStore(ArgChain, dl, Arg, FIN, - PseudoSourceValue::getFixedStack(FI), 0)); + PseudoSourceValue::getFixedStack(FI), 0, + false, false, 0)); } } } @@ -2091,22 +2107,11 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, const Type *RetTy, OpFlags); } - if (isTailCall && !WasGlobalOrExternal) { - unsigned Opc = Is64Bit ? X86::R11 : X86::EAX; - - Chain = DAG.getCopyToReg(Chain, dl, - DAG.getRegister(Opc, getPointerTy()), - Callee,InFlag); - Callee = DAG.getRegister(Opc, getPointerTy()); - // Add register as live out. - MF.getRegInfo().addLiveOut(Opc); - } - // Returns a chain & a flag for retval copy to use. SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); SmallVector Ops; - if (isTailCall) { + if (!IsSibcall && isTailCall) { Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), DAG.getIntPtrConstant(0, true), InFlag); InFlag = Chain.getValue(1); @@ -2147,14 +2152,6 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, const Type *RetTy, if (RVLocs[i].isRegLoc()) MF.getRegInfo().addLiveOut(RVLocs[i].getLocReg()); } - - assert(((Callee.getOpcode() == ISD::Register && - (cast(Callee)->getReg() == X86::EAX || - cast(Callee)->getReg() == X86::R11)) || - Callee.getOpcode() == ISD::TargetExternalSymbol || - Callee.getOpcode() == ISD::TargetGlobalAddress) && - "Expecting a global address, external symbol, or scratch register"); - return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, &Ops[0], Ops.size()); } @@ -2166,8 +2163,8 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, const Type *RetTy, unsigned NumBytesForCalleeToPush; if (IsCalleePop(isVarArg, CallConv)) NumBytesForCalleeToPush = NumBytes; // Callee pops everything - else if (!Is64Bit && CallConv != CallingConv::Fast && IsStructRet) - // If this is is a call to a struct-return function, the callee + else if (!Is64Bit && !IsTailCallConvention(CallConv) && IsStructRet) + // If this is a call to a struct-return function, the callee // pops the hidden struct pointer, so we have to push it back. // This is common for Darwin/X86, Linux & Mingw32 targets. NumBytesForCalleeToPush = 4; @@ -2175,12 +2172,14 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, const Type *RetTy, NumBytesForCalleeToPush = 0; // Callee pops nothing. // Returns a flag for retval copy to use. - Chain = DAG.getCALLSEQ_END(Chain, - DAG.getIntPtrConstant(NumBytes, true), - DAG.getIntPtrConstant(NumBytesForCalleeToPush, - true), - InFlag); - InFlag = Chain.getValue(1); + if (!IsSibcall) { + Chain = DAG.getCALLSEQ_END(Chain, + DAG.getIntPtrConstant(NumBytes, true), + DAG.getIntPtrConstant(NumBytesForCalleeToPush, + true), + InFlag); + InFlag = Chain.getValue(1); + } // Handle result values, copying them out of physregs into vregs that we // return. @@ -2242,38 +2241,122 @@ unsigned X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize, return Offset; } +/// MatchingStackOffset - Return true if the given stack call argument is +/// already available in the same position (relatively) of the caller's +/// incoming argument stack. 
+static
+bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
+                         MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
+                         const X86InstrInfo *TII) {
+  unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
+  int FI = INT_MAX;
+  if (Arg.getOpcode() == ISD::CopyFromReg) {
+    unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
+    if (!VR || TargetRegisterInfo::isPhysicalRegister(VR))
+      return false;
+    MachineInstr *Def = MRI->getVRegDef(VR);
+    if (!Def)
+      return false;
+    if (!Flags.isByVal()) {
+      if (!TII->isLoadFromStackSlot(Def, FI))
+        return false;
+    } else {
+      unsigned Opcode = Def->getOpcode();
+      if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) &&
+          Def->getOperand(1).isFI()) {
+        FI = Def->getOperand(1).getIndex();
+        Bytes = Flags.getByValSize();
+      } else
+        return false;
+    }
+  } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
+    if (Flags.isByVal())
+      // ByVal argument is passed in as a pointer but it's now being
+      // dereferenced, e.g.:
+      // define @foo(%struct.X* %A) {
+      //   tail call @bar(%struct.X* byval %A)
+      // }
+      return false;
+    SDValue Ptr = Ld->getBasePtr();
+    FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
+    if (!FINode)
+      return false;
+    FI = FINode->getIndex();
+  } else
+    return false;
+
+  assert(FI != INT_MAX);
+  if (!MFI->isFixedObjectIndex(FI))
+    return false;
+  return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
+}
+
 /// IsEligibleForTailCallOptimization - Check whether the call is eligible
 /// for tail call optimization. Targets which want to do tail call
 /// optimization should implement this function.
 bool
 X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
-                                                     const Type *RetTy,
                                                      CallingConv::ID CalleeCC,
                                                      bool isVarArg,
+                                                     bool isCalleeStructRet,
+                                                     bool isCallerStructRet,
                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
                                     const SmallVectorImpl<ISD::InputArg> &Ins,
                                                      SelectionDAG& DAG) const {
-  if (CalleeCC != CallingConv::Fast &&
+  if (!IsTailCallConvention(CalleeCC) &&
       CalleeCC != CallingConv::C)
     return false;
 
   // If -tailcallopt is specified, make fastcc functions tail-callable.
+  const MachineFunction &MF = DAG.getMachineFunction();
   const Function *CallerF = DAG.getMachineFunction().getFunction();
-  if (PerformTailCallOpt) {
-    if (CalleeCC == CallingConv::Fast &&
+  if (GuaranteedTailCallOpt) {
+    if (IsTailCallConvention(CalleeCC) &&
         CallerF->getCallingConv() == CalleeCC)
       return true;
     return false;
   }
 
   // Look for obvious safe cases to perform tail call optimization that do not
   // require ABI changes. This is what gcc calls sibcall.
 
-  // Do not tail call optimize vararg calls for now.
-  if (isVarArg)
+  // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
+  // emit a special epilogue.
+  if (RegInfo->needsStackRealignment(MF))
+    return false;
+
+  // Do not sibcall optimize vararg calls unless the call site passes no
+  // arguments.
+  if (isVarArg && !Outs.empty())
     return false;
 
+  // Also avoid sibcall optimization if either caller or callee uses struct
+  // return semantics.
+  if (isCalleeStructRet || isCallerStructRet)
+    return false;
+
+  // If the call result is in ST0 / ST1, it needs to be popped off the x87
+  // stack. Therefore, if the result is not used, it is not safe to optimize
+  // this into a sibcall.
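MatchingStackOffset above is what allows a sibcall to reuse the caller's own incoming argument area instead of copying arguments. A sketch of the source shape it is designed to recognize; with enough integer arguments some land on the stack, and because outer forwards them unchanged, each outgoing stack value already sits at its own incoming offset, so the call can lower to a bare jmp:

    // Because outer() forwards every argument unchanged, each outgoing
    // stack argument already sits at its own incoming offset (the condition
    // MatchingStackOffset checks), so the call can become "jmp inner".
    int inner(int a, int b, int c, int d, int e, int f, int g) {
      return a + b + c + d + e + f + g;
    }

    int outer(int a, int b, int c, int d, int e, int f, int g) {
      return inner(a, b, c, d, e, f, g);   // tail position, args untouched
    }

    int main() { return outer(1, 2, 3, 4, 5, 6, 7) == 28 ? 0 : 1; }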
+ bool Unused = false; + for (unsigned i = 0, e = Ins.size(); i != e; ++i) { + if (!Ins[i].Used) { + Unused = true; + break; + } + } + if (Unused) { + SmallVector RVLocs; + CCState CCInfo(CalleeCC, false, getTargetMachine(), + RVLocs, *DAG.getContext()); + CCInfo.AnalyzeCallResult(Ins, RetCC_X86); + for (unsigned i = 0; i != RVLocs.size(); ++i) { + CCValAssign &VA = RVLocs[i]; + if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) + return false; + } + } + // If the callee takes no arguments then go on to check the results of the // call. if (!Outs.empty()) { @@ -2294,42 +2377,26 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, // Check if the arguments are already laid out in the right way as // the caller's fixed stack objects. MachineFrameInfo *MFI = MF.getFrameInfo(); + const MachineRegisterInfo *MRI = &MF.getRegInfo(); + const X86InstrInfo *TII = + ((X86TargetMachine&)getTargetMachine()).getInstrInfo(); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; EVT RegVT = VA.getLocVT(); SDValue Arg = Outs[i].Val; ISD::ArgFlagsTy Flags = Outs[i].Flags; - if (Flags.isByVal()) - return false; // TODO if (VA.getLocInfo() == CCValAssign::Indirect) return false; if (!VA.isRegLoc()) { - LoadSDNode *Ld = dyn_cast(Arg); - if (!Ld) - return false; - SDValue Ptr = Ld->getBasePtr(); - FrameIndexSDNode *FINode = dyn_cast(Ptr); - if (!FINode) - return false; - int FI = FINode->getIndex(); - if (!MFI->isFixedObjectIndex(FI)) - return false; - if (VA.getLocMemOffset() != MFI->getObjectOffset(FI)) + if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, + MFI, MRI, TII)) return false; } } } } - // If the caller does not return a value, then this is obviously safe. - // This is one case where it's safe to perform this optimization even - // if the return types do not match. - const Type *CallerRetTy = CallerF->getReturnType(); - if (CallerRetTy->isVoidTy()) - return true; - - // If the return types match, then it's safe. - return CallerRetTy == RetTy; + return true; } FastISel * @@ -2364,7 +2431,7 @@ SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) { // Set up a frame object for the return address. uint64_t SlotSize = TD->getPointerSize(); ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize, - true, false); + false, false); FuncInfo->setRAIndex(ReturnAddrIndex); } @@ -2375,7 +2442,7 @@ SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) { bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, bool hasSymbolicDisplacement) { // Offset should fit into 32 bit immediate field. - if (!isInt32(Offset)) + if (!isInt<32>(Offset)) return false; // If we don't have a symbolic displacement - we don't have any extra @@ -3559,7 +3626,8 @@ X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl, int EltNo = (Offset - StartOffset) >> 2; int Mask[4] = { EltNo, EltNo, EltNo, EltNo }; EVT VT = (PVT == MVT::i32) ? MVT::v4i32 : MVT::v4f32; - SDValue V1 = DAG.getLoad(VT, dl, Chain, Ptr,LD->getSrcValue(),0); + SDValue V1 = DAG.getLoad(VT, dl, Chain, Ptr,LD->getSrcValue(),0, + false, false, 0); // Canonicalize it to a v4i32 shuffle. 
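The ST0/ST1 screening at the start of this span reflects an x87 quirk: a floating-point return value occupies the register stack and must be popped even when ignored. A compiled-for-x86-32 illustration of why an unused FP result rules out the sibcall transformation (produce is a placeholder name):

    double produce();            // returns in ST0 under the x86-32 C ABI

    void consume() {
      // The result arrives on the x87 register stack. Even though it is
      // unused, the caller must execute "fstp st(0)" after the call to keep
      // the x87 stack balanced -- so there must be code *after* the call,
      // which rules out replacing the call with a tail jump.
      produce();
    }

    double produce() { return 3.14; }
    int main() { consume(); return 0; }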
V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32, V1); return DAG.getNode(ISD::BIT_CONVERT, dl, VT, @@ -3570,6 +3638,69 @@ X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl, return SDValue(); } +/// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a +/// vector of type 'VT', see if the elements can be replaced by a single large +/// load which has the same value as a build_vector whose operands are 'elts'. +/// +/// Example: -> zextload a +/// +/// FIXME: we'd also like to handle the case where the last elements are zero +/// rather than undef via VZEXT_LOAD, but we do not detect that case today. +/// There's even a handy isZeroNode for that purpose. +static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl &Elts, + DebugLoc &dl, SelectionDAG &DAG) { + EVT EltVT = VT.getVectorElementType(); + unsigned NumElems = Elts.size(); + + LoadSDNode *LDBase = NULL; + unsigned LastLoadedElt = -1U; + + // For each element in the initializer, see if we've found a load or an undef. + // If we don't find an initial load element, or later load elements are + // non-consecutive, bail out. + for (unsigned i = 0; i < NumElems; ++i) { + SDValue Elt = Elts[i]; + + if (!Elt.getNode() || + (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode()))) + return SDValue(); + if (!LDBase) { + if (Elt.getNode()->getOpcode() == ISD::UNDEF) + return SDValue(); + LDBase = cast(Elt.getNode()); + LastLoadedElt = i; + continue; + } + if (Elt.getOpcode() == ISD::UNDEF) + continue; + + LoadSDNode *LD = cast(Elt); + if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i)) + return SDValue(); + LastLoadedElt = i; + } + + // If we have found an entire vector of loads and undefs, then return a large + // load of the entire vector width starting at the base pointer. If we found + // consecutive loads for the low half, generate a vzext_load node. + if (LastLoadedElt == NumElems - 1) { + if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16) + return DAG.getLoad(VT, dl, LDBase->getChain(), LDBase->getBasePtr(), + LDBase->getSrcValue(), LDBase->getSrcValueOffset(), + LDBase->isVolatile(), LDBase->isNonTemporal(), 0); + return DAG.getLoad(VT, dl, LDBase->getChain(), LDBase->getBasePtr(), + LDBase->getSrcValue(), LDBase->getSrcValueOffset(), + LDBase->isVolatile(), LDBase->isNonTemporal(), + LDBase->getAlignment()); + } else if (NumElems == 4 && LastLoadedElt == 1) { + SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); + SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() }; + SDValue ResNode = DAG.getNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2); + return DAG.getNode(ISD::BIT_CONVERT, dl, VT, ResNode); + } + return SDValue(); +} + SDValue X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { DebugLoc dl = Op.getDebugLoc(); @@ -3798,14 +3929,18 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]); } - if (Values.size() > 2) { - // If we have SSE 4.1, Expand into a number of inserts unless the number of - // values to be inserted is equal to the number of elements, in which case - // use the unpack code below in the hopes of matching the consecutive elts - // load merge pattern for shuffles. - // FIXME: We could probably just check that here directly. - if (Values.size() < NumElems && VT.getSizeInBits() == 128 && - getSubtarget()->hasSSE41()) { + if (Values.size() > 1 && VT.getSizeInBits() == 128) { + // Check for a build vector of consecutive loads. 
+ for (unsigned i = 0; i < NumElems; ++i) + V[i] = Op.getOperand(i); + + // Check for elements which are consecutive loads. + SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG); + if (LD.getNode()) + return LD; + + // For SSE 4.1, use inserts into undef. + if (getSubtarget()->hasSSE41()) { V[0] = DAG.getUNDEF(VT); for (unsigned i = 0; i < NumElems; ++i) if (Op.getOperand(i).getOpcode() != ISD::UNDEF) @@ -3813,7 +3948,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { Op.getOperand(i), DAG.getIntPtrConstant(i)); return V[0]; } - // Expand into a number of unpckl*. + + // Otherwise, expand into a number of unpckl* // e.g. for v4f32 // Step 1: unpcklps 0, 2 ==> X: // : unpcklps 1, 3 ==> Y: @@ -3828,7 +3964,6 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { } return V[0]; } - return SDValue(); } @@ -4803,8 +4938,16 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG){ if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) && isa(N2)) { - unsigned Opc = (EltVT.getSizeInBits() == 8) ? X86ISD::PINSRB - : X86ISD::PINSRW; + unsigned Opc; + if (VT == MVT::v8i16) + Opc = X86ISD::PINSRW; + else if (VT == MVT::v4i16) + Opc = X86ISD::MMX_PINSRW; + else if (VT == MVT::v16i8) + Opc = X86ISD::PINSRB; + else + Opc = X86ISD::PINSRB; + // Transform it so it match pinsr{b,w} which expects a GR32 as its second // argument. if (N1.getValueType() != MVT::i32) @@ -4855,7 +4998,8 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); if (N2.getValueType() != MVT::i32) N2 = DAG.getIntPtrConstant(cast(N2)->getZExtValue()); - return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2); + return DAG.getNode(VT == MVT::v8i16 ? X86ISD::PINSRW : X86ISD::MMX_PINSRW, + dl, VT, N0, N1, N2); } return SDValue(); } @@ -5058,7 +5202,7 @@ X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl, // load. if (isGlobalStubReference(OpFlags)) Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result, - PseudoSourceValue::getGOT(), 0); + PseudoSourceValue::getGOT(), 0, false, false, 0); // If there was a non-zero offset that we didn't fold, create an explicit // addition for it. @@ -5138,7 +5282,7 @@ static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, MVT::i32)); SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Base, - NULL, 0); + NULL, 0, false, false, 0); unsigned char OperandFlags = 0; // Most TLS accesses are not RIP relative, even on x86-64. One exception is @@ -5163,7 +5307,7 @@ static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, if (model == TLSModel::InitialExec) Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset, - PseudoSourceValue::getGOT(), 0); + PseudoSourceValue::getGOT(), 0, false, false, 0); // The address of the thread local variable is the add of the thread // pointer with the offset of the variable. 
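EltsFromConsecutiveLoads, introduced above and called from LowerBUILD_VECTOR, folds a build_vector whose elements are adjacent loads into one wide load (or a VZEXT_LOAD when only the low half is covered). Its effect, modeled with SSE2 intrinsics on a plain int buffer; this is an illustration for an SSE2 host, not code from this file:

    #include <emmintrin.h>
    #include <cstdio>

    // Element-wise build from four adjacent loads: the shape the DAG sees.
    __m128i buildPiecewise(const int *p) {
      return _mm_set_epi32(p[3], p[2], p[1], p[0]);
    }

    // What it folds to when the loads are consecutive: one 128-bit load.
    __m128i buildWide(const int *p) {
      return _mm_loadu_si128(reinterpret_cast<const __m128i *>(p));
    }

    int main() {
      alignas(16) int data[4] = {1, 2, 3, 4};
      alignas(16) int out[4];
      _mm_store_si128(reinterpret_cast<__m128i *>(out),
                      _mm_add_epi32(buildPiecewise(data), buildWide(data)));
      std::printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]); // 2 4 6 8
    }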
@@ -5231,7 +5375,7 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) { SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, DAG.getConstant(VTBits, MVT::i8)); - SDValue Cond = DAG.getNode(X86ISD::CMP, dl, VT, + SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32, AndNode, DAG.getConstant(0, MVT::i8)); SDValue Hi, Lo; @@ -5280,7 +5424,8 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) { SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), StackSlot, - PseudoSourceValue::getFixedStack(SSFI), 0); + PseudoSourceValue::getFixedStack(SSFI), 0, + false, false, 0); return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG); } @@ -5315,7 +5460,8 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, }; Chain = DAG.getNode(X86ISD::FST, dl, Tys, Ops, array_lengthof(Ops)); Result = DAG.getLoad(Op.getValueType(), dl, Chain, StackSlot, - PseudoSourceValue::getFixedStack(SSFI), 0); + PseudoSourceValue::getFixedStack(SSFI), 0, + false, false, 0); } return Result; @@ -5388,12 +5534,12 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG) { SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, XR1, XR2); SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, PseudoSourceValue::getConstantPool(), 0, - false, 16); + false, false, 16); SDValue Unpck2 = getUnpackl(DAG, dl, MVT::v4i32, Unpck1, CLod0); SDValue XR2F = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Unpck2); SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, PseudoSourceValue::getConstantPool(), 0, - false, 16); + false, false, 16); SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); // Add the halves; easiest way is to swap them into another reg first. @@ -5480,9 +5626,9 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) { SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackSlot, WordOff); SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), - StackSlot, NULL, 0); + StackSlot, NULL, 0, false, false, 0); SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32), - OffsetSlot, NULL, 0); + OffsetSlot, NULL, 0, false, false, 0); return BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG); } @@ -5530,7 +5676,8 @@ FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) { if (isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) { assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!"); Chain = DAG.getStore(Chain, dl, Value, StackSlot, - PseudoSourceValue::getFixedStack(SSFI), 0); + PseudoSourceValue::getFixedStack(SSFI), 0, + false, false, 0); SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other); SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(Op.getOperand(0).getValueType()) @@ -5564,7 +5711,7 @@ SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) { // Load the result. return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), - FIST, StackSlot, NULL, 0); + FIST, StackSlot, NULL, 0, false, false, 0); } SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) { @@ -5574,7 +5721,7 @@ SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) { // Load the result. 
return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), - FIST, StackSlot, NULL, 0); + FIST, StackSlot, NULL, 0, false, false, 0); } SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) { @@ -5599,8 +5746,8 @@ SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) { Constant *C = ConstantVector::get(CV); SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, - PseudoSourceValue::getConstantPool(), 0, - false, 16); + PseudoSourceValue::getConstantPool(), 0, + false, false, 16); return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask); } @@ -5626,8 +5773,8 @@ SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) { Constant *C = ConstantVector::get(CV); SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, - PseudoSourceValue::getConstantPool(), 0, - false, 16); + PseudoSourceValue::getConstantPool(), 0, + false, false, 16); if (VT.isVector()) { return DAG.getNode(ISD::BIT_CONVERT, dl, VT, DAG.getNode(ISD::XOR, dl, MVT::v2i64, @@ -5675,8 +5822,8 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { Constant *C = ConstantVector::get(CV); SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx, - PseudoSourceValue::getConstantPool(), 0, - false, 16); + PseudoSourceValue::getConstantPool(), 0, + false, false, 16); SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1); // Shift sign bit right or left if the two operands have different types. @@ -5704,8 +5851,8 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { C = ConstantVector::get(CV); CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, - PseudoSourceValue::getConstantPool(), 0, - false, 16); + PseudoSourceValue::getConstantPool(), 0, + false, false, 16); SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2); // Or the value with the sign bit. @@ -5857,26 +6004,31 @@ SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, /// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node /// if it's possible. 
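LowerToBT, reworked in the hunk that follows to look through truncates on both operands, recognizes single-bit tests. The two source-level shapes it matches, in plain C++; an x86 compiler can lower both to a bt instruction plus a flags read:

    #include <cstdio>

    // Both functions test one variable bit of x; they correspond to the
    // (and x, (shl 1, n)) and (and (srl x, n), 1) patterns handled below.
    bool testBitViaMask(unsigned x, unsigned n) {
      return (x & (1u << n)) != 0;     // bt n, x ; setb
    }

    bool testBitViaShift(unsigned x, unsigned n) {
      return (x >> n) & 1u;            // same bt after the combine
    }

    int main() {
      std::printf("%d %d\n", testBitViaMask(10, 1), testBitViaShift(10, 2));
      // prints "1 0": bit 1 of 10 (binary 1010) is set, bit 2 is clear
    }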
-static SDValue LowerToBT(SDValue Op0, ISD::CondCode CC, +static SDValue LowerToBT(SDValue And, ISD::CondCode CC, DebugLoc dl, SelectionDAG &DAG) { + SDValue Op0 = And.getOperand(0); + SDValue Op1 = And.getOperand(1); + if (Op0.getOpcode() == ISD::TRUNCATE) + Op0 = Op0.getOperand(0); + if (Op1.getOpcode() == ISD::TRUNCATE) + Op1 = Op1.getOperand(0); + SDValue LHS, RHS; - if (Op0.getOperand(1).getOpcode() == ISD::SHL) { - if (ConstantSDNode *Op010C = - dyn_cast(Op0.getOperand(1).getOperand(0))) - if (Op010C->getZExtValue() == 1) { - LHS = Op0.getOperand(0); - RHS = Op0.getOperand(1).getOperand(1); + if (Op1.getOpcode() == ISD::SHL) { + if (ConstantSDNode *And10C = dyn_cast(Op1.getOperand(0))) + if (And10C->getZExtValue() == 1) { + LHS = Op0; + RHS = Op1.getOperand(1); } - } else if (Op0.getOperand(0).getOpcode() == ISD::SHL) { - if (ConstantSDNode *Op000C = - dyn_cast(Op0.getOperand(0).getOperand(0))) - if (Op000C->getZExtValue() == 1) { - LHS = Op0.getOperand(1); - RHS = Op0.getOperand(0).getOperand(1); + } else if (Op0.getOpcode() == ISD::SHL) { + if (ConstantSDNode *And00C = dyn_cast(Op0.getOperand(0))) + if (And00C->getZExtValue() == 1) { + LHS = Op1; + RHS = Op0.getOperand(1); } - } else if (Op0.getOperand(1).getOpcode() == ISD::Constant) { - ConstantSDNode *AndRHS = cast(Op0.getOperand(1)); - SDValue AndLHS = Op0.getOperand(0); + } else if (Op1.getOpcode() == ISD::Constant) { + ConstantSDNode *AndRHS = cast(Op1); + SDValue AndLHS = Op0; if (AndRHS->getZExtValue() == 1 && AndLHS.getOpcode() == ISD::SRL) { LHS = AndLHS.getOperand(0); RHS = AndLHS.getOperand(1); @@ -5926,6 +6078,21 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) { return NewSetCC; } + // Look for "(setcc) == / != 1" to avoid unncessary setcc. + if (Op0.getOpcode() == X86ISD::SETCC && + Op1.getOpcode() == ISD::Constant && + (cast(Op1)->getZExtValue() == 1 || + cast(Op1)->isNullValue()) && + (CC == ISD::SETEQ || CC == ISD::SETNE)) { + X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0); + bool Invert = (CC == ISD::SETNE) ^ + cast(Op1)->isNullValue(); + if (Invert) + CCode = X86::GetOppositeBranchCondition(CCode); + return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, + DAG.getConstant(CCode, MVT::i8), Op0.getOperand(1)); + } + bool isFP = Op.getOperand(1).getValueType().isFloatingPoint(); unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG); if (X86CC == X86::COND_INVALID) @@ -6107,7 +6274,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) { N2C && N2C->isNullValue() && RHSC && RHSC->isNullValue()) { SDValue CmpOp0 = Cmp.getOperand(0); - Cmp = DAG.getNode(X86ISD::CMP, dl, CmpOp0.getValueType(), + Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, CmpOp0, DAG.getConstant(1, CmpOp0.getValueType())); return DAG.getNode(X86ISD::SETCC_CARRY, dl, Op.getValueType(), DAG.getConstant(X86::COND_B, MVT::i8), Cmp); @@ -6367,24 +6534,13 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, EVT IntPtr = getPointerTy(); EVT SPTy = Subtarget->is64Bit() ? 
MVT::i64 : MVT::i32; - Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, true)); - Chain = DAG.getCopyToReg(Chain, dl, X86::EAX, Size, Flag); Flag = Chain.getValue(1); - SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); - SDValue Ops[] = { Chain, - DAG.getTargetExternalSymbol("_alloca", IntPtr), - DAG.getRegister(X86::EAX, IntPtr), - DAG.getRegister(X86StackPtr, SPTy), - Flag }; - Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops, 5); - Flag = Chain.getValue(1); + SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); - Chain = DAG.getCALLSEQ_END(Chain, - DAG.getIntPtrConstant(0, true), - DAG.getIntPtrConstant(0, true), - Flag); + Chain = DAG.getNode(X86ISD::MINGW_ALLOCA, dl, NodeTys, Chain, Flag); + Flag = Chain.getValue(1); Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, SPTy).getValue(1); @@ -6428,8 +6584,7 @@ X86TargetLowering::EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl, LowerCallTo(Chain, Type::getVoidTy(*DAG.getContext()), false, false, false, false, 0, CallingConv::C, false, /*isReturnValueUsed=*/false, - DAG.getExternalSymbol(bzeroEntry, IntPtr), Args, DAG, dl, - DAG.GetOrdering(Chain.getNode())); + DAG.getExternalSymbol(bzeroEntry, IntPtr), Args, DAG, dl); return CallResult.second; } @@ -6613,7 +6768,8 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) { // vastart just stores the address of the VarArgsFrameIndex slot into the // memory location argument. SDValue FR = DAG.getFrameIndex(VarArgsFrameIndex, getPointerTy()); - return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), SV, 0); + return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), SV, 0, + false, false, 0); } // __va_list_tag: @@ -6625,8 +6781,8 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) { SDValue FIN = Op.getOperand(1); // Store gp_offset SDValue Store = DAG.getStore(Op.getOperand(0), dl, - DAG.getConstant(VarArgsGPOffset, MVT::i32), - FIN, SV, 0); + DAG.getConstant(VarArgsGPOffset, MVT::i32), + FIN, SV, 0, false, false, 0); MemOps.push_back(Store); // Store fp_offset @@ -6634,21 +6790,23 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) { FIN, DAG.getIntPtrConstant(4)); Store = DAG.getStore(Op.getOperand(0), dl, DAG.getConstant(VarArgsFPOffset, MVT::i32), - FIN, SV, 0); + FIN, SV, 0, false, false, 0); MemOps.push_back(Store); // Store ptr to overflow_arg_area FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), FIN, DAG.getIntPtrConstant(4)); SDValue OVFIN = DAG.getFrameIndex(VarArgsFrameIndex, getPointerTy()); - Store = DAG.getStore(Op.getOperand(0), dl, OVFIN, FIN, SV, 0); + Store = DAG.getStore(Op.getOperand(0), dl, OVFIN, FIN, SV, 0, + false, false, 0); MemOps.push_back(Store); // Store ptr to reg_save_area. FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), FIN, DAG.getIntPtrConstant(8)); SDValue RSFIN = DAG.getFrameIndex(RegSaveFrameIndex, getPointerTy()); - Store = DAG.getStore(Op.getOperand(0), dl, RSFIN, FIN, SV, 0); + Store = DAG.getStore(Op.getOperand(0), dl, RSFIN, FIN, SV, 0, + false, false, 0); MemOps.push_back(Store); return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &MemOps[0], MemOps.size()); @@ -6934,13 +7092,13 @@ SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) { return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), DAG.getNode(ISD::ADD, dl, getPointerTy(), FrameAddr, Offset), - NULL, 0); + NULL, 0, false, false, 0); } // Just load the return address. 
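The four stores emitted by LowerVASTART above target offsets 0, 4, 8 and 16 of the System V x86-64 va_list record. The same layout restated as a plain struct with the offsets checked; this mirrors the SysV x86-64 ABI only and is not a type from the tree:

    #include <cstddef>
    #include <cstdint>

    // Mirror of the va_list record populated by LowerVASTART above
    // (System V x86-64 ABI).
    struct VaListTag {
      uint32_t gp_offset;        // next general-purpose reg slot in save area
      uint32_t fp_offset;        // next XMM reg slot in save area
      void *overflow_arg_area;   // next stack-passed argument
      void *reg_save_area;       // spilled RDI..R9 and XMM0..XMM7
    };

    static_assert(offsetof(VaListTag, gp_offset) == 0,  "first store");
    static_assert(offsetof(VaListTag, fp_offset) == 4,  "second store");
    static_assert(offsetof(VaListTag, overflow_arg_area) == 8, "third store");
    static_assert(offsetof(VaListTag, reg_save_area) == 16, "fourth store");

    int main() { return 0; }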
SDValue RetAddrFI = getReturnAddressFrameIndex(DAG); return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), - RetAddrFI, NULL, 0); + RetAddrFI, NULL, 0, false, false, 0); } SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) { @@ -6952,7 +7110,8 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) { unsigned FrameReg = Subtarget->is64Bit() ? X86::RBP : X86::EBP; SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); while (Depth--) - FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, NULL, 0); + FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, NULL, 0, + false, false, 0); return FrameAddr; } @@ -6976,7 +7135,7 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) SDValue StoreAddr = DAG.getNode(ISD::SUB, dl, getPointerTy(), Frame, DAG.getIntPtrConstant(-TD->getPointerSize())); StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StoreAddr, Offset); - Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, NULL, 0); + Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, NULL, 0, false, false, 0); Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr); MF.getRegInfo().addLiveOut(StoreAddrReg); @@ -6995,16 +7154,12 @@ SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op, const Value *TrmpAddr = cast(Op.getOperand(4))->getValue(); - const X86InstrInfo *TII = - ((X86TargetMachine&)getTargetMachine()).getInstrInfo(); - if (Subtarget->is64Bit()) { SDValue OutChains[6]; // Large code-model. - - const unsigned char JMP64r = TII->getBaseOpcodeFor(X86::JMP64r); - const unsigned char MOV64ri = TII->getBaseOpcodeFor(X86::MOV64ri); + const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode. + const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode. const unsigned char N86R10 = RegInfo->getX86RegNum(X86::R10); const unsigned char N86R11 = RegInfo->getX86RegNum(X86::R11); @@ -7015,11 +7170,12 @@ SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op, unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11 SDValue Addr = Trmp; OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), - Addr, TrmpAddr, 0); + Addr, TrmpAddr, 0, false, false, 0); Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, DAG.getConstant(2, MVT::i64)); - OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, TrmpAddr, 2, false, 2); + OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, TrmpAddr, 2, + false, false, 2); // Load the 'nest' parameter value into R10. // R10 is specified in X86CallingConv.td @@ -7027,24 +7183,25 @@ SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op, Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, DAG.getConstant(10, MVT::i64)); OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), - Addr, TrmpAddr, 10); + Addr, TrmpAddr, 10, false, false, 0); Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, DAG.getConstant(12, MVT::i64)); - OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 12, false, 2); + OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 12, + false, false, 2); // Jump to the nested function. OpCode = (JMP64r << 8) | REX_WB; // jmpq *... 
Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, DAG.getConstant(20, MVT::i64)); OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), - Addr, TrmpAddr, 20); + Addr, TrmpAddr, 20, false, false, 0); unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, DAG.getConstant(22, MVT::i64)); OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr, - TrmpAddr, 22); + TrmpAddr, 22, false, false, 0); SDValue Ops[] = { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6) }; @@ -7099,25 +7256,28 @@ SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op, DAG.getConstant(10, MVT::i32)); Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr); - const unsigned char MOV32ri = TII->getBaseOpcodeFor(X86::MOV32ri); + // This is storing the opcode for MOV32ri. + const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte. const unsigned char N86Reg = RegInfo->getX86RegNum(NestReg); OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(MOV32ri|N86Reg, MVT::i8), - Trmp, TrmpAddr, 0); + Trmp, TrmpAddr, 0, false, false, 0); Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, DAG.getConstant(1, MVT::i32)); - OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 1, false, 1); + OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 1, + false, false, 1); - const unsigned char JMP = TII->getBaseOpcodeFor(X86::JMP); + const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode. Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, DAG.getConstant(5, MVT::i32)); OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr, - TrmpAddr, 5, false, 1); + TrmpAddr, 5, false, false, 1); Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, DAG.getConstant(6, MVT::i32)); - OutChains[3] = DAG.getStore(Root, dl, Disp, Addr, TrmpAddr, 6, false, 1); + OutChains[3] = DAG.getStore(Root, dl, Disp, Addr, TrmpAddr, 6, + false, false, 1); SDValue Ops[] = { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4) }; @@ -7160,7 +7320,8 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) { DAG.getEntryNode(), StackSlot); // Load FP Control Word from stack slot - SDValue CWD = DAG.getLoad(MVT::i16, dl, Chain, StackSlot, NULL, 0); + SDValue CWD = DAG.getLoad(MVT::i16, dl, Chain, StackSlot, NULL, 0, + false, false, 0); // Transform as necessary SDValue CWD1 = @@ -7524,7 +7685,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, if (FIST.getNode() != 0) { EVT VT = N->getValueType(0); // Return a load from the stack slot. 
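The 64-bit trampoline assembled above is 23 bytes: a movabs of the target into r11, a movabs of the nest value into r10, then an indirect jump. Written out literally, with offsets matching the DAG.getStore calls (REX_WB is 0x49; the two 8-byte immediates are filled in at lowering time). This byte image is an illustration of the encoding, not code from the tree:

    #include <cstdint>

    // Byte image of the x86-64 trampoline built above; offsets match the
    // stores (opcode at 0, target at 2, opcode at 10, nest at 12, jmp at 20,
    // ModRM at 22).
    const uint8_t Trampoline64[23] = {
      0x49, 0xBB,                       //  0: movabsq $target, %r11
      0, 0, 0, 0, 0, 0, 0, 0,           //  2: 8-byte function address
      0x49, 0xBA,                       // 10: movabsq $nest, %r10
      0, 0, 0, 0, 0, 0, 0, 0,           // 12: 8-byte nest parameter
      0x49, 0xFF, 0xE3,                 // 20: jmpq *%r11 (REX.W is ignored)
    };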
- Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, NULL, 0)); + Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, NULL, 0, + false, false, 0)); } return; } @@ -7542,14 +7704,6 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, Results.push_back(edx.getValue(1)); return; } - case ISD::SDIV: - case ISD::UDIV: - case ISD::SREM: - case ISD::UREM: { - EVT WidenVT = getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); - Results.push_back(DAG.UnrollVectorOp(N, WidenVT.getVectorNumElements())); - return; - } case ISD::ATOMIC_CMP_SWAP: { EVT T = N->getValueType(0); assert (T == MVT::i64 && "Only know how to expand i64 Cmp and Swap"); @@ -7647,6 +7801,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::INSERTPS: return "X86ISD::INSERTPS"; case X86ISD::PINSRB: return "X86ISD::PINSRB"; case X86ISD::PINSRW: return "X86ISD::PINSRW"; + case X86ISD::MMX_PINSRW: return "X86ISD::MMX_PINSRW"; case X86ISD::PSHUFB: return "X86ISD::PSHUFB"; case X86ISD::FMAX: return "X86ISD::FMAX"; case X86ISD::FMIN: return "X86ISD::FMIN"; @@ -7691,6 +7846,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM"; case X86ISD::PTEST: return "X86ISD::PTEST"; case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; + case X86ISD::MINGW_ALLOCA: return "X86ISD::MINGW_ALLOCA"; } } @@ -7748,13 +7904,13 @@ bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM, bool X86TargetLowering::isTruncateFree(const Type *Ty1, const Type *Ty2) const { - if (!Ty1->isInteger() || !Ty2->isInteger()) + if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) return false; unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); if (NumBits1 <= NumBits2) return false; - return Subtarget->is64Bit() || NumBits1 < 64; + return true; } bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { @@ -7764,12 +7920,12 @@ bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { unsigned NumBits2 = VT2.getSizeInBits(); if (NumBits1 <= NumBits2) return false; - return Subtarget->is64Bit() || NumBits1 < 64; + return true; } bool X86TargetLowering::isZExtFree(const Type *Ty1, const Type *Ty2) const { // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. - return Ty1->isInteger(32) && Ty2->isInteger(64) && Subtarget->is64Bit(); + return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit(); } bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const { @@ -7925,7 +8081,7 @@ X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr, MIB.addReg(EAXreg); // insert branch - BuildMI(newMBB, dl, TII->get(X86::JNE)).addMBB(newMBB); + BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); F->DeleteMachineInstr(bInstr); // The pseudo instruction is gone now. return nextMBB; @@ -8082,7 +8238,7 @@ X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr, MIB.addReg(X86::EDX); // insert branch - BuildMI(newMBB, dl, TII->get(X86::JNE)).addMBB(newMBB); + BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); F->DeleteMachineInstr(bInstr); // The pseudo instruction is gone now. 
return nextMBB; @@ -8185,7 +8341,7 @@ X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr, MIB.addReg(X86::EAX); // insert branch - BuildMI(newMBB, dl, TII->get(X86::JNE)).addMBB(newMBB); + BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); F->DeleteMachineInstr(mInstr); // The pseudo instruction is gone now. return nextMBB; @@ -8267,7 +8423,7 @@ X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( if (!Subtarget->isTargetWin64()) { // If %al is 0, branch around the XMM save block. BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg); - BuildMI(MBB, DL, TII->get(X86::JE)).addMBB(EndMBB); + BuildMI(MBB, DL, TII->get(X86::JE_4)).addMBB(EndMBB); MBB->addSuccessor(EndMBB); } @@ -8360,6 +8516,29 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, return BB; } +MachineBasicBlock * +X86TargetLowering::EmitLoweredMingwAlloca(MachineInstr *MI, + MachineBasicBlock *BB, + DenseMap *EM) const { + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + DebugLoc DL = MI->getDebugLoc(); + MachineFunction *F = BB->getParent(); + + // The lowering is pretty easy: we're just emitting the call to _alloca. The + // non-trivial part is impdef of ESP. + // FIXME: The code should be tweaked as soon as we'll try to do codegen for + // mingw-w64. + + BuildMI(BB, DL, TII->get(X86::CALLpcrel32)) + .addExternalSymbol("_alloca") + .addReg(X86::EAX, RegState::Implicit) + .addReg(X86::ESP, RegState::Implicit) + .addReg(X86::EAX, RegState::Define | RegState::Implicit) + .addReg(X86::ESP, RegState::Define | RegState::Implicit); + + F->DeleteMachineInstr(MI); // The pseudo instruction is gone now. + return BB; +} MachineBasicBlock * X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, @@ -8367,6 +8546,8 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, DenseMap *EM) const { switch (MI->getOpcode()) { default: assert(false && "Unexpected instr type to insert"); + case X86::MINGW_ALLOCA: + return EmitLoweredMingwAlloca(MI, BB, EM); case X86::CMOV_GR8: case X86::CMOV_V1I64: case X86::CMOV_FR32: @@ -8374,6 +8555,11 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case X86::CMOV_V4F32: case X86::CMOV_V2F64: case X86::CMOV_V2I64: + case X86::CMOV_GR16: + case X86::CMOV_GR32: + case X86::CMOV_RFP32: + case X86::CMOV_RFP64: + case X86::CMOV_RFP80: return EmitLoweredSelect(MI, BB, EM); case X86::FP32_TO_INT16_IN_MEM: @@ -8456,6 +8642,21 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, F->DeleteMachineInstr(MI); // The pseudo instruction is gone now. return BB; } + // DBG_VALUE. Only the frame index case is done here. + case X86::DBG_VALUE: { + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + DebugLoc DL = MI->getDebugLoc(); + X86AddressMode AM; + MachineFunction *F = BB->getParent(); + AM.BaseType = X86AddressMode::FrameIndexBase; + AM.Base.FrameIndex = MI->getOperand(0).getImm(); + addFullAddress(BuildMI(BB, DL, TII->get(X86::DBG_VALUE)), AM). + addImm(MI->getOperand(1).getImm()). + addMetadata(MI->getOperand(2).getMetadata()); + F->DeleteMachineInstr(MI); // Remove pseudo. + return BB; + } + // String/text processing lowering. 
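The atomic custom inserters above all build the same machine-level cycle: load, apply the operation, lock cmpxchg, and a JNE_4 back edge on failure. The same control flow expressed with std::atomic, here for an atomic OR; this is a model of the loop, not the generated code:

    #include <atomic>
    #include <cstdio>

    // The retry cycle the custom inserter emits for an atomic bitwise RMW:
    // reload on failure and re-apply the operation until cmpxchg succeeds
    // (the failed compare-exchange plays the role of the JNE_4 back edge).
    unsigned atomicFetchOr(std::atomic<unsigned> &v, unsigned mask) {
      unsigned oldVal = v.load();
      while (!v.compare_exchange_weak(oldVal, oldVal | mask)) {
        // oldVal was refreshed by the failed exchange; try again.
      }
      return oldVal;
    }

    int main() {
      std::atomic<unsigned> flags{0x1};
      atomicFetchOr(flags, 0x4);
      std::printf("0x%x\n", flags.load());   // 0x5
    }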
+
+  // String/text processing lowering.
   case X86::PCMPISTRM128REG:
     return EmitPCMP(MI, BB, 3, false /* in-mem */);
@@ -8688,81 +8889,104 @@ bool X86TargetLowering::isGAPlusOffset(SDNode *N,
   return TargetLowering::isGAPlusOffset(N, GA, Offset);
 }

-static bool EltsFromConsecutiveLoads(ShuffleVectorSDNode *N, unsigned NumElems,
-                                     EVT EltVT, LoadSDNode *&LDBase,
-                                     unsigned &LastLoadedElt,
-                                     SelectionDAG &DAG, MachineFrameInfo *MFI,
-                                     const TargetLowering &TLI) {
-  LDBase = NULL;
-  LastLoadedElt = -1U;
-  for (unsigned i = 0; i < NumElems; ++i) {
-    if (N->getMaskElt(i) < 0) {
-      if (!LDBase)
-        return false;
-      continue;
-    }
-
-    SDValue Elt = DAG.getShuffleScalarElt(N, i);
-    if (!Elt.getNode() ||
-        (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
-      return false;
-    if (!LDBase) {
-      if (Elt.getNode()->getOpcode() == ISD::UNDEF)
-        return false;
-      LDBase = cast<LoadSDNode>(Elt.getNode());
-      LastLoadedElt = i;
-      continue;
-    }
-    if (Elt.getOpcode() == ISD::UNDEF)
-      continue;
-
-    LoadSDNode *LD = cast<LoadSDNode>(Elt);
-    if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i))
-      return false;
-    LastLoadedElt = i;
-  }
-  return true;
-}
-
 /// PerformShuffleCombine - Combine a vector_shuffle that is equal to
 /// build_vector load1, load2, load3, load4, <0, 1, 2, 3> into a 128-bit load
 /// if the load addresses are consecutive, non-overlapping, and in the right
-/// order.  In the case of v2i64, it will see if it can rewrite the
-/// shuffle to be an appropriate build vector so it can take advantage of
-// performBuildVectorCombine.
+/// order.
 static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
                                      const TargetLowering &TLI) {
   DebugLoc dl = N->getDebugLoc();
   EVT VT = N->getValueType(0);
-  EVT EltVT = VT.getVectorElementType();
   ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
-  unsigned NumElems = VT.getVectorNumElements();

   if (VT.getSizeInBits() != 128)
     return SDValue();

-  // Try to combine a vector_shuffle into a 128-bit load.
-  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
-  LoadSDNode *LD = NULL;
-  unsigned LastLoadedElt;
-  if (!EltsFromConsecutiveLoads(SVN, NumElems, EltVT, LD, LastLoadedElt, DAG,
-                                MFI, TLI))
+  SmallVector<SDValue, 16> Elts;
+  for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
+    Elts.push_back(DAG.getShuffleScalarElt(SVN, i));
+
+  return EltsFromConsecutiveLoads(VT, Elts, dl, DAG);
+}
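// [Editor's note: illustration, not part of the patch.] The kind of source
// pattern the new EXTRACT_VECTOR_ELT combine below targets: all four i32
// lanes of a vector are extracted and widened, e.g. to build gather indices.
// Without the combine, code like this lowers to a chain of lane shuffles
// plus register moves; with it, one 16-byte spill and four scalar loads.
// A hand-written SSE2 equivalent of that input pattern:
#include <emmintrin.h>
#include <cstdint>
int64_t sum_lanes(__m128i v) {
  int64_t s = 0;
  s += (int32_t)_mm_cvtsi128_si32(v);                           // lane 0
  s += (int32_t)_mm_cvtsi128_si32(_mm_shuffle_epi32(v, 0x55));  // lane 1
  s += (int32_t)_mm_cvtsi128_si32(_mm_shuffle_epi32(v, 0xAA));  // lane 2
  s += (int32_t)_mm_cvtsi128_si32(_mm_shuffle_epi32(v, 0xFF));  // lane 3
  return s;  // each extract feeds a sign-extend to i64
}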
+/// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
+/// generation and convert it from being a bunch of shuffles and extracts
+/// to a simple store and scalar loads to extract the elements.
+static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
+                                                const TargetLowering &TLI) {
+  SDValue InputVector = N->getOperand(0);
+
+  // Only operate on vectors of 4 elements, where the alternative shuffling
+  // gets to be more expensive.
+  if (InputVector.getValueType() != MVT::v4i32)
     return SDValue();

-  if (LastLoadedElt == NumElems - 1) {
-    if (DAG.InferPtrAlignment(LD->getBasePtr()) >= 16)
-      return DAG.getLoad(VT, dl, LD->getChain(), LD->getBasePtr(),
-                         LD->getSrcValue(), LD->getSrcValueOffset(),
-                         LD->isVolatile());
-    return DAG.getLoad(VT, dl, LD->getChain(), LD->getBasePtr(),
-                       LD->getSrcValue(), LD->getSrcValueOffset(),
-                       LD->isVolatile(), LD->getAlignment());
-  } else if (NumElems == 4 && LastLoadedElt == 1) {
-    SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
-    SDValue Ops[] = { LD->getChain(), LD->getBasePtr() };
-    SDValue ResNode = DAG.getNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2);
-    return DAG.getNode(ISD::BIT_CONVERT, dl, VT, ResNode);
+  // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
+  // single use which is a sign-extend or zero-extend, and all elements are
+  // used.
+  SmallVector<SDNode *, 4> Uses;
+  unsigned ExtractedElements = 0;
+  for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
+       UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
+    if (UI.getUse().getResNo() != InputVector.getResNo())
+      return SDValue();
+
+    SDNode *Extract = *UI;
+    if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+      return SDValue();
+
+    if (Extract->getValueType(0) != MVT::i32)
+      return SDValue();
+    if (!Extract->hasOneUse())
+      return SDValue();
+    if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
+        Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
+      return SDValue();
+    if (!isa<ConstantSDNode>(Extract->getOperand(1)))
+      return SDValue();
+
+    // Record which element was extracted.
+    ExtractedElements |=
+      1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue();
+
+    Uses.push_back(Extract);
   }
+
+  // If not all the elements were used, this may not be worthwhile.
+  if (ExtractedElements != 15)
+    return SDValue();
+
+  // Ok, we've now decided to do the transformation.
+  DebugLoc dl = InputVector.getDebugLoc();
+
+  // Store the value to a temporary stack slot.
+  SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
+  SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
+                            NULL, 0, false, false, 0);
+
+  // Replace each use (extract) with a load of the appropriate element.
+  for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
+       UE = Uses.end(); UI != UE; ++UI) {
+    SDNode *Extract = *UI;
+
+    // Compute the element's address.
+    SDValue Idx = Extract->getOperand(1);
+    unsigned EltSize =
+        InputVector.getValueType().getVectorElementType().getSizeInBits()/8;
+    uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue();
+    SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy());
+
+    SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, Idx.getValueType(),
+                                     OffsetVal, StackPtr);
+
+    // Load the scalar.
+    SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch,
+                                     ScalarAddr, NULL, 0, false, false, 0);
+
+    // Replace the extract with the load.
+    DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), LoadScalar);
+  }
+
+  // The replacement was made in place; don't return anything.
   return SDValue();
 }
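// [Editor's note: illustration, not part of the patch.] A scalar model of
// the transformation above: spill the whole vector to a stack temporary
// once, then satisfy each extract with an ordinary scalar load at
// StackPtr + Idx * EltSize (EltSize is 4 for v4i32):
#include <cstdint>
#include <cstring>
void extract_all_lanes(const int32_t vec[4], int64_t out[4]) {
  int32_t slot[4];                       // the CreateStackTemporary slot
  std::memcpy(slot, vec, sizeof(slot));  // one vector-sized store (Ch)
  for (int i = 0; i < 4; ++i)
    out[i] = slot[i];                    // scalar load, then sign-extend
}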
@@ -8776,10 +9000,9 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
   SDValue RHS = N->getOperand(2);

   // If we have SSE[12] support, try to form min/max nodes. SSE min/max
-  // instructions have the peculiarity that if either operand is a NaN,
-  // they chose what we call the RHS operand (and as such are not symmetric).
-  // It happens that this matches the semantics of the common C idiom
-  // x<y?x:y and related forms, so we can recognize these cases.
+  // instructions match the semantics of the common C idiom x<y?x:y but not
+  // x<=y?x:y, because of how they handle negative zero (which can be
+  // ignored in unsafe-math mode).
   if (Subtarget->hasSSE2() &&
       (LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64) &&
       Cond.getOpcode() == ISD::SETCC) {
@@ -8787,36 +9010,34 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
     unsigned Opcode = 0;
     // Check for x CC y ? x : y.
-    if (LHS == Cond.getOperand(0) && RHS == Cond.getOperand(1)) {
+    if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
+        DAG.isEqualTo(RHS, Cond.getOperand(1))) {
       switch (CC) {
       default: break;
       case ISD::SETULT:
-        // This can be a min if we can prove that at least one of the operands
-        // is not a nan.
-        if (!FiniteOnlyFPMath()) {
-          if (DAG.isKnownNeverNaN(RHS)) {
-            // Put the potential NaN in the RHS so that SSE will preserve it.
-            std::swap(LHS, RHS);
-          } else if (!DAG.isKnownNeverNaN(LHS))
+        // Converting this to a min would handle NaNs incorrectly, and swapping
+        // the operands would cause it to handle comparisons between positive
+        // and negative zero incorrectly.
+        if (!FiniteOnlyFPMath() &&
+            (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))) {
+          if (!UnsafeFPMath &&
+              !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
             break;
+          std::swap(LHS, RHS);
         }
         Opcode = X86ISD::FMIN;
         break;
       case ISD::SETOLE:
-        // This can be a min if we can prove that at least one of the operands
-        // is not a nan.
-        if (!FiniteOnlyFPMath()) {
-          if (DAG.isKnownNeverNaN(LHS)) {
-            // Put the potential NaN in the RHS so that SSE will preserve it.
-            std::swap(LHS, RHS);
-          } else if (!DAG.isKnownNeverNaN(RHS))
-            break;
-        }
+        // Converting this to a min would handle comparisons between positive
+        // and negative zero incorrectly.
+        if (!UnsafeFPMath &&
+            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
+          break;
         Opcode = X86ISD::FMIN;
         break;
       case ISD::SETULE:
-        // This can be a min, but if either operand is a NaN we need it to
-        // preserve the original LHS.
+        // Converting this to a min would handle both negative zeros and NaNs
+        // incorrectly, but we can swap the operands to fix both.
         std::swap(LHS, RHS);
       case ISD::SETOLT:
       case ISD::SETLT:
@@ -8825,32 +9046,29 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
         break;

       case ISD::SETOGE:
-        // This can be a max if we can prove that at least one of the operands
-        // is not a nan.
-        if (!FiniteOnlyFPMath()) {
-          if (DAG.isKnownNeverNaN(LHS)) {
-            // Put the potential NaN in the RHS so that SSE will preserve it.
-            std::swap(LHS, RHS);
-          } else if (!DAG.isKnownNeverNaN(RHS))
-            break;
-        }
+        // Converting this to a max would handle comparisons between positive
+        // and negative zero incorrectly.
+        if (!UnsafeFPMath &&
+            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
+          break;
         Opcode = X86ISD::FMAX;
         break;
       case ISD::SETUGT:
-        // This can be a max if we can prove that at least one of the operands
-        // is not a nan.
-        if (!FiniteOnlyFPMath()) {
-          if (DAG.isKnownNeverNaN(RHS)) {
-            // Put the potential NaN in the RHS so that SSE will preserve it.
-            std::swap(LHS, RHS);
-          } else if (!DAG.isKnownNeverNaN(LHS))
+        // Converting this to a max would handle NaNs incorrectly, and swapping
+        // the operands would cause it to handle comparisons between positive
+        // and negative zero incorrectly.
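        // [Editor's note: background, not part of the patch.] The reason
        // these guards are one-sided is that SSE MINSS/MAXSS are not
        // symmetric; a scalar model of MINSS is
        //   float sse_min(float a, float b) { return a < b ? a : b; }
        // If either operand is NaN the compare is false and b is returned,
        // and since -0.0 < +0.0 also compares false, min(-0.0, +0.0) yields
        // +0.0 while min(+0.0, -0.0) yields -0.0. Hence each case must rule
        // out NaNs (FiniteOnlyFPMath/isKnownNeverNaN) before converting, and
        // rule out signed zeros (UnsafeFPMath/isKnownNeverZero) before
        // swapping operands.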
+        if (!FiniteOnlyFPMath() &&
+            (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))) {
+          if (!UnsafeFPMath &&
+              !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
             break;
+          std::swap(LHS, RHS);
         }
         Opcode = X86ISD::FMAX;
         break;
       case ISD::SETUGE:
-        // This can be a max, but if either operand is a NaN we need it to
-        // preserve the original LHS.
+        // Converting this to a max would handle both negative zeros and NaNs
+        // incorrectly, but we can swap the operands to fix both.
         std::swap(LHS, RHS);
       case ISD::SETOGT:
       case ISD::SETGT:
@@ -8859,36 +9077,33 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
         break;
       }
     // Check for x CC y ? y : x -- a min/max with reversed arms.
-    } else if (LHS == Cond.getOperand(1) && RHS == Cond.getOperand(0)) {
+    } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
+               DAG.isEqualTo(RHS, Cond.getOperand(0))) {
       switch (CC) {
       default: break;
       case ISD::SETOGE:
-        // This can be a min if we can prove that at least one of the operands
-        // is not a nan.
-        if (!FiniteOnlyFPMath()) {
-          if (DAG.isKnownNeverNaN(RHS)) {
-            // Put the potential NaN in the RHS so that SSE will preserve it.
-            std::swap(LHS, RHS);
-          } else if (!DAG.isKnownNeverNaN(LHS))
+        // Converting this to a min would handle comparisons between positive
+        // and negative zero incorrectly, and swapping the operands would
+        // cause it to handle NaNs incorrectly.
+        if (!UnsafeFPMath &&
+            !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
+          if (!FiniteOnlyFPMath() &&
+              (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
             break;
+          std::swap(LHS, RHS);
         }
         Opcode = X86ISD::FMIN;
         break;
       case ISD::SETUGT:
-        // This can be a min if we can prove that at least one of the operands
-        // is not a nan.
-        if (!FiniteOnlyFPMath()) {
-          if (DAG.isKnownNeverNaN(LHS)) {
-            // Put the potential NaN in the RHS so that SSE will preserve it.
-            std::swap(LHS, RHS);
-          } else if (!DAG.isKnownNeverNaN(RHS))
-            break;
-        }
+        // Converting this to a min would handle NaNs incorrectly.
+        if (!UnsafeFPMath &&
+            (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
+          break;
         Opcode = X86ISD::FMIN;
         break;
       case ISD::SETUGE:
-        // This can be a min, but if either operand is a NaN we need it to
-        // preserve the original LHS.
+        // Converting this to a min would handle both negative zeros and NaNs
+        // incorrectly, but we can swap the operands to fix both.
         std::swap(LHS, RHS);
       case ISD::SETOGT:
       case ISD::SETGT:
@@ -8897,32 +9112,28 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
         break;

       case ISD::SETULT:
-        // This can be a max if we can prove that at least one of the operands
-        // is not a nan.
-        if (!FiniteOnlyFPMath()) {
-          if (DAG.isKnownNeverNaN(LHS)) {
-            // Put the potential NaN in the RHS so that SSE will preserve it.
-            std::swap(LHS, RHS);
-          } else if (!DAG.isKnownNeverNaN(RHS))
-            break;
-        }
+        // Converting this to a max would handle NaNs incorrectly.
+        if (!FiniteOnlyFPMath() &&
+            (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
+          break;
         Opcode = X86ISD::FMAX;
         break;
       case ISD::SETOLE:
-        // This can be a max if we can prove that at least one of the operands
-        // is not a nan.
-        if (!FiniteOnlyFPMath()) {
-          if (DAG.isKnownNeverNaN(RHS)) {
-            // Put the potential NaN in the RHS so that SSE will preserve it.
-            std::swap(LHS, RHS);
-          } else if (!DAG.isKnownNeverNaN(LHS))
+        // Converting this to a max would handle comparisons between positive
+        // and negative zero incorrectly, and swapping the operands would
+        // cause it to handle NaNs incorrectly.
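        // [Editor's note: worked example, not part of the patch.] Why
        // swapping is only safe when signed zeros cannot occur: with
        // sse_min(a, b) = (a < b ? a : b), the compare -0.0 < +0.0 is false
        // either way, so the second operand is always chosen. Thus
        // sse_min(-0.0f, +0.0f) == +0.0f but sse_min(+0.0f, -0.0f) == -0.0f;
        // exchanging LHS and RHS silently flips which zero is produced.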
+        if (!UnsafeFPMath &&
+            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
+          if (!FiniteOnlyFPMath() &&
+              (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
             break;
+          std::swap(LHS, RHS);
         }
         Opcode = X86ISD::FMAX;
         break;
       case ISD::SETULE:
-        // This can be a max, but if either operand is a NaN we need it to
-        // preserve the original LHS.
+        // Converting this to a max would handle both negative zeros and NaNs
+        // incorrectly, but we can swap the operands to fix both.
         std::swap(LHS, RHS);
       case ISD::SETOLT:
       case ISD::SETLT:
@@ -9147,10 +9358,6 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
 /// LEA + SHL, LEA + LEA.
 static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
                                  TargetLowering::DAGCombinerInfo &DCI) {
-  if (DAG.getMachineFunction().
-      getFunction()->hasFnAttr(Attribute::OptimizeForSize))
-    return SDValue();
-
   if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
     return SDValue();

@@ -9289,7 +9496,7 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
     }
   } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
-      unsigned SplatIdx = cast<ShuffleVectorSDNode>(ShAmtOp)->getSplatIndex();
+      unsigned SplatIdx= cast<ShuffleVectorSDNode>(ShAmtOp)->getSplatIndex();
       if (C->getZExtValue() == SplatIdx)
         BaseShAmt = InVec.getOperand(1);
     }
@@ -9475,7 +9682,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
     SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
                                 Ld->getSrcValue(), Ld->getSrcValueOffset(),
                                 Ld->isVolatile(),
-                                Ld->getAlignment());
+                                Ld->isNonTemporal(), Ld->getAlignment());
     SDValue NewChain = NewLd.getValue(1);
     if (TokenFactorIndex != -1) {
       Ops.push_back(NewChain);
@@ -9484,7 +9691,8 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
     }
     return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
                         St->getSrcValue(), St->getSrcValueOffset(),
-                        St->isVolatile(), St->getAlignment());
+                        St->isVolatile(), St->isNonTemporal(),
+                        St->getAlignment());
   }

   // Otherwise, lower to two pairs of 32-bit loads / stores.
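// [Editor's note: illustration, not part of the patch.] The i64 load/store
// split performed in the next hunk, modeled in portable C++: the low half
// is copied at offset 0 and the high half at offset +4, which is also why
// the high-half accesses only get MinAlign(align, 4):
#include <cstdint>
#include <cstring>
void copy_i64_as_two_i32(const void *src, void *dst) {
  uint32_t lo, hi;
  std::memcpy(&lo, static_cast<const char *>(src) + 0, 4);  // LoLd
  std::memcpy(&hi, static_cast<const char *>(src) + 4, 4);  // HiLd
  std::memcpy(static_cast<char *>(dst) + 0, &lo, 4);        // LoSt
  std::memcpy(static_cast<char *>(dst) + 4, &hi, 4);        // HiSt
}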
@@ -9494,10 +9702,11 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
     SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
                                Ld->getSrcValue(), Ld->getSrcValueOffset(),
-                               Ld->isVolatile(), Ld->getAlignment());
+                               Ld->isVolatile(), Ld->isNonTemporal(),
+                               Ld->getAlignment());
     SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
                                Ld->getSrcValue(), Ld->getSrcValueOffset()+4,
-                               Ld->isVolatile(),
+                               Ld->isVolatile(), Ld->isNonTemporal(),
                                MinAlign(Ld->getAlignment(), 4));

     SDValue NewChain = LoLd.getValue(1);
@@ -9514,11 +9723,13 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,

     SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr,
                                 St->getSrcValue(), St->getSrcValueOffset(),
-                                St->isVolatile(), St->getAlignment());
+                                St->isVolatile(), St->isNonTemporal(),
+                                St->getAlignment());
     SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr,
                                 St->getSrcValue(),
                                 St->getSrcValueOffset() + 4,
                                 St->isVolatile(),
+                                St->isNonTemporal(),
                                 MinAlign(St->getAlignment(), 4));
     return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
   }
@@ -9668,6 +9879,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   switch (N->getOpcode()) {
   default: break;
   case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, *this);
+  case ISD::EXTRACT_VECTOR_ELT:
+                        return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, *this);
   case ISD::SELECT:         return PerformSELECTCombine(N, DAG, Subtarget);
   case X86ISD::CMOV:        return PerformCMOVCombine(N, DAG, DCI);
   case ISD::MUL:            return PerformMulCombine(N, DAG, DCI);
@@ -9701,7 +9914,7 @@ static bool LowerToBSwap(CallInst *CI) {
   // Verify this is a simple bswap.
   if (CI->getNumOperands() != 2 ||
       CI->getType() != CI->getOperand(1)->getType() ||
-      !CI->getType()->isInteger())
+      !CI->getType()->isIntegerTy())
     return false;

   const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
@@ -9750,17 +9963,27 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
       return LowerToBSwap(CI);
     }
     // rorw $$8, ${0:w}  -->  llvm.bswap.i16
-    if (CI->getType()->isInteger(16) &&
+    if (CI->getType()->isIntegerTy(16) &&
        AsmPieces.size() == 3 &&
-       AsmPieces[0] == "rorw" &&
+       (AsmPieces[0] == "rorw" || AsmPieces[0] == "rolw") &&
        AsmPieces[1] == "$$8," &&
        AsmPieces[2] == "${0:w}" &&
-       IA->getConstraintString() == "=r,0,~{dirflag},~{fpsr},~{flags},~{cc}") {
-      return LowerToBSwap(CI);
+       IA->getConstraintString().compare(0, 5, "=r,0,") == 0) {
+      AsmPieces.clear();
+      const std::string &Constraints = IA->getConstraintString();
+      SplitString(StringRef(Constraints).substr(5), AsmPieces, ",");
+      std::sort(AsmPieces.begin(), AsmPieces.end());
+      if (AsmPieces.size() == 4 &&
+          AsmPieces[0] == "~{cc}" &&
+          AsmPieces[1] == "~{dirflag}" &&
+          AsmPieces[2] == "~{flags}" &&
+          AsmPieces[3] == "~{fpsr}") {
+        return LowerToBSwap(CI);
+      }
     }
     break;
   case 3:
-    if (CI->getType()->isInteger(64) &&
+    if (CI->getType()->isIntegerTy(64) &&
        Constraints.size() >= 2 &&
        Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
        Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
@@ -10203,41 +10426,3 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
   return Res;
 }
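// [Editor's note: illustration, not part of the patch.] Why the hunk above
// can treat "rolw $8" the same as "rorw $8": rotating a 16-bit value by 8
// in either direction swaps its two bytes, which is exactly llvm.bswap.i16.
// A C++ equivalent:
#include <cstdint>
uint16_t bswap16(uint16_t x) {
  return (uint16_t)((x << 8) | (x >> 8));  // same result as rorw/rolw $8
}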
-
-//===----------------------------------------------------------------------===//
-//                           X86 Widen vector type
-//===----------------------------------------------------------------------===//
-
-/// getWidenVectorType: given a vector type, returns the type to widen
-/// to (e.g., v7i8 to v8i8). If the vector type is legal, it returns itself.
-/// If there is no vector type that we want to widen to, returns MVT::Other
-/// When and where to widen is target dependent based on the cost of
-/// scalarizing vs using the wider vector type.
-
-EVT X86TargetLowering::getWidenVectorType(EVT VT) const {
-  assert(VT.isVector());
-  if (isTypeLegal(VT))
-    return VT;
-
-  // TODO: In computeRegisterProperty, we can compute the list of legal vector
-  //       type based on element type.  This would speed up our search (though
-  //       it may not be worth it since the size of the list is relatively
-  //       small).
-  EVT EltVT = VT.getVectorElementType();
-  unsigned NElts = VT.getVectorNumElements();
-
-  // On X86, it make sense to widen any vector wider than 1
-  if (NElts <= 1)
-    return MVT::Other;
-
-  for (unsigned nVT = MVT::FIRST_VECTOR_VALUETYPE;
-       nVT <= MVT::LAST_VECTOR_VALUETYPE; ++nVT) {
-    EVT SVT = (MVT::SimpleValueType)nVT;
-
-    if (isTypeLegal(SVT) &&
-        SVT.getVectorElementType() == EltVT &&
-        SVT.getVectorNumElements() > NElts)
-      return SVT;
-  }
-  return MVT::Other;
-}
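// [Editor's note: context, not part of the patch; the rationale is an
// inference, not stated in the diff.] getWidenVectorType searched the MVT
// vector types for a legal type with the same element type and more
// elements (e.g. v7i8 -> v8i8). Its removal here, together with the
// deleted SDIV/UDIV/SREM/UREM custom-widening setup earlier in the patch,
// is consistent with the generic SelectionDAG type legalizer taking over
// vector widening, which would make this target-specific hook redundant.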