X-Git-Url: http://plrg.eecs.uci.edu/git/?p=oota-llvm.git;a=blobdiff_plain;f=lib%2FTarget%2FX86%2FX86FrameLowering.cpp;h=ef1a2d7acc946f21608367ed5db3eb1c5e88acda;hp=4c1374f70f4231a69e325972539771b2aa2754ad;hb=c1896335867238b01457db7e5a2a1a66d81a39d9;hpb=de70176f5ff5465cb32828ffcd70797c6ccb1f81 diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp index 4c1374f70f4..ef1a2d7acc9 100644 --- a/lib/Target/X86/X86FrameLowering.cpp +++ b/lib/Target/X86/X86FrameLowering.cpp @@ -29,6 +29,8 @@ #include "llvm/MC/MCSymbol.h" #include "llvm/Support/CommandLine.h" #include "llvm/Target/TargetOptions.h" +#include "llvm/Support/Debug.h" +#include using namespace llvm; @@ -36,7 +38,34 @@ using namespace llvm; extern cl::opt ForceStackAlign; bool X86FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const { - return !MF.getFrameInfo()->hasVarSizedObjects(); + return !MF.getFrameInfo()->hasVarSizedObjects() && + !MF.getInfo()->getHasPushSequences(); +} + +/// canSimplifyCallFramePseudos - If there is a reserved call frame, the +/// call frame pseudos can be simplified. Having a FP, as in the default +/// implementation, is not sufficient here since we can't always use it. +/// Use a more nuanced condition. +bool +X86FrameLowering::canSimplifyCallFramePseudos(const MachineFunction &MF) const { + const X86RegisterInfo *TRI = static_cast + (MF.getSubtarget().getRegisterInfo()); + return hasReservedCallFrame(MF) || + (hasFP(MF) && !TRI->needsStackRealignment(MF)) + || TRI->hasBasePointer(MF); +} + +// needsFrameIndexResolution - Do we need to perform FI resolution for +// this function. Normally, this is required only when the function +// has any stack objects. However, FI resolution actually has another job, +// not apparent from the title - it resolves callframesetup/destroy +// that were not simplified earlier. +// So, this is required for x86 functions that have push sequences even +// when there are no stack objects. +bool +X86FrameLowering::needsFrameIndexResolution(const MachineFunction &MF) const { + return MF.getFrameInfo()->hasStackObjects() || + MF.getInfo()->getHasPushSequences(); } /// hasFP - Return true if the specified function should have a dedicated frame @@ -45,14 +74,15 @@ bool X86FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const { bool X86FrameLowering::hasFP(const MachineFunction &MF) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); const MachineModuleInfo &MMI = MF.getMMI(); - const TargetRegisterInfo *RegInfo = TM.getRegisterInfo(); + const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo(); return (MF.getTarget().Options.DisableFramePointerElim(MF) || RegInfo->needsStackRealignment(MF) || MFI->hasVarSizedObjects() || MFI->isFrameAddressTaken() || MFI->hasInlineAsmWithSPAdjust() || MF.getInfo()->getForceFramePointer() || - MMI.callsUnwindInit() || MMI.callsEHReturn()); + MMI.callsUnwindInit() || MMI.callsEHReturn() || + MFI->hasStackMap() || MFI->hasPatchPoint()); } static unsigned getSUBriOpcode(unsigned IsLP64, int64_t Imm) { @@ -79,6 +109,25 @@ static unsigned getADDriOpcode(unsigned IsLP64, int64_t Imm) { } } +static unsigned getSUBrrOpcode(unsigned isLP64) { + return isLP64 ? X86::SUB64rr : X86::SUB32rr; +} + +static unsigned getADDrrOpcode(unsigned isLP64) { + return isLP64 ? X86::ADD64rr : X86::ADD32rr; +} + +static unsigned getANDriOpcode(bool IsLP64, int64_t Imm) { + if (IsLP64) { + if (isInt<8>(Imm)) + return X86::AND64ri8; + return X86::AND64ri32; + } + if (isInt<8>(Imm)) + return X86::AND32ri8; + return X86::AND32ri; +} + static unsigned getLEArOpcode(unsigned IsLP64) { return IsLP64 ? X86::LEA64r : X86::LEA32r; } @@ -141,38 +190,76 @@ static unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB, return 0; } +static bool isEAXLiveIn(MachineFunction &MF) { + for (MachineRegisterInfo::livein_iterator II = MF.getRegInfo().livein_begin(), + EE = MF.getRegInfo().livein_end(); II != EE; ++II) { + unsigned Reg = II->first; + + if (Reg == X86::RAX || Reg == X86::EAX || Reg == X86::AX || + Reg == X86::AH || Reg == X86::AL) + return true; + } + + return false; +} /// emitSPUpdate - Emit a series of instructions to increment / decrement the /// stack pointer by a constant value. static void emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, unsigned StackPtr, int64_t NumBytes, - bool Is64Bit, bool IsLP64, bool UseLEA, + bool Is64BitTarget, bool Is64BitStackPtr, bool UseLEA, const TargetInstrInfo &TII, const TargetRegisterInfo &TRI) { bool isSub = NumBytes < 0; uint64_t Offset = isSub ? -NumBytes : NumBytes; unsigned Opc; if (UseLEA) - Opc = getLEArOpcode(IsLP64); + Opc = getLEArOpcode(Is64BitStackPtr); else Opc = isSub - ? getSUBriOpcode(IsLP64, Offset) - : getADDriOpcode(IsLP64, Offset); + ? getSUBriOpcode(Is64BitStackPtr, Offset) + : getADDriOpcode(Is64BitStackPtr, Offset); uint64_t Chunk = (1LL << 31) - 1; DebugLoc DL = MBB.findDebugLoc(MBBI); while (Offset) { - uint64_t ThisVal = (Offset > Chunk) ? Chunk : Offset; - if (ThisVal == (Is64Bit ? 8 : 4)) { + if (Offset > Chunk) { + // Rather than emit a long series of instructions for large offsets, + // load the offset into a register and do one sub/add + unsigned Reg = 0; + + if (isSub && !isEAXLiveIn(*MBB.getParent())) + Reg = (unsigned)(Is64BitTarget ? X86::RAX : X86::EAX); + else + Reg = findDeadCallerSavedReg(MBB, MBBI, TRI, Is64BitTarget); + + if (Reg) { + Opc = Is64BitTarget ? X86::MOV64ri : X86::MOV32ri; + BuildMI(MBB, MBBI, DL, TII.get(Opc), Reg) + .addImm(Offset); + Opc = isSub + ? getSUBrrOpcode(Is64BitTarget) + : getADDrrOpcode(Is64BitTarget); + MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr) + .addReg(StackPtr) + .addReg(Reg); + MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead. + Offset = 0; + continue; + } + } + + uint64_t ThisVal = std::min(Offset, Chunk); + if (ThisVal == (Is64BitTarget ? 8 : 4)) { // Use push / pop instead. unsigned Reg = isSub - ? (unsigned)(Is64Bit ? X86::RAX : X86::EAX) - : findDeadCallerSavedReg(MBB, MBBI, TRI, Is64Bit); + ? (unsigned)(Is64BitTarget ? X86::RAX : X86::EAX) + : findDeadCallerSavedReg(MBB, MBBI, TRI, Is64BitTarget); if (Reg) { Opc = isSub - ? (Is64Bit ? X86::PUSH64r : X86::PUSH32r) - : (Is64Bit ? X86::POP64r : X86::POP32r); + ? (Is64BitTarget ? X86::PUSH64r : X86::PUSH32r) + : (Is64BitTarget ? X86::POP64r : X86::POP32r); MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc)) .addReg(Reg, getDefRegState(!isSub) | getUndefRegState(isSub)); if (isSub) @@ -225,38 +312,6 @@ void mergeSPUpdatesUp(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, } } -/// mergeSPUpdatesDown - Merge two stack-manipulating instructions lower -/// iterator. -static -void mergeSPUpdatesDown(MachineBasicBlock &MBB, - MachineBasicBlock::iterator &MBBI, - unsigned StackPtr, uint64_t *NumBytes = nullptr) { - // FIXME: THIS ISN'T RUN!!! - return; - - if (MBBI == MBB.end()) return; - - MachineBasicBlock::iterator NI = std::next(MBBI); - if (NI == MBB.end()) return; - - unsigned Opc = NI->getOpcode(); - if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 || - Opc == X86::ADD32ri || Opc == X86::ADD32ri8) && - NI->getOperand(0).getReg() == StackPtr) { - if (NumBytes) - *NumBytes -= NI->getOperand(2).getImm(); - MBB.erase(NI); - MBBI = NI; - } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 || - Opc == X86::SUB32ri || Opc == X86::SUB32ri8) && - NI->getOperand(0).getReg() == StackPtr) { - if (NumBytes) - *NumBytes += NI->getOperand(2).getImm(); - MBB.erase(NI); - MBBI = NI; - } -} - /// mergeSPUpdates - Checks the instruction before/after the passed /// instruction. If it is an ADD/SUB/LEA instruction it is deleted argument and /// the stack adjustment is returned as a positive value for ADD/LEA and a @@ -292,78 +347,25 @@ static int mergeSPUpdates(MachineBasicBlock &MBB, return Offset; } -static bool isEAXLiveIn(MachineFunction &MF) { - for (MachineRegisterInfo::livein_iterator II = MF.getRegInfo().livein_begin(), - EE = MF.getRegInfo().livein_end(); II != EE; ++II) { - unsigned Reg = II->first; - - if (Reg == X86::EAX || Reg == X86::AX || - Reg == X86::AH || Reg == X86::AL) - return true; - } - - return false; -} - -void X86FrameLowering::emitCalleeSavedFrameMoves( - MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc DL, - unsigned FramePtr) const { +void +X86FrameLowering::emitCalleeSavedFrameMoves(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc DL) const { MachineFunction &MF = *MBB.getParent(); MachineFrameInfo *MFI = MF.getFrameInfo(); MachineModuleInfo &MMI = MF.getMMI(); const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo(); - const X86InstrInfo &TII = *TM.getInstrInfo(); + const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); // Add callee saved registers to move list. const std::vector &CSI = MFI->getCalleeSavedInfo(); if (CSI.empty()) return; - const X86RegisterInfo *RegInfo = TM.getRegisterInfo(); - bool HasFP = hasFP(MF); - - // Calculate amount of bytes used for return address storing. - int stackGrowth = -RegInfo->getSlotSize(); - - // FIXME: This is dirty hack. The code itself is pretty mess right now. - // It should be rewritten from scratch and generalized sometimes. - - // Determine maximum offset (minimum due to stack growth). - int64_t MaxOffset = 0; - for (std::vector::const_iterator - I = CSI.begin(), E = CSI.end(); I != E; ++I) - MaxOffset = std::min(MaxOffset, - MFI->getObjectOffset(I->getFrameIdx())); - // Calculate offsets. - int64_t saveAreaOffset = (HasFP ? 3 : 2) * stackGrowth; for (std::vector::const_iterator I = CSI.begin(), E = CSI.end(); I != E; ++I) { int64_t Offset = MFI->getObjectOffset(I->getFrameIdx()); unsigned Reg = I->getReg(); - Offset = MaxOffset - Offset + saveAreaOffset; - - // Don't output a new machine move if we're re-saving the frame - // pointer. This happens when the PrologEpilogInserter has inserted an extra - // "PUSH" of the frame pointer -- the "emitPrologue" method automatically - // generates one when frame pointers are used. If we generate a "machine - // move" for this extra "PUSH", the linker will lose track of the fact that - // the frame pointer should have the value of the first "PUSH" when it's - // trying to unwind. - // - // FIXME: This looks inelegant. It's possibly correct, but it's covering up - // another bug. I.e., one where we generate a prolog like this: - // - // pushl %ebp - // movl %esp, %ebp - // pushl %ebp - // pushl %esi - // ... - // - // The immediate re-push of EBP is unnecessary. At the least, it's an - // optimization bug. EBP can be used as a scratch register in certain - // cases, but probably not when we have a frame pointer. - if (HasFP && FramePtr == Reg) - continue; unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true); unsigned CFIIndex = @@ -391,65 +393,243 @@ static bool usesTheStack(const MachineFunction &MF) { return false; } +void X86FrameLowering::emitStackProbeCall(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc DL) { + const X86Subtarget &STI = MF.getSubtarget(); + const TargetInstrInfo &TII = *STI.getInstrInfo(); + bool Is64Bit = STI.is64Bit(); + bool IsLargeCodeModel = MF.getTarget().getCodeModel() == CodeModel::Large; + + unsigned CallOp; + if (Is64Bit) + CallOp = IsLargeCodeModel ? X86::CALL64r : X86::CALL64pcrel32; + else + CallOp = X86::CALLpcrel32; + + const char *Symbol; + if (Is64Bit) { + if (STI.isTargetCygMing()) { + Symbol = "___chkstk_ms"; + } else { + Symbol = "__chkstk"; + } + } else if (STI.isTargetCygMing()) + Symbol = "_alloca"; + else + Symbol = "_chkstk"; + + MachineInstrBuilder CI; + + // All current stack probes take AX and SP as input, clobber flags, and + // preserve all registers. x86_64 probes leave RSP unmodified. + if (Is64Bit && MF.getTarget().getCodeModel() == CodeModel::Large) { + // For the large code model, we have to call through a register. Use R11, + // as it is scratch in all supported calling conventions. + BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::R11) + .addExternalSymbol(Symbol); + CI = BuildMI(MBB, MBBI, DL, TII.get(CallOp)).addReg(X86::R11); + } else { + CI = BuildMI(MBB, MBBI, DL, TII.get(CallOp)).addExternalSymbol(Symbol); + } + + unsigned AX = Is64Bit ? X86::RAX : X86::EAX; + unsigned SP = Is64Bit ? X86::RSP : X86::ESP; + CI.addReg(AX, RegState::Implicit) + .addReg(SP, RegState::Implicit) + .addReg(AX, RegState::Define | RegState::Implicit) + .addReg(SP, RegState::Define | RegState::Implicit) + .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); + + if (Is64Bit) { + // MSVC x64's __chkstk and cygwin/mingw's ___chkstk_ms do not adjust %rsp + // themselves. It also does not clobber %rax so we can reuse it when + // adjusting %rsp. + BuildMI(MBB, MBBI, DL, TII.get(X86::SUB64rr), X86::RSP) + .addReg(X86::RSP) + .addReg(X86::RAX); + } +} + +static unsigned calculateSetFPREG(uint64_t SPAdjust) { + // Win64 ABI has a less restrictive limitation of 240; 128 works equally well + // and might require smaller successive adjustments. + const uint64_t Win64MaxSEHOffset = 128; + uint64_t SEHFrameOffset = std::min(SPAdjust, Win64MaxSEHOffset); + // Win64 ABI requires 16-byte alignment for the UWOP_SET_FPREG opcode. + return SEHFrameOffset & -16; +} + +// If we're forcing a stack realignment we can't rely on just the frame +// info, we need to know the ABI stack alignment as well in case we +// have a call out. Otherwise just make sure we have some alignment - we'll +// go with the minimum SlotSize. +static uint64_t calculateMaxStackAlign(const MachineFunction &MF) { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + uint64_t MaxAlign = MFI->getMaxAlignment(); // Desired stack alignment. + const X86Subtarget &STI = MF.getSubtarget(); + const X86RegisterInfo *RegInfo = STI.getRegisterInfo(); + unsigned SlotSize = RegInfo->getSlotSize(); + unsigned StackAlign = STI.getFrameLowering()->getStackAlignment(); + if (ForceStackAlign) { + if (MFI->hasCalls()) + MaxAlign = (StackAlign > MaxAlign) ? StackAlign : MaxAlign; + else if (MaxAlign < SlotSize) + MaxAlign = SlotSize; + } + return MaxAlign; +} + /// emitPrologue - Push callee-saved registers onto the stack, which /// automatically adjust the stack pointer. Adjust the stack pointer to allocate /// space for local variables. Also emit labels used by the exception handler to /// generate the exception handling frames. -void X86FrameLowering::emitPrologue(MachineFunction &MF) const { - MachineBasicBlock &MBB = MF.front(); // Prologue goes in entry BB. + +/* + Here's a gist of what gets emitted: + + ; Establish frame pointer, if needed + [if needs FP] + push %rbp + .cfi_def_cfa_offset 16 + .cfi_offset %rbp, -16 + .seh_pushreg %rpb + mov %rsp, %rbp + .cfi_def_cfa_register %rbp + + ; Spill general-purpose registers + [for all callee-saved GPRs] + pushq % + [if not needs FP] + .cfi_def_cfa_offset (offset from RETADDR) + .seh_pushreg % + + ; If the required stack alignment > default stack alignment + ; rsp needs to be re-aligned. This creates a "re-alignment gap" + ; of unknown size in the stack frame. + [if stack needs re-alignment] + and $MASK, %rsp + + ; Allocate space for locals + [if target is Windows and allocated space > 4096 bytes] + ; Windows needs special care for allocations larger + ; than one page. + mov $NNN, %rax + call ___chkstk_ms/___chkstk + sub %rax, %rsp + [else] + sub $NNN, %rsp + + [if needs FP] + .seh_stackalloc (size of XMM spill slots) + .seh_setframe %rbp, SEHFrameOffset ; = size of all spill slots + [else] + .seh_stackalloc NNN + + ; Spill XMMs + ; Note, that while only Windows 64 ABI specifies XMMs as callee-preserved, + ; they may get spilled on any platform, if the current function + ; calls @llvm.eh.unwind.init + [if needs FP] + [for all callee-saved XMM registers] + movaps %, -MMM(%rbp) + [for all callee-saved XMM registers] + .seh_savexmm %, (-MMM + SEHFrameOffset) + ; i.e. the offset relative to (%rbp - SEHFrameOffset) + [else] + [for all callee-saved XMM registers] + movaps %, KKK(%rsp) + [for all callee-saved XMM registers] + .seh_savexmm %, KKK + + .seh_endprologue + + [if needs base pointer] + mov %rsp, %rbx + [if needs to restore base pointer] + mov %rsp, -MMM(%rbp) + + ; Emit CFI info + [if needs FP] + [for all callee-saved registers] + .cfi_offset %, (offset from %rbp) + [else] + .cfi_def_cfa_offset (offset from RETADDR) + [for all callee-saved registers] + .cfi_offset %, (offset from %rsp) + + Notes: + - .seh directives are emitted only for Windows 64 ABI + - .cfi directives are emitted for all other ABIs + - for 32-bit code, substitute %e?? registers for %r?? +*/ + +void X86FrameLowering::emitPrologue(MachineFunction &MF, + MachineBasicBlock &MBB) const { + assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported"); MachineBasicBlock::iterator MBBI = MBB.begin(); MachineFrameInfo *MFI = MF.getFrameInfo(); const Function *Fn = MF.getFunction(); - const X86RegisterInfo *RegInfo = TM.getRegisterInfo(); - const X86InstrInfo &TII = *TM.getInstrInfo(); + const X86Subtarget &STI = MF.getSubtarget(); + const X86RegisterInfo *RegInfo = STI.getRegisterInfo(); + const TargetInstrInfo &TII = *STI.getInstrInfo(); MachineModuleInfo &MMI = MF.getMMI(); X86MachineFunctionInfo *X86FI = MF.getInfo(); - bool needsFrameMoves = MMI.hasDebugInfo() || - Fn->needsUnwindTableEntry(); - uint64_t MaxAlign = MFI->getMaxAlignment(); // Desired stack alignment. + uint64_t MaxAlign = calculateMaxStackAlign(MF); // Desired stack alignment. uint64_t StackSize = MFI->getStackSize(); // Number of bytes to allocate. bool HasFP = hasFP(MF); bool Is64Bit = STI.is64Bit(); - bool IsLP64 = STI.isTarget64BitLP64(); - bool IsWin64 = STI.isTargetWin64(); + // standard x86_64 and NaCl use 64-bit frame/stack pointers, x32 - 32-bit. + const bool Uses64BitFramePtr = STI.isTarget64BitLP64() || STI.isTargetNaCl64(); + bool IsWin64 = STI.isCallingConvWin64(Fn->getCallingConv()); + // Not necessarily synonymous with IsWin64. + bool IsWinEH = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); + bool NeedsWinEH = IsWinEH && Fn->needsUnwindTableEntry(); + bool NeedsDwarfCFI = + !IsWinEH && (MMI.hasDebugInfo() || Fn->needsUnwindTableEntry()); bool UseLEA = STI.useLeaForSP(); - unsigned StackAlign = getStackAlignment(); unsigned SlotSize = RegInfo->getSlotSize(); unsigned FramePtr = RegInfo->getFrameRegister(MF); + const unsigned MachineFramePtr = + STI.isTarget64BitILP32() + ? getX86SubSuperRegister(FramePtr, MVT::i64, false) + : FramePtr; unsigned StackPtr = RegInfo->getStackRegister(); unsigned BasePtr = RegInfo->getBaseRegister(); DebugLoc DL; - // If we're forcing a stack realignment we can't rely on just the frame - // info, we need to know the ABI stack alignment as well in case we - // have a call out. Otherwise just make sure we have some alignment - we'll - // go with the minimum SlotSize. - if (ForceStackAlign) { - if (MFI->hasCalls()) - MaxAlign = (StackAlign > MaxAlign) ? StackAlign : MaxAlign; - else if (MaxAlign < SlotSize) - MaxAlign = SlotSize; - } - // Add RETADDR move area to callee saved frame size. int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta(); + if (TailCallReturnAddrDelta && IsWinEH) + report_fatal_error("Can't handle guaranteed tail call under win64 yet"); + if (TailCallReturnAddrDelta < 0) X86FI->setCalleeSavedFrameSize( X86FI->getCalleeSavedFrameSize() - TailCallReturnAddrDelta); + bool UseStackProbe = (STI.isOSWindows() && !STI.isTargetMachO()); + + // The default stack probe size is 4096 if the function has no stackprobesize + // attribute. + unsigned StackProbeSize = 4096; + if (Fn->hasFnAttribute("stack-probe-size")) + Fn->getFnAttribute("stack-probe-size") + .getValueAsString() + .getAsInteger(0, StackProbeSize); + // If this is x86-64 and the Red Zone is not disabled, if we are a leaf // function, and use up to 128 bytes of stack space, don't have a frame // pointer, calls, or dynamic alloca then we do not need to adjust the // stack pointer (we fit in the Red Zone). We also check that we don't // push and pop from the stack. - if (Is64Bit && !Fn->getAttributes().hasAttribute(AttributeSet::FunctionIndex, - Attribute::NoRedZone) && + if (Is64Bit && !Fn->hasFnAttribute(Attribute::NoRedZone) && !RegInfo->needsStackRealignment(MF) && - !MFI->hasVarSizedObjects() && // No dynamic alloca. - !MFI->adjustsStack() && // No calls. - !IsWin64 && // Win64 has no Red Zone - !usesTheStack(MF) && // Don't push and pop. - !MF.shouldSplitStack()) { // Regular stack + !MFI->hasVarSizedObjects() && // No dynamic alloca. + !MFI->adjustsStack() && // No calls. + !IsWin64 && // Win64 has no Red Zone + !usesTheStack(MF) && // Don't push and pop. + !MF.shouldSplitStack()) { // Regular stack uint64_t MinSize = X86FI->getCalleeSavedFrameSize(); if (HasFP) MinSize += SlotSize; StackSize = std::max(MinSize, StackSize > 128 ? StackSize - 128 : 0); @@ -462,7 +642,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { if (TailCallReturnAddrDelta < 0) { MachineInstr *MI = BuildMI(MBB, MBBI, DL, - TII.get(getSUBriOpcode(IsLP64, -TailCallReturnAddrDelta)), + TII.get(getSUBriOpcode(Uses64BitFramePtr, -TailCallReturnAddrDelta)), StackPtr) .addReg(StackPtr) .addImm(-TailCallReturnAddrDelta) @@ -490,14 +670,15 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { if (HasFP) { // Calculate required stack adjustment. uint64_t FrameSize = StackSize - SlotSize; - if (RegInfo->needsStackRealignment(MF)) { - // Callee-saved registers are pushed on stack before the stack - // is realigned. - FrameSize -= X86FI->getCalleeSavedFrameSize(); - NumBytes = (FrameSize + MaxAlign - 1) / MaxAlign * MaxAlign; - } else { - NumBytes = FrameSize - X86FI->getCalleeSavedFrameSize(); - } + // If required, include space for extra hidden slot for stashing base pointer. + if (X86FI->getRestoreBasePointer()) + FrameSize += SlotSize; + + NumBytes = FrameSize - X86FI->getCalleeSavedFrameSize(); + + // Callee-saved registers are pushed on stack before the stack is realigned. + if (RegInfo->needsStackRealignment(MF) && !IsWinEH) + NumBytes = RoundUpToAlignment(NumBytes, MaxAlign); // Get the offset of the stack slot for the EBP register, which is // guaranteed to be the last slot by processFunctionBeforeFrameFinalized. @@ -506,10 +687,10 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { // Save EBP/RBP into the appropriate stack slot. BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::PUSH64r : X86::PUSH32r)) - .addReg(FramePtr, RegState::Kill) + .addReg(MachineFramePtr, RegState::Kill) .setMIFlag(MachineInstr::FrameSetup); - if (needsFrameMoves) { + if (NeedsDwarfCFI) { // Mark the place where EBP/RBP was saved. // Define the current CFA rule to use the provided offset. assert(StackSize); @@ -519,7 +700,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { .addCFIIndex(CFIIndex); // Change the rule for the FramePtr to be an "offset" rule. - unsigned DwarfFramePtr = RegInfo->getDwarfRegNum(FramePtr, true); + unsigned DwarfFramePtr = RegInfo->getDwarfRegNum(MachineFramePtr, true); CFIIndex = MMI.addFrameInst( MCCFIInstruction::createOffset(nullptr, DwarfFramePtr, 2 * stackGrowth)); @@ -527,26 +708,34 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { .addCFIIndex(CFIIndex); } - // Update EBP with the new base value. - BuildMI(MBB, MBBI, DL, - TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr), FramePtr) - .addReg(StackPtr) - .setMIFlag(MachineInstr::FrameSetup); + if (NeedsWinEH) { + BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg)) + .addImm(FramePtr) + .setMIFlag(MachineInstr::FrameSetup); + } - if (needsFrameMoves) { + if (!IsWinEH) { + // Update EBP with the new base value. + BuildMI(MBB, MBBI, DL, + TII.get(Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr), + FramePtr) + .addReg(StackPtr) + .setMIFlag(MachineInstr::FrameSetup); + } + + if (NeedsDwarfCFI) { // Mark effective beginning of when frame pointer becomes valid. // Define the current CFA to use the EBP/RBP register. - unsigned DwarfFramePtr = RegInfo->getDwarfRegNum(FramePtr, true); + unsigned DwarfFramePtr = RegInfo->getDwarfRegNum(MachineFramePtr, true); unsigned CFIIndex = MMI.addFrameInst( MCCFIInstruction::createDefCfaRegister(nullptr, DwarfFramePtr)); BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex); } - // Mark the FramePtr as live-in in every block except the entry. - for (MachineFunction::iterator I = std::next(MF.begin()), E = MF.end(); - I != E; ++I) - I->addLiveIn(FramePtr); + // Mark the FramePtr as live-in in every block. + for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) + I->addLiveIn(MachineFramePtr); } else { NumBytes = StackSize - X86FI->getCalleeSavedFrameSize(); } @@ -559,10 +748,10 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { (MBBI->getOpcode() == X86::PUSH32r || MBBI->getOpcode() == X86::PUSH64r)) { PushedRegs = true; - MBBI->setFlag(MachineInstr::FrameSetup); + unsigned Reg = MBBI->getOperand(0).getReg(); ++MBBI; - if (!HasFP && needsFrameMoves) { + if (!HasFP && NeedsDwarfCFI) { // Mark callee-saved push instruction. // Define the current CFA rule to use the provided offset. assert(StackSize); @@ -572,24 +761,25 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { .addCFIIndex(CFIIndex); StackOffset += stackGrowth; } + + if (NeedsWinEH) { + BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg)).addImm(Reg).setMIFlag( + MachineInstr::FrameSetup); + } } // Realign stack after we pushed callee-saved registers (so that we'll be // able to calculate their offsets from the frame pointer). - - // NOTE: We push the registers before realigning the stack, so - // vector callee-saved (xmm) registers may be saved w/o proper - // alignment in this way. However, currently these regs are saved in - // stack slots (see X86FrameLowering::spillCalleeSavedRegisters()), so - // this shouldn't be a problem. - if (RegInfo->needsStackRealignment(MF)) { + // Don't do this for Win64, it needs to realign the stack after the prologue. + if (!IsWinEH && RegInfo->needsStackRealignment(MF)) { assert(HasFP && "There should be a frame pointer if stack is realigned."); + uint64_t Val = -MaxAlign; MachineInstr *MI = - BuildMI(MBB, MBBI, DL, - TII.get(Is64Bit ? X86::AND64ri32 : X86::AND32ri), StackPtr) - .addReg(StackPtr) - .addImm(-MaxAlign) - .setMIFlag(MachineInstr::FrameSetup); + BuildMI(MBB, MBBI, DL, TII.get(getANDriOpcode(Uses64BitFramePtr, Val)), + StackPtr) + .addReg(StackPtr) + .addImm(Val) + .setMIFlag(MachineInstr::FrameSetup); // The EFLAGS implicit def is dead. MI->getOperand(3).setIsDead(); @@ -600,10 +790,6 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { // the callee has more arguments then the caller. NumBytes -= mergeSPUpdates(MBB, MBBI, StackPtr, true); - // If there is an ADD32ri or SUB32ri of ESP immediately after this - // instruction, merge the two instructions. - mergeSPUpdatesDown(MBB, MBBI, StackPtr, &NumBytes); - // Adjust stack pointer: ESP -= numbytes. // Windows and cygwin/mingw require a prologue helper routine when allocating @@ -614,20 +800,10 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { // responsible for adjusting the stack pointer. Touching the stack at 4K // increments is necessary to ensure that the guard pages used by the OS // virtual memory manager are allocated in correct sequence. - if (NumBytes >= 4096 && STI.isOSWindows() && !STI.isTargetMacho()) { - const char *StackProbeSymbol; - - if (Is64Bit) { - if (STI.isTargetCygMing()) { - StackProbeSymbol = "___chkstk_ms"; - } else { - StackProbeSymbol = "__chkstk"; - } - } else if (STI.isTargetCygMing()) - StackProbeSymbol = "_alloca"; - else - StackProbeSymbol = "_chkstk"; - + uint64_t AlignedNumBytes = NumBytes; + if (IsWinEH && RegInfo->needsStackRealignment(MF)) + AlignedNumBytes = RoundUpToAlignment(AlignedNumBytes, MaxAlign); + if (AlignedNumBytes >= StackProbeSize && UseStackProbe) { // Check whether EAX is livein for this function. bool isEAXAlive = isEAXLiveIn(MF); @@ -645,9 +821,19 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { if (Is64Bit) { // Handle the 64-bit Windows ABI case where we need to call __chkstk. // Function prologue is responsible for adjusting the stack pointer. - BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::RAX) - .addImm(NumBytes) - .setMIFlag(MachineInstr::FrameSetup); + if (isUInt<32>(NumBytes)) { + BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX) + .addImm(NumBytes) + .setMIFlag(MachineInstr::FrameSetup); + } else if (isInt<32>(NumBytes)) { + BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri32), X86::RAX) + .addImm(NumBytes) + .setMIFlag(MachineInstr::FrameSetup); + } else { + BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::RAX) + .addImm(NumBytes) + .setMIFlag(MachineInstr::FrameSetup); + } } else { // Allocate NumBytes-4 bytes on stack in case of isEAXAlive. // We'll also use 4 already allocated bytes for EAX. @@ -656,47 +842,113 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { .setMIFlag(MachineInstr::FrameSetup); } - BuildMI(MBB, MBBI, DL, - TII.get(Is64Bit ? X86::W64ALLOCA : X86::CALLpcrel32)) - .addExternalSymbol(StackProbeSymbol) - .addReg(StackPtr, RegState::Define | RegState::Implicit) - .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit) - .setMIFlag(MachineInstr::FrameSetup); + // Save a pointer to the MI where we set AX. + MachineBasicBlock::iterator SetRAX = MBBI; + --SetRAX; + + // Call __chkstk, __chkstk_ms, or __alloca. + emitStackProbeCall(MF, MBB, MBBI, DL); + + // Apply the frame setup flag to all inserted instrs. + for (; SetRAX != MBBI; ++SetRAX) + SetRAX->setFlag(MachineInstr::FrameSetup); - if (Is64Bit) { - // MSVC x64's __chkstk and cygwin/mingw's ___chkstk_ms do not adjust %rsp - // themself. It also does not clobber %rax so we can reuse it when - // adjusting %rsp. - BuildMI(MBB, MBBI, DL, TII.get(X86::SUB64rr), StackPtr) - .addReg(StackPtr) - .addReg(X86::RAX) - .setMIFlag(MachineInstr::FrameSetup); - } if (isEAXAlive) { - // Restore EAX - MachineInstr *MI = addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm), - X86::EAX), - StackPtr, false, NumBytes - 4); - MI->setFlag(MachineInstr::FrameSetup); - MBB.insert(MBBI, MI); + // Restore EAX + MachineInstr *MI = addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm), + X86::EAX), + StackPtr, false, NumBytes - 4); + MI->setFlag(MachineInstr::FrameSetup); + MBB.insert(MBBI, MI); } - } else if (NumBytes) - emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit, IsLP64, + } else if (NumBytes) { + emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit, Uses64BitFramePtr, UseLEA, TII, *RegInfo); + } + + if (NeedsWinEH && NumBytes) + BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_StackAlloc)) + .addImm(NumBytes) + .setMIFlag(MachineInstr::FrameSetup); + + int SEHFrameOffset = 0; + if (IsWinEH && HasFP) { + SEHFrameOffset = calculateSetFPREG(NumBytes); + if (SEHFrameOffset) + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::LEA64r), FramePtr), + StackPtr, false, SEHFrameOffset); + else + BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64rr), FramePtr).addReg(StackPtr); + + if (NeedsWinEH) + BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SetFrame)) + .addImm(FramePtr) + .addImm(SEHFrameOffset) + .setMIFlag(MachineInstr::FrameSetup); + } + + while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup)) { + const MachineInstr *FrameInstr = &*MBBI; + ++MBBI; + + if (NeedsWinEH) { + int FI; + if (unsigned Reg = TII.isStoreToStackSlot(FrameInstr, FI)) { + if (X86::FR64RegClass.contains(Reg)) { + int Offset = getFrameIndexOffset(MF, FI); + Offset += SEHFrameOffset; + + BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SaveXMM)) + .addImm(Reg) + .addImm(Offset) + .setMIFlag(MachineInstr::FrameSetup); + } + } + } + } + + if (NeedsWinEH) + BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_EndPrologue)) + .setMIFlag(MachineInstr::FrameSetup); + + // Realign stack after we spilled callee-saved registers (so that we'll be + // able to calculate their offsets from the frame pointer). + // Win64 requires aligning the stack after the prologue. + if (IsWinEH && RegInfo->needsStackRealignment(MF)) { + assert(HasFP && "There should be a frame pointer if stack is realigned."); + uint64_t Val = -MaxAlign; + MachineInstr *MI = + BuildMI(MBB, MBBI, DL, TII.get(getANDriOpcode(Uses64BitFramePtr, Val)), + StackPtr) + .addReg(StackPtr) + .addImm(Val) + .setMIFlag(MachineInstr::FrameSetup); + + // The EFLAGS implicit def is dead. + MI->getOperand(3).setIsDead(); + } // If we need a base pointer, set it up here. It's whatever the value // of the stack pointer is at this point. Any variable size objects // will be allocated after this, so we can still use the base pointer // to reference locals. if (RegInfo->hasBasePointer(MF)) { - // Update the frame pointer with the current stack pointer. - unsigned Opc = Is64Bit ? X86::MOV64rr : X86::MOV32rr; + // Update the base pointer with the current stack pointer. + unsigned Opc = Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr; BuildMI(MBB, MBBI, DL, TII.get(Opc), BasePtr) .addReg(StackPtr) .setMIFlag(MachineInstr::FrameSetup); + if (X86FI->getRestoreBasePointer()) { + // Stash value of base pointer. Saving RSP instead of EBP shortens dependence chain. + unsigned Opm = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr; + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opm)), + FramePtr, true, X86FI->getRestoreBasePointerOffset()) + .addReg(StackPtr) + .setMIFlag(MachineInstr::FrameSetup); + } } - if (( (!HasFP && NumBytes) || PushedRegs) && needsFrameMoves) { + if (((!HasFP && NumBytes) || PushedRegs) && NeedsDwarfCFI) { // Mark end of stack pointer adjustment. if (!HasFP && NumBytes) { // Define the current CFA rule to use the provided offset. @@ -711,7 +963,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { // Emit DWARF info specifying the offsets of the callee-saved registers. if (PushedRegs) - emitCalleeSavedFrameMoves(MBB, MBBI, DL, HasFP ? FramePtr : StackPtr); + emitCalleeSavedFrameMoves(MBB, MBBI, DL); } } @@ -719,23 +971,45 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); X86MachineFunctionInfo *X86FI = MF.getInfo(); - const X86RegisterInfo *RegInfo = TM.getRegisterInfo(); - const X86InstrInfo &TII = *TM.getInstrInfo(); + const X86Subtarget &STI = MF.getSubtarget(); + const X86RegisterInfo *RegInfo = STI.getRegisterInfo(); + const TargetInstrInfo &TII = *STI.getInstrInfo(); MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); assert(MBBI != MBB.end() && "Returning block has no instructions"); unsigned RetOpcode = MBBI->getOpcode(); DebugLoc DL = MBBI->getDebugLoc(); bool Is64Bit = STI.is64Bit(); - bool IsLP64 = STI.isTarget64BitLP64(); - bool UseLEA = STI.useLeaForSP(); - unsigned StackAlign = getStackAlignment(); + // standard x86_64 and NaCl use 64-bit frame/stack pointers, x32 - 32-bit. + const bool Uses64BitFramePtr = STI.isTarget64BitLP64() || STI.isTargetNaCl64(); + bool HasFP = hasFP(MF); + const bool Is64BitILP32 = STI.isTarget64BitILP32(); unsigned SlotSize = RegInfo->getSlotSize(); unsigned FramePtr = RegInfo->getFrameRegister(MF); + unsigned MachineFramePtr = + Is64BitILP32 ? getX86SubSuperRegister(FramePtr, MVT::i64, false) + : FramePtr; unsigned StackPtr = RegInfo->getStackRegister(); + bool IsWinEH = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); + bool NeedsWinEH = IsWinEH && MF.getFunction()->needsUnwindTableEntry(); + bool UseLEAForSP = false; + + // We can't use LEA instructions for adjusting the stack pointer if this is a + // leaf function in the Win64 ABI. Only ADD instructions may be used to + // deallocate the stack. + if (STI.useLeaForSP()) { + if (!IsWinEH) { + // We *aren't* using the Win64 ABI which means we are free to use LEA. + UseLEAForSP = true; + } else if (HasFP) { + // We *have* a frame pointer which means we are permitted to use LEA. + UseLEAForSP = true; + } + } + switch (RetOpcode) { default: - llvm_unreachable("Can only insert epilog into returning blocks"); + llvm_unreachable("Can only insert epilogue into returning blocks"); case X86::RETQ: case X86::RETL: case X86::RETIL: @@ -753,39 +1027,27 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, // Get the number of bytes to allocate from the FrameInfo. uint64_t StackSize = MFI->getStackSize(); - uint64_t MaxAlign = MFI->getMaxAlignment(); + uint64_t MaxAlign = calculateMaxStackAlign(MF); unsigned CSSize = X86FI->getCalleeSavedFrameSize(); uint64_t NumBytes = 0; - // If we're forcing a stack realignment we can't rely on just the frame - // info, we need to know the ABI stack alignment as well in case we - // have a call out. Otherwise just make sure we have some alignment - we'll - // go with the minimum. - if (ForceStackAlign) { - if (MFI->hasCalls()) - MaxAlign = (StackAlign > MaxAlign) ? StackAlign : MaxAlign; - else - MaxAlign = MaxAlign ? MaxAlign : 4; - } - if (hasFP(MF)) { // Calculate required stack adjustment. uint64_t FrameSize = StackSize - SlotSize; - if (RegInfo->needsStackRealignment(MF)) { - // Callee-saved registers were pushed on stack before the stack - // was realigned. - FrameSize -= CSSize; - NumBytes = (FrameSize + MaxAlign - 1) / MaxAlign * MaxAlign; - } else { - NumBytes = FrameSize - CSSize; - } + NumBytes = FrameSize - CSSize; + + // Callee-saved registers were pushed on stack before the stack was + // realigned. + if (RegInfo->needsStackRealignment(MF) && !IsWinEH) + NumBytes = RoundUpToAlignment(FrameSize, MaxAlign); // Pop EBP. BuildMI(MBB, MBBI, DL, - TII.get(Is64Bit ? X86::POP64r : X86::POP32r), FramePtr); + TII.get(Is64Bit ? X86::POP64r : X86::POP32r), MachineFramePtr); } else { NumBytes = StackSize - CSSize; } + uint64_t SEHStackAllocAmt = NumBytes; // Skip the callee-saved pop instructions. while (MBBI != MBB.begin()) { @@ -813,28 +1075,50 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, if (RegInfo->needsStackRealignment(MF) || MFI->hasVarSizedObjects()) { if (RegInfo->needsStackRealignment(MF)) MBBI = FirstCSPop; - if (CSSize != 0) { - unsigned Opc = getLEArOpcode(IsLP64); + unsigned SEHFrameOffset = calculateSetFPREG(SEHStackAllocAmt); + uint64_t LEAAmount = IsWinEH ? SEHStackAllocAmt - SEHFrameOffset : -CSSize; + + // There are only two legal forms of epilogue: + // - add SEHAllocationSize, %rsp + // - lea SEHAllocationSize(%FramePtr), %rsp + // + // 'mov %FramePtr, %rsp' will not be recognized as an epilogue sequence. + // However, we may use this sequence if we have a frame pointer because the + // effects of the prologue can safely be undone. + if (LEAAmount != 0) { + unsigned Opc = getLEArOpcode(Uses64BitFramePtr); addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr), - FramePtr, false, -CSSize); + FramePtr, false, LEAAmount); + --MBBI; } else { - unsigned Opc = (Is64Bit ? X86::MOV64rr : X86::MOV32rr); + unsigned Opc = (Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr); BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr) .addReg(FramePtr); + --MBBI; } } else if (NumBytes) { // Adjust stack pointer back: ESP += numbytes. - emitSPUpdate(MBB, MBBI, StackPtr, NumBytes, Is64Bit, IsLP64, UseLEA, - TII, *RegInfo); + emitSPUpdate(MBB, MBBI, StackPtr, NumBytes, Is64Bit, Uses64BitFramePtr, + UseLEAForSP, TII, *RegInfo); + --MBBI; } + // Windows unwinder will not invoke function's exception handler if IP is + // either in prologue or in epilogue. This behavior causes a problem when a + // call immediately precedes an epilogue, because the return address points + // into the epilogue. To cope with that, we insert an epilogue marker here, + // then replace it with a 'nop' if it ends up immediately after a CALL in the + // final emitted code. + if (NeedsWinEH) + BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_Epilogue)); + // We're returning from function via eh_return. if (RetOpcode == X86::EH_RETURN || RetOpcode == X86::EH_RETURN64) { MBBI = MBB.getLastNonDebugInstr(); MachineOperand &DestAddr = MBBI->getOperand(0); assert(DestAddr.isReg() && "Offset should be in register!"); BuildMI(MBB, MBBI, DL, - TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr), + TII.get(Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr), StackPtr).addReg(DestAddr.getReg()); } else if (RetOpcode == X86::TCRETURNri || RetOpcode == X86::TCRETURNdi || RetOpcode == X86::TCRETURNmi || @@ -860,15 +1144,17 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, if (Offset) { // Check for possible merge with preceding ADD instruction. Offset += mergeSPUpdates(MBB, MBBI, StackPtr, true); - emitSPUpdate(MBB, MBBI, StackPtr, Offset, Is64Bit, IsLP64, - UseLEA, TII, *RegInfo); + emitSPUpdate(MBB, MBBI, StackPtr, Offset, Is64Bit, Uses64BitFramePtr, + UseLEAForSP, TII, *RegInfo); } // Jump to label or value in register. + bool IsWin64 = STI.isTargetWin64(); if (RetOpcode == X86::TCRETURNdi || RetOpcode == X86::TCRETURNdi64) { - MachineInstrBuilder MIB = - BuildMI(MBB, MBBI, DL, TII.get((RetOpcode == X86::TCRETURNdi) - ? X86::TAILJMPd : X86::TAILJMPd64)); + unsigned Op = (RetOpcode == X86::TCRETURNdi) + ? X86::TAILJMPd + : (IsWin64 ? X86::TAILJMPd64_REX : X86::TAILJMPd64); + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII.get(Op)); if (JumpTarget.isGlobal()) MIB.addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset(), JumpTarget.getTargetFlags()); @@ -878,14 +1164,16 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, JumpTarget.getTargetFlags()); } } else if (RetOpcode == X86::TCRETURNmi || RetOpcode == X86::TCRETURNmi64) { - MachineInstrBuilder MIB = - BuildMI(MBB, MBBI, DL, TII.get((RetOpcode == X86::TCRETURNmi) - ? X86::TAILJMPm : X86::TAILJMPm64)); + unsigned Op = (RetOpcode == X86::TCRETURNmi) + ? X86::TAILJMPm + : (IsWin64 ? X86::TAILJMPm64_REX : X86::TAILJMPm64); + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII.get(Op)); for (unsigned i = 0; i != 5; ++i) MIB.addOperand(MBBI->getOperand(i)); } else if (RetOpcode == X86::TCRETURNri64) { - BuildMI(MBB, MBBI, DL, TII.get(X86::TAILJMPr64)). - addReg(JumpTarget.getReg(), RegState::Kill); + BuildMI(MBB, MBBI, DL, + TII.get(IsWin64 ? X86::TAILJMPr64_REX : X86::TAILJMPr64)) + .addReg(JumpTarget.getReg(), RegState::Kill); } else { BuildMI(MBB, MBBI, DL, TII.get(X86::TAILJMPr)). addReg(JumpTarget.getReg(), RegState::Kill); @@ -905,24 +1193,58 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, // Check for possible merge with preceding ADD instruction. delta += mergeSPUpdates(MBB, MBBI, StackPtr, true); - emitSPUpdate(MBB, MBBI, StackPtr, delta, Is64Bit, IsLP64, UseLEA, TII, - *RegInfo); + emitSPUpdate(MBB, MBBI, StackPtr, delta, Is64Bit, Uses64BitFramePtr, + UseLEAForSP, TII, *RegInfo); } } int X86FrameLowering::getFrameIndexOffset(const MachineFunction &MF, int FI) const { const X86RegisterInfo *RegInfo = - static_cast(MF.getTarget().getRegisterInfo()); + MF.getSubtarget().getRegisterInfo(); const MachineFrameInfo *MFI = MF.getFrameInfo(); + // Offset will hold the offset from the stack pointer at function entry to the + // object. + // We need to factor in additional offsets applied during the prologue to the + // frame, base, and stack pointer depending on which is used. int Offset = MFI->getObjectOffset(FI) - getOffsetOfLocalArea(); + const X86MachineFunctionInfo *X86FI = MF.getInfo(); + unsigned CSSize = X86FI->getCalleeSavedFrameSize(); uint64_t StackSize = MFI->getStackSize(); + unsigned SlotSize = RegInfo->getSlotSize(); + bool HasFP = hasFP(MF); + bool IsWinEH = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); + int64_t FPDelta = 0; + + if (IsWinEH) { + assert(!MFI->hasCalls() || (StackSize % 16) == 8); + + // Calculate required stack adjustment. + uint64_t FrameSize = StackSize - SlotSize; + // If required, include space for extra hidden slot for stashing base pointer. + if (X86FI->getRestoreBasePointer()) + FrameSize += SlotSize; + uint64_t NumBytes = FrameSize - CSSize; + + uint64_t SEHFrameOffset = calculateSetFPREG(NumBytes); + if (FI && FI == X86FI->getFAIndex()) + return -SEHFrameOffset; + + // FPDelta is the offset from the "traditional" FP location of the old base + // pointer followed by return address and the location required by the + // restricted Win64 prologue. + // Add FPDelta to all offsets below that go through the frame pointer. + FPDelta = FrameSize - SEHFrameOffset; + assert((!MFI->hasCalls() || (FPDelta % 16) == 0) && + "FPDelta isn't aligned per the Win64 ABI!"); + } + if (RegInfo->hasBasePointer(MF)) { - assert (hasFP(MF) && "VLAs and dynamic stack realign, but no FP?!"); + assert(HasFP && "VLAs and dynamic stack realign, but no FP?!"); if (FI < 0) { // Skip the saved EBP. - return Offset + RegInfo->getSlotSize(); + return Offset + SlotSize + FPDelta; } else { assert((-(Offset + StackSize)) % MFI->getObjectAlignment(FI) == 0); return Offset + StackSize; @@ -930,33 +1252,32 @@ int X86FrameLowering::getFrameIndexOffset(const MachineFunction &MF, } else if (RegInfo->needsStackRealignment(MF)) { if (FI < 0) { // Skip the saved EBP. - return Offset + RegInfo->getSlotSize(); + return Offset + SlotSize + FPDelta; } else { assert((-(Offset + StackSize)) % MFI->getObjectAlignment(FI) == 0); return Offset + StackSize; } // FIXME: Support tail calls } else { - if (!hasFP(MF)) + if (!HasFP) return Offset + StackSize; // Skip the saved EBP. - Offset += RegInfo->getSlotSize(); + Offset += SlotSize; // Skip the RETADDR move area - const X86MachineFunctionInfo *X86FI = MF.getInfo(); int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta(); if (TailCallReturnAddrDelta < 0) Offset -= TailCallReturnAddrDelta; } - return Offset; + return Offset + FPDelta; } int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, unsigned &FrameReg) const { const X86RegisterInfo *RegInfo = - static_cast(MF.getTarget().getRegisterInfo()); + MF.getSubtarget().getRegisterInfo(); // We can't calculate offset from frame pointer if the stack is realigned, // so enforce usage of stack/base pointer. The base pointer is used when we // have dynamic allocas in addition to dynamic realignment. @@ -969,56 +1290,183 @@ int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, return getFrameIndexOffset(MF, FI); } -bool X86FrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - const std::vector &CSI, - const TargetRegisterInfo *TRI) const { - if (CSI.empty()) - return false; +// Simplified from getFrameIndexOffset keeping only StackPointer cases +int X86FrameLowering::getFrameIndexOffsetFromSP(const MachineFunction &MF, int FI) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + // Does not include any dynamic realign. + const uint64_t StackSize = MFI->getStackSize(); + { +#ifndef NDEBUG + const X86RegisterInfo *RegInfo = + MF.getSubtarget().getRegisterInfo(); + // Note: LLVM arranges the stack as: + // Args > Saved RetPC (<--FP) > CSRs > dynamic alignment (<--BP) + // > "Stack Slots" (<--SP) + // We can always address StackSlots from RSP. We can usually (unless + // needsStackRealignment) address CSRs from RSP, but sometimes need to + // address them from RBP. FixedObjects can be placed anywhere in the stack + // frame depending on their specific requirements (i.e. we can actually + // refer to arguments to the function which are stored in the *callers* + // frame). As a result, THE RESULT OF THIS CALL IS MEANINGLESS FOR CSRs + // AND FixedObjects IFF needsStackRealignment or hasVarSizedObject. + + assert(!RegInfo->hasBasePointer(MF) && "we don't handle this case"); + + // We don't handle tail calls, and shouldn't be seeing them + // either. + int TailCallReturnAddrDelta = + MF.getInfo()->getTCReturnAddrDelta(); + assert(!(TailCallReturnAddrDelta < 0) && "we don't handle this case!"); +#endif + } - DebugLoc DL = MBB.findDebugLoc(MI); + // This is how the math works out: + // + // %rsp grows (i.e. gets lower) left to right. Each box below is + // one word (eight bytes). Obj0 is the stack slot we're trying to + // get to. + // + // ---------------------------------- + // | BP | Obj0 | Obj1 | ... | ObjN | + // ---------------------------------- + // ^ ^ ^ ^ + // A B C E + // + // A is the incoming stack pointer. + // (B - A) is the local area offset (-8 for x86-64) [1] + // (C - A) is the Offset returned by MFI->getObjectOffset for Obj0 [2] + // + // |(E - B)| is the StackSize (absolute value, positive). For a + // stack that grown down, this works out to be (B - E). [3] + // + // E is also the value of %rsp after stack has been set up, and we + // want (C - E) -- the value we can add to %rsp to get to Obj0. Now + // (C - E) == (C - A) - (B - A) + (B - E) + // { Using [1], [2] and [3] above } + // == getObjectOffset - LocalAreaOffset + StackSize + // - MachineFunction &MF = *MBB.getParent(); + // Get the Offset from the StackPointer + int Offset = MFI->getObjectOffset(FI) - getOffsetOfLocalArea(); + + return Offset + StackSize; +} +// Simplified from getFrameIndexReference keeping only StackPointer cases +int X86FrameLowering::getFrameIndexReferenceFromSP(const MachineFunction &MF, + int FI, + unsigned &FrameReg) const { + const X86RegisterInfo *RegInfo = + MF.getSubtarget().getRegisterInfo(); + assert(!RegInfo->hasBasePointer(MF) && "we don't handle this case"); - unsigned SlotSize = STI.is64Bit() ? 8 : 4; - unsigned FPReg = TRI->getFrameRegister(MF); - unsigned CalleeFrameSize = 0; + FrameReg = RegInfo->getStackRegister(); + return getFrameIndexOffsetFromSP(MF, FI); +} - const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); +bool X86FrameLowering::assignCalleeSavedSpillSlots( + MachineFunction &MF, const TargetRegisterInfo *TRI, + std::vector &CSI) const { + MachineFrameInfo *MFI = MF.getFrameInfo(); + const X86RegisterInfo *RegInfo = + MF.getSubtarget().getRegisterInfo(); + unsigned SlotSize = RegInfo->getSlotSize(); X86MachineFunctionInfo *X86FI = MF.getInfo(); + unsigned CalleeSavedFrameSize = 0; + int SpillSlotOffset = getOffsetOfLocalArea() + X86FI->getTCReturnAddrDelta(); + + if (hasFP(MF)) { + // emitPrologue always spills frame register the first thing. + SpillSlotOffset -= SlotSize; + MFI->CreateFixedSpillStackObject(SlotSize, SpillSlotOffset); + + // Since emitPrologue and emitEpilogue will handle spilling and restoring of + // the frame register, we can delete it from CSI list and not have to worry + // about avoiding it later. + unsigned FPReg = RegInfo->getFrameRegister(MF); + for (unsigned i = 0; i < CSI.size(); ++i) { + if (TRI->regsOverlap(CSI[i].getReg(),FPReg)) { + CSI.erase(CSI.begin() + i); + break; + } + } + } + + // Assign slots for GPRs. It increases frame size. + for (unsigned i = CSI.size(); i != 0; --i) { + unsigned Reg = CSI[i - 1].getReg(); + + if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg)) + continue; + + SpillSlotOffset -= SlotSize; + CalleeSavedFrameSize += SlotSize; + + int SlotIndex = MFI->CreateFixedSpillStackObject(SlotSize, SpillSlotOffset); + CSI[i - 1].setFrameIdx(SlotIndex); + } + + X86FI->setCalleeSavedFrameSize(CalleeSavedFrameSize); + + // Assign slots for XMMs. + for (unsigned i = CSI.size(); i != 0; --i) { + unsigned Reg = CSI[i - 1].getReg(); + if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg)) + continue; + + const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg); + // ensure alignment + SpillSlotOffset -= std::abs(SpillSlotOffset) % RC->getAlignment(); + // spill into slot + SpillSlotOffset -= RC->getSize(); + int SlotIndex = + MFI->CreateFixedSpillStackObject(RC->getSize(), SpillSlotOffset); + CSI[i - 1].setFrameIdx(SlotIndex); + MFI->ensureMaxAlignment(RC->getAlignment()); + } + + return true; +} + +bool X86FrameLowering::spillCalleeSavedRegisters( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + const std::vector &CSI, + const TargetRegisterInfo *TRI) const { + DebugLoc DL = MBB.findDebugLoc(MI); + + MachineFunction &MF = *MBB.getParent(); + const X86Subtarget &STI = MF.getSubtarget(); + const TargetInstrInfo &TII = *STI.getInstrInfo(); + // Push GPRs. It increases frame size. unsigned Opc = STI.is64Bit() ? X86::PUSH64r : X86::PUSH32r; for (unsigned i = CSI.size(); i != 0; --i) { - unsigned Reg = CSI[i-1].getReg(); - if (!X86::GR64RegClass.contains(Reg) && - !X86::GR32RegClass.contains(Reg)) + unsigned Reg = CSI[i - 1].getReg(); + + if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg)) continue; // Add the callee-saved register as live-in. It's killed at the spill. MBB.addLiveIn(Reg); - if (Reg == FPReg) - // X86RegisterInfo::emitPrologue will handle spilling of frame register. - continue; - CalleeFrameSize += SlotSize; + BuildMI(MBB, MI, DL, TII.get(Opc)).addReg(Reg, RegState::Kill) .setMIFlag(MachineInstr::FrameSetup); } - X86FI->setCalleeSavedFrameSize(CalleeFrameSize); - // Make XMM regs spilled. X86 does not have ability of push/pop XMM. // It can be done by spilling XMMs to stack frame. - // Note that only Win64 ABI might spill XMMs. for (unsigned i = CSI.size(); i != 0; --i) { unsigned Reg = CSI[i-1].getReg(); - if (X86::GR64RegClass.contains(Reg) || - X86::GR32RegClass.contains(Reg)) + if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg)) continue; // Add the callee-saved register as live-in. It's killed at the spill. MBB.addLiveIn(Reg); const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); - TII.storeRegToStackSlot(MBB, MI, Reg, true, CSI[i-1].getFrameIdx(), - RC, TRI); + + TII.storeRegToStackSlot(MBB, MI, Reg, true, CSI[i - 1].getFrameIdx(), RC, + TRI); + --MI; + MI->setFlag(MachineInstr::FrameSetup); + ++MI; } return true; @@ -1034,7 +1482,8 @@ bool X86FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, DebugLoc DL = MBB.findDebugLoc(MI); MachineFunction &MF = *MBB.getParent(); - const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); + const X86Subtarget &STI = MF.getSubtarget(); + const TargetInstrInfo &TII = *STI.getInstrInfo(); // Reload XMMs from stack frame. for (unsigned i = 0, e = CSI.size(); i != e; ++i) { @@ -1042,22 +1491,19 @@ bool X86FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg)) continue; + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); - TII.loadRegFromStackSlot(MBB, MI, Reg, CSI[i].getFrameIdx(), - RC, TRI); + TII.loadRegFromStackSlot(MBB, MI, Reg, CSI[i].getFrameIdx(), RC, TRI); } // POP GPRs. - unsigned FPReg = TRI->getFrameRegister(MF); unsigned Opc = STI.is64Bit() ? X86::POP64r : X86::POP32r; for (unsigned i = 0, e = CSI.size(); i != e; ++i) { unsigned Reg = CSI[i].getReg(); if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg)) continue; - if (Reg == FPReg) - // X86RegisterInfo::emitEpilogue will handle restoring of frame register. - continue; + BuildMI(MBB, MI, DL, TII.get(Opc), Reg); } return true; @@ -1065,9 +1511,10 @@ bool X86FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, void X86FrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, - RegScavenger *RS) const { + RegScavenger *RS) const { MachineFrameInfo *MFI = MF.getFrameInfo(); - const X86RegisterInfo *RegInfo = TM.getRegisterInfo(); + const X86RegisterInfo *RegInfo = + MF.getSubtarget().getRegisterInfo(); unsigned SlotSize = RegInfo->getSlotSize(); X86MachineFunctionInfo *X86FI = MF.getInfo(); @@ -1087,22 +1534,6 @@ X86FrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, TailCallReturnAddrDelta - SlotSize, true); } - if (hasFP(MF)) { - assert((TailCallReturnAddrDelta <= 0) && - "The Delta should always be zero or negative"); - const TargetFrameLowering &TFI = *MF.getTarget().getFrameLowering(); - - // Create a frame entry for the EBP register that must be saved. - int FrameIdx = MFI->CreateFixedObject(SlotSize, - -(int)SlotSize + - TFI.getOffsetOfLocalArea() + - TailCallReturnAddrDelta, - true); - assert(FrameIdx == MFI->getObjectIndexBegin() && - "Slot for EBP register must be last in order to be found!"); - (void)FrameIdx; - } - // Spill the BasePtr if it's used. if (RegInfo->hasBasePointer(MF)) MF.getRegInfo().setPhysRegUsed(RegInfo->getBaseRegister()); @@ -1124,7 +1555,7 @@ HasNestArgument(const MachineFunction *MF) { /// and the properties of the function either one or two registers will be /// needed. Set primary to true for the first register, false for the second. static unsigned -GetScratchRegister(bool Is64Bit, const MachineFunction &MF, bool Primary) { +GetScratchRegister(bool Is64Bit, bool IsLP64, const MachineFunction &MF, bool Primary) { CallingConv::ID CallingConvention = MF.getFunction()->getCallingConv(); // Erlang stuff. @@ -1135,8 +1566,12 @@ GetScratchRegister(bool Is64Bit, const MachineFunction &MF, bool Primary) { return Primary ? X86::EBX : X86::EDI; } - if (Is64Bit) - return Primary ? X86::R11 : X86::R12; + if (Is64Bit) { + if (IsLP64) + return Primary ? X86::R11 : X86::R12; + else + return Primary ? X86::R11D : X86::R12D; + } bool IsNested = HasNestArgument(&MF); @@ -1156,24 +1591,28 @@ GetScratchRegister(bool Is64Bit, const MachineFunction &MF, bool Primary) { // limit. static const uint64_t kSplitStackAvailable = 256; -void -X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { - MachineBasicBlock &prologueMBB = MF.front(); +void X86FrameLowering::adjustForSegmentedStacks( + MachineFunction &MF, MachineBasicBlock &PrologueMBB) const { + assert(&PrologueMBB == &MF.front() && + "Shrink-wrapping is not implemented yet"); MachineFrameInfo *MFI = MF.getFrameInfo(); - const X86InstrInfo &TII = *TM.getInstrInfo(); + const X86Subtarget &STI = MF.getSubtarget(); + const TargetInstrInfo &TII = *STI.getInstrInfo(); uint64_t StackSize; bool Is64Bit = STI.is64Bit(); + const bool IsLP64 = STI.isTarget64BitLP64(); unsigned TlsReg, TlsOffset; DebugLoc DL; - unsigned ScratchReg = GetScratchRegister(Is64Bit, MF, true); + unsigned ScratchReg = GetScratchRegister(Is64Bit, IsLP64, MF, true); assert(!MF.getRegInfo().isLiveIn(ScratchReg) && "Scratch register is live-in"); if (MF.getFunction()->isVarArg()) report_fatal_error("Segmented stacks do not support vararg functions."); - if (!STI.isTargetLinux() && !STI.isTargetDarwin() && - !STI.isTargetWin32() && !STI.isTargetWin64() && !STI.isTargetFreeBSD()) + if (!STI.isTargetLinux() && !STI.isTargetDarwin() && !STI.isTargetWin32() && + !STI.isTargetWin64() && !STI.isTargetFreeBSD() && + !STI.isTargetDragonFly()) report_fatal_error("Segmented stacks not supported on this platform."); // Eventually StackSize will be calculated by a link-time pass; which will @@ -1197,14 +1636,15 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { // The MOV R10, RAX needs to be in a different block, since the RET we emit in // allocMBB needs to be last (terminating) instruction. - for (MachineBasicBlock::livein_iterator i = prologueMBB.livein_begin(), - e = prologueMBB.livein_end(); i != e; i++) { + for (MachineBasicBlock::livein_iterator i = PrologueMBB.livein_begin(), + e = PrologueMBB.livein_end(); + i != e; i++) { allocMBB->addLiveIn(*i); checkMBB->addLiveIn(*i); } if (IsNested) - allocMBB->addLiveIn(X86::R10); + allocMBB->addLiveIn(IsLP64 ? X86::R10 : X86::R10D); MF.push_front(allocMBB); MF.push_front(checkMBB); @@ -1217,7 +1657,7 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { if (Is64Bit) { if (STI.isTargetLinux()) { TlsReg = X86::FS; - TlsOffset = 0x70; + TlsOffset = IsLP64 ? 0x70 : 0x40; } else if (STI.isTargetDarwin()) { TlsReg = X86::GS; TlsOffset = 0x60 + 90*8; // See pthread_machdep.h. Steal TLS slot 90. @@ -1227,17 +1667,20 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { } else if (STI.isTargetFreeBSD()) { TlsReg = X86::FS; TlsOffset = 0x18; + } else if (STI.isTargetDragonFly()) { + TlsReg = X86::FS; + TlsOffset = 0x20; // use tls_tcb.tcb_segstack } else { report_fatal_error("Segmented stacks not supported on this platform."); } if (CompareStackPointer) - ScratchReg = X86::RSP; + ScratchReg = IsLP64 ? X86::RSP : X86::ESP; else - BuildMI(checkMBB, DL, TII.get(X86::LEA64r), ScratchReg).addReg(X86::RSP) + BuildMI(checkMBB, DL, TII.get(IsLP64 ? X86::LEA64r : X86::LEA64_32r), ScratchReg).addReg(X86::RSP) .addImm(1).addReg(0).addImm(-StackSize).addReg(0); - BuildMI(checkMBB, DL, TII.get(X86::CMP64rm)).addReg(ScratchReg) + BuildMI(checkMBB, DL, TII.get(IsLP64 ? X86::CMP64rm : X86::CMP32rm)).addReg(ScratchReg) .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg); } else { if (STI.isTargetLinux()) { @@ -1249,6 +1692,9 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { } else if (STI.isTargetWin32()) { TlsReg = X86::FS; TlsOffset = 0x14; // pvArbitrary, reserved for application use + } else if (STI.isTargetDragonFly()) { + TlsReg = X86::FS; + TlsOffset = 0x10; // use tls_tcb.tcb_segstack } else if (STI.isTargetFreeBSD()) { report_fatal_error("Segmented stacks not supported on FreeBSD i386."); } else { @@ -1261,7 +1707,8 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { BuildMI(checkMBB, DL, TII.get(X86::LEA32r), ScratchReg).addReg(X86::ESP) .addImm(1).addReg(0).addImm(-StackSize).addReg(0); - if (STI.isTargetLinux() || STI.isTargetWin32() || STI.isTargetWin64()) { + if (STI.isTargetLinux() || STI.isTargetWin32() || STI.isTargetWin64() || + STI.isTargetDragonFly()) { BuildMI(checkMBB, DL, TII.get(X86::CMP32rm)).addReg(ScratchReg) .addReg(0).addImm(0).addReg(0).addImm(TlsOffset).addReg(TlsReg); } else if (STI.isTargetDarwin()) { @@ -1271,11 +1718,11 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { bool SaveScratch2; if (CompareStackPointer) { // The primary scratch register is available for holding the TLS offset. - ScratchReg2 = GetScratchRegister(Is64Bit, MF, true); + ScratchReg2 = GetScratchRegister(Is64Bit, IsLP64, MF, true); SaveScratch2 = false; } else { // Need to use a second register to hold the TLS offset - ScratchReg2 = GetScratchRegister(Is64Bit, MF, false); + ScratchReg2 = GetScratchRegister(Is64Bit, IsLP64, MF, false); // Unfortunately, with fastcc the second scratch register may hold an // argument. @@ -1305,7 +1752,7 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { // This jump is taken if SP >= (Stacklet Limit + Stack Space required). // It jumps to normal execution of the function body. - BuildMI(checkMBB, DL, TII.get(X86::JA_4)).addMBB(&prologueMBB); + BuildMI(checkMBB, DL, TII.get(X86::JA_1)).addMBB(&PrologueMBB); // On 32 bit we first push the arguments size and then the frame size. On 64 // bit, we pass the stack frame size in r10 and the argument size in r11. @@ -1313,15 +1760,21 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { // Functions with nested arguments use R10, so it needs to be saved across // the call to _morestack + const unsigned RegAX = IsLP64 ? X86::RAX : X86::EAX; + const unsigned Reg10 = IsLP64 ? X86::R10 : X86::R10D; + const unsigned Reg11 = IsLP64 ? X86::R11 : X86::R11D; + const unsigned MOVrr = IsLP64 ? X86::MOV64rr : X86::MOV32rr; + const unsigned MOVri = IsLP64 ? X86::MOV64ri : X86::MOV32ri; + if (IsNested) - BuildMI(allocMBB, DL, TII.get(X86::MOV64rr), X86::RAX).addReg(X86::R10); + BuildMI(allocMBB, DL, TII.get(MOVrr), RegAX).addReg(Reg10); - BuildMI(allocMBB, DL, TII.get(X86::MOV64ri), X86::R10) + BuildMI(allocMBB, DL, TII.get(MOVri), Reg10) .addImm(StackSize); - BuildMI(allocMBB, DL, TII.get(X86::MOV64ri), X86::R11) + BuildMI(allocMBB, DL, TII.get(MOVri), Reg11) .addImm(X86FI->getArgumentStackSize()); - MF.getRegInfo().setPhysRegUsed(X86::R10); - MF.getRegInfo().setPhysRegUsed(X86::R11); + MF.getRegInfo().setPhysRegUsed(Reg10); + MF.getRegInfo().setPhysRegUsed(Reg11); } else { BuildMI(allocMBB, DL, TII.get(X86::PUSHi32)) .addImm(X86FI->getArgumentStackSize()); @@ -1330,22 +1783,46 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { } // __morestack is in libgcc - if (Is64Bit) - BuildMI(allocMBB, DL, TII.get(X86::CALL64pcrel32)) - .addExternalSymbol("__morestack"); - else - BuildMI(allocMBB, DL, TII.get(X86::CALLpcrel32)) - .addExternalSymbol("__morestack"); + if (Is64Bit && MF.getTarget().getCodeModel() == CodeModel::Large) { + // Under the large code model, we cannot assume that __morestack lives + // within 2^31 bytes of the call site, so we cannot use pc-relative + // addressing. We cannot perform the call via a temporary register, + // as the rax register may be used to store the static chain, and all + // other suitable registers may be either callee-save or used for + // parameter passing. We cannot use the stack at this point either + // because __morestack manipulates the stack directly. + // + // To avoid these issues, perform an indirect call via a read-only memory + // location containing the address. + // + // This solution is not perfect, as it assumes that the .rodata section + // is laid out within 2^31 bytes of each function body, but this seems + // to be sufficient for JIT. + BuildMI(allocMBB, DL, TII.get(X86::CALL64m)) + .addReg(X86::RIP) + .addImm(0) + .addReg(0) + .addExternalSymbol("__morestack_addr") + .addReg(0); + MF.getMMI().setUsesMorestackAddr(true); + } else { + if (Is64Bit) + BuildMI(allocMBB, DL, TII.get(X86::CALL64pcrel32)) + .addExternalSymbol("__morestack"); + else + BuildMI(allocMBB, DL, TII.get(X86::CALLpcrel32)) + .addExternalSymbol("__morestack"); + } if (IsNested) BuildMI(allocMBB, DL, TII.get(X86::MORESTACK_RET_RESTORE_R10)); else BuildMI(allocMBB, DL, TII.get(X86::MORESTACK_RET)); - allocMBB->addSuccessor(&prologueMBB); + allocMBB->addSuccessor(&PrologueMBB); checkMBB->addSuccessor(allocMBB); - checkMBB->addSuccessor(&prologueMBB); + checkMBB->addSuccessor(&PrologueMBB); #ifdef XDEBUG MF.verify(); @@ -1367,11 +1844,14 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { /// call inc_stack # doubles the stack space /// temp0 = sp - MaxStack /// if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart -void X86FrameLowering::adjustForHiPEPrologue(MachineFunction &MF) const { - const X86InstrInfo &TII = *TM.getInstrInfo(); +void X86FrameLowering::adjustForHiPEPrologue( + MachineFunction &MF, MachineBasicBlock &PrologueMBB) const { + const X86Subtarget &STI = MF.getSubtarget(); + const TargetInstrInfo &TII = *STI.getInstrInfo(); MachineFrameInfo *MFI = MF.getFrameInfo(); - const unsigned SlotSize = TM.getRegisterInfo()->getSlotSize(); + const unsigned SlotSize = STI.getRegisterInfo()->getSlotSize(); const bool Is64Bit = STI.is64Bit(); + const bool IsLP64 = STI.isTarget64BitLP64(); DebugLoc DL; // HiPE-specific values const unsigned HipeLeafWords = 24; @@ -1434,12 +1914,14 @@ void X86FrameLowering::adjustForHiPEPrologue(MachineFunction &MF) const { // If the stack frame needed is larger than the guaranteed then runtime checks // and calls to "inc_stack_0" BIF should be inserted in the assembly prologue. if (MaxStack > Guaranteed) { - MachineBasicBlock &prologueMBB = MF.front(); + assert(&PrologueMBB == &MF.front() && + "Shrink-wrapping is not implemented yet"); MachineBasicBlock *stackCheckMBB = MF.CreateMachineBasicBlock(); MachineBasicBlock *incStackMBB = MF.CreateMachineBasicBlock(); - for (MachineBasicBlock::livein_iterator I = prologueMBB.livein_begin(), - E = prologueMBB.livein_end(); I != E; I++) { + for (MachineBasicBlock::livein_iterator I = PrologueMBB.livein_begin(), + E = PrologueMBB.livein_end(); + I != E; I++) { stackCheckMBB->addLiveIn(*I); incStackMBB->addLiveIn(*I); } @@ -1465,7 +1947,7 @@ void X86FrameLowering::adjustForHiPEPrologue(MachineFunction &MF) const { SPLimitOffset = 0x4c; } - ScratchReg = GetScratchRegister(Is64Bit, MF, true); + ScratchReg = GetScratchRegister(Is64Bit, IsLP64, MF, true); assert(!MF.getRegInfo().isLiveIn(ScratchReg) && "HiPE prologue scratch register is live-in"); @@ -1475,7 +1957,7 @@ void X86FrameLowering::adjustForHiPEPrologue(MachineFunction &MF) const { // SPLimitOffset is in a fixed heap location (pointed by BP). addRegOffset(BuildMI(stackCheckMBB, DL, TII.get(CMPop)) .addReg(ScratchReg), PReg, false, SPLimitOffset); - BuildMI(stackCheckMBB, DL, TII.get(X86::JAE_4)).addMBB(&prologueMBB); + BuildMI(stackCheckMBB, DL, TII.get(X86::JAE_1)).addMBB(&PrologueMBB); // Create new MBB for IncStack: BuildMI(incStackMBB, DL, TII.get(CALLop)). @@ -1484,11 +1966,11 @@ void X86FrameLowering::adjustForHiPEPrologue(MachineFunction &MF) const { SPReg, false, -MaxStack); addRegOffset(BuildMI(incStackMBB, DL, TII.get(CMPop)) .addReg(ScratchReg), PReg, false, SPLimitOffset); - BuildMI(incStackMBB, DL, TII.get(X86::JLE_4)).addMBB(incStackMBB); + BuildMI(incStackMBB, DL, TII.get(X86::JLE_1)).addMBB(incStackMBB); - stackCheckMBB->addSuccessor(&prologueMBB, 99); + stackCheckMBB->addSuccessor(&PrologueMBB, 99); stackCheckMBB->addSuccessor(incStackMBB, 1); - incStackMBB->addSuccessor(&prologueMBB, 99); + incStackMBB->addSuccessor(&PrologueMBB, 99); incStackMBB->addSuccessor(incStackMBB, 1); } #ifdef XDEBUG @@ -1499,45 +1981,45 @@ void X86FrameLowering::adjustForHiPEPrologue(MachineFunction &MF) const { void X86FrameLowering:: eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const { - const X86InstrInfo &TII = *TM.getInstrInfo(); - const X86RegisterInfo &RegInfo = *TM.getRegisterInfo(); + const X86Subtarget &STI = MF.getSubtarget(); + const TargetInstrInfo &TII = *STI.getInstrInfo(); + const X86RegisterInfo &RegInfo = *STI.getRegisterInfo(); unsigned StackPtr = RegInfo.getStackRegister(); - bool reseveCallFrame = hasReservedCallFrame(MF); - int Opcode = I->getOpcode(); + bool reserveCallFrame = hasReservedCallFrame(MF); + unsigned Opcode = I->getOpcode(); bool isDestroy = Opcode == TII.getCallFrameDestroyOpcode(); bool IsLP64 = STI.isTarget64BitLP64(); DebugLoc DL = I->getDebugLoc(); - uint64_t Amount = !reseveCallFrame ? I->getOperand(0).getImm() : 0; - uint64_t CalleeAmt = isDestroy ? I->getOperand(1).getImm() : 0; + uint64_t Amount = !reserveCallFrame ? I->getOperand(0).getImm() : 0; + uint64_t InternalAmt = (isDestroy || Amount) ? I->getOperand(1).getImm() : 0; I = MBB.erase(I); - if (!reseveCallFrame) { + if (!reserveCallFrame) { // If the stack pointer can be changed after prologue, turn the // adjcallstackup instruction into a 'sub ESP, ' and the // adjcallstackdown instruction into 'add ESP, ' - // TODO: consider using push / pop instead of sub + store / add if (Amount == 0) return; // We need to keep the stack aligned properly. To do this, we round the // amount of space needed for the outgoing arguments up to the next // alignment boundary. - unsigned StackAlign = TM.getFrameLowering()->getStackAlignment(); - Amount = (Amount + StackAlign - 1) / StackAlign * StackAlign; + unsigned StackAlign = getStackAlignment(); + Amount = RoundUpToAlignment(Amount, StackAlign); MachineInstr *New = nullptr; - if (Opcode == TII.getCallFrameSetupOpcode()) { - New = BuildMI(MF, DL, TII.get(getSUBriOpcode(IsLP64, Amount)), - StackPtr) - .addReg(StackPtr) - .addImm(Amount); - } else { - assert(Opcode == TII.getCallFrameDestroyOpcode()); - // Factor out the amount the callee already popped. - Amount -= CalleeAmt; + // Factor out the amount that gets handled inside the sequence + // (Pushes of argument for frame setup, callee pops for frame destroy) + Amount -= InternalAmt; + + if (Amount) { + if (Opcode == TII.getCallFrameSetupOpcode()) { + New = BuildMI(MF, DL, TII.get(getSUBriOpcode(IsLP64, Amount)), StackPtr) + .addReg(StackPtr).addImm(Amount); + } else { + assert(Opcode == TII.getCallFrameDestroyOpcode()); - if (Amount) { unsigned Opc = getADDriOpcode(IsLP64, Amount); New = BuildMI(MF, DL, TII.get(Opc), StackPtr) .addReg(StackPtr).addImm(Amount); @@ -1555,13 +2037,13 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, return; } - if (Opcode == TII.getCallFrameDestroyOpcode() && CalleeAmt) { + if (Opcode == TII.getCallFrameDestroyOpcode() && InternalAmt) { // If we are performing frame pointer elimination and if the callee pops // something off the stack pointer, add it back. We do this until we have // more advanced stack pointer tracking ability. - unsigned Opc = getSUBriOpcode(IsLP64, CalleeAmt); + unsigned Opc = getSUBriOpcode(IsLP64, InternalAmt); MachineInstr *New = BuildMI(MF, DL, TII.get(Opc), StackPtr) - .addReg(StackPtr).addImm(CalleeAmt); + .addReg(StackPtr).addImm(InternalAmt); // The EFLAGS implicit def is dead. New->getOperand(3).setIsDead();