X-Git-Url: http://plrg.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTarget%2FARM%2FARMFrameLowering.cpp;h=916417882bae4775e099ee84429396225aed0d47;hb=80668d18e8064560bb6c227cde4e2a01d32e683e;hp=2b801458f8ef1f73e2caf81a0ff0d9209e912646;hpb=63b46faeb8acae9b7e5f865b7417dc00b9b9dad3;p=oota-llvm.git diff --git a/lib/Target/ARM/ARMFrameLowering.cpp b/lib/Target/ARM/ARMFrameLowering.cpp index 2b801458f8e..916417882ba 100644 --- a/lib/Target/ARM/ARMFrameLowering.cpp +++ b/lib/Target/ARM/ARMFrameLowering.cpp @@ -1,4 +1,4 @@ -//=======- ARMFrameLowering.cpp - ARM Frame Information --------*- C++ -*-====// +//===-- ARMFrameLowering.cpp - ARM Frame Information ----------------------===// // // The LLVM Compiler Infrastructure // @@ -12,32 +12,46 @@ //===----------------------------------------------------------------------===// #include "ARMFrameLowering.h" -#include "ARMAddressingModes.h" #include "ARMBaseInstrInfo.h" #include "ARMBaseRegisterInfo.h" #include "ARMMachineFunctionInfo.h" +#include "MCTargetDesc/ARMAddressingModes.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/RegisterScavenging.h" +#include "llvm/IR/CallingConv.h" +#include "llvm/IR/Function.h" +#include "llvm/MC/MCContext.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Target/TargetOptions.h" using namespace llvm; +static cl::opt +SpillAlignedNEONRegs("align-neon-spills", cl::Hidden, cl::init(true), + cl::desc("Align ARM NEON spills in prolog and epilog")); + +static MachineBasicBlock::iterator +skipAlignedDPRCS2Spills(MachineBasicBlock::iterator MI, + unsigned NumAlignedDPRCS2Regs); + /// hasFP - Return true if the specified function should have a dedicated frame /// pointer register. This is true if the function has variable sized allocas /// or if frame pointer elimination is disabled. 
bool ARMFrameLowering::hasFP(const MachineFunction &MF) const { const TargetRegisterInfo *RegInfo = MF.getTarget().getRegisterInfo(); - // Mac OS X requires FP not to be clobbered for backtracing purpose. - if (STI.isTargetDarwin()) + // iOS requires FP not to be clobbered for backtracing purpose. + if (STI.isTargetIOS()) return true; const MachineFrameInfo *MFI = MF.getFrameInfo(); // Always eliminate non-leaf frame pointers. - return ((DisableFramePointerElim(MF) && MFI->hasCalls()) || + return ((MF.getTarget().Options.DisableFramePointerElim(MF) && + MFI->hasCalls()) || RegInfo->needsStackRealignment(MF) || MFI->hasVarSizedObjects() || MFI->isFrameAddressTaken()); @@ -70,22 +84,11 @@ ARMFrameLowering::canSimplifyCallFramePseudos(const MachineFunction &MF) const { return hasReservedCallFrame(MF) || MF.getFrameInfo()->hasVarSizedObjects(); } -static bool isCalleeSavedRegister(unsigned Reg, const unsigned *CSRegs) { - for (unsigned i = 0; CSRegs[i]; ++i) - if (Reg == CSRegs[i]) - return true; - return false; -} - static bool isCSRestore(MachineInstr *MI, const ARMBaseInstrInfo &TII, - const unsigned *CSRegs) { + const uint16_t *CSRegs) { // Integer spill area is handled with "pop". - if (MI->getOpcode() == ARM::LDMIA_RET || - MI->getOpcode() == ARM::t2LDMIA_RET || - MI->getOpcode() == ARM::LDMIA_UPD || - MI->getOpcode() == ARM::t2LDMIA_UPD || - MI->getOpcode() == ARM::VLDMDIA_UPD) { + if (isPopOpcode(MI->getOpcode())) { // The first two operands are predicates. The last two are // imp-def and imp-use of SP. Check everything in between. 
for (int i = 5, e = MI->getNumOperands(); i != e; ++i) @@ -93,7 +96,8 @@ static bool isCSRestore(MachineInstr *MI, return false; return true; } - if ((MI->getOpcode() == ARM::LDR_POST || + if ((MI->getOpcode() == ARM::LDR_POST_IMM || + MI->getOpcode() == ARM::LDR_POST_REG || MI->getOpcode() == ARM::t2LDR_POST) && isCalleeSavedRegister(MI->getOperand(0).getReg(), CSRegs) && MI->getOperand(1).getReg() == ARM::SP) @@ -102,17 +106,39 @@ static bool isCSRestore(MachineInstr *MI, return false; } -static void -emitSPUpdate(bool isARM, - MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, - DebugLoc dl, const ARMBaseInstrInfo &TII, - int NumBytes, unsigned MIFlags = MachineInstr::NoFlags) { +static void emitRegPlusImmediate(bool isARM, MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, DebugLoc dl, + const ARMBaseInstrInfo &TII, unsigned DestReg, + unsigned SrcReg, int NumBytes, + unsigned MIFlags = MachineInstr::NoFlags, + ARMCC::CondCodes Pred = ARMCC::AL, + unsigned PredReg = 0) { if (isARM) - emitARMRegPlusImmediate(MBB, MBBI, dl, ARM::SP, ARM::SP, NumBytes, - ARMCC::AL, 0, TII, MIFlags); + emitARMRegPlusImmediate(MBB, MBBI, dl, DestReg, SrcReg, NumBytes, + Pred, PredReg, TII, MIFlags); else - emitT2RegPlusImmediate(MBB, MBBI, dl, ARM::SP, ARM::SP, NumBytes, - ARMCC::AL, 0, TII, MIFlags); + emitT2RegPlusImmediate(MBB, MBBI, dl, DestReg, SrcReg, NumBytes, + Pred, PredReg, TII, MIFlags); +} + +static void emitSPUpdate(bool isARM, MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, DebugLoc dl, + const ARMBaseInstrInfo &TII, int NumBytes, + unsigned MIFlags = MachineInstr::NoFlags, + ARMCC::CondCodes Pred = ARMCC::AL, + unsigned PredReg = 0) { + emitRegPlusImmediate(isARM, MBB, MBBI, dl, TII, ARM::SP, ARM::SP, NumBytes, + MIFlags, Pred, PredReg); +} + +static int sizeOfSPAdjustment(const MachineInstr *MI) { + assert(MI->getOpcode() == ARM::VSTMDDB_UPD); + int count = 0; + // ARM and Thumb2 push/pop insts have explicit "sp, sp" operands (+ + // 
pred) so the list starts at 4. + for (int i = MI->getNumOperands() - 1; i >= 4; --i) + count += 8; + return count; } void ARMFrameLowering::emitPrologue(MachineFunction &MF) const { @@ -120,6 +146,9 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const { MachineBasicBlock::iterator MBBI = MBB.begin(); MachineFrameInfo *MFI = MF.getFrameInfo(); ARMFunctionInfo *AFI = MF.getInfo(); + MachineModuleInfo &MMI = MF.getMMI(); + MCContext &Context = MMI.getContext(); + const MCRegisterInfo *MRI = Context.getRegisterInfo(); const ARMBaseRegisterInfo *RegInfo = static_cast(MF.getTarget().getRegisterInfo()); const ARMBaseInstrInfo &TII = @@ -127,33 +156,70 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const { assert(!AFI->isThumb1OnlyFunction() && "This emitPrologue does not support Thumb1!"); bool isARM = !AFI->isThumbFunction(); - unsigned VARegSaveSize = AFI->getVarArgsRegSaveSize(); + unsigned Align = MF.getTarget().getFrameLowering()->getStackAlignment(); + unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize(Align); unsigned NumBytes = MFI->getStackSize(); const std::vector &CSI = MFI->getCalleeSavedInfo(); DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); unsigned FramePtr = RegInfo->getFrameRegister(MF); + int CFAOffset = 0; // Determine the sizes of each callee-save spill areas and record which frame // belongs to which callee-save spill areas. unsigned GPRCS1Size = 0, GPRCS2Size = 0, DPRCSSize = 0; int FramePtrSpillFI = 0; + int D8SpillFI = 0; + + // All calls are tail calls in GHC calling conv, and functions have no + // prologue/epilogue. + if (MF.getFunction()->getCallingConv() == CallingConv::GHC) + return; // Allocate the vararg register save area. This is not counted in NumBytes. 
- if (VARegSaveSize) - emitSPUpdate(isARM, MBB, MBBI, dl, TII, -VARegSaveSize, + if (ArgRegsSaveSize) { + emitSPUpdate(isARM, MBB, MBBI, dl, TII, -ArgRegsSaveSize, MachineInstr::FrameSetup); + MCSymbol *SPLabel = Context.CreateTempSymbol(); + BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::PROLOG_LABEL)) + .addSym(SPLabel); + CFAOffset -= ArgRegsSaveSize; + MMI.addFrameInst( + MCCFIInstruction::createDefCfaOffset(SPLabel, CFAOffset)); + } if (!AFI->hasStackFrame()) { - if (NumBytes != 0) + if (NumBytes != 0) { emitSPUpdate(isARM, MBB, MBBI, dl, TII, -NumBytes, MachineInstr::FrameSetup); + MCSymbol *SPLabel = Context.CreateTempSymbol(); + BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::PROLOG_LABEL)) + .addSym(SPLabel); + CFAOffset -= NumBytes; + MMI.addFrameInst(MCCFIInstruction::createDefCfaOffset(SPLabel, + CFAOffset)); + } return; } + // Determine spill area sizes. for (unsigned i = 0, e = CSI.size(); i != e; ++i) { unsigned Reg = CSI[i].getReg(); int FI = CSI[i].getFrameIdx(); switch (Reg) { + case ARM::R8: + case ARM::R9: + case ARM::R10: + case ARM::R11: + case ARM::R12: + if (STI.isTargetMachO()) { + GPRCS2Size += 4; + break; + } + // fallthrough + case ARM::R0: + case ARM::R1: + case ARM::R2: + case ARM::R3: case ARM::R4: case ARM::R5: case ARM::R6: @@ -161,75 +227,75 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const { case ARM::LR: if (Reg == FramePtr) FramePtrSpillFI = FI; - AFI->addGPRCalleeSavedArea1Frame(FI); GPRCS1Size += 4; break; - case ARM::R8: - case ARM::R9: - case ARM::R10: - case ARM::R11: - if (Reg == FramePtr) - FramePtrSpillFI = FI; - if (STI.isTargetDarwin()) { - AFI->addGPRCalleeSavedArea2Frame(FI); - GPRCS2Size += 4; - } else { - AFI->addGPRCalleeSavedArea1Frame(FI); - GPRCS1Size += 4; - } - break; default: - AFI->addDPRCalleeSavedAreaFrame(FI); - DPRCSSize += 8; + // This is a DPR. Exclude the aligned DPRCS2 spills. 
+ if (Reg == ARM::D8) + D8SpillFI = FI; + if (Reg < ARM::D8 || Reg >= ARM::D8 + AFI->getNumAlignedDPRCS2Regs()) + DPRCSSize += 8; } } // Move past area 1. - if (GPRCS1Size > 0) MBBI++; - - // Set FP to point to the stack slot that contains the previous FP. - // For Darwin, FP is R7, which has now been stored in spill area 1. - // Otherwise, if this is not Darwin, all the callee-saved registers go - // into spill area 1, including the FP in R11. In either case, it is - // now safe to emit this assignment. - bool HasFP = hasFP(MF); - if (HasFP) { - unsigned ADDriOpc = !AFI->isThumbFunction() ? ARM::ADDri : ARM::t2ADDri; - MachineInstrBuilder MIB = - BuildMI(MBB, MBBI, dl, TII.get(ADDriOpc), FramePtr) - .addFrameIndex(FramePtrSpillFI).addImm(0) - .setMIFlag(MachineInstr::FrameSetup); - AddDefaultCC(AddDefaultPred(MIB)); - } - - // Move past area 2. - if (GPRCS2Size > 0) MBBI++; + MachineBasicBlock::iterator LastPush = MBB.end(), GPRCS1Push, GPRCS2Push, + DPRCSPush; + if (GPRCS1Size > 0) + GPRCS1Push = LastPush = MBBI++; // Determine starting offsets of spill areas. + bool HasFP = hasFP(MF); unsigned DPRCSOffset = NumBytes - (GPRCS1Size + GPRCS2Size + DPRCSSize); unsigned GPRCS2Offset = DPRCSOffset + DPRCSSize; unsigned GPRCS1Offset = GPRCS2Offset + GPRCS2Size; - if (HasFP) + int FramePtrOffsetInPush = 0; + if (HasFP) { + FramePtrOffsetInPush = MFI->getObjectOffset(FramePtrSpillFI) + GPRCS1Size; AFI->setFramePtrSpillOffset(MFI->getObjectOffset(FramePtrSpillFI) + NumBytes); + } AFI->setGPRCalleeSavedArea1Offset(GPRCS1Offset); AFI->setGPRCalleeSavedArea2Offset(GPRCS2Offset); AFI->setDPRCalleeSavedAreaOffset(DPRCSOffset); + // Move past area 2. + if (GPRCS2Size > 0) + GPRCS2Push = LastPush = MBBI++; + // Move past area 3. if (DPRCSSize > 0) { - MBBI++; + DPRCSPush = MBBI; // Since vpush register list cannot have gaps, there may be multiple vpush // instructions in the prologue. 
while (MBBI->getOpcode() == ARM::VSTMDDB_UPD) - MBBI++; + LastPush = MBBI++; } - NumBytes = DPRCSOffset; + // Move past the aligned DPRCS2 area. + if (AFI->getNumAlignedDPRCS2Regs() > 0) { + MBBI = skipAlignedDPRCS2Spills(MBBI, AFI->getNumAlignedDPRCS2Regs()); + // The code inserted by emitAlignedDPRCS2Spills realigns the stack, and + // leaves the stack pointer pointing to the DPRCS2 area. + // + // Adjust NumBytes to represent the stack slots below the DPRCS2 area. + NumBytes += MFI->getObjectOffset(D8SpillFI); + } else + NumBytes = DPRCSOffset; + + unsigned adjustedGPRCS1Size = GPRCS1Size; if (NumBytes) { // Adjust SP after all the callee-save spills. - emitSPUpdate(isARM, MBB, MBBI, dl, TII, -NumBytes, - MachineInstr::FrameSetup); + if (tryFoldSPUpdateIntoPushPop(STI, MF, LastPush, NumBytes)) { + if (LastPush == GPRCS1Push) { + FramePtrOffsetInPush += NumBytes; + adjustedGPRCS1Size += NumBytes; + NumBytes = 0; + } + } else + emitSPUpdate(isARM, MBB, MBBI, dl, TII, -NumBytes, + MachineInstr::FrameSetup); + if (HasFP && isARM) // Restore from fp only in ARM mode: e.g. 
sub sp, r7, #24 // Note it's not safe to do this in Thumb2 mode because it would have @@ -242,6 +308,143 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const { AFI->setShouldRestoreSPFromFP(true); } + if (adjustedGPRCS1Size > 0) { + MCSymbol *SPLabel = Context.CreateTempSymbol(); + BuildMI(MBB, ++GPRCS1Push, dl, TII.get(TargetOpcode::PROLOG_LABEL)) + .addSym(SPLabel); + CFAOffset -= adjustedGPRCS1Size; + MMI.addFrameInst( + MCCFIInstruction::createDefCfaOffset(SPLabel, CFAOffset)); + for (std::vector::const_iterator I = CSI.begin(), + E = CSI.end(); I != E; ++I) { + unsigned Reg = I->getReg(); + int FI = I->getFrameIdx(); + switch (Reg) { + case ARM::R8: + case ARM::R9: + case ARM::R10: + case ARM::R11: + case ARM::R12: + if (STI.isTargetMachO()) + break; + // fallthrough + case ARM::R0: + case ARM::R1: + case ARM::R2: + case ARM::R3: + case ARM::R4: + case ARM::R5: + case ARM::R6: + case ARM::R7: + case ARM::LR: + MMI.addFrameInst(MCCFIInstruction::createOffset(SPLabel, + MRI->getDwarfRegNum(Reg, true), + MFI->getObjectOffset(FI) - ArgRegsSaveSize)); + break; + } + } + } + + // Set FP to point to the stack slot that contains the previous FP. + // For iOS, FP is R7, which has now been stored in spill area 1. + // Otherwise, if this is not iOS, all the callee-saved registers go + // into spill area 1, including the FP in R11. In either case, it + // is in area one and the adjustment needs to take place just after + // that push. 
+ if (HasFP) { + emitRegPlusImmediate(!AFI->isThumbFunction(), MBB, GPRCS1Push, dl, TII, + FramePtr, ARM::SP, FramePtrOffsetInPush, + MachineInstr::FrameSetup); + MCSymbol *SPLabel = Context.CreateTempSymbol(); + BuildMI(MBB, GPRCS1Push, dl, TII.get(TargetOpcode::PROLOG_LABEL)) + .addSym(SPLabel); + if (FramePtrOffsetInPush) { + CFAOffset += FramePtrOffsetInPush; + MMI.addFrameInst( + MCCFIInstruction::createDefCfa(SPLabel, + MRI->getDwarfRegNum(FramePtr, true), CFAOffset)); + } else + MMI.addFrameInst( + MCCFIInstruction::createDefCfaRegister(SPLabel, + MRI->getDwarfRegNum(FramePtr, true))); + } + + if (GPRCS2Size > 0) { + MCSymbol *SPLabel = Context.CreateTempSymbol(); + BuildMI(MBB, ++GPRCS2Push, dl, TII.get(TargetOpcode::PROLOG_LABEL)) + .addSym(SPLabel); + if (!HasFP) { + CFAOffset -= GPRCS2Size; + MMI.addFrameInst( + MCCFIInstruction::createDefCfaOffset(SPLabel, CFAOffset)); + } + for (std::vector::const_iterator I = CSI.begin(), + E = CSI.end(); I != E; ++I) { + unsigned Reg = I->getReg(); + int FI = I->getFrameIdx(); + switch (Reg) { + case ARM::R8: + case ARM::R9: + case ARM::R10: + case ARM::R11: + case ARM::R12: + if (STI.isTargetMachO()) { + unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true); + unsigned Offset = MFI->getObjectOffset(FI) - ArgRegsSaveSize; + MMI.addFrameInst( + MCCFIInstruction::createOffset(SPLabel, DwarfReg, Offset)); + } + break; + } + } + } + + if (DPRCSSize > 0) { + // Since vpush register list cannot have gaps, there may be multiple vpush + // instructions in the prologue. 
+ MCSymbol *SPLabel = NULL; + do { + MachineBasicBlock::iterator Push = DPRCSPush++; + if (!HasFP) { + SPLabel = Context.CreateTempSymbol(); + BuildMI(MBB, DPRCSPush, dl, TII.get(TargetOpcode::PROLOG_LABEL)) + .addSym(SPLabel); + CFAOffset -= sizeOfSPAdjustment(Push);; + MMI.addFrameInst( + MCCFIInstruction::createDefCfaOffset(SPLabel, CFAOffset)); + } + } while (DPRCSPush->getOpcode() == ARM::VSTMDDB_UPD); + + if (!SPLabel) { + SPLabel = Context.CreateTempSymbol(); + BuildMI(MBB, DPRCSPush, dl, TII.get(TargetOpcode::PROLOG_LABEL)) + .addSym(SPLabel); + } + for (std::vector::const_iterator I = CSI.begin(), + E = CSI.end(); I != E; ++I) { + unsigned Reg = I->getReg(); + int FI = I->getFrameIdx(); + if ((Reg >= ARM::D0 && Reg <= ARM::D31) && + (Reg < ARM::D8 || Reg >= ARM::D8 + AFI->getNumAlignedDPRCS2Regs())) { + unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true); + unsigned Offset = MFI->getObjectOffset(FI); + MMI.addFrameInst(MCCFIInstruction::createOffset(SPLabel, DwarfReg, + Offset)); + } + } + } + + if (NumBytes) { + if (!HasFP) { + MCSymbol *SPLabel = Context.CreateTempSymbol(); + BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::PROLOG_LABEL)) + .addSym(SPLabel); + CFAOffset -= NumBytes; + MMI.addFrameInst( + MCCFIInstruction::createDefCfaOffset(SPLabel, CFAOffset)); + } + } + if (STI.isTargetELF() && hasFP(MF)) MFI->setOffsetAdjustment(MFI->getOffsetAdjustment() - AFI->getFramePtrSpillOffset()); @@ -252,7 +455,9 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const { // If we need dynamic stack realignment, do it here. Be paranoid and make // sure if we also have VLAs, we have a base pointer for frame access. - if (RegInfo->needsStackRealignment(MF)) { + // If aligned NEON registers were spilled, the stack has already been + // realigned. 
+ if (!AFI->getNumAlignedDPRCS2Regs() && RegInfo->needsStackRealignment(MF)) { unsigned MaxAlign = MFI->getMaxAlignment(); assert (!AFI->isThumb1OnlyFunction()); if (!AFI->isThumbFunction()) { @@ -268,13 +473,13 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const { // bic r4, r4, MaxAlign // mov sp, r4 // FIXME: It will be better just to find spare register here. - AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVgpr2tgpr), ARM::R4) + AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), ARM::R4) .addReg(ARM::SP, RegState::Kill)); AddDefaultCC(AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::t2BICri), ARM::R4) .addReg(ARM::R4, RegState::Kill) .addImm(MaxAlign-1))); - AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVtgpr2gpr), ARM::SP) + AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), ARM::SP) .addReg(ARM::R4, RegState::Kill)); } @@ -293,7 +498,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const { .addReg(ARM::SP) .addImm((unsigned)ARMCC::AL).addReg(0).addReg(0); else - AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVgpr2gpr), + AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), RegInfo->getBaseRegister()) .addReg(ARM::SP)); } @@ -308,8 +513,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const { void ARMFrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const { MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); - assert(MBBI->getDesc().isReturn() && - "Can only insert epilog into returning blocks"); + assert(MBBI->isReturn() && "Can only insert epilog into returning blocks"); unsigned RetOpcode = MBBI->getOpcode(); DebugLoc dl = MBBI->getDebugLoc(); MachineFrameInfo *MFI = MF.getFrameInfo(); @@ -321,20 +525,26 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF, "This emitEpilogue does not support Thumb1!"); bool isARM = !AFI->isThumbFunction(); - unsigned VARegSaveSize = AFI->getVarArgsRegSaveSize(); + unsigned Align = 
MF.getTarget().getFrameLowering()->getStackAlignment(); + unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize(Align); int NumBytes = (int)MFI->getStackSize(); unsigned FramePtr = RegInfo->getFrameRegister(MF); + // All calls are tail calls in GHC calling conv, and functions have no + // prologue/epilogue. + if (MF.getFunction()->getCallingConv() == CallingConv::GHC) + return; + if (!AFI->hasStackFrame()) { if (NumBytes != 0) emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes); } else { // Unwind MBBI to point to first LDR / VLDRD. - const unsigned *CSRegs = RegInfo->getCalleeSavedRegs(); + const uint16_t *CSRegs = RegInfo->getCalleeSavedRegs(&MF); if (MBBI != MBB.begin()) { - do + do { --MBBI; - while (MBBI != MBB.begin() && isCSRestore(MBBI, TII, CSRegs)); + } while (MBBI != MBB.begin() && isCSRestore(MBBI, TII, CSRegs)); if (!isCSRestore(MBBI, TII, CSRegs)) ++MBBI; } @@ -354,7 +564,7 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF, ARMCC::AL, 0, TII); else { // It's not possible to restore SP from FP in a single instruction. 
- // For Darwin, this looks like: + // For iOS, this looks like: // mov sp, r7 // sub sp, #24 // This is bad, if an interrupt is taken after the mov, sp is in an @@ -364,7 +574,7 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF, "No scratch register to restore SP from FP!"); emitT2RegPlusImmediate(MBB, MBBI, dl, ARM::R4, FramePtr, -NumBytes, ARMCC::AL, 0, TII); - AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVgpr2gpr), + AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), ARM::SP) .addReg(ARM::R4)); } @@ -374,12 +584,13 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF, BuildMI(MBB, MBBI, dl, TII.get(ARM::MOVr), ARM::SP) .addReg(FramePtr).addImm((unsigned)ARMCC::AL).addReg(0).addReg(0); else - AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVgpr2gpr), + AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), ARM::SP) .addReg(FramePtr)); } - } else if (NumBytes) - emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes); + } else if (NumBytes && + !tryFoldSPUpdateIntoPushPop(STI, MF, MBBI, NumBytes)) + emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes); // Increment past our save areas. if (AFI->getDPRCalleeSavedAreaSize()) { @@ -393,17 +604,16 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF, if (AFI->getGPRCalleeSavedArea1Size()) MBBI++; } - if (RetOpcode == ARM::TCRETURNdi || RetOpcode == ARM::TCRETURNdiND || - RetOpcode == ARM::TCRETURNri || RetOpcode == ARM::TCRETURNriND) { + if (RetOpcode == ARM::TCRETURNdi || RetOpcode == ARM::TCRETURNri) { // Tail call return: adjust the stack pointer and jump to callee. MBBI = MBB.getLastNonDebugInstr(); MachineOperand &JumpTarget = MBBI->getOperand(0); // Jump to label or value in register. - if (RetOpcode == ARM::TCRETURNdi || RetOpcode == ARM::TCRETURNdiND) { - unsigned TCOpcode = (RetOpcode == ARM::TCRETURNdi) - ? (STI.isThumb() ? ARM::tTAILJMPd : ARM::TAILJMPd) - : (STI.isThumb() ? 
ARM::tTAILJMPdND : ARM::TAILJMPdND); + if (RetOpcode == ARM::TCRETURNdi) { + unsigned TCOpcode = STI.isThumb() ? + (STI.isTargetMachO() ? ARM::tTAILJMPd : ARM::tTAILJMPdND) : + ARM::TAILJMPd; MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(TCOpcode)); if (JumpTarget.isGlobal()) MIB.addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset(), @@ -413,14 +623,13 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF, MIB.addExternalSymbol(JumpTarget.getSymbolName(), JumpTarget.getTargetFlags()); } + + // Add the default predicate in Thumb mode. + if (STI.isThumb()) MIB.addImm(ARMCC::AL).addReg(0); } else if (RetOpcode == ARM::TCRETURNri) { BuildMI(MBB, MBBI, dl, TII.get(STI.isThumb() ? ARM::tTAILJMPr : ARM::TAILJMPr)). addReg(JumpTarget.getReg(), RegState::Kill); - } else if (RetOpcode == ARM::TCRETURNriND) { - BuildMI(MBB, MBBI, dl, - TII.get(STI.isThumb() ? ARM::tTAILJMPrND : ARM::TAILJMPrND)). - addReg(JumpTarget.getReg(), RegState::Kill); } MachineInstr *NewMI = prior(MBBI); @@ -432,8 +641,8 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF, MBBI = NewMI; } - if (VARegSaveSize) - emitSPUpdate(isARM, MBB, MBBI, dl, TII, VARegSaveSize); + if (ArgRegsSaveSize) + emitSPUpdate(isARM, MBB, MBBI, dl, TII, ArgRegsSaveSize); } /// getFrameIndexReference - Provide a base+offset reference to an FI slot for @@ -460,12 +669,10 @@ ARMFrameLowering::ResolveFrameIndexReference(const MachineFunction &MF, FrameReg = ARM::SP; Offset += SPAdj; - if (AFI->isGPRCalleeSavedArea1Frame(FI)) - return Offset - AFI->getGPRCalleeSavedArea1Offset(); - else if (AFI->isGPRCalleeSavedArea2Frame(FI)) - return Offset - AFI->getGPRCalleeSavedArea2Offset(); - else if (AFI->isDPRCalleeSavedAreaFrame(FI)) - return Offset - AFI->getDPRCalleeSavedAreaOffset(); + + // SP can move around if there are allocas. We may also lose track of SP + // when emergency spilling inside a non-reserved call frame setup. 
+ bool hasMovingSP = !hasReservedCallFrame(MF); // When dynamically realigning the stack, use the frame pointer for // parameters, and the stack/base pointer for locals. @@ -474,7 +681,7 @@ ARMFrameLowering::ResolveFrameIndexReference(const MachineFunction &MF, if (isFixed) { FrameReg = RegInfo->getFrameRegister(MF); Offset = FPOffset; - } else if (MFI->hasVarSizedObjects()) { + } else if (hasMovingSP) { assert(RegInfo->hasBasePointer(MF) && "VLAs and dynamic stack alignment, but missing base pointer!"); FrameReg = RegInfo->getBaseRegister(); @@ -486,11 +693,10 @@ ARMFrameLowering::ResolveFrameIndexReference(const MachineFunction &MF, if (hasFP(MF) && AFI->hasStackFrame()) { // Use frame pointer to reference fixed objects. Use it for locals if // there are VLAs (and thus the SP isn't reliable as a base). - if (isFixed || (MFI->hasVarSizedObjects() && - !RegInfo->hasBasePointer(MF))) { + if (isFixed || (hasMovingSP && !RegInfo->hasBasePointer(MF))) { FrameReg = RegInfo->getFrameRegister(MF); return FPOffset; - } else if (MFI->hasVarSizedObjects()) { + } else if (hasMovingSP) { assert(RegInfo->hasBasePointer(MF) && "missing base pointer!"); if (AFI->isThumb2Function()) { // Try to use the frame pointer if we can, else use the base pointer @@ -502,7 +708,7 @@ ARMFrameLowering::ResolveFrameIndexReference(const MachineFunction &MF, } } } else if (AFI->isThumb2Function()) { - // Use add , sp, # + // Use add , sp, # // ldr , [sp, #] // if at all possible to save space. 
if (Offset >= 0 && (Offset & 3) == 0 && Offset <= 1020) @@ -537,6 +743,7 @@ void ARMFrameLowering::emitPushInst(MachineBasicBlock &MBB, unsigned StmOpc, unsigned StrOpc, bool NoGap, bool(*Func)(unsigned, bool), + unsigned NumAlignedDPRCS2Regs, unsigned MIFlags) const { MachineFunction &MF = *MBB.getParent(); const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); @@ -550,7 +757,11 @@ void ARMFrameLowering::emitPushInst(MachineBasicBlock &MBB, unsigned LastReg = 0; for (; i != 0; --i) { unsigned Reg = CSI[i-1].getReg(); - if (!(Func)(Reg, STI.isTargetDarwin())) continue; + if (!(Func)(Reg, STI.isTargetMachO())) continue; + + // D-registers in the aligned area DPRCS2 are NOT spilled here. + if (Reg >= ARM::D8 && Reg < ARM::D8 + NumAlignedDPRCS2Regs) + continue; // Add the callee-saved register as live-in unless it's LR and // @llvm.returnaddress is called. If LR is returned for @@ -587,14 +798,8 @@ void ARMFrameLowering::emitPushInst(MachineBasicBlock &MBB, MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc), ARM::SP) .addReg(Regs[0].first, getKillRegState(Regs[0].second)) - .addReg(ARM::SP).setMIFlags(MIFlags); - // ARM mode needs an extra reg0 here due to addrmode2. Will go away once - // that refactoring is complete (eventually). 
- if (StrOpc == ARM::STR_PRE) { - MIB.addReg(0); - MIB.addImm(ARM_AM::getAM2Opc(ARM_AM::sub, 4, ARM_AM::no_shift)); - } else - MIB.addImm(-4); + .addReg(ARM::SP).setMIFlags(MIFlags) + .addImm(-4); AddDefaultPred(MIB); } Regs.clear(); @@ -606,16 +811,17 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB, const std::vector &CSI, unsigned LdmOpc, unsigned LdrOpc, bool isVarArg, bool NoGap, - bool(*Func)(unsigned, bool)) const { + bool(*Func)(unsigned, bool), + unsigned NumAlignedDPRCS2Regs) const { MachineFunction &MF = *MBB.getParent(); const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); ARMFunctionInfo *AFI = MF.getInfo(); DebugLoc DL = MI->getDebugLoc(); unsigned RetOpcode = MI->getOpcode(); bool isTailCall = (RetOpcode == ARM::TCRETURNdi || - RetOpcode == ARM::TCRETURNdiND || - RetOpcode == ARM::TCRETURNri || - RetOpcode == ARM::TCRETURNriND); + RetOpcode == ARM::TCRETURNri); + bool isInterrupt = + RetOpcode == ARM::SUBS_PC_LR || RetOpcode == ARM::t2SUBS_PC_LR; SmallVector Regs; unsigned i = CSI.size(); @@ -624,9 +830,14 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB, bool DeleteRet = false; for (; i != 0; --i) { unsigned Reg = CSI[i-1].getReg(); - if (!(Func)(Reg, STI.isTargetDarwin())) continue; + if (!(Func)(Reg, STI.isTargetMachO())) continue; + + // The aligned reloads from area DPRCS2 are not inserted here. + if (Reg >= ARM::D8 && Reg < ARM::D8 + NumAlignedDPRCS2Regs) + continue; - if (Reg == ARM::LR && !isTailCall && !isVarArg && STI.hasV5TOps()) { + if (Reg == ARM::LR && !isTailCall && !isVarArg && !isInterrupt && + STI.hasV5TOps()) { Reg = ARM::PC; LdmOpc = AFI->isThumbFunction() ? ARM::t2LDMIA_RET : ARM::LDMIA_RET; // Fold the return instruction into the LDM. 
@@ -651,8 +862,10 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB, .addReg(ARM::SP)); for (unsigned i = 0, e = Regs.size(); i < e; ++i) MIB.addReg(Regs[i], getDefRegState(true)); - if (DeleteRet) + if (DeleteRet) { + MIB.copyImplicitOps(&*MI); MI->eraseFromParent(); + } MI = MIB; } else if (Regs.size() == 1) { // If we adjusted the reg to PC from LR above, switch it back here. We @@ -665,7 +878,7 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB, .addReg(ARM::SP); // ARM mode needs an extra reg0 here due to addrmode2. Will go away once // that refactoring is complete (eventually). - if (LdrOpc == ARM::LDR_POST) { + if (LdrOpc == ARM::LDR_POST_REG || LdrOpc == ARM::LDR_POST_IMM) { MIB.addReg(0); MIB.addImm(ARM_AM::getAM2Opc(ARM_AM::add, 4, ARM_AM::no_shift)); } else @@ -676,6 +889,247 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB, } } +/// Emit aligned spill instructions for NumAlignedDPRCS2Regs D-registers +/// starting from d8. Also insert stack realignment code and leave the stack +/// pointer pointing to the d8 spill slot. +static void emitAlignedDPRCS2Spills(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned NumAlignedDPRCS2Regs, + const std::vector &CSI, + const TargetRegisterInfo *TRI) { + MachineFunction &MF = *MBB.getParent(); + ARMFunctionInfo *AFI = MF.getInfo(); + DebugLoc DL = MI->getDebugLoc(); + const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); + MachineFrameInfo &MFI = *MF.getFrameInfo(); + + // Mark the D-register spill slots as properly aligned. Since MFI computes + // stack slot layout backwards, this can actually mean that the d-reg stack + // slot offsets can be wrong. The offset for d8 will always be correct. 
+ for (unsigned i = 0, e = CSI.size(); i != e; ++i) { + unsigned DNum = CSI[i].getReg() - ARM::D8; + if (DNum >= 8) + continue; + int FI = CSI[i].getFrameIdx(); + // The even-numbered registers will be 16-byte aligned, the odd-numbered + // registers will be 8-byte aligned. + MFI.setObjectAlignment(FI, DNum % 2 ? 8 : 16); + + // The stack slot for D8 needs to be maximally aligned because this is + // actually the point where we align the stack pointer. MachineFrameInfo + // computes all offsets relative to the incoming stack pointer which is a + // bit weird when realigning the stack. Any extra padding for this + // over-alignment is not realized because the code inserted below adjusts + // the stack pointer by numregs * 8 before aligning the stack pointer. + if (DNum == 0) + MFI.setObjectAlignment(FI, MFI.getMaxAlignment()); + } + + // Move the stack pointer to the d8 spill slot, and align it at the same + // time. Leave the stack slot address in the scratch register r4. + // + // sub r4, sp, #numregs * 8 + // bic r4, r4, #align - 1 + // mov sp, r4 + // + bool isThumb = AFI->isThumbFunction(); + assert(!AFI->isThumb1OnlyFunction() && "Can't realign stack for thumb1"); + AFI->setShouldRestoreSPFromFP(true); + + // sub r4, sp, #numregs * 8 + // The immediate is <= 64, so it doesn't need any special encoding. + unsigned Opc = isThumb ? ARM::t2SUBri : ARM::SUBri; + AddDefaultCC(AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(Opc), ARM::R4) + .addReg(ARM::SP) + .addImm(8 * NumAlignedDPRCS2Regs))); + + // bic r4, r4, #align-1 + Opc = isThumb ? ARM::t2BICri : ARM::BICri; + unsigned MaxAlign = MF.getFrameInfo()->getMaxAlignment(); + AddDefaultCC(AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(Opc), ARM::R4) + .addReg(ARM::R4, RegState::Kill) + .addImm(MaxAlign - 1))); + + // mov sp, r4 + // The stack pointer must be adjusted before spilling anything, otherwise + // the stack slots could be clobbered by an interrupt handler. + // Leave r4 live, it is used below. + Opc = isThumb ? 
ARM::tMOVr : ARM::MOVr; + MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(Opc), ARM::SP) + .addReg(ARM::R4); + MIB = AddDefaultPred(MIB); + if (!isThumb) + AddDefaultCC(MIB); + + // Now spill NumAlignedDPRCS2Regs registers starting from d8. + // r4 holds the stack slot address. + unsigned NextReg = ARM::D8; + + // 16-byte aligned vst1.64 with 4 d-regs and address writeback. + // The writeback is only needed when emitting two vst1.64 instructions. + if (NumAlignedDPRCS2Regs >= 6) { + unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0, + &ARM::QQPRRegClass); + MBB.addLiveIn(SupReg); + AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VST1d64Qwb_fixed), + ARM::R4) + .addReg(ARM::R4, RegState::Kill).addImm(16) + .addReg(NextReg) + .addReg(SupReg, RegState::ImplicitKill)); + NextReg += 4; + NumAlignedDPRCS2Regs -= 4; + } + + // We won't modify r4 beyond this point. It currently points to the next + // register to be spilled. + unsigned R4BaseReg = NextReg; + + // 16-byte aligned vst1.64 with 4 d-regs, no writeback. + if (NumAlignedDPRCS2Regs >= 4) { + unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0, + &ARM::QQPRRegClass); + MBB.addLiveIn(SupReg); + AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VST1d64Q)) + .addReg(ARM::R4).addImm(16).addReg(NextReg) + .addReg(SupReg, RegState::ImplicitKill)); + NextReg += 4; + NumAlignedDPRCS2Regs -= 4; + } + + // 16-byte aligned vst1.64 with 2 d-regs. + if (NumAlignedDPRCS2Regs >= 2) { + unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0, + &ARM::QPRRegClass); + MBB.addLiveIn(SupReg); + AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VST1q64)) + .addReg(ARM::R4).addImm(16).addReg(SupReg)); + NextReg += 2; + NumAlignedDPRCS2Regs -= 2; + } + + // Finally, use a vanilla vstr.64 for the odd last register. + if (NumAlignedDPRCS2Regs) { + MBB.addLiveIn(NextReg); + // vstr.64 uses addrmode5 which has an offset scale of 4. 
+ AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VSTRD)) + .addReg(NextReg) + .addReg(ARM::R4).addImm((NextReg-R4BaseReg)*2)); + } + + // The last spill instruction inserted should kill the scratch register r4. + llvm::prior(MI)->addRegisterKilled(ARM::R4, TRI); +} + +/// Skip past the code inserted by emitAlignedDPRCS2Spills, and return an +/// iterator to the following instruction. +static MachineBasicBlock::iterator +skipAlignedDPRCS2Spills(MachineBasicBlock::iterator MI, + unsigned NumAlignedDPRCS2Regs) { + // sub r4, sp, #numregs * 8 + // bic r4, r4, #align - 1 + // mov sp, r4 + ++MI; ++MI; ++MI; + assert(MI->mayStore() && "Expecting spill instruction"); + + // The switch cases below all fall through. + switch(NumAlignedDPRCS2Regs) { + case 7: + ++MI; + assert(MI->mayStore() && "Expecting spill instruction"); + default: + ++MI; + assert(MI->mayStore() && "Expecting spill instruction"); + case 1: + case 2: + case 4: + assert(MI->killsRegister(ARM::R4) && "Missed kill flag"); + ++MI; + } + return MI; +} + +/// Emit aligned reload instructions for NumAlignedDPRCS2Regs D-registers +/// starting from d8. These instructions are assumed to execute while the +/// stack is still aligned, unlike the code inserted by emitPopInst. +static void emitAlignedDPRCS2Restores(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned NumAlignedDPRCS2Regs, + const std::vector &CSI, + const TargetRegisterInfo *TRI) { + MachineFunction &MF = *MBB.getParent(); + ARMFunctionInfo *AFI = MF.getInfo(); + DebugLoc DL = MI->getDebugLoc(); + const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); + + // Find the frame index assigned to d8. + int D8SpillFI = 0; + for (unsigned i = 0, e = CSI.size(); i != e; ++i) + if (CSI[i].getReg() == ARM::D8) { + D8SpillFI = CSI[i].getFrameIdx(); + break; + } + + // Materialize the address of the d8 spill slot into the scratch register r4. 
+ // This can be fairly complicated if the stack frame is large, so just use + // the normal frame index elimination mechanism to do it. This code runs as + // the initial part of the epilog where the stack and base pointers haven't + // been changed yet. + bool isThumb = AFI->isThumbFunction(); + assert(!AFI->isThumb1OnlyFunction() && "Can't realign stack for thumb1"); + + unsigned Opc = isThumb ? ARM::t2ADDri : ARM::ADDri; + AddDefaultCC(AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(Opc), ARM::R4) + .addFrameIndex(D8SpillFI).addImm(0))); + + // Now restore NumAlignedDPRCS2Regs registers starting from d8. + unsigned NextReg = ARM::D8; + + // 16-byte aligned vld1.64 with 4 d-regs and writeback. + if (NumAlignedDPRCS2Regs >= 6) { + unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0, + &ARM::QQPRRegClass); + AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VLD1d64Qwb_fixed), NextReg) + .addReg(ARM::R4, RegState::Define) + .addReg(ARM::R4, RegState::Kill).addImm(16) + .addReg(SupReg, RegState::ImplicitDefine)); + NextReg += 4; + NumAlignedDPRCS2Regs -= 4; + } + + // We won't modify r4 beyond this point. It currently points to the next + // register to be reloaded. + unsigned R4BaseReg = NextReg; + + // 16-byte aligned vld1.64 with 4 d-regs, no writeback. + if (NumAlignedDPRCS2Regs >= 4) { + unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0, + &ARM::QQPRRegClass); + AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VLD1d64Q), NextReg) + .addReg(ARM::R4).addImm(16) + .addReg(SupReg, RegState::ImplicitDefine)); + NextReg += 4; + NumAlignedDPRCS2Regs -= 4; + } + + // 16-byte aligned vld1.64 with 2 d-regs. + if (NumAlignedDPRCS2Regs >= 2) { + unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0, + &ARM::QPRRegClass); + AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VLD1q64), SupReg) + .addReg(ARM::R4).addImm(16)); + NextReg += 2; + NumAlignedDPRCS2Regs -= 2; + } + + // Finally, use a vanilla vldr.64 for the remaining odd register. 
+ if (NumAlignedDPRCS2Regs) + AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VLDRD), NextReg) + .addReg(ARM::R4).addImm(2*(NextReg-R4BaseReg))); + + // Last load kills r4. + llvm::prior(MI)->addRegisterKilled(ARM::R4, TRI); +} + bool ARMFrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const std::vector &CSI, @@ -687,14 +1141,22 @@ bool ARMFrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB, ARMFunctionInfo *AFI = MF.getInfo(); unsigned PushOpc = AFI->isThumbFunction() ? ARM::t2STMDB_UPD : ARM::STMDB_UPD; - unsigned PushOneOpc = AFI->isThumbFunction() ? ARM::t2STR_PRE : ARM::STR_PRE; + unsigned PushOneOpc = AFI->isThumbFunction() ? + ARM::t2STR_PRE : ARM::STR_PRE_IMM; unsigned FltOpc = ARM::VSTMDDB_UPD; - emitPushInst(MBB, MI, CSI, PushOpc, PushOneOpc, false, &isARMArea1Register, + unsigned NumAlignedDPRCS2Regs = AFI->getNumAlignedDPRCS2Regs(); + emitPushInst(MBB, MI, CSI, PushOpc, PushOneOpc, false, &isARMArea1Register, 0, MachineInstr::FrameSetup); - emitPushInst(MBB, MI, CSI, PushOpc, PushOneOpc, false, &isARMArea2Register, + emitPushInst(MBB, MI, CSI, PushOpc, PushOneOpc, false, &isARMArea2Register, 0, MachineInstr::FrameSetup); emitPushInst(MBB, MI, CSI, FltOpc, 0, true, &isARMArea3Register, - MachineInstr::FrameSetup); + NumAlignedDPRCS2Regs, MachineInstr::FrameSetup); + + // The code above does not insert spill code for the aligned DPRCS2 registers. + // The stack realignment code will be inserted between the push instructions + // and these spills. 
+ if (NumAlignedDPRCS2Regs) + emitAlignedDPRCS2Spills(MBB, MI, NumAlignedDPRCS2Regs, CSI, TRI); return true; } @@ -708,16 +1170,23 @@ bool ARMFrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, MachineFunction &MF = *MBB.getParent(); ARMFunctionInfo *AFI = MF.getInfo(); - bool isVarArg = AFI->getVarArgsRegSaveSize() > 0; + bool isVarArg = AFI->getArgRegsSaveSize() > 0; + unsigned NumAlignedDPRCS2Regs = AFI->getNumAlignedDPRCS2Regs(); + + // The emitPopInst calls below do not insert reloads for the aligned DPRCS2 + // registers. Do that here instead. + if (NumAlignedDPRCS2Regs) + emitAlignedDPRCS2Restores(MBB, MI, NumAlignedDPRCS2Regs, CSI, TRI); unsigned PopOpc = AFI->isThumbFunction() ? ARM::t2LDMIA_UPD : ARM::LDMIA_UPD; - unsigned LdrOpc = AFI->isThumbFunction() ? ARM::t2LDR_POST : ARM::LDR_POST; + unsigned LdrOpc = AFI->isThumbFunction() ? ARM::t2LDR_POST :ARM::LDR_POST_IMM; unsigned FltOpc = ARM::VLDMDIA_UPD; - emitPopInst(MBB, MI, CSI, FltOpc, 0, isVarArg, true, &isARMArea3Register); + emitPopInst(MBB, MI, CSI, FltOpc, 0, isVarArg, true, &isARMArea3Register, + NumAlignedDPRCS2Regs); emitPopInst(MBB, MI, CSI, PopOpc, LdrOpc, isVarArg, false, - &isARMArea2Register); + &isARMArea2Register, 0); emitPopInst(MBB, MI, CSI, PopOpc, LdrOpc, isVarArg, false, - &isARMArea1Register); + &isARMArea1Register, 0); return true; } @@ -736,26 +1205,6 @@ static unsigned GetFunctionSizeInBytes(const MachineFunction &MF, return FnSize; } -/// estimateStackSize - Estimate and return the size of the frame. -/// FIXME: Make generic? 
-static unsigned estimateStackSize(MachineFunction &MF) { - const MachineFrameInfo *FFI = MF.getFrameInfo(); - int Offset = 0; - for (int i = FFI->getObjectIndexBegin(); i != 0; ++i) { - int FixedOff = -FFI->getObjectOffset(i); - if (FixedOff > Offset) Offset = FixedOff; - } - for (unsigned i = 0, e = FFI->getObjectIndexEnd(); i != e; ++i) { - if (FFI->isDeadObjectIndex(i)) - continue; - Offset += FFI->getObjectSize(i); - unsigned Align = FFI->getObjectAlignment(i); - // Adjust to alignment boundary - Offset = (Offset+Align-1)/Align*Align; - } - return (unsigned)Offset; -} - /// estimateRSStackSizeLimit - Look at each instruction that references stack /// frames and return the stack size limit beyond which some of these /// instructions will require a scratch register during their expansion later. @@ -809,6 +1258,56 @@ static unsigned estimateRSStackSizeLimit(MachineFunction &MF, return Limit; } +// In functions that realign the stack, it can be an advantage to spill the +// callee-saved vector registers after realigning the stack. The vst1 and vld1 +// instructions take alignment hints that can improve performance. +// +static void checkNumAlignedDPRCS2Regs(MachineFunction &MF) { + MF.getInfo()->setNumAlignedDPRCS2Regs(0); + if (!SpillAlignedNEONRegs) + return; + + // Naked functions don't spill callee-saved registers. + if (MF.getFunction()->getAttributes().hasAttribute(AttributeSet::FunctionIndex, + Attribute::Naked)) + return; + + // We are planning to use NEON instructions vst1 / vld1. + if (!MF.getTarget().getSubtarget().hasNEON()) + return; + + // Don't bother if the default stack alignment is sufficiently high. + if (MF.getTarget().getFrameLowering()->getStackAlignment() >= 8) + return; + + // Aligned spills require stack realignment. + const ARMBaseRegisterInfo *RegInfo = + static_cast(MF.getTarget().getRegisterInfo()); + if (!RegInfo->canRealignStack(MF)) + return; + + // We always spill contiguous d-registers starting from d8. 
Count how many + // need spilling. The register allocator will almost always use the + // callee-saved registers in order, but it can happen that there are holes in + // the range. Registers above the hole will be spilled to the standard DPRCS + // area. + MachineRegisterInfo &MRI = MF.getRegInfo(); + unsigned NumSpills = 0; + for (; NumSpills < 8; ++NumSpills) + if (!MRI.isPhysRegUsed(ARM::D8 + NumSpills)) + break; + + // Don't do this for just one d-register. It's not worth it. + if (NumSpills < 2) + return; + + // Spill the first NumSpills D-registers after realigning the stack. + MF.getInfo()->setNumAlignedDPRCS2Regs(NumSpills); + + // A scratch register is required for the vst1 / vld1 instructions. + MF.getRegInfo().setPhysRegUsed(ARM::R4); +} + void ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, RegScavenger *RS) const { @@ -828,6 +1327,7 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, *static_cast(MF.getTarget().getInstrInfo()); ARMFunctionInfo *AFI = MF.getInfo(); MachineFrameInfo *MFI = MF.getFrameInfo(); + MachineRegisterInfo &MRI = MF.getRegInfo(); unsigned FramePtr = RegInfo->getFrameRegister(MF); // Spill R4 if Thumb2 function requires stack realignment - it will be used as @@ -837,12 +1337,12 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, // FIXME: It will be better just to find spare register here. if (AFI->isThumb2Function() && (MFI->hasVarSizedObjects() || RegInfo->needsStackRealignment(MF))) - MF.getRegInfo().setPhysRegUsed(ARM::R4); + MRI.setPhysRegUsed(ARM::R4); if (AFI->isThumb1OnlyFunction()) { // Spill LR if Thumb1 function uses variable length argument lists. - if (AFI->getVarArgsRegSaveSize() > 0) - MF.getRegInfo().setPhysRegUsed(ARM::LR); + if (AFI->getArgRegsSaveSize() > 0) + MRI.setPhysRegUsed(ARM::LR); // Spill R4 if Thumb1 epilogue has to restore SP from FP. 
We don't know // for sure what the stack size will be, but for this, an estimate is good @@ -850,42 +1350,36 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, // we've used all the registers and so R4 is already used, so not marking // it here will be OK. // FIXME: It will be better just to find spare register here. - unsigned StackSize = estimateStackSize(MF); + unsigned StackSize = MFI->estimateStackSize(MF); if (MFI->hasVarSizedObjects() || StackSize > 508) - MF.getRegInfo().setPhysRegUsed(ARM::R4); + MRI.setPhysRegUsed(ARM::R4); } + // See if we can spill vector registers to aligned stack. + checkNumAlignedDPRCS2Regs(MF); + // Spill the BasePtr if it's used. if (RegInfo->hasBasePointer(MF)) - MF.getRegInfo().setPhysRegUsed(RegInfo->getBaseRegister()); + MRI.setPhysRegUsed(RegInfo->getBaseRegister()); // Don't spill FP if the frame can be eliminated. This is determined // by scanning the callee-save registers to see if any is used. - const unsigned *CSRegs = RegInfo->getCalleeSavedRegs(); + const uint16_t *CSRegs = RegInfo->getCalleeSavedRegs(&MF); for (unsigned i = 0; CSRegs[i]; ++i) { unsigned Reg = CSRegs[i]; bool Spilled = false; - if (MF.getRegInfo().isPhysRegUsed(Reg)) { + if (MRI.isPhysRegUsed(Reg)) { Spilled = true; CanEliminateFrame = false; - } else { - // Check alias registers too. 
- for (const unsigned *Aliases = - RegInfo->getAliasSet(Reg); *Aliases; ++Aliases) { - if (MF.getRegInfo().isPhysRegUsed(*Aliases)) { - Spilled = true; - CanEliminateFrame = false; - } - } } - if (!ARM::GPRRegisterClass->contains(Reg)) + if (!ARM::GPRRegClass.contains(Reg)) continue; if (Spilled) { NumGPRSpills++; - if (!STI.isTargetDarwin()) { + if (!STI.isTargetMachO()) { if (Reg == ARM::LR) LRSpilled = true; CS1Spilled = true; @@ -897,6 +1391,8 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, case ARM::LR: LRSpilled = true; // Fallthrough + case ARM::R0: case ARM::R1: + case ARM::R2: case ARM::R3: case ARM::R4: case ARM::R5: case ARM::R6: case ARM::R7: CS1Spilled = true; @@ -905,12 +1401,14 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, break; } } else { - if (!STI.isTargetDarwin()) { + if (!STI.isTargetMachO()) { UnspilledCS1GPRs.push_back(Reg); continue; } switch (Reg) { + case ARM::R0: case ARM::R1: + case ARM::R2: case ARM::R3: case ARM::R4: case ARM::R5: case ARM::R6: case ARM::R7: case ARM::LR: @@ -951,7 +1449,8 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, // worth the effort and added fragility? bool BigStack = (RS && - (estimateStackSize(MF) + ((hasFP(MF) && AFI->hasStackFrame()) ? 4:0) >= + (MFI->estimateStackSize(MF) + + ((hasFP(MF) && AFI->hasStackFrame()) ? 4:0) >= estimateRSStackSizeLimit(MF, this))) || MFI->hasVarSizedObjects() || (MFI->adjustsStack() && !canSimplifyCallFramePseudos(MF)); @@ -963,16 +1462,20 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, // If LR is not spilled, but at least one of R4, R5, R6, and R7 is spilled. // Spill LR as well so we can fold BX_RET to the registers restore (LDM). 
if (!LRSpilled && CS1Spilled) { - MF.getRegInfo().setPhysRegUsed(ARM::LR); + MRI.setPhysRegUsed(ARM::LR); NumGPRSpills++; - UnspilledCS1GPRs.erase(std::find(UnspilledCS1GPRs.begin(), - UnspilledCS1GPRs.end(), (unsigned)ARM::LR)); + SmallVectorImpl::iterator LRPos; + LRPos = std::find(UnspilledCS1GPRs.begin(), UnspilledCS1GPRs.end(), + (unsigned)ARM::LR); + if (LRPos != UnspilledCS1GPRs.end()) + UnspilledCS1GPRs.erase(LRPos); + ForceLRSpill = false; ExtraCSSpill = true; } if (hasFP(MF)) { - MF.getRegInfo().setPhysRegUsed(FramePtr); + MRI.setPhysRegUsed(FramePtr); NumGPRSpills++; } @@ -987,16 +1490,16 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, // Don't spill high register if the function is thumb1 if (!AFI->isThumb1OnlyFunction() || isARMLowRegister(Reg) || Reg == ARM::LR) { - MF.getRegInfo().setPhysRegUsed(Reg); - if (!RegInfo->isReservedReg(MF, Reg)) + MRI.setPhysRegUsed(Reg); + if (!MRI.isReserved(Reg)) ExtraCSSpill = true; break; } } } else if (!UnspilledCS2GPRs.empty() && !AFI->isThumb1OnlyFunction()) { unsigned Reg = UnspilledCS2GPRs.front(); - MF.getRegInfo().setPhysRegUsed(Reg); - if (!RegInfo->isReservedReg(MF, Reg)) + MRI.setPhysRegUsed(Reg); + if (!MRI.isReserved(Reg)) ExtraCSSpill = true; } } @@ -1014,7 +1517,7 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, while (NumExtras && !UnspilledCS1GPRs.empty()) { unsigned Reg = UnspilledCS1GPRs.back(); UnspilledCS1GPRs.pop_back(); - if (!RegInfo->isReservedReg(MF, Reg) && + if (!MRI.isReserved(Reg) && (!AFI->isThumb1OnlyFunction() || isARMLowRegister(Reg) || Reg == ARM::LR)) { Extras.push_back(Reg); @@ -1026,7 +1529,7 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, while (NumExtras && !UnspilledCS2GPRs.empty()) { unsigned Reg = UnspilledCS2GPRs.back(); UnspilledCS2GPRs.pop_back(); - if (!RegInfo->isReservedReg(MF, Reg)) { + if (!MRI.isReserved(Reg)) { Extras.push_back(Reg); NumExtras--; } @@ -1034,13 +1537,13 @@ 
ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, } if (Extras.size() && NumExtras == 0) { for (unsigned i = 0, e = Extras.size(); i != e; ++i) { - MF.getRegInfo().setPhysRegUsed(Extras[i]); + MRI.setPhysRegUsed(Extras[i]); } } else if (!AFI->isThumb1OnlyFunction()) { // note: Thumb1 functions spill to R12, not the stack. Reserve a slot // closest to SP or frame pointer. - const TargetRegisterClass *RC = ARM::GPRRegisterClass; - RS->setScavengingFrameIndex(MFI->CreateStackObject(RC->getSize(), + const TargetRegisterClass *RC = &ARM::GPRRegClass; + RS->addScavengingFrameIndex(MFI->CreateStackObject(RC->getSize(), RC->getAlignment(), false)); } @@ -1048,7 +1551,55 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, } if (ForceLRSpill) { - MF.getRegInfo().setPhysRegUsed(ARM::LR); + MRI.setPhysRegUsed(ARM::LR); AFI->setLRIsSpilledForFarJump(true); } } + + +void ARMFrameLowering:: +eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const { + const ARMBaseInstrInfo &TII = + *static_cast(MF.getTarget().getInstrInfo()); + if (!hasReservedCallFrame(MF)) { + // If we have alloca, convert as follows: + // ADJCALLSTACKDOWN -> sub, sp, sp, amount + // ADJCALLSTACKUP -> add, sp, sp, amount + MachineInstr *Old = I; + DebugLoc dl = Old->getDebugLoc(); + unsigned Amount = Old->getOperand(0).getImm(); + if (Amount != 0) { + // We need to keep the stack aligned properly. To do this, we round the + // amount of space needed for the outgoing arguments up to the next + // alignment boundary. + unsigned Align = getStackAlignment(); + Amount = (Amount+Align-1)/Align*Align; + + ARMFunctionInfo *AFI = MF.getInfo(); + assert(!AFI->isThumb1OnlyFunction() && + "This eliminateCallFramePseudoInstr does not support Thumb1!"); + bool isARM = !AFI->isThumbFunction(); + + // Replace the pseudo instruction with a new instruction... 
+ unsigned Opc = Old->getOpcode(); + int PIdx = Old->findFirstPredOperandIdx(); + ARMCC::CondCodes Pred = (PIdx == -1) + ? ARMCC::AL : (ARMCC::CondCodes)Old->getOperand(PIdx).getImm(); + if (Opc == ARM::ADJCALLSTACKDOWN || Opc == ARM::tADJCALLSTACKDOWN) { + // Note: PredReg is operand 2 for ADJCALLSTACKDOWN. + unsigned PredReg = Old->getOperand(2).getReg(); + emitSPUpdate(isARM, MBB, I, dl, TII, -Amount, MachineInstr::NoFlags, + Pred, PredReg); + } else { + // Note: PredReg is operand 3 for ADJCALLSTACKUP. + unsigned PredReg = Old->getOperand(3).getReg(); + assert(Opc == ARM::ADJCALLSTACKUP || Opc == ARM::tADJCALLSTACKUP); + emitSPUpdate(isARM, MBB, I, dl, TII, Amount, MachineInstr::NoFlags, + Pred, PredReg); + } + } + } + MBB.erase(I); +} +