From: Tim Northover Date: Fri, 8 Nov 2013 17:18:07 +0000 (+0000) Subject: ARM: fold prologue/epilogue sp updates into push/pop for code size X-Git-Url: http://plrg.eecs.uci.edu/git/?a=commitdiff_plain;h=323ac85d6ad7ba5d9593d8e151d879bd91d82e08;p=oota-llvm.git ARM: fold prologue/epilogue sp updates into push/pop for code size ARM prologues usually look like: push {r7, lr} sub sp, sp, #4 If code size is extremely important, this can be optimised to the single instruction: push {r6, r7, lr} where we don't actually care about the contents of r6, but pushing it subtracts 4 from sp as a side effect. This should implement such a conversion, predicated on the "minsize" function attribute (-Oz) since I've yet to find any code it actually makes faster. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@194264 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp index d97f2469e83..7187d6a665b 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -1857,6 +1857,103 @@ void llvm::emitARMRegPlusImmediate(MachineBasicBlock &MBB, } } +bool llvm::tryFoldSPUpdateIntoPushPop(MachineFunction &MF, + MachineInstr *MI, + unsigned NumBytes) { + // This optimisation potentially adds lots of load and store + // micro-operations, it's only really a great benefit to code-size. + if (!MF.getFunction()->hasFnAttribute(Attribute::MinSize)) + return false; + + // If only one register is pushed/popped, LLVM can use an LDR/STR + // instead. We can't modify those so make sure we're dealing with an + // instruction we understand. 
+ bool IsPop = isPopOpcode(MI->getOpcode()); + bool IsPush = isPushOpcode(MI->getOpcode()); + if (!IsPush && !IsPop) + return false; + + bool IsVFPPushPop = MI->getOpcode() == ARM::VSTMDDB_UPD || + MI->getOpcode() == ARM::VLDMDIA_UPD; + bool IsT1PushPop = MI->getOpcode() == ARM::tPUSH || + MI->getOpcode() == ARM::tPOP || + MI->getOpcode() == ARM::tPOP_RET; + + assert((IsT1PushPop || (MI->getOperand(0).getReg() == ARM::SP && + MI->getOperand(1).getReg() == ARM::SP)) && + "trying to fold sp update into non-sp-updating push/pop"); + + // The VFP push & pop act on D-registers, so we can only fold an adjustment + // by a multiple of 8 bytes in correctly. Similarly rN is 4-bytes. Don't try + // if this is violated. + if (NumBytes % (IsVFPPushPop ? 8 : 4) != 0) + return false; + + // ARM and Thumb2 push/pop insts have explicit "sp, sp" operands (+ + // pred) so the list starts at 4. Thumb1 starts after the predicate. + int RegListIdx = IsT1PushPop ? 2 : 4; + + // Calculate the space we'll need in terms of registers. + unsigned FirstReg = MI->getOperand(RegListIdx).getReg(); + unsigned RD0Reg, RegsNeeded; + if (IsVFPPushPop) { + RD0Reg = ARM::D0; + RegsNeeded = NumBytes / 8; + } else { + RD0Reg = ARM::R0; + RegsNeeded = NumBytes / 4; + } + + // We're going to have to strip all list operands off before + // re-adding them since the order matters, so save the existing ones + // for later. + SmallVector<MachineOperand, 4> RegList; + for (int i = MI->getNumOperands() - 1; i >= RegListIdx; --i) + RegList.push_back(MI->getOperand(i)); + + MachineBasicBlock *MBB = MI->getParent(); + const TargetRegisterInfo *TRI = MF.getRegInfo().getTargetRegisterInfo(); + + // Now try to find enough space in the reglist to allocate NumBytes. + for (unsigned CurReg = FirstReg - 1; CurReg >= RD0Reg && RegsNeeded; + --CurReg, --RegsNeeded) { + if (!IsPop) { + // Pushing any register is completely harmless, mark the + // register involved as undef since we don't care about it in + // the slightest. 
+ RegList.push_back(MachineOperand::CreateReg(CurReg, false, false, + false, false, true)); + continue; + } + + // However, we can only pop an extra register if it's not live. Otherwise we + // might clobber a return value register. We assume that once we find a live + // return register all lower ones will be too so there's no use proceeding. + if (MBB->computeRegisterLiveness(TRI, CurReg, MI) != + MachineBasicBlock::LQR_Dead) + return false; + + // Mark the unimportant registers as in the POP. + RegList.push_back(MachineOperand::CreateReg(CurReg, true, false, true)); + } + + if (RegsNeeded > 0) + return false; + + // Finally we know we can profitably perform the optimisation so go + // ahead: strip all existing registers off and add them back again + // in the right order. + for (int i = MI->getNumOperands() - 1; i >= RegListIdx; --i) + MI->RemoveOperand(i); + + // Add the complete list back in. + MachineInstrBuilder MIB(MF, &*MI); + for (int i = RegList.size() - 1; i >= 0; --i) + MIB.addOperand(RegList[i]); + + return true; +} + bool llvm::rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx, unsigned FrameReg, int &Offset, const ARMBaseInstrInfo &TII) { diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h index 8ab06fd8c22..93e59647d22 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/lib/Target/ARM/ARMBaseInstrInfo.h @@ -362,6 +362,17 @@ bool isIndirectBranchOpcode(int Opc) { return Opc == ARM::BX || Opc == ARM::MOVPCRX || Opc == ARM::tBRIND; } +static inline bool isPopOpcode(int Opc) { + return Opc == ARM::tPOP_RET || Opc == ARM::LDMIA_RET || + Opc == ARM::t2LDMIA_RET || Opc == ARM::tPOP || Opc == ARM::LDMIA_UPD || + Opc == ARM::t2LDMIA_UPD || Opc == ARM::VLDMDIA_UPD; +} + +static inline bool isPushOpcode(int Opc) { + return Opc == ARM::tPUSH || Opc == ARM::t2STMDB_UPD || + Opc == ARM::STMDB_UPD || Opc == ARM::VSTMDDB_UPD; +} + /// getInstrPredicate - If instruction is predicated, returns its predicate /// condition, 
otherwise returns AL. It also returns the condition code /// register by reference. @@ -401,6 +412,13 @@ void emitThumbRegPlusImmediate(MachineBasicBlock &MBB, const ARMBaseRegisterInfo& MRI, unsigned MIFlags = 0); +/// Tries to add registers to the reglist of a given base-updating +/// push/pop instruction to adjust the stack by an additional +/// NumBytes. This can save a few bytes per function in code-size, but +/// obviously generates more memory traffic. As such, it only takes +/// effect in functions being optimised for size. +bool tryFoldSPUpdateIntoPushPop(MachineFunction &MF, MachineInstr *MI, + unsigned NumBytes); /// rewriteARMFrameIndex / rewriteT2FrameIndex - /// Rewrite MI to access 'Offset' bytes from the FP. Return false if the diff --git a/lib/Target/ARM/ARMFrameLowering.cpp b/lib/Target/ARM/ARMFrameLowering.cpp index a0cf54d81e0..7b02803c51f 100644 --- a/lib/Target/ARM/ARMFrameLowering.cpp +++ b/lib/Target/ARM/ARMFrameLowering.cpp @@ -93,11 +93,7 @@ static bool isCSRestore(MachineInstr *MI, const ARMBaseInstrInfo &TII, const uint16_t *CSRegs) { // Integer spill area is handled with "pop". - if (MI->getOpcode() == ARM::LDMIA_RET || - MI->getOpcode() == ARM::t2LDMIA_RET || - MI->getOpcode() == ARM::LDMIA_UPD || - MI->getOpcode() == ARM::t2LDMIA_UPD || - MI->getOpcode() == ARM::VLDMDIA_UPD) { + if (isPopOpcode(MI->getOpcode())) { // The first two operands are predicates. The last two are // imp-def and imp-use of SP. Check everything in between. for (int i = 5, e = MI->getNumOperands(); i != e; ++i) @@ -221,42 +217,37 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const { } // Move past area 1. - if (GPRCS1Size > 0) MBBI++; + MachineBasicBlock::iterator LastPush = MBB.end(), FramePtrPush; + if (GPRCS1Size > 0) + FramePtrPush = LastPush = MBBI++; // Determine starting offsets of spill areas. 
bool HasFP = hasFP(MF); unsigned DPRCSOffset = NumBytes - (GPRCS1Size + GPRCS2Size + DPRCSSize); unsigned GPRCS2Offset = DPRCSOffset + DPRCSSize; unsigned GPRCS1Offset = GPRCS2Offset + GPRCS2Size; - if (HasFP) + int FramePtrOffsetInPush = 0; + if (HasFP) { + FramePtrOffsetInPush = MFI->getObjectOffset(FramePtrSpillFI) + GPRCS1Size; AFI->setFramePtrSpillOffset(MFI->getObjectOffset(FramePtrSpillFI) + NumBytes); + } AFI->setGPRCalleeSavedArea1Offset(GPRCS1Offset); AFI->setGPRCalleeSavedArea2Offset(GPRCS2Offset); AFI->setDPRCalleeSavedAreaOffset(DPRCSOffset); - // Set FP to point to the stack slot that contains the previous FP. - // For iOS, FP is R7, which has now been stored in spill area 1. - // Otherwise, if this is not iOS, all the callee-saved registers go - // into spill area 1, including the FP in R11. In either case, it is - // now safe to emit this assignment. - if (HasFP) { - int FramePtrOffset = MFI->getObjectOffset(FramePtrSpillFI) + GPRCS1Size; - emitRegPlusImmediate(!AFI->isThumbFunction(), MBB, MBBI, dl, TII, - FramePtr, ARM::SP, FramePtrOffset, - MachineInstr::FrameSetup); - } - // Move past area 2. - if (GPRCS2Size > 0) MBBI++; + if (GPRCS2Size > 0) { + LastPush = MBBI++; + } // Move past area 3. if (DPRCSSize > 0) { - MBBI++; + LastPush = MBBI++; // Since vpush register list cannot have gaps, there may be multiple vpush // instructions in the prologue. while (MBBI->getOpcode() == ARM::VSTMDDB_UPD) - MBBI++; + LastPush = MBBI++; } // Move past the aligned DPRCS2 area. @@ -272,8 +263,12 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const { if (NumBytes) { // Adjust SP after all the callee-save spills. - emitSPUpdate(isARM, MBB, MBBI, dl, TII, -NumBytes, - MachineInstr::FrameSetup); + if (tryFoldSPUpdateIntoPushPop(MF, LastPush, NumBytes)) + FramePtrOffsetInPush += NumBytes; + else + emitSPUpdate(isARM, MBB, MBBI, dl, TII, -NumBytes, + MachineInstr::FrameSetup); + if (HasFP && isARM) // Restore from fp only in ARM mode: e.g. 
sub sp, r7, #24 // Note it's not safe to do this in Thumb2 mode because it would have @@ -286,6 +281,18 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const { AFI->setShouldRestoreSPFromFP(true); } + // Set FP to point to the stack slot that contains the previous FP. + // For iOS, FP is R7, which has now been stored in spill area 1. + // Otherwise, if this is not iOS, all the callee-saved registers go + // into spill area 1, including the FP in R11. In either case, it + // is in area one and the adjustment needs to take place just after + // that push. + if (HasFP) + emitRegPlusImmediate(!AFI->isThumbFunction(), MBB, ++FramePtrPush, dl, TII, + FramePtr, ARM::SP, FramePtrOffsetInPush, + MachineInstr::FrameSetup); + + if (STI.isTargetELF() && hasFP(MF)) MFI->setOffsetAdjustment(MFI->getOffsetAdjustment() - AFI->getFramePtrSpillOffset()); @@ -380,12 +387,17 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF, if (NumBytes != 0) emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes); } else { + MachineBasicBlock::iterator FirstPop = MBBI; + // Unwind MBBI to point to first LDR / VLDRD. const uint16_t *CSRegs = RegInfo->getCalleeSavedRegs(&MF); if (MBBI != MBB.begin()) { - do + do { + if (isPopOpcode(MBBI->getOpcode())) + FirstPop = MBBI; + --MBBI; - while (MBBI != MBB.begin() && isCSRestore(MBBI, TII, CSRegs)); + } while (MBBI != MBB.begin() && isCSRestore(MBBI, TII, CSRegs)); if (!isCSRestore(MBBI, TII, CSRegs)) ++MBBI; } @@ -429,8 +441,8 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF, ARM::SP) .addReg(FramePtr)); } - } else if (NumBytes) - emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes); + } else if (NumBytes && !tryFoldSPUpdateIntoPushPop(MF, FirstPop, NumBytes)) + emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes); // Increment past our save areas. 
if (AFI->getDPRCalleeSavedAreaSize()) { diff --git a/lib/Target/ARM/Thumb1FrameLowering.cpp b/lib/Target/ARM/Thumb1FrameLowering.cpp index cfaa792fc9b..d921c82cfb0 100644 --- a/lib/Target/ARM/Thumb1FrameLowering.cpp +++ b/lib/Target/ARM/Thumb1FrameLowering.cpp @@ -164,11 +164,17 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF) const { AFI->setDPRCalleeSavedAreaOffset(DPRCSOffset); NumBytes = DPRCSOffset; + int FramePtrOffsetInBlock = 0; + if (tryFoldSPUpdateIntoPushPop(MF, prior(MBBI), NumBytes)) { + FramePtrOffsetInBlock = NumBytes; + NumBytes = 0; + } + // Adjust FP so it point to the stack slot that contains the previous FP. if (HasFP) { - int FramePtrOffset = MFI->getObjectOffset(FramePtrSpillFI) + GPRCS1Size; + FramePtrOffsetInBlock += MFI->getObjectOffset(FramePtrSpillFI) + GPRCS1Size; AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tADDrSPi), FramePtr) - .addReg(ARM::SP).addImm(FramePtrOffset / 4) + .addReg(ARM::SP).addImm(FramePtrOffsetInBlock / 4) .setMIFlags(MachineInstr::FrameSetup)); if (NumBytes > 508) // If offset is > 508 then sp cannot be adjusted in a single instruction, @@ -292,8 +298,9 @@ void Thumb1FrameLowering::emitEpilogue(MachineFunction &MF, &MBB.front() != MBBI && prior(MBBI)->getOpcode() == ARM::tPOP) { MachineBasicBlock::iterator PMBBI = prior(MBBI); - emitSPUpdate(MBB, PMBBI, TII, dl, *RegInfo, NumBytes); - } else + if (!tryFoldSPUpdateIntoPushPop(MF, PMBBI, NumBytes)) + emitSPUpdate(MBB, PMBBI, TII, dl, *RegInfo, NumBytes); + } else if (!tryFoldSPUpdateIntoPushPop(MF, MBBI, NumBytes)) emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, NumBytes); } } diff --git a/test/CodeGen/ARM/fold-stack-adjust.ll b/test/CodeGen/ARM/fold-stack-adjust.ll new file mode 100644 index 00000000000..c8c48faffb2 --- /dev/null +++ b/test/CodeGen/ARM/fold-stack-adjust.ll @@ -0,0 +1,126 @@ +; RUN: llc -mtriple=thumbv7-apple-darwin-eabi < %s | FileCheck %s +; RUN: llc -mtriple=thumbv6m-apple-darwin-eabi -disable-fp-elim < %s | FileCheck %s 
--check-prefix=CHECK-T1 +; RUN: llc -mtriple=thumbv7-apple-darwin-ios -disable-fp-elim < %s | FileCheck %s --check-prefix=CHECK-IOS + + +declare void @bar(i8*) + +%bigVec = type [2 x double] + +@var = global %bigVec zeroinitializer + +define void @check_simple() minsize { +; CHECK-LABEL: check_simple: +; CHECK: push.w {r7, r8, r9, r10, r11, lr} +; CHECK-NOT: sub sp, sp, +; ... +; CHECK-NOT: add sp, sp, +; CHECK: pop.w {r7, r8, r9, r10, r11, pc} + +; CHECK-T1-LABEL: check_simple: +; CHECK-T1: push {r3, r4, r5, r6, r7, lr} +; CHECK-T1: add r7, sp, #16 +; CHECK-T1-NOT: sub sp, sp, +; ... +; CHECK-T1-NOT: add sp, sp, +; CHECK-T1: pop {r3, r4, r5, r6, r7, pc} + + ; iOS always has a frame pointer and messing with the push affects + ; how it's set in the prologue. Make sure we get that right. +; CHECK-IOS-LABEL: check_simple: +; CHECK-IOS: push {r3, r4, r5, r6, r7, lr} +; CHECK-NOT: sub sp, +; CHECK-IOS: add r7, sp, #16 +; CHECK-NOT: sub sp, +; ... +; CHECK-NOT: add sp, +; CHECK-IOS: pop {r3, r4, r5, r6, r7, pc} + + %var = alloca i8, i32 16 + call void @bar(i8* %var) + ret void +} + +define void @check_simple_too_big() minsize { +; CHECK-LABEL: check_simple_too_big: +; CHECK: push.w {r11, lr} +; CHECK: sub sp, +; ... +; CHECK: add sp, +; CHECK: pop.w {r11, pc} + %var = alloca i8, i32 64 + call void @bar(i8* %var) + ret void +} + +define void @check_vfp_fold() minsize { +; CHECK-LABEL: check_vfp_fold: +; CHECK: push {r[[GLOBREG:[0-9]+]], lr} +; CHECK: vpush {d6, d7, d8, d9} +; CHECK-NOT: sub sp, +; ... +; CHECK: vldmia r[[GLOBREG]], {d8, d9} +; ... +; CHECK-NOT: add sp, +; CHECK: vpop {d6, d7, d8, d9} +; CHECK: pop {r[[GLOBREG]], pc} + + ; iOS uses aligned NEON stores here, which is convenient since we + ; want to make sure that works too. +; CHECK-IOS-LABEL: check_vfp_fold: +; CHECK-IOS: push {r0, r1, r2, r3, r4, r7, lr} +; CHECK-IOS: sub.w r4, sp, #16 +; CHECK-IOS: bic r4, r4, #15 +; CHECK-IOS: mov sp, r4 +; CHECK-IOS: vst1.64 {d8, d9}, [r4:128] +; ... 
+; CHECK-IOS: add r4, sp, #16 +; CHECK-IOS: vld1.64 {d8, d9}, [r4:128] +; CHECK-IOS: mov sp, r4 +; CHECK-IOS: pop {r4, r7, pc} + + %var = alloca i8, i32 16 + + %tmp = load %bigVec* @var + call void @bar(i8* %var) + store %bigVec %tmp, %bigVec* @var + + ret void +} + +; This function should use just enough space that the "add sp, sp, ..." could be +; folded in except that doing so would clobber the value being returned. +define i64 @check_no_return_clobber() minsize { +; CHECK-LABEL: check_no_return_clobber: +; CHECK: push.w {r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NOT: sub sp, +; ... +; CHECK: add sp, #40 +; CHECK: pop.w {r11, pc} + + ; Just to keep iOS FileCheck within previous function: +; CHECK-IOS-LABEL: check_no_return_clobber: + + %var = alloca i8, i32 40 + call void @bar(i8* %var) + ret i64 0 +} + +define arm_aapcs_vfpcc double @check_vfp_no_return_clobber() minsize { +; CHECK-LABEL: check_vfp_no_return_clobber: +; CHECK: push {r[[GLOBREG:[0-9]+]], lr} +; CHECK: vpush {d0, d1, d2, d3, d4, d5, d6, d7, d8, d9} +; CHECK-NOT: sub sp, +; ... +; CHECK: add sp, #64 +; CHECK: vpop {d8, d9} +; CHECK: pop {r[[GLOBREG]], pc} + + %var = alloca i8, i32 64 + + %tmp = load %bigVec* @var + call void @bar(i8* %var) + store %bigVec %tmp, %bigVec* @var + + ret double 1.0 +}