From 99a92f269d4ea6f13a9858bb883e13382d021120 Mon Sep 17 00:00:00 2001
From: Alexey Samsonov
Date: Mon, 16 Jul 2012 06:54:09 +0000
Subject: [PATCH] This CL changes the function prologue and epilogue emitted
 on X86 when the stack needs realignment. It is intended to fix PR11468.

The old prologue and epilogue looked like this:

  push %rbp
  mov %rsp, %rbp
  and $alignment, %rsp
  push %r14
  push %r15
  ...
  pop %r15
  pop %r14
  mov %rbp, %rsp
  pop %rbp

The problem was describing the locations of callee-saved registers for
exception handling: their locations had to be re-calculated to account for
the stack realignment. It would take some effort to implement this in LLVM,
as currently MachineLocation can only have the form "Register + Offset".

The function prologue and epilogue are now changed to:

  push %rbp
  mov %rsp, %rbp
  push %r14
  push %r15
  and $alignment, %rsp
  ...
  lea -$size_of_saved_registers(%rbp), %rsp
  pop %r15
  pop %r14
  pop %rbp

Reviewed by Chad Rosier.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@160248 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86FrameLowering.cpp          | 100 +++++++++----------
 test/CodeGen/X86/dynamic-allocas-VLAs.ll     |  23 ++---
 test/CodeGen/X86/force-align-stack-alloca.ll |  16 ++-
 test/CodeGen/X86/pr11468.ll                  |  33 ++++++
 4 files changed, 95 insertions(+), 77 deletions(-)
 create mode 100644 test/CodeGen/X86/pr11468.ll

diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp
index bf0ba09e23a..2775736717d 100644
--- a/lib/Target/X86/X86FrameLowering.cpp
+++ b/lib/Target/X86/X86FrameLowering.cpp
@@ -722,10 +722,14 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
   if (HasFP) {
     // Calculate required stack adjustment.
     uint64_t FrameSize = StackSize - SlotSize;
-    if (RegInfo->needsStackRealignment(MF))
-      FrameSize = (FrameSize + MaxAlign - 1) / MaxAlign * MaxAlign;
-
-    NumBytes = FrameSize - X86FI->getCalleeSavedFrameSize();
+    if (RegInfo->needsStackRealignment(MF)) {
+      // Callee-saved registers are pushed on stack before the stack
+      // is realigned.
+      FrameSize -= X86FI->getCalleeSavedFrameSize();
+      NumBytes = (FrameSize + MaxAlign - 1) / MaxAlign * MaxAlign;
+    } else {
+      NumBytes = FrameSize - X86FI->getCalleeSavedFrameSize();
+    }
 
     // Get the offset of the stack slot for the EBP register, which is
     // guaranteed to be the last slot by processFunctionBeforeFrameFinalized.
@@ -782,19 +786,6 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
     for (MachineFunction::iterator I = llvm::next(MF.begin()), E = MF.end();
          I != E; ++I)
       I->addLiveIn(FramePtr);
-
-    // Realign stack
-    if (RegInfo->needsStackRealignment(MF)) {
-      MachineInstr *MI =
-        BuildMI(MBB, MBBI, DL,
-                TII.get(Is64Bit ? X86::AND64ri32 : X86::AND32ri), StackPtr)
-        .addReg(StackPtr)
-        .addImm(-MaxAlign)
-        .setMIFlag(MachineInstr::FrameSetup);
-
-      // The EFLAGS implicit def is dead.
-      MI->getOperand(3).setIsDead();
-    }
   } else {
     NumBytes = StackSize - X86FI->getCalleeSavedFrameSize();
   }
@@ -824,6 +815,27 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
     }
   }
 
+  // Realign stack after we pushed callee-saved registers (so that we'll be
+  // able to calculate their offsets from the frame pointer).
+
+  // NOTE: We push the registers before realigning the stack, so
+  // vector callee-saved (xmm) registers may be saved w/o proper
+  // alignment in this way. However, currently these regs are saved in
+  // stack slots (see X86FrameLowering::spillCalleeSavedRegisters()), so
+  // this shouldn't be a problem.
+  if (RegInfo->needsStackRealignment(MF)) {
+    assert(HasFP && "There should be a frame pointer if stack is realigned.");
+    MachineInstr *MI =
+      BuildMI(MBB, MBBI, DL,
+              TII.get(Is64Bit ? X86::AND64ri32 : X86::AND32ri), StackPtr)
+      .addReg(StackPtr)
+      .addImm(-MaxAlign)
+      .setMIFlag(MachineInstr::FrameSetup);
+
+    // The EFLAGS implicit def is dead.
+    MI->getOperand(3).setIsDead();
+  }
+
   DL = MBB.findDebugLoc(MBBI);
 
   // If there is an SUB32ri of ESP immediately before this instruction, merge
@@ -975,7 +987,6 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
   unsigned SlotSize = RegInfo->getSlotSize();
   unsigned FramePtr = RegInfo->getFrameRegister(MF);
   unsigned StackPtr = RegInfo->getStackRegister();
-  unsigned BasePtr = RegInfo->getBaseRegister();
 
   switch (RetOpcode) {
   default:
@@ -1013,10 +1024,14 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
   if (hasFP(MF)) {
     // Calculate required stack adjustment.
     uint64_t FrameSize = StackSize - SlotSize;
-    if (RegInfo->needsStackRealignment(MF))
-      FrameSize = (FrameSize + MaxAlign - 1)/MaxAlign*MaxAlign;
-
-    NumBytes = FrameSize - CSSize;
+    if (RegInfo->needsStackRealignment(MF)) {
+      // Callee-saved registers were pushed on stack before the stack
+      // was realigned.
+      FrameSize -= CSSize;
+      NumBytes = (FrameSize + MaxAlign - 1) / MaxAlign * MaxAlign;
+    } else {
+      NumBytes = FrameSize - CSSize;
+    }
 
     // Pop EBP.
     BuildMI(MBB, MBBI, DL,
@@ -1026,7 +1041,6 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
   }
 
   // Skip the callee-saved pop instructions.
-  MachineBasicBlock::iterator LastCSPop = MBBI;
   while (MBBI != MBB.begin()) {
     MachineBasicBlock::iterator PI = prior(MBBI);
     unsigned Opc = PI->getOpcode();
@@ -1037,6 +1051,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
     --MBBI;
   }
+  MachineBasicBlock::iterator FirstCSPop = MBBI;
 
   DL = MBBI->getDebugLoc();
 
@@ -1045,40 +1060,19 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
   if (NumBytes || MFI->hasVarSizedObjects())
     mergeSPUpdatesUp(MBB, MBBI, StackPtr, &NumBytes);
 
-  // Restore the SP from the BP, if necessary.
-  if (RegInfo->hasBasePointer(MF)) {
-    BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr),
-            StackPtr).addReg(BasePtr);
-
-    // When restoring from the BP we must use a cached SP adjustment.
-    NumBytes = X86FI->getBasePtrStackAdjustment();
-  }
-
   // If dynamic alloca is used, then reset esp to point to the last callee-saved
   // slot before popping them off! Same applies for the case, when stack was
   // realigned.
-  if (RegInfo->needsStackRealignment(MF)) {
-    // We cannot use LEA here, because stack pointer was realigned. We need to
-    // deallocate local frame back.
-    if (CSSize) {
-      emitSPUpdate(MBB, MBBI, StackPtr, NumBytes, Is64Bit, UseLEA, TII,
-                   *RegInfo);
-      MBBI = prior(LastCSPop);
-    }
-
-    BuildMI(MBB, MBBI, DL,
-            TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr),
-            StackPtr).addReg(FramePtr);
-  } else if (MFI->hasVarSizedObjects()) {
-    if (CSSize) {
-      unsigned Opc = Is64Bit ? X86::LEA64r : X86::LEA32r;
-      MachineInstr *MI =
-        addRegOffset(BuildMI(MF, DL, TII.get(Opc), StackPtr),
-                     FramePtr, false, -CSSize);
-      MBB.insert(MBBI, MI);
+  if (RegInfo->needsStackRealignment(MF) || MFI->hasVarSizedObjects()) {
+    if (RegInfo->needsStackRealignment(MF))
+      MBBI = FirstCSPop;
+    if (CSSize != 0) {
+      unsigned Opc = getLEArOpcode(Is64Bit);
+      addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr),
+                   FramePtr, false, -CSSize);
     } else {
-      BuildMI(MBB, MBBI, DL,
-              TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr), StackPtr)
+      unsigned Opc = (Is64Bit ? X86::MOV64rr : X86::MOV32rr);
+      BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
         .addReg(FramePtr);
     }
   } else if (NumBytes) {
diff --git a/test/CodeGen/X86/dynamic-allocas-VLAs.ll b/test/CodeGen/X86/dynamic-allocas-VLAs.ll
index c7970d491ee..54ae39b7112 100644
--- a/test/CodeGen/X86/dynamic-allocas-VLAs.ll
+++ b/test/CodeGen/X86/dynamic-allocas-VLAs.ll
@@ -85,20 +85,19 @@ entry:
 ; CHECK: _t4
 ; CHECK: pushq %rbp
 ; CHECK: movq %rsp, %rbp
-; CHECK: andq $-32, %rsp
 ; CHECK: pushq %r14
 ; CHECK: pushq %rbx
-; CHECK: subq $[[STACKADJ:[0-9]+]], %rsp
+; CHECK: andq $-32, %rsp
+; CHECK: subq ${{[0-9]+}}, %rsp
 ; CHECK: movq %rsp, %rbx
 ;
 ; CHECK: leaq {{[0-9]*}}(%rbx), %rdi
 ; CHECK: leaq {{[0-9]*}}(%rbx), %rdx
 ; CHECK: callq _t4_helper
 ;
-; CHECK: addq $[[STACKADJ]], %rsp
+; CHECK: leaq -16(%rbp), %rsp
 ; CHECK: popq %rbx
 ; CHECK: popq %r14
-; CHECK: movq %rbp, %rsp
 ; CHECK: popq %rbp
 }
 
@@ -176,19 +175,17 @@ entry:
 ; CHECK: _t7
 ; CHECK: pushq %rbp
 ; CHECK: movq %rsp, %rbp
-; CHECK: andq $-32, %rsp
 ; CHECK: pushq %rbx
-; CHECK: subq $[[ADJ:[0-9]+]], %rsp
+; CHECK: andq $-32, %rsp
+; CHECK: subq ${{[0-9]+}}, %rsp
 ; CHECK: movq %rsp, %rbx
 
 ; Stack adjustment for byval
 ; CHECK: subq {{.*}}, %rsp
 ; CHECK: callq _bar
 ; CHECK-NOT: addq {{.*}}, %rsp
-; CHECK: movq %rbx, %rsp
-; CHECK: addq $[[ADJ]], %rsp
+; CHECK: leaq -8(%rbp), %rsp
 ; CHECK: popq %rbx
-; CHECK: movq %rbp, %rsp
 ; CHECK: popq %rbp
 }
 
@@ -229,14 +226,12 @@ entry:
 ; FORCE-ALIGN: _t9
 ; FORCE-ALIGN: pushq %rbp
 ; FORCE-ALIGN: movq %rsp, %rbp
-; FORCE-ALIGN: andq $-32, %rsp
 ; FORCE-ALIGN: pushq %rbx
-; FORCE-ALIGN: subq $24, %rsp
+; FORCE-ALIGN: andq $-32, %rsp
+; FORCE-ALIGN: subq $32, %rsp
 ; FORCE-ALIGN: movq %rsp, %rbx
 
-; FORCE-ALIGN: movq %rbx, %rsp
-; FORCE-ALIGN: addq $24, %rsp
+; FORCE-ALIGN: leaq -8(%rbp), %rsp
 ; FORCE-ALIGN: popq %rbx
-; FORCE-ALIGN: movq %rbp, %rsp
 ; FORCE-ALIGN: popq %rbp
 }
 
diff --git a/test/CodeGen/X86/force-align-stack-alloca.ll b/test/CodeGen/X86/force-align-stack-alloca.ll
index ecef781f880..6d445594370 100644
--- a/test/CodeGen/X86/force-align-stack-alloca.ll
+++ b/test/CodeGen/X86/force-align-stack-alloca.ll
@@ -19,10 +19,10 @@ define i64 @g(i32 %i) nounwind {
 ; CHECK: g:
 ; CHECK: pushl %ebp
 ; CHECK-NEXT: movl %esp, %ebp
-; CHECK-NEXT: andl $-32, %esp
 ; CHECK-NEXT: pushl
 ; CHECK-NEXT: pushl
-; CHECK-NEXT: subl $24, %esp
+; CHECK-NEXT: andl $-32, %esp
+; CHECK-NEXT: subl $32, %esp
 ;
 ; Now setup the base pointer (%ebx).
 ; CHECK-NEXT: movl %esp, %ebx
@@ -46,17 +46,13 @@ define i64 @g(i32 %i) nounwind {
 ; CHECK-NEXT: addl $32, %esp
 ; CHECK-NOT: {{[^ ,]*}}, %esp
 ;
-; Restore %esp from %ebx (base pointer) so we can pop the callee-saved
-; registers. This is the state prior to the allocation of VLAs.
+; Restore %esp from %ebp (frame pointer) and subtract the size of the
+; zone with callee-saved registers so we can pop them.
+; This is the state prior to stack realignment and the allocation of VLAs.
 ; CHECK-NOT: popl
-; CHECK: movl %ebx, %esp
-; CHECK-NEXT: addl $24, %esp
+; CHECK: leal -8(%ebp), %esp
 ; CHECK-NEXT: popl
 ; CHECK-NEXT: popl
-;
-; Finally we need to restore %esp from %ebp due to dynamic stack
-; realignment.
-; CHECK-NEXT: movl %ebp, %esp
 ; CHECK-NEXT: popl %ebp
 ; CHECK-NEXT: ret
 
diff --git a/test/CodeGen/X86/pr11468.ll b/test/CodeGen/X86/pr11468.ll
new file mode 100644
index 00000000000..f7e9adb4a21
--- /dev/null
+++ b/test/CodeGen/X86/pr11468.ll
@@ -0,0 +1,33 @@
+; RUN: llc < %s -force-align-stack -stack-alignment=32 -march=x86-64 -mattr=+avx -mtriple=i686-apple-darwin10 | FileCheck %s
+; PR11468
+
+define void @f(i64 %sz) uwtable {
+entry:
+  %a = alloca i32, align 32
+  store volatile i32 0, i32* %a, align 32
+  ; force to push r14 on stack
+  call void asm sideeffect "nop", "~{r14},~{dirflag},~{fpsr},~{flags}"() nounwind, !srcloc !0
+  ret void
+
+; CHECK: _f
+; CHECK: pushq %rbp
+; CHECK: .cfi_offset %rbp, -16
+; CHECK: movq %rsp, %rbp
+; CHECK: .cfi_def_cfa_register %rbp
+
+; We first push the register on the stack, and then realign it, so that
+; the .cfi_offset value is correct.
+; CHECK: pushq %r14
+; CHECK: andq $-32, %rsp
+; CHECK: .cfi_offset %r14, -24
+
+; Restore %rsp from %rbp and subtract the total size of saved registers.
+; CHECK: leaq -8(%rbp), %rsp
+
+; Pop saved registers.
+; CHECK: popq %r14
+; CHECK: popq %rbp
+}
+
+!0 = metadata !{i32 125}
+
-- 
2.34.1
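
For reference, the frame-size bookkeeping changed by this patch can be illustrated with a small standalone C++ sketch. It is not part of the patch and does not use any LLVM APIs; the names StackSize, SlotSize, MaxAlign and CSSize merely mirror the local variables in emitPrologue()/emitEpilogue(), and the concrete numbers are made up. It shows how the local-frame allocation is computed before and after this change, and the frame-pointer-relative lea offset used by the new epilogue (e.g. the "leaq -16(%rbp), %rsp" checked in dynamic-allocas-VLAs.ll):

#include <cassert>
#include <cstdint>
#include <iostream>

// Round Size up to the next multiple of Align (Align must be a power of two).
static uint64_t alignTo(uint64_t Size, uint64_t Align) {
  return (Size + Align - 1) / Align * Align;
}

int main() {
  // Illustrative numbers only; they are not taken from a real frame.
  const uint64_t StackSize = 104; // frame objects plus the saved %rbp slot
  const uint64_t SlotSize  = 8;   // size of one push on x86-64
  const uint64_t CSSize    = 16;  // callee-saved pushes, e.g. %r14 and %rbx
  const uint64_t MaxAlign  = 32;  // requested stack alignment

  const uint64_t FrameSize = StackSize - SlotSize; // drop the saved %rbp slot

  // Old scheme: realign first, push callee-saved registers afterwards.
  const uint64_t OldNumBytes = alignTo(FrameSize, MaxAlign) - CSSize;

  // New scheme: callee-saved registers are pushed before the realigning AND,
  // so only the remaining local area is rounded up to MaxAlign.
  const uint64_t NewNumBytes = alignTo(FrameSize - CSSize, MaxAlign);

  // Epilogue with the new scheme: point %rsp just below the saved registers
  // with a frame-pointer-relative lea, then pop them:
  //   lea -CSSize(%rbp), %rsp
  const int64_t LeaOffset = -static_cast<int64_t>(CSSize);

  assert(NewNumBytes % MaxAlign == 0 && "local area stays MaxAlign-aligned");
  std::cout << "old prologue sub: " << OldNumBytes
            << ", new prologue sub: " << NewNumBytes
            << ", epilogue lea offset: " << LeaOffset << "\n";
  return 0;
}

With the callee-saved registers sitting just below %rbp at fixed offsets, the epilogue no longer needs the old "mov %rbp, %rsp" step, and the .cfi_offset entries emitted for those registers stay valid, which is what PR11468 asks for.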