From a97b180fc44718e525bb03b50e05fe66622c2b02 Mon Sep 17 00:00:00 2001
From: Chad Rosier
Date: Wed, 6 Jun 2012 17:37:40 +0000
Subject: [PATCH] Add support for dynamic stack realignment in the presence of
 dynamic allocas on X86.

rdar://11496434

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@158087 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86FrameLowering.cpp          |  39 ++++-
 lib/Target/X86/X86RegisterInfo.cpp           |  60 +++++--
 lib/Target/X86/X86RegisterInfo.h             |   8 +
 test/CodeGen/X86/alloca-align-rounding-32.ll |   7 +-
 test/CodeGen/X86/alloca-align-rounding.ll    |   7 +-
 test/CodeGen/X86/dynamic-allocas-VLAs.ll     | 158 +++++++++++++++++++
 6 files changed, 263 insertions(+), 16 deletions(-)
 create mode 100644 test/CodeGen/X86/dynamic-allocas-VLAs.ll

diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp
index c2b1cf7be2f..244f9bbfafe 100644
--- a/lib/Target/X86/X86FrameLowering.cpp
+++ b/lib/Target/X86/X86FrameLowering.cpp
@@ -650,6 +650,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
   unsigned SlotSize = RegInfo->getSlotSize();
   unsigned FramePtr = RegInfo->getFrameRegister(MF);
   unsigned StackPtr = RegInfo->getStackRegister();
+  unsigned BasePtr = RegInfo->getBaseRegister();
   DebugLoc DL;
 
   // If we're forcing a stack realignment we can't rely on just the frame
@@ -913,6 +914,18 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
     emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit,
                  UseLEA, TII, *RegInfo);
 
+  // If we need a base pointer, set it up here. It's whatever the value
+  // of the stack pointer is at this point. Any variable size objects
+  // will be allocated after this, so we can still use the base pointer
+  // to reference locals.
+  if (RegInfo->hasBasePointer(MF)) {
+    // Update the base pointer with the current stack pointer.
+    unsigned Opc = Is64Bit ? X86::MOV64rr : X86::MOV32rr;
+    BuildMI(MBB, MBBI, DL, TII.get(Opc), BasePtr)
+      .addReg(StackPtr)
+      .setMIFlag(MachineInstr::FrameSetup);
+  }
+
   if (( (!HasFP && NumBytes) || PushedRegs) && needsFrameMoves) {
     // Mark end of stack pointer adjustment.
     MCSymbol *Label = MMI.getContext().CreateTempSymbol();
@@ -1148,7 +1161,16 @@ int X86FrameLowering::getFrameIndexOffset(const MachineFunction &MF, int FI) con
   int Offset = MFI->getObjectOffset(FI) - getOffsetOfLocalArea();
   uint64_t StackSize = MFI->getStackSize();
 
-  if (RegInfo->needsStackRealignment(MF)) {
+  if (RegInfo->hasBasePointer(MF)) {
+    assert (hasFP(MF) && "VLAs and dynamic stack realign, but no FP?!");
+    if (FI < 0) {
+      // Skip the saved EBP.
+      return Offset + RegInfo->getSlotSize();
+    } else {
+      assert((-(Offset + StackSize)) % MFI->getObjectAlignment(FI) == 0);
+      return Offset + StackSize;
+    }
+  } else if (RegInfo->needsStackRealignment(MF)) {
     if (FI < 0) {
       // Skip the saved EBP.
       return Offset + RegInfo->getSlotSize();
@@ -1179,9 +1201,14 @@ int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
   const X86RegisterInfo *RegInfo =
     static_cast<const X86RegisterInfo*>(MF.getTarget().getRegisterInfo());
   // We can't calculate offset from frame pointer if the stack is realigned,
-  // so enforce usage of stack pointer.
-  FrameReg = (RegInfo->needsStackRealignment(MF)) ?
-    RegInfo->getStackRegister() : RegInfo->getFrameRegister(MF);
+  // so enforce usage of stack/base pointer. The base pointer is used when we
+  // have dynamic allocas in addition to dynamic realignment.
+  if (RegInfo->hasBasePointer(MF))
+    FrameReg = RegInfo->getBaseRegister();
+  else if (RegInfo->needsStackRealignment(MF))
+    FrameReg = RegInfo->getStackRegister();
+  else
+    FrameReg = RegInfo->getFrameRegister(MF);
   return getFrameIndexOffset(MF, FI);
 }
 
@@ -1318,6 +1345,10 @@ X86FrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
            "Slot for EBP register must be last in order to be found!");
     (void)FrameIdx;
   }
+
+  // Spill the BasePtr if it's used.
+  if (RegInfo->hasBasePointer(MF))
+    MF.getRegInfo().setPhysRegUsed(RegInfo->getBaseRegister());
 }
 
 static bool
diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp
index b22a086b243..3b727881c70 100644
--- a/lib/Target/X86/X86RegisterInfo.cpp
+++ b/lib/Target/X86/X86RegisterInfo.cpp
@@ -50,6 +50,10 @@ ForceStackAlign("force-align-stack",
                  " needed for the function."),
                  cl::init(false), cl::Hidden);
 
+cl::opt<bool>
+EnableBasePointer("x86-use-base-pointer", cl::Hidden, cl::init(true),
+          cl::desc("Enable use of a base pointer for complex stack frames"));
+
 X86RegisterInfo::X86RegisterInfo(X86TargetMachine &tm,
                                  const TargetInstrInfo &tii)
   : X86GenRegisterInfo(tm.getSubtarget<X86Subtarget>().is64Bit()
@@ -68,10 +72,12 @@ X86RegisterInfo::X86RegisterInfo(X86TargetMachine &tm,
     SlotSize = 8;
     StackPtr = X86::RSP;
     FramePtr = X86::RBP;
+    BasePtr = X86::RBX;
   } else {
     SlotSize = 4;
     StackPtr = X86::ESP;
     FramePtr = X86::EBP;
+    BasePtr = X86::EBX;
   }
 }
 
@@ -290,6 +296,20 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
       Reserved.set(*I);
   }
 
+  // Set the base-pointer register and its aliases as reserved if needed.
+  if (hasBasePointer(MF)) {
+    CallingConv::ID CC = MF.getFunction()->getCallingConv();
+    const uint32_t* RegMask = getCallPreservedMask(CC);
+    if (MachineOperand::clobbersPhysReg(RegMask, getBaseRegister()))
+      report_fatal_error(
+        "Stack realignment in presence of dynamic allocas is not supported with "
+        "this calling convention.");
+
+    Reserved.set(getBaseRegister());
+    for (MCSubRegIterator I(getBaseRegister(), this); I.isValid(); ++I)
+      Reserved.set(*I);
+  }
+
   // Mark the segment registers as reserved.
   Reserved.set(X86::CS);
   Reserved.set(X86::SS);
@@ -340,10 +360,35 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
 // Stack Frame Processing methods
 //===----------------------------------------------------------------------===//
 
+bool X86RegisterInfo::hasBasePointer(const MachineFunction &MF) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+
+  if (!EnableBasePointer)
+    return false;
+
+  // When we need stack realignment and there are dynamic allocas, we can't
+  // reference off of the stack pointer, so we reserve a base pointer.
+  if (needsStackRealignment(MF) && MFI->hasVarSizedObjects())
+    return true;
+
+  return false;
+}
+
 bool X86RegisterInfo::canRealignStack(const MachineFunction &MF) const {
   const MachineFrameInfo *MFI = MF.getFrameInfo();
-  return (MF.getTarget().Options.RealignStack &&
-          !MFI->hasVarSizedObjects());
+  const MachineRegisterInfo *MRI = &MF.getRegInfo();
+  if (!MF.getTarget().Options.RealignStack)
+    return false;
+
+  // Stack realignment requires a frame pointer. If we already started
+  // register allocation with frame pointer elimination, it is too late now.
+  if (!MRI->canReserveReg(FramePtr))
+    return false;
+
+  // If a base pointer is necessary, check that it isn't too late to reserve it.
+  if (MFI->hasVarSizedObjects())
+    return MRI->canReserveReg(BasePtr);
+  return true;
 }
 
 bool X86RegisterInfo::needsStackRealignment(const MachineFunction &MF) const {
@@ -353,13 +398,6 @@ bool X86RegisterInfo::needsStackRealignment(const MachineFunction &MF) const {
   bool requiresRealignment = ((MFI->getMaxAlignment() > StackAlign) ||
                               F->hasFnAttr(Attribute::StackAlignment));
 
-  // FIXME: Currently we don't support stack realignment for functions with
-  // variable-sized allocas.
-  // FIXME: It's more complicated than this...
-  if (0 && requiresRealignment && MFI->hasVarSizedObjects())
-    report_fatal_error(
-      "Stack realignment in presence of dynamic allocas is not supported");
-
   // If we've requested that we force align the stack do so now.
   if (ForceStackAlign)
     return canRealignStack(MF);
@@ -499,7 +537,9 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
   unsigned Opc = MI.getOpcode();
   bool AfterFPPop = Opc == X86::TAILJMPm64 || Opc == X86::TAILJMPm;
-  if (needsStackRealignment(MF))
+  if (hasBasePointer(MF))
+    BasePtr = getBaseRegister();
+  else if (needsStackRealignment(MF))
     BasePtr = (FrameIndex < 0 ? FramePtr : StackPtr);
   else if (AfterFPPop)
     BasePtr = StackPtr;
diff --git a/lib/Target/X86/X86RegisterInfo.h b/lib/Target/X86/X86RegisterInfo.h
index ee69842b100..1bc32cbb78f 100644
--- a/lib/Target/X86/X86RegisterInfo.h
+++ b/lib/Target/X86/X86RegisterInfo.h
@@ -50,6 +50,11 @@ private:
   ///
   unsigned FramePtr;
 
+  /// BasePtr - X86 physical register used as a base ptr in complex stack
+  /// frames. I.e., when we need a 3rd base, not just SP and FP, due to
+  /// variable size stack objects.
+  unsigned BasePtr;
+
 public:
   X86RegisterInfo(X86TargetMachine &tm, const TargetInstrInfo &tii);
 
@@ -106,6 +111,8 @@ public:
   /// register scavenger to determine what registers are free.
   BitVector getReservedRegs(const MachineFunction &MF) const;
 
+  bool hasBasePointer(const MachineFunction &MF) const;
+
   bool canRealignStack(const MachineFunction &MF) const;
 
   bool needsStackRealignment(const MachineFunction &MF) const;
@@ -123,6 +130,7 @@ public:
   // Debug information queries.
   unsigned getFrameRegister(const MachineFunction &MF) const;
   unsigned getStackRegister() const { return StackPtr; }
+  unsigned getBaseRegister() const { return BasePtr; }
   // FIXME: Move to FrameInfo
   unsigned getSlotSize() const { return SlotSize; }
diff --git a/test/CodeGen/X86/alloca-align-rounding-32.ll b/test/CodeGen/X86/alloca-align-rounding-32.ll
index c0f1a18123e..a45284e10cf 100644
--- a/test/CodeGen/X86/alloca-align-rounding-32.ll
+++ b/test/CodeGen/X86/alloca-align-rounding-32.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mtriple=i686-apple-darwin | grep and | count 1
+; RUN: llc < %s -march=x86 -mtriple=i686-apple-darwin | FileCheck %s
 
 declare void @bar(<2 x i64>* %n)
 
@@ -6,10 +6,15 @@ define void @foo(i32 %h) {
   %p = alloca <2 x i64>, i32 %h
   call void @bar(<2 x i64>* %p)
   ret void
+; CHECK: foo
+; CHECK-NOT: andl $-32, %eax
 }
 
 define void @foo2(i32 %h) {
   %p = alloca <2 x i64>, i32 %h, align 32
   call void @bar(<2 x i64>* %p)
   ret void
+; CHECK: foo2
+; CHECK: andl $-32, %esp
+; CHECK: andl $-32, %eax
 }
diff --git a/test/CodeGen/X86/alloca-align-rounding.ll b/test/CodeGen/X86/alloca-align-rounding.ll
index 3c87dbf2bd7..3d76fb0aa25 100644
--- a/test/CodeGen/X86/alloca-align-rounding.ll
+++ b/test/CodeGen/X86/alloca-align-rounding.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mtriple=i686-pc-linux | grep and | count 1
+; RUN: llc < %s -march=x86-64 -mtriple=i686-pc-linux | FileCheck %s
 
 declare void @bar(<2 x i64>* %n)
 
@@ -6,10 +6,15 @@ define void @foo(i64 %h) {
   %p = alloca <2 x i64>, i64 %h
   call void @bar(<2 x i64>* %p)
   ret void
+; CHECK: foo
+; CHECK-NOT: andq $-32, %rax
 }
 
 define void @foo2(i64 %h) {
   %p = alloca <2 x i64>, i64 %h, align 32
   call void @bar(<2 x i64>* %p)
   ret void
+; CHECK: foo2
+; CHECK: andq $-32, %rsp
+; CHECK: andq $-32, %rax
 }
diff --git a/test/CodeGen/X86/dynamic-allocas-VLAs.ll b/test/CodeGen/X86/dynamic-allocas-VLAs.ll
new file mode 100644
index 00000000000..b787ee87c51
--- /dev/null
+++ b/test/CodeGen/X86/dynamic-allocas-VLAs.ll
@@ -0,0 +1,158 @@
+; RUN: llc < %s -march=x86-64 -mattr=+avx -mtriple=i686-apple-darwin10 | FileCheck %s
+; rdar://11496434
+
+; no VLAs or dynamic alignment
+define i32 @t1() nounwind uwtable ssp {
+entry:
+  %a = alloca i32, align 4
+  call void @t1_helper(i32* %a) nounwind
+  %0 = load i32* %a, align 4
+  %add = add nsw i32 %0, 13
+  ret i32 %add
+
+; CHECK: _t1
+; CHECK-NOT: andq $-{{[0-9]+}}, %rsp
+; CHECK: leaq [[OFFSET:[0-9]*]](%rsp), %rdi
+; CHECK: callq _t1_helper
+; CHECK: movl [[OFFSET]](%rsp), %eax
+; CHECK: addl $13, %eax
+}
+
+declare void @t1_helper(i32*)
+
+; dynamic realignment
+define i32 @t2() nounwind uwtable ssp {
+entry:
+  %a = alloca i32, align 4
+  %v = alloca <8 x float>, align 32
+  call void @t2_helper(i32* %a, <8 x float>* %v) nounwind
+  %0 = load i32* %a, align 4
+  %add = add nsw i32 %0, 13
+  ret i32 %add
+
+; CHECK: _t2
+; CHECK: pushq %rbp
+; CHECK: movq %rsp, %rbp
+; CHECK: andq $-32, %rsp
+; CHECK: subq ${{[0-9]+}}, %rsp
+;
+; CHECK: leaq {{[0-9]*}}(%rsp), %rdi
+; CHECK: leaq {{[0-9]*}}(%rsp), %rsi
+; CHECK: callq _t2_helper
+;
+; CHECK: movq %rbp, %rsp
+; CHECK: popq %rbp
+}
+
+declare void @t2_helper(i32*, <8 x float>*)
+
+; VLAs
+define i32 @t3(i64 %sz) nounwind uwtable ssp {
+entry:
+  %a = alloca i32, align 4
+  %vla = alloca i32, i64 %sz, align 16
+  call void @t3_helper(i32* %a, i32* %vla) nounwind
+  %0 = load i32* %a, align 4
+  %add = add nsw i32 %0, 13
+  ret i32 %add
+
+; CHECK: _t3
+; CHECK: pushq %rbp
+; CHECK: movq %rsp, %rbp
+; CHECK: pushq %rbx
+; CHECK-NOT: andq $-{{[0-9]+}}, %rsp
+; CHECK: subq ${{[0-9]+}}, %rsp
+;
+; CHECK: leaq -{{[0-9]+}}(%rbp), %rsp
+; CHECK: popq %rbx
+; CHECK: popq %rbp
+}
+
+declare void @t3_helper(i32*, i32*)
+
+; VLAs + Dynamic realignment
+define i32 @t4(i64 %sz) nounwind uwtable ssp {
+entry:
+  %a = alloca i32, align 4
+  %v = alloca <8 x float>, align 32
+  %vla = alloca i32, i64 %sz, align 16
+  call void @t4_helper(i32* %a, i32* %vla, <8 x float>* %v) nounwind
+  %0 = load i32* %a, align 4
+  %add = add nsw i32 %0, 13
+  ret i32 %add
+
+; CHECK: _t4
+; CHECK: pushq %rbp
+; CHECK: movq %rsp, %rbp
+; CHECK: andq $-32, %rsp
+; CHECK: pushq %r14
+; CHECK: pushq %rbx
+; CHECK: subq $[[STACKADJ:[0-9]+]], %rsp
+; CHECK: movq %rsp, %rbx
+;
+; CHECK: leaq {{[0-9]*}}(%rbx), %rdi
+; CHECK: leaq {{[0-9]*}}(%rbx), %rdx
+; CHECK: callq _t4_helper
+;
+; CHECK: addq $[[STACKADJ]], %rsp
+; CHECK: popq %rbx
+; CHECK: popq %r14
+; CHECK: movq %rbp, %rsp
+; CHECK: popq %rbp
+}
+
+declare void @t4_helper(i32*, i32*, <8 x float>*)
+
+; Dynamic realignment + Spill
+define i32 @t5(float* nocapture %f) nounwind uwtable ssp {
+entry:
+  %a = alloca i32, align 4
+  %0 = bitcast float* %f to <8 x float>*
+  %1 = load <8 x float>* %0, align 32
+  call void @t5_helper1(i32* %a) nounwind
+  call void @t5_helper2(<8 x float> %1) nounwind
+  %2 = load i32* %a, align 4
+  %add = add nsw i32 %2, 13
+  ret i32 %add
+
+; CHECK: _t5
+; CHECK: pushq %rbp
+; CHECK: movq %rsp, %rbp
+; CHECK: andq $-32, %rsp
+; CHECK: subq ${{[0-9]+}}, %rsp
+;
+; CHECK: vmovaps (%rdi), [[AVXREG:%ymm[0-9]+]]
+; CHECK: vmovaps [[AVXREG]], (%rsp)
+; CHECK: leaq {{[0-9]+}}(%rsp), %rdi
+; CHECK: callq _t5_helper1
+; CHECK: vmovaps (%rsp), %ymm0
+; CHECK: callq _t5_helper2
+; CHECK: movl {{[0-9]+}}(%rsp), %eax
+;
+; CHECK: movq %rbp, %rsp
+; CHECK: popq %rbp
+}
+
+declare void @t5_helper1(i32*)
+
+declare void @t5_helper2(<8 x float>)
+
+; VLAs + Dynamic realignment + Spill
+; FIXME: RA has already reserved RBX, so we can't do dynamic realignment.
+define i32 @t6(i64 %sz, float* nocapture %f) nounwind uwtable ssp {
+entry:
+; CHECK: _t6
+  %a = alloca i32, align 4
+  %0 = bitcast float* %f to <8 x float>*
+  %1 = load <8 x float>* %0, align 32
+  %vla = alloca i32, i64 %sz, align 16
+  call void @t6_helper1(i32* %a, i32* %vla) nounwind
+  call void @t6_helper2(<8 x float> %1) nounwind
+  %2 = load i32* %a, align 4
+  %add = add nsw i32 %2, 13
+  ret i32 %add
+}
+
+declare void @t6_helper1(i32*, i32*)
+
+declare void @t6_helper2(<8 x float>)
-- 
2.34.1
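
Editor's note: the situation this patch targets arises when a single function needs both dynamic stack realignment (an over-aligned object) and variable-size stack objects, as in test t4 above. The C sketch below is illustrative only and is not part of the patch; the names needs_base_pointer and use_all are made up, and the exact prologue depends on the compiler version and flags. Built for x86-64 with AVX, it should exercise the same path: the 32-byte-aligned local forces realignment, the VLA makes RSP move at runtime, and with this change ordinary locals are then addressed through the base pointer (RBX) set up right after the prologue's static allocation.

/* Illustrative example, not taken from the patch: hypothetical names.        */
/* - 'v' is over-aligned (32 > the default 16-byte stack alignment), so the   */
/*   prologue must realign the stack.                                         */
/* - 'vla' is a variable-size object, so RSP is not a stable anchor and a     */
/*   base pointer is needed to reach 'a' and 'v'.                             */
void use_all(float *v, int *a, int *vla);   /* hypothetical helper */

void needs_base_pointer(int n) {
  float v[8] __attribute__((aligned(32)));  /* triggers dynamic realignment */
  int a;                                    /* ordinary local */
  int vla[n];                               /* variable-size stack object */
  use_all(v, &a, vla);
}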