From 70af20f0b303e826a4f350d4d90fda9937d78684 Mon Sep 17 00:00:00 2001 From: Manman Ren Date: Tue, 12 Jan 2016 01:08:46 +0000 Subject: [PATCH] CXX_FAST_TLS calling convention: performance improvement for x86-64. This is the same change on x86-64 as r255821 on AArch64. rdar://9001553 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@257428 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86CallingConv.td | 6 +++ lib/Target/X86/X86FastISel.cpp | 3 ++ lib/Target/X86/X86ISelLowering.cpp | 60 ++++++++++++++++++++++++ lib/Target/X86/X86ISelLowering.h | 9 ++++ lib/Target/X86/X86MachineFunctionInfo.h | 7 +++ lib/Target/X86/X86RegisterInfo.cpp | 12 ++++- lib/Target/X86/X86RegisterInfo.h | 2 + test/CodeGen/X86/cxx_tlscc64.ll | 61 ++++++++----------------- 8 files changed, 117 insertions(+), 43 deletions(-) diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td index 54d88cbb244..e8b96e74a7a 100644 --- a/lib/Target/X86/X86CallingConv.td +++ b/lib/Target/X86/X86CallingConv.td @@ -831,6 +831,12 @@ def CSR_Win64 : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, R12, R13, R14, R15, def CSR_64_TLS_Darwin : CalleeSavedRegs<(add CSR_64, RCX, RDX, RSI, R8, R9, R10, R11)>; +// CSRs that are handled by prologue, epilogue. +def CSR_64_CXX_TLS_Darwin_PE : CalleeSavedRegs<(add)>; + +// CSRs that are handled explicitly via copies. 
+def CSR_64_CXX_TLS_Darwin_ViaCopy : CalleeSavedRegs<(add CSR_64_TLS_Darwin)>; + // All GPRs - except r11 def CSR_64_RT_MostRegs : CalleeSavedRegs<(add CSR_64, RAX, RCX, RDX, RSI, RDI, R8, R9, R10, RSP)>; diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp index 629d4d3565f..f48b47934e0 100644 --- a/lib/Target/X86/X86FastISel.cpp +++ b/lib/Target/X86/X86FastISel.cpp @@ -1002,6 +1002,9 @@ bool X86FastISel::X86SelectRet(const Instruction *I) { if (!FuncInfo.CanLowerReturn) return false; + if (TLI.supportSplitCSR(FuncInfo.MF)) + return false; + CallingConv::ID CC = F.getCallingConv(); if (CC != CallingConv::C && CC != CallingConv::Fast && diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 9702eb860ac..2aa54e25b0e 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -2311,6 +2311,18 @@ X86TargetLowering::LowerReturn(SDValue Chain, DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout()))); } + const X86RegisterInfo *TRI = Subtarget->getRegisterInfo(); + const MCPhysReg *I = + TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction()); + if (I) { + for (; *I; ++I) { + if (X86::GR64RegClass.contains(*I)) + RetOps.push_back(DAG.getRegister(*I, MVT::i64)); + else + llvm_unreachable("Unexpected register class in CSRsViaCopy!"); + } + } + RetOps[0] = Chain; // Update chain. // Add the flag if we have it. @@ -28827,3 +28839,51 @@ bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const { Attribute::MinSize); return OptSize && !VT.isVector(); } + +void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const { + if (!Subtarget->is64Bit()) + return; + + // Update IsSplitCSR in X86MachineFunctionInfo. 
+ X86MachineFunctionInfo *AFI = + Entry->getParent()->getInfo<X86MachineFunctionInfo>(); + AFI->setIsSplitCSR(true); +} + +void X86TargetLowering::insertCopiesSplitCSR( + MachineBasicBlock *Entry, + const SmallVectorImpl<MachineBasicBlock *> &Exits) const { + const X86RegisterInfo *TRI = Subtarget->getRegisterInfo(); + const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent()); + if (!IStart) + return; + + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo(); + for (const MCPhysReg *I = IStart; *I; ++I) { + const TargetRegisterClass *RC = nullptr; + if (X86::GR64RegClass.contains(*I)) + RC = &X86::GR64RegClass; + else + llvm_unreachable("Unexpected register class in CSRsViaCopy!"); + + unsigned NewVR = MRI->createVirtualRegister(RC); + // Create copy from CSR to a virtual register. + // FIXME: this currently does not emit CFI pseudo-instructions, it works + // fine for CXX_FAST_TLS since the C++-style TLS access functions should be + // nounwind. If we want to generalize this later, we may need to emit + // CFI pseudo-instructions. 
+ assert(Entry->getParent()->getFunction()->hasFnAttribute( + Attribute::NoUnwind) && + "Function should be nounwind in insertCopiesSplitCSR!"); + Entry->addLiveIn(*I); + BuildMI(*Entry, Entry->begin(), DebugLoc(), TII->get(TargetOpcode::COPY), + NewVR) + .addReg(*I); + + for (auto *Exit : Exits) + BuildMI(*Exit, Exit->begin(), DebugLoc(), TII->get(TargetOpcode::COPY), + *I) + .addReg(NewVR); + } +} diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index 8bb0e5f8bd3..920c375456f 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -1057,6 +1057,15 @@ namespace llvm { const SmallVectorImpl<SDValue> &OutVals, SDLoc dl, SelectionDAG &DAG) const override; + bool supportSplitCSR(MachineFunction *MF) const override { + return MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS && + MF->getFunction()->hasFnAttribute(Attribute::NoUnwind); + } + void initializeSplitCSR(MachineBasicBlock *Entry) const override; + void insertCopiesSplitCSR( + MachineBasicBlock *Entry, + const SmallVectorImpl<MachineBasicBlock *> &Exits) const override; + bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override; bool mayBeEmittedAsTailCall(CallInst *CI) const override; diff --git a/lib/Target/X86/X86MachineFunctionInfo.h b/lib/Target/X86/X86MachineFunctionInfo.h index 3a7a98db50f..00515dde556 100644 --- a/lib/Target/X86/X86MachineFunctionInfo.h +++ b/lib/Target/X86/X86MachineFunctionInfo.h @@ -92,6 +92,10 @@ class X86MachineFunctionInfo : public MachineFunctionInfo { /// used to address arguments in a function using a base pointer. int SEHFramePtrSaveIndex = 0; + /// True if this function has a subset of CSRs that is handled explicitly via + /// copies. + bool IsSplitCSR = false; + private: /// ForwardedMustTailRegParms - A list of virtual and physical registers /// that must be forwarded to every musttail call. 
@@ -160,6 +164,9 @@ public: SmallVectorImpl<ForwardedRegister> &getForwardedMustTailRegParms() { return ForwardedMustTailRegParms; } + + bool isSplitCSR() const { return IsSplitCSR; } + void setIsSplitCSR(bool s) { IsSplitCSR = s; } }; } // End llvm namespace diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp index 58404433e1a..274b5668855 100644 --- a/lib/Target/X86/X86RegisterInfo.cpp +++ b/lib/Target/X86/X86RegisterInfo.cpp @@ -250,7 +250,8 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { return CSR_64_RT_AllRegs_SaveList; case CallingConv::CXX_FAST_TLS: if (Is64Bit) - return CSR_64_TLS_Darwin_SaveList; + return MF->getInfo<X86MachineFunctionInfo>()->isSplitCSR() ? + CSR_64_CXX_TLS_Darwin_PE_SaveList : CSR_64_TLS_Darwin_SaveList; break; case CallingConv::Intel_OCL_BI: { if (HasAVX512 && IsWin64) @@ -305,6 +306,15 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { return CSR_32_SaveList; } +const MCPhysReg *X86RegisterInfo::getCalleeSavedRegsViaCopy( + const MachineFunction *MF) const { + assert(MF && "Invalid MachineFunction pointer."); + if (MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS && + MF->getInfo<X86MachineFunctionInfo>()->isSplitCSR()) + return CSR_64_CXX_TLS_Darwin_ViaCopy_SaveList; + return nullptr; +} + const uint32_t * X86RegisterInfo::getCallPreservedMask(const MachineFunction &MF, CallingConv::ID CC) const { diff --git a/lib/Target/X86/X86RegisterInfo.h b/lib/Target/X86/X86RegisterInfo.h index f014c8f6ff6..8d0094cbf3d 100644 --- a/lib/Target/X86/X86RegisterInfo.h +++ b/lib/Target/X86/X86RegisterInfo.h @@ -99,6 +99,8 @@ public: /// callee-save registers on this target. 
const MCPhysReg * getCalleeSavedRegs(const MachineFunction* MF) const override; + const MCPhysReg * + getCalleeSavedRegsViaCopy(const MachineFunction *MF) const override; const uint32_t *getCallPreservedMask(const MachineFunction &MF, CallingConv::ID) const override; const uint32_t *getNoPreservedMask() const override; diff --git a/test/CodeGen/X86/cxx_tlscc64.ll b/test/CodeGen/X86/cxx_tlscc64.ll index 16e678a9222..70fe501040b 100644 --- a/test/CodeGen/X86/cxx_tlscc64.ll +++ b/test/CodeGen/X86/cxx_tlscc64.ll @@ -2,8 +2,8 @@ ; TLS function were wrongly model and after fixing that, shrink-wrapping ; cannot help here. To achieve the expected lowering, we need to playing ; tricks similar to AArch64 fast TLS calling convention (r255821). -; Re-enable the following run line when -; _RUN_: llc < %s -mtriple=x86_64-apple-darwin -enable-shrink-wrap=true | FileCheck --check-prefix=SHRINK %s +; Applying tricks on x86-64 similar to r255821. +; RUN: llc < %s -mtriple=x86_64-apple-darwin -enable-shrink-wrap=true | FileCheck %s %struct.S = type { i8 } @sg = internal thread_local global %struct.S zeroinitializer, align 1 @@ -16,51 +16,28 @@ declare i32 @_tlv_atexit(void (i8*)*, i8*, i8*) ; Every GPR should be saved - except rdi, rax, and rsp ; CHECK-LABEL: _ZTW2sg -; CHECK: pushq %r11 -; CHECK: pushq %r10 -; CHECK: pushq %r9 -; CHECK: pushq %r8 -; CHECK: pushq %rsi -; CHECK: pushq %rdx -; CHECK: pushq %rcx -; CHECK: pushq %rbx +; CHECK-NOT: pushq %r11 +; CHECK-NOT: pushq %r10 +; CHECK-NOT: pushq %r9 +; CHECK-NOT: pushq %r8 +; CHECK-NOT: pushq %rsi +; CHECK-NOT: pushq %rdx +; CHECK-NOT: pushq %rcx +; CHECK-NOT: pushq %rbx ; CHECK: callq ; CHECK: jne ; CHECK: callq ; CHECK: tlv_atexit ; CHECK: callq -; CHECK: popq %rbx -; CHECK: popq %rcx -; CHECK: popq %rdx -; CHECK: popq %rsi -; CHECK: popq %r8 -; CHECK: popq %r9 -; CHECK: popq %r10 -; CHECK: popq %r11 -; SHRINK-LABEL: _ZTW2sg -; SHRINK: callq -; SHRINK: jne -; SHRINK: pushq %r11 -; SHRINK: pushq %r10 -; SHRINK: pushq %r9 -; 
SHRINK: pushq %r8 -; SHRINK: pushq %rsi -; SHRINK: pushq %rdx -; SHRINK: pushq %rcx -; SHRINK: pushq %rbx -; SHRINK: callq -; SHRINK: tlv_atexit -; SHRINK: popq %rbx -; SHRINK: popq %rcx -; SHRINK: popq %rdx -; SHRINK: popq %rsi -; SHRINK: popq %r8 -; SHRINK: popq %r9 -; SHRINK: popq %r10 -; SHRINK: popq %r11 -; SHRINK: LBB{{.*}}: -; SHRINK: callq -define cxx_fast_tlscc nonnull %struct.S* @_ZTW2sg() { +; CHECK-NOT: popq %rbx +; CHECK-NOT: popq %rcx +; CHECK-NOT: popq %rdx +; CHECK-NOT: popq %rsi +; CHECK-NOT: popq %r8 +; CHECK-NOT: popq %r9 +; CHECK-NOT: popq %r10 +; CHECK-NOT: popq %r11 +define cxx_fast_tlscc nonnull %struct.S* @_ZTW2sg() nounwind { %.b.i = load i1, i1* @__tls_guard, align 1 br i1 %.b.i, label %__tls_init.exit, label %init.i -- 2.34.1