From 449ad4ca3bfe098003a3acc098e3836b39033da4 Mon Sep 17 00:00:00 2001 From: Hans Wennborg Date: Thu, 17 Dec 2015 23:18:39 +0000 Subject: [PATCH] [X86] Use push-pop for materializing small constants under 'minsize' Use the 3-byte (4 with REX prefix) push-pop sequence for materializing small constants. This is smaller than using a mov (5, 6 or 7 bytes depending on size and REX prefix), but it's likely to be slower, so only used for 'minsize'. This is a follow-up to r255656. Differential Revision: http://reviews.llvm.org/D15549 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@255936 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelDAGToDAG.cpp | 10 +- lib/Target/X86/X86InstrCompiler.td | 15 ++- lib/Target/X86/X86InstrInfo.cpp | 48 ++++++++ lib/Target/X86/X86InstrInfo.h | 4 + lib/Target/X86/X86InstrInfo.td | 3 + test/CodeGen/X86/materialize-one.ll | 100 --------------- test/CodeGen/X86/materialize.ll | 184 ++++++++++++++++++++++++++++ test/CodeGen/X86/powi.ll | 4 +- 8 files changed, 262 insertions(+), 106 deletions(-) delete mode 100644 test/CodeGen/X86/materialize-one.ll create mode 100644 test/CodeGen/X86/materialize.ll diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp index 868ae4e19e5..4414e478b99 100644 --- a/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -157,9 +157,13 @@ namespace { /// performance. bool OptForSize; + /// If true, selector should try to optimize for minimum code size. + bool OptForMinSize; + public: explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOpt::Level OptLevel) - : SelectionDAGISel(tm, OptLevel), OptForSize(false) {} + : SelectionDAGISel(tm, OptLevel), OptForSize(false), + OptForMinSize(false) {} const char *getPassName() const override { return "X86 DAG->DAG Instruction Selection"; @@ -531,8 +535,10 @@ static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) { } void X86DAGToDAGISel::PreprocessISelDAG() { - // OptForSize is used in pattern predicates that isel is matching. + // OptFor[Min]Size are used in pattern predicates that isel is matching. OptForSize = MF->getFunction()->optForSize(); + OptForMinSize = MF->getFunction()->optForMinSize(); + assert((!OptForMinSize || OptForSize) && "OptForMinSize implies OptForSize"); for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(), E = CurDAG->allnodes_end(); I != E; ) { diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td index a585775f84e..5d7283f7bd5 100644 --- a/lib/Target/X86/X86InstrCompiler.td +++ b/lib/Target/X86/X86InstrCompiler.td @@ -250,7 +250,7 @@ def MORESTACK_RET_RESTORE_R10 : I<0, Pseudo, (outs), (ins), // Alias instruction mapping movr0 to xor. // FIXME: remove when we can teach regalloc that xor reg, reg is ok. let Defs = [EFLAGS], isReMaterializable = 1, isAsCheapAsAMove = 1, - isPseudo = 1 in + isPseudo = 1, AddedComplexity = 20 in def MOV32r0 : I<0, Pseudo, (outs GR32:$dst), (ins), "", [(set GR32:$dst, 0)], IIC_ALU_NONMEM>, Sched<[WriteZero]>; @@ -263,7 +263,7 @@ def : Pat<(i64 0), (SUBREG_TO_REG (i64 0), (MOV32r0), sub_32bit)> { } let Predicates = [OptForSize, NotSlowIncDec, Not64BitMode], - AddedComplexity = 1 in { + AddedComplexity = 15 in { // Pseudo instructions for materializing 1 and -1 using XOR+INC/DEC, // which only require 3 bytes compared to MOV32ri which requires 5. let Defs = [EFLAGS], isReMaterializable = 1, isPseudo = 1 in { @@ -278,6 +278,17 @@ let Predicates = [OptForSize, NotSlowIncDec, Not64BitMode], def : Pat<(i16 -1), (EXTRACT_SUBREG (MOV32r_1), sub_16bit)>; } +let isReMaterializable = 1, isPseudo = 1, AddedComplexity = 10 in { +// AddedComplexity higher than MOV64ri but lower than MOV32r0 and MOV32r1. +// FIXME: Add itinerary class and Schedule. +def MOV32ImmSExti8 : I<0, Pseudo, (outs GR32:$dst), (ins i32i8imm:$src), "", + [(set GR32:$dst, i32immSExt8:$src)]>, + Requires<[OptForMinSize]>; +def MOV64ImmSExti8 : I<0, Pseudo, (outs GR64:$dst), (ins i64i8imm:$src), "", + [(set GR64:$dst, i64immSExt8:$src)]>, + Requires<[OptForMinSize, NotWin64WithoutFP]>; +} + // Materialize i64 constant where top 32-bits are zero. This could theoretically // use MOV32ri with a SUBREG_TO_REG to represent the zero-extension, however // that would make it more difficult to rematerialize. diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index e028683c63b..0a259226167 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -23,6 +23,7 @@ #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/StackMaps.h" #include "llvm/IR/DerivedTypes.h" @@ -5297,6 +5298,50 @@ static bool expandMOV32r1(MachineInstrBuilder &MIB, const TargetInstrInfo &TII, return true; } +bool X86InstrInfo::ExpandMOVImmSExti8(MachineInstrBuilder &MIB) const { + MachineBasicBlock &MBB = *MIB->getParent(); + DebugLoc DL = MIB->getDebugLoc(); + int64_t Imm = MIB->getOperand(1).getImm(); + assert(Imm != 0 && "Using push/pop for 0 is not efficient."); + MachineBasicBlock::iterator I = MIB.getInstr(); + + int StackAdjustment; + + if (Subtarget.is64Bit()) { + assert(MIB->getOpcode() == X86::MOV64ImmSExti8 || + MIB->getOpcode() == X86::MOV32ImmSExti8); + // 64-bit mode doesn't have 32-bit push/pop, so use 64-bit operations and + // widen the register if necessary. + StackAdjustment = 8; + BuildMI(MBB, I, DL, get(X86::PUSH64i8)).addImm(Imm); + MIB->setDesc(get(X86::POP64r)); + MIB->getOperand(0) + .setReg(getX86SubSuperRegister(MIB->getOperand(0).getReg(), MVT::i64)); + } else { + assert(MIB->getOpcode() == X86::MOV32ImmSExti8); + StackAdjustment = 4; + BuildMI(MBB, I, DL, get(X86::PUSH32i8)).addImm(Imm); + MIB->setDesc(get(X86::POP32r)); + } + + // Build CFI if necessary. + MachineFunction &MF = *MBB.getParent(); + const X86FrameLowering *TFL = Subtarget.getFrameLowering(); + bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); + bool NeedsDwarfCFI = + !IsWin64Prologue && + (MF.getMMI().hasDebugInfo() || MF.getFunction()->needsUnwindTableEntry()); + bool EmitCFI = !TFL->hasFP(MF) && NeedsDwarfCFI; + if (EmitCFI) { + TFL->BuildCFI(MBB, I, DL, + MCCFIInstruction::createAdjustCfaOffset(nullptr, StackAdjustment)); + TFL->BuildCFI(MBB, std::next(I), DL, + MCCFIInstruction::createAdjustCfaOffset(nullptr, -StackAdjustment)); + } + + return true; +} + // LoadStackGuard has so far only been implemented for 64-bit MachO. Different // code sequence is needed for other targets. static void expandLoadStackGuard(MachineInstrBuilder &MIB, @@ -5329,6 +5374,9 @@ bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { return expandMOV32r1(MIB, *this, /*MinusOne=*/ false); case X86::MOV32r_1: return expandMOV32r1(MIB, *this, /*MinusOne=*/ true); + case X86::MOV32ImmSExti8: + case X86::MOV64ImmSExti8: + return ExpandMOVImmSExti8(MIB); case X86::SETB_C8r: return Expand2AddrUndef(MIB, get(X86::SBB8rr)); case X86::SETB_C16r: diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h index edd09d61759..9d40334206b 100644 --- a/lib/Target/X86/X86InstrInfo.h +++ b/lib/Target/X86/X86InstrInfo.h @@ -23,6 +23,7 @@ #include "X86GenInstrInfo.inc" namespace llvm { + class MachineInstrBuilder; class X86RegisterInfo; class X86Subtarget; @@ -564,6 +565,9 @@ private: /// operand and follow operands form a reference to the stack frame. bool isFrameOperand(const MachineInstr *MI, unsigned int Op, int &FrameIndex) const; + + /// Expand the MOVImmSExti8 pseudo-instructions. + bool ExpandMOVImmSExti8(MachineInstrBuilder &MIB) const; }; } // End llvm namespace diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index f98970cd843..3f3fd82a6cd 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -820,6 +820,8 @@ def In32BitMode : Predicate<"Subtarget->is32Bit()">, AssemblerPredicate<"Mode32Bit", "32-bit mode">; def IsWin64 : Predicate<"Subtarget->isTargetWin64()">; def NotWin64 : Predicate<"!Subtarget->isTargetWin64()">; +def NotWin64WithoutFP : Predicate<"!Subtarget->isTargetWin64() ||" + "Subtarget->getFrameLowering()->hasFP(*MF)">; def IsPS4 : Predicate<"Subtarget->isTargetPS4()">; def NotPS4 : Predicate<"!Subtarget->isTargetPS4()">; def IsNaCl : Predicate<"Subtarget->isTargetNaCl()">; @@ -833,6 +835,7 @@ def NearData : Predicate<"TM.getCodeModel() == CodeModel::Small ||" def IsStatic : Predicate<"TM.getRelocationModel() == Reloc::Static">; def IsNotPIC : Predicate<"TM.getRelocationModel() != Reloc::PIC_">; def OptForSize : Predicate<"OptForSize">; +def OptForMinSize : Predicate<"OptForMinSize">; def OptForSpeed : Predicate<"!OptForSize">; def FastBTMem : Predicate<"!Subtarget->isBTMemSlow()">; def CallImmAddr : Predicate<"Subtarget->IsLegalToCallImmediateAddr(TM)">; diff --git a/test/CodeGen/X86/materialize-one.ll b/test/CodeGen/X86/materialize-one.ll deleted file mode 100644 index 49da8008b88..00000000000 --- a/test/CodeGen/X86/materialize-one.ll +++ /dev/null @@ -1,100 +0,0 @@ -; RUN: llc -mtriple=i686-unknown-linux-gnu -mattr=+cmov %s -o - | FileCheck %s --check-prefix=CHECK32 -; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+cmov %s -o - | FileCheck %s --check-prefix=CHECK64 - -define i32 @one32() optsize { -entry: - ret i32 1 - -; CHECK32-LABEL: one32 -; CHECK32: xorl %eax, %eax -; CHECK32-NEXT: incl %eax -; CHECK32-NEXT: ret - -; FIXME: Figure out the best approach in 64-bit mode. -; CHECK64-LABEL: one32 -; CHECK64: movl $1, %eax -; CHECK64-NEXT: retq -} - -define i32 @minus_one32() optsize { -entry: - ret i32 -1 - -; CHECK32-LABEL: minus_one32 -; CHECK32: xorl %eax, %eax -; CHECK32-NEXT: decl %eax -; CHECK32-NEXT: ret -} - -define i16 @one16() optsize { -entry: - ret i16 1 - -; CHECK32-LABEL: one16 -; CHECK32: xorl %eax, %eax -; CHECK32-NEXT: incl %eax -; CHECK32-NEXT: retl -} - -define i16 @minus_one16() optsize { -entry: - ret i16 -1 - -; CHECK32-LABEL: minus_one16 -; CHECK32: xorl %eax, %eax -; CHECK32-NEXT: decl %eax -; CHECK32-NEXT: retl -} - -define i32 @test_rematerialization() optsize { -entry: - ; Materialize -1 (thiscall forces it into %ecx). - tail call x86_thiscallcc void @f(i32 -1) - - ; Clobber all registers except %esp, leaving nowhere to store the -1 besides - ; spilling it to the stack. - tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{edi},~{esi},~{ebp},~{dirflag},~{fpsr},~{flags}"() - - ; -1 should be re-materialized here instead of getting spilled above. - ret i32 -1 - -; CHECK32-LABEL: test_rematerialization -; CHECK32: xorl %ecx, %ecx -; CHECK32-NEXT: decl %ecx -; CHECK32: calll -; CHECK32: xorl %eax, %eax -; CHECK32-NEXT: decl %eax -; CHECK32-NOT: %eax -; CHECK32: retl -} - -define i32 @test_rematerialization2(i32 %x) optsize { -entry: - ; Materialize -1 (thiscall forces it into %ecx). - tail call x86_thiscallcc void @f(i32 -1) - - ; Clobber all registers except %esp, leaving nowhere to store the -1 besides - ; spilling it to the stack. - tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{edi},~{esi},~{ebp},~{dirflag},~{fpsr},~{flags}"() - - ; Define eflags. - %a = icmp ne i32 %x, 123 - %b = zext i1 %a to i32 - ; Cause -1 to be rematerialized right in front of the cmov, which needs eflags. - ; It must therefore not use the xor-dec lowering. - %c = select i1 %a, i32 %b, i32 -1 - ret i32 %c - -; CHECK32-LABEL: test_rematerialization2 -; CHECK32: xorl %ecx, %ecx -; CHECK32-NEXT: decl %ecx -; CHECK32: calll -; CHECK32: cmpl -; CHECK32: setne -; CHECK32-NOT: xorl -; CHECK32: movl $-1 -; CHECK32: cmov -; CHECK32: retl -} - -declare x86_thiscallcc void @f(i32) diff --git a/test/CodeGen/X86/materialize.ll b/test/CodeGen/X86/materialize.ll new file mode 100644 index 00000000000..695bf0fa5b9 --- /dev/null +++ b/test/CodeGen/X86/materialize.ll @@ -0,0 +1,184 @@ +; RUN: llc -mtriple=i686-unknown-linux-gnu -mattr=+cmov %s -o - | FileCheck %s --check-prefix=CHECK32 +; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+cmov %s -o - | FileCheck %s --check-prefix=CHECK64 +; RUN: llc -mtriple=x86_64-pc-win32 -mattr=+cmov %s -o - | FileCheck %s --check-prefix=CHECKWIN64 + +define i32 @one32_nooptsize() { +entry: + ret i32 1 + +; When not optimizing for size, use mov. +; CHECK32-LABEL: one32_nooptsize: +; CHECK32: movl $1, %eax +; CHECK32-NEXT: retl +; CHECK64-LABEL: one32_nooptsize: +; CHECK64: movl $1, %eax +; CHECK64-NEXT: retq +} + +define i32 @one32() optsize { +entry: + ret i32 1 + +; CHECK32-LABEL: one32: +; CHECK32: xorl %eax, %eax +; CHECK32-NEXT: incl %eax +; CHECK32-NEXT: retl + +; FIXME: Figure out the best approach in 64-bit mode. +; CHECK64-LABEL: one32: +; CHECK64: movl $1, %eax +; CHECK64-NEXT: retq +} + +define i32 @one32_minsize() minsize { +entry: + ret i32 1 + +; On 32-bit, xor-inc is preferred over push-pop. +; CHECK32-LABEL: one32_minsize: +; CHECK32: xorl %eax, %eax +; CHECK32-NEXT: incl %eax +; CHECK32-NEXT: retl + +; On 64-bit we don't do xor-inc yet, so push-pop it is. Note that we have to +; pop into a 64-bit register even when we just need 32 bits. +; CHECK64-LABEL: one32_minsize: +; CHECK64: pushq $1 +; CHECK64: .cfi_adjust_cfa_offset 8 +; CHECK64: popq %rax +; CHECK64: .cfi_adjust_cfa_offset -8 +; CHECK64-NEXT: retq +} + +define i64 @one64_minsize() minsize { +entry: + ret i64 1 +; On 64-bit we don't do xor-inc yet, so push-pop it is. +; CHECK64-LABEL: one64_minsize: +; CHECK64: pushq $1 +; CHECK64: .cfi_adjust_cfa_offset 8 +; CHECK64: popq %rax +; CHECK64: .cfi_adjust_cfa_offset -8 +; CHECK64-NEXT: retq + +; On Win64 we can't adjust the stack unless there's a frame pointer. +; CHECKWIN64-LABEL: one64_minsize: +; CHECKWIN64: movl $1, %eax +; CHECKWIN64-NEXT: retq +} + +define i32 @minus_one32() optsize { +entry: + ret i32 -1 + +; CHECK32-LABEL: minus_one32: +; CHECK32: xorl %eax, %eax +; CHECK32-NEXT: decl %eax +; CHECK32-NEXT: retl +} + +define i32 @minus_one32_minsize() minsize { +entry: + ret i32 -1 + +; xor-dec is preferred over push-pop. +; CHECK32-LABEL: minus_one32_minsize: +; CHECK32: xorl %eax, %eax +; CHECK32-NEXT: decl %eax +; CHECK32-NEXT: retl +} + +define i16 @one16() optsize { +entry: + ret i16 1 + +; CHECK32-LABEL: one16: +; CHECK32: xorl %eax, %eax +; CHECK32-NEXT: incl %eax +; CHECK32-NEXT: retl +} + +define i16 @minus_one16() optsize { +entry: + ret i16 -1 + +; CHECK32-LABEL: minus_one16: +; CHECK32: xorl %eax, %eax +; CHECK32-NEXT: decl %eax +; CHECK32-NEXT: retl +} + +define i32 @minus_five32() minsize { +entry: + ret i32 -5 + +; CHECK32-LABEL: minus_five32: +; CHECK32: pushl $-5 +; CHECK32: popl %eax +; CHECK32: retl +} + +define i64 @minus_five64() minsize { +entry: + ret i64 -5 + +; CHECK64-LABEL: minus_five64: +; CHECK64: pushq $-5 +; CHECK64: .cfi_adjust_cfa_offset 8 +; CHECK64: popq %rax +; CHECK64: .cfi_adjust_cfa_offset -8 +; CHECK64: retq +} + +define i32 @rematerialize_minus_one() optsize { +entry: + ; Materialize -1 (thiscall forces it into %ecx). + tail call x86_thiscallcc void @f(i32 -1) + + ; Clobber all registers except %esp, leaving nowhere to store the -1 besides + ; spilling it to the stack. + tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{edi},~{esi},~{ebp},~{dirflag},~{fpsr},~{flags}"() + + ; -1 should be re-materialized here instead of getting spilled above. + ret i32 -1 + +; CHECK32-LABEL: rematerialize_minus_one +; CHECK32: xorl %ecx, %ecx +; CHECK32-NEXT: decl %ecx +; CHECK32: calll +; CHECK32: xorl %eax, %eax +; CHECK32-NEXT: decl %eax +; CHECK32-NOT: %eax +; CHECK32: retl +} + +define i32 @rematerialize_minus_one_eflags(i32 %x) optsize { +entry: + ; Materialize -1 (thiscall forces it into %ecx). + tail call x86_thiscallcc void @f(i32 -1) + + ; Clobber all registers except %esp, leaving nowhere to store the -1 besides + ; spilling it to the stack. + tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{edi},~{esi},~{ebp},~{dirflag},~{fpsr},~{flags}"() + + ; Define eflags. + %a = icmp ne i32 %x, 123 + %b = zext i1 %a to i32 + ; Cause -1 to be rematerialized right in front of the cmov, which needs eflags. + ; It must therefore not use the xor-dec lowering. + %c = select i1 %a, i32 %b, i32 -1 + ret i32 %c + +; CHECK32-LABEL: rematerialize_minus_one_eflags +; CHECK32: xorl %ecx, %ecx +; CHECK32-NEXT: decl %ecx +; CHECK32: calll +; CHECK32: cmpl +; CHECK32: setne +; CHECK32-NOT: xorl +; CHECK32: movl $-1 +; CHECK32: cmov +; CHECK32: retl +} + +declare x86_thiscallcc void @f(i32) diff --git a/test/CodeGen/X86/powi.ll b/test/CodeGen/X86/powi.ll index 88b5f4eb14b..17d3e3e7d33 100644 --- a/test/CodeGen/X86/powi.ll +++ b/test/CodeGen/X86/powi.ll @@ -29,9 +29,9 @@ define double @pow_wrapper_optsize(double %a) optsize { define double @pow_wrapper_minsize(double %a) minsize { ; CHECK-LABEL: pow_wrapper_minsize: ; CHECK: # BB#0: -; CHECK-NEXT: movl $15, %edi +; CHECK-NEXT: movl $128, %edi ; CHECK-NEXT: jmp - %ret = tail call double @llvm.powi.f64(double %a, i32 15) nounwind ; [#uses=1] + %ret = tail call double @llvm.powi.f64(double %a, i32 128) nounwind ; [#uses=1] ret double %ret } -- 2.34.1