From: Hans Wennborg
Date: Tue, 15 Dec 2015 17:10:28 +0000 (+0000)
Subject: [X86] Smaller code for materializing 32-bit 1 and -1 constants
X-Git-Url: http://plrg.eecs.uci.edu/git/?p=oota-llvm.git;a=commitdiff_plain;h=4bfd3034414d477377a955b3ed52237ec91bdd45

[X86] Smaller code for materializing 32-bit 1 and -1 constants

"movl $-1, %eax" is 5 bytes, "xorl %eax, %eax; decl %eax" is 3 bytes.
This commit makes LLVM use the latter when optimizing for size.
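As a rough sketch of where the byte counts come from (standard 32-bit x86
encodings, AT&T syntax; %eax stands in for any 32-bit GPR):

    b8 ff ff ff ff   movl $-1, %eax    # mov r32, imm32: 5 bytes
    31 c0            xorl %eax, %eax   # xor r32, r32: 2 bytes
    48               decl %eax         # one-byte dec r32: 1 byte

In 64-bit mode the one-byte inc/dec opcodes (0x40-0x4f) act as REX prefixes
and decl needs the two-byte ff /1 encoding instead, which is presumably why
the new pseudo instructions below are guarded by Not64BitMode and the test
keeps a FIXME for the 64-bit case.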
Differential Revision: http://reviews.llvm.org/D14971

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@255656 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td
index edc02311db5..a585775f84e 100644
--- a/lib/Target/X86/X86InstrCompiler.td
+++ b/lib/Target/X86/X86InstrCompiler.td
@@ -262,6 +262,22 @@ def : Pat<(i64 0), (SUBREG_TO_REG (i64 0), (MOV32r0), sub_32bit)> {
   let AddedComplexity = 20;
 }
 
+let Predicates = [OptForSize, NotSlowIncDec, Not64BitMode],
+    AddedComplexity = 1 in {
+  // Pseudo instructions for materializing 1 and -1 using XOR+INC/DEC,
+  // which only require 3 bytes compared to MOV32ri which requires 5.
+  let Defs = [EFLAGS], isReMaterializable = 1, isPseudo = 1 in {
+    def MOV32r1 : I<0, Pseudo, (outs GR32:$dst), (ins), "",
+                    [(set GR32:$dst, 1)]>;
+    def MOV32r_1 : I<0, Pseudo, (outs GR32:$dst), (ins), "",
+                     [(set GR32:$dst, -1)]>;
+  }
+
+  // MOV16ri is 4 bytes, so the instructions above are smaller.
+  def : Pat<(i16 1), (EXTRACT_SUBREG (MOV32r1), sub_16bit)>;
+  def : Pat<(i16 -1), (EXTRACT_SUBREG (MOV32r_1), sub_16bit)>;
+}
+
 // Materialize i64 constant where top 32-bits are zero. This could theoretically
 // use MOV32ri with a SUBREG_TO_REG to represent the zero-extension, however
 // that would make it more difficult to rematerialize.
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index c417a1c48ae..7f0766df26f 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -2472,13 +2472,29 @@ void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB,
                                  unsigned DestReg, unsigned SubIdx,
                                  const MachineInstr *Orig,
                                  const TargetRegisterInfo &TRI) const {
-  // MOV32r0 is implemented with a xor which clobbers condition code.
-  // Re-materialize it as movri instructions to avoid side effects.
-  unsigned Opc = Orig->getOpcode();
-  if (Opc == X86::MOV32r0 && !isSafeToClobberEFLAGS(MBB, I)) {
+  bool ClobbersEFLAGS = false;
+  for (const MachineOperand &MO : Orig->operands()) {
+    if (MO.isReg() && MO.isDef() && MO.getReg() == X86::EFLAGS) {
+      ClobbersEFLAGS = true;
+      break;
+    }
+  }
+
+  if (ClobbersEFLAGS && !isSafeToClobberEFLAGS(MBB, I)) {
+    // The instruction clobbers EFLAGS. Re-materialize as MOV32ri to avoid side
+    // effects.
+    int Value;
+    switch (Orig->getOpcode()) {
+    case X86::MOV32r0: Value = 0; break;
+    case X86::MOV32r1: Value = 1; break;
+    case X86::MOV32r_1: Value = -1; break;
+    default:
+      llvm_unreachable("Unexpected instruction!");
+    }
+
     DebugLoc DL = Orig->getDebugLoc();
     BuildMI(MBB, I, DL, get(X86::MOV32ri)).addOperand(Orig->getOperand(0))
-      .addImm(0);
+      .addImm(Value);
   } else {
     MachineInstr *MI = MBB.getParent()->CloneMachineInstr(Orig);
     MBB.insert(I, MI);
@@ -5262,6 +5278,24 @@ static bool Expand2AddrUndef(MachineInstrBuilder &MIB,
   return true;
 }
 
+static bool expandMOV32r1(MachineInstrBuilder &MIB, const TargetInstrInfo &TII,
+                          bool MinusOne) {
+  MachineBasicBlock &MBB = *MIB->getParent();
+  DebugLoc DL = MIB->getDebugLoc();
+  unsigned Reg = MIB->getOperand(0).getReg();
+
+  // Insert the XOR.
+  BuildMI(MBB, MIB.getInstr(), DL, TII.get(X86::XOR32rr), Reg)
+      .addReg(Reg, RegState::Undef)
+      .addReg(Reg, RegState::Undef);
+
+  // Turn the pseudo into an INC or DEC.
+  MIB->setDesc(TII.get(MinusOne ? X86::DEC32r : X86::INC32r));
+  MIB.addReg(Reg);
+
+  return true;
+}
+
 // LoadStackGuard has so far only been implemented for 64-bit MachO. Different
 // code sequence is needed for other targets.
 static void expandLoadStackGuard(MachineInstrBuilder &MIB,
@@ -5290,6 +5324,10 @@ bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
   switch (MI->getOpcode()) {
   case X86::MOV32r0:
     return Expand2AddrUndef(MIB, get(X86::XOR32rr));
+  case X86::MOV32r1:
+    return expandMOV32r1(MIB, *this, /*MinusOne=*/ false);
+  case X86::MOV32r_1:
+    return expandMOV32r1(MIB, *this, /*MinusOne=*/ true);
   case X86::SETB_C8r:
     return Expand2AddrUndef(MIB, get(X86::SBB8rr));
   case X86::SETB_C16r:
diff --git a/test/CodeGen/X86/materialize-one.ll b/test/CodeGen/X86/materialize-one.ll
new file mode 100644
index 00000000000..49da8008b88
--- /dev/null
+++ b/test/CodeGen/X86/materialize-one.ll
@@ -0,0 +1,100 @@
+; RUN: llc -mtriple=i686-unknown-linux-gnu -mattr=+cmov %s -o - | FileCheck %s --check-prefix=CHECK32
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+cmov %s -o - | FileCheck %s --check-prefix=CHECK64
+
+define i32 @one32() optsize {
+entry:
+  ret i32 1
+
+; CHECK32-LABEL: one32
+; CHECK32: xorl %eax, %eax
+; CHECK32-NEXT: incl %eax
+; CHECK32-NEXT: ret
+
+; FIXME: Figure out the best approach in 64-bit mode.
+; CHECK64-LABEL: one32
+; CHECK64: movl $1, %eax
+; CHECK64-NEXT: retq
+}
+
+define i32 @minus_one32() optsize {
+entry:
+  ret i32 -1
+
+; CHECK32-LABEL: minus_one32
+; CHECK32: xorl %eax, %eax
+; CHECK32-NEXT: decl %eax
+; CHECK32-NEXT: ret
+}
+
+define i16 @one16() optsize {
+entry:
+  ret i16 1
+
+; CHECK32-LABEL: one16
+; CHECK32: xorl %eax, %eax
+; CHECK32-NEXT: incl %eax
+; CHECK32-NEXT: retl
+}
+
+define i16 @minus_one16() optsize {
+entry:
+  ret i16 -1
+
+; CHECK32-LABEL: minus_one16
+; CHECK32: xorl %eax, %eax
+; CHECK32-NEXT: decl %eax
+; CHECK32-NEXT: retl
+}
+
+define i32 @test_rematerialization() optsize {
+entry:
+  ; Materialize -1 (thiscall forces it into %ecx).
+  tail call x86_thiscallcc void @f(i32 -1)
+
+  ; Clobber all registers except %esp, leaving nowhere to store the -1 besides
+  ; spilling it to the stack.
+  tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{edi},~{esi},~{ebp},~{dirflag},~{fpsr},~{flags}"()
+
+  ; -1 should be re-materialized here instead of getting spilled above.
+  ret i32 -1
+
+; CHECK32-LABEL: test_rematerialization
+; CHECK32: xorl %ecx, %ecx
+; CHECK32-NEXT: decl %ecx
+; CHECK32: calll
+; CHECK32: xorl %eax, %eax
+; CHECK32-NEXT: decl %eax
+; CHECK32-NOT: %eax
+; CHECK32: retl
+}
+
+define i32 @test_rematerialization2(i32 %x) optsize {
+entry:
+  ; Materialize -1 (thiscall forces it into %ecx).
+  tail call x86_thiscallcc void @f(i32 -1)
+
+  ; Clobber all registers except %esp, leaving nowhere to store the -1 besides
+  ; spilling it to the stack.
+  tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{edi},~{esi},~{ebp},~{dirflag},~{fpsr},~{flags}"()
+
+  ; Define eflags.
+  %a = icmp ne i32 %x, 123
+  %b = zext i1 %a to i32
+  ; Cause -1 to be rematerialized right in front of the cmov, which needs eflags.
+  ; It must therefore not use the xor-dec lowering.
+  %c = select i1 %a, i32 %b, i32 -1
+  ret i32 %c
+
+; CHECK32-LABEL: test_rematerialization2
+; CHECK32: xorl %ecx, %ecx
+; CHECK32-NEXT: decl %ecx
+; CHECK32: calll
+; CHECK32: cmpl
+; CHECK32: setne
+; CHECK32-NOT: xorl
+; CHECK32: movl $-1
+; CHECK32: cmov
+; CHECK32: retl
+}
+
+declare x86_thiscallcc void @f(i32)