diff --git a/lib/Target/X86/README.txt b/lib/Target/X86/README.txt
index 46ef8b22250..67eb2ce1a54 100644
--- a/lib/Target/X86/README.txt
+++ b/lib/Target/X86/README.txt
@@ -2,24 +2,19 @@
 // Random ideas for the X86 backend.
 //===---------------------------------------------------------------------===//
 
-Add a MUL2U and MUL2S nodes to represent a multiply that returns both the
-Hi and Lo parts (combination of MUL and MULH[SU] into one node). Add this to
-X86, & make the dag combiner produce it when needed. This will eliminate one
-imul from the code generated for:
+Missing features:
+  - Support for SSE4: http://www.intel.com/software/penryn
+http://softwarecommunity.intel.com/isn/Downloads/Intel%20SSE4%20Programming%20Reference.pdf
+  - support for 3DNow!
+  - weird ABIs?
 
-long long test(long long X, long long Y) { return X*Y; }
-
-by using the EAX result from the mul. We should add a similar node for
-DIVREM.
-
-another case is:
-
-long long test(int X, int Y) { return (long long)X*Y; }
-
-... which should only be one imul instruction.
+//===---------------------------------------------------------------------===//
 
-This can be done with a custom expander, but it would be nice to move this to
-generic code.
+CodeGen/X86/lea-3.ll:test3 should be a single LEA, not a shift/move. The X86
+backend knows how to three-addressify this shift, but it appears the register
+allocator isn't even asking it to do so in this case. We should investigate
+why this isn't happening; it could have a significant impact on other important
+cases for X86 as well.
 
 //===---------------------------------------------------------------------===//
 
@@ -94,6 +89,8 @@
 Should we promote i16 to i32 to avoid partial register update stalls?
 Leave any_extend as pseudo instruction and hint to register allocator. Delay
 codegen until post register allocation.
+Note: any_extend is now turned into an INSERT_SUBREG. We still need to teach
+the coalescer how to deal with it though.
 
 //===---------------------------------------------------------------------===//
 
@@ -125,9 +122,7 @@ int foo (unsigned long j) {
 
 //===---------------------------------------------------------------------===//
 
-Use push/pop instructions in prolog/epilog sequences instead of stores off
-ESP (certain code size win, perf win on some [which?] processors).
-Also, it appears icc use push for parameter passing. Need to investigate.
+It appears icc uses push for parameter passing. Need to investigate.
 
 //===---------------------------------------------------------------------===//
 
@@ -267,10 +262,6 @@ It would be better to emit "cmp %al, 1" than a xor and test.
 
 //===---------------------------------------------------------------------===//
 
-Enable X86InstrInfo::convertToThreeAddress().
-
-//===---------------------------------------------------------------------===//
-
 We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and rep/movsl
 We should leave these as libcalls for everything over a much lower threshold,
 since libc is hand tuned for medium and large mem ops (avoiding RFO for large
@@ -339,6 +330,45 @@ lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesor.

 //===---------------------------------------------------------------------===//
 
+We are generating far worse code than gcc:
+
+volatile short X, Y;
+
+void foo(int N) {
+  int i;
+  for (i = 0; i < N; i++) { X = i; Y = i*4; }
+}
+
+LBB1_1: # entry.bb_crit_edge
+        xorl    %ecx, %ecx
+        xorw    %dx, %dx
+LBB1_2: # bb
+        movl    L_X$non_lazy_ptr, %esi
+        movw    %cx, (%esi)
+        movl    L_Y$non_lazy_ptr, %esi
+        movw    %dx, (%esi)
+        addw    $4, %dx
+        incl    %ecx
+        cmpl    %eax, %ecx
+        jne     LBB1_2  # bb
+
+vs.
+
+        xorl    %edx, %edx
+        movl    L_X$non_lazy_ptr-"L00000000001$pb"(%ebx), %esi
+        movl    L_Y$non_lazy_ptr-"L00000000001$pb"(%ebx), %ecx
+L4:
+        movw    %dx, (%esi)
+        leal    0(,%edx,4), %eax
+        movw    %ax, (%ecx)
+        addl    $1, %edx
+        cmpl    %edx, %edi
+        jne     L4
+
+This is due to the lack of post regalloc LICM.
+
+//===---------------------------------------------------------------------===//
+
 Teach the coalescer to coalesce vregs of different register classes. e.g. FR32 /
 FR64 to VR128.
 
@@ -405,21 +435,6 @@ require a copy to be inserted (in X86InstrInfo::convertToThreeAddress).
 
 //===---------------------------------------------------------------------===//
 
-Bad codegen:
-
-char foo(int x) { return x; }
-
-_foo:
-        movl    4(%esp), %eax
-        shll    $24, %eax
-        sarl    $24, %eax
-        ret
-
-SIGN_EXTEND_INREG can be implemented as (sext (trunc)) to take advantage of
-sub-registers.
-
-//===---------------------------------------------------------------------===//
-
 Consider this:
 
 typedef struct pair { float A, B; } pair;
@@ -479,22 +494,6 @@ the load's chain result is read by the callseq_start.
 
 //===---------------------------------------------------------------------===//
 
-Don't forget to find a way to squash noop truncates in the JIT environment.
-
-//===---------------------------------------------------------------------===//
-
-Implement anyext in the same manner as truncate that would allow them to be
-eliminated.
-
-//===---------------------------------------------------------------------===//
-
-How about implementing truncate / anyext as a property of machine instruction
-operand? i.e. Print as 32-bit super-class register / 16-bit sub-class register.
-Do this for the cases where a truncate / anyext is guaranteed to be eliminated.
-For IA32 that is truncate from 32 to 16 and anyext from 16 to 32.
-
-//===---------------------------------------------------------------------===//
-
 For this:
 
 int test(int a)
@@ -522,7 +521,25 @@ shorter than movl + leal.
 
 //===---------------------------------------------------------------------===//
 
-Implement CTTZ, CTLZ with bsf and bsr.
+Implement CTTZ, CTLZ with bsf and bsr. GCC produces:
+
+int ctz_(unsigned X) { return __builtin_ctz(X); }
+int clz_(unsigned X) { return __builtin_clz(X); }
+int ffs_(unsigned X) { return __builtin_ffs(X); }
+
+_ctz_:
+        bsfl    4(%esp), %eax
+        ret
+_clz_:
+        bsrl    4(%esp), %eax
+        xorl    $31, %eax
+        ret
+_ffs_:
+        movl    $-1, %edx
+        bsfl    4(%esp), %eax
+        cmove   %edx, %eax
+        addl    $1, %eax
+        ret
 
 //===---------------------------------------------------------------------===//
 
@@ -634,6 +651,26 @@ _f:
 
 etc.
 
+Another is:
+int usesbb(unsigned int a, unsigned int b) {
+  return (a < b ? -1 : 0);
+}
+to:
+_usesbb:
+        movl    8(%esp), %eax
+        cmpl    %eax, 4(%esp)
+        sbbl    %eax, %eax
+        ret
+
+instead of:
+_usesbb:
+        xorl    %eax, %eax
+        movl    8(%esp), %ecx
+        cmpl    %ecx, 4(%esp)
+        movl    $4294967295, %ecx
+        cmovb   %ecx, %eax
+        ret
+
 //===---------------------------------------------------------------------===//
 
 Currently we don't have elimination of redundant stack manipulations. Consider
@@ -868,3 +905,755 @@ _f2:
 
 //===---------------------------------------------------------------------===//
 
+This code:
+
+void test(int X) {
+  if (X) abort();
+}
+
+is currently compiled to:
+
+_test:
+        subl    $12, %esp
+        cmpl    $0, 16(%esp)
+        jne     LBB1_1
+        addl    $12, %esp
+        ret
+LBB1_1:
+        call    L_abort$stub
+
+It would be better to produce:
+
+_test:
+        subl    $12, %esp
+        cmpl    $0, 16(%esp)
+        jne     L_abort$stub
+        addl    $12, %esp
+        ret
+
+This can be applied to any no-return function call that takes no arguments etc.
+Alternatively, the stack save/restore logic could be shrink-wrapped, producing
+something like this:
+
+_test:
+        cmpl    $0, 4(%esp)
+        jne     LBB1_1
+        ret
+LBB1_1:
+        subl    $12, %esp
+        call    L_abort$stub
+
+Both are useful in different situations. Finally, it could be shrink-wrapped
+and tail called, like this:
+
+_test:
+        cmpl    $0, 4(%esp)
+        jne     LBB1_1
+        ret
+LBB1_1:
+        pop     %eax    # realign stack.
+        call    L_abort$stub
+
+Though this probably isn't worth it.
+
+//===---------------------------------------------------------------------===//
+
+We need to teach the codegen to convert two-address INC instructions to LEA
+when the flags are dead (likewise dec). For example, on X86-64, compile:
+
+int foo(int A, int B) {
+  return A+1;
+}
+
+to:
+
+_foo:
+        leal    1(%edi), %eax
+        ret
+
+instead of:
+
+_foo:
+        incl    %edi
+        movl    %edi, %eax
+        ret
+
+Another example is:
+
+;; X's live range extends beyond the shift, so the register allocator
+;; cannot coalesce it with Y. Because of this, a copy needs to be
+;; emitted before the shift to save the register value before it is
+;; clobbered. However, this copy is not needed if the register
+;; allocator turns the shift into an LEA. This also occurs for ADD.
+
+; Check that the shift gets turned into an LEA.
+; RUN: llvm-upgrade < %s | llvm-as | llc -march=x86 -x86-asm-syntax=intel | \
+; RUN:   not grep {mov E.X, E.X}
+
+%G = external global int
+
+int %test1(int %X, int %Y) {
+        %Z = add int %X, %Y
+        volatile store int %Y, int* %G
+        volatile store int %Z, int* %G
+        ret int %X
+}
+
+int %test2(int %X) {
+        %Z = add int %X, 1  ;; inc
+        volatile store int %Z, int* %G
+        ret int %X
+}
+
+//===---------------------------------------------------------------------===//
+
+Sometimes it is better to codegen subtractions from a constant (e.g. 7-x) with
+a neg instead of a sub instruction. Consider:
+
+int test(char X) { return 7-X; }
+
+we currently produce:
+_test:
+        movl    $7, %eax
+        movsbl  4(%esp), %ecx
+        subl    %ecx, %eax
+        ret
+
+We would use one fewer register if codegen'd as:
+
+        movsbl  4(%esp), %eax
+        neg     %eax
+        add     $7, %eax
+        ret
+
+Note that this isn't beneficial if the load can be folded into the sub. In
+this case, we want a sub:
+
+int test(int X) { return 7-X; }
+_test:
+        movl    $7, %eax
+        subl    4(%esp), %eax
+        ret
+
+//===---------------------------------------------------------------------===//
+
+For code like:
+phi (undef, x)
+
+We get an implicit def on the undef side. If the phi is spilled, we then get:
+implicitdef xmm1
+store xmm1 -> stack
+
+It should be possible to teach the x86 backend to "fold" the store into the
+implicitdef, which just deletes the implicit def.
+
+These instructions should go away:
+#IMPLICIT_DEF %xmm1
+movaps %xmm1, 192(%esp)
+movaps %xmm1, 224(%esp)
+movaps %xmm1, 176(%esp)
+
+//===---------------------------------------------------------------------===//
+
+This is a "commutable two-address" register coalescing deficiency:
+
+define <4 x float> @test1(<4 x float> %V) {
+entry:
+        %tmp8 = shufflevector <4 x float> %V, <4 x float> undef,
+                              <4 x i32> < i32 3, i32 2, i32 1, i32 0 >
+        %add = add <4 x float> %tmp8, %V
+        ret <4 x float> %add
+}
+
+this codegens to:
+
+_test1:
+        pshufd  $27, %xmm0, %xmm1
+        addps   %xmm0, %xmm1
+        movaps  %xmm1, %xmm0
+        ret
+
+instead of:
+
+_test1:
+        pshufd  $27, %xmm0, %xmm1
+        addps   %xmm1, %xmm0
+        ret
+
+//===---------------------------------------------------------------------===//
+
+Leaf functions that require one 4-byte spill slot have a prolog like this:
+
+_foo:
+        pushl   %esi
+        subl    $4, %esp
+...
+and an epilog like this:
+        addl    $4, %esp
+        popl    %esi
+        ret
+
+It would be smaller, and potentially faster, to push eax on entry and to
+pop into a dummy register instead of using addl/subl of esp. Just don't pop
+into any return registers :)
+
+//===---------------------------------------------------------------------===//
+
+The X86 backend should fold (branch (or (setcc, setcc))) into multiple
+branches. We generate really poor code for:
+
+double testf(double a) {
+  return a == 0.0 ? 0.0 : (a > 0.0 ? 1.0 : -1.0);
+}
+
+For example, the entry BB is:
+
+_testf:
+        subl    $20, %esp
+        pxor    %xmm0, %xmm0
+        movsd   24(%esp), %xmm1
+        ucomisd %xmm0, %xmm1
+        setnp   %al
+        sete    %cl
+        testb   %cl, %al
+        jne     LBB1_5  # UnifiedReturnBlock
+LBB1_1: # cond_true
+
+
+it would be better to replace the last four instructions with:
+
+        jp      LBB1_1
+        je      LBB1_5
+LBB1_1:
+
+We also codegen the inner ?: into a diamond:
+
+        cvtss2sd LCPI1_0(%rip), %xmm2
+        cvtss2sd LCPI1_1(%rip), %xmm3
+        ucomisd %xmm1, %xmm0
+        ja      LBB1_3  # cond_true
+LBB1_2: # cond_true
+        movapd  %xmm3, %xmm2
+LBB1_3: # cond_true
+        movapd  %xmm2, %xmm0
+        ret
+
+We should sink the load into xmm3 into the LBB1_2 block. This should
+be pretty easy, and will nuke all the copies.
+
+//===---------------------------------------------------------------------===//
+
+This:
+        #include <utility>
+        inline std::pair<unsigned, bool> full_add(unsigned a, unsigned b)
+        { return std::make_pair(a + b, a + b < a); }
+        bool no_overflow(unsigned a, unsigned b)
+        { return !full_add(a, b).second; }
+
+Should compile to:
+
+
+_Z11no_overflowjj:
+        addl    %edi, %esi
+        setae   %al
+        ret
+
+on x86-64, not:
+
+__Z11no_overflowjj:
+        addl    %edi, %esi
+        cmpl    %edi, %esi
+        setae   %al
+        movzbl  %al, %eax
+        ret
+
+
+//===---------------------------------------------------------------------===//
+
+Re-materialize MOV32r0 etc. with xor instead of changing them to moves if the
+condition register is dead. xor reg reg is shorter than mov reg, #0.
+
+//===---------------------------------------------------------------------===//
+
+We aren't matching RMW instructions aggressively
+enough. Here's a reduced testcase (more in PR1160):
+
+define void @test(i32* %huge_ptr, i32* %target_ptr) {
+        %A = load i32* %huge_ptr                ; [#uses=1]
+        %B = load i32* %target_ptr              ; [#uses=1]
+        %C = or i32 %A, %B                      ; [#uses=1]
+        store i32 %C, i32* %target_ptr
+        ret void
+}
+
+$ llvm-as < t.ll | llc -march=x86-64
+
+_test:
+        movl    (%rdi), %eax
+        orl     (%rsi), %eax
+        movl    %eax, (%rsi)
+        ret
+
+That should be something like:
+
+_test:
+        movl    (%rdi), %eax
+        orl     %eax, (%rsi)
+        ret
+
+//===---------------------------------------------------------------------===//
+
+The following code:
+
+bb114.preheader:                ; preds = %cond_next94
+        %tmp231232 = sext i16 %tmp62 to i32             ; [#uses=1]
+        %tmp233 = sub i32 32, %tmp231232                ; [#uses=1]
+        %tmp245246 = sext i16 %tmp65 to i32             ; [#uses=1]
+        %tmp252253 = sext i16 %tmp68 to i32             ; [#uses=1]
+        %tmp254 = sub i32 32, %tmp252253                ; [#uses=1]
+        %tmp553554 = bitcast i16* %tmp37 to i8*         ; [#uses=2]
+        %tmp583584 = sext i16 %tmp98 to i32             ; [#uses=1]
+        %tmp585 = sub i32 32, %tmp583584                ; [#uses=1]
+        %tmp614615 = sext i16 %tmp101 to i32            ; [#uses=1]
+        %tmp621622 = sext i16 %tmp104 to i32            ; [#uses=1]
+        %tmp623 = sub i32 32, %tmp621622                ; [#uses=1]
+        br label %bb114
+
+produces:
+
+LBB3_5: # bb114.preheader
+        movswl  -68(%ebp), %eax
+        movl    $32, %ecx
+        movl    %ecx, -80(%ebp)
+        subl    %eax, -80(%ebp)
+        movswl  -52(%ebp), %eax
+        movl    %ecx, -84(%ebp)
+        subl    %eax, -84(%ebp)
+        movswl  -70(%ebp), %eax
+        movl    %ecx, -88(%ebp)
+        subl    %eax, -88(%ebp)
+        movswl  -50(%ebp), %eax
+        subl    %eax, %ecx
+        movl    %ecx, -76(%ebp)
+        movswl  -42(%ebp), %eax
+        movl    %eax, -92(%ebp)
+        movswl  -66(%ebp), %eax
+        movl    %eax, -96(%ebp)
+        movw    $0, -98(%ebp)
+
+This appears to be bad because the RA is not folding the store to the stack
+slot into the movl. The above instructions could be:
+        movl    $32, -80(%ebp)
+...
+        movl    $32, -84(%ebp)
+...
+This seems like a cross between remat and spill folding.
+
+This has redundant subtractions of %eax from a stack slot. However, %ecx doesn't
+change, so we could simply subtract %eax from %ecx first and then use %ecx (or
+vice-versa).
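+
+A hypothetical C-level reduction of the pattern above (the IR is just a series
+of "32 - (int)short_var" computations whose results go straight to memory; the
+variable names below are invented purely for illustration):
+
+short s1, s2, s3, s4;
+int d1, d2, d3, d4;
+
+void repro(void) {
+  /* Each "32 - short_var" result is stored immediately, so ideally the
+     "movl $32" and the subl would be folded into memory operations
+     instead of bouncing through a register. */
+  d1 = 32 - s1;
+  d2 = 32 - s2;
+  d3 = 32 - s3;
+  d4 = 32 - s4;
+}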
+
+//===---------------------------------------------------------------------===//
+
+For this code:
+
+cond_next603:           ; preds = %bb493, %cond_true336, %cond_next599
+        %v.21050.1 = phi i32 [ %v.21050.0, %cond_next599 ], [ %tmp344, %cond_true336 ], [ %v.2, %bb493 ]               ; [#uses=1]
+        %maxz.21051.1 = phi i32 [ %maxz.21051.0, %cond_next599 ], [ 0, %cond_true336 ], [ %maxz.2, %bb493 ]            ; [#uses=2]
+        %cnt.01055.1 = phi i32 [ %cnt.01055.0, %cond_next599 ], [ 0, %cond_true336 ], [ %cnt.0, %bb493 ]               ; [#uses=2]
+        %byteptr.9 = phi i8* [ %byteptr.12, %cond_next599 ], [ %byteptr.0, %cond_true336 ], [ %byteptr.10, %bb493 ]    ; [#uses=9]
+        %bitptr.6 = phi i32 [ %tmp5571104.1, %cond_next599 ], [ %tmp4921049, %cond_true336 ], [ %bitptr.7, %bb493 ]    ; [#uses=4]
+        %source.5 = phi i32 [ %tmp602, %cond_next599 ], [ %source.0, %cond_true336 ], [ %source.6, %bb493 ]            ; [#uses=7]
+        %tmp606 = getelementptr %struct.const_tables* @tables, i32 0, i32 0, i32 %cnt.01055.1          ; [#uses=1]
+        %tmp607 = load i8* %tmp606, align 1             ; [#uses=1]
+
+We produce this:
+
+LBB4_70:        # cond_next603
+        movl    -20(%ebp), %esi
+        movl    L_tables$non_lazy_ptr-"L4$pb"(%esi), %esi
+
+However, ICC caches this information before the loop and produces this:
+
+        movl    88(%esp), %eax  #481.12
+
+//===---------------------------------------------------------------------===//
+
+This code:
+
+        %tmp659 = icmp slt i16 %tmp654, 0               ; [#uses=1]
+        br i1 %tmp659, label %cond_true662, label %cond_next715
+
+produces this:
+
+        testw   %cx, %cx
+        movswl  %cx, %esi
+        jns     LBB4_109        # cond_next715
+
+Shark tells us that using %cx in the testw instruction is sub-optimal. It
+suggests using the 32-bit register (which is what ICC uses).
+
+//===---------------------------------------------------------------------===//
+
+rdar://5506677 - We compile this:
+
+define i32 @foo(double %x) {
+        %x14 = bitcast double %x to i64         ; [#uses=1]
+        %tmp713 = trunc i64 %x14 to i32         ; [#uses=1]
+        %tmp8 = and i32 %tmp713, 2147483647     ; [#uses=1]
+        ret i32 %tmp8
+}
+
+to:
+
+_foo:
+        subl    $12, %esp
+        fldl    16(%esp)
+        fstpl   (%esp)
+        movl    $2147483647, %eax
+        andl    (%esp), %eax
+        addl    $12, %esp
+        #FP_REG_KILL
+        ret
+
+It would be much better to eliminate the fldl/fstpl by folding the bitcast
+into the load SDNode. That would give us:
+
+_foo:
+        movl    $2147483647, %eax
+        andl    4(%esp), %eax
+        ret
+
+//===---------------------------------------------------------------------===//
+
+We compile this:
+
+void compare (long long foo) {
+  if (foo < 4294967297LL)
+    abort();
+}
+
+to:
+
+_compare:
+        subl    $12, %esp
+        cmpl    $0, 16(%esp)
+        setne   %al
+        movzbw  %al, %ax
+        cmpl    $1, 20(%esp)
+        setg    %cl
+        movzbw  %cl, %cx
+        cmove   %ax, %cx
+        movw    %cx, %ax
+        testb   $1, %al
+        je      LBB1_2  # cond_true
+
+(also really horrible code on ppc). This is due to the expand code for 64-bit
+compares. GCC produces multiple branches, which is much nicer:
+
+_compare:
+        pushl   %ebp
+        movl    %esp, %ebp
+        subl    $8, %esp
+        movl    8(%ebp), %eax
+        movl    12(%ebp), %edx
+        subl    $1, %edx
+        jg      L5
+L7:
+        jl      L4
+        cmpl    $0, %eax
+        jbe     L4
+L5:
+
+//===---------------------------------------------------------------------===//
+
+Tail call optimization improvements: Tail call optimization currently
+pushes all arguments on the top of the stack (their normal place for
+non-tail call optimized calls) before moving them to the actual stack
+slots. This is done to prevent overwriting of parameters (see example
+below) that might be used, since the arguments of the callee
+overwrite the caller's arguments.
+
+example:
+
+int callee(int32, int64);
+int caller(int32 arg1, int32 arg2) {
+  int64 local = arg2 * 2;
+  return callee(arg2, (int64)local);
+}
+
+[arg1]          [!arg2 no longer valid since we moved local onto it]
+[arg2]    ->    [(int64)
+[RETADDR]        local  ]
+
+Moving arg1 onto the stack slot of the callee function would overwrite
+arg2 of the caller.
+
+Possible optimizations:
+
+ - Only push those arguments to the top of the stack that are actual
+   parameters of the caller function and have no local value in the
+   caller.
+
+   In the above example local does not need to be pushed onto the top
+   of the stack as it is definitely not a caller's function
+   parameter.
+
+ - Analyse the actual parameters of the callee to see which would
+   overwrite a caller parameter which is used by the callee and only
+   push them onto the top of the stack.
+
+   int callee (int32 arg1, int32 arg2);
+   int caller (int32 arg1, int32 arg2) {
+     return callee(arg1,arg2);
+   }
+
+   Here we don't need to write any variables to the top of the stack
+   since they don't overwrite each other.
+
+   int callee (int32 arg1, int32 arg2);
+   int caller (int32 arg1, int32 arg2) {
+     return callee(arg2,arg1);
+   }
+
+   Here we need to push the arguments because they overwrite each
+   other.
+
+
+ Code for lowering directly onto the caller's arguments:
++  SmallVector<std::pair<unsigned, SDOperand>, 8> RegsToPass;
++  SmallVector<SDOperand, 8> MemOpChains;
++
++  SDOperand FramePtr;
++  SDOperand PtrOff;
++  SDOperand FIN;
++  int FI = 0;
++  // Walk the register/memloc assignments, inserting copies/loads.
++  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
++    CCValAssign &VA = ArgLocs[i];
++    SDOperand Arg = Op.getOperand(5+2*VA.getValNo());
++
++    ....
++
++    if (VA.isRegLoc()) {
++      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
++    } else {
++      assert(VA.isMemLoc());
++      // create frame index
++      int32_t Offset = VA.getLocMemOffset()+FPDiff;
++      uint32_t OpSize = (MVT::getSizeInBits(VA.getLocVT())+7)/8;
++      FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset);
++      FIN = DAG.getFrameIndex(FI, MVT::i32);
++      // store relative to framepointer
++      MemOpChains.push_back(DAG.getStore(Chain, Arg, FIN, NULL, 0));
++    }
++  }
+//===---------------------------------------------------------------------===//
+
+main ()
+{
+  int i = 0;
+  unsigned long int z = 0;
+
+  do {
+    z -= 0x00004000;
+    i++;
+    if (i > 0x00040000)
+      abort ();
+  } while (z > 0);
+  exit (0);
+}
+
+gcc compiles this to:
+
+_main:
+        subl    $28, %esp
+        xorl    %eax, %eax
+        jmp     L2
+L3:
+        cmpl    $262144, %eax
+        je      L10
+L2:
+        addl    $1, %eax
+        cmpl    $262145, %eax
+        jne     L3
+        call    L_abort$stub
+L10:
+        movl    $0, (%esp)
+        call    L_exit$stub
+
+llvm:
+
+_main:
+        subl    $12, %esp
+        movl    $1, %eax
+        movl    $16384, %ecx
+LBB1_1: # bb
+        cmpl    $262145, %eax
+        jge     LBB1_4  # cond_true
+LBB1_2: # cond_next
+        incl    %eax
+        addl    $4294950912, %ecx
+        cmpl    $16384, %ecx
+        jne     LBB1_1  # bb
+LBB1_3: # bb11
+        xorl    %eax, %eax
+        addl    $12, %esp
+        ret
+LBB1_4: # cond_true
+        call    L_abort$stub
+
+1. LSR should rewrite the first cmp with induction variable %ecx.
+2. DAG combiner should fold
+        leal    1(%eax), %edx
+        cmpl    $262145, %edx
+   =>
+        cmpl    $262144, %eax
+
+//===---------------------------------------------------------------------===//
+
+define i64 @test(double %X) {
+        %Y = fptosi double %X to i64
+        ret i64 %Y
+}
+
+compiles to:
+
+_test:
+        subl    $20, %esp
+        movsd   24(%esp), %xmm0
+        movsd   %xmm0, 8(%esp)
+        fldl    8(%esp)
+        fisttpll        (%esp)
+        movl    4(%esp), %edx
+        movl    (%esp), %eax
+        addl    $20, %esp
+        #FP_REG_KILL
+        ret
+
+This should just fldl directly from the input stack slot.
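+
+For reference, the IR above is just a double-to-signed-64-bit conversion; a
+minimal C equivalent (reusing the function name from the IR) would be:
+
+long long test(double X) {
+  return (long long)X;   /* lowers to "fptosi double %X to i64" */
+}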
+
+//===---------------------------------------------------------------------===//
+
+This code:
+int foo (int x) { return (x & 65535) | 255; }
+
+Should compile into:
+
+_foo:
+        movzwl  4(%esp), %eax
+        orb     $-1, %al        ;; 'orl 255' is also fine :)
+        ret
+
+instead of:
+_foo:
+        movl    $255, %eax
+        orl     4(%esp), %eax
+        andl    $65535, %eax
+        ret
+
+//===---------------------------------------------------------------------===//
+
+We're missing an obvious fold of a load into imul:
+
+int test(long a, long b) { return a * b; }
+
+LLVM produces:
+_test:
+        movl    4(%esp), %ecx
+        movl    8(%esp), %eax
+        imull   %ecx, %eax
+        ret
+
+vs:
+_test:
+        movl    8(%esp), %eax
+        imull   4(%esp), %eax
+        ret
+
+//===---------------------------------------------------------------------===//
+
+We can fold a store into "zeroing a reg". Instead of:
+
+xorl %eax, %eax
+movl %eax, 124(%esp)
+
+we should get:
+
+movl $0, 124(%esp)
+
+if the flags of the xor are dead.
+
+//===---------------------------------------------------------------------===//
+
+This testcase misses a read/modify/write opportunity (from PR1425):
+
+void vertical_decompose97iH1(int *b0, int *b1, int *b2, int width){
+  int i;
+  for(i=0; i<width; i++)
+    b1[i] += (1*(b0[i] + b2[i])+0)>>0;
+}
+
+We compile it down to:
+
+LBB1_2: # bb
+        movl    (%esi,%edi,4), %ebx
+        addl    (%ecx,%edi,4), %ebx
+        addl    (%edx,%edi,4), %ebx
+        movl    %ebx, (%ecx,%edi,4)
+        incl    %edi
+        cmpl    %eax, %edi
+        jne     LBB1_2  # bb
+
+the inner loop should add to the memory location (%ecx,%edi,4), saving
+a mov. Something like:
+
+        movl    (%esi,%edi,4), %ebx
+        addl    (%edx,%edi,4), %ebx
+        addl    %ebx, (%ecx,%edi,4)
+
+Here is another interesting example:
+
+void vertical_compose97iH1(int *b0, int *b1, int *b2, int width){
+  int i;
+  for(i=0; i<width; i++)
+    b1[i] -= (1*(b0[i] + b2[i])+0)>>0;
+}
+
+We miss the r/m/w opportunity here by using 2 subs instead of an add+sub[mem]:
+
+LBB9_2: # bb
+        movl    (%ecx,%edi,4), %ebx
+        subl    (%esi,%edi,4), %ebx
+        subl    (%edx,%edi,4), %ebx
+        movl    %ebx, (%ecx,%edi,4)
+        incl    %edi
+        cmpl    %eax, %edi
+        jne     LBB9_2  # bb
+
+Additionally, LSR should rewrite the exit condition of these loops to use
+a stride-4 IV, which would allow all the scales in the loop to go away.
+This would result in smaller code and more efficient microops.
+
+//===---------------------------------------------------------------------===//
+
+We should be smarter about conversion from fpstack to XMM regs.
+
+double foo();
+void bar(double *P) { *P = foo(); }
+
+We compile that to:
+
+_bar:
+        subl    $12, %esp
+        call    L_foo$stub
+        fstpl   (%esp)
+        movl    16(%esp), %eax
+        movsd   (%esp), %xmm0
+        movsd   %xmm0, (%eax)
+        addl    $12, %esp
+        ret
+
+for example. The magic to/from the stack is unneeded.
+
+//===---------------------------------------------------------------------===//