X-Git-Url: http://plrg.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTarget%2FX86%2FREADME.txt;h=2374659a519a3bd5270cc5eae2d4298cc481ed81;hb=29681a53857b929fc8b4373884b8d958a144b9b7;hp=72223d14e41e973952cb1a32f935a6b6fc724b7a;hpb=08c33011d1443ce3e12b6c4026cde975b24c8e37;p=oota-llvm.git diff --git a/lib/Target/X86/README.txt b/lib/Target/X86/README.txt index 72223d14e41..2374659a519 100644 --- a/lib/Target/X86/README.txt +++ b/lib/Target/X86/README.txt @@ -2,21 +2,14 @@ // Random ideas for the X86 backend. //===---------------------------------------------------------------------===// -Add a MUL2U and MUL2S nodes to represent a multiply that returns both the -Hi and Lo parts (combination of MUL and MULH[SU] into one node). Add this to -X86, & make the dag combiner produce it when needed. This will eliminate one -imul from the code generated for: -long long test(long long X, long long Y) { return X*Y; } - -by using the EAX result from the mul. We should add a similar node for -DIVREM. - -another case is: - -long long test(int X, int Y) { return (long long)X*Y; } +//===---------------------------------------------------------------------===// -... which should only be one imul instruction. +CodeGen/X86/lea-3.ll:test3 should be a single LEA, not a shift/move. The X86 +backend knows how to three-addressify this shift, but it appears the register +allocator isn't even asking it to do so in this case. We should investigate +why this isn't happening, it could have significant impact on other important +cases for X86 as well. //===---------------------------------------------------------------------===// @@ -56,7 +49,19 @@ One better solution for 1LL << x is: But that requires good 8-bit subreg support. +Also, this might be better. It's an extra shift, but it's one instruction +shorter, and doesn't stress 8-bit subreg support. +(From http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01148.html, +but without the unnecessary and.) + movl %ecx, %eax + shrl $5, %eax + movl %eax, %edx + xorl $1, %edx + sall %cl, %eax + sall %cl. %edx +64-bit shifts (in general) expand to really bad code. Instead of using +cmovs, we should expand to a conditional branch like GCC produces. //===---------------------------------------------------------------------===// @@ -68,6 +73,9 @@ into: xorl $1, %eax ret +(Although note that this isn't a legal way to express the code that llvm-gcc +currently generates for that function.) + //===---------------------------------------------------------------------===// Some isel ideas: @@ -90,31 +98,12 @@ Should we promote i16 to i32 to avoid partial register update stalls? Leave any_extend as pseudo instruction and hint to register allocator. Delay codegen until post register allocation. +Note. any_extend is now turned into an INSERT_SUBREG. We still need to teach +the coalescer how to deal with it though. //===---------------------------------------------------------------------===// -Count leading zeros and count trailing zeros: - -int clz(int X) { return __builtin_clz(X); } -int ctz(int X) { return __builtin_ctz(X); } - -$ gcc t.c -S -o - -O3 -fomit-frame-pointer -masm=intel -clz: - bsr %eax, DWORD PTR [%esp+4] - xor %eax, 31 - ret -ctz: - bsf %eax, DWORD PTR [%esp+4] - ret - -however, check that these are defined for 0 and 32. Our intrinsics are, GCC's -aren't. - -//===---------------------------------------------------------------------===// - -Use push/pop instructions in prolog/epilog sequences instead of stores off -ESP (certain code size win, perf win on some [which?] processors). -Also, it appears icc use push for parameter passing. Need to investigate. +It appears icc use push for parameter passing. Need to investigate. //===---------------------------------------------------------------------===// @@ -200,9 +189,9 @@ when we can spare a register. It reduces code size. Evaluate what the best way to codegen sdiv X, (2^C) is. For X/8, we currently get this: -int %test1(int %X) { - %Y = div int %X, 8 - ret int %Y +define i32 @test1(i32 %X) { + %Y = sdiv i32 %X, 8 + ret i32 %Y } _test1: @@ -228,47 +217,6 @@ which is probably slower, but it's interesting at least :) //===---------------------------------------------------------------------===// -Should generate min/max for stuff like: - -void minf(float a, float b, float *X) { - *X = a <= b ? a : b; -} - -Make use of floating point min / max instructions. Perhaps introduce ISD::FMIN -and ISD::FMAX node types? - -//===---------------------------------------------------------------------===// - -The first BB of this code: - -declare bool %foo() -int %bar() { - %V = call bool %foo() - br bool %V, label %T, label %F -T: - ret int 1 -F: - call bool %foo() - ret int 12 -} - -compiles to: - -_bar: - subl $12, %esp - call L_foo$stub - xorb $1, %al - testb %al, %al - jne LBB_bar_2 # F - -It would be better to emit "cmp %al, 1" than a xor and test. - -//===---------------------------------------------------------------------===// - -Enable X86InstrInfo::convertToThreeAddress(). - -//===---------------------------------------------------------------------===// - We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and rep/movsl We should leave these as libcalls for everything over a much lower threshold, since libc is hand tuned for medium and large mem ops (avoiding RFO for large @@ -337,19 +285,47 @@ lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesor. //===---------------------------------------------------------------------===// -Teach the coalescer to coalesce vregs of different register classes. e.g. FR32 / -FR64 to VR128. +We are generating far worse code than gcc: -//===---------------------------------------------------------------------===// +volatile short X, Y; -mov $reg, 48(%esp) -... -leal 48(%esp), %eax -mov %eax, (%esp) -call _foo +void foo(int N) { + int i; + for (i = 0; i < N; i++) { X = i; Y = i*4; } +} + +LBB1_1: # entry.bb_crit_edge + xorl %ecx, %ecx + xorw %dx, %dx +LBB1_2: # bb + movl L_X$non_lazy_ptr, %esi + movw %cx, (%esi) + movl L_Y$non_lazy_ptr, %esi + movw %dx, (%esi) + addw $4, %dx + incl %ecx + cmpl %eax, %ecx + jne LBB1_2 # bb + +vs. + + xorl %edx, %edx + movl L_X$non_lazy_ptr-"L00000000001$pb"(%ebx), %esi + movl L_Y$non_lazy_ptr-"L00000000001$pb"(%ebx), %ecx +L4: + movw %dx, (%esi) + leal 0(,%edx,4), %eax + movw %ax, (%ecx) + addl $1, %edx + cmpl %edx, %edi + jne L4 + +This is due to the lack of post regalloc LICM. + +//===---------------------------------------------------------------------===// -Obviously it would have been better for the first mov (or any op) to store -directly %esp[0] if there are no other uses. +Teach the coalescer to coalesce vregs of different register classes. e.g. FR32 / +FR64 to VR128. //===---------------------------------------------------------------------===// @@ -403,58 +379,6 @@ require a copy to be inserted (in X86InstrInfo::convertToThreeAddress). //===---------------------------------------------------------------------===// -Bad codegen: - -char foo(int x) { return x; } - -_foo: - movl 4(%esp), %eax - shll $24, %eax - sarl $24, %eax - ret - -SIGN_EXTEND_INREG can be implemented as (sext (trunc)) to take advantage of -sub-registers. - -//===---------------------------------------------------------------------===// - -Consider this: - -typedef struct pair { float A, B; } pair; -void pairtest(pair P, float *FP) { - *FP = P.A+P.B; -} - -We currently generate this code with llvmgcc4: - -_pairtest: - subl $12, %esp - movl 20(%esp), %eax - movl %eax, 4(%esp) - movl 16(%esp), %eax - movl %eax, (%esp) - movss (%esp), %xmm0 - addss 4(%esp), %xmm0 - movl 24(%esp), %eax - movss %xmm0, (%eax) - addl $12, %esp - ret - -we should be able to generate: -_pairtest: - movss 4(%esp), %xmm0 - movl 12(%esp), %eax - addss 8(%esp), %xmm0 - movss %xmm0, (%eax) - ret - -The issue is that llvmgcc4 is forcing the struct to memory, then passing it as -integer chunks. It does this so that structs like {short,short} are passed in -a single 32-bit integer stack slot. We should handle the safe cases above much -nicer, while still handling the hard cases. - -//===---------------------------------------------------------------------===// - Another instruction selector deficiency: void %bar() { @@ -476,22 +400,6 @@ the load's chain result is read by the callseq_start. //===---------------------------------------------------------------------===// -Don't forget to find a way to squash noop truncates in the JIT environment. - -//===---------------------------------------------------------------------===// - -Implement anyext in the same manner as truncate that would allow them to be -eliminated. - -//===---------------------------------------------------------------------===// - -How about implementing truncate / anyext as a property of machine instruction -operand? i.e. Print as 32-bit super-class register / 16-bit sub-class register. -Do this for the cases where a truncate / anyext is guaranteed to be eliminated. -For IA32 that is truncate from 32 to 16 and anyext from 16 to 32. - -//===---------------------------------------------------------------------===// - For this: int test(int a) @@ -519,7 +427,39 @@ shorter than movl + leal. //===---------------------------------------------------------------------===// -Implement CTTZ, CTLZ with bsf and bsr. +__builtin_ffs codegen is messy. + +int ffs_(unsigned X) { return __builtin_ffs(X); } + +llvm produces: +ffs_: + movl 4(%esp), %ecx + bsfl %ecx, %eax + movl $32, %edx + cmove %edx, %eax + incl %eax + xorl %edx, %edx + testl %ecx, %ecx + cmove %edx, %eax + ret + +vs gcc: + +_ffs_: + movl $-1, %edx + bsfl 4(%esp), %eax + cmove %edx, %eax + addl $1, %eax + ret + +Another example of __builtin_ffs (use predsimplify to eliminate a select): + +int foo (unsigned long j) { + if (j) + return __builtin_ffs (j) - 1; + else + return 0; +} //===---------------------------------------------------------------------===// @@ -531,37 +471,28 @@ do not make use of. //===---------------------------------------------------------------------===// -We should handle __attribute__ ((__visibility__ ("hidden"))). - -//===---------------------------------------------------------------------===// - -int %foo(int* %a, int %t) { +define i32 @foo(i32* %a, i32 %t) { entry: - br label %cond_true - -cond_true: ; preds = %cond_true, %entry - %x.0.0 = phi int [ 0, %entry ], [ %tmp9, %cond_true ] ; [#uses=3] - %t_addr.0.0 = phi int [ %t, %entry ], [ %tmp7, %cond_true ] ; [#uses=1] - %tmp2 = getelementptr int* %a, int %x.0.0 ; [#uses=1] - %tmp3 = load int* %tmp2 ; [#uses=1] - %tmp5 = add int %t_addr.0.0, %x.0.0 ; [#uses=1] - %tmp7 = add int %tmp5, %tmp3 ; [#uses=2] - %tmp9 = add int %x.0.0, 1 ; [#uses=2] - %tmp = setgt int %tmp9, 39 ; [#uses=1] - br bool %tmp, label %bb12, label %cond_true - -bb12: ; preds = %cond_true - ret int %tmp7 + br label %cond_true + +cond_true: ; preds = %cond_true, %entry + %x.0.0 = phi i32 [ 0, %entry ], [ %tmp9, %cond_true ] ; [#uses=3] + %t_addr.0.0 = phi i32 [ %t, %entry ], [ %tmp7, %cond_true ] ; [#uses=1] + %tmp2 = getelementptr i32* %a, i32 %x.0.0 ; [#uses=1] + %tmp3 = load i32* %tmp2 ; [#uses=1] + %tmp5 = add i32 %t_addr.0.0, %x.0.0 ; [#uses=1] + %tmp7 = add i32 %tmp5, %tmp3 ; [#uses=2] + %tmp9 = add i32 %x.0.0, 1 ; [#uses=2] + %tmp = icmp sgt i32 %tmp9, 39 ; [#uses=1] + br i1 %tmp, label %bb12, label %cond_true + +bb12: ; preds = %cond_true + ret i32 %tmp7 } - is pessimized by -loop-reduce and -indvars //===---------------------------------------------------------------------===// -Use cpuid to auto-detect CPU features such as SSE, SSE2, and SSE3. - -//===---------------------------------------------------------------------===// - u32 to float conversion improvement: float uint32_2_float( unsigned u ) { @@ -607,69 +538,6 @@ or eax, 2 cmp eax, 6 jz label -If we aren't going to do this, we should lower the switch better. We compile -the code to: - -_f: - movl 8(%esp), %eax - movl 4(%esp), %ecx - cmpl $6, %ecx - jl LBB1_4 #entry - jmp LBB1_3 #entry -LBB1_3: #entry - cmpl $6, %ecx - je LBB1_1 #bb - jmp LBB1_2 #UnifiedReturnBlock -LBB1_4: #entry - cmpl $4, %ecx - jne LBB1_2 #UnifiedReturnBlock -LBB1_1: #bb - incl %eax - ret -LBB1_2: #UnifiedReturnBlock - ret - -In the code above, the 'if' is turned into a 'switch' at the mid-level. It looks -like the 'lower to branches' mode could be improved a little here. In particular, -the fall-through to LBB1_3 doesn't need a branch. It would also be nice to -eliminate the redundant "cmp 6", maybe by lowering to a linear sequence of -compares if there are below a certain number of cases (instead of a binary sequence)? - -//===---------------------------------------------------------------------===// - -Compile: -int %test(ulong *%tmp) { - %tmp = load ulong* %tmp ; [#uses=1] - %tmp.mask = shr ulong %tmp, ubyte 50 ; [#uses=1] - %tmp.mask = cast ulong %tmp.mask to ubyte ; [#uses=1] - %tmp2 = and ubyte %tmp.mask, 3 ; [#uses=1] - %tmp2 = cast ubyte %tmp2 to int ; [#uses=1] - ret int %tmp2 -} - -to: - -_test: - movl 4(%esp), %eax - movl 4(%eax), %eax - shrl $18, %eax - andl $3, %eax - ret - -instead of: - -_test: - movl 4(%esp), %eax - movl 4(%eax), %eax - shrl $18, %eax - # TRUNCATE movb %al, %al - andb $3, %al - movzbl %al, %eax - ret - -This saves a movzbl, and saves a truncate if it doesn't get coallesced right. -This is a simple DAGCombine to propagate the zext through the and. - //===---------------------------------------------------------------------===// GCC's ix86_expand_int_movcc function (in i386.c) has a ton of interesting @@ -702,5 +570,1140 @@ _f: etc. +Another is: +int usesbb(unsigned int a, unsigned int b) { + return (a < b ? -1 : 0); +} +to: +_usesbb: + movl 8(%esp), %eax + cmpl %eax, 4(%esp) + sbbl %eax, %eax + ret + +instead of: +_usesbb: + xorl %eax, %eax + movl 8(%esp), %ecx + cmpl %ecx, 4(%esp) + movl $4294967295, %ecx + cmovb %ecx, %eax + ret + //===---------------------------------------------------------------------===// +Currently we don't have elimination of redundant stack manipulations. Consider +the code: + +int %main() { +entry: + call fastcc void %test1( ) + call fastcc void %test2( sbyte* cast (void ()* %test1 to sbyte*) ) + ret int 0 +} + +declare fastcc void %test1() + +declare fastcc void %test2(sbyte*) + + +This currently compiles to: + + subl $16, %esp + call _test5 + addl $12, %esp + subl $16, %esp + movl $_test5, (%esp) + call _test6 + addl $12, %esp + +The add\sub pair is really unneeded here. + +//===---------------------------------------------------------------------===// + +Consider the expansion of: + +define i32 @test3(i32 %X) { + %tmp1 = urem i32 %X, 255 + ret i32 %tmp1 +} + +Currently it compiles to: + +... + movl $2155905153, %ecx + movl 8(%esp), %esi + movl %esi, %eax + mull %ecx +... + +This could be "reassociated" into: + + movl $2155905153, %eax + movl 8(%esp), %ecx + mull %ecx + +to avoid the copy. In fact, the existing two-address stuff would do this +except that mul isn't a commutative 2-addr instruction. I guess this has +to be done at isel time based on the #uses to mul? + +//===---------------------------------------------------------------------===// + +Make sure the instruction which starts a loop does not cross a cacheline +boundary. This requires knowning the exact length of each machine instruction. +That is somewhat complicated, but doable. Example 256.bzip2: + +In the new trace, the hot loop has an instruction which crosses a cacheline +boundary. In addition to potential cache misses, this can't help decoding as I +imagine there has to be some kind of complicated decoder reset and realignment +to grab the bytes from the next cacheline. + +532 532 0x3cfc movb (1809(%esp, %esi), %bl <<<--- spans 2 64 byte lines +942 942 0x3d03 movl %dh, (1809(%esp, %esi) +937 937 0x3d0a incl %esi +3 3 0x3d0b cmpb %bl, %dl +27 27 0x3d0d jnz 0x000062db + +//===---------------------------------------------------------------------===// + +In c99 mode, the preprocessor doesn't like assembly comments like #TRUNCATE. + +//===---------------------------------------------------------------------===// + +This could be a single 16-bit load. + +int f(char *p) { + if ((p[0] == 1) & (p[1] == 2)) return 1; + return 0; +} + +//===---------------------------------------------------------------------===// + +We should inline lrintf and probably other libc functions. + +//===---------------------------------------------------------------------===// + +Start using the flags more. For example, compile: + +int add_zf(int *x, int y, int a, int b) { + if ((*x += y) == 0) + return a; + else + return b; +} + +to: + addl %esi, (%rdi) + movl %edx, %eax + cmovne %ecx, %eax + ret +instead of: + +_add_zf: + addl (%rdi), %esi + movl %esi, (%rdi) + testl %esi, %esi + cmove %edx, %ecx + movl %ecx, %eax + ret + +and: + +int add_zf(int *x, int y, int a, int b) { + if ((*x + y) < 0) + return a; + else + return b; +} + +to: + +add_zf: + addl (%rdi), %esi + movl %edx, %eax + cmovns %ecx, %eax + ret + +instead of: + +_add_zf: + addl (%rdi), %esi + testl %esi, %esi + cmovs %edx, %ecx + movl %ecx, %eax + ret + +//===---------------------------------------------------------------------===// + +These two functions have identical effects: + +unsigned int f(unsigned int i, unsigned int n) {++i; if (i == n) ++i; return i;} +unsigned int f2(unsigned int i, unsigned int n) {++i; i += i == n; return i;} + +We currently compile them to: + +_f: + movl 4(%esp), %eax + movl %eax, %ecx + incl %ecx + movl 8(%esp), %edx + cmpl %edx, %ecx + jne LBB1_2 #UnifiedReturnBlock +LBB1_1: #cond_true + addl $2, %eax + ret +LBB1_2: #UnifiedReturnBlock + movl %ecx, %eax + ret +_f2: + movl 4(%esp), %eax + movl %eax, %ecx + incl %ecx + cmpl 8(%esp), %ecx + sete %cl + movzbl %cl, %ecx + leal 1(%ecx,%eax), %eax + ret + +both of which are inferior to GCC's: + +_f: + movl 4(%esp), %edx + leal 1(%edx), %eax + addl $2, %edx + cmpl 8(%esp), %eax + cmove %edx, %eax + ret +_f2: + movl 4(%esp), %eax + addl $1, %eax + xorl %edx, %edx + cmpl 8(%esp), %eax + sete %dl + addl %edx, %eax + ret + +//===---------------------------------------------------------------------===// + +This code: + +void test(int X) { + if (X) abort(); +} + +is currently compiled to: + +_test: + subl $12, %esp + cmpl $0, 16(%esp) + jne LBB1_1 + addl $12, %esp + ret +LBB1_1: + call L_abort$stub + +It would be better to produce: + +_test: + subl $12, %esp + cmpl $0, 16(%esp) + jne L_abort$stub + addl $12, %esp + ret + +This can be applied to any no-return function call that takes no arguments etc. +Alternatively, the stack save/restore logic could be shrink-wrapped, producing +something like this: + +_test: + cmpl $0, 4(%esp) + jne LBB1_1 + ret +LBB1_1: + subl $12, %esp + call L_abort$stub + +Both are useful in different situations. Finally, it could be shrink-wrapped +and tail called, like this: + +_test: + cmpl $0, 4(%esp) + jne LBB1_1 + ret +LBB1_1: + pop %eax # realign stack. + call L_abort$stub + +Though this probably isn't worth it. + +//===---------------------------------------------------------------------===// + +We need to teach the codegen to convert two-address INC instructions to LEA +when the flags are dead (likewise dec). For example, on X86-64, compile: + +int foo(int A, int B) { + return A+1; +} + +to: + +_foo: + leal 1(%edi), %eax + ret + +instead of: + +_foo: + incl %edi + movl %edi, %eax + ret + +Another example is: + +;; X's live range extends beyond the shift, so the register allocator +;; cannot coalesce it with Y. Because of this, a copy needs to be +;; emitted before the shift to save the register value before it is +;; clobbered. However, this copy is not needed if the register +;; allocator turns the shift into an LEA. This also occurs for ADD. + +; Check that the shift gets turned into an LEA. +; RUN: llvm-as < %s | llc -march=x86 -x86-asm-syntax=intel | \ +; RUN: not grep {mov E.X, E.X} + +@G = external global i32 ; [#uses=3] + +define i32 @test1(i32 %X, i32 %Y) { + %Z = add i32 %X, %Y ; [#uses=1] + volatile store i32 %Y, i32* @G + volatile store i32 %Z, i32* @G + ret i32 %X +} + +define i32 @test2(i32 %X) { + %Z = add i32 %X, 1 ; [#uses=1] + volatile store i32 %Z, i32* @G + ret i32 %X +} + +//===---------------------------------------------------------------------===// + +Sometimes it is better to codegen subtractions from a constant (e.g. 7-x) with +a neg instead of a sub instruction. Consider: + +int test(char X) { return 7-X; } + +we currently produce: +_test: + movl $7, %eax + movsbl 4(%esp), %ecx + subl %ecx, %eax + ret + +We would use one fewer register if codegen'd as: + + movsbl 4(%esp), %eax + neg %eax + add $7, %eax + ret + +Note that this isn't beneficial if the load can be folded into the sub. In +this case, we want a sub: + +int test(int X) { return 7-X; } +_test: + movl $7, %eax + subl 4(%esp), %eax + ret + +//===---------------------------------------------------------------------===// + +Leaf functions that require one 4-byte spill slot have a prolog like this: + +_foo: + pushl %esi + subl $4, %esp +... +and an epilog like this: + addl $4, %esp + popl %esi + ret + +It would be smaller, and potentially faster, to push eax on entry and to +pop into a dummy register instead of using addl/subl of esp. Just don't pop +into any return registers :) + +//===---------------------------------------------------------------------===// + +The X86 backend should fold (branch (or (setcc, setcc))) into multiple +branches. We generate really poor code for: + +double testf(double a) { + return a == 0.0 ? 0.0 : (a > 0.0 ? 1.0 : -1.0); +} + +For example, the entry BB is: + +_testf: + subl $20, %esp + pxor %xmm0, %xmm0 + movsd 24(%esp), %xmm1 + ucomisd %xmm0, %xmm1 + setnp %al + sete %cl + testb %cl, %al + jne LBB1_5 # UnifiedReturnBlock +LBB1_1: # cond_true + + +it would be better to replace the last four instructions with: + + jp LBB1_1 + je LBB1_5 +LBB1_1: + +We also codegen the inner ?: into a diamond: + + cvtss2sd LCPI1_0(%rip), %xmm2 + cvtss2sd LCPI1_1(%rip), %xmm3 + ucomisd %xmm1, %xmm0 + ja LBB1_3 # cond_true +LBB1_2: # cond_true + movapd %xmm3, %xmm2 +LBB1_3: # cond_true + movapd %xmm2, %xmm0 + ret + +We should sink the load into xmm3 into the LBB1_2 block. This should +be pretty easy, and will nuke all the copies. + +//===---------------------------------------------------------------------===// + +This: + #include + inline std::pair full_add(unsigned a, unsigned b) + { return std::make_pair(a + b, a + b < a); } + bool no_overflow(unsigned a, unsigned b) + { return !full_add(a, b).second; } + +Should compile to: + + + _Z11no_overflowjj: + addl %edi, %esi + setae %al + ret + +FIXME: That code looks wrong; bool return is normally defined as zext. + +on x86-64, not: + +__Z11no_overflowjj: + addl %edi, %esi + cmpl %edi, %esi + setae %al + movzbl %al, %eax + ret + + +//===---------------------------------------------------------------------===// + +Re-materialize MOV32r0 etc. with xor instead of changing them to moves if the +condition register is dead. xor reg reg is shorter than mov reg, #0. + +//===---------------------------------------------------------------------===// + +We aren't matching RMW instructions aggressively +enough. Here's a reduced testcase (more in PR1160): + +define void @test(i32* %huge_ptr, i32* %target_ptr) { + %A = load i32* %huge_ptr ; [#uses=1] + %B = load i32* %target_ptr ; [#uses=1] + %C = or i32 %A, %B ; [#uses=1] + store i32 %C, i32* %target_ptr + ret void +} + +$ llvm-as < t.ll | llc -march=x86-64 + +_test: + movl (%rdi), %eax + orl (%rsi), %eax + movl %eax, (%rsi) + ret + +That should be something like: + +_test: + movl (%rdi), %eax + orl %eax, (%rsi) + ret + +//===---------------------------------------------------------------------===// + +The following code: + +bb114.preheader: ; preds = %cond_next94 + %tmp231232 = sext i16 %tmp62 to i32 ; [#uses=1] + %tmp233 = sub i32 32, %tmp231232 ; [#uses=1] + %tmp245246 = sext i16 %tmp65 to i32 ; [#uses=1] + %tmp252253 = sext i16 %tmp68 to i32 ; [#uses=1] + %tmp254 = sub i32 32, %tmp252253 ; [#uses=1] + %tmp553554 = bitcast i16* %tmp37 to i8* ; [#uses=2] + %tmp583584 = sext i16 %tmp98 to i32 ; [#uses=1] + %tmp585 = sub i32 32, %tmp583584 ; [#uses=1] + %tmp614615 = sext i16 %tmp101 to i32 ; [#uses=1] + %tmp621622 = sext i16 %tmp104 to i32 ; [#uses=1] + %tmp623 = sub i32 32, %tmp621622 ; [#uses=1] + br label %bb114 + +produces: + +LBB3_5: # bb114.preheader + movswl -68(%ebp), %eax + movl $32, %ecx + movl %ecx, -80(%ebp) + subl %eax, -80(%ebp) + movswl -52(%ebp), %eax + movl %ecx, -84(%ebp) + subl %eax, -84(%ebp) + movswl -70(%ebp), %eax + movl %ecx, -88(%ebp) + subl %eax, -88(%ebp) + movswl -50(%ebp), %eax + subl %eax, %ecx + movl %ecx, -76(%ebp) + movswl -42(%ebp), %eax + movl %eax, -92(%ebp) + movswl -66(%ebp), %eax + movl %eax, -96(%ebp) + movw $0, -98(%ebp) + +This appears to be bad because the RA is not folding the store to the stack +slot into the movl. The above instructions could be: + movl $32, -80(%ebp) +... + movl $32, -84(%ebp) +... +This seems like a cross between remat and spill folding. + +This has redundant subtractions of %eax from a stack slot. However, %ecx doesn't +change, so we could simply subtract %eax from %ecx first and then use %ecx (or +vice-versa). + +//===---------------------------------------------------------------------===// + +This code: + + %tmp659 = icmp slt i16 %tmp654, 0 ; [#uses=1] + br i1 %tmp659, label %cond_true662, label %cond_next715 + +produces this: + + testw %cx, %cx + movswl %cx, %esi + jns LBB4_109 # cond_next715 + +Shark tells us that using %cx in the testw instruction is sub-optimal. It +suggests using the 32-bit register (which is what ICC uses). + +//===---------------------------------------------------------------------===// + +We compile this: + +void compare (long long foo) { + if (foo < 4294967297LL) + abort(); +} + +to: + +compare: + subl $4, %esp + cmpl $0, 8(%esp) + setne %al + movzbw %al, %ax + cmpl $1, 12(%esp) + setg %cl + movzbw %cl, %cx + cmove %ax, %cx + testb $1, %cl + jne .LBB1_2 # UnifiedReturnBlock +.LBB1_1: # ifthen + call abort +.LBB1_2: # UnifiedReturnBlock + addl $4, %esp + ret + +(also really horrible code on ppc). This is due to the expand code for 64-bit +compares. GCC produces multiple branches, which is much nicer: + +compare: + subl $12, %esp + movl 20(%esp), %edx + movl 16(%esp), %eax + decl %edx + jle .L7 +.L5: + addl $12, %esp + ret + .p2align 4,,7 +.L7: + jl .L4 + cmpl $0, %eax + .p2align 4,,8 + ja .L5 +.L4: + .p2align 4,,9 + call abort + +//===---------------------------------------------------------------------===// + +Tail call optimization improvements: Tail call optimization currently +pushes all arguments on the top of the stack (their normal place for +non-tail call optimized calls) that source from the callers arguments +or that source from a virtual register (also possibly sourcing from +callers arguments). +This is done to prevent overwriting of parameters (see example +below) that might be used later. + +example: + +int callee(int32, int64); +int caller(int32 arg1, int32 arg2) { + int64 local = arg2 * 2; + return callee(arg2, (int64)local); +} + +[arg1] [!arg2 no longer valid since we moved local onto it] +[arg2] -> [(int64) +[RETADDR] local ] + +Moving arg1 onto the stack slot of callee function would overwrite +arg2 of the caller. + +Possible optimizations: + + + - Analyse the actual parameters of the callee to see which would + overwrite a caller parameter which is used by the callee and only + push them onto the top of the stack. + + int callee (int32 arg1, int32 arg2); + int caller (int32 arg1, int32 arg2) { + return callee(arg1,arg2); + } + + Here we don't need to write any variables to the top of the stack + since they don't overwrite each other. + + int callee (int32 arg1, int32 arg2); + int caller (int32 arg1, int32 arg2) { + return callee(arg2,arg1); + } + + Here we need to push the arguments because they overwrite each + other. + +//===---------------------------------------------------------------------===// + +main () +{ + int i = 0; + unsigned long int z = 0; + + do { + z -= 0x00004000; + i++; + if (i > 0x00040000) + abort (); + } while (z > 0); + exit (0); +} + +gcc compiles this to: + +_main: + subl $28, %esp + xorl %eax, %eax + jmp L2 +L3: + cmpl $262144, %eax + je L10 +L2: + addl $1, %eax + cmpl $262145, %eax + jne L3 + call L_abort$stub +L10: + movl $0, (%esp) + call L_exit$stub + +llvm: + +_main: + subl $12, %esp + movl $1, %eax + movl $16384, %ecx +LBB1_1: # bb + cmpl $262145, %eax + jge LBB1_4 # cond_true +LBB1_2: # cond_next + incl %eax + addl $4294950912, %ecx + cmpl $16384, %ecx + jne LBB1_1 # bb +LBB1_3: # bb11 + xorl %eax, %eax + addl $12, %esp + ret +LBB1_4: # cond_true + call L_abort$stub + +1. LSR should rewrite the first cmp with induction variable %ecx. +2. DAG combiner should fold + leal 1(%eax), %edx + cmpl $262145, %edx + => + cmpl $262144, %eax + +//===---------------------------------------------------------------------===// + +define i64 @test(double %X) { + %Y = fptosi double %X to i64 + ret i64 %Y +} + +compiles to: + +_test: + subl $20, %esp + movsd 24(%esp), %xmm0 + movsd %xmm0, 8(%esp) + fldl 8(%esp) + fisttpll (%esp) + movl 4(%esp), %edx + movl (%esp), %eax + addl $20, %esp + #FP_REG_KILL + ret + +This should just fldl directly from the input stack slot. + +//===---------------------------------------------------------------------===// + +This code: +int foo (int x) { return (x & 65535) | 255; } + +Should compile into: + +_foo: + movzwl 4(%esp), %eax + orl $255, %eax + ret + +instead of: +_foo: + movl $255, %eax + orl 4(%esp), %eax + andl $65535, %eax + ret + +//===---------------------------------------------------------------------===// + +We're codegen'ing multiply of long longs inefficiently: + +unsigned long long LLM(unsigned long long arg1, unsigned long long arg2) { + return arg1 * arg2; +} + +We compile to (fomit-frame-pointer): + +_LLM: + pushl %esi + movl 8(%esp), %ecx + movl 16(%esp), %esi + movl %esi, %eax + mull %ecx + imull 12(%esp), %esi + addl %edx, %esi + imull 20(%esp), %ecx + movl %esi, %edx + addl %ecx, %edx + popl %esi + ret + +This looks like a scheduling deficiency and lack of remat of the load from +the argument area. ICC apparently produces: + + movl 8(%esp), %ecx + imull 12(%esp), %ecx + movl 16(%esp), %eax + imull 4(%esp), %eax + addl %eax, %ecx + movl 4(%esp), %eax + mull 12(%esp) + addl %ecx, %edx + ret + +Note that it remat'd loads from 4(esp) and 12(esp). See this GCC PR: +http://gcc.gnu.org/bugzilla/show_bug.cgi?id=17236 + +//===---------------------------------------------------------------------===// + +We can fold a store into "zeroing a reg". Instead of: + +xorl %eax, %eax +movl %eax, 124(%esp) + +we should get: + +movl $0, 124(%esp) + +if the flags of the xor are dead. + +Likewise, we isel "x<<1" into "add reg,reg". If reg is spilled, this should +be folded into: shl [mem], 1 + +//===---------------------------------------------------------------------===// + +This testcase misses a read/modify/write opportunity (from PR1425): + +void vertical_decompose97iH1(int *b0, int *b1, int *b2, int width){ + int i; + for(i=0; i>0; +} + +We compile it down to: + +LBB1_2: # bb + movl (%esi,%edi,4), %ebx + addl (%ecx,%edi,4), %ebx + addl (%edx,%edi,4), %ebx + movl %ebx, (%ecx,%edi,4) + incl %edi + cmpl %eax, %edi + jne LBB1_2 # bb + +the inner loop should add to the memory location (%ecx,%edi,4), saving +a mov. Something like: + + movl (%esi,%edi,4), %ebx + addl (%edx,%edi,4), %ebx + addl %ebx, (%ecx,%edi,4) + +Here is another interesting example: + +void vertical_compose97iH1(int *b0, int *b1, int *b2, int width){ + int i; + for(i=0; i>0; +} + +We miss the r/m/w opportunity here by using 2 subs instead of an add+sub[mem]: + +LBB9_2: # bb + movl (%ecx,%edi,4), %ebx + subl (%esi,%edi,4), %ebx + subl (%edx,%edi,4), %ebx + movl %ebx, (%ecx,%edi,4) + incl %edi + cmpl %eax, %edi + jne LBB9_2 # bb + +Additionally, LSR should rewrite the exit condition of these loops to use +a stride-4 IV, would would allow all the scales in the loop to go away. +This would result in smaller code and more efficient microops. + +//===---------------------------------------------------------------------===// + +In SSE mode, we turn abs and neg into a load from the constant pool plus a xor +or and instruction, for example: + + xorpd LCPI1_0, %xmm2 + +However, if xmm2 gets spilled, we end up with really ugly code like this: + + movsd (%esp), %xmm0 + xorpd LCPI1_0, %xmm0 + movsd %xmm0, (%esp) + +Since we 'know' that this is a 'neg', we can actually "fold" the spill into +the neg/abs instruction, turning it into an *integer* operation, like this: + + xorl 2147483648, [mem+4] ## 2147483648 = (1 << 31) + +you could also use xorb, but xorl is less likely to lead to a partial register +stall. Here is a contrived testcase: + +double a, b, c; +void test(double *P) { + double X = *P; + a = X; + bar(); + X = -X; + b = X; + bar(); + c = X; +} + +//===---------------------------------------------------------------------===// + +handling llvm.memory.barrier on pre SSE2 cpus + +should generate: +lock ; mov %esp, %esp + +//===---------------------------------------------------------------------===// + +The generated code on x86 for checking for signed overflow on a multiply the +obvious way is much longer than it needs to be. + +int x(int a, int b) { + long long prod = (long long)a*b; + return prod > 0x7FFFFFFF || prod < (-0x7FFFFFFF-1); +} + +See PR2053 for more details. + +//===---------------------------------------------------------------------===// + +We should investigate using cdq/ctld (effect: edx = sar eax, 31) +more aggressively; it should cost the same as a move+shift on any modern +processor, but it's a lot shorter. Downside is that it puts more +pressure on register allocation because it has fixed operands. + +Example: +int abs(int x) {return x < 0 ? -x : x;} + +gcc compiles this to the following when using march/mtune=pentium2/3/4/m/etc.: +abs: + movl 4(%esp), %eax + cltd + xorl %edx, %eax + subl %edx, %eax + ret + +//===---------------------------------------------------------------------===// + +Consider: +int test(unsigned long a, unsigned long b) { return -(a < b); } + +We currently compile this to: + +define i32 @test(i32 %a, i32 %b) nounwind { + %tmp3 = icmp ult i32 %a, %b ; [#uses=1] + %tmp34 = zext i1 %tmp3 to i32 ; [#uses=1] + %tmp5 = sub i32 0, %tmp34 ; [#uses=1] + ret i32 %tmp5 +} + +and + +_test: + movl 8(%esp), %eax + cmpl %eax, 4(%esp) + setb %al + movzbl %al, %eax + negl %eax + ret + +Several deficiencies here. First, we should instcombine zext+neg into sext: + +define i32 @test2(i32 %a, i32 %b) nounwind { + %tmp3 = icmp ult i32 %a, %b ; [#uses=1] + %tmp34 = sext i1 %tmp3 to i32 ; [#uses=1] + ret i32 %tmp34 +} + +However, before we can do that, we have to fix the bad codegen that we get for +sext from bool: + +_test2: + movl 8(%esp), %eax + cmpl %eax, 4(%esp) + setb %al + movzbl %al, %eax + shll $31, %eax + sarl $31, %eax + ret + +This code should be at least as good as the code above. Once this is fixed, we +can optimize this specific case even more to: + + movl 8(%esp), %eax + xorl %ecx, %ecx + cmpl %eax, 4(%esp) + sbbl %ecx, %ecx + +//===---------------------------------------------------------------------===// + +Take the following code (from +http://gcc.gnu.org/bugzilla/show_bug.cgi?id=16541): + +extern unsigned char first_one[65536]; +int FirstOnet(unsigned long long arg1) +{ + if (arg1 >> 48) + return (first_one[arg1 >> 48]); + return 0; +} + + +The following code is currently generated: +FirstOnet: + movl 8(%esp), %eax + cmpl $65536, %eax + movl 4(%esp), %ecx + jb .LBB1_2 # UnifiedReturnBlock +.LBB1_1: # ifthen + shrl $16, %eax + movzbl first_one(%eax), %eax + ret +.LBB1_2: # UnifiedReturnBlock + xorl %eax, %eax + ret + +There are a few possible improvements here: +1. We should be able to eliminate the dead load into %ecx +2. We could change the "movl 8(%esp), %eax" into + "movzwl 10(%esp), %eax"; this lets us change the cmpl + into a testl, which is shorter, and eliminate the shift. + +We could also in theory eliminate the branch by using a conditional +for the address of the load, but that seems unlikely to be worthwhile +in general. + +//===---------------------------------------------------------------------===// + +We compile this function: + +define i32 @foo(i32 %a, i32 %b, i32 %c, i8 zeroext %d) nounwind { +entry: + %tmp2 = icmp eq i8 %d, 0 ; [#uses=1] + br i1 %tmp2, label %bb7, label %bb + +bb: ; preds = %entry + %tmp6 = add i32 %b, %a ; [#uses=1] + ret i32 %tmp6 + +bb7: ; preds = %entry + %tmp10 = sub i32 %a, %c ; [#uses=1] + ret i32 %tmp10 +} + +to: + +_foo: + cmpb $0, 16(%esp) + movl 12(%esp), %ecx + movl 8(%esp), %eax + movl 4(%esp), %edx + je LBB1_2 # bb7 +LBB1_1: # bb + addl %edx, %eax + ret +LBB1_2: # bb7 + movl %edx, %eax + subl %ecx, %eax + ret + +The coalescer could coalesce "edx" with "eax" to avoid the movl in LBB1_2 +if it commuted the addl in LBB1_1. + +//===---------------------------------------------------------------------===// + +See rdar://4653682. + +From flops: + +LBB1_15: # bb310 + cvtss2sd LCPI1_0, %xmm1 + addsd %xmm1, %xmm0 + movsd 176(%esp), %xmm2 + mulsd %xmm0, %xmm2 + movapd %xmm2, %xmm3 + mulsd %xmm3, %xmm3 + movapd %xmm3, %xmm4 + mulsd LCPI1_23, %xmm4 + addsd LCPI1_24, %xmm4 + mulsd %xmm3, %xmm4 + addsd LCPI1_25, %xmm4 + mulsd %xmm3, %xmm4 + addsd LCPI1_26, %xmm4 + mulsd %xmm3, %xmm4 + addsd LCPI1_27, %xmm4 + mulsd %xmm3, %xmm4 + addsd LCPI1_28, %xmm4 + mulsd %xmm3, %xmm4 + addsd %xmm1, %xmm4 + mulsd %xmm2, %xmm4 + movsd 152(%esp), %xmm1 + addsd %xmm4, %xmm1 + movsd %xmm1, 152(%esp) + incl %eax + cmpl %eax, %esi + jge LBB1_15 # bb310 +LBB1_16: # bb358.loopexit + movsd 152(%esp), %xmm0 + addsd %xmm0, %xmm0 + addsd LCPI1_22, %xmm0 + movsd %xmm0, 152(%esp) + +Rather than spilling the result of the last addsd in the loop, we should have +insert a copy to split the interval (one for the duration of the loop, one +extending to the fall through). The register pressure in the loop isn't high +enough to warrant the spill. + +Also check why xmm7 is not used at all in the function. + +//===---------------------------------------------------------------------===// + +Legalize loses track of the fact that bools are always zero extended when in +memory. This causes us to compile abort_gzip (from 164.gzip) from: + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" +target triple = "i386-apple-darwin8" +@in_exit.4870.b = internal global i1 false ; [#uses=2] +define fastcc void @abort_gzip() noreturn nounwind { +entry: + %tmp.b.i = load i1* @in_exit.4870.b ; [#uses=1] + br i1 %tmp.b.i, label %bb.i, label %bb4.i +bb.i: ; preds = %entry + tail call void @exit( i32 1 ) noreturn nounwind + unreachable +bb4.i: ; preds = %entry + store i1 true, i1* @in_exit.4870.b + tail call void @exit( i32 1 ) noreturn nounwind + unreachable +} +declare void @exit(i32) noreturn nounwind + +into: + +_abort_gzip: + subl $12, %esp + movb _in_exit.4870.b, %al + notb %al + testb $1, %al + jne LBB1_2 ## bb4.i +LBB1_1: ## bb.i + ... + +//===---------------------------------------------------------------------===// + +We compile: + +int test(int x, int y) { + return x-y-1; +} + +into (-m64): + +_test: + decl %edi + movl %edi, %eax + subl %esi, %eax + ret + +it would be better to codegen as: x+~y (notl+addl)