// Random ideas for the X86 backend.
//===---------------------------------------------------------------------===//
-Add a MUL2U and MUL2S nodes to represent a multiply that returns both the
-Hi and Lo parts (combination of MUL and MULH[SU] into one node). Add this to
-X86, & make the dag combiner produce it when needed. This will eliminate one
-imul from the code generated for:
+Missing features:
+ - Support for SSE4: http://www.intel.com/software/penryn
+http://softwarecommunity.intel.com/isn/Downloads/Intel%20SSE4%20Programming%20Reference.pdf
+ - support for 3DNow!
+ - weird abis?
-long long test(long long X, long long Y) { return X*Y; }
-
-by using the EAX result from the mul. We should add a similar node for
-DIVREM.
-
-another case is:
-
-long long test(int X, int Y) { return (long long)X*Y; }
+//===---------------------------------------------------------------------===//
-... which should only be one imul instruction.
+CodeGen/X86/lea-3.ll:test3 should be a single LEA, not a shift/move. The X86
+backend knows how to three-addressify this shift, but it appears the register
+allocator isn't even asking it to do so in this case. We should investigate
+why this isn't happening; it could have a significant impact on other important
+cases for X86 as well.
//===---------------------------------------------------------------------===//
Another useful one would be ~0ULL >> X and ~0ULL << X.
+One better solution for 1LL << x is:
+ xorl %eax, %eax
+ xorl %edx, %edx
+ testb $32, %cl
+ sete %al
+ setne %dl
+ sall %cl, %eax
+ sall %cl, %edx
+
+But that requires good 8-bit subreg support.
+
+64-bit shifts (in general) expand to really bad code. Instead of using
+cmovs, we should expand to a conditional branch like GCC produces.
+
//===---------------------------------------------------------------------===//
Compile this:
Leave any_extend as pseudo instruction and hint to register
allocator. Delay codegen until post register allocation.
+Note. any_extend is now turned into an INSERT_SUBREG. We still need to teach
+the coalescer how to deal with it though.
//===---------------------------------------------------------------------===//
however, check that these are defined for 0 and 32. Our intrinsics are, GCC's
aren't.
+Another example (use predsimplify to eliminate a select):
+
+int foo (unsigned long j) {
+ if (j)
+ return __builtin_ffs (j) - 1;
+ else
+ return 0;
+}
+
//===---------------------------------------------------------------------===//
-Use push/pop instructions in prolog/epilog sequences instead of stores off
-ESP (certain code size win, perf win on some [which?] processors).
-Also, it appears icc use push for parameter passing. Need to investigate.
+It appears icc uses push for parameter passing. Need to investigate.
//===---------------------------------------------------------------------===//
//===---------------------------------------------------------------------===//
-Should generate min/max for stuff like:
-
-void minf(float a, float b, float *X) {
- *X = a <= b ? a : b;
-}
-
-Make use of floating point min / max instructions. Perhaps introduce ISD::FMIN
-and ISD::FMAX node types?
-
-//===---------------------------------------------------------------------===//
-
The first BB of this code:
declare bool %foo()
//===---------------------------------------------------------------------===//
-Enable X86InstrInfo::convertToThreeAddress().
-
-//===---------------------------------------------------------------------===//
-
We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and rep/movsl
We should leave these as libcalls for everything over a much lower threshold,
since libc is hand tuned for medium and large mem ops (avoiding RFO for large
//===---------------------------------------------------------------------===//
+We are generating far worse code than gcc:
+
+volatile short X, Y;
+
+void foo(int N) {
+ int i;
+ for (i = 0; i < N; i++) { X = i; Y = i*4; }
+}
+
+LBB1_1: # entry.bb_crit_edge
+ xorl %ecx, %ecx
+ xorw %dx, %dx
+LBB1_2: # bb
+ movl L_X$non_lazy_ptr, %esi
+ movw %cx, (%esi)
+ movl L_Y$non_lazy_ptr, %esi
+ movw %dx, (%esi)
+ addw $4, %dx
+ incl %ecx
+ cmpl %eax, %ecx
+ jne LBB1_2 # bb
+
+vs.
+
+ xorl %edx, %edx
+ movl L_X$non_lazy_ptr-"L00000000001$pb"(%ebx), %esi
+ movl L_Y$non_lazy_ptr-"L00000000001$pb"(%ebx), %ecx
+L4:
+ movw %dx, (%esi)
+ leal 0(,%edx,4), %eax
+ movw %ax, (%ecx)
+ addl $1, %edx
+ cmpl %edx, %edi
+ jne L4
+
+This is due to the lack of post regalloc LICM.
+
+//===---------------------------------------------------------------------===//
+
Teach the coalescer to coalesce vregs of different register classes. e.g. FR32 /
FR64 to VR128.
//===---------------------------------------------------------------------===//
-Bad codegen:
-
-char foo(int x) { return x; }
-
-_foo:
- movl 4(%esp), %eax
- shll $24, %eax
- sarl $24, %eax
- ret
-
-SIGN_EXTEND_INREG can be implemented as (sext (trunc)) to take advantage of
-sub-registers.
-
-//===---------------------------------------------------------------------===//
-
Consider this:
typedef struct pair { float A, B; } pair;
We currently generate this code with llvmgcc4:
_pairtest:
- subl $12, %esp
- movl 20(%esp), %eax
- movl %eax, 4(%esp)
- movl 16(%esp), %eax
- movl %eax, (%esp)
- movss (%esp), %xmm0
- addss 4(%esp), %xmm0
- movl 24(%esp), %eax
- movss %xmm0, (%eax)
- addl $12, %esp
+ movl 8(%esp), %eax
+ movl 4(%esp), %ecx
+ movd %eax, %xmm0
+ movd %ecx, %xmm1
+ addss %xmm0, %xmm1
+ movl 12(%esp), %eax
+ movss %xmm1, (%eax)
ret
we should be able to generate:
a single 32-bit integer stack slot. We should handle the safe cases above much
nicer, while still handling the hard cases.
+While true in general, in this specific case we could do better by promoting
+load int + bitcast to float -> load float. This basically needs alignment info;
+the code is already implemented (but disabled) in dag combine.
+
//===---------------------------------------------------------------------===//
Another instruction selector deficiency:
//===---------------------------------------------------------------------===//
-Don't forget to find a way to squash noop truncates in the JIT environment.
-
-//===---------------------------------------------------------------------===//
-
-Implement anyext in the same manner as truncate that would allow them to be
-eliminated.
-
-//===---------------------------------------------------------------------===//
-
-How about implementing truncate / anyext as a property of machine instruction
-operand? i.e. Print as 32-bit super-class register / 16-bit sub-class register.
-Do this for the cases where a truncate / anyext is guaranteed to be eliminated.
-For IA32 that is truncate from 32 to 16 and anyext from 16 to 32.
-
-//===---------------------------------------------------------------------===//
-
For this:
int test(int a)
//===---------------------------------------------------------------------===//
-Implement CTTZ, CTLZ with bsf and bsr.
+Implement CTTZ, CTLZ with bsf and bsr. GCC produces:
+
+int ctz_(unsigned X) { return __builtin_ctz(X); }
+int clz_(unsigned X) { return __builtin_clz(X); }
+int ffs_(unsigned X) { return __builtin_ffs(X); }
+
+_ctz_:
+ bsfl 4(%esp), %eax
+ ret
+_clz_:
+ bsrl 4(%esp), %eax
+ xorl $31, %eax
+ ret
+_ffs_:
+ movl $-1, %edx
+ bsfl 4(%esp), %eax
+ cmove %edx, %eax
+ addl $1, %eax
+ ret
//===---------------------------------------------------------------------===//
//===---------------------------------------------------------------------===//
-We should handle __attribute__ ((__visibility__ ("hidden"))).
-
-//===---------------------------------------------------------------------===//
-
int %foo(int* %a, int %t) {
entry:
br label %cond_true
cond_true: ; preds = %cond_true, %entry
- %x.0.0 = phi int [ 0, %entry ], [ %tmp9, %cond_true ] ; <int> [#uses=3]
- %t_addr.0.0 = phi int [ %t, %entry ], [ %tmp7, %cond_true ] ; <int> [#uses=1]
- %tmp2 = getelementptr int* %a, int %x.0.0 ; <int*> [#uses=1]
+ %x.0.0 = phi int [ 0, %entry ], [ %tmp9, %cond_true ]
+ %t_addr.0.0 = phi int [ %t, %entry ], [ %tmp7, %cond_true ]
+ %tmp2 = getelementptr int* %a, int %x.0.0
%tmp3 = load int* %tmp2 ; <int> [#uses=1]
%tmp5 = add int %t_addr.0.0, %x.0.0 ; <int> [#uses=1]
%tmp7 = add int %tmp5, %tmp3 ; <int> [#uses=2]
//===---------------------------------------------------------------------===//
-Use cpuid to auto-detect CPU features such as SSE, SSE2, and SSE3.
-
-//===---------------------------------------------------------------------===//
-
u32 to float conversion improvement:
float uint32_2_float( unsigned u ) {
//===---------------------------------------------------------------------===//
-Compile:
-int %test(ulong *%tmp) {
- %tmp = load ulong* %tmp ; <ulong> [#uses=1]
- %tmp.mask = shr ulong %tmp, ubyte 50 ; <ulong> [#uses=1]
- %tmp.mask = cast ulong %tmp.mask to ubyte ; <ubyte> [#uses=1]
- %tmp2 = and ubyte %tmp.mask, 3 ; <ubyte> [#uses=1]
- %tmp2 = cast ubyte %tmp2 to int ; <int> [#uses=1]
- ret int %tmp2
+GCC's ix86_expand_int_movcc function (in i386.c) has a ton of interesting
+simplifications for integer "x cmp y ? a : b". For example, instead of:
+
+int G;
+void f(int X, int Y) {
+ G = X < 0 ? 14 : 13;
}
+compiling to:
+
+_f:
+ movl $14, %eax
+ movl $13, %ecx
+ movl 4(%esp), %edx
+ testl %edx, %edx
+ cmovl %eax, %ecx
+ movl %ecx, _G
+ ret
+
+it could be:
+_f:
+ movl 4(%esp), %eax
+ sarl $31, %eax
+ notl %eax
+ addl $14, %eax
+ movl %eax, _G
+ ret
+
+etc.
+
+Another is:
+int usesbb(unsigned int a, unsigned int b) {
+ return (a < b ? -1 : 0);
+}
to:
+_usesbb:
+ movl 8(%esp), %eax
+ cmpl %eax, 4(%esp)
+ sbbl %eax, %eax
+ ret
-_test:
+instead of:
+_usesbb:
+ xorl %eax, %eax
+ movl 8(%esp), %ecx
+ cmpl %ecx, 4(%esp)
+ movl $4294967295, %ecx
+ cmovb %ecx, %eax
+ ret
+
+//===---------------------------------------------------------------------===//
+
+Currently we don't have elimination of redundant stack manipulations. Consider
+the code:
+
+int %main() {
+entry:
+ call fastcc void %test1( )
+ call fastcc void %test2( sbyte* cast (void ()* %test1 to sbyte*) )
+ ret int 0
+}
+
+declare fastcc void %test1()
+
+declare fastcc void %test2(sbyte*)
+
+
+This currently compiles to:
+
+ subl $16, %esp
+ call _test5
+ addl $12, %esp
+ subl $16, %esp
+ movl $_test5, (%esp)
+ call _test6
+ addl $12, %esp
+
+The add/sub pair is really unneeded here.
+
+//===---------------------------------------------------------------------===//
+
+Consider the expansion of:
+
+uint %test3(uint %X) {
+ %tmp1 = rem uint %X, 255
+ ret uint %tmp1
+}
+
+Currently it compiles to:
+
+...
+ movl $2155905153, %ecx
+ movl 8(%esp), %esi
+ movl %esi, %eax
+ mull %ecx
+...
+
+This could be "reassociated" into:
+
+ movl $2155905153, %eax
+ movl 8(%esp), %ecx
+ mull %ecx
+
+to avoid the copy. In fact, the existing two-address stuff would do this
+except that mul isn't a commutative 2-addr instruction. I guess this has
+to be done at isel time based on the #uses to mul?
+
+//===---------------------------------------------------------------------===//
+
+Make sure the instruction which starts a loop does not cross a cacheline
+boundary. This requires knowing the exact length of each machine instruction.
+That is somewhat complicated, but doable. Example 256.bzip2:
+
+In the new trace, the hot loop has an instruction which crosses a cacheline
+boundary. In addition to potential cache misses, this can't help decoding as I
+imagine there has to be some kind of complicated decoder reset and realignment
+to grab the bytes from the next cacheline.
+
+532 532 0x3cfc movb (1809(%esp, %esi), %bl <<<--- spans 2 64 byte lines
+942 942 0x3d03 movl %dh, (1809(%esp, %esi)
+937 937 0x3d0a incl %esi
+3 3 0x3d0b cmpb %bl, %dl
+27 27 0x3d0d jnz 0x000062db <main+11707>
+
+//===---------------------------------------------------------------------===//
+
+In c99 mode, the preprocessor doesn't like assembly comments like #TRUNCATE.
+
+//===---------------------------------------------------------------------===//
+
+This could be a single 16-bit load.
+
+int f(char *p) {
+ if ((p[0] == 1) & (p[1] == 2)) return 1;
+ return 0;
+}
+
+//===---------------------------------------------------------------------===//
+
+We should inline lrintf and probably other libc functions.
+
+//===---------------------------------------------------------------------===//
+
+Start using the flags more. For example, compile:
+
+int add_zf(int *x, int y, int a, int b) {
+ if ((*x += y) == 0)
+ return a;
+ else
+ return b;
+}
+
+to:
+ addl %esi, (%rdi)
+ movl %edx, %eax
+ cmovne %ecx, %eax
+ ret
+instead of:
+
+_add_zf:
+ addl (%rdi), %esi
+ movl %esi, (%rdi)
+ testl %esi, %esi
+ cmove %edx, %ecx
+ movl %ecx, %eax
+ ret
+
+and:
+
+int add_zf(int *x, int y, int a, int b) {
+ if ((*x + y) < 0)
+ return a;
+ else
+ return b;
+}
+
+to:
+
+add_zf:
+ addl (%rdi), %esi
+ movl %edx, %eax
+ cmovns %ecx, %eax
+ ret
+
+instead of:
+
+_add_zf:
+ addl (%rdi), %esi
+ testl %esi, %esi
+ cmovs %edx, %ecx
+ movl %ecx, %eax
+ ret
+
+//===---------------------------------------------------------------------===//
+
+These two functions have identical effects:
+
+unsigned int f(unsigned int i, unsigned int n) {++i; if (i == n) ++i; return i;}
+unsigned int f2(unsigned int i, unsigned int n) {++i; i += i == n; return i;}
+
+We currently compile them to:
+
+_f:
+ movl 4(%esp), %eax
+ movl %eax, %ecx
+ incl %ecx
+ movl 8(%esp), %edx
+ cmpl %edx, %ecx
+ jne LBB1_2 #UnifiedReturnBlock
+LBB1_1: #cond_true
+ addl $2, %eax
+ ret
+LBB1_2: #UnifiedReturnBlock
+ movl %ecx, %eax
+ ret
+_f2:
movl 4(%esp), %eax
- movl 4(%eax), %eax
- shrl $18, %eax
- andl $3, %eax
+ movl %eax, %ecx
+ incl %ecx
+ cmpl 8(%esp), %ecx
+ sete %cl
+ movzbl %cl, %ecx
+ leal 1(%ecx,%eax), %eax
+ ret
+
+both of which are inferior to GCC's:
+
+_f:
+ movl 4(%esp), %edx
+ leal 1(%edx), %eax
+ addl $2, %edx
+ cmpl 8(%esp), %eax
+ cmove %edx, %eax
+ ret
+_f2:
+ movl 4(%esp), %eax
+ addl $1, %eax
+ xorl %edx, %edx
+ cmpl 8(%esp), %eax
+ sete %dl
+ addl %edx, %eax
+ ret
+
+//===---------------------------------------------------------------------===//
+
+This code:
+
+void test(int X) {
+ if (X) abort();
+}
+
+is currently compiled to:
+
+_test:
+ subl $12, %esp
+ cmpl $0, 16(%esp)
+ jne LBB1_1
+ addl $12, %esp
+ ret
+LBB1_1:
+ call L_abort$stub
+
+It would be better to produce:
+
+_test:
+ subl $12, %esp
+ cmpl $0, 16(%esp)
+ jne L_abort$stub
+ addl $12, %esp
+ ret
+
+This can be applied to any no-return function call that takes no arguments etc.
+Alternatively, the stack save/restore logic could be shrink-wrapped, producing
+something like this:
+
+_test:
+ cmpl $0, 4(%esp)
+ jne LBB1_1
+ ret
+LBB1_1:
+ subl $12, %esp
+ call L_abort$stub
+
+Both are useful in different situations. Finally, it could be shrink-wrapped
+and tail called, like this:
+
+_test:
+ cmpl $0, 4(%esp)
+ jne LBB1_1
+ ret
+LBB1_1:
+ pop %eax # realign stack.
+ call L_abort$stub
+
+Though this probably isn't worth it.
+
+//===---------------------------------------------------------------------===//
+
+We need to teach the codegen to convert two-address INC instructions to LEA
+when the flags are dead (likewise dec). For example, on X86-64, compile:
+
+int foo(int A, int B) {
+ return A+1;
+}
+
+to:
+
+_foo:
+ leal 1(%edi), %eax
ret
instead of:
+_foo:
+ incl %edi
+ movl %edi, %eax
+ ret
+
+Another example is:
+
+;; X's live range extends beyond the shift, so the register allocator
+;; cannot coalesce it with Y. Because of this, a copy needs to be
+;; emitted before the shift to save the register value before it is
+;; clobbered. However, this copy is not needed if the register
+;; allocator turns the shift into an LEA. This also occurs for ADD.
+
+; Check that the shift gets turned into an LEA.
+; RUN: llvm-upgrade < %s | llvm-as | llc -march=x86 -x86-asm-syntax=intel | \
+; RUN: not grep {mov E.X, E.X}
+
+%G = external global int
+
+int %test1(int %X, int %Y) {
+ %Z = add int %X, %Y
+ volatile store int %Y, int* %G
+ volatile store int %Z, int* %G
+ ret int %X
+}
+
+int %test2(int %X) {
+ %Z = add int %X, 1 ;; inc
+ volatile store int %Z, int* %G
+ ret int %X
+}
+
+//===---------------------------------------------------------------------===//
+
+Sometimes it is better to codegen subtractions from a constant (e.g. 7-x) with
+a neg instead of a sub instruction. Consider:
+
+int test(char X) { return 7-X; }
+
+we currently produce:
_test:
- movl 4(%esp), %eax
- movl 4(%eax), %eax
- shrl $18, %eax
- # TRUNCATE movb %al, %al
- andb $3, %al
- movzbl %al, %eax
+ movl $7, %eax
+ movsbl 4(%esp), %ecx
+ subl %ecx, %eax
ret
-This saves a movzbl, and saves a truncate if it doesn't get coallesced right.
-This is a simple DAGCombine to propagate the zext through the and.
+We would use one fewer register if codegen'd as:
+
+ movsbl 4(%esp), %eax
+ neg %eax
+ add $7, %eax
+ ret
+
+Note that this isn't beneficial if the load can be folded into the sub. In
+this case, we want a sub:
+
+int test(int X) { return 7-X; }
+_test:
+ movl $7, %eax
+ subl 4(%esp), %eax
+ ret
//===---------------------------------------------------------------------===//
-Instead of:
+This is a "commutable two-address" register coalescing deficiency:
+
+define <4 x float> @test1(<4 x float> %V) {
+entry:
+ %tmp8 = shufflevector <4 x float> %V, <4 x float> undef,
+ <4 x i32> < i32 3, i32 2, i32 1, i32 0 >
+ %add = add <4 x float> %tmp8, %V
+ ret <4 x float> %add
+}
+
+this codegens to:
+
+_test1:
+ pshufd $27, %xmm0, %xmm1
+ addps %xmm0, %xmm1
+ movaps %xmm1, %xmm0
+ ret
+
+instead of:
+
+_test1:
+ pshufd $27, %xmm0, %xmm1
+ addps %xmm1, %xmm0
+ ret
+
+//===---------------------------------------------------------------------===//
+
+Leaf functions that require one 4-byte spill slot have a prolog like this:
+
+_foo:
+ pushl %esi
+ subl $4, %esp
+...
+and an epilog like this:
+ addl $4, %esp
+ popl %esi
+ ret
+
+It would be smaller, and potentially faster, to push eax on entry and to
+pop into a dummy register instead of using addl/subl of esp. Just don't pop
+into any return registers :)
+
+//===---------------------------------------------------------------------===//
+
+The X86 backend should fold (branch (or (setcc, setcc))) into multiple
+branches. We generate really poor code for:
+
+double testf(double a) {
+ return a == 0.0 ? 0.0 : (a > 0.0 ? 1.0 : -1.0);
+}
+
+For example, the entry BB is:
+
+_testf:
+ subl $20, %esp
+ pxor %xmm0, %xmm0
+ movsd 24(%esp), %xmm1
+ ucomisd %xmm0, %xmm1
+ setnp %al
+ sete %cl
+ testb %cl, %al
+ jne LBB1_5 # UnifiedReturnBlock
+LBB1_1: # cond_true
+
+
+it would be better to replace the last four instructions with:
+
+ jp LBB1_1
+ je LBB1_5
+LBB1_1:
+
+We also codegen the inner ?: into a diamond:
+
+ cvtss2sd LCPI1_0(%rip), %xmm2
+ cvtss2sd LCPI1_1(%rip), %xmm3
+ ucomisd %xmm1, %xmm0
+ ja LBB1_3 # cond_true
+LBB1_2: # cond_true
+ movapd %xmm3, %xmm2
+LBB1_3: # cond_true
+ movapd %xmm2, %xmm0
+ ret
+
+We should sink the load into xmm3 into the LBB1_2 block. This should
+be pretty easy, and will nuke all the copies.
+
+//===---------------------------------------------------------------------===//
+
+This:
+ #include <algorithm>
+ inline std::pair<unsigned, bool> full_add(unsigned a, unsigned b)
+ { return std::make_pair(a + b, a + b < a); }
+ bool no_overflow(unsigned a, unsigned b)
+ { return !full_add(a, b).second; }
+
+Should compile to:
+
- cmpl $4294967295, %edx
- jg LBB1_8 #cond_false49
+ _Z11no_overflowjj:
+ addl %edi, %esi
+ setae %al
+ ret
-emit:
+on x86-64, not:
+
+__Z11no_overflowjj:
+ addl %edi, %esi
+ cmpl %edi, %esi
+ setae %al
+ movzbl %al, %eax
+ ret
+
+
+//===---------------------------------------------------------------------===//
+
+Re-materialize MOV32r0 etc. with xor instead of changing them to moves if the
+condition register is dead. xor reg reg is shorter than mov reg, #0.
+
+//===---------------------------------------------------------------------===//
+
+We aren't matching RMW instructions aggressively
+enough. Here's a reduced testcase (more in PR1160):
+
+define void @test(i32* %huge_ptr, i32* %target_ptr) {
+ %A = load i32* %huge_ptr ; <i32> [#uses=1]
+ %B = load i32* %target_ptr ; <i32> [#uses=1]
+ %C = or i32 %A, %B ; <i32> [#uses=1]
+ store i32 %C, i32* %target_ptr
+ ret void
+}
+
+$ llvm-as < t.ll | llc -march=x86-64
+
+_test:
+ movl (%rdi), %eax
+ orl (%rsi), %eax
+ movl %eax, (%rsi)
+ ret
+
+That should be something like:
+
+_test:
+ movl (%rdi), %eax
+ orl %eax, (%rsi)
+ ret
+
+//===---------------------------------------------------------------------===//
+
+The following code:
+
+bb114.preheader: ; preds = %cond_next94
+ %tmp231232 = sext i16 %tmp62 to i32 ; <i32> [#uses=1]
+ %tmp233 = sub i32 32, %tmp231232 ; <i32> [#uses=1]
+ %tmp245246 = sext i16 %tmp65 to i32 ; <i32> [#uses=1]
+ %tmp252253 = sext i16 %tmp68 to i32 ; <i32> [#uses=1]
+ %tmp254 = sub i32 32, %tmp252253 ; <i32> [#uses=1]
+ %tmp553554 = bitcast i16* %tmp37 to i8* ; <i8*> [#uses=2]
+ %tmp583584 = sext i16 %tmp98 to i32 ; <i32> [#uses=1]
+ %tmp585 = sub i32 32, %tmp583584 ; <i32> [#uses=1]
+ %tmp614615 = sext i16 %tmp101 to i32 ; <i32> [#uses=1]
+ %tmp621622 = sext i16 %tmp104 to i32 ; <i32> [#uses=1]
+ %tmp623 = sub i32 32, %tmp621622 ; <i32> [#uses=1]
+ br label %bb114
+
+produces:
+
+LBB3_5: # bb114.preheader
+ movswl -68(%ebp), %eax
+ movl $32, %ecx
+ movl %ecx, -80(%ebp)
+ subl %eax, -80(%ebp)
+ movswl -52(%ebp), %eax
+ movl %ecx, -84(%ebp)
+ subl %eax, -84(%ebp)
+ movswl -70(%ebp), %eax
+ movl %ecx, -88(%ebp)
+ subl %eax, -88(%ebp)
+ movswl -50(%ebp), %eax
+ subl %eax, %ecx
+ movl %ecx, -76(%ebp)
+ movswl -42(%ebp), %eax
+ movl %eax, -92(%ebp)
+ movswl -66(%ebp), %eax
+ movl %eax, -96(%ebp)
+ movw $0, -98(%ebp)
+
+This appears to be bad because the RA is not folding the store to the stack
+slot into the movl. The above instructions could be:
+ movl $32, -80(%ebp)
+...
+ movl $32, -84(%ebp)
+...
+This seems like a cross between remat and spill folding.
+
+This has redundant subtractions of %eax from a stack slot. However, %ecx doesn't
+change, so we could simply subtract %eax from %ecx first and then use %ecx (or
+vice-versa).
+
+//===---------------------------------------------------------------------===//
+
+For this code:
+
+cond_next603: ; preds = %bb493, %cond_true336, %cond_next599
+ %v.21050.1 = phi i32 [ %v.21050.0, %cond_next599 ], [ %tmp344, %cond_true336 ], [ %v.2, %bb493 ] ; <i32> [#uses=1]
+ %maxz.21051.1 = phi i32 [ %maxz.21051.0, %cond_next599 ], [ 0, %cond_true336 ], [ %maxz.2, %bb493 ] ; <i32> [#uses=2]
+ %cnt.01055.1 = phi i32 [ %cnt.01055.0, %cond_next599 ], [ 0, %cond_true336 ], [ %cnt.0, %bb493 ] ; <i32> [#uses=2]
+ %byteptr.9 = phi i8* [ %byteptr.12, %cond_next599 ], [ %byteptr.0, %cond_true336 ], [ %byteptr.10, %bb493 ] ; <i8*> [#uses=9]
+ %bitptr.6 = phi i32 [ %tmp5571104.1, %cond_next599 ], [ %tmp4921049, %cond_true336 ], [ %bitptr.7, %bb493 ] ; <i32> [#uses=4]
+ %source.5 = phi i32 [ %tmp602, %cond_next599 ], [ %source.0, %cond_true336 ], [ %source.6, %bb493 ] ; <i32> [#uses=7]
+ %tmp606 = getelementptr %struct.const_tables* @tables, i32 0, i32 0, i32 %cnt.01055.1 ; <i8*> [#uses=1]
+ %tmp607 = load i8* %tmp606, align 1 ; <i8> [#uses=1]
+
+We produce this:
+
+LBB4_70: # cond_next603
+ movl -20(%ebp), %esi
+ movl L_tables$non_lazy_ptr-"L4$pb"(%esi), %esi
+
+However, ICC caches this information before the loop and produces this:
+
+ movl 88(%esp), %eax #481.12
+
+//===---------------------------------------------------------------------===//
+
+This code:
+
+ %tmp659 = icmp slt i16 %tmp654, 0 ; <i1> [#uses=1]
+ br i1 %tmp659, label %cond_true662, label %cond_next715
+
+produces this:
+
+ testw %cx, %cx
+ movswl %cx, %esi
+ jns LBB4_109 # cond_next715
+
+Shark tells us that using %cx in the testw instruction is sub-optimal. It
+suggests using the 32-bit register (which is what ICC uses).
+
+//===---------------------------------------------------------------------===//
+
+rdar://5506677 - We compile this:
+
+define i32 @foo(double %x) {
+ %x14 = bitcast double %x to i64 ; <i64> [#uses=1]
+ %tmp713 = trunc i64 %x14 to i32 ; <i32> [#uses=1]
+ %tmp8 = and i32 %tmp713, 2147483647 ; <i32> [#uses=1]
+ ret i32 %tmp8
+}
+
+to:
+
+_foo:
+ subl $12, %esp
+ fldl 16(%esp)
+ fstpl (%esp)
+ movl $2147483647, %eax
+ andl (%esp), %eax
+ addl $12, %esp
+ #FP_REG_KILL
+ ret
- testl %edx, %edx
- js LBB1_8
+It would be much better to eliminate the fldl/fstpl by folding the bitcast
+into the load SDNode. That would give us:
-This saves a byte of code space.
+_foo:
+ movl $2147483647, %eax
+ andl 4(%esp), %eax
+ ret
+
+//===---------------------------------------------------------------------===//
+
+We compile this:
+
+void compare (long long foo) {
+ if (foo < 4294967297LL)
+ abort();
+}
+
+to:
+
+_compare:
+ subl $12, %esp
+ cmpl $0, 16(%esp)
+ setne %al
+ movzbw %al, %ax
+ cmpl $1, 20(%esp)
+ setg %cl
+ movzbw %cl, %cx
+ cmove %ax, %cx
+ movw %cx, %ax
+ testb $1, %al
+ je LBB1_2 # cond_true
+
+(also really horrible code on ppc). This is due to the expand code for 64-bit
+compares. GCC produces multiple branches, which is much nicer:
+
+_compare:
+ pushl %ebp
+ movl %esp, %ebp
+ subl $8, %esp
+ movl 8(%ebp), %eax
+ movl 12(%ebp), %edx
+ subl $1, %edx
+ jg L5
+L7:
+ jl L4
+ cmpl $0, %eax
+ jbe L4
+L5:
+
+//===---------------------------------------------------------------------===//
+
+Tail call optimization improvements: Tail call optimization currently
+pushes all arguments on the top of the stack (their normal place for
+non-tail call optimized calls) that source from the caller's arguments
+or that source from a virtual register (also possibly sourcing from
+caller's arguments).
+This is done to prevent overwriting of parameters (see example
+below) that might be used later.
+
+example:
+
+int callee(int32, int64);
+int caller(int32 arg1, int32 arg2) {
+ int64 local = arg2 * 2;
+ return callee(arg2, (int64)local);
+}
+
+[arg1] [!arg2 no longer valid since we moved local onto it]
+[arg2] -> [(int64)
+[RETADDR] local ]
+
+Moving arg1 onto the stack slot of callee function would overwrite
+arg2 of the caller.
+
+Possible optimizations:
+
+
+ - Analyse the actual parameters of the callee to see which would
+ overwrite a caller parameter which is used by the callee and only
+ push them onto the top of the stack.
+
+ int callee (int32 arg1, int32 arg2);
+ int caller (int32 arg1, int32 arg2) {
+ return callee(arg1,arg2);
+ }
+
+ Here we don't need to write any variables to the top of the stack
+ since they don't overwrite each other.
+
+ int callee (int32 arg1, int32 arg2);
+ int caller (int32 arg1, int32 arg2) {
+ return callee(arg2,arg1);
+ }
+
+ Here we need to push the arguments because they overwrite each
+ other.
+
+//===---------------------------------------------------------------------===//
+
+main ()
+{
+ int i = 0;
+ unsigned long int z = 0;
+
+ do {
+ z -= 0x00004000;
+ i++;
+ if (i > 0x00040000)
+ abort ();
+ } while (z > 0);
+ exit (0);
+}
+
+gcc compiles this to:
+
+_main:
+ subl $28, %esp
+ xorl %eax, %eax
+ jmp L2
+L3:
+ cmpl $262144, %eax
+ je L10
+L2:
+ addl $1, %eax
+ cmpl $262145, %eax
+ jne L3
+ call L_abort$stub
+L10:
+ movl $0, (%esp)
+ call L_exit$stub
+
+llvm:
+
+_main:
+ subl $12, %esp
+ movl $1, %eax
+ movl $16384, %ecx
+LBB1_1: # bb
+ cmpl $262145, %eax
+ jge LBB1_4 # cond_true
+LBB1_2: # cond_next
+ incl %eax
+ addl $4294950912, %ecx
+ cmpl $16384, %ecx
+ jne LBB1_1 # bb
+LBB1_3: # bb11
+ xorl %eax, %eax
+ addl $12, %esp
+ ret
+LBB1_4: # cond_true
+ call L_abort$stub
+
+1. LSR should rewrite the first cmp with induction variable %ecx.
+2. DAG combiner should fold
+ leal 1(%eax), %edx
+ cmpl $262145, %edx
+ =>
+ cmpl $262144, %eax
+
+//===---------------------------------------------------------------------===//
+
+define i64 @test(double %X) {
+ %Y = fptosi double %X to i64
+ ret i64 %Y
+}
+
+compiles to:
+
+_test:
+ subl $20, %esp
+ movsd 24(%esp), %xmm0
+ movsd %xmm0, 8(%esp)
+ fldl 8(%esp)
+ fisttpll (%esp)
+ movl 4(%esp), %edx
+ movl (%esp), %eax
+ addl $20, %esp
+ #FP_REG_KILL
+ ret
+
+This should just fldl directly from the input stack slot.
+
+//===---------------------------------------------------------------------===//
+
+This code:
+int foo (int x) { return (x & 65535) | 255; }
+
+Should compile into:
+
+_foo:
+ movzwl 4(%esp), %eax
+ orb $-1, %al ;; 'orl 255' is also fine :)
+ ret
+
+instead of:
+_foo:
+ movl $255, %eax
+ orl 4(%esp), %eax
+ andl $65535, %eax
+ ret
+
+//===---------------------------------------------------------------------===//
+
+We're missing an obvious fold of a load into imul:
+
+int test(long a, long b) { return a * b; }
+
+LLVM produces:
+_test:
+ movl 4(%esp), %ecx
+ movl 8(%esp), %eax
+ imull %ecx, %eax
+ ret
+
+vs:
+_test:
+ movl 8(%esp), %eax
+ imull 4(%esp), %eax
+ ret
+
+//===---------------------------------------------------------------------===//
+
+We can fold a store into "zeroing a reg". Instead of:
+
+xorl %eax, %eax
+movl %eax, 124(%esp)
+
+we should get:
+
+movl $0, 124(%esp)
+
+if the flags of the xor are dead.
+
+Likewise, we isel "x<<1" into "add reg,reg". If reg is spilled, this should
+be folded into: shl [mem], 1
+
+//===---------------------------------------------------------------------===//
+
+This testcase misses a read/modify/write opportunity (from PR1425):
+
+void vertical_decompose97iH1(int *b0, int *b1, int *b2, int width){
+ int i;
+ for(i=0; i<width; i++)
+ b1[i] += (1*(b0[i] + b2[i])+0)>>0;
+}
+
+We compile it down to:
+
+LBB1_2: # bb
+ movl (%esi,%edi,4), %ebx
+ addl (%ecx,%edi,4), %ebx
+ addl (%edx,%edi,4), %ebx
+ movl %ebx, (%ecx,%edi,4)
+ incl %edi
+ cmpl %eax, %edi
+ jne LBB1_2 # bb
+
+the inner loop should add to the memory location (%ecx,%edi,4), saving
+a mov. Something like:
+
+ movl (%esi,%edi,4), %ebx
+ addl (%edx,%edi,4), %ebx
+ addl %ebx, (%ecx,%edi,4)
+
+Here is another interesting example:
+
+void vertical_compose97iH1(int *b0, int *b1, int *b2, int width){
+ int i;
+ for(i=0; i<width; i++)
+ b1[i] -= (1*(b0[i] + b2[i])+0)>>0;
+}
+
+We miss the r/m/w opportunity here by using 2 subs instead of an add+sub[mem]:
+
+LBB9_2: # bb
+ movl (%ecx,%edi,4), %ebx
+ subl (%esi,%edi,4), %ebx
+ subl (%edx,%edi,4), %ebx
+ movl %ebx, (%ecx,%edi,4)
+ incl %edi
+ cmpl %eax, %edi
+ jne LBB9_2 # bb
+
+Additionally, LSR should rewrite the exit condition of these loops to use
+a stride-4 IV, which would allow all the scales in the loop to go away.
+This would result in smaller code and more efficient microops.
+
+//===---------------------------------------------------------------------===//
+
+In SSE mode, we turn abs and neg into a load from the constant pool plus a xor
+or and instruction, for example:
+
+ xorpd LCPI1_0, %xmm2
+
+However, if xmm2 gets spilled, we end up with really ugly code like this:
+
+ movsd (%esp), %xmm0
+ xorpd LCPI1_0, %xmm0
+ movsd %xmm0, (%esp)
+
+Since we 'know' that this is a 'neg', we can actually "fold" the spill into
+the neg/abs instruction, turning it into an *integer* operation, like this:
+
+ xorl 2147483648, [mem+4] ## 2147483648 = (1 << 31)
+
+you could also use xorb, but xorl is less likely to lead to a partial register
+stall. Here is a contrived testcase:
+
+double a, b, c;
+void test(double *P) {
+ double X = *P;
+ a = X;
+ bar();
+ X = -X;
+ b = X;
+ bar();
+ c = X;
+}
+
+//===---------------------------------------------------------------------===//