// Random ideas for the X86 backend.
//===---------------------------------------------------------------------===//
-Missing features:
- - Support for SSE4: http://www.intel.com/software/penryn
-http://softwarecommunity.intel.com/isn/Downloads/Intel%20SSE4%20Programming%20Reference.pdf
- - support for 3DNow!
- - weird abis?
-
-//===---------------------------------------------------------------------===//
-
-Add a MUL2U and MUL2S nodes to represent a multiply that returns both the
-Hi and Lo parts (combination of MUL and MULH[SU] into one node). Add this to
-X86, & make the dag combiner produce it when needed. This will eliminate one
-imul from the code generated for:
-
-long long test(long long X, long long Y) { return X*Y; }
-
-by using the EAX result from the mul. We should add a similar node for
-DIVREM.
-
-another case is:
-
-long long test(int X, int Y) { return (long long)X*Y; }
-
-... which should only be one imul instruction.
-
-or:
-
-unsigned long long int t2(unsigned int a, unsigned int b) {
- return (unsigned long long)a * b;
-}
-
-... which should be one mul instruction.
-
-
-This can be done with a custom expander, but it would be nice to move this to
-generic code.
+We should add support for the "movbe" instruction, which does a byte-swapping
+copy (3-addr bswap + memory support?) This is available on Atom processors.
//===---------------------------------------------------------------------===//
But that requires good 8-bit subreg support.
+Also, this might be better. It's an extra shift, but it's one instruction
+shorter, and doesn't stress 8-bit subreg support.
+(From http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01148.html,
+but without the unnecessary and.)
+ movl %ecx, %eax
+ shrl $5, %eax
+ movl %eax, %edx
+ xorl $1, %edx
+ sall %cl, %eax
+ sall %cl. %edx
+
64-bit shifts (in general) expand to really bad code. Instead of using
cmovs, we should expand to a conditional branch like GCC produces.
xorl $1, %eax
ret
+(Although note that this isn't a legal way to express the code that llvm-gcc
+currently generates for that function.)
+
//===---------------------------------------------------------------------===//
Some isel ideas:
Leave any_extend as pseudo instruction and hint to register
allocator. Delay codegen until post register allocation.
-
-//===---------------------------------------------------------------------===//
-
-Count leading zeros and count trailing zeros:
-
-int clz(int X) { return __builtin_clz(X); }
-int ctz(int X) { return __builtin_ctz(X); }
-
-$ gcc t.c -S -o - -O3 -fomit-frame-pointer -masm=intel
-clz:
- bsr %eax, DWORD PTR [%esp+4]
- xor %eax, 31
- ret
-ctz:
- bsf %eax, DWORD PTR [%esp+4]
- ret
-
-however, check that these are defined for 0 and 32. Our intrinsics are, GCC's
-aren't.
-
-Another example (use predsimplify to eliminate a select):
-
-int foo (unsigned long j) {
- if (j)
- return __builtin_ffs (j) - 1;
- else
- return 0;
-}
+Note. any_extend is now turned into an INSERT_SUBREG. We still need to teach
+the coalescer how to deal with it though.
//===---------------------------------------------------------------------===//
//===---------------------------------------------------------------------===//
-How about intrinsics? An example is:
- *res = _mm_mulhi_epu16(*A, _mm_mul_epu32(*B, *C));
-
-compiles to
- pmuludq (%eax), %xmm0
- movl 8(%esp), %eax
- movdqa (%eax), %xmm1
- pmulhuw %xmm0, %xmm1
-
-The transformation probably requires a X86 specific pass or a DAG combiner
-target specific hook.
-
-//===---------------------------------------------------------------------===//
-
In many cases, LLVM generates code like this:
_test:
Evaluate what the best way to codegen sdiv X, (2^C) is. For X/8, we currently
get this:
-int %test1(int %X) {
- %Y = div int %X, 8
- ret int %Y
+define i32 @test1(i32 %X) {
+ %Y = sdiv i32 %X, 8
+ ret i32 %Y
}
_test1:
//===---------------------------------------------------------------------===//
-The first BB of this code:
-
-declare bool %foo()
-int %bar() {
- %V = call bool %foo()
- br bool %V, label %T, label %F
-T:
- ret int 1
-F:
- call bool %foo()
- ret int 12
-}
-
-compiles to:
-
-_bar:
- subl $12, %esp
- call L_foo$stub
- xorb $1, %al
- testb %al, %al
- jne LBB_bar_2 # F
-
-It would be better to emit "cmp %al, 1" than a xor and test.
-
-//===---------------------------------------------------------------------===//
-
-Enable X86InstrInfo::convertToThreeAddress().
-
-//===---------------------------------------------------------------------===//
-
We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and rep/movsl
We should leave these as libcalls for everything over a much lower threshold,
since libc is hand tuned for medium and large mem ops (avoiding RFO for large
//===---------------------------------------------------------------------===//
-%X = weak global int 0
-
-void %foo(int %N) {
- %N = cast int %N to uint
- %tmp.24 = setgt int %N, 0
- br bool %tmp.24, label %no_exit, label %return
-
-no_exit:
- %indvar = phi uint [ 0, %entry ], [ %indvar.next, %no_exit ]
- %i.0.0 = cast uint %indvar to int
- volatile store int %i.0.0, int* %X
- %indvar.next = add uint %indvar, 1
- %exitcond = seteq uint %indvar.next, %N
- br bool %exitcond, label %return, label %no_exit
-
-return:
- ret void
-}
-
-compiles into:
-
- .text
- .align 4
- .globl _foo
-_foo:
- movl 4(%esp), %eax
- cmpl $1, %eax
- jl LBB_foo_4 # return
-LBB_foo_1: # no_exit.preheader
- xorl %ecx, %ecx
-LBB_foo_2: # no_exit
- movl L_X$non_lazy_ptr, %edx
- movl %ecx, (%edx)
- incl %ecx
- cmpl %eax, %ecx
- jne LBB_foo_2 # no_exit
-LBB_foo_3: # return.loopexit
-LBB_foo_4: # return
- ret
-
-We should hoist "movl L_X$non_lazy_ptr, %edx" out of the loop after
-remateralization is implemented. This can be accomplished with 1) a target
-dependent LICM pass or 2) makeing SelectDAG represent the whole function.
-
-//===---------------------------------------------------------------------===//
-
The following tests perform worse with LSR:
lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesor.
//===---------------------------------------------------------------------===//
-We are generating far worse code than gcc:
-
-volatile short X, Y;
-
-void foo(int N) {
- int i;
- for (i = 0; i < N; i++) { X = i; Y = i*4; }
-}
-
-LBB1_1: #bb.preheader
- xorl %ecx, %ecx
- xorw %dx, %dx
-LBB1_2: #bb
- movl L_X$non_lazy_ptr, %esi
- movw %dx, (%esi)
- movw %dx, %si
- shlw $2, %si
- movl L_Y$non_lazy_ptr, %edi
- movw %si, (%edi)
- incl %ecx
- incw %dx
- cmpl %eax, %ecx
- jne LBB1_2 #bb
-
-vs.
-
- xorl %edx, %edx
- movl L_X$non_lazy_ptr-"L00000000001$pb"(%ebx), %esi
- movl L_Y$non_lazy_ptr-"L00000000001$pb"(%ebx), %ecx
-L4:
- movw %dx, (%esi)
- leal 0(,%edx,4), %eax
- movw %ax, (%ecx)
- addl $1, %edx
- cmpl %edx, %edi
- jne L4
-
-There are 3 issues:
-
-1. Lack of post regalloc LICM.
-2. LSR unable to reused IV for a different type (i16 vs. i32) even though
- the cast would be free.
-
-//===---------------------------------------------------------------------===//
-
-Teach the coalescer to coalesce vregs of different register classes. e.g. FR32 /
-FR64 to VR128.
-
-//===---------------------------------------------------------------------===//
-
-mov $reg, 48(%esp)
-...
-leal 48(%esp), %eax
-mov %eax, (%esp)
-call _foo
-
-Obviously it would have been better for the first mov (or any op) to store
-directly %esp[0] if there are no other uses.
-
-//===---------------------------------------------------------------------===//
-
Adding to the list of cmp / test poor codegen issues:
int test(__m128 *A, __m128 *B) {
//===---------------------------------------------------------------------===//
-If shorter, we should use things like:
-movzwl %ax, %eax
-instead of:
-andl $65535, %EAX
-
-The former can also be used when the two-addressy nature of the 'and' would
-require a copy to be inserted (in X86InstrInfo::convertToThreeAddress).
-
-//===---------------------------------------------------------------------===//
-
-Consider this:
-
-typedef struct pair { float A, B; } pair;
-void pairtest(pair P, float *FP) {
- *FP = P.A+P.B;
-}
-
-We currently generate this code with llvmgcc4:
-
-_pairtest:
- movl 8(%esp), %eax
- movl 4(%esp), %ecx
- movd %eax, %xmm0
- movd %ecx, %xmm1
- addss %xmm0, %xmm1
- movl 12(%esp), %eax
- movss %xmm1, (%eax)
- ret
-
-we should be able to generate:
-_pairtest:
- movss 4(%esp), %xmm0
- movl 12(%esp), %eax
- addss 8(%esp), %xmm0
- movss %xmm0, (%eax)
- ret
-
-The issue is that llvmgcc4 is forcing the struct to memory, then passing it as
-integer chunks. It does this so that structs like {short,short} are passed in
-a single 32-bit integer stack slot. We should handle the safe cases above much
-nicer, while still handling the hard cases.
-
-While true in general, in this specific case we could do better by promoting
-load int + bitcast to float -> load fload. This basically needs alignment info,
-the code is already implemented (but disabled) in dag combine).
-
-//===---------------------------------------------------------------------===//
-
-Another instruction selector deficiency:
-
-void %bar() {
- %tmp = load int (int)** %foo
- %tmp = tail call int %tmp( int 3 )
- ret void
-}
-
-_bar:
- subl $12, %esp
- movl L_foo$non_lazy_ptr, %eax
- movl (%eax), %eax
- call *%eax
- addl $12, %esp
- ret
-
-The current isel scheme will not allow the load to be folded in the call since
-the load's chain result is read by the callseq_start.
-
-//===---------------------------------------------------------------------===//
-
For this:
int test(int a)
However, if we care more about code size, then imull is better. It's two bytes
shorter than movl + leal.
+On a Pentium M, both variants have the same characteristics with regard
+to throughput; however, the multiplication has a latency of four cycles, as
+opposed to two cycles for the movl+lea variant.
+
//===---------------------------------------------------------------------===//
-Implement CTTZ, CTLZ with bsf and bsr. GCC produces:
+__builtin_ffs codegen is messy.
-int ctz_(unsigned X) { return __builtin_ctz(X); }
-int clz_(unsigned X) { return __builtin_clz(X); }
int ffs_(unsigned X) { return __builtin_ffs(X); }
-_ctz_:
- bsfl 4(%esp), %eax
- ret
-_clz_:
- bsrl 4(%esp), %eax
- xorl $31, %eax
+llvm produces:
+ffs_:
+ movl 4(%esp), %ecx
+ bsfl %ecx, %eax
+ movl $32, %edx
+ cmove %edx, %eax
+ incl %eax
+ xorl %edx, %edx
+ testl %ecx, %ecx
+ cmove %edx, %eax
ret
+
+vs gcc:
+
_ffs_:
movl $-1, %edx
bsfl 4(%esp), %eax
addl $1, %eax
ret
+Another example of __builtin_ffs (use predsimplify to eliminate a select):
+
+int foo (unsigned long j) {
+ if (j)
+ return __builtin_ffs (j) - 1;
+ else
+ return 0;
+}
+
//===---------------------------------------------------------------------===//
It appears gcc place string data with linkonce linkage in
//===---------------------------------------------------------------------===//
-int %foo(int* %a, int %t) {
+define i32 @foo(i32* %a, i32 %t) {
entry:
- br label %cond_true
-
-cond_true: ; preds = %cond_true, %entry
- %x.0.0 = phi int [ 0, %entry ], [ %tmp9, %cond_true ]
- %t_addr.0.0 = phi int [ %t, %entry ], [ %tmp7, %cond_true ]
- %tmp2 = getelementptr int* %a, int %x.0.0
- %tmp3 = load int* %tmp2 ; <int> [#uses=1]
- %tmp5 = add int %t_addr.0.0, %x.0.0 ; <int> [#uses=1]
- %tmp7 = add int %tmp5, %tmp3 ; <int> [#uses=2]
- %tmp9 = add int %x.0.0, 1 ; <int> [#uses=2]
- %tmp = setgt int %tmp9, 39 ; <bool> [#uses=1]
- br bool %tmp, label %bb12, label %cond_true
-
-bb12: ; preds = %cond_true
- ret int %tmp7
+ br label %cond_true
+
+cond_true: ; preds = %cond_true, %entry
+ %x.0.0 = phi i32 [ 0, %entry ], [ %tmp9, %cond_true ] ; <i32> [#uses=3]
+ %t_addr.0.0 = phi i32 [ %t, %entry ], [ %tmp7, %cond_true ] ; <i32> [#uses=1]
+ %tmp2 = getelementptr i32* %a, i32 %x.0.0 ; <i32*> [#uses=1]
+ %tmp3 = load i32* %tmp2 ; <i32> [#uses=1]
+ %tmp5 = add i32 %t_addr.0.0, %x.0.0 ; <i32> [#uses=1]
+ %tmp7 = add i32 %tmp5, %tmp3 ; <i32> [#uses=2]
+ %tmp9 = add i32 %x.0.0, 1 ; <i32> [#uses=2]
+ %tmp = icmp sgt i32 %tmp9, 39 ; <i1> [#uses=1]
+ br i1 %tmp, label %bb12, label %cond_true
+
+bb12: ; preds = %cond_true
+ ret i32 %tmp7
}
-
is pessimized by -loop-reduce and -indvars
//===---------------------------------------------------------------------===//
etc.
-//===---------------------------------------------------------------------===//
-
-Currently we don't have elimination of redundant stack manipulations. Consider
-the code:
-
-int %main() {
-entry:
- call fastcc void %test1( )
- call fastcc void %test2( sbyte* cast (void ()* %test1 to sbyte*) )
- ret int 0
-}
-
-declare fastcc void %test1()
-
-declare fastcc void %test2(sbyte*)
-
-
-This currently compiles to:
-
- subl $16, %esp
- call _test5
- addl $12, %esp
- subl $16, %esp
- movl $_test5, (%esp)
- call _test6
- addl $12, %esp
-
-The add\sub pair is really unneeded here.
-
-//===---------------------------------------------------------------------===//
-
-We currently compile sign_extend_inreg into two shifts:
-
-long foo(long X) {
- return (long)(signed char)X;
+Another is:
+int usesbb(unsigned int a, unsigned int b) {
+ return (a < b ? -1 : 0);
}
+to:
+_usesbb:
+ movl 8(%esp), %eax
+ cmpl %eax, 4(%esp)
+ sbbl %eax, %eax
+ ret
-becomes:
-
-_foo:
- movl 4(%esp), %eax
- shll $24, %eax
- sarl $24, %eax
- ret
-
-This could be:
-
-_foo:
- movsbl 4(%esp),%eax
- ret
+instead of:
+_usesbb:
+ xorl %eax, %eax
+ movl 8(%esp), %ecx
+ cmpl %ecx, 4(%esp)
+ movl $4294967295, %ecx
+ cmovb %ecx, %eax
+ ret
//===---------------------------------------------------------------------===//
Consider the expansion of:
-uint %test3(uint %X) {
- %tmp1 = rem uint %X, 255
- ret uint %tmp1
+define i32 @test3(i32 %X) {
+ %tmp1 = urem i32 %X, 255
+ ret i32 %tmp1
}
Currently it compiles to:
to grab the bytes from the next cacheline.
532 532 0x3cfc movb (1809(%esp, %esi), %bl <<<--- spans 2 64 byte lines
-942 942 0x3d03 movl %dh, (1809(%esp, %esi)
-937 937 0x3d0a incl %esi
-3 3 0x3d0b cmpb %bl, %dl
+942 942 0x3d03 movl %dh, (1809(%esp, %esi)
+937 937 0x3d0a incl %esi
+3 3 0x3d0b cmpb %bl, %dl
27 27 0x3d0d jnz 0x000062db <main+11707>
//===---------------------------------------------------------------------===//
//===---------------------------------------------------------------------===//
-Start using the flags more. For example, compile:
+Use the FLAGS values from arithmetic instructions more. For example, compile:
int add_zf(int *x, int y, int a, int b) {
if ((*x += y) == 0)
movl %ecx, %eax
ret
-and:
-
-int add_zf(int *x, int y, int a, int b) {
- if ((*x + y) < 0)
- return a;
- else
- return b;
-}
-
-to:
-
-add_zf:
- addl (%rdi), %esi
- movl %edx, %eax
- cmovns %ecx, %eax
- ret
-
-instead of:
-
-_add_zf:
- addl (%rdi), %esi
- testl %esi, %esi
- cmovs %edx, %ecx
- movl %ecx, %eax
- ret
-
-//===---------------------------------------------------------------------===//
-
-This:
-#include <math.h>
-int foo(double X) { return isnan(X); }
-
-compiles to (-m64):
-
-_foo:
- pxor %xmm1, %xmm1
- ucomisd %xmm1, %xmm0
- setp %al
- movzbl %al, %eax
- ret
-
-the pxor is not needed, we could compare the value against itself.
+As another example, compile function f2 in test/CodeGen/X86/cmp-test.ll
+without a test instruction.
//===---------------------------------------------------------------------===//
//===---------------------------------------------------------------------===//
-We need to teach the codegen to convert two-address INC instructions to LEA
-when the flags are dead (likewise dec). For example, on X86-64, compile:
-
-int foo(int A, int B) {
- return A+1;
-}
-
-to:
-
-_foo:
- leal 1(%edi), %eax
- ret
-
-instead of:
-
-_foo:
- incl %edi
- movl %edi, %eax
- ret
-
-Another example is:
-
-;; X's live range extends beyond the shift, so the register allocator
-;; cannot coalesce it with Y. Because of this, a copy needs to be
-;; emitted before the shift to save the register value before it is
-;; clobbered. However, this copy is not needed if the register
-;; allocator turns the shift into an LEA. This also occurs for ADD.
-
-; Check that the shift gets turned into an LEA.
-; RUN: llvm-upgrade < %s | llvm-as | llc -march=x86 -x86-asm-syntax=intel | \
-; RUN: not grep {mov E.X, E.X}
-
-%G = external global int
-
-int %test1(int %X, int %Y) {
- %Z = add int %X, %Y
- volatile store int %Y, int* %G
- volatile store int %Z, int* %G
- ret int %X
-}
-
-int %test2(int %X) {
- %Z = add int %X, 1 ;; inc
- volatile store int %Z, int* %G
- ret int %X
-}
-
-//===---------------------------------------------------------------------===//
-
-This:
-#include <xmmintrin.h>
-unsigned test(float f) {
- return _mm_cvtsi128_si32( (__m128i) _mm_set_ss( f ));
-}
-
-Compiles to:
-_test:
- movss 4(%esp), %xmm0
- movd %xmm0, %eax
- ret
-
-it should compile to a move from the stack slot directly into eax. DAGCombine
-has this xform, but it is currently disabled until the alignment fields of
-the load/store nodes are trustworthy.
-
-//===---------------------------------------------------------------------===//
-
Sometimes it is better to codegen subtractions from a constant (e.g. 7-x) with
a neg instead of a sub instruction. Consider:
//===---------------------------------------------------------------------===//
-For code like:
-phi (undef, x)
-
-We get an implicit def on the undef side. If the phi is spilled, we then get:
-implicitdef xmm1
-store xmm1 -> stack
-
-It should be possible to teach the x86 backend to "fold" the store into the
-implicitdef, which just deletes the implicit def.
-
-These instructions should go away:
-#IMPLICIT_DEF %xmm1
-movaps %xmm1, 192(%esp)
-movaps %xmm1, 224(%esp)
-movaps %xmm1, 176(%esp)
-
-//===---------------------------------------------------------------------===//
-
-This is a "commutable two-address" register coallescing deficiency:
-
-define <4 x float> @test1(<4 x float> %V) {
-entry:
- %tmp8 = shufflevector <4 x float> %V, <4 x float> undef,
- <4 x i32> < i32 3, i32 2, i32 1, i32 0 >
- %add = add <4 x float> %tmp8, %V
- ret <4 x float> %add
-}
-
-this codegens to:
-
-_test1:
- pshufd $27, %xmm0, %xmm1
- addps %xmm0, %xmm1
- movaps %xmm1, %xmm0
- ret
-
-instead of:
-
-_test1:
- pshufd $27, %xmm0, %xmm1
- addps %xmm1, %xmm0
- ret
-
-//===---------------------------------------------------------------------===//
-
Leaf functions that require one 4-byte spill slot have a prolog like this:
_foo:
setae %al
ret
+FIXME: That code looks wrong; bool return is normally defined as zext.
+
on x86-64, not:
__Z11no_overflowjj:
//===---------------------------------------------------------------------===//
-Re-materialize MOV32r0 etc. with xor instead of changing them to moves if the
-condition register is dead. xor reg reg is shorter than mov reg, #0.
-
-//===---------------------------------------------------------------------===//
-
-We aren't matching RMW instructions aggressively
-enough. Here's a reduced testcase (more in PR1160):
-
-define void @test(i32* %huge_ptr, i32* %target_ptr) {
- %A = load i32* %huge_ptr ; <i32> [#uses=1]
- %B = load i32* %target_ptr ; <i32> [#uses=1]
- %C = or i32 %A, %B ; <i32> [#uses=1]
- store i32 %C, i32* %target_ptr
- ret void
-}
-
-$ llvm-as < t.ll | llc -march=x86-64
-
-_test:
- movl (%rdi), %eax
- orl (%rsi), %eax
- movl %eax, (%rsi)
- ret
-
-That should be something like:
-
-_test:
- movl (%rdi), %eax
- orl %eax, (%rsi)
- ret
-
-//===---------------------------------------------------------------------===//
-
The following code:
+bb114.preheader: ; preds = %cond_next94
+ %tmp231232 = sext i16 %tmp62 to i32 ; <i32> [#uses=1]
+ %tmp233 = sub i32 32, %tmp231232 ; <i32> [#uses=1]
+ %tmp245246 = sext i16 %tmp65 to i32 ; <i32> [#uses=1]
+ %tmp252253 = sext i16 %tmp68 to i32 ; <i32> [#uses=1]
+ %tmp254 = sub i32 32, %tmp252253 ; <i32> [#uses=1]
+ %tmp553554 = bitcast i16* %tmp37 to i8* ; <i8*> [#uses=2]
+ %tmp583584 = sext i16 %tmp98 to i32 ; <i32> [#uses=1]
+ %tmp585 = sub i32 32, %tmp583584 ; <i32> [#uses=1]
+ %tmp614615 = sext i16 %tmp101 to i32 ; <i32> [#uses=1]
+ %tmp621622 = sext i16 %tmp104 to i32 ; <i32> [#uses=1]
+ %tmp623 = sub i32 32, %tmp621622 ; <i32> [#uses=1]
+ br label %bb114
+
+produces:
+
LBB3_5: # bb114.preheader
movswl -68(%ebp), %eax
movl $32, %ecx
movl %eax, -96(%ebp)
movw $0, -98(%ebp)
-has redundant subtractions of %eax from a stack slot. However, %ecx doesn't
-change, so we could simply subtract %eax from %ecx first and then use %ecx (or
-vice-versa).
+This appears to be bad because the RA is not folding the store to the stack
+slot into the movl. The above instructions could be:
+ movl $32, -80(%ebp)
+...
+ movl $32, -84(%ebp)
+...
+This seems like a cross between remat and spill folding.
+
+This has redundant subtractions of %eax from a stack slot. However, %ecx doesn't
+change, so we could simply subtract %eax from %ecx first and then use %ecx (or
+vice-versa).
+
+//===---------------------------------------------------------------------===//
+
+This code:
+
+ %tmp659 = icmp slt i16 %tmp654, 0 ; <i1> [#uses=1]
+ br i1 %tmp659, label %cond_true662, label %cond_next715
+
+produces this:
+
+ testw %cx, %cx
+ movswl %cx, %esi
+ jns LBB4_109 # cond_next715
+
+Shark tells us that using %cx in the testw instruction is sub-optimal. It
+suggests using the 32-bit register (which is what ICC uses).
+
+//===---------------------------------------------------------------------===//
+
+We compile this:
+
+void compare (long long foo) {
+ if (foo < 4294967297LL)
+ abort();
+}
+
+to:
+
+compare:
+ subl $4, %esp
+ cmpl $0, 8(%esp)
+ setne %al
+ movzbw %al, %ax
+ cmpl $1, 12(%esp)
+ setg %cl
+ movzbw %cl, %cx
+ cmove %ax, %cx
+ testb $1, %cl
+ jne .LBB1_2 # UnifiedReturnBlock
+.LBB1_1: # ifthen
+ call abort
+.LBB1_2: # UnifiedReturnBlock
+ addl $4, %esp
+ ret
+
+(also really horrible code on ppc). This is due to the expand code for 64-bit
+compares. GCC produces multiple branches, which is much nicer:
+
+compare:
+ subl $12, %esp
+ movl 20(%esp), %edx
+ movl 16(%esp), %eax
+ decl %edx
+ jle .L7
+.L5:
+ addl $12, %esp
+ ret
+ .p2align 4,,7
+.L7:
+ jl .L4
+ cmpl $0, %eax
+ .p2align 4,,8
+ ja .L5
+.L4:
+ .p2align 4,,9
+ call abort
+
+//===---------------------------------------------------------------------===//
+
+Tail call optimization improvements: Tail call optimization currently
+pushes all arguments on the top of the stack (their normal place for
+non-tail call optimized calls) that source from the callers arguments
+or that source from a virtual register (also possibly sourcing from
+callers arguments).
+This is done to prevent overwriting of parameters (see example
+below) that might be used later.
+
+example:
+
+int callee(int32, int64);
+int caller(int32 arg1, int32 arg2) {
+ int64 local = arg2 * 2;
+ return callee(arg2, (int64)local);
+}
+
+[arg1] [!arg2 no longer valid since we moved local onto it]
+[arg2] -> [(int64)
+[RETADDR] local ]
+
+Moving arg1 onto the stack slot of callee function would overwrite
+arg2 of the caller.
+
+Possible optimizations:
+
+
+ - Analyse the actual parameters of the callee to see which would
+ overwrite a caller parameter which is used by the callee and only
+ push them onto the top of the stack.
+
+ int callee (int32 arg1, int32 arg2);
+ int caller (int32 arg1, int32 arg2) {
+ return callee(arg1,arg2);
+ }
+
+ Here we don't need to write any variables to the top of the stack
+ since they don't overwrite each other.
+
+ int callee (int32 arg1, int32 arg2);
+ int caller (int32 arg1, int32 arg2) {
+ return callee(arg2,arg1);
+ }
+
+ Here we need to push the arguments because they overwrite each
+ other.
+
+//===---------------------------------------------------------------------===//
+
+main ()
+{
+ int i = 0;
+ unsigned long int z = 0;
+
+ do {
+ z -= 0x00004000;
+ i++;
+ if (i > 0x00040000)
+ abort ();
+ } while (z > 0);
+ exit (0);
+}
+
+gcc compiles this to:
+
+_main:
+ subl $28, %esp
+ xorl %eax, %eax
+ jmp L2
+L3:
+ cmpl $262144, %eax
+ je L10
+L2:
+ addl $1, %eax
+ cmpl $262145, %eax
+ jne L3
+ call L_abort$stub
+L10:
+ movl $0, (%esp)
+ call L_exit$stub
+
+llvm:
+
+_main:
+ subl $12, %esp
+ movl $1, %eax
+ movl $16384, %ecx
+LBB1_1: # bb
+ cmpl $262145, %eax
+ jge LBB1_4 # cond_true
+LBB1_2: # cond_next
+ incl %eax
+ addl $4294950912, %ecx
+ cmpl $16384, %ecx
+ jne LBB1_1 # bb
+LBB1_3: # bb11
+ xorl %eax, %eax
+ addl $12, %esp
+ ret
+LBB1_4: # cond_true
+ call L_abort$stub
+
+1. LSR should rewrite the first cmp with induction variable %ecx.
+2. DAG combiner should fold
+ leal 1(%eax), %edx
+ cmpl $262145, %edx
+ =>
+ cmpl $262144, %eax
+
+//===---------------------------------------------------------------------===//
+
+define i64 @test(double %X) {
+ %Y = fptosi double %X to i64
+ ret i64 %Y
+}
+
+compiles to:
+
+_test:
+ subl $20, %esp
+ movsd 24(%esp), %xmm0
+ movsd %xmm0, 8(%esp)
+ fldl 8(%esp)
+ fisttpll (%esp)
+ movl 4(%esp), %edx
+ movl (%esp), %eax
+ addl $20, %esp
+ #FP_REG_KILL
+ ret
+
+This should just fldl directly from the input stack slot.
+
+//===---------------------------------------------------------------------===//
+
+This code:
+int foo (int x) { return (x & 65535) | 255; }
+
+Should compile into:
+
+_foo:
+ movzwl 4(%esp), %eax
+ orl $255, %eax
+ ret
+
+instead of:
+_foo:
+ movl $255, %eax
+ orl 4(%esp), %eax
+ andl $65535, %eax
+ ret
+
+//===---------------------------------------------------------------------===//
+
+We're codegen'ing multiply of long longs inefficiently:
+
+unsigned long long LLM(unsigned long long arg1, unsigned long long arg2) {
+ return arg1 * arg2;
+}
+
+We compile to (fomit-frame-pointer):
+
+_LLM:
+ pushl %esi
+ movl 8(%esp), %ecx
+ movl 16(%esp), %esi
+ movl %esi, %eax
+ mull %ecx
+ imull 12(%esp), %esi
+ addl %edx, %esi
+ imull 20(%esp), %ecx
+ movl %esi, %edx
+ addl %ecx, %edx
+ popl %esi
+ ret
+
+This looks like a scheduling deficiency and lack of remat of the load from
+the argument area. ICC apparently produces:
+
+ movl 8(%esp), %ecx
+ imull 12(%esp), %ecx
+ movl 16(%esp), %eax
+ imull 4(%esp), %eax
+ addl %eax, %ecx
+ movl 4(%esp), %eax
+ mull 12(%esp)
+ addl %ecx, %edx
+ ret
+
+Note that it remat'd loads from 4(esp) and 12(esp). See this GCC PR:
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=17236
+
+//===---------------------------------------------------------------------===//
+
+We can fold a store into "zeroing a reg". Instead of:
+
+xorl %eax, %eax
+movl %eax, 124(%esp)
+
+we should get:
+
+movl $0, 124(%esp)
+
+if the flags of the xor are dead.
+
+Likewise, we isel "x<<1" into "add reg,reg". If reg is spilled, this should
+be folded into: shl [mem], 1
+
+//===---------------------------------------------------------------------===//
+
+In SSE mode, we turn abs and neg into a load from the constant pool plus a xor
+or and instruction, for example:
+
+ xorpd LCPI1_0, %xmm2
+
+However, if xmm2 gets spilled, we end up with really ugly code like this:
+
+ movsd (%esp), %xmm0
+ xorpd LCPI1_0, %xmm0
+ movsd %xmm0, (%esp)
+
+Since we 'know' that this is a 'neg', we can actually "fold" the spill into
+the neg/abs instruction, turning it into an *integer* operation, like this:
+
+ xorl 2147483648, [mem+4] ## 2147483648 = (1 << 31)
+
+you could also use xorb, but xorl is less likely to lead to a partial register
+stall. Here is a contrived testcase:
+
+double a, b, c;
+void test(double *P) {
+ double X = *P;
+ a = X;
+ bar();
+ X = -X;
+ b = X;
+ bar();
+ c = X;
+}
+
+//===---------------------------------------------------------------------===//
+
+The generated code on x86 for checking for signed overflow on a multiply the
+obvious way is much longer than it needs to be.
+
+int x(int a, int b) {
+ long long prod = (long long)a*b;
+ return prod > 0x7FFFFFFF || prod < (-0x7FFFFFFF-1);
+}
+
+See PR2053 for more details.
+
+//===---------------------------------------------------------------------===//
+
+We should investigate using cdq/ctld (effect: edx = sar eax, 31)
+more aggressively; it should cost the same as a move+shift on any modern
+processor, but it's a lot shorter. Downside is that it puts more
+pressure on register allocation because it has fixed operands.
+
+Example:
+int abs(int x) {return x < 0 ? -x : x;}
+
+gcc compiles this to the following when using march/mtune=pentium2/3/4/m/etc.:
+abs:
+ movl 4(%esp), %eax
+ cltd
+ xorl %edx, %eax
+ subl %edx, %eax
+ ret
+
+//===---------------------------------------------------------------------===//
+
+Consider:
+int test(unsigned long a, unsigned long b) { return -(a < b); }
+
+We currently compile this to:
+
+define i32 @test(i32 %a, i32 %b) nounwind {
+ %tmp3 = icmp ult i32 %a, %b ; <i1> [#uses=1]
+ %tmp34 = zext i1 %tmp3 to i32 ; <i32> [#uses=1]
+ %tmp5 = sub i32 0, %tmp34 ; <i32> [#uses=1]
+ ret i32 %tmp5
+}
+
+and
+
+_test:
+ movl 8(%esp), %eax
+ cmpl %eax, 4(%esp)
+ setb %al
+ movzbl %al, %eax
+ negl %eax
+ ret
+
+Several deficiencies here. First, we should instcombine zext+neg into sext:
+
+define i32 @test2(i32 %a, i32 %b) nounwind {
+ %tmp3 = icmp ult i32 %a, %b ; <i1> [#uses=1]
+ %tmp34 = sext i1 %tmp3 to i32 ; <i32> [#uses=1]
+ ret i32 %tmp34
+}
+
+However, before we can do that, we have to fix the bad codegen that we get for
+sext from bool:
+
+_test2:
+ movl 8(%esp), %eax
+ cmpl %eax, 4(%esp)
+ setb %al
+ movzbl %al, %eax
+ shll $31, %eax
+ sarl $31, %eax
+ ret
+
+This code should be at least as good as the code above. Once this is fixed, we
+can optimize this specific case even more to:
+
+ movl 8(%esp), %eax
+ xorl %ecx, %ecx
+ cmpl %eax, 4(%esp)
+ sbbl %ecx, %ecx
+
+//===---------------------------------------------------------------------===//
+
+Take the following code (from
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=16541):
+
+extern unsigned char first_one[65536];
+int FirstOnet(unsigned long long arg1)
+{
+ if (arg1 >> 48)
+ return (first_one[arg1 >> 48]);
+ return 0;
+}
+
+
+The following code is currently generated:
+FirstOnet:
+ movl 8(%esp), %eax
+ cmpl $65536, %eax
+ movl 4(%esp), %ecx
+ jb .LBB1_2 # UnifiedReturnBlock
+.LBB1_1: # ifthen
+ shrl $16, %eax
+ movzbl first_one(%eax), %eax
+ ret
+.LBB1_2: # UnifiedReturnBlock
+ xorl %eax, %eax
+ ret
+
+We could change the "movl 8(%esp), %eax" into "movzwl 10(%esp), %eax"; this
+lets us change the cmpl into a testl, which is shorter, and eliminate the shift.
+
+//===---------------------------------------------------------------------===//
+
+We compile this function:
+
+define i32 @foo(i32 %a, i32 %b, i32 %c, i8 zeroext %d) nounwind {
+entry:
+ %tmp2 = icmp eq i8 %d, 0 ; <i1> [#uses=1]
+ br i1 %tmp2, label %bb7, label %bb
+
+bb: ; preds = %entry
+ %tmp6 = add i32 %b, %a ; <i32> [#uses=1]
+ ret i32 %tmp6
+
+bb7: ; preds = %entry
+ %tmp10 = sub i32 %a, %c ; <i32> [#uses=1]
+ ret i32 %tmp10
+}
+
+to:
+
+foo: # @foo
+# BB#0: # %entry
+ movl 4(%esp), %ecx
+ cmpb $0, 16(%esp)
+ je .LBB0_2
+# BB#1: # %bb
+ movl 8(%esp), %eax
+ addl %ecx, %eax
+ ret
+.LBB0_2: # %bb7
+ movl 12(%esp), %edx
+ movl %ecx, %eax
+ subl %edx, %eax
+ ret
+
+There's an obviously unnecessary movl in .LBB0_2, and we could eliminate a
+couple more movls by putting 4(%esp) into %eax instead of %ecx.
+
+//===---------------------------------------------------------------------===//
+
+See rdar://4653682.
+
+From flops:
+
+LBB1_15: # bb310
+ cvtss2sd LCPI1_0, %xmm1
+ addsd %xmm1, %xmm0
+ movsd 176(%esp), %xmm2
+ mulsd %xmm0, %xmm2
+ movapd %xmm2, %xmm3
+ mulsd %xmm3, %xmm3
+ movapd %xmm3, %xmm4
+ mulsd LCPI1_23, %xmm4
+ addsd LCPI1_24, %xmm4
+ mulsd %xmm3, %xmm4
+ addsd LCPI1_25, %xmm4
+ mulsd %xmm3, %xmm4
+ addsd LCPI1_26, %xmm4
+ mulsd %xmm3, %xmm4
+ addsd LCPI1_27, %xmm4
+ mulsd %xmm3, %xmm4
+ addsd LCPI1_28, %xmm4
+ mulsd %xmm3, %xmm4
+ addsd %xmm1, %xmm4
+ mulsd %xmm2, %xmm4
+ movsd 152(%esp), %xmm1
+ addsd %xmm4, %xmm1
+ movsd %xmm1, 152(%esp)
+ incl %eax
+ cmpl %eax, %esi
+ jge LBB1_15 # bb310
+LBB1_16: # bb358.loopexit
+ movsd 152(%esp), %xmm0
+ addsd %xmm0, %xmm0
+ addsd LCPI1_22, %xmm0
+ movsd %xmm0, 152(%esp)
+
+Rather than spilling the result of the last addsd in the loop, we should have
+insert a copy to split the interval (one for the duration of the loop, one
+extending to the fall through). The register pressure in the loop isn't high
+enough to warrant the spill.
+
+Also check why xmm7 is not used at all in the function.
+
+//===---------------------------------------------------------------------===//
+
+Take the following:
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+target triple = "i386-apple-darwin8"
+@in_exit.4870.b = internal global i1 false ; <i1*> [#uses=2]
+define fastcc void @abort_gzip() noreturn nounwind {
+entry:
+ %tmp.b.i = load i1* @in_exit.4870.b ; <i1> [#uses=1]
+ br i1 %tmp.b.i, label %bb.i, label %bb4.i
+bb.i: ; preds = %entry
+ tail call void @exit( i32 1 ) noreturn nounwind
+ unreachable
+bb4.i: ; preds = %entry
+ store i1 true, i1* @in_exit.4870.b
+ tail call void @exit( i32 1 ) noreturn nounwind
+ unreachable
+}
+declare void @exit(i32) noreturn nounwind
+
+This compiles into:
+_abort_gzip: ## @abort_gzip
+## BB#0: ## %entry
+ subl $12, %esp
+ movb _in_exit.4870.b, %al
+ cmpb $1, %al
+ jne LBB0_2
+
+We somehow miss folding the movb into the cmpb.
+
+//===---------------------------------------------------------------------===//
+
+We compile:
+
+int test(int x, int y) {
+ return x-y-1;
+}
+
+into (-m64):
+
+_test:
+ decl %edi
+ movl %edi, %eax
+ subl %esi, %eax
+ ret
+
+it would be better to codegen as: x+~y (notl+addl)
+
+//===---------------------------------------------------------------------===//
+
+This code:
+
+int foo(const char *str,...)
+{
+ __builtin_va_list a; int x;
+ __builtin_va_start(a,str); x = __builtin_va_arg(a,int); __builtin_va_end(a);
+ return x;
+}
+
+gets compiled into this on x86-64:
+ subq $200, %rsp
+ movaps %xmm7, 160(%rsp)
+ movaps %xmm6, 144(%rsp)
+ movaps %xmm5, 128(%rsp)
+ movaps %xmm4, 112(%rsp)
+ movaps %xmm3, 96(%rsp)
+ movaps %xmm2, 80(%rsp)
+ movaps %xmm1, 64(%rsp)
+ movaps %xmm0, 48(%rsp)
+ movq %r9, 40(%rsp)
+ movq %r8, 32(%rsp)
+ movq %rcx, 24(%rsp)
+ movq %rdx, 16(%rsp)
+ movq %rsi, 8(%rsp)
+ leaq (%rsp), %rax
+ movq %rax, 192(%rsp)
+ leaq 208(%rsp), %rax
+ movq %rax, 184(%rsp)
+ movl $48, 180(%rsp)
+ movl $8, 176(%rsp)
+ movl 176(%rsp), %eax
+ cmpl $47, %eax
+ jbe .LBB1_3 # bb
+.LBB1_1: # bb3
+ movq 184(%rsp), %rcx
+ leaq 8(%rcx), %rax
+ movq %rax, 184(%rsp)
+.LBB1_2: # bb4
+ movl (%rcx), %eax
+ addq $200, %rsp
+ ret
+.LBB1_3: # bb
+ movl %eax, %ecx
+ addl $8, %eax
+ addq 192(%rsp), %rcx
+ movl %eax, 176(%rsp)
+ jmp .LBB1_2 # bb4
+
+gcc 4.3 generates:
+ subq $96, %rsp
+.LCFI0:
+ leaq 104(%rsp), %rax
+ movq %rsi, -80(%rsp)
+ movl $8, -120(%rsp)
+ movq %rax, -112(%rsp)
+ leaq -88(%rsp), %rax
+ movq %rax, -104(%rsp)
+ movl $8, %eax
+ cmpl $48, %eax
+ jb .L6
+ movq -112(%rsp), %rdx
+ movl (%rdx), %eax
+ addq $96, %rsp
+ ret
+ .p2align 4,,10
+ .p2align 3
+.L6:
+ mov %eax, %edx
+ addq -104(%rsp), %rdx
+ addl $8, %eax
+ movl %eax, -120(%rsp)
+ movl (%rdx), %eax
+ addq $96, %rsp
+ ret
+
+and it gets compiled into this on x86:
+ pushl %ebp
+ movl %esp, %ebp
+ subl $4, %esp
+ leal 12(%ebp), %eax
+ movl %eax, -4(%ebp)
+ leal 16(%ebp), %eax
+ movl %eax, -4(%ebp)
+ movl 12(%ebp), %eax
+ addl $4, %esp
+ popl %ebp
+ ret
+
+gcc 4.3 generates:
+ pushl %ebp
+ movl %esp, %ebp
+ movl 12(%ebp), %eax
+ popl %ebp
+ ret
+
+//===---------------------------------------------------------------------===//
+
+Teach tblgen not to check bitconvert source type in some cases. This allows us
+to consolidate the following patterns in X86InstrMMX.td:
+
+def : Pat<(v2i32 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
+ (iPTR 0))))),
+ (v2i32 (MMX_MOVDQ2Qrr VR128:$src))>;
+def : Pat<(v4i16 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
+ (iPTR 0))))),
+ (v4i16 (MMX_MOVDQ2Qrr VR128:$src))>;
+def : Pat<(v8i8 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
+ (iPTR 0))))),
+ (v8i8 (MMX_MOVDQ2Qrr VR128:$src))>;
+
+There are other cases in various td files.
+
+//===---------------------------------------------------------------------===//
+
+Take something like the following on x86-32:
+unsigned a(unsigned long long x, unsigned y) {return x % y;}
+
+We currently generate a libcall, but we really shouldn't: the expansion is
+shorter and likely faster than the libcall. The expected code is something
+like the following:
+
+ movl 12(%ebp), %eax
+ movl 16(%ebp), %ecx
+ xorl %edx, %edx
+ divl %ecx
+ movl 8(%ebp), %eax
+ divl %ecx
+ movl %edx, %eax
+ ret
+
+A similar code sequence works for division.
+
+//===---------------------------------------------------------------------===//
+
+These should compile to the same code, but the later codegen's to useless
+instructions on X86. This may be a trivial dag combine (GCC PR7061):
+
+struct s1 { unsigned char a, b; };
+unsigned long f1(struct s1 x) {
+ return x.a + x.b;
+}
+struct s2 { unsigned a: 8, b: 8; };
+unsigned long f2(struct s2 x) {
+ return x.a + x.b;
+}
+
+//===---------------------------------------------------------------------===//
+
+We currently compile this:
+
+define i32 @func1(i32 %v1, i32 %v2) nounwind {
+entry:
+ %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
+ %sum = extractvalue {i32, i1} %t, 0
+ %obit = extractvalue {i32, i1} %t, 1
+ br i1 %obit, label %overflow, label %normal
+normal:
+ ret i32 %sum
+overflow:
+ call void @llvm.trap()
+ unreachable
+}
+declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32)
+declare void @llvm.trap()
+
+to:
+
+_func1:
+ movl 4(%esp), %eax
+ addl 8(%esp), %eax
+ jo LBB1_2 ## overflow
+LBB1_1: ## normal
+ ret
+LBB1_2: ## overflow
+ ud2
+
+it would be nice to produce "into" someday.
//===---------------------------------------------------------------------===//
+This code:
+
+void vec_mpys1(int y[], const int x[], int scaler) {
+int i;
+for (i = 0; i < 150; i++)
+ y[i] += (((long long)scaler * (long long)x[i]) >> 31);
+}
+
+Compiles to this loop with GCC 3.x:
+
+.L5:
+ movl %ebx, %eax
+ imull (%edi,%ecx,4)
+ shrdl $31, %edx, %eax
+ addl %eax, (%esi,%ecx,4)
+ incl %ecx
+ cmpl $149, %ecx
+ jle .L5
+
+llvm-gcc compiles it to the much uglier:
+
+LBB1_1: ## bb1
+ movl 24(%esp), %eax
+ movl (%eax,%edi,4), %ebx
+ movl %ebx, %ebp
+ imull %esi, %ebp
+ movl %ebx, %eax
+ mull %ecx
+ addl %ebp, %edx
+ sarl $31, %ebx
+ imull %ecx, %ebx
+ addl %edx, %ebx
+ shldl $1, %eax, %ebx
+ movl 20(%esp), %eax
+ addl %ebx, (%eax,%edi,4)
+ incl %edi
+ cmpl $150, %edi
+ jne LBB1_1 ## bb1
+
+The issue is that we hoist the cast of "scaler" to long long outside of the
+loop, the value comes into the loop as two values, and
+RegsForValue::getCopyFromRegs doesn't know how to put an AssertSext on the
+constructed BUILD_PAIR which represents the cast value.
+
+//===---------------------------------------------------------------------===//
+
+Test instructions can be eliminated by using EFLAGS values from arithmetic
+instructions. This is currently not done for mul, and, or, xor, neg, shl,
+sra, srl, shld, shrd, atomic ops, and others. It is also currently not done
+for read-modify-write instructions. It is also current not done if the
+OF or CF flags are needed.
+
+The shift operators have the complication that when the shift count is
+zero, EFLAGS is not set, so they can only subsume a test instruction if
+the shift count is known to be non-zero. Also, using the EFLAGS value
+from a shift is apparently very slow on some x86 implementations.
+
+In read-modify-write instructions, the root node in the isel match is
+the store, and isel has no way for the use of the EFLAGS result of the
+arithmetic to be remapped to the new node.
+
+Add and subtract instructions set OF on signed overflow and CF on unsiged
+overflow, while test instructions always clear OF and CF. In order to
+replace a test with an add or subtract in a situation where OF or CF is
+needed, codegen must be able to prove that the operation cannot see
+signed or unsigned overflow, respectively.
+
+//===---------------------------------------------------------------------===//
+
+memcpy/memmove do not lower to SSE copies when possible. A silly example is:
+define <16 x float> @foo(<16 x float> %A) nounwind {
+ %tmp = alloca <16 x float>, align 16
+ %tmp2 = alloca <16 x float>, align 16
+ store <16 x float> %A, <16 x float>* %tmp
+ %s = bitcast <16 x float>* %tmp to i8*
+ %s2 = bitcast <16 x float>* %tmp2 to i8*
+ call void @llvm.memcpy.i64(i8* %s, i8* %s2, i64 64, i32 16)
+ %R = load <16 x float>* %tmp2
+ ret <16 x float> %R
+}
+
+declare void @llvm.memcpy.i64(i8* nocapture, i8* nocapture, i64, i32) nounwind
+
+which compiles to:
+
+_foo:
+ subl $140, %esp
+ movaps %xmm3, 112(%esp)
+ movaps %xmm2, 96(%esp)
+ movaps %xmm1, 80(%esp)
+ movaps %xmm0, 64(%esp)
+ movl 60(%esp), %eax
+ movl %eax, 124(%esp)
+ movl 56(%esp), %eax
+ movl %eax, 120(%esp)
+ movl 52(%esp), %eax
+ <many many more 32-bit copies>
+ movaps (%esp), %xmm0
+ movaps 16(%esp), %xmm1
+ movaps 32(%esp), %xmm2
+ movaps 48(%esp), %xmm3
+ addl $140, %esp
+ ret
+
+On Nehalem, it may even be cheaper to just use movups when unaligned than to
+fall back to lower-granularity chunks.
+
+//===---------------------------------------------------------------------===//
+
+Implement processor-specific optimizations for parity with GCC on these
+processors. GCC does two optimizations:
+
+1. ix86_pad_returns inserts a noop before ret instructions if immediately
+ preceeded by a conditional branch or is the target of a jump.
+2. ix86_avoid_jump_misspredicts inserts noops in cases where a 16-byte block of
+ code contains more than 3 branches.
+
+The first one is done for all AMDs, Core2, and "Generic"
+The second one is done for: Atom, Pentium Pro, all AMDs, Pentium 4, Nocona,
+ Core 2, and "Generic"
+
+//===---------------------------------------------------------------------===//
+
+Testcase:
+int a(int x) { return (x & 127) > 31; }
+
+Current output:
+ movl 4(%esp), %eax
+ andl $127, %eax
+ cmpl $31, %eax
+ seta %al
+ movzbl %al, %eax
+ ret
+
+Ideal output:
+ xorl %eax, %eax
+ testl $96, 4(%esp)
+ setne %al
+ ret
+
+This should definitely be done in instcombine, canonicalizing the range
+condition into a != condition. We get this IR:
+
+define i32 @a(i32 %x) nounwind readnone {
+entry:
+ %0 = and i32 %x, 127 ; <i32> [#uses=1]
+ %1 = icmp ugt i32 %0, 31 ; <i1> [#uses=1]
+ %2 = zext i1 %1 to i32 ; <i32> [#uses=1]
+ ret i32 %2
+}
+
+Instcombine prefers to strength reduce relational comparisons to equality
+comparisons when possible, this should be another case of that. This could
+be handled pretty easily in InstCombiner::visitICmpInstWithInstAndIntCst, but it
+looks like InstCombiner::visitICmpInstWithInstAndIntCst should really already
+be redesigned to use ComputeMaskedBits and friends.
+
+
+//===---------------------------------------------------------------------===//
+Testcase:
+int x(int a) { return (a&0xf0)>>4; }
+
+Current output:
+ movl 4(%esp), %eax
+ shrl $4, %eax
+ andl $15, %eax
+ ret
+
+Ideal output:
+ movzbl 4(%esp), %eax
+ shrl $4, %eax
+ ret
+
+//===---------------------------------------------------------------------===//
+
+Testcase:
+int x(int a) { return (a & 0x80) ? 0x100 : 0; }
+int y(int a) { return (a & 0x80) *2; }
+
+Current:
+ testl $128, 4(%esp)
+ setne %al
+ movzbl %al, %eax
+ shll $8, %eax
+ ret
+
+Better:
+ movl 4(%esp), %eax
+ addl %eax, %eax
+ andl $256, %eax
+ ret
+
+This is another general instcombine transformation that is profitable on all
+targets. In LLVM IR, these functions look like this:
+
+define i32 @x(i32 %a) nounwind readnone {
+entry:
+ %0 = and i32 %a, 128
+ %1 = icmp eq i32 %0, 0
+ %iftmp.0.0 = select i1 %1, i32 0, i32 256
+ ret i32 %iftmp.0.0
+}
+
+define i32 @y(i32 %a) nounwind readnone {
+entry:
+ %0 = shl i32 %a, 1
+ %1 = and i32 %0, 256
+ ret i32 %1
+}
+
+Replacing an icmp+select with a shift should always be considered profitable in
+instcombine.
+
+//===---------------------------------------------------------------------===//
+
+Re-implement atomic builtins __sync_add_and_fetch() and __sync_sub_and_fetch
+properly.
+
+When the return value is not used (i.e. only care about the value in the
+memory), x86 does not have to use add to implement these. Instead, it can use
+add, sub, inc, dec instructions with the "lock" prefix.
+
+This is currently implemented using a bit of instruction selection trick. The
+issue is the target independent pattern produces one output and a chain and we
+want to map it into one that just output a chain. The current trick is to select
+it into a MERGE_VALUES with the first definition being an implicit_def. The
+proper solution is to add new ISD opcodes for the no-output variant. DAG
+combiner can then transform the node before it gets to target node selection.
+
+Problem #2 is we are adding a whole bunch of x86 atomic instructions when in
+fact these instructions are identical to the non-lock versions. We need a way to
+add target specific information to target nodes and have this information
+carried over to machine instructions. Asm printer (or JIT) can use this
+information to add the "lock" prefix.
+
+//===---------------------------------------------------------------------===//
+
+_Bool bar(int *x) { return *x & 1; }
+
+define zeroext i1 @bar(i32* nocapture %x) nounwind readonly {
+entry:
+ %tmp1 = load i32* %x ; <i32> [#uses=1]
+ %and = and i32 %tmp1, 1 ; <i32> [#uses=1]
+ %tobool = icmp ne i32 %and, 0 ; <i1> [#uses=1]
+ ret i1 %tobool
+}
+
+bar: # @bar
+# BB#0: # %entry
+ movl 4(%esp), %eax
+ movb (%eax), %al
+ andb $1, %al
+ movzbl %al, %eax
+ ret
+
+Missed optimization: should be movl+andl.
+
+//===---------------------------------------------------------------------===//
+
+Consider the following two functions compiled with clang:
+_Bool foo(int *x) { return !(*x & 4); }
+unsigned bar(int *x) { return !(*x & 4); }
+
+foo:
+ movl 4(%esp), %eax
+ testb $4, (%eax)
+ sete %al
+ movzbl %al, %eax
+ ret
+
+bar:
+ movl 4(%esp), %eax
+ movl (%eax), %eax
+ shrl $2, %eax
+ andl $1, %eax
+ xorl $1, %eax
+ ret
+
+The second function generates more code even though the two functions are
+are functionally identical.
+
+//===---------------------------------------------------------------------===//
+
+Take the following C code:
+int x(int y) { return (y & 63) << 14; }
+
+Code produced by gcc:
+ andl $63, %edi
+ sall $14, %edi
+ movl %edi, %eax
+ ret
+
+Code produced by clang:
+ shll $14, %edi
+ movl %edi, %eax
+ andl $1032192, %eax
+ ret
+
+The code produced by gcc is 3 bytes shorter. This sort of construct often
+shows up with bitfields.
+
+//===---------------------------------------------------------------------===//