Trampoline support for x86-64. This looks like

[oota-llvm.git] / lib / Target / X86 / README.txt
diff --git a/lib/Target/X86/README.txt b/lib/Target/X86/README.txt

index 110f399484bfe48b45455cf218b9a1384c5a699b..e9f0d7338b3857dc5018d7e4ee4d97a5631844a1 100644 (file)
--- a/lib/Target/X86/README.txt
+++ b/lib/Target/X86/README.txt
@@ -2,21 +2,19 @@
  // Random ideas for the X86 backend.
  //===---------------------------------------------------------------------===//
  
-Add a MUL2U and MUL2S nodes to represent a multiply that returns both the
-Hi and Lo parts (combination of MUL and MULH[SU] into one node).  Add this to
-X86, & make the dag combiner produce it when needed.  This will eliminate one
-imul from the code generated for:
+Missing features:
+  - Support for SSE4: http://www.intel.com/software/penryn
+http://softwarecommunity.intel.com/isn/Downloads/Intel%20SSE4%20Programming%20Reference.pdf
+  - support for 3DNow!
+  - weird abis?
  
-long long test(long long X, long long Y) { return X*Y; }
-
-by using the EAX result from the mul.  We should add a similar node for
-DIVREM.
-
-another case is:
-
-long long test(int X, int Y) { return (long long)X*Y; }
+//===---------------------------------------------------------------------===//
  
-... which should only be one imul instruction.
+CodeGen/X86/lea-3.ll:test3 should be a single LEA, not a shift/move.  The X86
+backend knows how to three-addressify this shift, but it appears the register
+allocator isn't even asking it to do so in this case.  We should investigate
+why this isn't happening; it could have a significant impact on other important
+cases for X86 as well.
  
  //===---------------------------------------------------------------------===//
  
@@ -45,6 +43,20 @@ http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html
  
  Another useful one would be  ~0ULL >> X and ~0ULL << X.
  
+One better solution for 1LL << x is:
+        xorl    %eax, %eax
+        xorl    %edx, %edx
+        testb   $32, %cl
+        sete    %al
+        setne   %dl
+        sall    %cl, %eax
+        sall    %cl, %edx
+
+But that requires good 8-bit subreg support.
+
+64-bit shifts (in general) expand to really bad code.  Instead of using
+cmovs, we should expand to a conditional branch like GCC produces.
+
  //===---------------------------------------------------------------------===//
  
  Compile this:
@@ -77,6 +89,8 @@ Should we promote i16 to i32 to avoid partial register update stalls?
  
  Leave any_extend as pseudo instruction and hint to register
  allocator. Delay codegen until post register allocation.
+Note: any_extend is now turned into an INSERT_SUBREG. We still need to teach
+the coalescer how to deal with it though.
  
  //===---------------------------------------------------------------------===//
  
@@ -97,11 +111,18 @@ ctz:
  however, check that these are defined for 0 and 32.  Our intrinsics are, GCC's
  aren't.
  
+Another example (use predsimplify to eliminate a select):
+
+int foo (unsigned long j) {
+  if (j)
+    return __builtin_ffs (j) - 1;
+  else
+    return 0;
+}
+
  //===---------------------------------------------------------------------===//
  
-Use push/pop instructions in prolog/epilog sequences instead of stores off 
-ESP (certain code size win, perf win on some [which?] processors).
-Also, it appears icc use push for parameter passing. Need to investigate.
+It appears icc uses push for parameter passing. Need to investigate.
  
  //===---------------------------------------------------------------------===//
  
@@ -215,17 +236,6 @@ which is probably slower, but it's interesting at least :)
  
  //===---------------------------------------------------------------------===//
  
-Should generate min/max for stuff like:
-
-void minf(float a, float b, float *X) {
-  *X = a <= b ? a : b;
-}
-
-Make use of floating point min / max instructions. Perhaps introduce ISD::FMIN
-and ISD::FMAX node types?
-
-//===---------------------------------------------------------------------===//
-
  The first BB of this code:
  
  declare bool %foo()
@@ -252,10 +262,6 @@ It would be better to emit "cmp %al, 1" than a xor and test.
  
  //===---------------------------------------------------------------------===//
  
-Enable X86InstrInfo::convertToThreeAddress().
-
-//===---------------------------------------------------------------------===//
-
  We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and rep/movsl
  We should leave these as libcalls for everything over a much lower threshold,
  since libc is hand tuned for medium and large mem ops (avoiding RFO for large
@@ -324,6 +330,45 @@ lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesor.
  
  //===---------------------------------------------------------------------===//
  
+We are generating far worse code than gcc:
+
+volatile short X, Y;
+
+void foo(int N) {
+  int i;
+  for (i = 0; i < N; i++) { X = i; Y = i*4; }
+}
+
+LBB1_1:        # entry.bb_crit_edge
+       xorl    %ecx, %ecx
+       xorw    %dx, %dx
+LBB1_2:        # bb
+       movl    L_X$non_lazy_ptr, %esi
+       movw    %cx, (%esi)
+       movl    L_Y$non_lazy_ptr, %esi
+       movw    %dx, (%esi)
+       addw    $4, %dx
+       incl    %ecx
+       cmpl    %eax, %ecx
+       jne     LBB1_2  # bb
+
+vs.
+
+       xorl    %edx, %edx
+       movl    L_X$non_lazy_ptr-"L00000000001$pb"(%ebx), %esi
+       movl    L_Y$non_lazy_ptr-"L00000000001$pb"(%ebx), %ecx
+L4:
+       movw    %dx, (%esi)
+       leal    0(,%edx,4), %eax
+       movw    %ax, (%ecx)
+       addl    $1, %edx
+       cmpl    %edx, %edi
+       jne     L4
+
+This is due to the lack of post regalloc LICM.
+
+//===---------------------------------------------------------------------===//
+
  Teach the coalescer to coalesce vregs of different register classes. e.g. FR32 /
  FR64 to VR128.
  
@@ -390,21 +435,6 @@ require a copy to be inserted (in X86InstrInfo::convertToThreeAddress).
  
  //===---------------------------------------------------------------------===//
  
-Bad codegen:
-
-char foo(int x) { return x; }
-
-_foo:
-       movl 4(%esp), %eax
-       shll $24, %eax
-       sarl $24, %eax
-       ret
-
-SIGN_EXTEND_INREG can be implemented as (sext (trunc)) to take advantage of 
-sub-registers.
-
-//===---------------------------------------------------------------------===//
-
  Consider this:
  
  typedef struct pair { float A, B; } pair;
@@ -415,16 +445,13 @@ void pairtest(pair P, float *FP) {
  We currently generate this code with llvmgcc4:
  
  _pairtest:
-        subl $12, %esp
-        movl 20(%esp), %eax
-        movl %eax, 4(%esp)
-        movl 16(%esp), %eax
-        movl %eax, (%esp)
-        movss (%esp), %xmm0
-        addss 4(%esp), %xmm0
-        movl 24(%esp), %eax
-        movss %xmm0, (%eax)
-        addl $12, %esp
+        movl 8(%esp), %eax
+        movl 4(%esp), %ecx
+        movd %eax, %xmm0
+        movd %ecx, %xmm1
+        addss %xmm0, %xmm1
+        movl 12(%esp), %eax
+        movss %xmm1, (%eax)
          ret
  
  we should be able to generate:
@@ -440,6 +467,10 @@ integer chunks.  It does this so that structs like {short,short} are passed in
  a single 32-bit integer stack slot.  We should handle the safe cases above much
  nicer, while still handling the hard cases.
  
+While true in general, in this specific case we could do better by promoting
+load int + bitcast to float -> load float.  This basically needs alignment info,
+the code is already implemented (but disabled) in dag combine.
+
  //===---------------------------------------------------------------------===//
  
  Another instruction selector deficiency:
@@ -463,22 +494,6 @@ the load's chain result is read by the callseq_start.
  
  //===---------------------------------------------------------------------===//
  
-Don't forget to find a way to squash noop truncates in the JIT environment.
-
-//===---------------------------------------------------------------------===//
-
-Implement anyext in the same manner as truncate that would allow them to be
-eliminated.
-
-//===---------------------------------------------------------------------===//
-
-How about implementing truncate / anyext as a property of machine instruction
-operand? i.e. Print as 32-bit super-class register / 16-bit sub-class register.
-Do this for the cases where a truncate / anyext is guaranteed to be eliminated.
-For IA32 that is truncate from 32 to 16 and anyext from 16 to 32.
-
-//===---------------------------------------------------------------------===//
-
  For this:
  
  int test(int a)
@@ -506,7 +521,25 @@ shorter than movl + leal.
  
  //===---------------------------------------------------------------------===//
  
-Implement CTTZ, CTLZ with bsf and bsr.
+Implement CTTZ, CTLZ with bsf and bsr. GCC produces:
+
+int ctz_(unsigned X) { return __builtin_ctz(X); }
+int clz_(unsigned X) { return __builtin_clz(X); }
+int ffs_(unsigned X) { return __builtin_ffs(X); }
+
+_ctz_:
+        bsfl    4(%esp), %eax
+        ret
+_clz_:
+        bsrl    4(%esp), %eax
+        xorl    $31, %eax
+        ret
+_ffs_:
+        movl    $-1, %edx
+        bsfl    4(%esp), %eax
+        cmove   %edx, %eax
+        addl    $1, %eax
+        ret
  
  //===---------------------------------------------------------------------===//
  
@@ -518,18 +551,14 @@ do not make use of.
  
  //===---------------------------------------------------------------------===//
  
-We should handle __attribute__ ((__visibility__ ("hidden"))).
-
-//===---------------------------------------------------------------------===//
-
  int %foo(int* %a, int %t) {
  entry:
          br label %cond_true
  
  cond_true:              ; preds = %cond_true, %entry
-        %x.0.0 = phi int [ 0, %entry ], [ %tmp9, %cond_true ]           ; <int> [#uses=3]
-        %t_addr.0.0 = phi int [ %t, %entry ], [ %tmp7, %cond_true ]             ; <int> [#uses=1]
-        %tmp2 = getelementptr int* %a, int %x.0.0               ; <int*> [#uses=1]
+        %x.0.0 = phi int [ 0, %entry ], [ %tmp9, %cond_true ]  
+        %t_addr.0.0 = phi int [ %t, %entry ], [ %tmp7, %cond_true ]
+        %tmp2 = getelementptr int* %a, int %x.0.0              
          %tmp3 = load int* %tmp2         ; <int> [#uses=1]
          %tmp5 = add int %t_addr.0.0, %x.0.0             ; <int> [#uses=1]
          %tmp7 = add int %tmp5, %tmp3            ; <int> [#uses=2]
@@ -545,10 +574,6 @@ is pessimized by -loop-reduce and -indvars
  
  //===---------------------------------------------------------------------===//
  
-Use cpuid to auto-detect CPU features such as SSE, SSE2, and SSE3.
-
-//===---------------------------------------------------------------------===//
-
  u32 to float conversion improvement:
  
  float uint32_2_float( unsigned u ) {
@@ -596,49 +621,961 @@ jz label
  
  //===---------------------------------------------------------------------===//
  
-Compile:
-int %test(ulong *%tmp) {
-        %tmp = load ulong* %tmp         ; <ulong> [#uses=1]
-        %tmp.mask = shr ulong %tmp, ubyte 50            ; <ulong> [#uses=1]
-        %tmp.mask = cast ulong %tmp.mask to ubyte               ; <ubyte> [#uses=1]
-        %tmp2 = and ubyte %tmp.mask, 3          ; <ubyte> [#uses=1]
-        %tmp2 = cast ubyte %tmp2 to int         ; <int> [#uses=1]
-        ret int %tmp2
+GCC's ix86_expand_int_movcc function (in i386.c) has a ton of interesting
+simplifications for integer "x cmp y ? a : b".  For example, instead of:
+
+int G;
+void f(int X, int Y) {
+  G = X < 0 ? 14 : 13;
  }
  
+compiling to:
+
+_f:
+        movl $14, %eax
+        movl $13, %ecx
+        movl 4(%esp), %edx
+        testl %edx, %edx
+        cmovl %eax, %ecx
+        movl %ecx, _G
+        ret
+
+it could be:
+_f:
+        movl    4(%esp), %eax
+        sarl    $31, %eax
+        notl    %eax
+        addl    $14, %eax
+        movl    %eax, _G
+        ret
+
+etc.
+
+Another is:
+int usesbb(unsigned int a, unsigned int b) {
+       return (a < b ? -1 : 0);
+}
  to:
+_usesbb:
+       movl    8(%esp), %eax
+       cmpl    %eax, 4(%esp)
+       sbbl    %eax, %eax
+       ret
  
-_test:
+instead of:
+_usesbb:
+       xorl    %eax, %eax
+       movl    8(%esp), %ecx
+       cmpl    %ecx, 4(%esp)
+       movl    $4294967295, %ecx
+       cmovb   %ecx, %eax
+       ret
+
+//===---------------------------------------------------------------------===//
+
+Currently we don't have elimination of redundant stack manipulations. Consider
+the code:
+
+int %main() {
+entry:
+       call fastcc void %test1( )
+       call fastcc void %test2( sbyte* cast (void ()* %test1 to sbyte*) )
+       ret int 0
+}
+
+declare fastcc void %test1()
+
+declare fastcc void %test2(sbyte*)
+
+
+This currently compiles to:
+
+       subl $16, %esp
+       call _test5
+       addl $12, %esp
+       subl $16, %esp
+       movl $_test5, (%esp)
+       call _test6
+       addl $12, %esp
+
+The add/sub pair is really unneeded here.
+
+//===---------------------------------------------------------------------===//
+
+Consider the expansion of:
+
+uint %test3(uint %X) {
+        %tmp1 = rem uint %X, 255
+        ret uint %tmp1
+}
+
+Currently it compiles to:
+
+...
+        movl $2155905153, %ecx
+        movl 8(%esp), %esi
+        movl %esi, %eax
+        mull %ecx
+...
+
+This could be "reassociated" into:
+
+        movl $2155905153, %eax
+        movl 8(%esp), %ecx
+        mull %ecx
+
+to avoid the copy.  In fact, the existing two-address stuff would do this
+except that mul isn't a commutative 2-addr instruction.  I guess this has
+to be done at isel time based on the #uses to mul?
+
+//===---------------------------------------------------------------------===//
+
+Make sure the instruction which starts a loop does not cross a cacheline
+boundary. This requires knowing the exact length of each machine instruction.
+That is somewhat complicated, but doable. Example 256.bzip2:
+
+In the new trace, the hot loop has an instruction which crosses a cacheline
+boundary.  In addition to potential cache misses, this can't help decoding as I
+imagine there has to be some kind of complicated decoder reset and realignment
+to grab the bytes from the next cacheline.
+
+532  532 0x3cfc movb     (1809(%esp, %esi), %bl   <<<--- spans 2 64 byte lines
+942  942 0x3d03 movl     %dh, (1809(%esp, %esi)                                                                          
+937  937 0x3d0a incl     %esi                           
+3    3   0x3d0b cmpb     %bl, %dl                                               
+27   27  0x3d0d jnz      0x000062db <main+11707>
+
+//===---------------------------------------------------------------------===//
+
+In c99 mode, the preprocessor doesn't like assembly comments like #TRUNCATE.
+
+//===---------------------------------------------------------------------===//
+
+This could be a single 16-bit load.
+
+int f(char *p) {
+    if ((p[0] == 1) & (p[1] == 2)) return 1;
+    return 0;
+}
+
+//===---------------------------------------------------------------------===//
+
+We should inline lrintf and probably other libc functions.
+
+//===---------------------------------------------------------------------===//
+
+Start using the flags more.  For example, compile:
+
+int add_zf(int *x, int y, int a, int b) {
+     if ((*x += y) == 0)
+          return a;
+     else
+          return b;
+}
+
+to:
+       addl    %esi, (%rdi)
+       movl    %edx, %eax
+       cmovne  %ecx, %eax
+       ret
+instead of:
+
+_add_zf:
+        addl (%rdi), %esi
+        movl %esi, (%rdi)
+        testl %esi, %esi
+        cmove %edx, %ecx
+        movl %ecx, %eax
+        ret
+
+and:
+
+int add_zf(int *x, int y, int a, int b) {
+     if ((*x + y) < 0)
+          return a;
+     else
+          return b;
+}
+
+to:
+
+add_zf:
+        addl    (%rdi), %esi
+        movl    %edx, %eax
+        cmovns  %ecx, %eax
+        ret
+
+instead of:
+
+_add_zf:
+        addl (%rdi), %esi
+        testl %esi, %esi
+        cmovs %edx, %ecx
+        movl %ecx, %eax
+        ret
+
+//===---------------------------------------------------------------------===//
+
+These two functions have identical effects:
+
+unsigned int f(unsigned int i, unsigned int n) {++i; if (i == n) ++i; return i;}
+unsigned int f2(unsigned int i, unsigned int n) {++i; i += i == n; return i;}
+
+We currently compile them to:
+
+_f:
+        movl 4(%esp), %eax
+        movl %eax, %ecx
+        incl %ecx
+        movl 8(%esp), %edx
+        cmpl %edx, %ecx
+        jne LBB1_2      #UnifiedReturnBlock
+LBB1_1: #cond_true
+        addl $2, %eax
+        ret
+LBB1_2: #UnifiedReturnBlock
+        movl %ecx, %eax
+        ret
+_f2:
          movl 4(%esp), %eax
-        movl 4(%eax), %eax
-        shrl $18, %eax
-        andl $3, %eax
+        movl %eax, %ecx
+        incl %ecx
+        cmpl 8(%esp), %ecx
+        sete %cl
+        movzbl %cl, %ecx
+        leal 1(%ecx,%eax), %eax
+        ret
+
+both of which are inferior to GCC's:
+
+_f:
+        movl    4(%esp), %edx
+        leal    1(%edx), %eax
+        addl    $2, %edx
+        cmpl    8(%esp), %eax
+        cmove   %edx, %eax
+        ret
+_f2:
+        movl    4(%esp), %eax
+        addl    $1, %eax
+        xorl    %edx, %edx
+        cmpl    8(%esp), %eax
+        sete    %dl
+        addl    %edx, %eax
+        ret
+
+//===---------------------------------------------------------------------===//
+
+This code:
+
+void test(int X) {
+  if (X) abort();
+}
+
+is currently compiled to:
+
+_test:
+        subl $12, %esp
+        cmpl $0, 16(%esp)
+        jne LBB1_1
+        addl $12, %esp
+        ret
+LBB1_1:
+        call L_abort$stub
+
+It would be better to produce:
+
+_test:
+        subl $12, %esp
+        cmpl $0, 16(%esp)
+        jne L_abort$stub
+        addl $12, %esp
+        ret
+
+This can be applied to any no-return function call that takes no arguments etc.
+Alternatively, the stack save/restore logic could be shrink-wrapped, producing
+something like this:
+
+_test:
+        cmpl $0, 4(%esp)
+        jne LBB1_1
+        ret
+LBB1_1:
+        subl $12, %esp
+        call L_abort$stub
+
+Both are useful in different situations.  Finally, it could be shrink-wrapped
+and tail called, like this:
+
+_test:
+        cmpl $0, 4(%esp)
+        jne LBB1_1
+        ret
+LBB1_1:
+        pop %eax   # realign stack.
+        call L_abort$stub
+
+Though this probably isn't worth it.
+
+//===---------------------------------------------------------------------===//
+
+We need to teach the codegen to convert two-address INC instructions to LEA
+when the flags are dead (likewise dec).  For example, on X86-64, compile:
+
+int foo(int A, int B) {
+  return A+1;
+}
+
+to:
+
+_foo:
+        leal    1(%edi), %eax
          ret
  
  instead of:
  
+_foo:
+        incl %edi
+        movl %edi, %eax
+        ret
+
+Another example is:
+
+;; X's live range extends beyond the shift, so the register allocator
+;; cannot coalesce it with Y.  Because of this, a copy needs to be
+;; emitted before the shift to save the register value before it is
+;; clobbered.  However, this copy is not needed if the register
+;; allocator turns the shift into an LEA.  This also occurs for ADD.
+
+; Check that the shift gets turned into an LEA.
+; RUN: llvm-upgrade < %s | llvm-as | llc -march=x86 -x86-asm-syntax=intel | \
+; RUN:   not grep {mov E.X, E.X}
+
+%G = external global int
+
+int %test1(int %X, int %Y) {
+        %Z = add int %X, %Y
+        volatile store int %Y, int* %G
+        volatile store int %Z, int* %G
+        ret int %X
+}
+
+int %test2(int %X) {
+        %Z = add int %X, 1  ;; inc
+        volatile store int %Z, int* %G
+        ret int %X
+}
+
+//===---------------------------------------------------------------------===//
+
+Sometimes it is better to codegen subtractions from a constant (e.g. 7-x) with
+a neg instead of a sub instruction.  Consider:
+
+int test(char X) { return 7-X; }
+
+we currently produce:
  _test:
-        movl 4(%esp), %eax
-        movl 4(%eax), %eax
-        shrl $18, %eax
-        # TRUNCATE movb %al, %al
-        andb $3, %al
-        movzbl %al, %eax
+        movl $7, %eax
+        movsbl 4(%esp), %ecx
+        subl %ecx, %eax
          ret
  
-This saves a movzbl, and saves a truncate if it doesn't get coallesced right.
-This is a simple DAGCombine to propagate the zext through the and.
+We would use one fewer register if codegen'd as:
+
+        movsbl 4(%esp), %eax
+       neg %eax
+        add $7, %eax
+        ret
+
+Note that this isn't beneficial if the load can be folded into the sub.  In
+this case, we want a sub:
+
+int test(int X) { return 7-X; }
+_test:
+        movl $7, %eax
+        subl 4(%esp), %eax
+        ret
  
  //===---------------------------------------------------------------------===//
  
-Instead of:
+This is a "commutable two-address" register coalescing deficiency:
+
+define <4 x float> @test1(<4 x float> %V) {
+entry:
+        %tmp8 = shufflevector <4 x float> %V, <4 x float> undef,
+                                        <4 x i32> < i32 3, i32 2, i32 1, i32 0 >
+        %add = add <4 x float> %tmp8, %V
+        ret <4 x float> %add
+}
+
+this codegens to:
+
+_test1:
+        pshufd  $27, %xmm0, %xmm1
+        addps   %xmm0, %xmm1
+        movaps  %xmm1, %xmm0
+        ret
+
+instead of:
+
+_test1:
+        pshufd  $27, %xmm0, %xmm1
+        addps   %xmm1, %xmm0
+        ret
+
+//===---------------------------------------------------------------------===//
+
+Leaf functions that require one 4-byte spill slot have a prolog like this:
+
+_foo:
+        pushl   %esi
+        subl    $4, %esp
+...
+and an epilog like this:
+        addl    $4, %esp
+        popl    %esi
+        ret
+
+It would be smaller, and potentially faster, to push eax on entry and to
+pop into a dummy register instead of using addl/subl of esp.  Just don't pop 
+into any return registers :)
+
+//===---------------------------------------------------------------------===//
+
+The X86 backend should fold (branch (or (setcc, setcc))) into multiple 
+branches.  We generate really poor code for:
+
+double testf(double a) {
+       return a == 0.0 ? 0.0 : (a > 0.0 ? 1.0 : -1.0);
+}
+
+For example, the entry BB is:
+
+_testf:
+        subl    $20, %esp
+        pxor    %xmm0, %xmm0
+        movsd   24(%esp), %xmm1
+        ucomisd %xmm0, %xmm1
+        setnp   %al
+        sete    %cl
+        testb   %cl, %al
+        jne     LBB1_5  # UnifiedReturnBlock
+LBB1_1: # cond_true
+
+
+it would be better to replace the last four instructions with:
+
+       jp LBB1_1
+       je LBB1_5
+LBB1_1:
+
+We also codegen the inner ?: into a diamond:
+
+       cvtss2sd        LCPI1_0(%rip), %xmm2
+        cvtss2sd        LCPI1_1(%rip), %xmm3
+        ucomisd %xmm1, %xmm0
+        ja      LBB1_3  # cond_true
+LBB1_2: # cond_true
+        movapd  %xmm3, %xmm2
+LBB1_3: # cond_true
+        movapd  %xmm2, %xmm0
+        ret
+
+We should sink the load into xmm3 into the LBB1_2 block.  This should
+be pretty easy, and will nuke all the copies.
+
+//===---------------------------------------------------------------------===//
+
+This:
+        #include <algorithm>
+        inline std::pair<unsigned, bool> full_add(unsigned a, unsigned b)
+        { return std::make_pair(a + b, a + b < a); }
+        bool no_overflow(unsigned a, unsigned b)
+        { return !full_add(a, b).second; }
+
+Should compile to:
+
  
-       cmpl $4294967295, %edx
-       jg LBB1_8       #cond_false49
+        _Z11no_overflowjj:
+                addl    %edi, %esi
+                setae   %al
+                ret
  
-emit:
+on x86-64, not:
+
+__Z11no_overflowjj:
+        addl    %edi, %esi
+        cmpl    %edi, %esi
+        setae   %al
+        movzbl  %al, %eax
+        ret
+
+
+//===---------------------------------------------------------------------===//
+
+Re-materialize MOV32r0 etc. with xor instead of changing them to moves if the
+condition register is dead. xor reg reg is shorter than mov reg, #0.
+
+//===---------------------------------------------------------------------===//
+
+We aren't matching RMW instructions aggressively
+enough.  Here's a reduced testcase (more in PR1160):
+
+define void @test(i32* %huge_ptr, i32* %target_ptr) {
+        %A = load i32* %huge_ptr                ; <i32> [#uses=1]
+        %B = load i32* %target_ptr              ; <i32> [#uses=1]
+        %C = or i32 %A, %B              ; <i32> [#uses=1]
+        store i32 %C, i32* %target_ptr
+        ret void
+}
+
+$ llvm-as < t.ll | llc -march=x86-64
+
+_test:
+        movl (%rdi), %eax
+        orl (%rsi), %eax
+        movl %eax, (%rsi)
+        ret
+
+That should be something like:
+
+_test:
+        movl (%rdi), %eax
+        orl %eax, (%rsi)
+        ret
+
+//===---------------------------------------------------------------------===//
+
+The following code:
+
+bb114.preheader:               ; preds = %cond_next94
+       %tmp231232 = sext i16 %tmp62 to i32             ; <i32> [#uses=1]
+       %tmp233 = sub i32 32, %tmp231232                ; <i32> [#uses=1]
+       %tmp245246 = sext i16 %tmp65 to i32             ; <i32> [#uses=1]
+       %tmp252253 = sext i16 %tmp68 to i32             ; <i32> [#uses=1]
+       %tmp254 = sub i32 32, %tmp252253                ; <i32> [#uses=1]
+       %tmp553554 = bitcast i16* %tmp37 to i8*         ; <i8*> [#uses=2]
+       %tmp583584 = sext i16 %tmp98 to i32             ; <i32> [#uses=1]
+       %tmp585 = sub i32 32, %tmp583584                ; <i32> [#uses=1]
+       %tmp614615 = sext i16 %tmp101 to i32            ; <i32> [#uses=1]
+       %tmp621622 = sext i16 %tmp104 to i32            ; <i32> [#uses=1]
+       %tmp623 = sub i32 32, %tmp621622                ; <i32> [#uses=1]
+       br label %bb114
+
+produces:
+
+LBB3_5:        # bb114.preheader
+       movswl  -68(%ebp), %eax
+       movl    $32, %ecx
+       movl    %ecx, -80(%ebp)
+       subl    %eax, -80(%ebp)
+       movswl  -52(%ebp), %eax
+       movl    %ecx, -84(%ebp)
+       subl    %eax, -84(%ebp)
+       movswl  -70(%ebp), %eax
+       movl    %ecx, -88(%ebp)
+       subl    %eax, -88(%ebp)
+       movswl  -50(%ebp), %eax
+       subl    %eax, %ecx
+       movl    %ecx, -76(%ebp)
+       movswl  -42(%ebp), %eax
+       movl    %eax, -92(%ebp)
+       movswl  -66(%ebp), %eax
+       movl    %eax, -96(%ebp)
+       movw    $0, -98(%ebp)
+
+This appears to be bad because the RA is not folding the store to the stack 
+slot into the movl.  The above instructions could be:
+       movl    $32, -80(%ebp)
+...
+       movl    $32, -84(%ebp)
+...
+This seems like a cross between remat and spill folding.
+
+This has redundant subtractions of %eax from a stack slot. However, %ecx doesn't
+change, so we could simply subtract %eax from %ecx first and then use %ecx (or
+vice-versa).
+
+//===---------------------------------------------------------------------===//
+
+For this code:
+
+cond_next603:          ; preds = %bb493, %cond_true336, %cond_next599
+       %v.21050.1 = phi i32 [ %v.21050.0, %cond_next599 ], [ %tmp344, %cond_true336 ], [ %v.2, %bb493 ]                ; <i32> [#uses=1]
+       %maxz.21051.1 = phi i32 [ %maxz.21051.0, %cond_next599 ], [ 0, %cond_true336 ], [ %maxz.2, %bb493 ]             ; <i32> [#uses=2]
+       %cnt.01055.1 = phi i32 [ %cnt.01055.0, %cond_next599 ], [ 0, %cond_true336 ], [ %cnt.0, %bb493 ]                ; <i32> [#uses=2]
+       %byteptr.9 = phi i8* [ %byteptr.12, %cond_next599 ], [ %byteptr.0, %cond_true336 ], [ %byteptr.10, %bb493 ]             ; <i8*> [#uses=9]
+       %bitptr.6 = phi i32 [ %tmp5571104.1, %cond_next599 ], [ %tmp4921049, %cond_true336 ], [ %bitptr.7, %bb493 ]             ; <i32> [#uses=4]
+       %source.5 = phi i32 [ %tmp602, %cond_next599 ], [ %source.0, %cond_true336 ], [ %source.6, %bb493 ]             ; <i32> [#uses=7]
+       %tmp606 = getelementptr %struct.const_tables* @tables, i32 0, i32 0, i32 %cnt.01055.1           ; <i8*> [#uses=1]
+       %tmp607 = load i8* %tmp606, align 1             ; <i8> [#uses=1]
+
+We produce this:
+
+LBB4_70:       # cond_next603
+       movl    -20(%ebp), %esi
+       movl    L_tables$non_lazy_ptr-"L4$pb"(%esi), %esi
+
+However, ICC caches this information before the loop and produces this:
+
+        movl      88(%esp), %eax                                #481.12
+
+//===---------------------------------------------------------------------===//
+
+This code:
+
+       %tmp659 = icmp slt i16 %tmp654, 0               ; <i1> [#uses=1]
+       br i1 %tmp659, label %cond_true662, label %cond_next715
+
+produces this:
+
+       testw   %cx, %cx
+       movswl  %cx, %esi
+       jns     LBB4_109        # cond_next715
+
+Shark tells us that using %cx in the testw instruction is sub-optimal. It
+suggests using the 32-bit register (which is what ICC uses).
+
+//===---------------------------------------------------------------------===//
+
+rdar://5506677 - We compile this:
+
+define i32 @foo(double %x) {
+        %x14 = bitcast double %x to i64         ; <i64> [#uses=1]
+        %tmp713 = trunc i64 %x14 to i32         ; <i32> [#uses=1]
+        %tmp8 = and i32 %tmp713, 2147483647             ; <i32> [#uses=1]
+        ret i32 %tmp8
+}
+
+to:
+
+_foo:
+        subl    $12, %esp
+        fldl    16(%esp)
+        fstpl   (%esp)
+        movl    $2147483647, %eax
+        andl    (%esp), %eax
+        addl    $12, %esp
+        #FP_REG_KILL
+        ret
  
-       testl %edx, %edx
-       js LBB1_8
+It would be much better to eliminate the fldl/fstpl by folding the bitcast 
+into the load SDNode.  That would give us:
  
-This saves a byte of code space.
+_foo:
+        movl    $2147483647, %eax
+        andl    4(%esp), %eax
+        ret
+
+//===---------------------------------------------------------------------===//
+
+We compile this:
+
+void compare (long long foo) {
+  if (foo < 4294967297LL)
+    abort();
+}
+
+to:
+
+_compare:
+        subl    $12, %esp
+        cmpl    $0, 16(%esp)
+        setne   %al
+        movzbw  %al, %ax
+        cmpl    $1, 20(%esp)
+        setg    %cl
+        movzbw  %cl, %cx
+        cmove   %ax, %cx
+        movw    %cx, %ax
+        testb   $1, %al
+        je      LBB1_2  # cond_true
+
+(also really horrible code on ppc).  This is due to the expand code for 64-bit
+compares.  GCC produces multiple branches, which is much nicer:
+
+_compare:
+        pushl   %ebp
+        movl    %esp, %ebp
+        subl    $8, %esp
+        movl    8(%ebp), %eax
+        movl    12(%ebp), %edx
+        subl    $1, %edx
+        jg     L5
+L7:
+        jl      L4
+        cmpl    $0, %eax
+        jbe      L4
+L5:
+
+//===---------------------------------------------------------------------===//
+
+Tail call optimization improvements: Tail call optimization currently
+pushes all arguments on the top of the stack (their normal place for
+non-tail call optimized calls) that source from the caller's arguments
+or that source from a virtual register (also possibly sourcing from the
+caller's arguments).
+This is done to prevent overwriting of parameters (see example
+below) that might be used later.
+
+example:  
+
+int callee(int32, int64); 
+int caller(int32 arg1, int32 arg2) { 
+  int64 local = arg2 * 2; 
+  return callee(arg2, (int64)local); 
+}
+
+[arg1]          [!arg2 no longer valid since we moved local onto it]
+[arg2]      ->  [(int64)
+[RETADDR]        local  ]
+
+Moving arg1 onto the stack slot of callee function would overwrite
+arg2 of the caller.
+
+Possible optimizations:
+
+
+ - Analyse the actual parameters of the callee to see which would
+   overwrite a caller parameter which is used by the callee and only
+   push them onto the top of the stack.
+
+   int callee (int32 arg1, int32 arg2);
+   int caller (int32 arg1, int32 arg2) {
+       return callee(arg1,arg2);
+   }
+
+   Here we don't need to write any variables to the top of the stack
+   since they don't overwrite each other.
+
+   int callee (int32 arg1, int32 arg2);
+   int caller (int32 arg1, int32 arg2) {
+       return callee(arg2,arg1);
+   }
+
+   Here we need to push the arguments because they overwrite each
+   other.
+
+//===---------------------------------------------------------------------===//
+
+main ()
+{
+  int i = 0;
+  unsigned long int z = 0;
+
+  do {
+    z -= 0x00004000;
+    i++;
+    if (i > 0x00040000)
+      abort ();
+  } while (z > 0);
+  exit (0);
+}
+
+gcc compiles this to:
+
+_main:
+       subl    $28, %esp
+       xorl    %eax, %eax
+       jmp     L2
+L3:
+       cmpl    $262144, %eax
+       je      L10
+L2:
+       addl    $1, %eax
+       cmpl    $262145, %eax
+       jne     L3
+       call    L_abort$stub
+L10:
+       movl    $0, (%esp)
+       call    L_exit$stub
+
+llvm:
+
+_main:
+       subl    $12, %esp
+       movl    $1, %eax
+       movl    $16384, %ecx
+LBB1_1:        # bb
+       cmpl    $262145, %eax
+       jge     LBB1_4  # cond_true
+LBB1_2:        # cond_next
+       incl    %eax
+       addl    $4294950912, %ecx
+       cmpl    $16384, %ecx
+       jne     LBB1_1  # bb
+LBB1_3:        # bb11
+       xorl    %eax, %eax
+       addl    $12, %esp
+       ret
+LBB1_4:        # cond_true
+       call    L_abort$stub
+
+1. LSR should rewrite the first cmp with induction variable %ecx.
+2. DAG combiner should fold
+        leal    1(%eax), %edx
+        cmpl    $262145, %edx
+   =>
+        cmpl    $262144, %eax
+
+//===---------------------------------------------------------------------===//
+
+define i64 @test(double %X) {
+       %Y = fptosi double %X to i64
+       ret i64 %Y
+}
+
+compiles to:
+
+_test:
+       subl    $20, %esp
+       movsd   24(%esp), %xmm0
+       movsd   %xmm0, 8(%esp)
+       fldl    8(%esp)
+       fisttpll        (%esp)
+       movl    4(%esp), %edx
+       movl    (%esp), %eax
+       addl    $20, %esp
+       #FP_REG_KILL
+       ret
+
+This should just fldl directly from the input stack slot.
+
+//===---------------------------------------------------------------------===//
+
+This code:
+int foo (int x) { return (x & 65535) | 255; }
+
+Should compile into:
+
+_foo:
+        movzwl  4(%esp), %eax
+        orb     $-1, %al           ;; 'orl 255' is also fine :)
+        ret
+
+instead of:
+_foo:
+        movl    $255, %eax
+        orl     4(%esp), %eax
+        andl    $65535, %eax
+        ret
+
+//===---------------------------------------------------------------------===//
+
+We're missing an obvious fold of a load into imul:
+
+int test(long a, long b) { return a * b; } 
+
+LLVM produces:
+_test:
+        movl    4(%esp), %ecx
+        movl    8(%esp), %eax
+        imull   %ecx, %eax
+        ret
+
+vs:
+_test:
+        movl    8(%esp), %eax
+        imull   4(%esp), %eax
+        ret
+
+//===---------------------------------------------------------------------===//
+
+We can fold a store into "zeroing a reg".  Instead of:
+
+xorl    %eax, %eax
+movl    %eax, 124(%esp)
+
+we should get:
+
+movl    $0, 124(%esp)
+
+if the flags of the xor are dead.
+
+Likewise, we isel "x<<1" into "add reg,reg".  If reg is spilled, this should
+be folded into: shl [mem], 1
+
+//===---------------------------------------------------------------------===//
+
+This testcase misses a read/modify/write opportunity (from PR1425):
+
+void vertical_decompose97iH1(int *b0, int *b1, int *b2, int width){
+    int i;
+    for(i=0; i<width; i++)
+        b1[i] += (1*(b0[i] + b2[i])+0)>>0;
+}
+
+We compile it down to:
+
+LBB1_2:        # bb
+       movl    (%esi,%edi,4), %ebx
+       addl    (%ecx,%edi,4), %ebx
+       addl    (%edx,%edi,4), %ebx
+       movl    %ebx, (%ecx,%edi,4)
+       incl    %edi
+       cmpl    %eax, %edi
+       jne     LBB1_2  # bb
+
+The inner loop should add to the memory location (%ecx,%edi,4), saving
+a mov.  Something like:
+
+        movl    (%esi,%edi,4), %ebx
+        addl    (%edx,%edi,4), %ebx
+        addl    %ebx, (%ecx,%edi,4)
+
+Here is another interesting example:
+
+void vertical_compose97iH1(int *b0, int *b1, int *b2, int width){
+    int i;
+    for(i=0; i<width; i++)
+        b1[i] -= (1*(b0[i] + b2[i])+0)>>0;
+}
+
+We miss the r/m/w opportunity here by using 2 subs instead of an add+sub[mem]:
+
+LBB9_2:        # bb
+       movl    (%ecx,%edi,4), %ebx
+       subl    (%esi,%edi,4), %ebx
+       subl    (%edx,%edi,4), %ebx
+       movl    %ebx, (%ecx,%edi,4)
+       incl    %edi
+       cmpl    %eax, %edi
+       jne     LBB9_2  # bb
+
+Additionally, LSR should rewrite the exit condition of these loops to use
+a stride-4 IV, which would allow all the scales in the loop to go away.
+This would result in smaller code and more efficient microops.
+
+//===---------------------------------------------------------------------===//
+
+In SSE mode, we turn abs and neg into a load from the constant pool plus an
+'xor' or 'and' instruction, for example:
+
+       xorpd   LCPI1_0, %xmm2
+
+However, if xmm2 gets spilled, we end up with really ugly code like this:
+
+       movsd   (%esp), %xmm0
+       xorpd   LCPI1_0, %xmm0
+       movsd   %xmm0, (%esp)
+
+Since we 'know' that this is a 'neg', we can actually "fold" the spill into
+the neg/abs instruction, turning it into an *integer* operation, like this:
+
+       xorl 2147483648, [mem+4]     ## 2147483648 = (1 << 31)
+
+You could also use xorb, but xorl is less likely to lead to a partial register
+stall.  Here is a contrived testcase:
+
+double a, b, c;
+void test(double *P) {
+  double X = *P;
+  a = X;
+  bar();
+  X = -X;
+  b = X;
+  bar();
+  c = X;
+}
+
+//===---------------------------------------------------------------------===//