Add debug support for X86/ELF targets (Linux). This allows llvm-gcc4

[oota-llvm.git] / lib / Target / X86 / README.txt
diff --git a/lib/Target/X86/README.txt b/lib/Target/X86/README.txt

index 5a2da2d3d26af6c304515b372f0c664b5d886e50..956caff0c4e05a4b679e2dacfbb5e6e877972dec 100644 (file)
--- a/lib/Target/X86/README.txt
+++ b/lib/Target/X86/README.txt
@@ -18,6 +18,9 @@ long long test(int X, int Y) { return (long long)X*Y; }
  
  ... which should only be one imul instruction.
  
+This can be done with a custom expander, but it would be nice to move this to
+generic code.
+
  //===---------------------------------------------------------------------===//
  
  This should be one DIV/IDIV instruction, not a libcall:
@@ -29,35 +32,6 @@ unsigned test(unsigned long long X, unsigned Y) {
  This can be done trivially with a custom legalizer.  What about overflow 
  though?  http://gcc.gnu.org/bugzilla/show_bug.cgi?id=14224
  
-//===---------------------------------------------------------------------===//
-
-Some targets (e.g. athlons) prefer freep to fstp ST(0):
-http://gcc.gnu.org/ml/gcc-patches/2004-04/msg00659.html
-
-//===---------------------------------------------------------------------===//
-
-This should use fiadd on chips where it is profitable:
-double foo(double P, int *I) { return P+*I; }
-
-//===---------------------------------------------------------------------===//
-
-The FP stackifier needs to be global.  Also, it should handle simple permutates
-to reduce number of shuffle instructions, e.g. turning:
-
-fld P  ->              fld Q
-fld Q                  fld P
-fxch
-
-or:
-
-fxch   ->              fucomi
-fucomi                 jl X
-jg X
-
-Ideas:
-http://gcc.gnu.org/ml/gcc-patches/2004-11/msg02410.html
-
-
  //===---------------------------------------------------------------------===//
  
  Improvements to the multiply -> shift/add algorithm:
@@ -74,6 +48,20 @@ http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html
  
  Another useful one would be  ~0ULL >> X and ~0ULL << X.
  
+One better solution for 1LL << x is:
+        xorl    %eax, %eax
+        xorl    %edx, %edx
+        testb   $32, %cl
+        sete    %al
+        setne   %dl
+        sall    %cl, %eax
+        sall    %cl, %edx
+
+But that requires good 8-bit subreg support.
+
+64-bit shifts (in general) expand to really bad code.  Instead of using
+cmovs, we should expand to a conditional branch like GCC produces.
+
  //===---------------------------------------------------------------------===//
  
  Compile this:
@@ -109,20 +97,6 @@ allocator. Delay codegen until post register allocation.
  
  //===---------------------------------------------------------------------===//
  
-Add a target specific hook to DAG combiner to handle SINT_TO_FP and
-FP_TO_SINT when the source operand is already in memory.
-
-//===---------------------------------------------------------------------===//
-
-Model X86 EFLAGS as a real register to avoid redudant cmp / test. e.g.
-
-       cmpl $1, %eax
-       setg %al
-       testb %al, %al  # unnecessary
-       jne .BB7
-
-//===---------------------------------------------------------------------===//
-
  Count leading zeros and count trailing zeros:
  
  int clz(int X) { return __builtin_clz(X); }
@@ -144,6 +118,7 @@ aren't.
  
  Use push/pop instructions in prolog/epilog sequences instead of stores off 
  ESP (certain code size win, perf win on some [which?] processors).
+Also, it appears icc uses push for parameter passing. Need to investigate.
  
  //===---------------------------------------------------------------------===//
  
@@ -153,293 +128,607 @@ flags.
  
  //===---------------------------------------------------------------------===//
  
-Open code rint,floor,ceil,trunc:
-http://gcc.gnu.org/ml/gcc-patches/2004-08/msg02006.html
-http://gcc.gnu.org/ml/gcc-patches/2004-08/msg02011.html
+The instruction selector sometimes misses folding a load into a compare.  The
+pattern is written as (cmp reg, (load p)).  Because the compare isn't 
+commutative, it is not matched with the load on both sides.  The dag combiner
+should be made smart enough to canonicalize the load into the RHS of a compare
+when it can invert the result of the compare for free.
  
  //===---------------------------------------------------------------------===//
  
-Combine: a = sin(x), b = cos(x) into a,b = sincos(x).
+How about intrinsics? An example is:
+  *res = _mm_mulhi_epu16(*A, _mm_mul_epu32(*B, *C));
  
-Expand these to calls of sin/cos and stores:
-      double sincos(double x, double *sin, double *cos);
-      float sincosf(float x, float *sin, float *cos);
-      long double sincosl(long double x, long double *sin, long double *cos);
+compiles to
+       pmuludq (%eax), %xmm0
+       movl 8(%esp), %eax
+       movdqa (%eax), %xmm1
+       pmulhuw %xmm0, %xmm1
  
-Doing so could allow SROA of the destination pointers.  See also:
-http://gcc.gnu.org/bugzilla/show_bug.cgi?id=17687
+The transformation probably requires an X86-specific pass or a DAG combiner
+target-specific hook.
  
  //===---------------------------------------------------------------------===//
  
-The instruction selector sometimes misses folding a load into a compare.  The
-pattern is written as (cmp reg, (load p)).  Because the compare isn't 
-commutative, it is not matched with the load on both sides.  The dag combiner
-should be made smart enough to cannonicalize the load into the RHS of a compare
-when it can invert the result of the compare for free.
+In many cases, LLVM generates code like this:
  
-//===---------------------------------------------------------------------===//
+_test:
+        movl 8(%esp), %eax
+        cmpl %eax, 4(%esp)
+        setl %al
+        movzbl %al, %eax
+        ret
  
-LSR should be turned on for the X86 backend and tuned to take advantage of its
-addressing modes.
+on some processors (which ones?), it is more efficient to do this:
  
-//===---------------------------------------------------------------------===//
+_test:
+        movl 8(%esp), %ebx
+        xor  %eax, %eax
+        cmpl %ebx, 4(%esp)
+        setl %al
+        ret
  
-When compiled with unsafemath enabled, "main" should enable SSE DAZ mode and
-other fast SSE modes.
+Doing this correctly is tricky though, as the xor clobbers the flags.
  
  //===---------------------------------------------------------------------===//
  
-Think about doing i64 math in SSE regs.
+We should generate bts/btr/etc instructions on targets where they are cheap or
+when codesize is important.  e.g., for:
+
+void setbit(int *target, int bit) {
+    *target |= (1 << bit);
+}
+void clearbit(int *target, int bit) {
+    *target &= ~(1 << bit);
+}
  
  //===---------------------------------------------------------------------===//
  
-The DAG Isel doesn't fold the loads into the adds in this testcase.  The
-pattern selector does.  This is because the chain value of the load gets 
-selected first, and the loads aren't checking to see if they are only used by
-and add.
+Instead of the following for memset char*, 1, 10:
  
-.ll:
+       movl $16843009, 4(%edx)
+       movl $16843009, (%edx)
+       movw $257, 8(%edx)
  
-int %test(int* %x, int* %y, int* %z) {
-        %X = load int* %x
-        %Y = load int* %y
-        %Z = load int* %z
-        %a = add int %X, %Y
-        %b = add int %a, %Z
-        ret int %b
-}
+It might be better to generate
  
-dag isel:
+       movl $16843009, %eax
+       movl %eax, 4(%edx)
+       movl %eax, (%edx)
+       movw %ax, 8(%edx)
+       
+when we can spare a register. It reduces code size.
  
-_test:
+//===---------------------------------------------------------------------===//
+
+Evaluate what the best way to codegen sdiv X, (2^C) is.  For X/8, we currently
+get this:
+
+int %test1(int %X) {
+        %Y = div int %X, 8
+        ret int %Y
+}
+
+_test1:
          movl 4(%esp), %eax
-        movl (%eax), %eax
-        movl 8(%esp), %ecx
-        movl (%ecx), %ecx
-        addl %ecx, %eax
-        movl 12(%esp), %ecx
-        movl (%ecx), %ecx
+        movl %eax, %ecx
+        sarl $31, %ecx
+        shrl $29, %ecx
          addl %ecx, %eax
+        sarl $3, %eax
          ret
  
-pattern isel:
+GCC knows several different ways to codegen it, one of which is this:
  
-_test:
-        movl 12(%esp), %ecx
-        movl 4(%esp), %edx
-        movl 8(%esp), %eax
-        movl (%eax), %eax
-        addl (%edx), %eax
-        addl (%ecx), %eax
+_test1:
+        movl    4(%esp), %eax
+        cmpl    $-1, %eax
+        leal    7(%eax), %ecx
+        cmovle  %ecx, %eax
+        sarl    $3, %eax
          ret
  
-This is bad for register pressure, though the dag isel is producing a 
-better schedule. :)
+which is probably slower, but it's interesting at least :)
+
+//===---------------------------------------------------------------------===//
+
+Should generate min/max for stuff like:
+
+void minf(float a, float b, float *X) {
+  *X = a <= b ? a : b;
+}
+
+Make use of floating point min / max instructions. Perhaps introduce ISD::FMIN
+and ISD::FMAX node types?
  
  //===---------------------------------------------------------------------===//
  
-This testcase should have no SSE instructions in it, and only one load from
-a constant pool:
+The first BB of this code:
  
-double %test3(bool %B) {
-        %C = select bool %B, double 123.412, double 523.01123123
-        ret double %C
+declare bool %foo()
+int %bar() {
+        %V = call bool %foo()
+        br bool %V, label %T, label %F
+T:
+        ret int 1
+F:
+        call bool %foo()
+        ret int 12
  }
  
-Currently, the select is being lowered, which prevents the dag combiner from
-turning 'select (load CPI1), (load CPI2)' -> 'load (select CPI1, CPI2)'
+compiles to:
  
-The pattern isel got this one right.
+_bar:
+        subl $12, %esp
+        call L_foo$stub
+        xorb $1, %al
+        testb %al, %al
+        jne LBB_bar_2   # F
+
+It would be better to emit "cmp %al, 1" than a xor and test.
+
+//===---------------------------------------------------------------------===//
+
+Enable X86InstrInfo::convertToThreeAddress().
  
  //===---------------------------------------------------------------------===//
  
-We need to lower switch statements to tablejumps when appropriate instead of
-always into binary branch trees.
+We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and rep/movsl.
+We should leave these as libcalls for everything over a much lower threshold,
+since libc is hand tuned for medium and large mem ops (avoiding RFO for large
+stores, TLB preheating, etc).
  
  //===---------------------------------------------------------------------===//
  
-SSE doesn't have [mem] op= reg instructions.  If we have an SSE instruction
-like this:
+Optimize this into something reasonable:
+ x * copysign(1.0, y) * copysign(1.0, z)
  
-  X += y
+//===---------------------------------------------------------------------===//
  
-and the register allocator decides to spill X, it is cheaper to emit this as:
+Optimize copysign(x, *y) to use an integer load from y.
  
-Y += [xslot]
-store Y -> [xslot]
+//===---------------------------------------------------------------------===//
  
-than as:
+%X = weak global int 0
  
-tmp = [xslot]
-tmp += y
-store tmp -> [xslot]
+void %foo(int %N) {
+       %N = cast int %N to uint
+       %tmp.24 = setgt int %N, 0
+       br bool %tmp.24, label %no_exit, label %return
  
-..and this uses one fewer register (so this should be done at load folding
-time, not at spiller time).  *Note* however that this can only be done
-if Y is dead.  Here's a testcase:
+no_exit:
+       %indvar = phi uint [ 0, %entry ], [ %indvar.next, %no_exit ]
+       %i.0.0 = cast uint %indvar to int
+       volatile store int %i.0.0, int* %X
+       %indvar.next = add uint %indvar, 1
+       %exitcond = seteq uint %indvar.next, %N
+       br bool %exitcond, label %return, label %no_exit
  
-%.str_3 = external global [15 x sbyte]          ; <[15 x sbyte]*> [#uses=0]
-implementation   ; Functions:
-declare void %printf(int, ...)
-void %main() {
-build_tree.exit:
-        br label %no_exit.i7
-no_exit.i7:             ; preds = %no_exit.i7, %build_tree.exit
-        %tmp.0.1.0.i9 = phi double [ 0.000000e+00, %build_tree.exit ], [ %tmp.34.i18, %no_exit.i7 ]      ; <double> [#uses=1]
-        %tmp.0.0.0.i10 = phi double [ 0.000000e+00, %build_tree.exit ], [ %tmp.28.i16, %no_exit.i7 ]     ; <double> [#uses=1]
-        %tmp.28.i16 = add double %tmp.0.0.0.i10, 0.000000e+00
-        %tmp.34.i18 = add double %tmp.0.1.0.i9, 0.000000e+00
-        br bool false, label %Compute_Tree.exit23, label %no_exit.i7
-Compute_Tree.exit23:            ; preds = %no_exit.i7
-        tail call void (int, ...)* %printf( int 0 )
-        store double %tmp.34.i18, double* null
-        ret void
+return:
+       ret void
  }
  
-We currently emit:
+compiles into:
  
-.BBmain_1:
-        xorpd %XMM1, %XMM1
-        addsd %XMM0, %XMM1
-***     movsd %XMM2, QWORD PTR [%ESP + 8]
-***     addsd %XMM2, %XMM1
-***     movsd QWORD PTR [%ESP + 8], %XMM2
-        jmp .BBmain_1   # no_exit.i7
+       .text
+       .align  4
+       .globl  _foo
+_foo:
+       movl 4(%esp), %eax
+       cmpl $1, %eax
+       jl LBB_foo_4    # return
+LBB_foo_1:     # no_exit.preheader
+       xorl %ecx, %ecx
+LBB_foo_2:     # no_exit
+       movl L_X$non_lazy_ptr, %edx
+       movl %ecx, (%edx)
+       incl %ecx
+       cmpl %eax, %ecx
+       jne LBB_foo_2   # no_exit
+LBB_foo_3:     # return.loopexit
+LBB_foo_4:     # return
+       ret
  
-This is a bugpoint reduced testcase, which is why the testcase doesn't make
-much sense (e.g. its an infinite loop). :)
+We should hoist "movl L_X$non_lazy_ptr, %edx" out of the loop after
+rematerialization is implemented. This can be accomplished with 1) a target
+dependent LICM pass or 2) making SelectionDAG represent the whole function.
  
  //===---------------------------------------------------------------------===//
  
-None of the FPStack instructions are handled in
-X86RegisterInfo::foldMemoryOperand, which prevents the spiller from
-folding spill code into the instructions.
+The following tests perform worse with LSR:
+
+lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesort.
  
  //===---------------------------------------------------------------------===//
  
-In many cases, LLVM generates code like this:
+Teach the coalescer to coalesce vregs of different register classes. e.g. FR32 /
+FR64 to VR128.
  
-_test:
-        movl 8(%esp), %eax
-        cmpl %eax, 4(%esp)
-        setl %al
-        movzbl %al, %eax
-        ret
+//===---------------------------------------------------------------------===//
  
-on some processors (which ones?), it is more efficient to do this:
+mov $reg, 48(%esp)
+...
+leal 48(%esp), %eax
+mov %eax, (%esp)
+call _foo
+
+Obviously it would have been better for the first mov (or any op) to store
+directly %esp[0] if there are no other uses.
+
+//===---------------------------------------------------------------------===//
+
+Adding to the list of cmp / test poor codegen issues:
+
+int test(__m128 *A, __m128 *B) {
+  if (_mm_comige_ss(*A, *B))
+    return 3;
+  else
+    return 4;
+}
  
  _test:
-        movl 8(%esp), %ebx
-       xor %eax, %eax
-        cmpl %ebx, 4(%esp)
-        setl %al
-        ret
+       movl 8(%esp), %eax
+       movaps (%eax), %xmm0
+       movl 4(%esp), %eax
+       movaps (%eax), %xmm1
+       comiss %xmm0, %xmm1
+       setae %al
+       movzbl %al, %ecx
+       movl $3, %eax
+       movl $4, %edx
+       cmpl $0, %ecx
+       cmove %edx, %eax
+       ret
  
-Doing this correctly is tricky though, as the xor clobbers the flags.
+Note the setae, movzbl, cmpl, cmove can be replaced with a single cmovae. There
+are a number of issues. 1) We are introducing a setcc between the result of the
+intrinsic call and select. 2) The intrinsic is expected to produce an i32 value
+so an any extend (which becomes a zero extend) is added.
+
+We probably need some kind of target DAG combine hook to fix this.
  
  //===---------------------------------------------------------------------===//
  
-We should generate 'test' instead of 'cmp' in various cases, e.g.:
+We generate significantly worse code for this than GCC:
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150
+http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701
  
-bool %test(int %X) {
-        %Y = shl int %X, ubyte 1
-        %C = seteq int %Y, 0
-        ret bool %C
-}
-bool %test(int %X) {
-        %Y = and int %X, 8
-        %C = seteq int %Y, 0
-        ret bool %C
-}
+There is also one case we do worse on PPC.
+
+//===---------------------------------------------------------------------===//
+
+If shorter, we should use things like:
+movzwl %ax, %eax
+instead of:
+andl $65535, %EAX
  
-This may just be a matter of using 'test' to write bigger patterns for X86cmp.
+The former can also be used when the two-addressy nature of the 'and' would
+require a copy to be inserted (in X86InstrInfo::convertToThreeAddress).
  
  //===---------------------------------------------------------------------===//
  
-SSE should implement 'select_cc' using 'emulated conditional moves' that use
-pcmp/pand/pandn/por to do a selection instead of a conditional branch:
+Bad codegen:
  
-double %X(double %Y, double %Z, double %A, double %B) {
-        %C = setlt double %A, %B
-        %z = add double %Z, 0.0    ;; select operand is not a load
-        %D = select bool %C, double %Y, double %z
-        ret double %D
+char foo(int x) { return x; }
+
+_foo:
+       movl 4(%esp), %eax
+       shll $24, %eax
+       sarl $24, %eax
+       ret
+
+SIGN_EXTEND_INREG can be implemented as (sext (trunc)) to take advantage of 
+sub-registers.
+
+//===---------------------------------------------------------------------===//
+
+Consider this:
+
+typedef struct pair { float A, B; } pair;
+void pairtest(pair P, float *FP) {
+        *FP = P.A+P.B;
  }
  
-We currently emit:
+We currently generate this code with llvmgcc4:
  
-_X:
+_pairtest:
          subl $12, %esp
-        xorpd %xmm0, %xmm0
-        addsd 24(%esp), %xmm0
-        movsd 32(%esp), %xmm1
-        movsd 16(%esp), %xmm2
-        ucomisd 40(%esp), %xmm1
-        jb LBB_X_2
-LBB_X_1:
-        movsd %xmm0, %xmm2
-LBB_X_2:
-        movsd %xmm2, (%esp)
-        fldl (%esp)
+        movl 20(%esp), %eax
+        movl %eax, 4(%esp)
+        movl 16(%esp), %eax
+        movl %eax, (%esp)
+        movss (%esp), %xmm0
+        addss 4(%esp), %xmm0
+        movl 24(%esp), %eax
+        movss %xmm0, (%eax)
          addl $12, %esp
          ret
  
+we should be able to generate:
+_pairtest:
+        movss 4(%esp), %xmm0
+        movl 12(%esp), %eax
+        addss 8(%esp), %xmm0
+        movss %xmm0, (%eax)
+        ret
+
+The issue is that llvmgcc4 is forcing the struct to memory, then passing it as
+integer chunks.  It does this so that structs like {short,short} are passed in
+a single 32-bit integer stack slot.  We should handle the safe cases above much
+more nicely, while still handling the hard cases.
+
  //===---------------------------------------------------------------------===//
  
-The x86 backend currently supports dynamic-no-pic. Need to add asm
-printer support for static and PIC.
+Another instruction selector deficiency:
+
+void %bar() {
+       %tmp = load int (int)** %foo
+       %tmp = tail call int %tmp( int 3 )
+       ret void
+}
+
+_bar:
+       subl $12, %esp
+       movl L_foo$non_lazy_ptr, %eax
+       movl (%eax), %eax
+       call *%eax
+       addl $12, %esp
+       ret
+
+The current isel scheme will not allow the load to be folded in the call since
+the load's chain result is read by the callseq_start.
  
  //===---------------------------------------------------------------------===//
  
-We should generate bts/btr/etc instructions on targets where they are cheap or
-when codesize is important.  e.g., for:
+Don't forget to find a way to squash noop truncates in the JIT environment.
  
-void setbit(int *target, int bit) {
-    *target |= (1 << bit);
+//===---------------------------------------------------------------------===//
+
+Implement anyext in the same manner as truncate that would allow them to be
+eliminated.
+
+//===---------------------------------------------------------------------===//
+
+How about implementing truncate / anyext as a property of machine instruction
+operand? i.e. Print as 32-bit super-class register / 16-bit sub-class register.
+Do this for the cases where a truncate / anyext is guaranteed to be eliminated.
+For IA32 that is truncate from 32 to 16 and anyext from 16 to 32.
+
+//===---------------------------------------------------------------------===//
+
+For this:
+
+int test(int a)
+{
+  return a * 3;
  }
-void clearbit(int *target, int bit) {
-    *target &= ~(1 << bit);
+
+We currently emit:
+       imull $3, 4(%esp), %eax
+
+Perhaps this is what we really should generate? Is imull three or four
+cycles? Note: ICC generates this:
+       movl    4(%esp), %eax
+       leal    (%eax,%eax,2), %eax
+
+The current instruction priority is based on pattern complexity. The former is
+more "complex" because it folds a load so the latter will not be emitted.
+
+Perhaps we should use AddedComplexity to give LEA32r a higher priority? We
+should always try to match LEA first since the LEA matching code does some
+estimation to determine whether the match is profitable.
+
+However, if we care more about code size, then imull is better. It's two bytes
+shorter than movl + leal.
+
+//===---------------------------------------------------------------------===//
+
+Implement CTTZ, CTLZ with bsf and bsr.
+
+//===---------------------------------------------------------------------===//
+
+It appears gcc places string data with linkonce linkage in
+.section __TEXT,__const_coal,coalesced instead of
+.section __DATA,__const_coal,coalesced.
+Take a look at darwin.h, there are other Darwin assembler directives that we
+do not make use of.
+
+//===---------------------------------------------------------------------===//
+
+We should handle __attribute__ ((__visibility__ ("hidden"))).
+
+//===---------------------------------------------------------------------===//
+
+int %foo(int* %a, int %t) {
+entry:
+        br label %cond_true
+
+cond_true:              ; preds = %cond_true, %entry
+        %x.0.0 = phi int [ 0, %entry ], [ %tmp9, %cond_true ]  
+        %t_addr.0.0 = phi int [ %t, %entry ], [ %tmp7, %cond_true ]
+        %tmp2 = getelementptr int* %a, int %x.0.0              
+        %tmp3 = load int* %tmp2         ; <int> [#uses=1]
+        %tmp5 = add int %t_addr.0.0, %x.0.0             ; <int> [#uses=1]
+        %tmp7 = add int %tmp5, %tmp3            ; <int> [#uses=2]
+        %tmp9 = add int %x.0.0, 1               ; <int> [#uses=2]
+        %tmp = setgt int %tmp9, 39              ; <bool> [#uses=1]
+        br bool %tmp, label %bb12, label %cond_true
+
+bb12:           ; preds = %cond_true
+        ret int %tmp7
  }
  
+is pessimized by -loop-reduce and -indvars
+
  //===---------------------------------------------------------------------===//
  
-Easy: Global addresses are not always allowed as immediates.  For this:
+u32 to float conversion improvement:
  
-int dst = 0; int *ptr = 0;
-void foo() { ptr = &dst; }
+float uint32_2_float( unsigned u ) {
+  float fl = (int) (u & 0xffff);
+  float fh = (int) (u >> 16);
+  fh *= 0x1.0p16f;
+  return fh + fl;
+}
  
-we get this:
+00000000        subl    $0x04,%esp
+00000003        movl    0x08(%esp,1),%eax
+00000007        movl    %eax,%ecx
+00000009        shrl    $0x10,%ecx
+0000000c        cvtsi2ss        %ecx,%xmm0
+00000010        andl    $0x0000ffff,%eax
+00000015        cvtsi2ss        %eax,%xmm1
+00000019        mulss   0x00000078,%xmm0
+00000021        addss   %xmm1,%xmm0
+00000025        movss   %xmm0,(%esp,1)
+0000002a        flds    (%esp,1)
+0000002d        addl    $0x04,%esp
+00000030        ret
  
-_foo:
-        movl $_dst, %eax
-        movl %eax, _ptr
+//===---------------------------------------------------------------------===//
+
+When using fastcc abi, align stack slot of argument of type double on 8 byte
+boundary to improve performance.
+
+//===---------------------------------------------------------------------===//
+
+Codegen:
+
+int f(int a, int b) {
+  if (a == 4 || a == 6)
+    b++;
+  return b;
+}
+
+
+as:
+
+or eax, 2
+cmp eax, 6
+jz label
+
+//===---------------------------------------------------------------------===//
+
+GCC's ix86_expand_int_movcc function (in i386.c) has a ton of interesting
+simplifications for integer "x cmp y ? a : b".  For example, instead of:
+
+int G;
+void f(int X, int Y) {
+  G = X < 0 ? 14 : 13;
+}
+
+compiling to:
+
+_f:
+        movl $14, %eax
+        movl $13, %ecx
+        movl 4(%esp), %edx
+        testl %edx, %edx
+        cmovl %eax, %ecx
+        movl %ecx, _G
          ret
  
-When: "movl $_dst, _ptr" is sufficient.
+it could be:
+_f:
+        movl    4(%esp), %eax
+        sarl    $31, %eax
+        notl    %eax
+        addl    $14, %eax
+        movl    %eax, _G
+        ret
+
+etc.
  
  //===---------------------------------------------------------------------===//
  
-Use fisttp to do FP to integer conversion whenever it is available.
+Currently we don't have elimination of redundant stack manipulations. Consider
+the code:
+
+int %main() {
+entry:
+       call fastcc void %test1( )
+       call fastcc void %test2( sbyte* cast (void ()* %test1 to sbyte*) )
+       ret int 0
+}
+
+declare fastcc void %test1()
+
+declare fastcc void %test2(sbyte*)
+
+
+This currently compiles to:
+
+       subl $16, %esp
+       call _test1
+       addl $12, %esp
+       subl $16, %esp
+       movl $_test1, (%esp)
+       call _test2
+       addl $12, %esp
+
+The add/sub pair is really unneeded here.
  
  //===---------------------------------------------------------------------===//
  
-Instead of the following for memset char*, 1, 10:
+We generate really bad code in some cases due to lowering SETCC/SELECT at 
+legalize time, which prevents the post-legalize dag combine pass from
+understanding the code.  As a silly example, this prevents us from folding 
+stuff like this:
  
-       movl $16843009, 4(%edx)
-       movl $16843009, (%edx)
-       movw $257, 8(%edx)
+bool %test(ulong %x) {
+  %tmp = setlt ulong %x, 4294967296
+  ret bool %tmp
+}
  
-It might be better to generate
+into x.h == 0
  
-       movl $16843009, %eax
-       movl %eax, 4(%edx)
-       movl %eax, (%edx)
-       movw al, 8(%edx)
-       
-when we can spare a register. It reduces code size.
+//===---------------------------------------------------------------------===//
+
+We currently compile sign_extend_inreg into two shifts:
+
+long foo(long X) {
+  return (long)(signed char)X;
+}
+
+becomes:
+
+_foo:
+        movl 4(%esp), %eax
+        shll $24, %eax
+        sarl $24, %eax
+        ret
+
+This could be:
+
+_foo:
+        movsbl  4(%esp),%eax
+        ret
  
  //===---------------------------------------------------------------------===//
  
-It's not clear whether we should use pxor or xorps / xorpd to clear XMM
-registers. The choice may depend on subtarget information. We should do some
-more experiments on different x86 machines.
+Consider the expansion of:
+
+uint %test3(uint %X) {
+        %tmp1 = rem uint %X, 255
+        ret uint %tmp1
+}
+
+Currently it compiles to:
+
+...
+        movl $2155905153, %ecx
+        movl 8(%esp), %esi
+        movl %esi, %eax
+        mull %ecx
+...
+
+This could be "reassociated" into:
+
+        movl $2155905153, %eax
+        movl 8(%esp), %ecx
+        mull %ecx
+
+to avoid the copy.  In fact, the existing two-address stuff would do this
+except that mul isn't a commutative 2-addr instruction.  I guess this has
+to be done at isel time based on the #uses to mul?
+