Trampoline support for x86-64. This looks like

[oota-llvm.git] / lib / Target / X86 / README.txt
diff --git a/lib/Target/X86/README.txt b/lib/Target/X86/README.txt

index 9cb01a78afe073098686665289f36af20305256f..e9f0d7338b3857dc5018d7e4ee4d97a5631844a1 100644 (file)
--- a/lib/Target/X86/README.txt
+++ b/lib/Target/X86/README.txt
@@ -339,20 +339,18 @@ void foo(int N) {
    for (i = 0; i < N; i++) { X = i; Y = i*4; }
  }
  
-LBB1_1:        #bb.preheader
-       xorl %ecx, %ecx
-       xorw %dx, %dx
-LBB1_2:        #bb
-       movl L_X$non_lazy_ptr, %esi
-       movw %dx, (%esi)
-       movw %dx, %si
-       shlw $2, %si
-       movl L_Y$non_lazy_ptr, %edi
-       movw %si, (%edi)
-       incl %ecx
-       incw %dx
-       cmpl %eax, %ecx
-       jne LBB1_2      #bb
+LBB1_1:        # entry.bb_crit_edge
+       xorl    %ecx, %ecx
+       xorw    %dx, %dx
+LBB1_2:        # bb
+       movl    L_X$non_lazy_ptr, %esi
+       movw    %cx, (%esi)
+       movl    L_Y$non_lazy_ptr, %esi
+       movw    %dx, (%esi)
+       addw    $4, %dx
+       incl    %ecx
+       cmpl    %eax, %ecx
+       jne     LBB1_2  # bb
  
  vs.
  
@@ -367,11 +365,7 @@ L4:
         cmpl    %edx, %edi
         jne     L4
  
-There are 3 issues:
-
-1. Lack of post regalloc LICM.
-2. LSR unable to reused IV for a different type (i16 vs. i32) even though
-   the cast would be free.
+This is due to the lack of post regalloc LICM.
  
  //===---------------------------------------------------------------------===//
  
@@ -657,6 +651,26 @@ _f:
  
  etc.
  
+Another is:
+int usesbb(unsigned int a, unsigned int b) {
+       return (a < b ? -1 : 0);
+}
+to:
+_usesbb:
+       movl    8(%esp), %eax
+       cmpl    %eax, 4(%esp)
+       sbbl    %eax, %eax
+       ret
+
+instead of:
+_usesbb:
+       xorl    %eax, %eax
+       movl    8(%esp), %ecx
+       cmpl    %ecx, 4(%esp)
+       movl    $4294967295, %ecx
+       cmovb   %ecx, %eax
+       ret
+
  //===---------------------------------------------------------------------===//
  
  Currently we don't have elimination of redundant stack manipulations. Consider
@@ -688,28 +702,6 @@ The add\sub pair is really unneeded here.
  
  //===---------------------------------------------------------------------===//
  
-We currently compile sign_extend_inreg into two shifts:
-
-long foo(long X) {
-  return (long)(signed char)X;
-}
-
-becomes:
-
-_foo:
-        movl 4(%esp), %eax
-        shll $24, %eax
-        sarl $24, %eax
-        ret
-
-This could be:
-
-_foo:
-        movsbl  4(%esp),%eax
-        ret
-
-//===---------------------------------------------------------------------===//
-
  Consider the expansion of:
  
  uint %test3(uint %X) {
@@ -824,23 +816,6 @@ _add_zf:
  
  //===---------------------------------------------------------------------===//
  
-This:
-#include <math.h>
-int foo(double X) { return isnan(X); }
-
-compiles to (-m64):
-
-_foo:
-        pxor %xmm1, %xmm1
-        ucomisd %xmm1, %xmm0
-        setp %al
-        movzbl %al, %eax
-        ret
-
-the pxor is not needed, we could compare the value against itself.
-
-//===---------------------------------------------------------------------===//
-
  These two functions have identical effects:
  
  unsigned int f(unsigned int i, unsigned int n) {++i; if (i == n) ++i; return i;}
@@ -1023,24 +998,6 @@ _test:
  
  //===---------------------------------------------------------------------===//
  
-For code like:
-phi (undef, x)
-
-We get an implicit def on the undef side. If the phi is spilled, we then get:
-implicitdef xmm1
-store xmm1 -> stack
-
-It should be possible to teach the x86 backend to "fold" the store into the
-implicitdef, which just deletes the implicit def.
-
-These instructions should go away:
-#IMPLICIT_DEF %xmm1 
-movaps %xmm1, 192(%esp) 
-movaps %xmm1, 224(%esp) 
-movaps %xmm1, 176(%esp)
-
-//===---------------------------------------------------------------------===//
-
  This is a "commutable two-address" register coalescing deficiency:
  
  define <4 x float> @test1(<4 x float> %V) {
@@ -1352,14 +1309,16 @@ L7:
  L5:
  
  //===---------------------------------------------------------------------===//
+
  Tail call optimization improvements: Tail call optimization currently
-pushes all arguments on the top of the stack (their normal place if
-that was a not tail call optimized functiong call ) before moving them
-to actual stack slot. this is done to prevent overwriting of paramters
-(see example below) that might be used, since the arguments of the
-callee overwrites callers arguments.
+pushes all arguments on the top of the stack (their normal place for
+non-tail call optimized calls) that source from the caller's arguments
+or that source from a virtual register (also possibly sourcing from
+the caller's arguments).
+This is done to prevent overwriting of parameters (see example
+below) that might be used later.
  
- example:  
+example:  
  
  int callee(int32, int64); 
  int caller(int32 arg1, int32 arg2) { 
@@ -1371,64 +1330,252 @@ int caller(int32 arg1, int32 arg2) {
  [arg2]      ->  [(int64)
  [RETADDR]        local  ]
  
-moving arg1 onto the stack slot of callee function would overwrite
+Moving arg1 onto the stack slot of the callee function would overwrite
  arg2 of the caller.
  
  Possible optimizations:
  
- - only push those arguments to the top of the stack that are actual
-   parameters of the caller function and have no local value in the
-   caller
-
-   in above example local does not need to be pushed onto the top of
-   the stack as it is definitetly not a caller's function parameter
  
- - analyse the actual parameters of the callee to see which would
-   overwrite a caller paramter which is used by the callee and only
-   push them onto the top of the stack
+ - Analyse the actual parameters of the callee to see which would
+   overwrite a caller parameter which is used by the callee and only
+   push them onto the top of the stack.
  
     int callee (int32 arg1, int32 arg2);
     int caller (int32 arg1, int32 arg2) {
         return callee(arg1,arg2);
     }
  
-   here we don't need to write any variables to the top of the stack
-   since they don't overwrite each other
+   Here we don't need to write any variables to the top of the stack
+   since they don't overwrite each other.
  
     int callee (int32 arg1, int32 arg2);
     int caller (int32 arg1, int32 arg2) {
         return callee(arg2,arg1);
     }
  
-   here we need to push the arguments because they overwrite each other
-
-
-   code for lowering directly onto callers arguments:
-+  SmallVector<std::pair<unsigned, SDOperand>, 8> RegsToPass;
-+  SmallVector<SDOperand, 8> MemOpChains;
-+
-+  SDOperand FramePtr;
-+  SDOperand PtrOff;
-+  SDOperand FIN;
-+  int FI = 0;
-+  // Walk the register/memloc assignments, inserting copies/loads.
-+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
-+    CCValAssign &VA = ArgLocs[i];
-+    SDOperand Arg = Op.getOperand(5+2*VA.getValNo());
-+    
-+    ....
-+    
-+    if (VA.isRegLoc()) {
-+      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
-+    } else {
-+      assert(VA.isMemLoc());
-+      // create frame index
-+      int32_t Offset = VA.getLocMemOffset()+FPDiff;
-+      uint32_t OpSize = (MVT::getSizeInBits(VA.getLocVT())+7)/8;
-+      FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset);
-+      FIN = DAG.getFrameIndex(FI, MVT::i32);
-+      // store relative to framepointer
-+      MemOpChains.push_back(DAG.getStore(Chain, Arg, FIN, NULL, 0));
-+    }
-+  }
+   Here we need to push the arguments because they overwrite each
+   other.
+
+//===---------------------------------------------------------------------===//
+
+main ()
+{
+  int i = 0;
+  unsigned long int z = 0;
+
+  do {
+    z -= 0x00004000;
+    i++;
+    if (i > 0x00040000)
+      abort ();
+  } while (z > 0);
+  exit (0);
+}
+
+gcc compiles this to:
+
+_main:
+       subl    $28, %esp
+       xorl    %eax, %eax
+       jmp     L2
+L3:
+       cmpl    $262144, %eax
+       je      L10
+L2:
+       addl    $1, %eax
+       cmpl    $262145, %eax
+       jne     L3
+       call    L_abort$stub
+L10:
+       movl    $0, (%esp)
+       call    L_exit$stub
+
+llvm:
+
+_main:
+       subl    $12, %esp
+       movl    $1, %eax
+       movl    $16384, %ecx
+LBB1_1:        # bb
+       cmpl    $262145, %eax
+       jge     LBB1_4  # cond_true
+LBB1_2:        # cond_next
+       incl    %eax
+       addl    $4294950912, %ecx
+       cmpl    $16384, %ecx
+       jne     LBB1_1  # bb
+LBB1_3:        # bb11
+       xorl    %eax, %eax
+       addl    $12, %esp
+       ret
+LBB1_4:        # cond_true
+       call    L_abort$stub
+
+1. LSR should rewrite the first cmp with induction variable %ecx.
+2. DAG combiner should fold
+        leal    1(%eax), %edx
+        cmpl    $262145, %edx
+   =>
+        cmpl    $262144, %eax
+
+//===---------------------------------------------------------------------===//
+
+define i64 @test(double %X) {
+       %Y = fptosi double %X to i64
+       ret i64 %Y
+}
+
+compiles to:
+
+_test:
+       subl    $20, %esp
+       movsd   24(%esp), %xmm0
+       movsd   %xmm0, 8(%esp)
+       fldl    8(%esp)
+       fisttpll        (%esp)
+       movl    4(%esp), %edx
+       movl    (%esp), %eax
+       addl    $20, %esp
+       #FP_REG_KILL
+       ret
+
+This should just fldl directly from the input stack slot.
+
+//===---------------------------------------------------------------------===//
+
+This code:
+int foo (int x) { return (x & 65535) | 255; }
+
+Should compile into:
+
+_foo:
+        movzwl  4(%esp), %eax
+        orb     $-1, %al           ;; 'orl 255' is also fine :)
+        ret
+
+instead of:
+_foo:
+        movl    $255, %eax
+        orl     4(%esp), %eax
+        andl    $65535, %eax
+        ret
+
+//===---------------------------------------------------------------------===//
+
+We're missing an obvious fold of a load into imul:
+
+int test(long a, long b) { return a * b; } 
+
+LLVM produces:
+_test:
+        movl    4(%esp), %ecx
+        movl    8(%esp), %eax
+        imull   %ecx, %eax
+        ret
+
+vs:
+_test:
+        movl    8(%esp), %eax
+        imull   4(%esp), %eax
+        ret
+
+//===---------------------------------------------------------------------===//
+
+We can fold a store into "zeroing a reg".  Instead of:
+
+xorl    %eax, %eax
+movl    %eax, 124(%esp)
+
+we should get:
+
+movl    $0, 124(%esp)
+
+if the flags of the xor are dead.
+
+Likewise, we isel "x<<1" into "add reg,reg".  If reg is spilled, this should
+be folded into: shl [mem], 1
+
+//===---------------------------------------------------------------------===//
+
+This testcase misses a read/modify/write opportunity (from PR1425):
+
+void vertical_decompose97iH1(int *b0, int *b1, int *b2, int width){
+    int i;
+    for(i=0; i<width; i++)
+        b1[i] += (1*(b0[i] + b2[i])+0)>>0;
+}
+
+We compile it down to:
+
+LBB1_2:        # bb
+       movl    (%esi,%edi,4), %ebx
+       addl    (%ecx,%edi,4), %ebx
+       addl    (%edx,%edi,4), %ebx
+       movl    %ebx, (%ecx,%edi,4)
+       incl    %edi
+       cmpl    %eax, %edi
+       jne     LBB1_2  # bb
+
+the inner loop should add to the memory location (%ecx,%edi,4), saving
+a mov.  Something like:
+
+        movl    (%esi,%edi,4), %ebx
+        addl    (%edx,%edi,4), %ebx
+        addl    %ebx, (%ecx,%edi,4)
+
+Here is another interesting example:
+
+void vertical_compose97iH1(int *b0, int *b1, int *b2, int width){
+    int i;
+    for(i=0; i<width; i++)
+        b1[i] -= (1*(b0[i] + b2[i])+0)>>0;
+}
+
+We miss the r/m/w opportunity here by using 2 subs instead of an add+sub[mem]:
+
+LBB9_2:        # bb
+       movl    (%ecx,%edi,4), %ebx
+       subl    (%esi,%edi,4), %ebx
+       subl    (%edx,%edi,4), %ebx
+       movl    %ebx, (%ecx,%edi,4)
+       incl    %edi
+       cmpl    %eax, %edi
+       jne     LBB9_2  # bb
+
+Additionally, LSR should rewrite the exit condition of these loops to use
+a stride-4 IV, which would allow all the scales in the loop to go away.
+This would result in smaller code and more efficient microops.
+
+//===---------------------------------------------------------------------===//
+
+In SSE mode, we turn abs and neg into a load from the constant pool plus a xor
+or and instruction, for example:
+
+       xorpd   LCPI1_0, %xmm2
+
+However, if xmm2 gets spilled, we end up with really ugly code like this:
+
+       movsd   (%esp), %xmm0
+       xorpd   LCPI1_0, %xmm0
+       movsd   %xmm0, (%esp)
+
+Since we 'know' that this is a 'neg', we can actually "fold" the spill into
+the neg/abs instruction, turning it into an *integer* operation, like this:
+
+       xorl 2147483648, [mem+4]     ## 2147483648 = (1 << 31)
+
+you could also use xorb, but xorl is less likely to lead to a partial register
+stall.  Here is a contrived testcase:
+
+double a, b, c;
+void test(double *P) {
+  double X = *P;
+  a = X;
+  bar();
+  X = -X;
+  b = X;
+  bar();
+  c = X;
+}
+
  //===---------------------------------------------------------------------===//