//===---------------------------------------------------------------------===//
+We should make the various targets' "IMPLICIT_DEF" instructions be a single
+target-independent opcode like TargetInstrInfo::INLINEASM. This would allow
+us to eliminate the TargetInstrDesc::isImplicitDef() method, and would avoid
+having to define the instruction for every register class on every target.
+
+//===---------------------------------------------------------------------===//
+
With the recent changes to make the implicit def/use set explicit in
machineinstrs, we should change the target descriptions for 'call'
instructions so that the .td files don't list all the call-clobbered
registers as implicit defs.

//===---------------------------------------------------------------------===//
-We should constant fold vector type casts at the LLVM level, regardless of the
-cast. Currently we cannot fold some casts because we don't have TargetData
-information in the constant folder, so we don't know the endianness of the
-target!
-
-//===---------------------------------------------------------------------===//
-
Add support for conditional increments, and other related patterns.

//===---------------------------------------------------------------------===//

Tail call elim should be more aggressive, checking to see if the call is
followed by an uncond branch to an exit block.

; This testcase is due to tail-duplication not wanting to copy the return
; instruction into the terminating blocks because there was other code
; optimized out of the function after the taildup happened.
-;RUN: llvm-upgrade < %s | llvm-as | opt -tailcallelim | llvm-dis | not grep call
+; RUN: llvm-as < %s | opt -tailcallelim | llvm-dis | not grep call
-int %t4(int %a) {
+define i32 @t4(i32 %a) {
entry:
- %tmp.1 = and int %a, 1
- %tmp.2 = cast int %tmp.1 to bool
- br bool %tmp.2, label %then.0, label %else.0
-
-then.0:
- %tmp.5 = add int %a, -1
- %tmp.3 = call int %t4( int %tmp.5 )
- br label %return
-
-else.0:
- %tmp.7 = setne int %a, 0
- br bool %tmp.7, label %then.1, label %return
-
-then.1:
- %tmp.11 = add int %a, -2
- %tmp.9 = call int %t4( int %tmp.11 )
- br label %return
-
-return:
- %result.0 = phi int [ 0, %else.0 ], [ %tmp.3, %then.0 ],
+ %tmp.1 = and i32 %a, 1 ; <i32> [#uses=1]
+ %tmp.2 = icmp ne i32 %tmp.1, 0 ; <i1> [#uses=1]
+ br i1 %tmp.2, label %then.0, label %else.0
+
+then.0: ; preds = %entry
+ %tmp.5 = add i32 %a, -1 ; <i32> [#uses=1]
+ %tmp.3 = call i32 @t4( i32 %tmp.5 ) ; <i32> [#uses=1]
+ br label %return
+
+else.0: ; preds = %entry
+ %tmp.7 = icmp ne i32 %a, 0 ; <i1> [#uses=1]
+ br i1 %tmp.7, label %then.1, label %return
+
+then.1: ; preds = %else.0
+ %tmp.11 = add i32 %a, -2 ; <i32> [#uses=1]
+ %tmp.9 = call i32 @t4( i32 %tmp.11 ) ; <i32> [#uses=1]
+ br label %return
+
+return: ; preds = %then.1, %else.0, %then.0
+ %result.0 = phi i32 [ 0, %else.0 ], [ %tmp.3, %then.0 ],
[ %tmp.9, %then.1 ]
- ret int %result.0
+ ret i32 %result.0
+}
+
+//===---------------------------------------------------------------------===//
+
+Tail recursion elimination is not transforming this function, because it
+returns n, which fails the isDynamicConstant check in the accumulator
+recursion logic.
+
+long long fib(const long long n) {
+ switch(n) {
+ case 0:
+ case 1:
+ return n;
+ default:
+ return fib(n-1) + fib(n-2);
+ }
}
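+
+A hand-written sketch (in C, names made up) of the accumulator transformation
+the pass should be able to perform here; the troublesome 'return n' has to be
+folded into the accumulator as 'acc + n':
+
+long long fib2(long long n) {
+  long long acc = 0;            /* accumulated result */
+  for (;;) {
+    switch (n) {
+    case 0:
+    case 1:
+      return acc + n;           /* the dynamic 'return n', folded into acc */
+    default:
+      acc += fib2(n - 2);       /* the second call still recurses */
+      n = n - 1;                /* the first call becomes this loop */
+    }
+  }
+}
+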
//===---------------------------------------------------------------------===//
Argument promotion should promote arguments for recursive functions, like
this:
-; RUN: llvm-upgrade < %s | llvm-as | opt -argpromotion | llvm-dis | grep x.val
-
-implementation ; Functions:
+; RUN: llvm-as < %s | opt -argpromotion | llvm-dis | grep x.val
-internal int %foo(int* %x) {
+define internal i32 @foo(i32* %x) {
entry:
- %tmp = load int* %x
- %tmp.foo = call int %foo(int *%x)
- ret int %tmp.foo
+ %tmp = load i32* %x ; <i32> [#uses=0]
+ %tmp.foo = call i32 @foo( i32* %x ) ; <i32> [#uses=1]
+ ret i32 %tmp.foo
}
-int %bar(int* %x) {
+define i32 @bar(i32* %x) {
entry:
- %tmp3 = call int %foo( int* %x) ; <int>[#uses=1]
- ret int %tmp3
+ %tmp3 = call i32 @foo( i32* %x ) ; <i32> [#uses=1]
+ ret i32 %tmp3
+}
+
+//===---------------------------------------------------------------------===//
+
+"basicaa" should know how to look through "or" instructions that act like add
+instructions. For example in this code, the x*4+1 is turned into x*4 | 1, and
+basicaa can't analyze the array subscript, leading to duplicated loads in the
+generated code:
+
+void test(int X, int Y, int a[]) {
+  int i;
+ for (i=2; i<1000; i+=4) {
+ a[i+0] = a[i-1+0]*a[i-2+0];
+ a[i+1] = a[i-1+1]*a[i-2+1];
+ a[i+2] = a[i-1+2]*a[i-2+2];
+ a[i+3] = a[i-1+3]*a[i-2+3];
+ }
+}
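+
+A sketch (hand-written C, not from the testcase) of the fact basicaa would
+need to prove: an 'or' whose constant operand only sets bits known to be
+zero in the other operand behaves exactly like an 'add':
+
+#include <assert.h>
+void check(unsigned x) {
+  unsigned x4 = x * 4;           /* low two bits of x4 are known zero */
+  assert((x4 | 1) == (x4 + 1));  /* so x*4 | 1 really is x*4 + 1 */
+}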
+
+//===---------------------------------------------------------------------===//
+
+We should investigate an instruction sinking pass. Consider this silly
+example in PIC mode:
+
+#include <assert.h>
+void foo(int x) {
+ assert(x);
+ //...
+}
+
+we compile this to:
+_foo:
+ subl $28, %esp
+ call "L1$pb"
+"L1$pb":
+ popl %eax
+ cmpl $0, 32(%esp)
+ je LBB1_2 # cond_true
+LBB1_1: # return
+ # ...
+ addl $28, %esp
+ ret
+LBB1_2: # cond_true
+...
+
+The PIC base computation (call+popl) is only used on one path through the
+code, but is currently always computed in the entry block. It would be
+better to sink the picbase computation down into the block for the
+assertion, as it is the only one that uses it. This happens for a lot of
+code with early outs.
+
+Another example is argument loads, which are usually emitted into the
+entry block on targets like x86. If they are not used on all paths through
+a function, they should be sunk into the paths that use them.
+
+Whole-function isel would also handle this case.
+
+//===---------------------------------------------------------------------===//
+
+Investigate lowering of sparse switch statements into perfect hash tables:
+http://burtleburtle.net/bob/hash/perfect.html
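+
+A minimal hand-built illustration of the idea (these keys and the '% 7' hash
+are invented for the example; the compiler would generate a collision-free
+hash from the actual case values, as the page above describes):
+
+struct Slot { int key; int result; };
+static const struct Slot table[7] = {   /* unused slots get key -1 */
+  {-1, 0}, {-1, 0}, {100, 2}, {10, 1}, {-1, 0}, {-1, 0}, {1000, 3}
+};
+int lower_switch(int x) {  /* switch (x) with sparse cases 10, 100, 1000 */
+  unsigned h = (unsigned)x % 7;              /* collision-free on the keys */
+  if (table[h].key == x) return table[h].result;
+  return 0;                                  /* the default case */
+}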
+
+//===---------------------------------------------------------------------===//
+
+We should turn things like "load+fabs+store" and "load+fneg+store" into the
+corresponding integer operations. On a Yonah, this loop:
+
+double a[256];
+void foo() {
+ int i, b;
+ for (b = 0; b < 10000000; b++)
+ for (i = 0; i < 256; i++)
+ a[i] = -a[i];
+}
+
+is twice as slow as this loop:
+
+long long a[256];
+void foo() {
+ int i, b;
+ for (b = 0; b < 10000000; b++)
+ for (i = 0; i < 256; i++)
+ a[i] ^= (1ULL << 63);
}
+and I suspect other processors are similar. On X86 in particular this is a
+big win because doing this with integers allows the use of read/modify/write
+instructions.
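+
+The fabs case is the same trick with an 'and' that clears the sign bit (a
+sketch, assuming IEEE-754 doubles manipulated via their 64-bit pattern):
+
+unsigned long long fabs_bits(unsigned long long x) {
+  return x & ~(1ULL << 63);   /* integer fabs: clear the sign bit */
+}
+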
+//===---------------------------------------------------------------------===//
+
+DAG Combiner should try to combine small loads into larger loads when
+profitable. For example, we compile this C++ code:
+
+struct THotKey { short Key; bool Control; bool Shift; bool Alt; };
+extern THotKey m_HotKey;
+THotKey GetHotKey () { return m_HotKey; }
+
+into (-O3 -fno-exceptions -static -fomit-frame-pointer):
+
+__Z9GetHotKeyv:
+ pushl %esi
+ movl 8(%esp), %eax
+ movb _m_HotKey+3, %cl
+ movb _m_HotKey+4, %dl
+ movb _m_HotKey+2, %ch
+ movw _m_HotKey, %si
+ movw %si, (%eax)
+ movb %ch, 2(%eax)
+ movb %cl, 3(%eax)
+ movb %dl, 4(%eax)
+ popl %esi
+ ret $4
+
+GCC produces:
+
+__Z9GetHotKeyv:
+ movl _m_HotKey, %edx
+ movl 4(%esp), %eax
+ movl %edx, (%eax)
+ movzwl _m_HotKey+4, %edx
+ movw %dx, 4(%eax)
+ ret $4
+
+The LLVM IR contains the needed alignment info, so we should be able to
+merge the loads and stores into 4-byte loads:
+
+ %struct.THotKey = type { i16, i8, i8, i8 }
+define void @_Z9GetHotKeyv(%struct.THotKey* sret %agg.result) nounwind {
+...
+ %tmp2 = load i16* getelementptr (@m_HotKey, i32 0, i32 0), align 8
+ %tmp5 = load i8* getelementptr (@m_HotKey, i32 0, i32 1), align 2
+ %tmp8 = load i8* getelementptr (@m_HotKey, i32 0, i32 2), align 1
+ %tmp11 = load i8* getelementptr (@m_HotKey, i32 0, i32 3), align 2
+
+Alternatively, we should use a small amount of base-offset alias analysis
+to make it so the scheduler doesn't need to hold all the loads in regs at
+once.
+
+//===---------------------------------------------------------------------===//
+
+We should extend parameter attributes to capture more information about
+pointer parameters for alias analysis. Some ideas:
+
+1. Add a "nocapture" attribute, which indicates that the callee does not store
+   the address of the parameter into a global or any other memory location
+   visible to the caller. This can be used to make basicaa and other analyses
+   more powerful. It holds for functions like memcpy and strcat, and for
+   struct arguments passed by value, most C++ references, etc.
+2. Generalize readonly to be set on parameters. This provides mod/ref
+   information for the function that basicaa and others can use. It can
+   also be used by the inliner to avoid inserting a memcpy for byval
+   arguments when the function is inlined.
+
+These attributes can be inferred by various analysis passes such as the
+globalsmodrefaa pass. Note that getting #2 right is actually really tricky.
+Consider this code:
+
+struct S { int field; };
+S G;
+void caller(S byvalarg) { G.field = 1; ... }
+void callee() { caller(G); }
+
+The fact that the caller does not modify the byval arg is not enough; we need
+to know that it doesn't modify G either. This is very tricky.
+
+//===---------------------------------------------------------------------===//
+
+We should add an FRINT node to the DAG to model targets that have legal
+implementations of ceil/floor/rint.
+
+//===---------------------------------------------------------------------===//
+
+This GCC bug: http://gcc.gnu.org/bugzilla/show_bug.cgi?id=34043
+contains a testcase that compiles down to:
+
+ %struct.XMM128 = type { <4 x float> }
+..
+ %src = alloca %struct.XMM128
+..
+ %tmp6263 = bitcast %struct.XMM128* %src to <2 x i64>*
+ %tmp65 = getelementptr %struct.XMM128* %src, i32 0, i32 0
+ store <2 x i64> %tmp5899, <2 x i64>* %tmp6263, align 16
+ %tmp66 = load <4 x float>* %tmp65, align 16
+ %tmp71 = add <4 x float> %tmp66, %tmp66
+
+If the mid-level optimizer turned the bitcast of pointer + store of tmp5899
+into a bitcast of the vector value and a store to the pointer, then the
+store->load could be easily removed.
+
+//===---------------------------------------------------------------------===//
+
+Consider:
+
+int test() {
+ long long input[8] = {1,1,1,1,1,1,1,1};
+ foo(input);
+}
+
+We currently compile this into a memcpy from a global array since the
+initializer is fairly large and not memset'able. This is good, but the memcpy
+gets lowered to load/stores in the code generator. This is also ok, except
+that the codegen lowering for memcpy doesn't handle the case when the source
+is a constant global. This gives us atrocious code like this:
+
+ call "L1$pb"
+"L1$pb":
+ popl %eax
+ movl _C.0.1444-"L1$pb"+32(%eax), %ecx
+ movl %ecx, 40(%esp)
+ movl _C.0.1444-"L1$pb"+20(%eax), %ecx
+ movl %ecx, 28(%esp)
+ movl _C.0.1444-"L1$pb"+36(%eax), %ecx
+ movl %ecx, 44(%esp)
+ movl _C.0.1444-"L1$pb"+44(%eax), %ecx
+ movl %ecx, 52(%esp)
+ movl _C.0.1444-"L1$pb"+40(%eax), %ecx
+ movl %ecx, 48(%esp)
+ movl _C.0.1444-"L1$pb"+12(%eax), %ecx
+ movl %ecx, 20(%esp)
+ movl _C.0.1444-"L1$pb"+4(%eax), %ecx
+...
+
+instead of:
+ movl $1, 16(%esp)
+ movl $0, 20(%esp)
+ movl $1, 24(%esp)
+ movl $0, 28(%esp)
+ movl $1, 32(%esp)
+ movl $0, 36(%esp)
+ ...
+
+//===---------------------------------------------------------------------===//
+
+http://llvm.org/PR717:
+The following code should compile into "ret i32 undef", since x is never 3
+and y is therefore never initialized. Instead, LLVM produces "ret i32 0":
+
+int f() {
+ int x = 4;
+ int y;
+ if (x == 3) y = 0;
+ return y;
+}
+
+//===---------------------------------------------------------------------===//
+
+The loop unroller should partially unroll loops (instead of peeling them)
+when code growth isn't too bad and when an unroll count allows simplification
+of some code within the loop. One trivial example is:
+
+#include <stdio.h>
+int main() {
+ int nRet = 17;
+ int nLoop;
+ for ( nLoop = 0; nLoop < 1000; nLoop++ ) {
+ if ( nLoop & 1 )
+ nRet += 2;
+ else
+ nRet -= 1;
+ }
+ return nRet;
+}
+
+Unrolling by 2 would eliminate the '&1' in both copies, leading to a net
+reduction in code size. The resultant code would then also be suitable for
+exit value computation.
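+
+What unrolling by 2 should produce here (hand-written sketch): the parity of
+nLoop is known in each copy, so both '&1' tests fold away:
+
+int main() {
+  int nRet = 17;
+  int nLoop;
+  for ( nLoop = 0; nLoop < 1000; nLoop += 2 ) {
+    nRet -= 1;    /* even iteration: nLoop & 1 is 0 */
+    nRet += 2;    /* odd iteration: (nLoop+1) & 1 is 1 */
+  }
+  return nRet;
+}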
+
+//===---------------------------------------------------------------------===//