add a note about object size from drystone, add a poorly optimized loop from 179...

[oota-llvm.git] / lib / Target / README.txt
diff --git a/lib/Target/README.txt b/lib/Target/README.txt

index 38c3daa9383845a6d4884b91bc703eb3dff7556a..8ed8c598ddeb203a06dd4b450e67e20776a2e8db 100644 (file)
--- a/lib/Target/README.txt
+++ b/lib/Target/README.txt
@@ -41,7 +41,17 @@ This has a number of uses:
  
  //===---------------------------------------------------------------------===//
  
-Make the PPC branch selector target independant
+We should recognized various "overflow detection" idioms and translate them into
+llvm.uadd.with.overflow and similar intrinsics.  Here is a multiply idiom:
+
+unsigned int mul(unsigned int a,unsigned int b) {
+ if ((unsigned long long)a*b>0xffffffff)
+   exit(0);
+  return a*b;
+}
+
+The legalization code for mul-with-overflow needs to be made more robust before
+this can be implemented though.
  
  //===---------------------------------------------------------------------===//
  
@@ -53,41 +63,6 @@ right).
  
  //===---------------------------------------------------------------------===//
  
-Solve this DAG isel folding deficiency:
-
-int X, Y;
-
-void fn1(void)
-{
-  X = X | (Y << 3);
-}
-
-compiles to
-
-fn1:
-       movl Y, %eax
-       shll $3, %eax
-       orl X, %eax
-       movl %eax, X
-       ret
-
-The problem is the store's chain operand is not the load X but rather
-a TokenFactor of the load X and load Y, which prevents the folding.
-
-There are two ways to fix this:
-
-1. The dag combiner can start using alias analysis to realize that y/x
-   don't alias, making the store to X not dependent on the load from Y.
-2. The generated isel could be made smarter in the case it can't
-   disambiguate the pointers.
-
-Number 1 is the preferred solution.
-
-This has been "fixed" by a TableGen hack. But that is a short term workaround
-which will be removed once the proper fix is made.
-
-//===---------------------------------------------------------------------===//
-
  On targets with expensive 64-bit multiply, we could LSR this:
  
  for (i = ...; ++i) {
@@ -156,6 +131,45 @@ void f () {  /* this can be optimized to four additions... */
  This requires reassociating to forms of expressions that are already available,
  something that reassoc doesn't think about yet.
  
+
+//===---------------------------------------------------------------------===//
+
+This function: (derived from GCC PR19988)
+double foo(double x, double y) {
+  return ((x + 0.1234 * y) * (x + -0.1234 * y));
+}
+
+compiles to:
+_foo:
+       movapd  %xmm1, %xmm2
+       mulsd   LCPI1_1(%rip), %xmm1
+       mulsd   LCPI1_0(%rip), %xmm2
+       addsd   %xmm0, %xmm1
+       addsd   %xmm0, %xmm2
+       movapd  %xmm1, %xmm0
+       mulsd   %xmm2, %xmm0
+       ret
+
+Reassociate should be able to turn it into:
+
+double foo(double x, double y) {
+  return ((x + 0.1234 * y) * (x - 0.1234 * y));
+}
+
+Which allows the multiply by constant to be CSE'd, producing:
+
+_foo:
+       mulsd   LCPI1_0(%rip), %xmm1
+       movapd  %xmm1, %xmm2
+       addsd   %xmm0, %xmm2
+       subsd   %xmm1, %xmm0
+       mulsd   %xmm2, %xmm0
+       ret
+
+This doesn't need -ffast-math support at all.  This is particularly bad because
+the llvm-gcc frontend is canonicalizing the later into the former, but clang
+doesn't have this problem.
+
  //===---------------------------------------------------------------------===//
  
  These two functions should generate the same code on big-endian systems:
@@ -224,37 +238,6 @@ if anyone cared enough about sincos.
  
  //===---------------------------------------------------------------------===//
  
-Turn this into a single byte store with no load (the other 3 bytes are
-unmodified):
-
-define void @test(i32* %P) {
-       %tmp = load i32* %P
-        %tmp14 = or i32 %tmp, 3305111552
-        %tmp15 = and i32 %tmp14, 3321888767
-        store i32 %tmp15, i32* %P
-        ret void
-}
-
-//===---------------------------------------------------------------------===//
-
-dag/inst combine "clz(x)>>5 -> x==0" for 32-bit x.
-
-Compile:
-
-int bar(int x)
-{
-  int t = __builtin_clz(x);
-  return -(t>>5);
-}
-
-to:
-
-_bar:   addic r3,r3,-1
-        subfe r3,r3,r3
-        blr
-
-//===---------------------------------------------------------------------===//
-
  quantum_sigma_x in 462.libquantum contains the following loop:
  
        for(i=0; i<reg->size; i++)
@@ -294,6 +277,8 @@ unsigned long reverse(unsigned v) {
  
  //===---------------------------------------------------------------------===//
  
+[LOOP RECOGNITION]
+
  These idioms should be recognized as popcount (see PR1488):
  
  unsigned countbits_slow(unsigned v) {
@@ -325,8 +310,7 @@ unsigned int popcount(unsigned int input) {
    return count;
  }
  
-This is a form of idiom recognition for loops, the same thing that could be
-useful for recognizing memset/memcpy.
+This sort of thing should be added to the loop idiom pass.
  
  //===---------------------------------------------------------------------===//
  
@@ -356,9 +340,25 @@ this construct.
  
  //===---------------------------------------------------------------------===//
  
-viterbi speeds up *significantly* if the various "history" related copy loops
-are turned into memcpy calls at the source level.  We need a "loops to memcpy"
-pass.
+[LOOP OPTIMIZATION]
+
+SingleSource/Benchmarks/Misc/dt.c shows several interesting optimization
+opportunities in its double_array_divs_variable function: it needs loop
+interchange, memory promotion (which LICM already does), vectorization and
+variable trip count loop unrolling (since it has a constant trip count). ICC
+apparently produces this very nice code with -ffast-math:
+
+..B1.70:                        # Preds ..B1.70 ..B1.69
+       mulpd     %xmm0, %xmm1                                  #108.2
+       mulpd     %xmm0, %xmm1                                  #108.2
+       mulpd     %xmm0, %xmm1                                  #108.2
+       mulpd     %xmm0, %xmm1                                  #108.2
+       addl      $8, %edx                                      #
+       cmpl      $131072, %edx                                 #108.2
+       jb        ..B1.70       # Prob 99%                      #108.2
+
+It would be better to count down to zero, but this is a lot better than what we
+do.
  
  //===---------------------------------------------------------------------===//
  
@@ -565,46 +565,21 @@ struct THotKey { short Key; bool Control; bool Shift; bool Alt; };
  extern THotKey m_HotKey;
  THotKey GetHotKey () { return m_HotKey; }
  
-into (-O3 -fno-exceptions -static -fomit-frame-pointer):
-
-__Z9GetHotKeyv:
-       pushl   %esi
-       movl    8(%esp), %eax
-       movb    _m_HotKey+3, %cl
-       movb    _m_HotKey+4, %dl
-       movb    _m_HotKey+2, %ch
-       movw    _m_HotKey, %si
-       movw    %si, (%eax)
-       movb    %ch, 2(%eax)
-       movb    %cl, 3(%eax)
-       movb    %dl, 4(%eax)
-       popl    %esi
-       ret     $4
-
-GCC produces:
-
-__Z9GetHotKeyv:
-       movl    _m_HotKey, %edx
-       movl    4(%esp), %eax
-       movl    %edx, (%eax)
-       movzwl  _m_HotKey+4, %edx
-       movw    %dx, 4(%eax)
-       ret     $4
-
-The LLVM IR contains the needed alignment info, so we should be able to 
-merge the loads and stores into 4-byte loads:
-
-       %struct.THotKey = type { i16, i8, i8, i8 }
-define void @_Z9GetHotKeyv(%struct.THotKey* sret  %agg.result) nounwind  {
-...
-       %tmp2 = load i16* getelementptr (@m_HotKey, i32 0, i32 0), align 8
-       %tmp5 = load i8* getelementptr (@m_HotKey, i32 0, i32 1), align 2
-       %tmp8 = load i8* getelementptr (@m_HotKey, i32 0, i32 2), align 1
-       %tmp11 = load i8* getelementptr (@m_HotKey, i32 0, i32 3), align 2
-
-Alternatively, we should use a small amount of base-offset alias analysis
-to make it so the scheduler doesn't need to hold all the loads in regs at
-once.
+into (-m64 -O3 -fno-exceptions -static -fomit-frame-pointer):
+
+__Z9GetHotKeyv:                         ## @_Z9GetHotKeyv
+       movq    _m_HotKey@GOTPCREL(%rip), %rax
+       movzwl  (%rax), %ecx
+       movzbl  2(%rax), %edx
+       shlq    $16, %rdx
+       orq     %rcx, %rdx
+       movzbl  3(%rax), %ecx
+       shlq    $24, %rcx
+       orq     %rdx, %rcx
+       movzbl  4(%rax), %eax
+       shlq    $32, %rax
+       orq     %rcx, %rax
+       ret
  
  //===---------------------------------------------------------------------===//
  
@@ -616,42 +591,35 @@ implementations of ceil/floor/rint.
  Consider:
  
  int test() {
-  long long input[8] = {1,1,1,1,1,1,1,1};
+  long long input[8] = {1,0,1,0,1,0,1,0};
    foo(input);
  }
  
-We currently compile this into a memcpy from a global array since the 
-initializer is fairly large and not memset'able.  This is good, but the memcpy
-gets lowered to load/stores in the code generator.  This is also ok, except
-that the codegen lowering for memcpy doesn't handle the case when the source
-is a constant global.  This gives us atrocious code like this:
+Clang compiles this into:
  
-       call    "L1$pb"
-"L1$pb":
-       popl    %eax
-       movl    _C.0.1444-"L1$pb"+32(%eax), %ecx
-       movl    %ecx, 40(%esp)
-       movl    _C.0.1444-"L1$pb"+20(%eax), %ecx
-       movl    %ecx, 28(%esp)
-       movl    _C.0.1444-"L1$pb"+36(%eax), %ecx
-       movl    %ecx, 44(%esp)
-       movl    _C.0.1444-"L1$pb"+44(%eax), %ecx
-       movl    %ecx, 52(%esp)
-       movl    _C.0.1444-"L1$pb"+40(%eax), %ecx
-       movl    %ecx, 48(%esp)
-       movl    _C.0.1444-"L1$pb"+12(%eax), %ecx
-       movl    %ecx, 20(%esp)
-       movl    _C.0.1444-"L1$pb"+4(%eax), %ecx
-...
+  call void @llvm.memset.p0i8.i64(i8* %tmp, i8 0, i64 64, i32 16, i1 false)
+  %0 = getelementptr [8 x i64]* %input, i64 0, i64 0
+  store i64 1, i64* %0, align 16
+  %1 = getelementptr [8 x i64]* %input, i64 0, i64 2
+  store i64 1, i64* %1, align 16
+  %2 = getelementptr [8 x i64]* %input, i64 0, i64 4
+  store i64 1, i64* %2, align 16
+  %3 = getelementptr [8 x i64]* %input, i64 0, i64 6
+  store i64 1, i64* %3, align 16
  
-instead of:
-       movl    $1, 16(%esp)
-       movl    $0, 20(%esp)
-       movl    $1, 24(%esp)
-       movl    $0, 28(%esp)
-       movl    $1, 32(%esp)
-       movl    $0, 36(%esp)
-       ...
+Which gets codegen'd into:
+
+       pxor    %xmm0, %xmm0
+       movaps  %xmm0, -16(%rbp)
+       movaps  %xmm0, -32(%rbp)
+       movaps  %xmm0, -48(%rbp)
+       movaps  %xmm0, -64(%rbp)
+       movq    $1, -64(%rbp)
+       movq    $1, -48(%rbp)
+       movq    $1, -32(%rbp)
+       movq    $1, -16(%rbp)
+
+It would be better to have 4 movq's of 0 instead of the movaps's.
  
  //===---------------------------------------------------------------------===//
  
@@ -697,20 +665,6 @@ etc.  On X86, we miss a bunch of 'rotate by variable' cases because the rotate
  matching code in dag combine doesn't look through truncates aggressively 
  enough.  Here are some testcases reduces from GCC PR17886:
  
-unsigned long long f(unsigned long long x, int y) {
-  return (x << y) | (x >> 64-y); 
-} 
-unsigned f2(unsigned x, int y){
-  return (x << y) | (x >> 32-y); 
-} 
-unsigned long long f3(unsigned long long x){
-  int y = 9;
-  return (x << y) | (x >> 64-y); 
-} 
-unsigned f4(unsigned x){
-  int y = 10;
-  return (x << y) | (x >> 32-y); 
-}
  unsigned long long f5(unsigned long long x, unsigned long long y) {
    return (x << 8) | ((y >> 48) & 0xffull);
  }
@@ -729,10 +683,50 @@ unsigned long long f6(unsigned long long x, unsigned long long y, int z) {
    }
  }
  
-On X86-64, we only handle f2/f3/f4 right.  On x86-32, a few of these 
-generate truly horrible code, instead of using shld and friends.  On
-ARM, we end up with calls to L___lshrdi3/L___ashldi3 in f, which is
-badness.  PPC64 misses f, f5 and f6.  CellSPU aborts in isel.
+//===---------------------------------------------------------------------===//
+
+This (and similar related idioms):
+
+unsigned int foo(unsigned char i) {
+  return i | (i<<8) | (i<<16) | (i<<24);
+} 
+
+compiles into:
+
+define i32 @foo(i8 zeroext %i) nounwind readnone ssp noredzone {
+entry:
+  %conv = zext i8 %i to i32
+  %shl = shl i32 %conv, 8
+  %shl5 = shl i32 %conv, 16
+  %shl9 = shl i32 %conv, 24
+  %or = or i32 %shl9, %conv
+  %or6 = or i32 %or, %shl5
+  %or10 = or i32 %or6, %shl
+  ret i32 %or10
+}
+
+it would be better as:
+
+unsigned int bar(unsigned char i) {
+  unsigned int j=i | (i << 8); 
+  return j | (j<<16);
+}
+
+aka:
+
+define i32 @bar(i8 zeroext %i) nounwind readnone ssp noredzone {
+entry:
+  %conv = zext i8 %i to i32
+  %shl = shl i32 %conv, 8
+  %or = or i32 %shl, %conv
+  %shl5 = shl i32 %or, 16
+  %or6 = or i32 %shl5, %or
+  ret i32 %or6
+}
+
+or even i*0x01010101, depending on the speed of the multiplier.  The best way to
+handle this is to canonicalize it to a multiply in IR and have codegen handle
+lowering multiplies to shifts on cpus where shifts are faster.
  
  //===---------------------------------------------------------------------===//
  
@@ -864,27 +858,6 @@ The expression should optimize to something like
  
  //===---------------------------------------------------------------------===//
  
-From GCC Bug 3756:
-int
-pn (int n)
-{
- return (n >= 0 ? 1 : -1);
-}
-Should combine to (n >> 31) | 1.  Currently not optimized with "clang
--emit-llvm-bc | opt -std-compile-opts | llc".
-
-//===---------------------------------------------------------------------===//
-
-void a(int variable)
-{
- if (variable == 4 || variable == 6)
-   bar();
-}
-This should optimize to "if ((variable | 2) == 6)".  Currently not
-optimized with "clang -emit-llvm-bc | opt -std-compile-opts | llc".
-
-//===---------------------------------------------------------------------===//
-
  unsigned int f(unsigned int i, unsigned int n) {++i; if (i == n) ++i; return
  i;}
  unsigned int f2(unsigned int i, unsigned int n) {++i; i += i == n; return i;}
@@ -1002,18 +975,6 @@ Should also combine to x | 8.  Currently not optimized with "clang
  
  //===---------------------------------------------------------------------===//
  
-int a(int x) {return (x & 8) == 0 ? -1 : -9;}
-Should combine to (x | -9) ^ 8.  Currently not optimized with "clang
--emit-llvm-bc | opt -std-compile-opts".
-
-//===---------------------------------------------------------------------===//
-
-int a(int x) {return (x & 8) == 0 ? -9 : -1;}
-Should combine to x | -9.  Currently not optimized with "clang
--emit-llvm-bc | opt -std-compile-opts".
-
-//===---------------------------------------------------------------------===//
-
  int a(int x) {return ((x | -9) ^ 8) & x;}
  Should combine to x & -9.  Currently not optimized with "clang
  -emit-llvm-bc | opt -std-compile-opts".
@@ -1114,6 +1075,77 @@ int test (int a, int b, int c, int g) {
  It would be better to do the mul once to reduce codesize above the if.
  This is GCC PR38204.
  
+
+//===---------------------------------------------------------------------===//
+This simple function from 179.art:
+
+int winner, numf2s;
+struct { double y; int   reset; } *Y;
+
+void find_match() {
+   int i;
+   winner = 0;
+   for (i=0;i<numf2s;i++)
+       if (Y[i].y > Y[winner].y)
+              winner =i;
+}
+
+Compiles into (with clang TBAA):
+
+for.body:                                         ; preds = %for.inc, %bb.nph
+  %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.inc ]
+  %i.01718 = phi i32 [ 0, %bb.nph ], [ %i.01719, %for.inc ]
+  %tmp4 = getelementptr inbounds %struct.anon* %tmp3, i64 %indvar, i32 0
+  %tmp5 = load double* %tmp4, align 8, !tbaa !4
+  %idxprom7 = sext i32 %i.01718 to i64
+  %tmp10 = getelementptr inbounds %struct.anon* %tmp3, i64 %idxprom7, i32 0
+  %tmp11 = load double* %tmp10, align 8, !tbaa !4
+  %cmp12 = fcmp ogt double %tmp5, %tmp11
+  br i1 %cmp12, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %i.017 = trunc i64 %indvar to i32
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %i.01719 = phi i32 [ %i.01718, %for.body ], [ %i.017, %if.then ]
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp eq i64 %indvar.next, %tmp22
+  br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body
+
+
+It is good that we hoisted the reloads of numf2's, and Y out of the loop and
+sunk the store to winner out.
+
+However, this is awful on several levels: the conditional truncate in the loop
+(-indvars at fault? why can't we completely promote the IV to i64?).
+
+Beyond that, we have a partially redundant load in the loop: if "winner" (aka 
+%i.01718) isn't updated, we reload Y[winner].y the next time through the loop.
+Similarly, the addressing that feeds it (including the sext) is redundant. In
+the end we get this generated assembly:
+
+LBB0_2:                                 ## %for.body
+                                        ## =>This Inner Loop Header: Depth=1
+       movsd   (%rdi), %xmm0
+       movslq  %edx, %r8
+       shlq    $4, %r8
+       ucomisd (%rcx,%r8), %xmm0
+       jbe     LBB0_4
+       movl    %esi, %edx
+LBB0_4:                                 ## %for.inc
+       addq    $16, %rdi
+       incq    %rsi
+       cmpq    %rsi, %rax
+       jne     LBB0_2
+
+All things considered this isn't too bad, but we shouldn't need the movslq or
+the shlq instruction, or the load folded into ucomisd every time through the
+loop.
+
+On an x86-specific topic, if the loop can't be restructure, the movl should be a
+cmov.
+
  //===---------------------------------------------------------------------===//
  
  [STORE SINKING]
@@ -1185,6 +1217,29 @@ loadpre14.c loadpre15.c
  
  actually a conditional increment: loadpre18.c loadpre19.c
  
+//===---------------------------------------------------------------------===//
+
+[LOAD PRE / STORE SINKING / SPEC HACK]
+
+This is a chunk of code from 456.hmmer:
+
+int f(int M, int *mc, int *mpp, int *tpmm, int *ip, int *tpim, int *dpp,
+     int *tpdm, int xmb, int *bp, int *ms) {
+ int k, sc;
+ for (k = 1; k <= M; k++) {
+     mc[k] = mpp[k-1]   + tpmm[k-1];
+     if ((sc = ip[k-1]  + tpim[k-1]) > mc[k])  mc[k] = sc;
+     if ((sc = dpp[k-1] + tpdm[k-1]) > mc[k])  mc[k] = sc;
+     if ((sc = xmb  + bp[k])         > mc[k])  mc[k] = sc;
+     mc[k] += ms[k];
+   }
+}
+
+It is very profitable for this benchmark to turn the conditional stores to mc[k]
+into a conditional move (select instr in IR) and allow the final store to do the
+store.  See GCC PR27313 for more details.  Note that this is valid to xform even
+with the new C++ memory model, since mc[k] is previously loaded and later
+stored.
  
  //===---------------------------------------------------------------------===//
  
@@ -1218,9 +1273,16 @@ store->load.
  
  //===---------------------------------------------------------------------===//
  
+[ALIAS ANALYSIS]
+
  Type based alias analysis:
  http://gcc.gnu.org/bugzilla/show_bug.cgi?id=14705
  
+We should do better analysis of posix_memalign.  At the least it should
+no-capture its pointer argument, at best, we should know that the out-value
+result doesn't point to anything (like malloc).  One example of this is in
+SingleSource/Benchmarks/Misc/dt.c
+
  //===---------------------------------------------------------------------===//
  
  A/B get pinned to the stack because we turn an if/then into a select instead
@@ -1270,12 +1332,6 @@ void foo (int a, struct T b)
  
  simplifylibcalls should do several optimizations for strspn/strcspn:
  
-strcspn(x, "") -> strlen(x)
-strcspn("", x) -> 0
-strspn("", x) -> 0
-strspn(x, "") -> strlen(x)
-strspn(x, "a") -> strchr(x, 'a')-x
-
  strcspn(x, "a") -> inlined loop for up to 3 letters (similarly for strspn):
  
  size_t __strcspn_c3 (__const char *__s, int __reject1, int __reject2,
@@ -1315,14 +1371,7 @@ Those should be turned into a switch.
          
  This is interesting for a couple reasons.  First, in this:
  
-        %3073 = call i8* @strcpy(i8* %3072, i8* %3071) nounwind
-        %strlen = call i32 @strlen(i8* %3072)  
-
-The strlen could be replaced with: %strlen = sub %3072, %3073, because the
-strcpy call returns a pointer to the end of the string.  Based on that, the
-endptr GEP just becomes equal to 3073, which eliminates a strlen call and GEP.
-
-Second, the memcpy+strlen strlen can be replaced with:
+The memcpy+strlen strlen can be replaced with:
  
          %3074 = call i32 @strlen([5 x i8]* @"\01LC42") nounwind readonly 
  
@@ -1398,45 +1447,6 @@ This pattern repeats several times, basically doing:
  
  //===---------------------------------------------------------------------===//
  
-186.crafty contains this interesting pattern:
-
-%77 = call i8* @strstr(i8* getelementptr ([6 x i8]* @"\01LC5", i32 0, i32 0),
-                       i8* %30)
-%phitmp648 = icmp eq i8* %77, getelementptr ([6 x i8]* @"\01LC5", i32 0, i32 0)
-br i1 %phitmp648, label %bb70, label %bb76
-
-bb70:           ; preds = %OptionMatch.exit91, %bb69
-        %78 = call i32 @strlen(i8* %30) nounwind readonly align 1               ; <i32> [#uses=1]
-
-This is basically:
-  cststr = "abcdef";
-  if (strstr(cststr, P) == cststr) {
-     x = strlen(P);
-     ...
-
-The strstr call would be significantly cheaper written as:
-
-cststr = "abcdef";
-if (memcmp(P, str, strlen(P)))
-  x = strlen(P);
-
-This is memcmp+strlen instead of strstr.  This also makes the strlen fully
-redundant.
-
-//===---------------------------------------------------------------------===//
-
-186.crafty also contains this code:
-
-%1906 = call i32 @strlen(i8* getelementptr ([32 x i8]* @pgn_event, i32 0,i32 0))
-%1907 = getelementptr [32 x i8]* @pgn_event, i32 0, i32 %1906
-%1908 = call i8* @strcpy(i8* %1907, i8* %1905) nounwind align 1
-%1909 = call i32 @strlen(i8* getelementptr ([32 x i8]* @pgn_event, i32 0,i32 0))
-%1910 = getelementptr [32 x i8]* @pgn_event, i32 0, i32 %1909         
-
-The last strlen is computable as 1908-@pgn_event, which means 1910=1908.
-
-//===---------------------------------------------------------------------===//
-
  186.crafty has this interesting pattern with the "out.4543" variable:
  
  call void @llvm.memcpy.i32(
@@ -1498,22 +1508,6 @@ the float directly.
  
  //===---------------------------------------------------------------------===//
  
-#include <math.h>
-double foo(double a) {    return sin(a); }
-
-This compiles into this on x86-64 Linux:
-foo:
-       subq    $8, %rsp
-       call    sin
-       addq    $8, %rsp
-       ret
-vs:
-
-foo:
-        jmp sin
-
-//===---------------------------------------------------------------------===//
-
  The arg promotion pass should make use of nocapture to make its alias analysis
  stuff much more precise.
  
@@ -1648,36 +1642,7 @@ would delete the or instruction for us.
  
  //===---------------------------------------------------------------------===//
  
-FunctionAttrs is not marking this function as readnone (just readonly):
-$ clang t.c -emit-llvm -S -o - -O0 | opt -mem2reg -S -functionattrs
-
-int t(int a, int b, int c) {
- int *p;
- if (a)
-   p = &a;
- else
-   p = &c;
- return *p;
-}
-
-This is because we codegen this to:
-
-define i32 @t(i32 %a, i32 %b, i32 %c) nounwind readonly ssp {
-entry:
-  %a.addr = alloca i32                            ; <i32*> [#uses=3]
-  %c.addr = alloca i32                            ; <i32*> [#uses=2]
-...
-
-if.end:
-  %p.0 = phi i32* [ %a.addr, %if.then ], [ %c.addr, %if.else ]
-  %tmp2 = load i32* %p.0                          ; <i32> [#uses=1]
-  ret i32 %tmp2
-}
-
-And functionattrs doesn't realize that the p.0 load points to function local
-memory.
-
-Also, functionattrs doesn't know about memcpy/memset.  This function should be
+functionattrs doesn't know much about memcpy/memset.  This function should be
  marked readnone rather than readonly, since it only twiddles local memory, but
  functionattrs doesn't handle memset/memcpy/memmove aggressively:
  
@@ -1692,5 +1657,427 @@ int foo() {
   return **p;
  }
  
+This can be seen at:
+$ clang t.c -S -o - -mkernel -O0 -emit-llvm | opt -functionattrs -S
+
+
+//===---------------------------------------------------------------------===//
+
+Missed instcombine transformation:
+define i1 @a(i32 %x) nounwind readnone {
+entry:
+  %cmp = icmp eq i32 %x, 30
+  %sub = add i32 %x, -30
+  %cmp2 = icmp ugt i32 %sub, 9
+  %or = or i1 %cmp, %cmp2
+  ret i1 %or
+}
+This should be optimized to a single compare.  Testcase derived from gcc.
+
+//===---------------------------------------------------------------------===//
+
+Missed instcombine or reassociate transformation:
+int a(int a, int b) { return (a==12)&(b>47)&(b<58); }
+
+The sgt and slt should be combined into a single comparison. Testcase derived
+from gcc.
+
+//===---------------------------------------------------------------------===//
+
+Missed instcombine transformation:
+
+  %382 = srem i32 %tmp14.i, 64                    ; [#uses=1]
+  %383 = zext i32 %382 to i64                     ; [#uses=1]
+  %384 = shl i64 %381, %383                       ; [#uses=1]
+  %385 = icmp slt i32 %tmp14.i, 64                ; [#uses=1]
+
+The srem can be transformed to an and because if %tmp14.i is negative, the
+shift is undefined.  Testcase derived from 403.gcc.
+
+//===---------------------------------------------------------------------===//
+
+This is a range comparison on a divided result (from 403.gcc):
+
+  %1337 = sdiv i32 %1336, 8                       ; [#uses=1]
+  %.off.i208 = add i32 %1336, 7                   ; [#uses=1]
+  %1338 = icmp ult i32 %.off.i208, 15             ; [#uses=1]
+  
+We already catch this (removing the sdiv) if there isn't an add, we should
+handle the 'add' as well.  This is a common idiom with it's builtin_alloca code.
+C testcase:
+
+int a(int x) { return (unsigned)(x/16+7) < 15; }
+
+Another similar case involves truncations on 64-bit targets:
+
+  %361 = sdiv i64 %.046, 8                        ; [#uses=1]
+  %362 = trunc i64 %361 to i32                    ; [#uses=2]
+...
+  %367 = icmp eq i32 %362, 0                      ; [#uses=1]
+
+//===---------------------------------------------------------------------===//
+
+Missed instcombine/dagcombine transformation:
+define void @lshift_lt(i8 zeroext %a) nounwind {
+entry:
+  %conv = zext i8 %a to i32
+  %shl = shl i32 %conv, 3
+  %cmp = icmp ult i32 %shl, 33
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  tail call void @bar() nounwind
+  ret void
+
+if.end:
+  ret void
+}
+declare void @bar() nounwind
+
+The shift should be eliminated.  Testcase derived from gcc.
+
+//===---------------------------------------------------------------------===//
+
+These compile into different code, one gets recognized as a switch and the
+other doesn't due to phase ordering issues (PR6212):
+
+int test1(int mainType, int subType) {
+  if (mainType == 7)
+    subType = 4;
+  else if (mainType == 9)
+    subType = 6;
+  else if (mainType == 11)
+    subType = 9;
+  return subType;
+}
+
+int test2(int mainType, int subType) {
+  if (mainType == 7)
+    subType = 4;
+  if (mainType == 9)
+    subType = 6;
+  if (mainType == 11)
+    subType = 9;
+  return subType;
+}
+
+//===---------------------------------------------------------------------===//
+
+The following test case (from PR6576):
+
+define i32 @mul(i32 %a, i32 %b) nounwind readnone {
+entry:
+ %cond1 = icmp eq i32 %b, 0                      ; <i1> [#uses=1]
+ br i1 %cond1, label %exit, label %bb.nph
+bb.nph:                                           ; preds = %entry
+ %tmp = mul i32 %b, %a                           ; <i32> [#uses=1]
+ ret i32 %tmp
+exit:                                             ; preds = %entry
+ ret i32 0
+}
+
+could be reduced to:
+
+define i32 @mul(i32 %a, i32 %b) nounwind readnone {
+entry:
+ %tmp = mul i32 %b, %a
+ ret i32 %tmp
+}
+
+//===---------------------------------------------------------------------===//
+
+We should use DSE + llvm.lifetime.end to delete dead vtable pointer updates.
+See GCC PR34949
+
+Another interesting case is that something related could be used for variables
+that go const after their ctor has finished.  In these cases, globalopt (which
+can statically run the constructor) could mark the global const (so it gets put
+in the readonly section).  A testcase would be:
+
+#include <complex>
+using namespace std;
+const complex<char> should_be_in_rodata (42,-42);
+complex<char> should_be_in_data (42,-42);
+complex<char> should_be_in_bss;
+
+Where we currently evaluate the ctors but the globals don't become const because
+the optimizer doesn't know they "become const" after the ctor is done.  See
+GCC PR4131 for more examples.
+
+//===---------------------------------------------------------------------===//
+
+In this code:
+
+long foo(long x) {
+  return x > 1 ? x : 1;
+}
+
+LLVM emits a comparison with 1 instead of 0. 0 would be equivalent
+and cheaper on most targets.
+
+LLVM prefers comparisons with zero over non-zero in general, but in this
+case it choses instead to keep the max operation obvious.
+
+//===---------------------------------------------------------------------===//
+
+Take the following testcase on x86-64 (similar testcases exist for all targets
+with addc/adde):
+
+define void @a(i64* nocapture %s, i64* nocapture %t, i64 %a, i64 %b,
+i64 %c) nounwind {
+entry:
+ %0 = zext i64 %a to i128                        ; <i128> [#uses=1]
+ %1 = zext i64 %b to i128                        ; <i128> [#uses=1]
+ %2 = add i128 %1, %0                            ; <i128> [#uses=2]
+ %3 = zext i64 %c to i128                        ; <i128> [#uses=1]
+ %4 = shl i128 %3, 64                            ; <i128> [#uses=1]
+ %5 = add i128 %4, %2                            ; <i128> [#uses=1]
+ %6 = lshr i128 %5, 64                           ; <i128> [#uses=1]
+ %7 = trunc i128 %6 to i64                       ; <i64> [#uses=1]
+ store i64 %7, i64* %s, align 8
+ %8 = trunc i128 %2 to i64                       ; <i64> [#uses=1]
+ store i64 %8, i64* %t, align 8
+ ret void
+}
+
+Generated code:
+       addq    %rcx, %rdx
+       movl    $0, %eax
+       adcq    $0, %rax
+       addq    %r8, %rax
+       movq    %rax, (%rdi)
+       movq    %rdx, (%rsi)
+       ret
+
+Expected code:
+       addq    %rcx, %rdx
+       adcq    $0, %r8
+       movq    %r8, (%rdi)
+       movq    %rdx, (%rsi)
+       ret
+
+The generated SelectionDAG has an ADD of an ADDE, where both operands of the
+ADDE are zero. Replacing one of the operands of the ADDE with the other operand
+of the ADD, and replacing the ADD with the ADDE, should give the desired result.
+
+(That said, we are doing a lot better than gcc on this testcase. :) )
+
+//===---------------------------------------------------------------------===//
+
+Switch lowering generates less than ideal code for the following switch:
+define void @a(i32 %x) nounwind {
+entry:
+  switch i32 %x, label %if.end [
+    i32 0, label %if.then
+    i32 1, label %if.then
+    i32 2, label %if.then
+    i32 3, label %if.then
+    i32 5, label %if.then
+  ]
+if.then:
+  tail call void @foo() nounwind
+  ret void
+if.end:
+  ret void
+}
+declare void @foo()
+
+Generated code on x86-64 (other platforms give similar results):
+a:
+       cmpl    $5, %edi
+       ja      .LBB0_2
+       movl    %edi, %eax
+       movl    $47, %ecx
+       btq     %rax, %rcx
+       jb      .LBB0_3
+.LBB0_2:
+       ret
+.LBB0_3:
+       jmp     foo  # TAILCALL
+
+The movl+movl+btq+jb could be simplified to a cmpl+jne.
+
+Or, if we wanted to be really clever, we could simplify the whole thing to
+something like the following, which eliminates a branch:
+       xorl    $1, %edi
+       cmpl    $4, %edi
+       ja      .LBB0_2
+       ret
+.LBB0_2:
+       jmp     foo  # TAILCALL
+//===---------------------------------------------------------------------===//
+Given a branch where the two target blocks are identical ("ret i32 %b" in
+both), simplifycfg will simplify them away. But not so for a switch statement:
+
+define i32 @f(i32 %a, i32 %b) nounwind readnone {
+entry:
+        switch i32 %a, label %bb3 [
+                i32 4, label %bb
+                i32 6, label %bb
+        ]
+
+bb:             ; preds = %entry, %entry
+        ret i32 %b
+
+bb3:            ; preds = %entry
+        ret i32 %b
+}
+//===---------------------------------------------------------------------===//
+
+clang -O3 fails to devirtualize this virtual inheritance case: (GCC PR45875)
+Looks related to PR3100
+
+struct c1 {};
+struct c10 : c1{
+  virtual void foo ();
+};
+struct c11 : c10, c1{
+  virtual void f6 ();
+};
+struct c28 : virtual c11{
+  void f6 ();
+};
+void check_c28 () {
+  c28 obj;
+  c11 *ptr = &obj;
+  ptr->f6 ();
+}
+
+//===---------------------------------------------------------------------===//
+
+We compile this:
+
+int foo(int a) { return (a & (~15)) / 16; }
+
+Into:
+
+define i32 @foo(i32 %a) nounwind readnone ssp {
+entry:
+  %and = and i32 %a, -16
+  %div = sdiv i32 %and, 16
+  ret i32 %div
+}
+
+but this code (X & -A)/A is X >> log2(A) when A is a power of 2, so this case
+should be instcombined into just "a >> 4".
+
+We do get this at the codegen level, so something knows about it, but 
+instcombine should catch it earlier:
+
+_foo:                                   ## @foo
+## BB#0:                                ## %entry
+       movl    %edi, %eax
+       sarl    $4, %eax
+       ret
+
+//===---------------------------------------------------------------------===//
+
+This code (from GCC PR28685):
+
+int test(int a, int b) {
+  int lt = a < b;
+  int eq = a == b;
+  if (lt)
+    return 1;
+  return eq;
+}
+
+Is compiled to:
+
+define i32 @test(i32 %a, i32 %b) nounwind readnone ssp {
+entry:
+  %cmp = icmp slt i32 %a, %b
+  br i1 %cmp, label %return, label %if.end
+
+if.end:                                           ; preds = %entry
+  %cmp5 = icmp eq i32 %a, %b
+  %conv6 = zext i1 %cmp5 to i32
+  ret i32 %conv6
+
+return:                                           ; preds = %entry
+  ret i32 1
+}
+
+it could be:
+
+define i32 @test__(i32 %a, i32 %b) nounwind readnone ssp {
+entry:
+  %0 = icmp sle i32 %a, %b
+  %retval = zext i1 %0 to i32
+  ret i32 %retval
+}
+
+//===---------------------------------------------------------------------===//
+
+This compare could fold to false:
+
+define i1 @g(i32 a) nounwind readnone {
+       %add = shl i32 %a, 1
+       %mul = shl i32 %a, 1
+       %cmp = icmp ugt i32 %add, %mul
+       ret i1 %cmp
+}
+
+//===---------------------------------------------------------------------===//
+
+This code can be seen in viterbi:
+
+  %64 = call noalias i8* @malloc(i64 %62) nounwind
+...
+  %67 = call i64 @llvm.objectsize.i64(i8* %64, i1 false) nounwind
+  %68 = call i8* @__memset_chk(i8* %64, i32 0, i64 %62, i64 %67) nounwind
+
+llvm.objectsize.i64 should be taught about malloc/calloc, allowing it to
+fold to %62.  This is a security win (overflows of malloc will get caught)
+and also a performance win by exposing more memsets to the optimizer.
+
+This occurs several times in viterbi.
+
+Stuff like this occurs in drystone:
+
+  %call5 = call i8* @malloc(i32 48) optsize
+  %5 = getelementptr inbounds i8* %call5, i32 16
+  %6 = call i32 @llvm.objectsize.i32(i8* %5, i1 false)
+
+We should be able to constant fold that.
+
+//===---------------------------------------------------------------------===//
+
+This code (from Benchmarks/Dhrystone/dry.c):
+
+define i32 @Func1(i32, i32) nounwind readnone optsize ssp {
+entry:
+  %sext = shl i32 %0, 24
+  %conv = ashr i32 %sext, 24
+  %sext6 = shl i32 %1, 24
+  %conv4 = ashr i32 %sext6, 24
+  %cmp = icmp eq i32 %conv, %conv4
+  %. = select i1 %cmp, i32 10000, i32 0
+  ret i32 %.
+}
+
+Should be simplified into something like:
+
+define i32 @Func1(i32, i32) nounwind readnone optsize ssp {
+entry:
+  %sext = shl i32 %0, 24
+  %conv = and i32 %sext, 0xFF000000
+  %sext6 = shl i32 %1, 24
+  %conv4 = and i32 %sext6, 0xFF000000
+  %cmp = icmp eq i32 %conv, %conv4
+  %. = select i1 %cmp, i32 10000, i32 0
+  ret i32 %.
+}
+
+and then to:
+
+define i32 @Func1(i32, i32) nounwind readnone optsize ssp {
+entry:
+  %conv = and i32 %0, 0xFF
+  %conv4 = and i32 %1, 0xFF
+  %cmp = icmp eq i32 %conv, %conv4
+  %. = select i1 %cmp, i32 10000, i32 0
+  ret i32 %.
+}
  //===---------------------------------------------------------------------===//