lib/Target/README.txt

   1 Target Independent Opportunities:
   2
   3 //===---------------------------------------------------------------------===//
   4
   5 With the recent changes to make the implicit def/use set explicit in
   6 machineinstrs, we should change the target descriptions for 'call' instructions
   7 so that the .td files don't list all the call-clobbered registers as implicit
   8 defs.  Instead, these should be added by the code generator (e.g. on the dag).
   9
  10 This has a number of uses:
  11
  12 1. PPC32/64 and X86 32/64 can avoid having multiple copies of call instructions
  13    for their different impdef sets.
  14 2. Targets with multiple calling convs (e.g. x86) which have different clobber
  15    sets don't need copies of call instructions.
  16 3. 'Interprocedural register allocation' can be done to reduce the clobber sets
  17    of calls.
  18
  19 //===---------------------------------------------------------------------===//
  20
  21 Make the PPC branch selector target independant
  22
  23 //===---------------------------------------------------------------------===//
  24
  25 Get the C front-end to expand hypot(x,y) -> llvm.sqrt(x*x+y*y) when errno and
  26 precision don't matter (ffastmath).  Misc/mandel will like this. :)  This isn't
  27 safe in general, even on darwin.  See the libm implementation of hypot for
  28 examples (which special case when x/y are exactly zero to get signed zeros etc
  29 right).
  30
  31 //===---------------------------------------------------------------------===//
  32
  33 Solve this DAG isel folding deficiency:
  34
  35 int X, Y;
  36
  37 void fn1(void)
  38 {
  39   X = X | (Y << 3);
  40 }
  41
  42 compiles to
  43
  44 fn1:
  45         movl Y, %eax
  46         shll $3, %eax
  47         orl X, %eax
  48         movl %eax, X
  49         ret
  50
  51 The problem is the store's chain operand is not the load X but rather
  52 a TokenFactor of the load X and load Y, which prevents the folding.
  53
  54 There are two ways to fix this:
  55
  56 1. The dag combiner can start using alias analysis to realize that y/x
  57    don't alias, making the store to X not dependent on the load from Y.
  58 2. The generated isel could be made smarter in the case it can't
  59    disambiguate the pointers.
  60
  61 Number 1 is the preferred solution.
  62
  63 This has been "fixed" by a TableGen hack. But that is a short term workaround
  64 which will be removed once the proper fix is made.
  65
  66 //===---------------------------------------------------------------------===//
  67
  68 On targets with expensive 64-bit multiply, we could LSR this:
  69
  70 for (i = ...; ++i) {
  71    x = 1ULL << i;
  72
  73 into:
  74  long long tmp = 1;
  75  for (i = ...; ++i, tmp+=tmp)
  76    x = tmp;
  77
  78 This would be a win on ppc32, but not x86 or ppc64.
  79
  80 //===---------------------------------------------------------------------===//
  81
  82 Shrink: (setlt (loadi32 P), 0) -> (setlt (loadi8 Phi), 0)
  83
  84 //===---------------------------------------------------------------------===//
  85
  86 Reassociate should turn: X*X*X*X -> t=(X*X) (t*t) to eliminate a multiply.
  87
  88 //===---------------------------------------------------------------------===//
  89
  90 Interesting? testcase for add/shift/mul reassoc:
  91
  92 int bar(int x, int y) {
  93   return x*x*x+y+x*x*x*x*x*y*y*y*y;
  94 }
  95 int foo(int z, int n) {
  96   return bar(z, n) + bar(2*z, 2*n);
  97 }
  98
  99 Reassociate should handle the example in GCC PR16157.
 100
 101 //===---------------------------------------------------------------------===//
 102
 103 These two functions should generate the same code on big-endian systems:
 104
 105 int g(int *j,int *l)  {  return memcmp(j,l,4);  }
 106 int h(int *j, int *l) {  return *j - *l; }
 107
 108 this could be done in SelectionDAGISel.cpp, along with other special cases,
 109 for 1,2,4,8 bytes.
 110
 111 //===---------------------------------------------------------------------===//
 112
 113 It would be nice to revert this patch:
 114 http://lists.cs.uiuc.edu/pipermail/llvm-commits/Week-of-Mon-20060213/031986.html
 115
 116 And teach the dag combiner enough to simplify the code expanded before
 117 legalize.  It seems plausible that this knowledge would let it simplify other
 118 stuff too.
 119
 120 //===---------------------------------------------------------------------===//
 121
 122 For vector types, TargetData.cpp::getTypeInfo() returns alignment that is equal
 123 to the type size. It works but can be overly conservative as the alignment of
 124 specific vector types are target dependent.
 125
 126 //===---------------------------------------------------------------------===//
 127
 128 We should produce an unaligned load from code like this:
 129
 130 v4sf example(float *P) {
 131   return (v4sf){P[0], P[1], P[2], P[3] };
 132 }
 133
 134 //===---------------------------------------------------------------------===//
 135
 136 Add support for conditional increments, and other related patterns.  Instead
 137 of:
 138
 139         movl 136(%esp), %eax
 140         cmpl $0, %eax
 141         je LBB16_2      #cond_next
 142 LBB16_1:        #cond_true
 143         incl _foo
 144 LBB16_2:        #cond_next
 145
 146 emit:
 147         movl    _foo, %eax
 148         cmpl    $1, %edi
 149         sbbl    $-1, %eax
 150         movl    %eax, _foo
 151
 152 //===---------------------------------------------------------------------===//
 153
 154 Combine: a = sin(x), b = cos(x) into a,b = sincos(x).
 155
 156 Expand these to calls of sin/cos and stores:
 157       double sincos(double x, double *sin, double *cos);
 158       float sincosf(float x, float *sin, float *cos);
 159       long double sincosl(long double x, long double *sin, long double *cos);
 160
 161 Doing so could allow SROA of the destination pointers.  See also:
 162 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=17687
 163
 164 This is now easily doable with MRVs.  We could even make an intrinsic for this
 165 if anyone cared enough about sincos.
 166
 167 //===---------------------------------------------------------------------===//
 168
 169 Turn this into a single byte store with no load (the other 3 bytes are
 170 unmodified):
 171
 172 define void @test(i32* %P) {
 173         %tmp = load i32* %P
 174         %tmp14 = or i32 %tmp, 3305111552
 175         %tmp15 = and i32 %tmp14, 3321888767
 176         store i32 %tmp15, i32* %P
 177         ret void
 178 }
 179
 180 //===---------------------------------------------------------------------===//
 181
 182 dag/inst combine "clz(x)>>5 -> x==0" for 32-bit x.
 183
 184 Compile:
 185
 186 int bar(int x)
 187 {
 188   int t = __builtin_clz(x);
 189   return -(t>>5);
 190 }
 191
 192 to:
 193
 194 _bar:   addic r3,r3,-1
 195         subfe r3,r3,r3
 196         blr
 197
 198 //===---------------------------------------------------------------------===//
 199
 200 Legalize should lower ctlz like this:
 201   ctlz(x) = popcnt((x-1) & ~x)
 202
 203 on targets that have popcnt but not ctlz.  itanium, what else?
 204
 205 //===---------------------------------------------------------------------===//
 206
 207 quantum_sigma_x in 462.libquantum contains the following loop:
 208
 209       for(i=0; i<reg->size; i++)
 210         {
 211           /* Flip the target bit of each basis state */
 212           reg->node[i].state ^= ((MAX_UNSIGNED) 1 << target);
 213         }
 214
 215 Where MAX_UNSIGNED/state is a 64-bit int.  On a 32-bit platform it would be just
 216 so cool to turn it into something like:
 217
 218    long long Res = ((MAX_UNSIGNED) 1 << target);
 219    if (target < 32) {
 220      for(i=0; i<reg->size; i++)
 221        reg->node[i].state ^= Res & 0xFFFFFFFFULL;
 222    } else {
 223      for(i=0; i<reg->size; i++)
 224        reg->node[i].state ^= Res & 0xFFFFFFFF00000000ULL
 225    }
 226
 227 ... which would only do one 32-bit XOR per loop iteration instead of two.
 228
 229 It would also be nice to recognize the reg->size doesn't alias reg->node[i], but
 230 alas...
 231
 232 //===---------------------------------------------------------------------===//
 233
 234 This isn't recognized as bswap by instcombine (yes, it really is bswap):
 235
 236 unsigned long reverse(unsigned v) {
 237     unsigned t;
 238     t = v ^ ((v << 16) | (v >> 16));
 239     t &= ~0xff0000;
 240     v = (v << 24) | (v >> 8);
 241     return v ^ (t >> 8);
 242 }
 243
 244 //===---------------------------------------------------------------------===//
 245
 246 These idioms should be recognized as popcount (see PR1488):
 247
 248 unsigned countbits_slow(unsigned v) {
 249   unsigned c;
 250   for (c = 0; v; v >>= 1)
 251     c += v & 1;
 252   return c;
 253 }
 254 unsigned countbits_fast(unsigned v){
 255   unsigned c;
 256   for (c = 0; v; c++)
 257     v &= v - 1; // clear the least significant bit set
 258   return c;
 259 }
 260
 261 BITBOARD = unsigned long long
 262 int PopCnt(register BITBOARD a) {
 263   register int c=0;
 264   while(a) {
 265     c++;
 266     a &= a - 1;
 267   }
 268   return c;
 269 }
 270 unsigned int popcount(unsigned int input) {
 271   unsigned int count = 0;
 272   for (unsigned int i =  0; i < 4 * 8; i++)
 273     count += (input >> i) & i;
 274   return count;
 275 }
 276
 277 //===---------------------------------------------------------------------===//
 278
 279 These should turn into single 16-bit (unaligned?) loads on little/big endian
 280 processors.
 281
 282 unsigned short read_16_le(const unsigned char *adr) {
 283   return adr[0] | (adr[1] << 8);
 284 }
 285 unsigned short read_16_be(const unsigned char *adr) {
 286   return (adr[0] << 8) | adr[1];
 287 }
 288
 289 //===---------------------------------------------------------------------===//
 290
 291 -instcombine should handle this transform:
 292    icmp pred (sdiv X / C1 ), C2
 293 when X, C1, and C2 are unsigned.  Similarly for udiv and signed operands.
 294
 295 Currently InstCombine avoids this transform but will do it when the signs of
 296 the operands and the sign of the divide match. See the FIXME in
 297 InstructionCombining.cpp in the visitSetCondInst method after the switch case
 298 for Instruction::UDiv (around line 4447) for more details.
 299
 300 The SingleSource/Benchmarks/Shootout-C++/hash and hash2 tests have examples of
 301 this construct.
 302
 303 //===---------------------------------------------------------------------===//
 304
 305 viterbi speeds up *significantly* if the various "history" related copy loops
 306 are turned into memcpy calls at the source level.  We need a "loops to memcpy"
 307 pass.
 308
 309 //===---------------------------------------------------------------------===//
 310
 311 Consider:
 312
 313 typedef unsigned U32;
 314 typedef unsigned long long U64;
 315 int test (U32 *inst, U64 *regs) {
 316     U64 effective_addr2;
 317     U32 temp = *inst;
 318     int r1 = (temp >> 20) & 0xf;
 319     int b2 = (temp >> 16) & 0xf;
 320     effective_addr2 = temp & 0xfff;
 321     if (b2) effective_addr2 += regs[b2];
 322     b2 = (temp >> 12) & 0xf;
 323     if (b2) effective_addr2 += regs[b2];
 324     effective_addr2 &= regs[4];
 325      if ((effective_addr2 & 3) == 0)
 326         return 1;
 327     return 0;
 328 }
 329
 330 Note that only the low 2 bits of effective_addr2 are used.  On 32-bit systems,
 331 we don't eliminate the computation of the top half of effective_addr2 because
 332 we don't have whole-function selection dags.  On x86, this means we use one
 333 extra register for the function when effective_addr2 is declared as U64 than
 334 when it is declared U32.
 335
 336 //===---------------------------------------------------------------------===//
 337
 338 Promote for i32 bswap can use i64 bswap + shr.  Useful on targets with 64-bit
 339 regs and bswap, like itanium.
 340
 341 //===---------------------------------------------------------------------===//
 342
 343 LSR should know what GPR types a target has.  This code:
 344
 345 volatile short X, Y; // globals
 346
 347 void foo(int N) {
 348   int i;
 349   for (i = 0; i < N; i++) { X = i; Y = i*4; }
 350 }
 351
 352 produces two identical IV's (after promotion) on PPC/ARM:
 353
 354 LBB1_1: @bb.preheader
 355         mov r3, #0
 356         mov r2, r3
 357         mov r1, r3
 358 LBB1_2: @bb
 359         ldr r12, LCPI1_0
 360         ldr r12, [r12]
 361         strh r2, [r12]
 362         ldr r12, LCPI1_1
 363         ldr r12, [r12]
 364         strh r3, [r12]
 365         add r1, r1, #1    <- [0,+,1]
 366         add r3, r3, #4
 367         add r2, r2, #1    <- [0,+,1]
 368         cmp r1, r0
 369         bne LBB1_2      @bb
 370
 371
 372 //===---------------------------------------------------------------------===//
 373
 374 Tail call elim should be more aggressive, checking to see if the call is
 375 followed by an uncond branch to an exit block.
 376
 377 ; This testcase is due to tail-duplication not wanting to copy the return
 378 ; instruction into the terminating blocks because there was other code
 379 ; optimized out of the function after the taildup happened.
 380 ; RUN: llvm-as < %s | opt -tailcallelim | llvm-dis | not grep call
 381
 382 define i32 @t4(i32 %a) {
 383 entry:
 384         %tmp.1 = and i32 %a, 1          ; <i32> [#uses=1]
 385         %tmp.2 = icmp ne i32 %tmp.1, 0          ; <i1> [#uses=1]
 386         br i1 %tmp.2, label %then.0, label %else.0
 387
 388 then.0:         ; preds = %entry
 389         %tmp.5 = add i32 %a, -1         ; <i32> [#uses=1]
 390         %tmp.3 = call i32 @t4( i32 %tmp.5 )             ; <i32> [#uses=1]
 391         br label %return
 392
 393 else.0:         ; preds = %entry
 394         %tmp.7 = icmp ne i32 %a, 0              ; <i1> [#uses=1]
 395         br i1 %tmp.7, label %then.1, label %return
 396
 397 then.1:         ; preds = %else.0
 398         %tmp.11 = add i32 %a, -2                ; <i32> [#uses=1]
 399         %tmp.9 = call i32 @t4( i32 %tmp.11 )            ; <i32> [#uses=1]
 400         br label %return
 401
 402 return:         ; preds = %then.1, %else.0, %then.0
 403         %result.0 = phi i32 [ 0, %else.0 ], [ %tmp.3, %then.0 ],
 404                             [ %tmp.9, %then.1 ]
 405         ret i32 %result.0
 406 }
 407
 408 //===---------------------------------------------------------------------===//
 409
 410 Tail recursion elimination is not transforming this function, because it is
 411 returning n, which fails the isDynamicConstant check in the accumulator
 412 recursion checks.
 413
 414 long long fib(const long long n) {
 415   switch(n) {
 416     case 0:
 417     case 1:
 418       return n;
 419     default:
 420       return fib(n-1) + fib(n-2);
 421   }
 422 }
 423
 424 //===---------------------------------------------------------------------===//
 425
 426 Tail recursion elimination should handle:
 427
 428 int pow2m1(int n) {
 429  if (n == 0)
 430    return 0;
 431  return 2 * pow2m1 (n - 1) + 1;
 432 }
 433
 434 Also, multiplies can be turned into SHL's, so they should be handled as if
 435 they were associative.  "return foo() << 1" can be tail recursion eliminated.
 436
 437 //===---------------------------------------------------------------------===//
 438
 439 Argument promotion should promote arguments for recursive functions, like
 440 this:
 441
 442 ; RUN: llvm-as < %s | opt -argpromotion | llvm-dis | grep x.val
 443
 444 define internal i32 @foo(i32* %x) {
 445 entry:
 446         %tmp = load i32* %x             ; <i32> [#uses=0]
 447         %tmp.foo = call i32 @foo( i32* %x )             ; <i32> [#uses=1]
 448         ret i32 %tmp.foo
 449 }
 450
 451 define i32 @bar(i32* %x) {
 452 entry:
 453         %tmp3 = call i32 @foo( i32* %x )                ; <i32> [#uses=1]
 454         ret i32 %tmp3
 455 }
 456
 457 //===---------------------------------------------------------------------===//
 458
 459 "basicaa" should know how to look through "or" instructions that act like add
 460 instructions.  For example in this code, the x*4+1 is turned into x*4 | 1, and
 461 basicaa can't analyze the array subscript, leading to duplicated loads in the
 462 generated code:
 463
 464 void test(int X, int Y, int a[]) {
 465 int i;
 466   for (i=2; i<1000; i+=4) {
 467   a[i+0] = a[i-1+0]*a[i-2+0];
 468   a[i+1] = a[i-1+1]*a[i-2+1];
 469   a[i+2] = a[i-1+2]*a[i-2+2];
 470   a[i+3] = a[i-1+3]*a[i-2+3];
 471   }
 472 }
 473
 474 BasicAA also doesn't do this for add.  It needs to know that &A[i+1] != &A[i].
 475
 476 //===---------------------------------------------------------------------===//
 477
 478 We should investigate an instruction sinking pass.  Consider this silly
 479 example in pic mode:
 480
 481 #include <assert.h>
 482 void foo(int x) {
 483   assert(x);
 484   //...
 485 }
 486
 487 we compile this to:
 488 _foo:
 489         subl    $28, %esp
 490         call    "L1$pb"
 491 "L1$pb":
 492         popl    %eax
 493         cmpl    $0, 32(%esp)
 494         je      LBB1_2  # cond_true
 495 LBB1_1: # return
 496         # ...
 497         addl    $28, %esp
 498         ret
 499 LBB1_2: # cond_true
 500 ...
 501
 502 The PIC base computation (call+popl) is only used on one path through the
 503 code, but is currently always computed in the entry block.  It would be
 504 better to sink the picbase computation down into the block for the
 505 assertion, as it is the only one that uses it.  This happens for a lot of
 506 code with early outs.
 507
 508 Another example is loads of arguments, which are usually emitted into the
 509 entry block on targets like x86.  If not used in all paths through a
 510 function, they should be sunk into the ones that do.
 511
 512 In this case, whole-function-isel would also handle this.
 513
 514 //===---------------------------------------------------------------------===//
 515
 516 Investigate lowering of sparse switch statements into perfect hash tables:
 517 http://burtleburtle.net/bob/hash/perfect.html
 518
 519 //===---------------------------------------------------------------------===//
 520
 521 We should turn things like "load+fabs+store" and "load+fneg+store" into the
 522 corresponding integer operations.  On a yonah, this loop:
 523
 524 double a[256];
 525 void foo() {
 526   int i, b;
 527   for (b = 0; b < 10000000; b++)
 528   for (i = 0; i < 256; i++)
 529     a[i] = -a[i];
 530 }
 531
 532 is twice as slow as this loop:
 533
 534 long long a[256];
 535 void foo() {
 536   int i, b;
 537   for (b = 0; b < 10000000; b++)
 538   for (i = 0; i < 256; i++)
 539     a[i] ^= (1ULL << 63);
 540 }
 541
 542 and I suspect other processors are similar.  On X86 in particular this is a
 543 big win because doing this with integers allows the use of read/modify/write
 544 instructions.
 545
 546 //===---------------------------------------------------------------------===//
 547
 548 DAG Combiner should try to combine small loads into larger loads when
 549 profitable.  For example, we compile this C++ example:
 550
 551 struct THotKey { short Key; bool Control; bool Shift; bool Alt; };
 552 extern THotKey m_HotKey;
 553 THotKey GetHotKey () { return m_HotKey; }
 554
 555 into (-O3 -fno-exceptions -static -fomit-frame-pointer):
 556
 557 __Z9GetHotKeyv:
 558         pushl   %esi
 559         movl    8(%esp), %eax
 560         movb    _m_HotKey+3, %cl
 561         movb    _m_HotKey+4, %dl
 562         movb    _m_HotKey+2, %ch
 563         movw    _m_HotKey, %si
 564         movw    %si, (%eax)
 565         movb    %ch, 2(%eax)
 566         movb    %cl, 3(%eax)
 567         movb    %dl, 4(%eax)
 568         popl    %esi
 569         ret     $4
 570
 571 GCC produces:
 572
 573 __Z9GetHotKeyv:
 574         movl    _m_HotKey, %edx
 575         movl    4(%esp), %eax
 576         movl    %edx, (%eax)
 577         movzwl  _m_HotKey+4, %edx
 578         movw    %dx, 4(%eax)
 579         ret     $4
 580
 581 The LLVM IR contains the needed alignment info, so we should be able to
 582 merge the loads and stores into 4-byte loads:
 583
 584         %struct.THotKey = type { i16, i8, i8, i8 }
 585 define void @_Z9GetHotKeyv(%struct.THotKey* sret  %agg.result) nounwind  {
 586 ...
 587         %tmp2 = load i16* getelementptr (@m_HotKey, i32 0, i32 0), align 8
 588         %tmp5 = load i8* getelementptr (@m_HotKey, i32 0, i32 1), align 2
 589         %tmp8 = load i8* getelementptr (@m_HotKey, i32 0, i32 2), align 1
 590         %tmp11 = load i8* getelementptr (@m_HotKey, i32 0, i32 3), align 2
 591
 592 Alternatively, we should use a small amount of base-offset alias analysis
 593 to make it so the scheduler doesn't need to hold all the loads in regs at
 594 once.
 595
 596 //===---------------------------------------------------------------------===//
 597
 598 We should add an FRINT node to the DAG to model targets that have legal
 599 implementations of ceil/floor/rint.
 600
 601 //===---------------------------------------------------------------------===//
 602
 603 This GCC bug: http://gcc.gnu.org/bugzilla/show_bug.cgi?id=34043
 604 contains a testcase that compiles down to:
 605
 606         %struct.XMM128 = type { <4 x float> }
 607 ..
 608         %src = alloca %struct.XMM128
 609 ..
 610         %tmp6263 = bitcast %struct.XMM128* %src to <2 x i64>*
 611         %tmp65 = getelementptr %struct.XMM128* %src, i32 0, i32 0
 612         store <2 x i64> %tmp5899, <2 x i64>* %tmp6263, align 16
 613         %tmp66 = load <4 x float>* %tmp65, align 16
 614         %tmp71 = add <4 x float> %tmp66, %tmp66
 615
 616 If the mid-level optimizer turned the bitcast of pointer + store of tmp5899
 617 into a bitcast of the vector value and a store to the pointer, then the
 618 store->load could be easily removed.
 619
 620 //===---------------------------------------------------------------------===//
 621
 622 Consider:
 623
 624 int test() {
 625   long long input[8] = {1,1,1,1,1,1,1,1};
 626   foo(input);
 627 }
 628
 629 We currently compile this into a memcpy from a global array since the
 630 initializer is fairly large and not memset'able.  This is good, but the memcpy
 631 gets lowered to load/stores in the code generator.  This is also ok, except
 632 that the codegen lowering for memcpy doesn't handle the case when the source
 633 is a constant global.  This gives us atrocious code like this:
 634
 635         call    "L1$pb"
 636 "L1$pb":
 637         popl    %eax
 638         movl    _C.0.1444-"L1$pb"+32(%eax), %ecx
 639         movl    %ecx, 40(%esp)
 640         movl    _C.0.1444-"L1$pb"+20(%eax), %ecx
 641         movl    %ecx, 28(%esp)
 642         movl    _C.0.1444-"L1$pb"+36(%eax), %ecx
 643         movl    %ecx, 44(%esp)
 644         movl    _C.0.1444-"L1$pb"+44(%eax), %ecx
 645         movl    %ecx, 52(%esp)
 646         movl    _C.0.1444-"L1$pb"+40(%eax), %ecx
 647         movl    %ecx, 48(%esp)
 648         movl    _C.0.1444-"L1$pb"+12(%eax), %ecx
 649         movl    %ecx, 20(%esp)
 650         movl    _C.0.1444-"L1$pb"+4(%eax), %ecx
 651 ...
 652
 653 instead of:
 654         movl    $1, 16(%esp)
 655         movl    $0, 20(%esp)
 656         movl    $1, 24(%esp)
 657         movl    $0, 28(%esp)
 658         movl    $1, 32(%esp)
 659         movl    $0, 36(%esp)
 660         ...
 661
 662 //===---------------------------------------------------------------------===//
 663
 664 http://llvm.org/PR717:
 665
 666 The following code should compile into "ret int undef". Instead, LLVM
 667 produces "ret int 0":
 668
 669 int f() {
 670   int x = 4;
 671   int y;
 672   if (x == 3) y = 0;
 673   return y;
 674 }
 675
 676 //===---------------------------------------------------------------------===//
 677
 678 The loop unroller should partially unroll loops (instead of peeling them)
 679 when code growth isn't too bad and when an unroll count allows simplification
 680 of some code within the loop.  One trivial example is:
 681
 682 #include <stdio.h>
 683 int main() {
 684     int nRet = 17;
 685     int nLoop;
 686     for ( nLoop = 0; nLoop < 1000; nLoop++ ) {
 687         if ( nLoop & 1 )
 688             nRet += 2;
 689         else
 690             nRet -= 1;
 691     }
 692     return nRet;
 693 }
 694
 695 Unrolling by 2 would eliminate the '&1' in both copies, leading to a net
 696 reduction in code size.  The resultant code would then also be suitable for
 697 exit value computation.
 698
 699 //===---------------------------------------------------------------------===//
 700
 701 We miss a bunch of rotate opportunities on various targets, including ppc, x86,
 702 etc.  On X86, we miss a bunch of 'rotate by variable' cases because the rotate
 703 matching code in dag combine doesn't look through truncates aggressively
 704 enough.  Here are some testcases reduces from GCC PR17886:
 705
 706 unsigned long long f(unsigned long long x, int y) {
 707   return (x << y) | (x >> 64-y);
 708 }
 709 unsigned f2(unsigned x, int y){
 710   return (x << y) | (x >> 32-y);
 711 }
 712 unsigned long long f3(unsigned long long x){
 713   int y = 9;
 714   return (x << y) | (x >> 64-y);
 715 }
 716 unsigned f4(unsigned x){
 717   int y = 10;
 718   return (x << y) | (x >> 32-y);
 719 }
 720 unsigned long long f5(unsigned long long x, unsigned long long y) {
 721   return (x << 8) | ((y >> 48) & 0xffull);
 722 }
 723 unsigned long long f6(unsigned long long x, unsigned long long y, int z) {
 724   switch(z) {
 725   case 1:
 726     return (x << 8) | ((y >> 48) & 0xffull);
 727   case 2:
 728     return (x << 16) | ((y >> 40) & 0xffffull);
 729   case 3:
 730     return (x << 24) | ((y >> 32) & 0xffffffull);
 731   case 4:
 732     return (x << 32) | ((y >> 24) & 0xffffffffull);
 733   default:
 734     return (x << 40) | ((y >> 16) & 0xffffffffffull);
 735   }
 736 }
 737
 738 On X86-64, we only handle f2/f3/f4 right.  On x86-32, a few of these
 739 generate truly horrible code, instead of using shld and friends.  On
 740 ARM, we end up with calls to L___lshrdi3/L___ashldi3 in f, which is
 741 badness.  PPC64 misses f, f5 and f6.  CellSPU aborts in isel.
 742
 743 //===---------------------------------------------------------------------===//
 744
 745 We do a number of simplifications in simplify libcalls to strength reduce
 746 standard library functions, but we don't currently merge them together.  For
 747 example, it is useful to merge memcpy(a,b,strlen(b)) -> strcpy.  This can only
 748 be done safely if "b" isn't modified between the strlen and memcpy of course.
 749
 750 //===---------------------------------------------------------------------===//
 751
 752 Reassociate should turn things like:
 753
 754 int factorial(int X) {
 755  return X*X*X*X*X*X*X*X;
 756 }
 757
 758 into llvm.powi calls, allowing the code generator to produce balanced
 759 multiplication trees.
 760
 761 //===---------------------------------------------------------------------===//
 762
 763 We generate a horrible  libcall for llvm.powi.  For example, we compile:
 764
 765 #include <cmath>
 766 double f(double a) { return std::pow(a, 4); }
 767
 768 into:
 769
 770 __Z1fd:
 771         subl    $12, %esp
 772         movsd   16(%esp), %xmm0
 773         movsd   %xmm0, (%esp)
 774         movl    $4, 8(%esp)
 775         call    L___powidf2$stub
 776         addl    $12, %esp
 777         ret
 778
 779 GCC produces:
 780
 781 __Z1fd:
 782         subl    $12, %esp
 783         movsd   16(%esp), %xmm0
 784         mulsd   %xmm0, %xmm0
 785         mulsd   %xmm0, %xmm0
 786         movsd   %xmm0, (%esp)
 787         fldl    (%esp)
 788         addl    $12, %esp
 789         ret
 790
 791 //===---------------------------------------------------------------------===//
 792
 793 We compile this program: (from GCC PR11680)
 794 http://gcc.gnu.org/bugzilla/attachment.cgi?id=4487
 795
 796 Into code that runs the same speed in fast/slow modes, but both modes run 2x
 797 slower than when compile with GCC (either 4.0 or 4.2):
 798
 799 $ llvm-g++ perf.cpp -O3 -fno-exceptions
 800 $ time ./a.out fast
 801 1.821u 0.003s 0:01.82 100.0%    0+0k 0+0io 0pf+0w
 802
 803 $ g++ perf.cpp -O3 -fno-exceptions
 804 $ time ./a.out fast
 805 0.821u 0.001s 0:00.82 100.0%    0+0k 0+0io 0pf+0w
 806
 807 It looks like we are making the same inlining decisions, so this may be raw
 808 codegen badness or something else (haven't investigated).
 809
 810 //===---------------------------------------------------------------------===//
 811
 812 We miss some instcombines for stuff like this:
 813 void bar (void);
 814 void foo (unsigned int a) {
 815   /* This one is equivalent to a >= (3 << 2).  */
 816   if ((a >> 2) >= 3)
 817     bar ();
 818 }
 819
 820 A few other related ones are in GCC PR14753.
 821
 822 //===---------------------------------------------------------------------===//
 823
 824 Divisibility by constant can be simplified (according to GCC PR12849) from
 825 being a mulhi to being a mul lo (cheaper).  Testcase:
 826
 827 void bar(unsigned n) {
 828   if (n % 3 == 0)
 829     true();
 830 }
 831
 832 I think this basically amounts to a dag combine to simplify comparisons against
 833 multiply hi's into a comparison against the mullo.
 834
 835 //===---------------------------------------------------------------------===//
 836
 837 Better mod/ref analysis for scanf would allow us to eliminate the vtable and a
 838 bunch of other stuff from this example (see PR1604):
 839
 840 #include <cstdio>
 841 struct test {
 842     int val;
 843     virtual ~test() {}
 844 };
 845
 846 int main() {
 847     test t;
 848     std::scanf("%d", &t.val);
 849     std::printf("%d\n", t.val);
 850 }
 851
 852 //===---------------------------------------------------------------------===//
 853
 854 Instcombine will merge comparisons like (x >= 10) && (x < 20) by producing (x -
 855 10) u< 10, but only when the comparisons have matching sign.
 856
 857 This could be converted with a similiar technique. (PR1941)
 858
 859 define i1 @test(i8 %x) {
 860   %A = icmp uge i8 %x, 5
 861   %B = icmp slt i8 %x, 20
 862   %C = and i1 %A, %B
 863   ret i1 %C
 864 }
 865
 866 //===---------------------------------------------------------------------===//
 867
 868 These functions perform the same computation, but produce different assembly.
 869
 870 define i8 @select(i8 %x) readnone nounwind {
 871   %A = icmp ult i8 %x, 250
 872   %B = select i1 %A, i8 0, i8 1
 873   ret i8 %B
 874 }
 875
 876 define i8 @addshr(i8 %x) readnone nounwind {
 877   %A = zext i8 %x to i9
 878   %B = add i9 %A, 6       ;; 256 - 250 == 6
 879   %C = lshr i9 %B, 8
 880   %D = trunc i9 %C to i8
 881   ret i8 %D
 882 }
 883
 884 //===---------------------------------------------------------------------===//
 885
 886 From gcc bug 24696:
 887 int
 888 f (unsigned long a, unsigned long b, unsigned long c)
 889 {
 890   return ((a & (c - 1)) != 0) || ((b & (c - 1)) != 0);
 891 }
 892 int
 893 f (unsigned long a, unsigned long b, unsigned long c)
 894 {
 895   return ((a & (c - 1)) != 0) | ((b & (c - 1)) != 0);
 896 }
 897 Both should combine to ((a|b) & (c-1)) != 0.  Currently not optimized with
 898 "clang -emit-llvm-bc | opt -std-compile-opts".
 899
 900 //===---------------------------------------------------------------------===//
 901
 902 From GCC Bug 20192:
 903 #define PMD_MASK    (~((1UL << 23) - 1))
 904 void clear_pmd_range(unsigned long start, unsigned long end)
 905 {
 906    if (!(start & ~PMD_MASK) && !(end & ~PMD_MASK))
 907        f();
 908 }
 909 The expression should optimize to something like
 910 "!((start|end)&~PMD_MASK). Currently not optimized with "clang
 911 -emit-llvm-bc | opt -std-compile-opts".
 912
 913 //===---------------------------------------------------------------------===//
 914
 915 From GCC Bug 15241:
 916 unsigned int
 917 foo (unsigned int a, unsigned int b)
 918 {
 919  if (a <= 7 && b <= 7)
 920    baz ();
 921 }
 922 Should combine to "(a|b) <= 7".  Currently not optimized with "clang
 923 -emit-llvm-bc | opt -std-compile-opts".
 924
 925 //===---------------------------------------------------------------------===//
 926
 927 From GCC Bug 3756:
 928 int
 929 pn (int n)
 930 {
 931  return (n >= 0 ? 1 : -1);
 932 }
 933 Should combine to (n >> 31) | 1.  Currently not optimized with "clang
 934 -emit-llvm-bc | opt -std-compile-opts | llc".
 935
 936 //===---------------------------------------------------------------------===//
 937
 938 From GCC Bug 28685:
 939 int test(int a, int b)
 940 {
 941  int lt = a < b;
 942  int eq = a == b;
 943
 944  return (lt || eq);
 945 }
 946 Should combine to "a <= b".  Currently not optimized with "clang
 947 -emit-llvm-bc | opt -std-compile-opts | llc".
 948
 949 //===---------------------------------------------------------------------===//
 950
 951 void a(int variable)
 952 {
 953  if (variable == 4 || variable == 6)
 954    bar();
 955 }
 956 This should optimize to "if ((variable | 2) == 6)".  Currently not
 957 optimized with "clang -emit-llvm-bc | opt -std-compile-opts | llc".
 958
 959 //===---------------------------------------------------------------------===//
 960
 961 unsigned int f(unsigned int i, unsigned int n) {++i; if (i == n) ++i; return
 962 i;}
 963 unsigned int f2(unsigned int i, unsigned int n) {++i; i += i == n; return i;}
 964 These should combine to the same thing.  Currently, the first function
 965 produces better code on X86.
 966
 967 //===---------------------------------------------------------------------===//
 968
 969 From GCC Bug 15784:
 970 #define abs(x) x>0?x:-x
 971 int f(int x, int y)
 972 {
 973  return (abs(x)) >= 0;
 974 }
 975 This should optimize to x == INT_MIN. (With -fwrapv.)  Currently not
 976 optimized with "clang -emit-llvm-bc | opt -std-compile-opts".
 977
 978 //===---------------------------------------------------------------------===//
 979
 980 From GCC Bug 14753:
 981 void
 982 rotate_cst (unsigned int a)
 983 {
 984  a = (a << 10) | (a >> 22);
 985  if (a == 123)
 986    bar ();
 987 }
 988 void
 989 minus_cst (unsigned int a)
 990 {
 991  unsigned int tem;
 992
 993  tem = 20 - a;
 994  if (tem == 5)
 995    bar ();
 996 }
 997 void
 998 mask_gt (unsigned int a)
 999 {
1000  /* This is equivalent to a > 15.  */
1001  if ((a & ~7) > 8)
1002    bar ();
1003 }
1004 void
1005 rshift_gt (unsigned int a)
1006 {
1007  /* This is equivalent to a > 23.  */
1008  if ((a >> 2) > 5)
1009    bar ();
1010 }
1011 All should simplify to a single comparison.  All of these are
1012 currently not optimized with "clang -emit-llvm-bc | opt
1013 -std-compile-opts".
1014
1015 //===---------------------------------------------------------------------===//
1016
1017 From GCC Bug 32605:
1018 int c(int* x) {return (char*)x+2 == (char*)x;}
1019 Should combine to 0.  Currently not optimized with "clang
1020 -emit-llvm-bc | opt -std-compile-opts" (although llc can optimize it).
1021
1022 //===---------------------------------------------------------------------===//
1023
1024 int a(unsigned char* b) {return *b > 99;}
1025 There's an unnecessary zext in the generated code with "clang
1026 -emit-llvm-bc | opt -std-compile-opts".
1027
1028 //===---------------------------------------------------------------------===//
1029
1030 int a(unsigned b) {return ((b << 31) | (b << 30)) >> 31;}
1031 Should be combined to  "((b >> 1) | b) & 1".  Currently not optimized
1032 with "clang -emit-llvm-bc | opt -std-compile-opts".
1033
1034 //===---------------------------------------------------------------------===//
1035
1036 unsigned a(unsigned x, unsigned y) { return x | (y & 1) | (y & 2);}
1037 Should combine to "x | (y & 3)".  Currently not optimized with "clang
1038 -emit-llvm-bc | opt -std-compile-opts".
1039
1040 //===---------------------------------------------------------------------===//
1041
1042 unsigned a(unsigned a) {return ((a | 1) & 3) | (a & -4);}
1043 Should combine to "a | 1".  Currently not optimized with "clang
1044 -emit-llvm-bc | opt -std-compile-opts".
1045
1046 //===---------------------------------------------------------------------===//
1047
1048 int a(int a, int b, int c) {return (~a & c) | ((c|a) & b);}
1049 Should fold to "(~a & c) | (a & b)".  Currently not optimized with
1050 "clang -emit-llvm-bc | opt -std-compile-opts".
1051
1052 //===---------------------------------------------------------------------===//
1053
1054 int a(int a,int b) {return (~(a|b))|a;}
1055 Should fold to "a|~b".  Currently not optimized with "clang
1056 -emit-llvm-bc | opt -std-compile-opts".
1057
1058 //===---------------------------------------------------------------------===//
1059
1060 int a(int a, int b) {return (a&&b) || (a&&!b);}
1061 Should fold to "a".  Currently not optimized with "clang -emit-llvm-bc
1062 | opt -std-compile-opts".
1063
1064 //===---------------------------------------------------------------------===//
1065
1066 int a(int a, int b, int c) {return (a&&b) || (!a&&c);}
1067 Should fold to "a ? b : c", or at least something sane.  Currently not
1068 optimized with "clang -emit-llvm-bc | opt -std-compile-opts".
1069
1070 //===---------------------------------------------------------------------===//
1071
1072 int a(int a, int b, int c) {return (a&&b) || (a&&c) || (a&&b&&c);}
1073 Should fold to a && (b || c).  Currently not optimized with "clang
1074 -emit-llvm-bc | opt -std-compile-opts".
1075
1076 //===---------------------------------------------------------------------===//
1077
1078 int a(int x) {return x | ((x & 8) ^ 8);}
1079 Should combine to x | 8.  Currently not optimized with "clang
1080 -emit-llvm-bc | opt -std-compile-opts".
1081
1082 //===---------------------------------------------------------------------===//
1083
1084 int a(int x) {return x ^ ((x & 8) ^ 8);}
1085 Should also combine to x | 8.  Currently not optimized with "clang
1086 -emit-llvm-bc | opt -std-compile-opts".
1087
1088 //===---------------------------------------------------------------------===//
1089
1090 int a(int x) {return (x & 8) == 0 ? -1 : -9;}
1091 Should combine to (x | -9) ^ 8.  Currently not optimized with "clang
1092 -emit-llvm-bc | opt -std-compile-opts".
1093
1094 //===---------------------------------------------------------------------===//
1095
1096 int a(int x) {return (x & 8) == 0 ? -9 : -1;}
1097 Should combine to x | -9.  Currently not optimized with "clang
1098 -emit-llvm-bc | opt -std-compile-opts".
1099
1100 //===---------------------------------------------------------------------===//
1101
1102 int a(int x) {return ((x | -9) ^ 8) & x;}
1103 Should combine to x & -9.  Currently not optimized with "clang
1104 -emit-llvm-bc | opt -std-compile-opts".
1105
1106 //===---------------------------------------------------------------------===//
1107
1108 unsigned a(unsigned a) {return a * 0x11111111 >> 28 & 1;}
1109 Should combine to "a * 0x88888888 >> 31".  Currently not optimized
1110 with "clang -emit-llvm-bc | opt -std-compile-opts".
1111
1112 //===---------------------------------------------------------------------===//
1113
1114 unsigned a(char* x) {if ((*x & 32) == 0) return b();}
1115 There's an unnecessary zext in the generated code with "clang
1116 -emit-llvm-bc | opt -std-compile-opts".
1117
1118 //===---------------------------------------------------------------------===//
1119
1120 unsigned a(unsigned long long x) {return 40 * (x >> 1);}
1121 Should combine to "20 * (((unsigned)x) & -2)".  Currently not
1122 optimized with "clang -emit-llvm-bc | opt -std-compile-opts".
1123
1124 //===---------------------------------------------------------------------===//
1125
1126 We would like to do the following transform in the instcombiner:
1127
1128   -X/C -> X/-C
1129
1130 However, this isn't valid if (-X) overflows. We can implement this when we
1131 have the concept of a "C signed subtraction" operator that which is undefined
1132 on overflow.
1133
1134 //===---------------------------------------------------------------------===//
1135
1136 This was noticed in the entryblock for grokdeclarator in 403.gcc:
1137
1138         %tmp = icmp eq i32 %decl_context, 4
1139         %decl_context_addr.0 = select i1 %tmp, i32 3, i32 %decl_context
1140         %tmp1 = icmp eq i32 %decl_context_addr.0, 1
1141         %decl_context_addr.1 = select i1 %tmp1, i32 0, i32 %decl_context_addr.0
1142
1143 tmp1 should be simplified to something like:
1144   (!tmp || decl_context == 1)
1145
1146 This allows recursive simplifications, tmp1 is used all over the place in
1147 the function, e.g. by:
1148
1149         %tmp23 = icmp eq i32 %decl_context_addr.1, 0            ; <i1> [#uses=1]
1150         %tmp24 = xor i1 %tmp1, true             ; <i1> [#uses=1]
1151         %or.cond8 = and i1 %tmp23, %tmp24               ; <i1> [#uses=1]
1152
1153 later.
1154
1155 //===---------------------------------------------------------------------===//
1156
1157 Store sinking: This code:
1158
1159 void f (int n, int *cond, int *res) {
1160     int i;
1161     *res = 0;
1162     for (i = 0; i < n; i++)
1163         if (*cond)
1164             *res ^= 234; /* (*) */
1165 }
1166
1167 On this function GVN hoists the fully redundant value of *res, but nothing
1168 moves the store out.  This gives us this code:
1169
1170 bb:             ; preds = %bb2, %entry
1171         %.rle = phi i32 [ 0, %entry ], [ %.rle6, %bb2 ]
1172         %i.05 = phi i32 [ 0, %entry ], [ %indvar.next, %bb2 ]
1173         %1 = load i32* %cond, align 4
1174         %2 = icmp eq i32 %1, 0
1175         br i1 %2, label %bb2, label %bb1
1176
1177 bb1:            ; preds = %bb
1178         %3 = xor i32 %.rle, 234
1179         store i32 %3, i32* %res, align 4
1180         br label %bb2
1181
1182 bb2:            ; preds = %bb, %bb1
1183         %.rle6 = phi i32 [ %3, %bb1 ], [ %.rle, %bb ]
1184         %indvar.next = add i32 %i.05, 1
1185         %exitcond = icmp eq i32 %indvar.next, %n
1186         br i1 %exitcond, label %return, label %bb
1187
1188 DSE should sink partially dead stores to get the store out of the loop.
1189
1190 Here's another partial dead case:
1191 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=12395
1192
1193 //===---------------------------------------------------------------------===//
1194
1195 Scalar PRE hoists the mul in the common block up to the else:
1196
1197 int test (int a, int b, int c, int g) {
1198   int d, e;
1199   if (a)
1200     d = b * c;
1201   else
1202     d = b - c;
1203   e = b * c + g;
1204   return d + e;
1205 }
1206
1207 It would be better to do the mul once to reduce codesize above the if.
1208 This is GCC PR38204.
1209
1210 //===---------------------------------------------------------------------===//
1211
1212 GCC PR37810 is an interesting case where we should sink load/store reload
1213 into the if block and outside the loop, so we don't reload/store it on the
1214 non-call path.
1215
1216 for () {
1217   *P += 1;
1218   if ()
1219     call();
1220   else
1221     ...
1222 ->
1223 tmp = *P
1224 for () {
1225   tmp += 1;
1226   if () {
1227     *P = tmp;
1228     call();
1229     tmp = *P;
1230   } else ...
1231 }
1232 *P = tmp;
1233
1234 We now hoist the reload after the call (Transforms/GVN/lpre-call-wrap.ll), but
1235 we don't sink the store.  We need partially dead store sinking.
1236
1237 //===---------------------------------------------------------------------===//
1238
1239 [PHI TRANSLATE GEPs]
1240
1241 GCC PR37166: Sinking of loads prevents SROA'ing the "g" struct on the stack
1242 leading to excess stack traffic. This could be handled by GVN with some crazy
1243 symbolic phi translation.  The code we get looks like (g is on the stack):
1244
1245 bb2:            ; preds = %bb1
1246 ..
1247         %9 = getelementptr %struct.f* %g, i32 0, i32 0
1248         store i32 %8, i32* %9, align  bel %bb3
1249
1250 bb3:            ; preds = %bb1, %bb2, %bb
1251         %c_addr.0 = phi %struct.f* [ %g, %bb2 ], [ %c, %bb ], [ %c, %bb1 ]
1252         %b_addr.0 = phi %struct.f* [ %b, %bb2 ], [ %g, %bb ], [ %b, %bb1 ]
1253         %10 = getelementptr %struct.f* %c_addr.0, i32 0, i32 0
1254         %11 = load i32* %10, align 4
1255
1256 %11 is fully redundant, an in BB2 it should have the value %8.
1257
1258 GCC PR33344 is a similar case.
1259
1260 //===---------------------------------------------------------------------===//
1261
1262 There are many load PRE testcases in testsuite/gcc.dg/tree-ssa/loadpre* in the
1263 GCC testsuite.  There are many pre testcases as ssa-pre-*.c
1264
1265 //===---------------------------------------------------------------------===//
1266
1267 There are some interesting cases in testsuite/gcc.dg/tree-ssa/pred-comm* in the
1268 GCC testsuite.  For example, predcom-1.c is:
1269
1270  for (i = 2; i < 1000; i++)
1271     fib[i] = (fib[i-1] + fib[i - 2]) & 0xffff;
1272
1273 which compiles into:
1274
1275 bb1:            ; preds = %bb1, %bb1.thread
1276         %indvar = phi i32 [ 0, %bb1.thread ], [ %0, %bb1 ]
1277         %i.0.reg2mem.0 = add i32 %indvar, 2
1278         %0 = add i32 %indvar, 1         ; <i32> [#uses=3]
1279         %1 = getelementptr [1000 x i32]* @fib, i32 0, i32 %0
1280         %2 = load i32* %1, align 4              ; <i32> [#uses=1]
1281         %3 = getelementptr [1000 x i32]* @fib, i32 0, i32 %indvar
1282         %4 = load i32* %3, align 4              ; <i32> [#uses=1]
1283         %5 = add i32 %4, %2             ; <i32> [#uses=1]
1284         %6 = and i32 %5, 65535          ; <i32> [#uses=1]
1285         %7 = getelementptr [1000 x i32]* @fib, i32 0, i32 %i.0.reg2mem.0
1286         store i32 %6, i32* %7, align 4
1287         %exitcond = icmp eq i32 %0, 998         ; <i1> [#uses=1]
1288         br i1 %exitcond, label %return, label %bb1
1289
1290 This is basically:
1291   LOAD fib[i+1]
1292   LOAD fib[i]
1293   STORE fib[i+2]
1294
1295 instead of handling this as a loop or other xform, all we'd need to do is teach
1296 load PRE to phi translate the %0 add (i+1) into the predecessor as (i'+1+1) =
1297 (i'+2) (where i' is the previous iteration of i).  This would find the store
1298 which feeds it.
1299
1300 predcom-2.c is apparently the same as predcom-1.c
1301 predcom-3.c is very similar but needs loads feeding each other instead of
1302 store->load.
1303 predcom-4.c seems the same as the rest.
1304
1305
1306 //===---------------------------------------------------------------------===//
1307
1308 Other simple load PRE cases:
1309 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=35287 [LPRE crit edge splitting]
1310
1311 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=34677 (licm does this, LPRE crit edge)
1312   llvm-gcc t2.c -S -o - -O0 -emit-llvm | llvm-as | opt -mem2reg -simplifycfg -gvn | llvm-dis
1313
1314 //===---------------------------------------------------------------------===//
1315
1316 Type based alias analysis:
1317 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=14705
1318
1319 //===---------------------------------------------------------------------===//
1320
1321 When GVN/PRE finds a store of float* to a must aliases pointer when expecting
1322 an int*, it should turn it into a bitcast.  This is a nice generalization of
1323 the SROA hack that would apply to other cases, e.g.:
1324
1325 int foo(int C, int *P, float X) {
1326   if (C) {
1327     bar();
1328     *P = 42;
1329   } else
1330     *(float*)P = X;
1331
1332    return *P;
1333 }
1334
1335
1336 One example (that requires crazy phi translation) is:
1337 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=16799 [BITCAST PHI TRANS]
1338
1339 //===---------------------------------------------------------------------===//
1340
1341 A/B get pinned to the stack because we turn an if/then into a select instead
1342 of PRE'ing the load/store.  This may be fixable in instcombine:
1343 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=37892
1344
1345
1346
1347 Interesting missed case because of control flow flattening (should be 2 loads):
1348 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=26629
1349 With: llvm-gcc t2.c -S -o - -O0 -emit-llvm | llvm-as |
1350              opt -mem2reg -gvn -instcombine | llvm-dis
1351 we miss it because we need 1) GEP PHI TRAN, 2) CRIT EDGE 3) MULTIPLE DIFFERENT
1352 VALS PRODUCED BY ONE BLOCK OVER DIFFERENT PATHS
1353
1354 //===---------------------------------------------------------------------===//
1355
1356 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=19633
1357 We could eliminate the branch condition here, loading from null is undefined:
1358
1359 struct S { int w, x, y, z; };
1360 struct T { int r; struct S s; };
1361 void bar (struct S, int);
1362 void foo (int a, struct T b)
1363 {
1364   struct S *c = 0;
1365   if (a)
1366     c = &b.s;
1367   bar (*c, a);
1368 }
1369
1370 //===---------------------------------------------------------------------===//
1371
1372 simplifylibcalls should do several optimizations for strspn/strcspn:
1373
1374 strcspn(x, "") -> strlen(x)
1375 strcspn("", x) -> 0
1376 strspn("", x) -> 0
1377 strspn(x, "") -> strlen(x)
1378 strspn(x, "a") -> strchr(x, 'a')-x
1379
1380 strcspn(x, "a") -> inlined loop for up to 3 letters (similarly for strspn):
1381
1382 size_t __strcspn_c3 (__const char *__s, int __reject1, int __reject2,
1383                      int __reject3) {
1384   register size_t __result = 0;
1385   while (__s[__result] != '\0' && __s[__result] != __reject1 &&
1386          __s[__result] != __reject2 && __s[__result] != __reject3)
1387     ++__result;
1388   return __result;
1389 }
1390
1391 This should turn into a switch on the character.  See PR3253 for some notes on
1392 codegen.
1393
1394 456.hmmer apparently uses strcspn and strspn a lot.  471.omnetpp uses strspn.
1395
1396 //===---------------------------------------------------------------------===//
1397
1398 "gas" uses this idiom:
1399   else if (strchr ("+-/*%|&^:[]()~", *intel_parser.op_string))
1400 ..
1401   else if (strchr ("<>", *intel_parser.op_string)
1402
1403 Those should be turned into a switch.
1404
1405 //===---------------------------------------------------------------------===//
1406
1407 252.eon contains this interesting code:
1408
1409         %3072 = getelementptr [100 x i8]* %tempString, i32 0, i32 0
1410         %3073 = call i8* @strcpy(i8* %3072, i8* %3071) nounwind
1411         %strlen = call i32 @strlen(i8* %3072)    ; uses = 1
1412         %endptr = getelementptr [100 x i8]* %tempString, i32 0, i32 %strlen
1413         call void @llvm.memcpy.i32(i8* %endptr,
1414           i8* getelementptr ([5 x i8]* @"\01LC42", i32 0, i32 0), i32 5, i32 1)
1415         %3074 = call i32 @strlen(i8* %endptr) nounwind readonly
1416
1417 This is interesting for a couple reasons.  First, in this:
1418
1419         %3073 = call i8* @strcpy(i8* %3072, i8* %3071) nounwind
1420         %strlen = call i32 @strlen(i8* %3072)
1421
1422 The strlen could be replaced with: %strlen = sub %3072, %3073, because the
1423 strcpy call returns a pointer to the end of the string.  Based on that, the
1424 endptr GEP just becomes equal to 3073, which eliminates a strlen call and GEP.
1425
1426 Second, the memcpy+strlen strlen can be replaced with:
1427
1428         %3074 = call i32 @strlen([5 x i8]* @"\01LC42") nounwind readonly
1429
1430 Because the destination was just copied into the specified memory buffer.  This,
1431 in turn, can be constant folded to "4".
1432
1433 In other code, it contains:
1434
1435         %endptr6978 = bitcast i8* %endptr69 to i32*
1436         store i32 7107374, i32* %endptr6978, align 1
1437         %3167 = call i32 @strlen(i8* %endptr69) nounwind readonly
1438
1439 Which could also be constant folded.  Whatever is producing this should probably
1440 be fixed to leave this as a memcpy from a string.
1441
1442 Further, eon also has an interesting partially redundant strlen call:
1443
1444 bb8:            ; preds = %_ZN18eonImageCalculatorC1Ev.exit
1445         %682 = getelementptr i8** %argv, i32 6          ; <i8**> [#uses=2]
1446         %683 = load i8** %682, align 4          ; <i8*> [#uses=4]
1447         %684 = load i8* %683, align 1           ; <i8> [#uses=1]
1448         %685 = icmp eq i8 %684, 0               ; <i1> [#uses=1]
1449         br i1 %685, label %bb10, label %bb9
1450
1451 bb9:            ; preds = %bb8
1452         %686 = call i32 @strlen(i8* %683) nounwind readonly
1453         %687 = icmp ugt i32 %686, 254           ; <i1> [#uses=1]
1454         br i1 %687, label %bb10, label %bb11
1455
1456 bb10:           ; preds = %bb9, %bb8
1457         %688 = call i32 @strlen(i8* %683) nounwind readonly
1458
1459 This could be eliminated by doing the strlen once in bb8, saving code size and
1460 improving perf on the bb8->9->10 path.
1461
1462 //===---------------------------------------------------------------------===//
1463
1464 I see an interesting fully redundant call to strlen left in 186.crafty:InputMove
1465 which looks like:
1466        %movetext11 = getelementptr [128 x i8]* %movetext, i32 0, i32 0
1467
1468
1469 bb62:           ; preds = %bb55, %bb53
1470         %promote.0 = phi i32 [ %169, %bb55 ], [ 0, %bb53 ]
1471         %171 = call i32 @strlen(i8* %movetext11) nounwind readonly align 1
1472         %172 = add i32 %171, -1         ; <i32> [#uses=1]
1473         %173 = getelementptr [128 x i8]* %movetext, i32 0, i32 %172
1474
1475 ...  no stores ...
1476        br i1 %or.cond, label %bb65, label %bb72
1477
1478 bb65:           ; preds = %bb62
1479         store i8 0, i8* %173, align 1
1480         br label %bb72
1481
1482 bb72:           ; preds = %bb65, %bb62
1483         %trank.1 = phi i32 [ %176, %bb65 ], [ -1, %bb62 ]
1484         %177 = call i32 @strlen(i8* %movetext11) nounwind readonly align 1
1485
1486 Note that on the bb62->bb72 path, that the %177 strlen call is partially
1487 redundant with the %171 call.  At worst, we could shove the %177 strlen call
1488 up into the bb65 block moving it out of the bb62->bb72 path.   However, note
1489 that bb65 stores to the string, zeroing out the last byte.  This means that on
1490 that path the value of %177 is actually just %171-1.  A sub is cheaper than a
1491 strlen!
1492
1493 This pattern repeats several times, basically doing:
1494
1495   A = strlen(P);
1496   P[A-1] = 0;
1497   B = strlen(P);
1498   where it is "obvious" that B = A-1.
1499
1500 //===---------------------------------------------------------------------===//
1501
1502 186.crafty contains this interesting pattern:
1503
1504 %77 = call i8* @strstr(i8* getelementptr ([6 x i8]* @"\01LC5", i32 0, i32 0),
1505                        i8* %30)
1506 %phitmp648 = icmp eq i8* %77, getelementptr ([6 x i8]* @"\01LC5", i32 0, i32 0)
1507 br i1 %phitmp648, label %bb70, label %bb76
1508
1509 bb70:           ; preds = %OptionMatch.exit91, %bb69
1510         %78 = call i32 @strlen(i8* %30) nounwind readonly align 1               ; <i32> [#uses=1]
1511
1512 This is basically:
1513   cststr = "abcdef";
1514   if (strstr(cststr, P) == cststr) {
1515      x = strlen(P);
1516      ...
1517
1518 The strstr call would be significantly cheaper written as:
1519
1520 cststr = "abcdef";
1521 if (memcmp(P, str, strlen(P)))
1522   x = strlen(P);
1523
1524 This is memcmp+strlen instead of strstr.  This also makes the strlen fully
1525 redundant.
1526
1527 //===---------------------------------------------------------------------===//
1528
1529 186.crafty also contains this code:
1530
1531 %1906 = call i32 @strlen(i8* getelementptr ([32 x i8]* @pgn_event, i32 0,i32 0))
1532 %1907 = getelementptr [32 x i8]* @pgn_event, i32 0, i32 %1906
1533 %1908 = call i8* @strcpy(i8* %1907, i8* %1905) nounwind align 1
1534 %1909 = call i32 @strlen(i8* getelementptr ([32 x i8]* @pgn_event, i32 0,i32 0))
1535 %1910 = getelementptr [32 x i8]* @pgn_event, i32 0, i32 %1909
1536
1537 The last strlen is computable as 1908-@pgn_event, which means 1910=1908.
1538
1539 //===---------------------------------------------------------------------===//
1540
1541 186.crafty has this interesting pattern with the "out.4543" variable:
1542
1543 call void @llvm.memcpy.i32(
1544         i8* getelementptr ([10 x i8]* @out.4543, i32 0, i32 0),
1545        i8* getelementptr ([7 x i8]* @"\01LC28700", i32 0, i32 0), i32 7, i32 1)
1546 %101 = call@printf(i8* ...   @out.4543, i32 0, i32 0)) nounwind
1547
1548 It is basically doing:
1549
1550   memcpy(globalarray, "string");
1551   printf(...,  globalarray);
1552
1553 Anyway, by knowing that printf just reads the memory and forward substituting
1554 the string directly into the printf, this eliminates reads from globalarray.
1555 Since this pattern occurs frequently in crafty (due to the "DisplayTime" and
1556 other similar functions) there are many stores to "out".  Once all the printfs
1557 stop using "out", all that is left is the memcpy's into it.  This should allow
1558 globalopt to remove the "stored only" global.
1559
1560 //===---------------------------------------------------------------------===//
1561
1562 This code:
1563
1564 define inreg i32 @foo(i8* inreg %p) nounwind {
1565   %tmp0 = load i8* %p
1566   %tmp1 = ashr i8 %tmp0, 5
1567   %tmp2 = sext i8 %tmp1 to i32
1568   ret i32 %tmp2
1569 }
1570
1571 could be dagcombine'd to a sign-extending load with a shift.
1572 For example, on x86 this currently gets this:
1573
1574         movb    (%eax), %al
1575         sarb    $5, %al
1576         movsbl  %al, %eax
1577
1578 while it could get this:
1579
1580         movsbl  (%eax), %eax
1581         sarl    $5, %eax
1582
1583 //===---------------------------------------------------------------------===//
1584
1585 GCC PR31029:
1586
1587 int test(int x) { return 1-x == x; }     // --> return false
1588 int test2(int x) { return 2-x == x; }    // --> return x == 1 ?
1589
1590 Always foldable for odd constants, what is the rule for even?
1591
1592 //===---------------------------------------------------------------------===//
1593
1594 PR 3381: GEP to field of size 0 inside a struct could be turned into GEP
1595 for next field in struct (which is at same address).
1596
1597 For example: store of float into { {{}}, float } could be turned into a store to
1598 the float directly.
1599
1600 //===---------------------------------------------------------------------===//
1601
1602 #include <math.h>
1603 double foo(double a) {    return sin(a); }
1604
1605 This compiles into this on x86-64 Linux:
1606 foo:
1607         subq    $8, %rsp
1608         call    sin
1609         addq    $8, %rsp
1610         ret
1611 vs:
1612
1613 foo:
1614         jmp sin
1615
1616 //===---------------------------------------------------------------------===//
1617
1618 The arg promotion pass should make use of nocapture to make its alias analysis
1619 stuff much more precise.
1620
1621 //===---------------------------------------------------------------------===//
1622
1623 The following functions should be optimized to use a select instead of a
1624 branch (from gcc PR40072):
1625
1626 char char_int(int m) {if(m>7) return 0; return m;}
1627 int int_char(char m) {if(m>7) return 0; return m;}
1628
1629 //===---------------------------------------------------------------------===//
1630
1631 Instcombine should replace the load with a constant in:
1632
1633   static const char x[4] = {'a', 'b', 'c', 'd'};
1634
1635   unsigned int y(void) {
1636     return *(unsigned int *)x;
1637   }
1638
1639 It currently only does this transformation when the size of the constant
1640 is the same as the size of the integer (so, try x[5]) and the last byte
1641 is a null (making it a C string). There's no need for these restrictions.
1642
1643 //===---------------------------------------------------------------------===//
1644
1645 InstCombine's "turn load from constant into constant" optimization should be
1646 more aggressive in the presence of bitcasts.  For example, because of unions,
1647 this code:
1648
1649 union vec2d {
1650     double e[2];
1651     double v __attribute__((vector_size(16)));
1652 };
1653 typedef union vec2d vec2d;
1654
1655 static vec2d a={{1,2}}, b={{3,4}};
1656
1657 vec2d foo () {
1658     return (vec2d){ .v = a.v + b.v * (vec2d){{5,5}}.v };
1659 }
1660
1661 Compiles into:
1662
1663 @a = internal constant %0 { [2 x double]
1664            [double 1.000000e+00, double 2.000000e+00] }, align 16
1665 @b = internal constant %0 { [2 x double]
1666            [double 3.000000e+00, double 4.000000e+00] }, align 16
1667 ...
1668 define void @foo(%struct.vec2d* noalias nocapture sret %agg.result) nounwind {
1669 entry:
1670         %0 = load <2 x double>* getelementptr (%struct.vec2d*
1671            bitcast (%0* @a to %struct.vec2d*), i32 0, i32 0), align 16
1672         %1 = load <2 x double>* getelementptr (%struct.vec2d*
1673            bitcast (%0* @b to %struct.vec2d*), i32 0, i32 0), align 16
1674
1675
1676 Instcombine should be able to optimize away the loads (and thus the globals).
1677
1678
1679 //===---------------------------------------------------------------------===//