lib/Target/README.txt

   1 Target Independent Opportunities:
   2
   3 //===---------------------------------------------------------------------===//
   4
   5 With the recent changes to make the implicit def/use set explicit in
   6 machineinstrs, we should change the target descriptions for 'call' instructions
   7 so that the .td files don't list all the call-clobbered registers as implicit
   8 defs.  Instead, these should be added by the code generator (e.g. on the dag).
   9
  10 This has a number of uses:
  11
  12 1. PPC32/64 and X86 32/64 can avoid having multiple copies of call instructions
  13    for their different impdef sets.
  14 2. Targets with multiple calling convs (e.g. x86) which have different clobber
  15    sets don't need copies of call instructions.
  16 3. 'Interprocedural register allocation' can be done to reduce the clobber sets
  17    of calls.
  18
  19 //===---------------------------------------------------------------------===//
  20
  21 Make the PPC branch selector target independant
  22
  23 //===---------------------------------------------------------------------===//
  24
  25 Get the C front-end to expand hypot(x,y) -> llvm.sqrt(x*x+y*y) when errno and
  26 precision don't matter (ffastmath).  Misc/mandel will like this. :)  This isn't
  27 safe in general, even on darwin.  See the libm implementation of hypot for
  28 examples (which special case when x/y are exactly zero to get signed zeros etc
  29 right).
  30
  31 //===---------------------------------------------------------------------===//
  32
  33 Solve this DAG isel folding deficiency:
  34
  35 int X, Y;
  36
  37 void fn1(void)
  38 {
  39   X = X | (Y << 3);
  40 }
  41
  42 compiles to
  43
  44 fn1:
  45         movl Y, %eax
  46         shll $3, %eax
  47         orl X, %eax
  48         movl %eax, X
  49         ret
  50
  51 The problem is the store's chain operand is not the load X but rather
  52 a TokenFactor of the load X and load Y, which prevents the folding.
  53
  54 There are two ways to fix this:
  55
  56 1. The dag combiner can start using alias analysis to realize that y/x
  57    don't alias, making the store to X not dependent on the load from Y.
  58 2. The generated isel could be made smarter in the case it can't
  59    disambiguate the pointers.
  60
  61 Number 1 is the preferred solution.
  62
  63 This has been "fixed" by a TableGen hack. But that is a short term workaround
  64 which will be removed once the proper fix is made.
  65
  66 //===---------------------------------------------------------------------===//
  67
  68 On targets with expensive 64-bit multiply, we could LSR this:
  69
  70 for (i = ...; ++i) {
  71    x = 1ULL << i;
  72
  73 into:
  74  long long tmp = 1;
  75  for (i = ...; ++i, tmp+=tmp)
  76    x = tmp;
  77
  78 This would be a win on ppc32, but not x86 or ppc64.
  79
  80 //===---------------------------------------------------------------------===//
  81
  82 Shrink: (setlt (loadi32 P), 0) -> (setlt (loadi8 Phi), 0)
  83
  84 //===---------------------------------------------------------------------===//
  85
  86 Reassociate should turn: X*X*X*X -> t=(X*X) (t*t) to eliminate a multiply.
  87
  88 //===---------------------------------------------------------------------===//
  89
  90 Interesting? testcase for add/shift/mul reassoc:
  91
  92 int bar(int x, int y) {
  93   return x*x*x+y+x*x*x*x*x*y*y*y*y;
  94 }
  95 int foo(int z, int n) {
  96   return bar(z, n) + bar(2*z, 2*n);
  97 }
  98
  99 Reassociate should handle the example in GCC PR16157.
 100
 101 //===---------------------------------------------------------------------===//
 102
 103 These two functions should generate the same code on big-endian systems:
 104
 105 int g(int *j,int *l)  {  return memcmp(j,l,4);  }
 106 int h(int *j, int *l) {  return *j - *l; }
 107
 108 this could be done in SelectionDAGISel.cpp, along with other special cases,
 109 for 1,2,4,8 bytes.
 110
 111 //===---------------------------------------------------------------------===//
 112
 113 It would be nice to revert this patch:
 114 http://lists.cs.uiuc.edu/pipermail/llvm-commits/Week-of-Mon-20060213/031986.html
 115
 116 And teach the dag combiner enough to simplify the code expanded before
 117 legalize.  It seems plausible that this knowledge would let it simplify other
 118 stuff too.
 119
 120 //===---------------------------------------------------------------------===//
 121
 122 For vector types, TargetData.cpp::getTypeInfo() returns alignment that is equal
 123 to the type size. It works but can be overly conservative as the alignment of
 124 specific vector types are target dependent.
 125
 126 //===---------------------------------------------------------------------===//
 127
 128 We should produce an unaligned load from code like this:
 129
 130 v4sf example(float *P) {
 131   return (v4sf){P[0], P[1], P[2], P[3] };
 132 }
 133
 134 //===---------------------------------------------------------------------===//
 135
 136 Add support for conditional increments, and other related patterns.  Instead
 137 of:
 138
 139         movl 136(%esp), %eax
 140         cmpl $0, %eax
 141         je LBB16_2      #cond_next
 142 LBB16_1:        #cond_true
 143         incl _foo
 144 LBB16_2:        #cond_next
 145
 146 emit:
 147         movl    _foo, %eax
 148         cmpl    $1, %edi
 149         sbbl    $-1, %eax
 150         movl    %eax, _foo
 151
 152 //===---------------------------------------------------------------------===//
 153
 154 Combine: a = sin(x), b = cos(x) into a,b = sincos(x).
 155
 156 Expand these to calls of sin/cos and stores:
 157       double sincos(double x, double *sin, double *cos);
 158       float sincosf(float x, float *sin, float *cos);
 159       long double sincosl(long double x, long double *sin, long double *cos);
 160
 161 Doing so could allow SROA of the destination pointers.  See also:
 162 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=17687
 163
 164 This is now easily doable with MRVs.  We could even make an intrinsic for this
 165 if anyone cared enough about sincos.
 166
 167 //===---------------------------------------------------------------------===//
 168
 169 Turn this into a single byte store with no load (the other 3 bytes are
 170 unmodified):
 171
 172 define void @test(i32* %P) {
 173         %tmp = load i32* %P
 174         %tmp14 = or i32 %tmp, 3305111552
 175         %tmp15 = and i32 %tmp14, 3321888767
 176         store i32 %tmp15, i32* %P
 177         ret void
 178 }
 179
 180 //===---------------------------------------------------------------------===//
 181
 182 dag/inst combine "clz(x)>>5 -> x==0" for 32-bit x.
 183
 184 Compile:
 185
 186 int bar(int x)
 187 {
 188   int t = __builtin_clz(x);
 189   return -(t>>5);
 190 }
 191
 192 to:
 193
 194 _bar:   addic r3,r3,-1
 195         subfe r3,r3,r3
 196         blr
 197
 198 //===---------------------------------------------------------------------===//
 199
 200 quantum_sigma_x in 462.libquantum contains the following loop:
 201
 202       for(i=0; i<reg->size; i++)
 203         {
 204           /* Flip the target bit of each basis state */
 205           reg->node[i].state ^= ((MAX_UNSIGNED) 1 << target);
 206         }
 207
 208 Where MAX_UNSIGNED/state is a 64-bit int.  On a 32-bit platform it would be just
 209 so cool to turn it into something like:
 210
 211    long long Res = ((MAX_UNSIGNED) 1 << target);
 212    if (target < 32) {
 213      for(i=0; i<reg->size; i++)
 214        reg->node[i].state ^= Res & 0xFFFFFFFFULL;
 215    } else {
 216      for(i=0; i<reg->size; i++)
 217        reg->node[i].state ^= Res & 0xFFFFFFFF00000000ULL
 218    }
 219
 220 ... which would only do one 32-bit XOR per loop iteration instead of two.
 221
 222 It would also be nice to recognize the reg->size doesn't alias reg->node[i], but
 223 alas.
 224
 225 //===---------------------------------------------------------------------===//
 226
 227 This should be optimized to one 'and' and one 'or', from PR4216:
 228
 229 define i32 @test_bitfield(i32 %bf.prev.low) nounwind ssp {
 230 entry:
 231   %bf.prev.lo.cleared10 = or i32 %bf.prev.low, 32962 ; <i32> [#uses=1]
 232   %0 = and i32 %bf.prev.low, -65536               ; <i32> [#uses=1]
 233   %1 = and i32 %bf.prev.lo.cleared10, 40186       ; <i32> [#uses=1]
 234   %2 = or i32 %1, %0                              ; <i32> [#uses=1]
 235   ret i32 %2
 236 }
 237
 238 //===---------------------------------------------------------------------===//
 239
 240 This isn't recognized as bswap by instcombine (yes, it really is bswap):
 241
 242 unsigned long reverse(unsigned v) {
 243     unsigned t;
 244     t = v ^ ((v << 16) | (v >> 16));
 245     t &= ~0xff0000;
 246     v = (v << 24) | (v >> 8);
 247     return v ^ (t >> 8);
 248 }
 249
 250 //===---------------------------------------------------------------------===//
 251
 252 These idioms should be recognized as popcount (see PR1488):
 253
 254 unsigned countbits_slow(unsigned v) {
 255   unsigned c;
 256   for (c = 0; v; v >>= 1)
 257     c += v & 1;
 258   return c;
 259 }
 260 unsigned countbits_fast(unsigned v){
 261   unsigned c;
 262   for (c = 0; v; c++)
 263     v &= v - 1; // clear the least significant bit set
 264   return c;
 265 }
 266
 267 BITBOARD = unsigned long long
 268 int PopCnt(register BITBOARD a) {
 269   register int c=0;
 270   while(a) {
 271     c++;
 272     a &= a - 1;
 273   }
 274   return c;
 275 }
 276 unsigned int popcount(unsigned int input) {
 277   unsigned int count = 0;
 278   for (unsigned int i =  0; i < 4 * 8; i++)
 279     count += (input >> i) & i;
 280   return count;
 281 }
 282
 283 //===---------------------------------------------------------------------===//
 284
 285 These should turn into single 16-bit (unaligned?) loads on little/big endian
 286 processors.
 287
 288 unsigned short read_16_le(const unsigned char *adr) {
 289   return adr[0] | (adr[1] << 8);
 290 }
 291 unsigned short read_16_be(const unsigned char *adr) {
 292   return (adr[0] << 8) | adr[1];
 293 }
 294
 295 //===---------------------------------------------------------------------===//
 296
 297 -instcombine should handle this transform:
 298    icmp pred (sdiv X / C1 ), C2
 299 when X, C1, and C2 are unsigned.  Similarly for udiv and signed operands.
 300
 301 Currently InstCombine avoids this transform but will do it when the signs of
 302 the operands and the sign of the divide match. See the FIXME in
 303 InstructionCombining.cpp in the visitSetCondInst method after the switch case
 304 for Instruction::UDiv (around line 4447) for more details.
 305
 306 The SingleSource/Benchmarks/Shootout-C++/hash and hash2 tests have examples of
 307 this construct.
 308
 309 //===---------------------------------------------------------------------===//
 310
 311 viterbi speeds up *significantly* if the various "history" related copy loops
 312 are turned into memcpy calls at the source level.  We need a "loops to memcpy"
 313 pass.
 314
 315 //===---------------------------------------------------------------------===//
 316
 317 Consider:
 318
 319 typedef unsigned U32;
 320 typedef unsigned long long U64;
 321 int test (U32 *inst, U64 *regs) {
 322     U64 effective_addr2;
 323     U32 temp = *inst;
 324     int r1 = (temp >> 20) & 0xf;
 325     int b2 = (temp >> 16) & 0xf;
 326     effective_addr2 = temp & 0xfff;
 327     if (b2) effective_addr2 += regs[b2];
 328     b2 = (temp >> 12) & 0xf;
 329     if (b2) effective_addr2 += regs[b2];
 330     effective_addr2 &= regs[4];
 331      if ((effective_addr2 & 3) == 0)
 332         return 1;
 333     return 0;
 334 }
 335
 336 Note that only the low 2 bits of effective_addr2 are used.  On 32-bit systems,
 337 we don't eliminate the computation of the top half of effective_addr2 because
 338 we don't have whole-function selection dags.  On x86, this means we use one
 339 extra register for the function when effective_addr2 is declared as U64 than
 340 when it is declared U32.
 341
 342 PHI Slicing could be extended to do this.
 343
 344 //===---------------------------------------------------------------------===//
 345
 346 LSR should know what GPR types a target has.  This code:
 347
 348 volatile short X, Y; // globals
 349
 350 void foo(int N) {
 351   int i;
 352   for (i = 0; i < N; i++) { X = i; Y = i*4; }
 353 }
 354
 355 produces two near identical IV's (after promotion) on PPC/ARM:
 356
 357 LBB1_2:
 358         ldr r3, LCPI1_0
 359         ldr r3, [r3]
 360         strh r2, [r3]
 361         ldr r3, LCPI1_1
 362         ldr r3, [r3]
 363         strh r1, [r3]
 364         add r1, r1, #4
 365         add r2, r2, #1   <- [0,+,1]
 366         sub r0, r0, #1   <- [0,-,1]
 367         cmp r0, #0
 368         bne LBB1_2
 369
 370 LSR should reuse the "+" IV for the exit test.
 371
 372
 373 //===---------------------------------------------------------------------===//
 374
 375 Tail call elim should be more aggressive, checking to see if the call is
 376 followed by an uncond branch to an exit block.
 377
 378 ; This testcase is due to tail-duplication not wanting to copy the return
 379 ; instruction into the terminating blocks because there was other code
 380 ; optimized out of the function after the taildup happened.
 381 ; RUN: llvm-as < %s | opt -tailcallelim | llvm-dis | not grep call
 382
 383 define i32 @t4(i32 %a) {
 384 entry:
 385         %tmp.1 = and i32 %a, 1          ; <i32> [#uses=1]
 386         %tmp.2 = icmp ne i32 %tmp.1, 0          ; <i1> [#uses=1]
 387         br i1 %tmp.2, label %then.0, label %else.0
 388
 389 then.0:         ; preds = %entry
 390         %tmp.5 = add i32 %a, -1         ; <i32> [#uses=1]
 391         %tmp.3 = call i32 @t4( i32 %tmp.5 )             ; <i32> [#uses=1]
 392         br label %return
 393
 394 else.0:         ; preds = %entry
 395         %tmp.7 = icmp ne i32 %a, 0              ; <i1> [#uses=1]
 396         br i1 %tmp.7, label %then.1, label %return
 397
 398 then.1:         ; preds = %else.0
 399         %tmp.11 = add i32 %a, -2                ; <i32> [#uses=1]
 400         %tmp.9 = call i32 @t4( i32 %tmp.11 )            ; <i32> [#uses=1]
 401         br label %return
 402
 403 return:         ; preds = %then.1, %else.0, %then.0
 404         %result.0 = phi i32 [ 0, %else.0 ], [ %tmp.3, %then.0 ],
 405                             [ %tmp.9, %then.1 ]
 406         ret i32 %result.0
 407 }
 408
 409 //===---------------------------------------------------------------------===//
 410
 411 Tail recursion elimination should handle:
 412
 413 int pow2m1(int n) {
 414  if (n == 0)
 415    return 0;
 416  return 2 * pow2m1 (n - 1) + 1;
 417 }
 418
 419 Also, multiplies can be turned into SHL's, so they should be handled as if
 420 they were associative.  "return foo() << 1" can be tail recursion eliminated.
 421
 422 //===---------------------------------------------------------------------===//
 423
 424 Argument promotion should promote arguments for recursive functions, like
 425 this:
 426
 427 ; RUN: llvm-as < %s | opt -argpromotion | llvm-dis | grep x.val
 428
 429 define internal i32 @foo(i32* %x) {
 430 entry:
 431         %tmp = load i32* %x             ; <i32> [#uses=0]
 432         %tmp.foo = call i32 @foo( i32* %x )             ; <i32> [#uses=1]
 433         ret i32 %tmp.foo
 434 }
 435
 436 define i32 @bar(i32* %x) {
 437 entry:
 438         %tmp3 = call i32 @foo( i32* %x )                ; <i32> [#uses=1]
 439         ret i32 %tmp3
 440 }
 441
 442 //===---------------------------------------------------------------------===//
 443
 444 "basicaa" should know how to look through "or" instructions that act like add
 445 instructions.  For example in this code, the x*4+1 is turned into x*4 | 1, and
 446 basicaa can't analyze the array subscript, leading to duplicated loads in the
 447 generated code:
 448
 449 void test(int X, int Y, int a[]) {
 450 int i;
 451   for (i=2; i<1000; i+=4) {
 452   a[i+0] = a[i-1+0]*a[i-2+0];
 453   a[i+1] = a[i-1+1]*a[i-2+1];
 454   a[i+2] = a[i-1+2]*a[i-2+2];
 455   a[i+3] = a[i-1+3]*a[i-2+3];
 456   }
 457 }
 458
 459 BasicAA also doesn't do this for add.  It needs to know that &A[i+1] != &A[i].
 460
 461 //===---------------------------------------------------------------------===//
 462
 463 We should investigate an instruction sinking pass.  Consider this silly
 464 example in pic mode:
 465
 466 #include <assert.h>
 467 void foo(int x) {
 468   assert(x);
 469   //...
 470 }
 471
 472 we compile this to:
 473 _foo:
 474         subl    $28, %esp
 475         call    "L1$pb"
 476 "L1$pb":
 477         popl    %eax
 478         cmpl    $0, 32(%esp)
 479         je      LBB1_2  # cond_true
 480 LBB1_1: # return
 481         # ...
 482         addl    $28, %esp
 483         ret
 484 LBB1_2: # cond_true
 485 ...
 486
 487 The PIC base computation (call+popl) is only used on one path through the
 488 code, but is currently always computed in the entry block.  It would be
 489 better to sink the picbase computation down into the block for the
 490 assertion, as it is the only one that uses it.  This happens for a lot of
 491 code with early outs.
 492
 493 Another example is loads of arguments, which are usually emitted into the
 494 entry block on targets like x86.  If not used in all paths through a
 495 function, they should be sunk into the ones that do.
 496
 497 In this case, whole-function-isel would also handle this.
 498
 499 //===---------------------------------------------------------------------===//
 500
 501 Investigate lowering of sparse switch statements into perfect hash tables:
 502 http://burtleburtle.net/bob/hash/perfect.html
 503
 504 //===---------------------------------------------------------------------===//
 505
 506 We should turn things like "load+fabs+store" and "load+fneg+store" into the
 507 corresponding integer operations.  On a yonah, this loop:
 508
 509 double a[256];
 510 void foo() {
 511   int i, b;
 512   for (b = 0; b < 10000000; b++)
 513   for (i = 0; i < 256; i++)
 514     a[i] = -a[i];
 515 }
 516
 517 is twice as slow as this loop:
 518
 519 long long a[256];
 520 void foo() {
 521   int i, b;
 522   for (b = 0; b < 10000000; b++)
 523   for (i = 0; i < 256; i++)
 524     a[i] ^= (1ULL << 63);
 525 }
 526
 527 and I suspect other processors are similar.  On X86 in particular this is a
 528 big win because doing this with integers allows the use of read/modify/write
 529 instructions.
 530
 531 //===---------------------------------------------------------------------===//
 532
 533 DAG Combiner should try to combine small loads into larger loads when
 534 profitable.  For example, we compile this C++ example:
 535
 536 struct THotKey { short Key; bool Control; bool Shift; bool Alt; };
 537 extern THotKey m_HotKey;
 538 THotKey GetHotKey () { return m_HotKey; }
 539
 540 into (-O3 -fno-exceptions -static -fomit-frame-pointer):
 541
 542 __Z9GetHotKeyv:
 543         pushl   %esi
 544         movl    8(%esp), %eax
 545         movb    _m_HotKey+3, %cl
 546         movb    _m_HotKey+4, %dl
 547         movb    _m_HotKey+2, %ch
 548         movw    _m_HotKey, %si
 549         movw    %si, (%eax)
 550         movb    %ch, 2(%eax)
 551         movb    %cl, 3(%eax)
 552         movb    %dl, 4(%eax)
 553         popl    %esi
 554         ret     $4
 555
 556 GCC produces:
 557
 558 __Z9GetHotKeyv:
 559         movl    _m_HotKey, %edx
 560         movl    4(%esp), %eax
 561         movl    %edx, (%eax)
 562         movzwl  _m_HotKey+4, %edx
 563         movw    %dx, 4(%eax)
 564         ret     $4
 565
 566 The LLVM IR contains the needed alignment info, so we should be able to
 567 merge the loads and stores into 4-byte loads:
 568
 569         %struct.THotKey = type { i16, i8, i8, i8 }
 570 define void @_Z9GetHotKeyv(%struct.THotKey* sret  %agg.result) nounwind  {
 571 ...
 572         %tmp2 = load i16* getelementptr (@m_HotKey, i32 0, i32 0), align 8
 573         %tmp5 = load i8* getelementptr (@m_HotKey, i32 0, i32 1), align 2
 574         %tmp8 = load i8* getelementptr (@m_HotKey, i32 0, i32 2), align 1
 575         %tmp11 = load i8* getelementptr (@m_HotKey, i32 0, i32 3), align 2
 576
 577 Alternatively, we should use a small amount of base-offset alias analysis
 578 to make it so the scheduler doesn't need to hold all the loads in regs at
 579 once.
 580
 581 //===---------------------------------------------------------------------===//
 582
 583 We should add an FRINT node to the DAG to model targets that have legal
 584 implementations of ceil/floor/rint.
 585
 586 //===---------------------------------------------------------------------===//
 587
 588 Consider:
 589
 590 int test() {
 591   long long input[8] = {1,1,1,1,1,1,1,1};
 592   foo(input);
 593 }
 594
 595 We currently compile this into a memcpy from a global array since the
 596 initializer is fairly large and not memset'able.  This is good, but the memcpy
 597 gets lowered to load/stores in the code generator.  This is also ok, except
 598 that the codegen lowering for memcpy doesn't handle the case when the source
 599 is a constant global.  This gives us atrocious code like this:
 600
 601         call    "L1$pb"
 602 "L1$pb":
 603         popl    %eax
 604         movl    _C.0.1444-"L1$pb"+32(%eax), %ecx
 605         movl    %ecx, 40(%esp)
 606         movl    _C.0.1444-"L1$pb"+20(%eax), %ecx
 607         movl    %ecx, 28(%esp)
 608         movl    _C.0.1444-"L1$pb"+36(%eax), %ecx
 609         movl    %ecx, 44(%esp)
 610         movl    _C.0.1444-"L1$pb"+44(%eax), %ecx
 611         movl    %ecx, 52(%esp)
 612         movl    _C.0.1444-"L1$pb"+40(%eax), %ecx
 613         movl    %ecx, 48(%esp)
 614         movl    _C.0.1444-"L1$pb"+12(%eax), %ecx
 615         movl    %ecx, 20(%esp)
 616         movl    _C.0.1444-"L1$pb"+4(%eax), %ecx
 617 ...
 618
 619 instead of:
 620         movl    $1, 16(%esp)
 621         movl    $0, 20(%esp)
 622         movl    $1, 24(%esp)
 623         movl    $0, 28(%esp)
 624         movl    $1, 32(%esp)
 625         movl    $0, 36(%esp)
 626         ...
 627
 628 //===---------------------------------------------------------------------===//
 629
 630 http://llvm.org/PR717:
 631
 632 The following code should compile into "ret int undef". Instead, LLVM
 633 produces "ret int 0":
 634
 635 int f() {
 636   int x = 4;
 637   int y;
 638   if (x == 3) y = 0;
 639   return y;
 640 }
 641
 642 //===---------------------------------------------------------------------===//
 643
 644 The loop unroller should partially unroll loops (instead of peeling them)
 645 when code growth isn't too bad and when an unroll count allows simplification
 646 of some code within the loop.  One trivial example is:
 647
 648 #include <stdio.h>
 649 int main() {
 650     int nRet = 17;
 651     int nLoop;
 652     for ( nLoop = 0; nLoop < 1000; nLoop++ ) {
 653         if ( nLoop & 1 )
 654             nRet += 2;
 655         else
 656             nRet -= 1;
 657     }
 658     return nRet;
 659 }
 660
 661 Unrolling by 2 would eliminate the '&1' in both copies, leading to a net
 662 reduction in code size.  The resultant code would then also be suitable for
 663 exit value computation.
 664
 665 //===---------------------------------------------------------------------===//
 666
 667 We miss a bunch of rotate opportunities on various targets, including ppc, x86,
 668 etc.  On X86, we miss a bunch of 'rotate by variable' cases because the rotate
 669 matching code in dag combine doesn't look through truncates aggressively
 670 enough.  Here are some testcases reduces from GCC PR17886:
 671
 672 unsigned long long f(unsigned long long x, int y) {
 673   return (x << y) | (x >> 64-y);
 674 }
 675 unsigned f2(unsigned x, int y){
 676   return (x << y) | (x >> 32-y);
 677 }
 678 unsigned long long f3(unsigned long long x){
 679   int y = 9;
 680   return (x << y) | (x >> 64-y);
 681 }
 682 unsigned f4(unsigned x){
 683   int y = 10;
 684   return (x << y) | (x >> 32-y);
 685 }
 686 unsigned long long f5(unsigned long long x, unsigned long long y) {
 687   return (x << 8) | ((y >> 48) & 0xffull);
 688 }
 689 unsigned long long f6(unsigned long long x, unsigned long long y, int z) {
 690   switch(z) {
 691   case 1:
 692     return (x << 8) | ((y >> 48) & 0xffull);
 693   case 2:
 694     return (x << 16) | ((y >> 40) & 0xffffull);
 695   case 3:
 696     return (x << 24) | ((y >> 32) & 0xffffffull);
 697   case 4:
 698     return (x << 32) | ((y >> 24) & 0xffffffffull);
 699   default:
 700     return (x << 40) | ((y >> 16) & 0xffffffffffull);
 701   }
 702 }
 703
 704 On X86-64, we only handle f2/f3/f4 right.  On x86-32, a few of these
 705 generate truly horrible code, instead of using shld and friends.  On
 706 ARM, we end up with calls to L___lshrdi3/L___ashldi3 in f, which is
 707 badness.  PPC64 misses f, f5 and f6.  CellSPU aborts in isel.
 708
 709 //===---------------------------------------------------------------------===//
 710
 711 We do a number of simplifications in simplify libcalls to strength reduce
 712 standard library functions, but we don't currently merge them together.  For
 713 example, it is useful to merge memcpy(a,b,strlen(b)) -> strcpy.  This can only
 714 be done safely if "b" isn't modified between the strlen and memcpy of course.
 715
 716 //===---------------------------------------------------------------------===//
 717
 718 Reassociate should turn things like:
 719
 720 int factorial(int X) {
 721  return X*X*X*X*X*X*X*X;
 722 }
 723
 724 into llvm.powi calls, allowing the code generator to produce balanced
 725 multiplication trees.
 726
 727 //===---------------------------------------------------------------------===//
 728
 729 We generate a horrible  libcall for llvm.powi.  For example, we compile:
 730
 731 #include <cmath>
 732 double f(double a) { return std::pow(a, 4); }
 733
 734 into:
 735
 736 __Z1fd:
 737         subl    $12, %esp
 738         movsd   16(%esp), %xmm0
 739         movsd   %xmm0, (%esp)
 740         movl    $4, 8(%esp)
 741         call    L___powidf2$stub
 742         addl    $12, %esp
 743         ret
 744
 745 GCC produces:
 746
 747 __Z1fd:
 748         subl    $12, %esp
 749         movsd   16(%esp), %xmm0
 750         mulsd   %xmm0, %xmm0
 751         mulsd   %xmm0, %xmm0
 752         movsd   %xmm0, (%esp)
 753         fldl    (%esp)
 754         addl    $12, %esp
 755         ret
 756
 757 //===---------------------------------------------------------------------===//
 758
 759 We compile this program: (from GCC PR11680)
 760 http://gcc.gnu.org/bugzilla/attachment.cgi?id=4487
 761
 762 Into code that runs the same speed in fast/slow modes, but both modes run 2x
 763 slower than when compile with GCC (either 4.0 or 4.2):
 764
 765 $ llvm-g++ perf.cpp -O3 -fno-exceptions
 766 $ time ./a.out fast
 767 1.821u 0.003s 0:01.82 100.0%    0+0k 0+0io 0pf+0w
 768
 769 $ g++ perf.cpp -O3 -fno-exceptions
 770 $ time ./a.out fast
 771 0.821u 0.001s 0:00.82 100.0%    0+0k 0+0io 0pf+0w
 772
 773 It looks like we are making the same inlining decisions, so this may be raw
 774 codegen badness or something else (haven't investigated).
 775
 776 //===---------------------------------------------------------------------===//
 777
 778 We miss some instcombines for stuff like this:
 779 void bar (void);
 780 void foo (unsigned int a) {
 781   /* This one is equivalent to a >= (3 << 2).  */
 782   if ((a >> 2) >= 3)
 783     bar ();
 784 }
 785
 786 A few other related ones are in GCC PR14753.
 787
 788 //===---------------------------------------------------------------------===//
 789
 790 Divisibility by constant can be simplified (according to GCC PR12849) from
 791 being a mulhi to being a mul lo (cheaper).  Testcase:
 792
 793 void bar(unsigned n) {
 794   if (n % 3 == 0)
 795     true();
 796 }
 797
 798 I think this basically amounts to a dag combine to simplify comparisons against
 799 multiply hi's into a comparison against the mullo.
 800
 801 //===---------------------------------------------------------------------===//
 802
 803 Better mod/ref analysis for scanf would allow us to eliminate the vtable and a
 804 bunch of other stuff from this example (see PR1604):
 805
 806 #include <cstdio>
 807 struct test {
 808     int val;
 809     virtual ~test() {}
 810 };
 811
 812 int main() {
 813     test t;
 814     std::scanf("%d", &t.val);
 815     std::printf("%d\n", t.val);
 816 }
 817
 818 //===---------------------------------------------------------------------===//
 819
 820 Instcombine will merge comparisons like (x >= 10) && (x < 20) by producing (x -
 821 10) u< 10, but only when the comparisons have matching sign.
 822
 823 This could be converted with a similiar technique. (PR1941)
 824
 825 define i1 @test(i8 %x) {
 826   %A = icmp uge i8 %x, 5
 827   %B = icmp slt i8 %x, 20
 828   %C = and i1 %A, %B
 829   ret i1 %C
 830 }
 831
 832 //===---------------------------------------------------------------------===//
 833
 834 These functions perform the same computation, but produce different assembly.
 835
 836 define i8 @select(i8 %x) readnone nounwind {
 837   %A = icmp ult i8 %x, 250
 838   %B = select i1 %A, i8 0, i8 1
 839   ret i8 %B
 840 }
 841
 842 define i8 @addshr(i8 %x) readnone nounwind {
 843   %A = zext i8 %x to i9
 844   %B = add i9 %A, 6       ;; 256 - 250 == 6
 845   %C = lshr i9 %B, 8
 846   %D = trunc i9 %C to i8
 847   ret i8 %D
 848 }
 849
 850 //===---------------------------------------------------------------------===//
 851
 852 From gcc bug 24696:
 853 int
 854 f (unsigned long a, unsigned long b, unsigned long c)
 855 {
 856   return ((a & (c - 1)) != 0) || ((b & (c - 1)) != 0);
 857 }
 858 int
 859 f (unsigned long a, unsigned long b, unsigned long c)
 860 {
 861   return ((a & (c - 1)) != 0) | ((b & (c - 1)) != 0);
 862 }
 863 Both should combine to ((a|b) & (c-1)) != 0.  Currently not optimized with
 864 "clang -emit-llvm-bc | opt -std-compile-opts".
 865
 866 //===---------------------------------------------------------------------===//
 867
 868 From GCC Bug 20192:
 869 #define PMD_MASK    (~((1UL << 23) - 1))
 870 void clear_pmd_range(unsigned long start, unsigned long end)
 871 {
 872    if (!(start & ~PMD_MASK) && !(end & ~PMD_MASK))
 873        f();
 874 }
 875 The expression should optimize to something like
 876 "!((start|end)&~PMD_MASK). Currently not optimized with "clang
 877 -emit-llvm-bc | opt -std-compile-opts".
 878
 879 //===---------------------------------------------------------------------===//
 880
 881 From GCC Bug 15241:
 882 unsigned int
 883 foo (unsigned int a, unsigned int b)
 884 {
 885  if (a <= 7 && b <= 7)
 886    baz ();
 887 }
 888 Should combine to "(a|b) <= 7".  Currently not optimized with "clang
 889 -emit-llvm-bc | opt -std-compile-opts".
 890
 891 //===---------------------------------------------------------------------===//
 892
 893 From GCC Bug 3756:
 894 int
 895 pn (int n)
 896 {
 897  return (n >= 0 ? 1 : -1);
 898 }
 899 Should combine to (n >> 31) | 1.  Currently not optimized with "clang
 900 -emit-llvm-bc | opt -std-compile-opts | llc".
 901
 902 //===---------------------------------------------------------------------===//
 903
 904 From GCC Bug 28685:
 905 int test(int a, int b)
 906 {
 907  int lt = a < b;
 908  int eq = a == b;
 909
 910  return (lt || eq);
 911 }
 912 Should combine to "a <= b".  Currently not optimized with "clang
 913 -emit-llvm-bc | opt -std-compile-opts | llc".
 914
 915 //===---------------------------------------------------------------------===//
 916
 917 void a(int variable)
 918 {
 919  if (variable == 4 || variable == 6)
 920    bar();
 921 }
 922 This should optimize to "if ((variable | 2) == 6)".  Currently not
 923 optimized with "clang -emit-llvm-bc | opt -std-compile-opts | llc".
 924
 925 //===---------------------------------------------------------------------===//
 926
 927 unsigned int f(unsigned int i, unsigned int n) {++i; if (i == n) ++i; return
 928 i;}
 929 unsigned int f2(unsigned int i, unsigned int n) {++i; i += i == n; return i;}
 930 These should combine to the same thing.  Currently, the first function
 931 produces better code on X86.
 932
 933 //===---------------------------------------------------------------------===//
 934
 935 From GCC Bug 15784:
 936 #define abs(x) x>0?x:-x
 937 int f(int x, int y)
 938 {
 939  return (abs(x)) >= 0;
 940 }
 941 This should optimize to x == INT_MIN. (With -fwrapv.)  Currently not
 942 optimized with "clang -emit-llvm-bc | opt -std-compile-opts".
 943
 944 //===---------------------------------------------------------------------===//
 945
 946 From GCC Bug 14753:
 947 void
 948 rotate_cst (unsigned int a)
 949 {
 950  a = (a << 10) | (a >> 22);
 951  if (a == 123)
 952    bar ();
 953 }
 954 void
 955 minus_cst (unsigned int a)
 956 {
 957  unsigned int tem;
 958
 959  tem = 20 - a;
 960  if (tem == 5)
 961    bar ();
 962 }
 963 void
 964 mask_gt (unsigned int a)
 965 {
 966  /* This is equivalent to a > 15.  */
 967  if ((a & ~7) > 8)
 968    bar ();
 969 }
 970 void
 971 rshift_gt (unsigned int a)
 972 {
 973  /* This is equivalent to a > 23.  */
 974  if ((a >> 2) > 5)
 975    bar ();
 976 }
 977 All should simplify to a single comparison.  All of these are
 978 currently not optimized with "clang -emit-llvm-bc | opt
 979 -std-compile-opts".
 980
 981 //===---------------------------------------------------------------------===//
 982
 983 From GCC Bug 32605:
 984 int c(int* x) {return (char*)x+2 == (char*)x;}
 985 Should combine to 0.  Currently not optimized with "clang
 986 -emit-llvm-bc | opt -std-compile-opts" (although llc can optimize it).
 987
 988 //===---------------------------------------------------------------------===//
 989
 990 int a(unsigned char* b) {return *b > 99;}
 991 There's an unnecessary zext in the generated code with "clang
 992 -emit-llvm-bc | opt -std-compile-opts".
 993
 994 //===---------------------------------------------------------------------===//
 995
 996 int a(unsigned b) {return ((b << 31) | (b << 30)) >> 31;}
 997 Should be combined to  "((b >> 1) | b) & 1".  Currently not optimized
 998 with "clang -emit-llvm-bc | opt -std-compile-opts".
 999
1000 //===---------------------------------------------------------------------===//
1001
1002 unsigned a(unsigned x, unsigned y) { return x | (y & 1) | (y & 2);}
1003 Should combine to "x | (y & 3)".  Currently not optimized with "clang
1004 -emit-llvm-bc | opt -std-compile-opts".
1005
1006 //===---------------------------------------------------------------------===//
1007
1008 unsigned a(unsigned a) {return ((a | 1) & 3) | (a & -4);}
1009 Should combine to "a | 1".  Currently not optimized with "clang
1010 -emit-llvm-bc | opt -std-compile-opts".
1011
1012 //===---------------------------------------------------------------------===//
1013
1014 int a(int a, int b, int c) {return (~a & c) | ((c|a) & b);}
1015 Should fold to "(~a & c) | (a & b)".  Currently not optimized with
1016 "clang -emit-llvm-bc | opt -std-compile-opts".
1017
1018 //===---------------------------------------------------------------------===//
1019
1020 int a(int a,int b) {return (~(a|b))|a;}
1021 Should fold to "a|~b".  Currently not optimized with "clang
1022 -emit-llvm-bc | opt -std-compile-opts".
1023
1024 //===---------------------------------------------------------------------===//
1025
1026 int a(int a, int b) {return (a&&b) || (a&&!b);}
1027 Should fold to "a".  Currently not optimized with "clang -emit-llvm-bc
1028 | opt -std-compile-opts".
1029
1030 //===---------------------------------------------------------------------===//
1031
1032 int a(int a, int b, int c) {return (a&&b) || (!a&&c);}
1033 Should fold to "a ? b : c", or at least something sane.  Currently not
1034 optimized with "clang -emit-llvm-bc | opt -std-compile-opts".
1035
1036 //===---------------------------------------------------------------------===//
1037
1038 int a(int a, int b, int c) {return (a&&b) || (a&&c) || (a&&b&&c);}
1039 Should fold to a && (b || c).  Currently not optimized with "clang
1040 -emit-llvm-bc | opt -std-compile-opts".
1041
1042 //===---------------------------------------------------------------------===//
1043
1044 int a(int x) {return x | ((x & 8) ^ 8);}
1045 Should combine to x | 8.  Currently not optimized with "clang
1046 -emit-llvm-bc | opt -std-compile-opts".
1047
1048 //===---------------------------------------------------------------------===//
1049
1050 int a(int x) {return x ^ ((x & 8) ^ 8);}
1051 Should also combine to x | 8.  Currently not optimized with "clang
1052 -emit-llvm-bc | opt -std-compile-opts".
1053
1054 //===---------------------------------------------------------------------===//
1055
1056 int a(int x) {return (x & 8) == 0 ? -1 : -9;}
1057 Should combine to (x | -9) ^ 8.  Currently not optimized with "clang
1058 -emit-llvm-bc | opt -std-compile-opts".
1059
1060 //===---------------------------------------------------------------------===//
1061
1062 int a(int x) {return (x & 8) == 0 ? -9 : -1;}
1063 Should combine to x | -9.  Currently not optimized with "clang
1064 -emit-llvm-bc | opt -std-compile-opts".
1065
1066 //===---------------------------------------------------------------------===//
1067
1068 int a(int x) {return ((x | -9) ^ 8) & x;}
1069 Should combine to x & -9.  Currently not optimized with "clang
1070 -emit-llvm-bc | opt -std-compile-opts".
1071
1072 //===---------------------------------------------------------------------===//
1073
1074 unsigned a(unsigned a) {return a * 0x11111111 >> 28 & 1;}
1075 Should combine to "a * 0x88888888 >> 31".  Currently not optimized
1076 with "clang -emit-llvm-bc | opt -std-compile-opts".
1077
1078 //===---------------------------------------------------------------------===//
1079
1080 unsigned a(char* x) {if ((*x & 32) == 0) return b();}
1081 There's an unnecessary zext in the generated code with "clang
1082 -emit-llvm-bc | opt -std-compile-opts".
1083
1084 //===---------------------------------------------------------------------===//
1085
1086 unsigned a(unsigned long long x) {return 40 * (x >> 1);}
1087 Should combine to "20 * (((unsigned)x) & -2)".  Currently not
1088 optimized with "clang -emit-llvm-bc | opt -std-compile-opts".
1089
1090 //===---------------------------------------------------------------------===//
1091
1092 This was noticed in the entryblock for grokdeclarator in 403.gcc:
1093
1094         %tmp = icmp eq i32 %decl_context, 4
1095         %decl_context_addr.0 = select i1 %tmp, i32 3, i32 %decl_context
1096         %tmp1 = icmp eq i32 %decl_context_addr.0, 1
1097         %decl_context_addr.1 = select i1 %tmp1, i32 0, i32 %decl_context_addr.0
1098
1099 tmp1 should be simplified to something like:
1100   (!tmp || decl_context == 1)
1101
1102 This allows recursive simplifications, tmp1 is used all over the place in
1103 the function, e.g. by:
1104
1105         %tmp23 = icmp eq i32 %decl_context_addr.1, 0            ; <i1> [#uses=1]
1106         %tmp24 = xor i1 %tmp1, true             ; <i1> [#uses=1]
1107         %or.cond8 = and i1 %tmp23, %tmp24               ; <i1> [#uses=1]
1108
1109 later.
1110
1111 //===---------------------------------------------------------------------===//
1112
1113 Store sinking: This code:
1114
1115 void f (int n, int *cond, int *res) {
1116     int i;
1117     *res = 0;
1118     for (i = 0; i < n; i++)
1119         if (*cond)
1120             *res ^= 234; /* (*) */
1121 }
1122
1123 On this function GVN hoists the fully redundant value of *res, but nothing
1124 moves the store out.  This gives us this code:
1125
1126 bb:             ; preds = %bb2, %entry
1127         %.rle = phi i32 [ 0, %entry ], [ %.rle6, %bb2 ]
1128         %i.05 = phi i32 [ 0, %entry ], [ %indvar.next, %bb2 ]
1129         %1 = load i32* %cond, align 4
1130         %2 = icmp eq i32 %1, 0
1131         br i1 %2, label %bb2, label %bb1
1132
1133 bb1:            ; preds = %bb
1134         %3 = xor i32 %.rle, 234
1135         store i32 %3, i32* %res, align 4
1136         br label %bb2
1137
1138 bb2:            ; preds = %bb, %bb1
1139         %.rle6 = phi i32 [ %3, %bb1 ], [ %.rle, %bb ]
1140         %indvar.next = add i32 %i.05, 1
1141         %exitcond = icmp eq i32 %indvar.next, %n
1142         br i1 %exitcond, label %return, label %bb
1143
1144 DSE should sink partially dead stores to get the store out of the loop.
1145
1146 Here's another partial dead case:
1147 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=12395
1148
1149 //===---------------------------------------------------------------------===//
1150
1151 Scalar PRE hoists the mul in the common block up to the else:
1152
1153 int test (int a, int b, int c, int g) {
1154   int d, e;
1155   if (a)
1156     d = b * c;
1157   else
1158     d = b - c;
1159   e = b * c + g;
1160   return d + e;
1161 }
1162
1163 It would be better to do the mul once to reduce codesize above the if.
1164 This is GCC PR38204.
1165
1166 //===---------------------------------------------------------------------===//
1167
1168 GCC PR37810 is an interesting case where we should sink load/store reload
1169 into the if block and outside the loop, so we don't reload/store it on the
1170 non-call path.
1171
1172 for () {
1173   *P += 1;
1174   if ()
1175     call();
1176   else
1177     ...
1178 ->
1179 tmp = *P
1180 for () {
1181   tmp += 1;
1182   if () {
1183     *P = tmp;
1184     call();
1185     tmp = *P;
1186   } else ...
1187 }
1188 *P = tmp;
1189
1190 We now hoist the reload after the call (Transforms/GVN/lpre-call-wrap.ll), but
1191 we don't sink the store.  We need partially dead store sinking.
1192
1193 //===---------------------------------------------------------------------===//
1194
1195 [PHI TRANSLATE GEPs]
1196
1197 GCC PR37166: Sinking of loads prevents SROA'ing the "g" struct on the stack
1198 leading to excess stack traffic. This could be handled by GVN with some crazy
1199 symbolic phi translation.  The code we get looks like (g is on the stack):
1200
1201 bb2:            ; preds = %bb1
1202 ..
1203         %9 = getelementptr %struct.f* %g, i32 0, i32 0
1204         store i32 %8, i32* %9, align  bel %bb3
1205
1206 bb3:            ; preds = %bb1, %bb2, %bb
1207         %c_addr.0 = phi %struct.f* [ %g, %bb2 ], [ %c, %bb ], [ %c, %bb1 ]
1208         %b_addr.0 = phi %struct.f* [ %b, %bb2 ], [ %g, %bb ], [ %b, %bb1 ]
1209         %10 = getelementptr %struct.f* %c_addr.0, i32 0, i32 0
1210         %11 = load i32* %10, align 4
1211
1212 %11 is fully redundant, an in BB2 it should have the value %8.
1213
1214 GCC PR33344 is a similar case.
1215
1216 //===---------------------------------------------------------------------===//
1217
1218 [PHI TRANSLATE INDEXED GEPs]  PR5313
1219
1220 Load redundancy elimination for simple loop.  This loop:
1221
1222 void append_text(const char* text,unsigned char * const  io) {
1223   while(*text)
1224     *io=*text++;
1225 }
1226
1227 Compiles to have a fully redundant load in the loop (%2):
1228
1229 define void @append_text(i8* nocapture %text, i8* nocapture %io) nounwind {
1230 entry:
1231   %0 = load i8* %text, align 1                    ; <i8> [#uses=1]
1232   %1 = icmp eq i8 %0, 0                           ; <i1> [#uses=1]
1233   br i1 %1, label %return, label %bb
1234
1235 bb:                                               ; preds = %bb, %entry
1236   %indvar = phi i32 [ 0, %entry ], [ %tmp, %bb ]  ; <i32> [#uses=2]
1237   %text_addr.04 = getelementptr i8* %text, i32 %indvar ; <i8*> [#uses=1]
1238   %2 = load i8* %text_addr.04, align 1            ; <i8> [#uses=1]
1239   store i8 %2, i8* %io, align 1
1240   %tmp = add i32 %indvar, 1                       ; <i32> [#uses=2]
1241   %scevgep = getelementptr i8* %text, i32 %tmp    ; <i8*> [#uses=1]
1242   %3 = load i8* %scevgep, align 1                 ; <i8> [#uses=1]
1243   %4 = icmp eq i8 %3, 0                           ; <i1> [#uses=1]
1244   br i1 %4, label %return, label %bb
1245
1246 return:                                           ; preds = %bb, %entry
1247   ret void
1248 }
1249
1250 //===---------------------------------------------------------------------===//
1251
1252 There are many load PRE testcases in testsuite/gcc.dg/tree-ssa/loadpre* in the
1253 GCC testsuite.  There are many pre testcases as ssa-pre-*.c
1254
1255 //===---------------------------------------------------------------------===//
1256
1257 There are some interesting cases in testsuite/gcc.dg/tree-ssa/pred-comm* in the
1258 GCC testsuite.  For example, predcom-1.c is:
1259
1260  for (i = 2; i < 1000; i++)
1261     fib[i] = (fib[i-1] + fib[i - 2]) & 0xffff;
1262
1263 which compiles into:
1264
1265 bb1:            ; preds = %bb1, %bb1.thread
1266         %indvar = phi i32 [ 0, %bb1.thread ], [ %0, %bb1 ]
1267         %i.0.reg2mem.0 = add i32 %indvar, 2
1268         %0 = add i32 %indvar, 1         ; <i32> [#uses=3]
1269         %1 = getelementptr [1000 x i32]* @fib, i32 0, i32 %0
1270         %2 = load i32* %1, align 4              ; <i32> [#uses=1]
1271         %3 = getelementptr [1000 x i32]* @fib, i32 0, i32 %indvar
1272         %4 = load i32* %3, align 4              ; <i32> [#uses=1]
1273         %5 = add i32 %4, %2             ; <i32> [#uses=1]
1274         %6 = and i32 %5, 65535          ; <i32> [#uses=1]
1275         %7 = getelementptr [1000 x i32]* @fib, i32 0, i32 %i.0.reg2mem.0
1276         store i32 %6, i32* %7, align 4
1277         %exitcond = icmp eq i32 %0, 998         ; <i1> [#uses=1]
1278         br i1 %exitcond, label %return, label %bb1
1279
1280 This is basically:
1281   LOAD fib[i+1]
1282   LOAD fib[i]
1283   STORE fib[i+2]
1284
1285 instead of handling this as a loop or other xform, all we'd need to do is teach
1286 load PRE to phi translate the %0 add (i+1) into the predecessor as (i'+1+1) =
1287 (i'+2) (where i' is the previous iteration of i).  This would find the store
1288 which feeds it.
1289
1290 predcom-2.c is apparently the same as predcom-1.c
1291 predcom-3.c is very similar but needs loads feeding each other instead of
1292 store->load.
1293 predcom-4.c seems the same as the rest.
1294
1295
1296 //===---------------------------------------------------------------------===//
1297
1298 Other simple load PRE cases:
1299 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=35287 [LPRE crit edge splitting]
1300
1301 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=34677 (licm does this, LPRE crit edge)
1302   llvm-gcc t2.c -S -o - -O0 -emit-llvm | llvm-as | opt -mem2reg -simplifycfg -gvn | llvm-dis
1303
1304 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=16799 [BITCAST PHI TRANS]
1305
1306 //===---------------------------------------------------------------------===//
1307
1308 Type based alias analysis:
1309 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=14705
1310
1311 //===---------------------------------------------------------------------===//
1312
1313 A/B get pinned to the stack because we turn an if/then into a select instead
1314 of PRE'ing the load/store.  This may be fixable in instcombine:
1315 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=37892
1316
1317 struct X { int i; };
1318 int foo (int x) {
1319   struct X a;
1320   struct X b;
1321   struct X *p;
1322   a.i = 1;
1323   b.i = 2;
1324   if (x)
1325     p = &a;
1326   else
1327     p = &b;
1328   return p->i;
1329 }
1330
1331 //===---------------------------------------------------------------------===//
1332
1333 Interesting missed case because of control flow flattening (should be 2 loads):
1334 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=26629
1335 With: llvm-gcc t2.c -S -o - -O0 -emit-llvm | llvm-as |
1336              opt -mem2reg -gvn -instcombine | llvm-dis
1337 we miss it because we need 1) GEP PHI TRAN, 2) CRIT EDGE 3) MULTIPLE DIFFERENT
1338 VALS PRODUCED BY ONE BLOCK OVER DIFFERENT PATHS
1339
1340 //===---------------------------------------------------------------------===//
1341
1342 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=19633
1343 We could eliminate the branch condition here, loading from null is undefined:
1344
1345 struct S { int w, x, y, z; };
1346 struct T { int r; struct S s; };
1347 void bar (struct S, int);
1348 void foo (int a, struct T b)
1349 {
1350   struct S *c = 0;
1351   if (a)
1352     c = &b.s;
1353   bar (*c, a);
1354 }
1355
1356 //===---------------------------------------------------------------------===//
1357
1358 simplifylibcalls should do several optimizations for strspn/strcspn:
1359
1360 strcspn(x, "") -> strlen(x)
1361 strcspn("", x) -> 0
1362 strspn("", x) -> 0
1363 strspn(x, "") -> strlen(x)
1364 strspn(x, "a") -> strchr(x, 'a')-x
1365
1366 strcspn(x, "a") -> inlined loop for up to 3 letters (similarly for strspn):
1367
1368 size_t __strcspn_c3 (__const char *__s, int __reject1, int __reject2,
1369                      int __reject3) {
1370   register size_t __result = 0;
1371   while (__s[__result] != '\0' && __s[__result] != __reject1 &&
1372          __s[__result] != __reject2 && __s[__result] != __reject3)
1373     ++__result;
1374   return __result;
1375 }
1376
1377 This should turn into a switch on the character.  See PR3253 for some notes on
1378 codegen.
1379
1380 456.hmmer apparently uses strcspn and strspn a lot.  471.omnetpp uses strspn.
1381
1382 //===---------------------------------------------------------------------===//
1383
1384 "gas" uses this idiom:
1385   else if (strchr ("+-/*%|&^:[]()~", *intel_parser.op_string))
1386 ..
1387   else if (strchr ("<>", *intel_parser.op_string)
1388
1389 Those should be turned into a switch.
1390
1391 //===---------------------------------------------------------------------===//
1392
1393 252.eon contains this interesting code:
1394
1395         %3072 = getelementptr [100 x i8]* %tempString, i32 0, i32 0
1396         %3073 = call i8* @strcpy(i8* %3072, i8* %3071) nounwind
1397         %strlen = call i32 @strlen(i8* %3072)    ; uses = 1
1398         %endptr = getelementptr [100 x i8]* %tempString, i32 0, i32 %strlen
1399         call void @llvm.memcpy.i32(i8* %endptr,
1400           i8* getelementptr ([5 x i8]* @"\01LC42", i32 0, i32 0), i32 5, i32 1)
1401         %3074 = call i32 @strlen(i8* %endptr) nounwind readonly
1402
1403 This is interesting for a couple reasons.  First, in this:
1404
1405         %3073 = call i8* @strcpy(i8* %3072, i8* %3071) nounwind
1406         %strlen = call i32 @strlen(i8* %3072)
1407
1408 The strlen could be replaced with: %strlen = sub %3072, %3073, because the
1409 strcpy call returns a pointer to the end of the string.  Based on that, the
1410 endptr GEP just becomes equal to 3073, which eliminates a strlen call and GEP.
1411
1412 Second, the memcpy+strlen strlen can be replaced with:
1413
1414         %3074 = call i32 @strlen([5 x i8]* @"\01LC42") nounwind readonly
1415
1416 Because the destination was just copied into the specified memory buffer.  This,
1417 in turn, can be constant folded to "4".
1418
1419 In other code, it contains:
1420
1421         %endptr6978 = bitcast i8* %endptr69 to i32*
1422         store i32 7107374, i32* %endptr6978, align 1
1423         %3167 = call i32 @strlen(i8* %endptr69) nounwind readonly
1424
1425 Which could also be constant folded.  Whatever is producing this should probably
1426 be fixed to leave this as a memcpy from a string.
1427
1428 Further, eon also has an interesting partially redundant strlen call:
1429
1430 bb8:            ; preds = %_ZN18eonImageCalculatorC1Ev.exit
1431         %682 = getelementptr i8** %argv, i32 6          ; <i8**> [#uses=2]
1432         %683 = load i8** %682, align 4          ; <i8*> [#uses=4]
1433         %684 = load i8* %683, align 1           ; <i8> [#uses=1]
1434         %685 = icmp eq i8 %684, 0               ; <i1> [#uses=1]
1435         br i1 %685, label %bb10, label %bb9
1436
1437 bb9:            ; preds = %bb8
1438         %686 = call i32 @strlen(i8* %683) nounwind readonly
1439         %687 = icmp ugt i32 %686, 254           ; <i1> [#uses=1]
1440         br i1 %687, label %bb10, label %bb11
1441
1442 bb10:           ; preds = %bb9, %bb8
1443         %688 = call i32 @strlen(i8* %683) nounwind readonly
1444
1445 This could be eliminated by doing the strlen once in bb8, saving code size and
1446 improving perf on the bb8->9->10 path.
1447
1448 //===---------------------------------------------------------------------===//
1449
1450 I see an interesting fully redundant call to strlen left in 186.crafty:InputMove
1451 which looks like:
1452        %movetext11 = getelementptr [128 x i8]* %movetext, i32 0, i32 0
1453
1454
1455 bb62:           ; preds = %bb55, %bb53
1456         %promote.0 = phi i32 [ %169, %bb55 ], [ 0, %bb53 ]
1457         %171 = call i32 @strlen(i8* %movetext11) nounwind readonly align 1
1458         %172 = add i32 %171, -1         ; <i32> [#uses=1]
1459         %173 = getelementptr [128 x i8]* %movetext, i32 0, i32 %172
1460
1461 ...  no stores ...
1462        br i1 %or.cond, label %bb65, label %bb72
1463
1464 bb65:           ; preds = %bb62
1465         store i8 0, i8* %173, align 1
1466         br label %bb72
1467
1468 bb72:           ; preds = %bb65, %bb62
1469         %trank.1 = phi i32 [ %176, %bb65 ], [ -1, %bb62 ]
1470         %177 = call i32 @strlen(i8* %movetext11) nounwind readonly align 1
1471
1472 Note that on the bb62->bb72 path, that the %177 strlen call is partially
1473 redundant with the %171 call.  At worst, we could shove the %177 strlen call
1474 up into the bb65 block moving it out of the bb62->bb72 path.   However, note
1475 that bb65 stores to the string, zeroing out the last byte.  This means that on
1476 that path the value of %177 is actually just %171-1.  A sub is cheaper than a
1477 strlen!
1478
1479 This pattern repeats several times, basically doing:
1480
1481   A = strlen(P);
1482   P[A-1] = 0;
1483   B = strlen(P);
1484   where it is "obvious" that B = A-1.
1485
1486 //===---------------------------------------------------------------------===//
1487
1488 186.crafty contains this interesting pattern:
1489
1490 %77 = call i8* @strstr(i8* getelementptr ([6 x i8]* @"\01LC5", i32 0, i32 0),
1491                        i8* %30)
1492 %phitmp648 = icmp eq i8* %77, getelementptr ([6 x i8]* @"\01LC5", i32 0, i32 0)
1493 br i1 %phitmp648, label %bb70, label %bb76
1494
1495 bb70:           ; preds = %OptionMatch.exit91, %bb69
1496         %78 = call i32 @strlen(i8* %30) nounwind readonly align 1               ; <i32> [#uses=1]
1497
1498 This is basically:
1499   cststr = "abcdef";
1500   if (strstr(cststr, P) == cststr) {
1501      x = strlen(P);
1502      ...
1503
1504 The strstr call would be significantly cheaper written as:
1505
1506 cststr = "abcdef";
1507 if (memcmp(P, str, strlen(P)))
1508   x = strlen(P);
1509
1510 This is memcmp+strlen instead of strstr.  This also makes the strlen fully
1511 redundant.
1512
1513 //===---------------------------------------------------------------------===//
1514
1515 186.crafty also contains this code:
1516
1517 %1906 = call i32 @strlen(i8* getelementptr ([32 x i8]* @pgn_event, i32 0,i32 0))
1518 %1907 = getelementptr [32 x i8]* @pgn_event, i32 0, i32 %1906
1519 %1908 = call i8* @strcpy(i8* %1907, i8* %1905) nounwind align 1
1520 %1909 = call i32 @strlen(i8* getelementptr ([32 x i8]* @pgn_event, i32 0,i32 0))
1521 %1910 = getelementptr [32 x i8]* @pgn_event, i32 0, i32 %1909
1522
1523 The last strlen is computable as 1908-@pgn_event, which means 1910=1908.
1524
1525 //===---------------------------------------------------------------------===//
1526
1527 186.crafty has this interesting pattern with the "out.4543" variable:
1528
1529 call void @llvm.memcpy.i32(
1530         i8* getelementptr ([10 x i8]* @out.4543, i32 0, i32 0),
1531        i8* getelementptr ([7 x i8]* @"\01LC28700", i32 0, i32 0), i32 7, i32 1)
1532 %101 = call@printf(i8* ...   @out.4543, i32 0, i32 0)) nounwind
1533
1534 It is basically doing:
1535
1536   memcpy(globalarray, "string");
1537   printf(...,  globalarray);
1538
1539 Anyway, by knowing that printf just reads the memory and forward substituting
1540 the string directly into the printf, this eliminates reads from globalarray.
1541 Since this pattern occurs frequently in crafty (due to the "DisplayTime" and
1542 other similar functions) there are many stores to "out".  Once all the printfs
1543 stop using "out", all that is left is the memcpy's into it.  This should allow
1544 globalopt to remove the "stored only" global.
1545
1546 //===---------------------------------------------------------------------===//
1547
1548 This code:
1549
1550 define inreg i32 @foo(i8* inreg %p) nounwind {
1551   %tmp0 = load i8* %p
1552   %tmp1 = ashr i8 %tmp0, 5
1553   %tmp2 = sext i8 %tmp1 to i32
1554   ret i32 %tmp2
1555 }
1556
1557 could be dagcombine'd to a sign-extending load with a shift.
1558 For example, on x86 this currently gets this:
1559
1560         movb    (%eax), %al
1561         sarb    $5, %al
1562         movsbl  %al, %eax
1563
1564 while it could get this:
1565
1566         movsbl  (%eax), %eax
1567         sarl    $5, %eax
1568
1569 //===---------------------------------------------------------------------===//
1570
1571 GCC PR31029:
1572
1573 int test(int x) { return 1-x == x; }     // --> return false
1574 int test2(int x) { return 2-x == x; }    // --> return x == 1 ?
1575
1576 Always foldable for odd constants, what is the rule for even?
1577
1578 //===---------------------------------------------------------------------===//
1579
1580 PR 3381: GEP to field of size 0 inside a struct could be turned into GEP
1581 for next field in struct (which is at same address).
1582
1583 For example: store of float into { {{}}, float } could be turned into a store to
1584 the float directly.
1585
1586 //===---------------------------------------------------------------------===//
1587
1588 #include <math.h>
1589 double foo(double a) {    return sin(a); }
1590
1591 This compiles into this on x86-64 Linux:
1592 foo:
1593         subq    $8, %rsp
1594         call    sin
1595         addq    $8, %rsp
1596         ret
1597 vs:
1598
1599 foo:
1600         jmp sin
1601
1602 //===---------------------------------------------------------------------===//
1603
1604 The arg promotion pass should make use of nocapture to make its alias analysis
1605 stuff much more precise.
1606
1607 //===---------------------------------------------------------------------===//
1608
1609 The following functions should be optimized to use a select instead of a
1610 branch (from gcc PR40072):
1611
1612 char char_int(int m) {if(m>7) return 0; return m;}
1613 int int_char(char m) {if(m>7) return 0; return m;}
1614
1615 //===---------------------------------------------------------------------===//
1616
1617 int func(int a, int b) { if (a & 0x80) b |= 0x80; else b &= ~0x80; return b; }
1618
1619 Generates this:
1620
1621 define i32 @func(i32 %a, i32 %b) nounwind readnone ssp {
1622 entry:
1623   %0 = and i32 %a, 128                            ; <i32> [#uses=1]
1624   %1 = icmp eq i32 %0, 0                          ; <i1> [#uses=1]
1625   %2 = or i32 %b, 128                             ; <i32> [#uses=1]
1626   %3 = and i32 %b, -129                           ; <i32> [#uses=1]
1627   %b_addr.0 = select i1 %1, i32 %3, i32 %2        ; <i32> [#uses=1]
1628   ret i32 %b_addr.0
1629 }
1630
1631 However, it's functionally equivalent to:
1632
1633          b = (b & ~0x80) | (a & 0x80);
1634
1635 Which generates this:
1636
1637 define i32 @func(i32 %a, i32 %b) nounwind readnone ssp {
1638 entry:
1639   %0 = and i32 %b, -129                           ; <i32> [#uses=1]
1640   %1 = and i32 %a, 128                            ; <i32> [#uses=1]
1641   %2 = or i32 %0, %1                              ; <i32> [#uses=1]
1642   ret i32 %2
1643 }
1644
1645 This can be generalized for other forms:
1646
1647      b = (b & ~0x80) | (a & 0x40) << 1;
1648
1649 //===---------------------------------------------------------------------===//
1650
1651 These two functions produce different code. They shouldn't:
1652
1653 #include <stdint.h>
1654
1655 uint8_t p1(uint8_t b, uint8_t a) {
1656   b = (b & ~0xc0) | (a & 0xc0);
1657   return (b);
1658 }
1659
1660 uint8_t p2(uint8_t b, uint8_t a) {
1661   b = (b & ~0x40) | (a & 0x40);
1662   b = (b & ~0x80) | (a & 0x80);
1663   return (b);
1664 }
1665
1666 define zeroext i8 @p1(i8 zeroext %b, i8 zeroext %a) nounwind readnone ssp {
1667 entry:
1668   %0 = and i8 %b, 63                              ; <i8> [#uses=1]
1669   %1 = and i8 %a, -64                             ; <i8> [#uses=1]
1670   %2 = or i8 %1, %0                               ; <i8> [#uses=1]
1671   ret i8 %2
1672 }
1673
1674 define zeroext i8 @p2(i8 zeroext %b, i8 zeroext %a) nounwind readnone ssp {
1675 entry:
1676   %0 = and i8 %b, 63                              ; <i8> [#uses=1]
1677   %.masked = and i8 %a, 64                        ; <i8> [#uses=1]
1678   %1 = and i8 %a, -128                            ; <i8> [#uses=1]
1679   %2 = or i8 %1, %0                               ; <i8> [#uses=1]
1680   %3 = or i8 %2, %.masked                         ; <i8> [#uses=1]
1681   ret i8 %3
1682 }
1683
1684 //===---------------------------------------------------------------------===//
1685
1686 IPSCCP does not currently propagate argument dependent constants through
1687 functions where it does not not all of the callers.  This includes functions
1688 with normal external linkage as well as templates, C99 inline functions etc.
1689 Specifically, it does nothing to:
1690
1691 define i32 @test(i32 %x, i32 %y, i32 %z) nounwind {
1692 entry:
1693   %0 = add nsw i32 %y, %z
1694   %1 = mul i32 %0, %x
1695   %2 = mul i32 %y, %z
1696   %3 = add nsw i32 %1, %2
1697   ret i32 %3
1698 }
1699
1700 define i32 @test2() nounwind {
1701 entry:
1702   %0 = call i32 @test(i32 1, i32 2, i32 4) nounwind
1703   ret i32 %0
1704 }
1705
1706 It would be interesting extend IPSCCP to be able to handle simple cases like
1707 this, where all of the arguments to a call are constant.  Because IPSCCP runs
1708 before inlining, trivial templates and inline functions are not yet inlined.
1709 The results for a function + set of constant arguments should be memoized in a
1710 map.
1711
1712 //===---------------------------------------------------------------------===//
1713
1714 The libcall constant folding stuff should be moved out of SimplifyLibcalls into
1715 libanalysis' constantfolding logic.  This would allow IPSCCP to be able to
1716 handle simple things like this:
1717
1718 static int foo(const char *X) { return strlen(X); }
1719 int bar() { return foo("abcd"); }
1720
1721 //===---------------------------------------------------------------------===//
1722
1723 InstCombine should use SimplifyDemandedBits to remove the or instruction:
1724
1725 define i1 @test(i8 %x, i8 %y) {
1726   %A = or i8 %x, 1
1727   %B = icmp ugt i8 %A, 3
1728   ret i1 %B
1729 }
1730
1731 Currently instcombine calls SimplifyDemandedBits with either all bits or just
1732 the sign bit, if the comparison is obviously a sign test. In this case, we only
1733 need all but the bottom two bits from %A, and if we gave that mask to SDB it
1734 would delete the or instruction for us.
1735
1736 //===---------------------------------------------------------------------===//