lib/Target/README.txt

   1 Target Independent Opportunities:
   2
   3 //===---------------------------------------------------------------------===//
   4
   5 Dead argument elimination should be enhanced to handle cases when an argument is
   6 dead to an externally visible function.  Though the argument can't be removed
   7 from the externally visible function, the caller doesn't need to pass it in.
   8 For example in this testcase:
   9
  10   void foo(int X) __attribute__((noinline));
  11   void foo(int X) { sideeffect(); }
  12   void bar(int A) { foo(A+1); }
  13
  14 We compile bar to:
  15
  16 define void @bar(i32 %A) nounwind ssp {
  17   %0 = add nsw i32 %A, 1                          ; <i32> [#uses=1]
  18   tail call void @foo(i32 %0) nounwind noinline ssp
  19   ret void
  20 }
  21
  22 The add is dead, we could pass in 'i32 undef' instead.  This occurs for C++
  23 templates etc, which usually have linkonce_odr/weak_odr linkage, not internal
  24 linkage.
  25
  26 //===---------------------------------------------------------------------===//
  27
  28 With the recent changes to make the implicit def/use set explicit in
  29 machineinstrs, we should change the target descriptions for 'call' instructions
  30 so that the .td files don't list all the call-clobbered registers as implicit
  31 defs.  Instead, these should be added by the code generator (e.g. on the dag).
  32
  33 This has a number of uses:
  34
  35 1. PPC32/64 and X86 32/64 can avoid having multiple copies of call instructions
  36    for their different impdef sets.
  37 2. Targets with multiple calling convs (e.g. x86) which have different clobber
  38    sets don't need copies of call instructions.
  39 3. 'Interprocedural register allocation' can be done to reduce the clobber sets
  40    of calls.
  41
  42 //===---------------------------------------------------------------------===//
  43
  44 Make the PPC branch selector target independant
  45
  46 //===---------------------------------------------------------------------===//
  47
  48 Get the C front-end to expand hypot(x,y) -> llvm.sqrt(x*x+y*y) when errno and
  49 precision don't matter (ffastmath).  Misc/mandel will like this. :)  This isn't
  50 safe in general, even on darwin.  See the libm implementation of hypot for
  51 examples (which special case when x/y are exactly zero to get signed zeros etc
  52 right).
  53
  54 //===---------------------------------------------------------------------===//
  55
  56 Solve this DAG isel folding deficiency:
  57
  58 int X, Y;
  59
  60 void fn1(void)
  61 {
  62   X = X | (Y << 3);
  63 }
  64
  65 compiles to
  66
  67 fn1:
  68         movl Y, %eax
  69         shll $3, %eax
  70         orl X, %eax
  71         movl %eax, X
  72         ret
  73
  74 The problem is the store's chain operand is not the load X but rather
  75 a TokenFactor of the load X and load Y, which prevents the folding.
  76
  77 There are two ways to fix this:
  78
  79 1. The dag combiner can start using alias analysis to realize that y/x
  80    don't alias, making the store to X not dependent on the load from Y.
  81 2. The generated isel could be made smarter in the case it can't
  82    disambiguate the pointers.
  83
  84 Number 1 is the preferred solution.
  85
  86 This has been "fixed" by a TableGen hack. But that is a short term workaround
  87 which will be removed once the proper fix is made.
  88
  89 //===---------------------------------------------------------------------===//
  90
  91 On targets with expensive 64-bit multiply, we could LSR this:
  92
  93 for (i = ...; ++i) {
  94    x = 1ULL << i;
  95
  96 into:
  97  long long tmp = 1;
  98  for (i = ...; ++i, tmp+=tmp)
  99    x = tmp;
 100
 101 This would be a win on ppc32, but not x86 or ppc64.
 102
 103 //===---------------------------------------------------------------------===//
 104
 105 Shrink: (setlt (loadi32 P), 0) -> (setlt (loadi8 Phi), 0)
 106
 107 //===---------------------------------------------------------------------===//
 108
 109 Reassociate should turn things like:
 110
 111 int factorial(int X) {
 112  return X*X*X*X*X*X*X*X;
 113 }
 114
 115 into llvm.powi calls, allowing the code generator to produce balanced
 116 multiplication trees.
 117
 118 First, the intrinsic needs to be extended to support integers, and second the
 119 code generator needs to be enhanced to lower these to multiplication trees.
 120
 121 //===---------------------------------------------------------------------===//
 122
 123 Interesting? testcase for add/shift/mul reassoc:
 124
 125 int bar(int x, int y) {
 126   return x*x*x+y+x*x*x*x*x*y*y*y*y;
 127 }
 128 int foo(int z, int n) {
 129   return bar(z, n) + bar(2*z, 2*n);
 130 }
 131
 132 This is blocked on not handling X*X*X -> powi(X, 3) (see note above).  The issue
 133 is that we end up getting t = 2*X  s = t*t   and don't turn this into 4*X*X,
 134 which is the same number of multiplies and is canonical, because the 2*X has
 135 multiple uses.  Here's a simple example:
 136
 137 define i32 @test15(i32 %X1) {
 138   %B = mul i32 %X1, 47   ; X1*47
 139   %C = mul i32 %B, %B
 140   ret i32 %C
 141 }
 142
 143
 144 //===---------------------------------------------------------------------===//
 145
 146 Reassociate should handle the example in GCC PR16157:
 147
 148 extern int a0, a1, a2, a3, a4; extern int b0, b1, b2, b3, b4;
 149 void f () {  /* this can be optimized to four additions... */
 150         b4 = a4 + a3 + a2 + a1 + a0;
 151         b3 = a3 + a2 + a1 + a0;
 152         b2 = a2 + a1 + a0;
 153         b1 = a1 + a0;
 154 }
 155
 156 This requires reassociating to forms of expressions that are already available,
 157 something that reassoc doesn't think about yet.
 158
 159 //===---------------------------------------------------------------------===//
 160
 161 These two functions should generate the same code on big-endian systems:
 162
 163 int g(int *j,int *l)  {  return memcmp(j,l,4);  }
 164 int h(int *j, int *l) {  return *j - *l; }
 165
 166 this could be done in SelectionDAGISel.cpp, along with other special cases,
 167 for 1,2,4,8 bytes.
 168
 169 //===---------------------------------------------------------------------===//
 170
 171 It would be nice to revert this patch:
 172 http://lists.cs.uiuc.edu/pipermail/llvm-commits/Week-of-Mon-20060213/031986.html
 173
 174 And teach the dag combiner enough to simplify the code expanded before
 175 legalize.  It seems plausible that this knowledge would let it simplify other
 176 stuff too.
 177
 178 //===---------------------------------------------------------------------===//
 179
 180 For vector types, TargetData.cpp::getTypeInfo() returns alignment that is equal
 181 to the type size. It works but can be overly conservative as the alignment of
 182 specific vector types are target dependent.
 183
 184 //===---------------------------------------------------------------------===//
 185
 186 We should produce an unaligned load from code like this:
 187
 188 v4sf example(float *P) {
 189   return (v4sf){P[0], P[1], P[2], P[3] };
 190 }
 191
 192 //===---------------------------------------------------------------------===//
 193
 194 Add support for conditional increments, and other related patterns.  Instead
 195 of:
 196
 197         movl 136(%esp), %eax
 198         cmpl $0, %eax
 199         je LBB16_2      #cond_next
 200 LBB16_1:        #cond_true
 201         incl _foo
 202 LBB16_2:        #cond_next
 203
 204 emit:
 205         movl    _foo, %eax
 206         cmpl    $1, %edi
 207         sbbl    $-1, %eax
 208         movl    %eax, _foo
 209
 210 //===---------------------------------------------------------------------===//
 211
 212 Combine: a = sin(x), b = cos(x) into a,b = sincos(x).
 213
 214 Expand these to calls of sin/cos and stores:
 215       double sincos(double x, double *sin, double *cos);
 216       float sincosf(float x, float *sin, float *cos);
 217       long double sincosl(long double x, long double *sin, long double *cos);
 218
 219 Doing so could allow SROA of the destination pointers.  See also:
 220 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=17687
 221
 222 This is now easily doable with MRVs.  We could even make an intrinsic for this
 223 if anyone cared enough about sincos.
 224
 225 //===---------------------------------------------------------------------===//
 226
 227 Turn this into a single byte store with no load (the other 3 bytes are
 228 unmodified):
 229
 230 define void @test(i32* %P) {
 231         %tmp = load i32* %P
 232         %tmp14 = or i32 %tmp, 3305111552
 233         %tmp15 = and i32 %tmp14, 3321888767
 234         store i32 %tmp15, i32* %P
 235         ret void
 236 }
 237
 238 //===---------------------------------------------------------------------===//
 239
 240 dag/inst combine "clz(x)>>5 -> x==0" for 32-bit x.
 241
 242 Compile:
 243
 244 int bar(int x)
 245 {
 246   int t = __builtin_clz(x);
 247   return -(t>>5);
 248 }
 249
 250 to:
 251
 252 _bar:   addic r3,r3,-1
 253         subfe r3,r3,r3
 254         blr
 255
 256 //===---------------------------------------------------------------------===//
 257
 258 quantum_sigma_x in 462.libquantum contains the following loop:
 259
 260       for(i=0; i<reg->size; i++)
 261         {
 262           /* Flip the target bit of each basis state */
 263           reg->node[i].state ^= ((MAX_UNSIGNED) 1 << target);
 264         }
 265
 266 Where MAX_UNSIGNED/state is a 64-bit int.  On a 32-bit platform it would be just
 267 so cool to turn it into something like:
 268
 269    long long Res = ((MAX_UNSIGNED) 1 << target);
 270    if (target < 32) {
 271      for(i=0; i<reg->size; i++)
 272        reg->node[i].state ^= Res & 0xFFFFFFFFULL;
 273    } else {
 274      for(i=0; i<reg->size; i++)
 275        reg->node[i].state ^= Res & 0xFFFFFFFF00000000ULL
 276    }
 277
 278 ... which would only do one 32-bit XOR per loop iteration instead of two.
 279
 280 It would also be nice to recognize the reg->size doesn't alias reg->node[i], but
 281 this requires TBAA.
 282
 283 //===---------------------------------------------------------------------===//
 284
 285 This isn't recognized as bswap by instcombine (yes, it really is bswap):
 286
 287 unsigned long reverse(unsigned v) {
 288     unsigned t;
 289     t = v ^ ((v << 16) | (v >> 16));
 290     t &= ~0xff0000;
 291     v = (v << 24) | (v >> 8);
 292     return v ^ (t >> 8);
 293 }
 294
 295 //===---------------------------------------------------------------------===//
 296
 297 These idioms should be recognized as popcount (see PR1488):
 298
 299 unsigned countbits_slow(unsigned v) {
 300   unsigned c;
 301   for (c = 0; v; v >>= 1)
 302     c += v & 1;
 303   return c;
 304 }
 305 unsigned countbits_fast(unsigned v){
 306   unsigned c;
 307   for (c = 0; v; c++)
 308     v &= v - 1; // clear the least significant bit set
 309   return c;
 310 }
 311
 312 BITBOARD = unsigned long long
 313 int PopCnt(register BITBOARD a) {
 314   register int c=0;
 315   while(a) {
 316     c++;
 317     a &= a - 1;
 318   }
 319   return c;
 320 }
 321 unsigned int popcount(unsigned int input) {
 322   unsigned int count = 0;
 323   for (unsigned int i =  0; i < 4 * 8; i++)
 324     count += (input >> i) & i;
 325   return count;
 326 }
 327
 328 This is a form of idiom recognition for loops, the same thing that could be
 329 useful for recognizing memset/memcpy.
 330
 331 //===---------------------------------------------------------------------===//
 332
 333 These should turn into single 16-bit (unaligned?) loads on little/big endian
 334 processors.
 335
 336 unsigned short read_16_le(const unsigned char *adr) {
 337   return adr[0] | (adr[1] << 8);
 338 }
 339 unsigned short read_16_be(const unsigned char *adr) {
 340   return (adr[0] << 8) | adr[1];
 341 }
 342
 343 //===---------------------------------------------------------------------===//
 344
 345 -instcombine should handle this transform:
 346    icmp pred (sdiv X / C1 ), C2
 347 when X, C1, and C2 are unsigned.  Similarly for udiv and signed operands.
 348
 349 Currently InstCombine avoids this transform but will do it when the signs of
 350 the operands and the sign of the divide match. See the FIXME in
 351 InstructionCombining.cpp in the visitSetCondInst method after the switch case
 352 for Instruction::UDiv (around line 4447) for more details.
 353
 354 The SingleSource/Benchmarks/Shootout-C++/hash and hash2 tests have examples of
 355 this construct.
 356
 357 //===---------------------------------------------------------------------===//
 358
 359 viterbi speeds up *significantly* if the various "history" related copy loops
 360 are turned into memcpy calls at the source level.  We need a "loops to memcpy"
 361 pass.
 362
 363 //===---------------------------------------------------------------------===//
 364
 365 Consider:
 366
 367 typedef unsigned U32;
 368 typedef unsigned long long U64;
 369 int test (U32 *inst, U64 *regs) {
 370     U64 effective_addr2;
 371     U32 temp = *inst;
 372     int r1 = (temp >> 20) & 0xf;
 373     int b2 = (temp >> 16) & 0xf;
 374     effective_addr2 = temp & 0xfff;
 375     if (b2) effective_addr2 += regs[b2];
 376     b2 = (temp >> 12) & 0xf;
 377     if (b2) effective_addr2 += regs[b2];
 378     effective_addr2 &= regs[4];
 379      if ((effective_addr2 & 3) == 0)
 380         return 1;
 381     return 0;
 382 }
 383
 384 Note that only the low 2 bits of effective_addr2 are used.  On 32-bit systems,
 385 we don't eliminate the computation of the top half of effective_addr2 because
 386 we don't have whole-function selection dags.  On x86, this means we use one
 387 extra register for the function when effective_addr2 is declared as U64 than
 388 when it is declared U32.
 389
 390 PHI Slicing could be extended to do this.
 391
 392 //===---------------------------------------------------------------------===//
 393
 394 LSR should know what GPR types a target has from TargetData.  This code:
 395
 396 volatile short X, Y; // globals
 397
 398 void foo(int N) {
 399   int i;
 400   for (i = 0; i < N; i++) { X = i; Y = i*4; }
 401 }
 402
 403 produces two near identical IV's (after promotion) on PPC/ARM:
 404
 405 LBB1_2:
 406         ldr r3, LCPI1_0
 407         ldr r3, [r3]
 408         strh r2, [r3]
 409         ldr r3, LCPI1_1
 410         ldr r3, [r3]
 411         strh r1, [r3]
 412         add r1, r1, #4
 413         add r2, r2, #1   <- [0,+,1]
 414         sub r0, r0, #1   <- [0,-,1]
 415         cmp r0, #0
 416         bne LBB1_2
 417
 418 LSR should reuse the "+" IV for the exit test.
 419
 420 //===---------------------------------------------------------------------===//
 421
 422 Tail call elim should be more aggressive, checking to see if the call is
 423 followed by an uncond branch to an exit block.
 424
 425 ; This testcase is due to tail-duplication not wanting to copy the return
 426 ; instruction into the terminating blocks because there was other code
 427 ; optimized out of the function after the taildup happened.
 428 ; RUN: llvm-as < %s | opt -tailcallelim | llvm-dis | not grep call
 429
 430 define i32 @t4(i32 %a) {
 431 entry:
 432         %tmp.1 = and i32 %a, 1          ; <i32> [#uses=1]
 433         %tmp.2 = icmp ne i32 %tmp.1, 0          ; <i1> [#uses=1]
 434         br i1 %tmp.2, label %then.0, label %else.0
 435
 436 then.0:         ; preds = %entry
 437         %tmp.5 = add i32 %a, -1         ; <i32> [#uses=1]
 438         %tmp.3 = call i32 @t4( i32 %tmp.5 )             ; <i32> [#uses=1]
 439         br label %return
 440
 441 else.0:         ; preds = %entry
 442         %tmp.7 = icmp ne i32 %a, 0              ; <i1> [#uses=1]
 443         br i1 %tmp.7, label %then.1, label %return
 444
 445 then.1:         ; preds = %else.0
 446         %tmp.11 = add i32 %a, -2                ; <i32> [#uses=1]
 447         %tmp.9 = call i32 @t4( i32 %tmp.11 )            ; <i32> [#uses=1]
 448         br label %return
 449
 450 return:         ; preds = %then.1, %else.0, %then.0
 451         %result.0 = phi i32 [ 0, %else.0 ], [ %tmp.3, %then.0 ],
 452                             [ %tmp.9, %then.1 ]
 453         ret i32 %result.0
 454 }
 455
 456 //===---------------------------------------------------------------------===//
 457
 458 Tail recursion elimination should handle:
 459
 460 int pow2m1(int n) {
 461  if (n == 0)
 462    return 0;
 463  return 2 * pow2m1 (n - 1) + 1;
 464 }
 465
 466 Also, multiplies can be turned into SHL's, so they should be handled as if
 467 they were associative.  "return foo() << 1" can be tail recursion eliminated.
 468
 469 //===---------------------------------------------------------------------===//
 470
 471 Argument promotion should promote arguments for recursive functions, like
 472 this:
 473
 474 ; RUN: llvm-as < %s | opt -argpromotion | llvm-dis | grep x.val
 475
 476 define internal i32 @foo(i32* %x) {
 477 entry:
 478         %tmp = load i32* %x             ; <i32> [#uses=0]
 479         %tmp.foo = call i32 @foo( i32* %x )             ; <i32> [#uses=1]
 480         ret i32 %tmp.foo
 481 }
 482
 483 define i32 @bar(i32* %x) {
 484 entry:
 485         %tmp3 = call i32 @foo( i32* %x )                ; <i32> [#uses=1]
 486         ret i32 %tmp3
 487 }
 488
 489 //===---------------------------------------------------------------------===//
 490
 491 We should investigate an instruction sinking pass.  Consider this silly
 492 example in pic mode:
 493
 494 #include <assert.h>
 495 void foo(int x) {
 496   assert(x);
 497   //...
 498 }
 499
 500 we compile this to:
 501 _foo:
 502         subl    $28, %esp
 503         call    "L1$pb"
 504 "L1$pb":
 505         popl    %eax
 506         cmpl    $0, 32(%esp)
 507         je      LBB1_2  # cond_true
 508 LBB1_1: # return
 509         # ...
 510         addl    $28, %esp
 511         ret
 512 LBB1_2: # cond_true
 513 ...
 514
 515 The PIC base computation (call+popl) is only used on one path through the
 516 code, but is currently always computed in the entry block.  It would be
 517 better to sink the picbase computation down into the block for the
 518 assertion, as it is the only one that uses it.  This happens for a lot of
 519 code with early outs.
 520
 521 Another example is loads of arguments, which are usually emitted into the
 522 entry block on targets like x86.  If not used in all paths through a
 523 function, they should be sunk into the ones that do.
 524
 525 In this case, whole-function-isel would also handle this.
 526
 527 //===---------------------------------------------------------------------===//
 528
 529 Investigate lowering of sparse switch statements into perfect hash tables:
 530 http://burtleburtle.net/bob/hash/perfect.html
 531
 532 //===---------------------------------------------------------------------===//
 533
 534 We should turn things like "load+fabs+store" and "load+fneg+store" into the
 535 corresponding integer operations.  On a yonah, this loop:
 536
 537 double a[256];
 538 void foo() {
 539   int i, b;
 540   for (b = 0; b < 10000000; b++)
 541   for (i = 0; i < 256; i++)
 542     a[i] = -a[i];
 543 }
 544
 545 is twice as slow as this loop:
 546
 547 long long a[256];
 548 void foo() {
 549   int i, b;
 550   for (b = 0; b < 10000000; b++)
 551   for (i = 0; i < 256; i++)
 552     a[i] ^= (1ULL << 63);
 553 }
 554
 555 and I suspect other processors are similar.  On X86 in particular this is a
 556 big win because doing this with integers allows the use of read/modify/write
 557 instructions.
 558
 559 //===---------------------------------------------------------------------===//
 560
 561 DAG Combiner should try to combine small loads into larger loads when
 562 profitable.  For example, we compile this C++ example:
 563
 564 struct THotKey { short Key; bool Control; bool Shift; bool Alt; };
 565 extern THotKey m_HotKey;
 566 THotKey GetHotKey () { return m_HotKey; }
 567
 568 into (-O3 -fno-exceptions -static -fomit-frame-pointer):
 569
 570 __Z9GetHotKeyv:
 571         pushl   %esi
 572         movl    8(%esp), %eax
 573         movb    _m_HotKey+3, %cl
 574         movb    _m_HotKey+4, %dl
 575         movb    _m_HotKey+2, %ch
 576         movw    _m_HotKey, %si
 577         movw    %si, (%eax)
 578         movb    %ch, 2(%eax)
 579         movb    %cl, 3(%eax)
 580         movb    %dl, 4(%eax)
 581         popl    %esi
 582         ret     $4
 583
 584 GCC produces:
 585
 586 __Z9GetHotKeyv:
 587         movl    _m_HotKey, %edx
 588         movl    4(%esp), %eax
 589         movl    %edx, (%eax)
 590         movzwl  _m_HotKey+4, %edx
 591         movw    %dx, 4(%eax)
 592         ret     $4
 593
 594 The LLVM IR contains the needed alignment info, so we should be able to
 595 merge the loads and stores into 4-byte loads:
 596
 597         %struct.THotKey = type { i16, i8, i8, i8 }
 598 define void @_Z9GetHotKeyv(%struct.THotKey* sret  %agg.result) nounwind  {
 599 ...
 600         %tmp2 = load i16* getelementptr (@m_HotKey, i32 0, i32 0), align 8
 601         %tmp5 = load i8* getelementptr (@m_HotKey, i32 0, i32 1), align 2
 602         %tmp8 = load i8* getelementptr (@m_HotKey, i32 0, i32 2), align 1
 603         %tmp11 = load i8* getelementptr (@m_HotKey, i32 0, i32 3), align 2
 604
 605 Alternatively, we should use a small amount of base-offset alias analysis
 606 to make it so the scheduler doesn't need to hold all the loads in regs at
 607 once.
 608
 609 //===---------------------------------------------------------------------===//
 610
 611 We should add an FRINT node to the DAG to model targets that have legal
 612 implementations of ceil/floor/rint.
 613
 614 //===---------------------------------------------------------------------===//
 615
 616 Consider:
 617
 618 int test() {
 619   long long input[8] = {1,1,1,1,1,1,1,1};
 620   foo(input);
 621 }
 622
 623 We currently compile this into a memcpy from a global array since the
 624 initializer is fairly large and not memset'able.  This is good, but the memcpy
 625 gets lowered to load/stores in the code generator.  This is also ok, except
 626 that the codegen lowering for memcpy doesn't handle the case when the source
 627 is a constant global.  This gives us atrocious code like this:
 628
 629         call    "L1$pb"
 630 "L1$pb":
 631         popl    %eax
 632         movl    _C.0.1444-"L1$pb"+32(%eax), %ecx
 633         movl    %ecx, 40(%esp)
 634         movl    _C.0.1444-"L1$pb"+20(%eax), %ecx
 635         movl    %ecx, 28(%esp)
 636         movl    _C.0.1444-"L1$pb"+36(%eax), %ecx
 637         movl    %ecx, 44(%esp)
 638         movl    _C.0.1444-"L1$pb"+44(%eax), %ecx
 639         movl    %ecx, 52(%esp)
 640         movl    _C.0.1444-"L1$pb"+40(%eax), %ecx
 641         movl    %ecx, 48(%esp)
 642         movl    _C.0.1444-"L1$pb"+12(%eax), %ecx
 643         movl    %ecx, 20(%esp)
 644         movl    _C.0.1444-"L1$pb"+4(%eax), %ecx
 645 ...
 646
 647 instead of:
 648         movl    $1, 16(%esp)
 649         movl    $0, 20(%esp)
 650         movl    $1, 24(%esp)
 651         movl    $0, 28(%esp)
 652         movl    $1, 32(%esp)
 653         movl    $0, 36(%esp)
 654         ...
 655
 656 //===---------------------------------------------------------------------===//
 657
 658 http://llvm.org/PR717:
 659
 660 The following code should compile into "ret int undef". Instead, LLVM
 661 produces "ret int 0":
 662
 663 int f() {
 664   int x = 4;
 665   int y;
 666   if (x == 3) y = 0;
 667   return y;
 668 }
 669
 670 //===---------------------------------------------------------------------===//
 671
 672 The loop unroller should partially unroll loops (instead of peeling them)
 673 when code growth isn't too bad and when an unroll count allows simplification
 674 of some code within the loop.  One trivial example is:
 675
 676 #include <stdio.h>
 677 int main() {
 678     int nRet = 17;
 679     int nLoop;
 680     for ( nLoop = 0; nLoop < 1000; nLoop++ ) {
 681         if ( nLoop & 1 )
 682             nRet += 2;
 683         else
 684             nRet -= 1;
 685     }
 686     return nRet;
 687 }
 688
 689 Unrolling by 2 would eliminate the '&1' in both copies, leading to a net
 690 reduction in code size.  The resultant code would then also be suitable for
 691 exit value computation.
 692
 693 //===---------------------------------------------------------------------===//
 694
 695 We miss a bunch of rotate opportunities on various targets, including ppc, x86,
 696 etc.  On X86, we miss a bunch of 'rotate by variable' cases because the rotate
 697 matching code in dag combine doesn't look through truncates aggressively
 698 enough.  Here are some testcases reduces from GCC PR17886:
 699
 700 unsigned long long f(unsigned long long x, int y) {
 701   return (x << y) | (x >> 64-y);
 702 }
 703 unsigned f2(unsigned x, int y){
 704   return (x << y) | (x >> 32-y);
 705 }
 706 unsigned long long f3(unsigned long long x){
 707   int y = 9;
 708   return (x << y) | (x >> 64-y);
 709 }
 710 unsigned f4(unsigned x){
 711   int y = 10;
 712   return (x << y) | (x >> 32-y);
 713 }
 714 unsigned long long f5(unsigned long long x, unsigned long long y) {
 715   return (x << 8) | ((y >> 48) & 0xffull);
 716 }
 717 unsigned long long f6(unsigned long long x, unsigned long long y, int z) {
 718   switch(z) {
 719   case 1:
 720     return (x << 8) | ((y >> 48) & 0xffull);
 721   case 2:
 722     return (x << 16) | ((y >> 40) & 0xffffull);
 723   case 3:
 724     return (x << 24) | ((y >> 32) & 0xffffffull);
 725   case 4:
 726     return (x << 32) | ((y >> 24) & 0xffffffffull);
 727   default:
 728     return (x << 40) | ((y >> 16) & 0xffffffffffull);
 729   }
 730 }
 731
 732 On X86-64, we only handle f2/f3/f4 right.  On x86-32, a few of these
 733 generate truly horrible code, instead of using shld and friends.  On
 734 ARM, we end up with calls to L___lshrdi3/L___ashldi3 in f, which is
 735 badness.  PPC64 misses f, f5 and f6.  CellSPU aborts in isel.
 736
 737 //===---------------------------------------------------------------------===//
 738
 739 We do a number of simplifications in simplify libcalls to strength reduce
 740 standard library functions, but we don't currently merge them together.  For
 741 example, it is useful to merge memcpy(a,b,strlen(b)) -> strcpy.  This can only
 742 be done safely if "b" isn't modified between the strlen and memcpy of course.
 743
 744 //===---------------------------------------------------------------------===//
 745
 746 We compile this program: (from GCC PR11680)
 747 http://gcc.gnu.org/bugzilla/attachment.cgi?id=4487
 748
 749 Into code that runs the same speed in fast/slow modes, but both modes run 2x
 750 slower than when compile with GCC (either 4.0 or 4.2):
 751
 752 $ llvm-g++ perf.cpp -O3 -fno-exceptions
 753 $ time ./a.out fast
 754 1.821u 0.003s 0:01.82 100.0%    0+0k 0+0io 0pf+0w
 755
 756 $ g++ perf.cpp -O3 -fno-exceptions
 757 $ time ./a.out fast
 758 0.821u 0.001s 0:00.82 100.0%    0+0k 0+0io 0pf+0w
 759
 760 It looks like we are making the same inlining decisions, so this may be raw
 761 codegen badness or something else (haven't investigated).
 762
 763 //===---------------------------------------------------------------------===//
 764
 765 We miss some instcombines for stuff like this:
 766 void bar (void);
 767 void foo (unsigned int a) {
 768   /* This one is equivalent to a >= (3 << 2).  */
 769   if ((a >> 2) >= 3)
 770     bar ();
 771 }
 772
 773 A few other related ones are in GCC PR14753.
 774
 775 //===---------------------------------------------------------------------===//
 776
 777 Divisibility by constant can be simplified (according to GCC PR12849) from
 778 being a mulhi to being a mul lo (cheaper).  Testcase:
 779
 780 void bar(unsigned n) {
 781   if (n % 3 == 0)
 782     true();
 783 }
 784
 785 This is equivalent to the following, where 2863311531 is the multiplicative
 786 inverse of 3, and 1431655766 is ((2^32)-1)/3+1:
 787 void bar(unsigned n) {
 788   if (n * 2863311531U < 1431655766U)
 789     true();
 790 }
 791
 792 The same transformation can work with an even modulo with the addition of a
 793 rotate: rotate the result of the multiply to the right by the number of bits
 794 which need to be zero for the condition to be true, and shrink the compare RHS
 795 by the same amount.  Unless the target supports rotates, though, that
 796 transformation probably isn't worthwhile.
 797
 798 The transformation can also easily be made to work with non-zero equality
 799 comparisons: just transform, for example, "n % 3 == 1" to "(n-1) % 3 == 0".
 800
 801 //===---------------------------------------------------------------------===//
 802
 803 Better mod/ref analysis for scanf would allow us to eliminate the vtable and a
 804 bunch of other stuff from this example (see PR1604):
 805
 806 #include <cstdio>
 807 struct test {
 808     int val;
 809     virtual ~test() {}
 810 };
 811
 812 int main() {
 813     test t;
 814     std::scanf("%d", &t.val);
 815     std::printf("%d\n", t.val);
 816 }
 817
 818 //===---------------------------------------------------------------------===//
 819
 820 These functions perform the same computation, but produce different assembly.
 821
 822 define i8 @select(i8 %x) readnone nounwind {
 823   %A = icmp ult i8 %x, 250
 824   %B = select i1 %A, i8 0, i8 1
 825   ret i8 %B
 826 }
 827
 828 define i8 @addshr(i8 %x) readnone nounwind {
 829   %A = zext i8 %x to i9
 830   %B = add i9 %A, 6       ;; 256 - 250 == 6
 831   %C = lshr i9 %B, 8
 832   %D = trunc i9 %C to i8
 833   ret i8 %D
 834 }
 835
 836 //===---------------------------------------------------------------------===//
 837
 838 From gcc bug 24696:
 839 int
 840 f (unsigned long a, unsigned long b, unsigned long c)
 841 {
 842   return ((a & (c - 1)) != 0) || ((b & (c - 1)) != 0);
 843 }
 844 int
 845 f (unsigned long a, unsigned long b, unsigned long c)
 846 {
 847   return ((a & (c - 1)) != 0) | ((b & (c - 1)) != 0);
 848 }
 849 Both should combine to ((a|b) & (c-1)) != 0.  Currently not optimized with
 850 "clang -emit-llvm-bc | opt -std-compile-opts".
 851
 852 //===---------------------------------------------------------------------===//
 853
 854 From GCC Bug 20192:
 855 #define PMD_MASK    (~((1UL << 23) - 1))
 856 void clear_pmd_range(unsigned long start, unsigned long end)
 857 {
 858    if (!(start & ~PMD_MASK) && !(end & ~PMD_MASK))
 859        f();
 860 }
 861 The expression should optimize to something like
 862 "!((start|end)&~PMD_MASK). Currently not optimized with "clang
 863 -emit-llvm-bc | opt -std-compile-opts".
 864
 865 //===---------------------------------------------------------------------===//
 866
 867 From GCC Bug 3756:
 868 int
 869 pn (int n)
 870 {
 871  return (n >= 0 ? 1 : -1);
 872 }
 873 Should combine to (n >> 31) | 1.  Currently not optimized with "clang
 874 -emit-llvm-bc | opt -std-compile-opts | llc".
 875
 876 //===---------------------------------------------------------------------===//
 877
 878 void a(int variable)
 879 {
 880  if (variable == 4 || variable == 6)
 881    bar();
 882 }
 883 This should optimize to "if ((variable | 2) == 6)".  Currently not
 884 optimized with "clang -emit-llvm-bc | opt -std-compile-opts | llc".
 885
 886 //===---------------------------------------------------------------------===//
 887
 888 unsigned int f(unsigned int i, unsigned int n) {++i; if (i == n) ++i; return
 889 i;}
 890 unsigned int f2(unsigned int i, unsigned int n) {++i; i += i == n; return i;}
 891 These should combine to the same thing.  Currently, the first function
 892 produces better code on X86.
 893
 894 //===---------------------------------------------------------------------===//
 895
 896 From GCC Bug 15784:
 897 #define abs(x) x>0?x:-x
 898 int f(int x, int y)
 899 {
 900  return (abs(x)) >= 0;
 901 }
 902 This should optimize to x == INT_MIN. (With -fwrapv.)  Currently not
 903 optimized with "clang -emit-llvm-bc | opt -std-compile-opts".
 904
 905 //===---------------------------------------------------------------------===//
 906
 907 From GCC Bug 14753:
 908 void
 909 rotate_cst (unsigned int a)
 910 {
 911  a = (a << 10) | (a >> 22);
 912  if (a == 123)
 913    bar ();
 914 }
 915 void
 916 minus_cst (unsigned int a)
 917 {
 918  unsigned int tem;
 919
 920  tem = 20 - a;
 921  if (tem == 5)
 922    bar ();
 923 }
 924 void
 925 mask_gt (unsigned int a)
 926 {
 927  /* This is equivalent to a > 15.  */
 928  if ((a & ~7) > 8)
 929    bar ();
 930 }
 931 void
 932 rshift_gt (unsigned int a)
 933 {
 934  /* This is equivalent to a > 23.  */
 935  if ((a >> 2) > 5)
 936    bar ();
 937 }
 938 All should simplify to a single comparison.  All of these are
 939 currently not optimized with "clang -emit-llvm-bc | opt
 940 -std-compile-opts".
 941
 942 //===---------------------------------------------------------------------===//
 943
 944 From GCC Bug 32605:
 945 int c(int* x) {return (char*)x+2 == (char*)x;}
 946 Should combine to 0.  Currently not optimized with "clang
 947 -emit-llvm-bc | opt -std-compile-opts" (although llc can optimize it).
 948
 949 //===---------------------------------------------------------------------===//
 950
 951 int a(unsigned b) {return ((b << 31) | (b << 30)) >> 31;}
 952 Should be combined to  "((b >> 1) | b) & 1".  Currently not optimized
 953 with "clang -emit-llvm-bc | opt -std-compile-opts".
 954
 955 //===---------------------------------------------------------------------===//
 956
 957 unsigned a(unsigned x, unsigned y) { return x | (y & 1) | (y & 2);}
 958 Should combine to "x | (y & 3)".  Currently not optimized with "clang
 959 -emit-llvm-bc | opt -std-compile-opts".
 960
 961 //===---------------------------------------------------------------------===//
 962
 963 int a(int a, int b, int c) {return (~a & c) | ((c|a) & b);}
 964 Should fold to "(~a & c) | (a & b)".  Currently not optimized with
 965 "clang -emit-llvm-bc | opt -std-compile-opts".
 966
 967 //===---------------------------------------------------------------------===//
 968
 969 int a(int a,int b) {return (~(a|b))|a;}
 970 Should fold to "a|~b".  Currently not optimized with "clang
 971 -emit-llvm-bc | opt -std-compile-opts".
 972
 973 //===---------------------------------------------------------------------===//
 974
 975 int a(int a, int b) {return (a&&b) || (a&&!b);}
 976 Should fold to "a".  Currently not optimized with "clang -emit-llvm-bc
 977 | opt -std-compile-opts".
 978
 979 //===---------------------------------------------------------------------===//
 980
 981 int a(int a, int b, int c) {return (a&&b) || (!a&&c);}
 982 Should fold to "a ? b : c", or at least something sane.  Currently not
 983 optimized with "clang -emit-llvm-bc | opt -std-compile-opts".
 984
 985 //===---------------------------------------------------------------------===//
 986
 987 int a(int a, int b, int c) {return (a&&b) || (a&&c) || (a&&b&&c);}
 988 Should fold to a && (b || c).  Currently not optimized with "clang
 989 -emit-llvm-bc | opt -std-compile-opts".
 990
 991 //===---------------------------------------------------------------------===//
 992
 993 int a(int x) {return x | ((x & 8) ^ 8);}
 994 Should combine to x | 8.  Currently not optimized with "clang
 995 -emit-llvm-bc | opt -std-compile-opts".
 996
 997 //===---------------------------------------------------------------------===//
 998
 999 int a(int x) {return x ^ ((x & 8) ^ 8);}
1000 Should also combine to x | 8.  Currently not optimized with "clang
1001 -emit-llvm-bc | opt -std-compile-opts".
1002
1003 //===---------------------------------------------------------------------===//
1004
1005 int a(int x) {return (x & 8) == 0 ? -1 : -9;}
1006 Should combine to (x | -9) ^ 8.  Currently not optimized with "clang
1007 -emit-llvm-bc | opt -std-compile-opts".
1008
1009 //===---------------------------------------------------------------------===//
1010
1011 int a(int x) {return (x & 8) == 0 ? -9 : -1;}
1012 Should combine to x | -9.  Currently not optimized with "clang
1013 -emit-llvm-bc | opt -std-compile-opts".
1014
1015 //===---------------------------------------------------------------------===//
1016
1017 int a(int x) {return ((x | -9) ^ 8) & x;}
1018 Should combine to x & -9.  Currently not optimized with "clang
1019 -emit-llvm-bc | opt -std-compile-opts".
1020
1021 //===---------------------------------------------------------------------===//
1022
1023 unsigned a(unsigned a) {return a * 0x11111111 >> 28 & 1;}
1024 Should combine to "a * 0x88888888 >> 31".  Currently not optimized
1025 with "clang -emit-llvm-bc | opt -std-compile-opts".
1026
1027 //===---------------------------------------------------------------------===//
1028
1029 unsigned a(char* x) {if ((*x & 32) == 0) return b();}
1030 There's an unnecessary zext in the generated code with "clang
1031 -emit-llvm-bc | opt -std-compile-opts".
1032
1033 //===---------------------------------------------------------------------===//
1034
1035 unsigned a(unsigned long long x) {return 40 * (x >> 1);}
1036 Should combine to "20 * (((unsigned)x) & -2)".  Currently not
1037 optimized with "clang -emit-llvm-bc | opt -std-compile-opts".
1038
1039 //===---------------------------------------------------------------------===//
1040
1041 This was noticed in the entryblock for grokdeclarator in 403.gcc:
1042
1043         %tmp = icmp eq i32 %decl_context, 4
1044         %decl_context_addr.0 = select i1 %tmp, i32 3, i32 %decl_context
1045         %tmp1 = icmp eq i32 %decl_context_addr.0, 1
1046         %decl_context_addr.1 = select i1 %tmp1, i32 0, i32 %decl_context_addr.0
1047
1048 tmp1 should be simplified to something like:
1049   (!tmp || decl_context == 1)
1050
1051 This allows recursive simplifications, tmp1 is used all over the place in
1052 the function, e.g. by:
1053
1054         %tmp23 = icmp eq i32 %decl_context_addr.1, 0            ; <i1> [#uses=1]
1055         %tmp24 = xor i1 %tmp1, true             ; <i1> [#uses=1]
1056         %or.cond8 = and i1 %tmp23, %tmp24               ; <i1> [#uses=1]
1057
1058 later.
1059
1060 //===---------------------------------------------------------------------===//
1061
1062 [STORE SINKING]
1063
1064 Store sinking: This code:
1065
1066 void f (int n, int *cond, int *res) {
1067     int i;
1068     *res = 0;
1069     for (i = 0; i < n; i++)
1070         if (*cond)
1071             *res ^= 234; /* (*) */
1072 }
1073
1074 On this function GVN hoists the fully redundant value of *res, but nothing
1075 moves the store out.  This gives us this code:
1076
1077 bb:             ; preds = %bb2, %entry
1078         %.rle = phi i32 [ 0, %entry ], [ %.rle6, %bb2 ]
1079         %i.05 = phi i32 [ 0, %entry ], [ %indvar.next, %bb2 ]
1080         %1 = load i32* %cond, align 4
1081         %2 = icmp eq i32 %1, 0
1082         br i1 %2, label %bb2, label %bb1
1083
1084 bb1:            ; preds = %bb
1085         %3 = xor i32 %.rle, 234
1086         store i32 %3, i32* %res, align 4
1087         br label %bb2
1088
1089 bb2:            ; preds = %bb, %bb1
1090         %.rle6 = phi i32 [ %3, %bb1 ], [ %.rle, %bb ]
1091         %indvar.next = add i32 %i.05, 1
1092         %exitcond = icmp eq i32 %indvar.next, %n
1093         br i1 %exitcond, label %return, label %bb
1094
1095 DSE should sink partially dead stores to get the store out of the loop.
1096
1097 Here's another partial dead case:
1098 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=12395
1099
1100 //===---------------------------------------------------------------------===//
1101
1102 Scalar PRE hoists the mul in the common block up to the else:
1103
1104 int test (int a, int b, int c, int g) {
1105   int d, e;
1106   if (a)
1107     d = b * c;
1108   else
1109     d = b - c;
1110   e = b * c + g;
1111   return d + e;
1112 }
1113
1114 It would be better to do the mul once to reduce codesize above the if.
1115 This is GCC PR38204.
1116
1117 //===---------------------------------------------------------------------===//
1118
1119 [STORE SINKING]
1120
1121 GCC PR37810 is an interesting case where we should sink load/store reload
1122 into the if block and outside the loop, so we don't reload/store it on the
1123 non-call path.
1124
1125 for () {
1126   *P += 1;
1127   if ()
1128     call();
1129   else
1130     ...
1131 ->
1132 tmp = *P
1133 for () {
1134   tmp += 1;
1135   if () {
1136     *P = tmp;
1137     call();
1138     tmp = *P;
1139   } else ...
1140 }
1141 *P = tmp;
1142
1143 We now hoist the reload after the call (Transforms/GVN/lpre-call-wrap.ll), but
1144 we don't sink the store.  We need partially dead store sinking.
1145
1146 //===---------------------------------------------------------------------===//
1147
1148 [LOAD PRE CRIT EDGE SPLITTING]
1149
1150 GCC PR37166: Sinking of loads prevents SROA'ing the "g" struct on the stack
1151 leading to excess stack traffic. This could be handled by GVN with some crazy
1152 symbolic phi translation.  The code we get looks like (g is on the stack):
1153
1154 bb2:            ; preds = %bb1
1155 ..
1156         %9 = getelementptr %struct.f* %g, i32 0, i32 0
1157         store i32 %8, i32* %9, align  bel %bb3
1158
1159 bb3:            ; preds = %bb1, %bb2, %bb
1160         %c_addr.0 = phi %struct.f* [ %g, %bb2 ], [ %c, %bb ], [ %c, %bb1 ]
1161         %b_addr.0 = phi %struct.f* [ %b, %bb2 ], [ %g, %bb ], [ %b, %bb1 ]
1162         %10 = getelementptr %struct.f* %c_addr.0, i32 0, i32 0
1163         %11 = load i32* %10, align 4
1164
1165 %11 is partially redundant, an in BB2 it should have the value %8.
1166
1167 GCC PR33344 and PR35287 are similar cases.
1168
1169
1170 //===---------------------------------------------------------------------===//
1171
1172 [LOAD PRE]
1173
1174 There are many load PRE testcases in testsuite/gcc.dg/tree-ssa/loadpre* in the
1175 GCC testsuite, ones we don't get yet are (checked through loadpre25):
1176
1177 [CRIT EDGE BREAKING]
1178 loadpre3.c predcom-4.c
1179
1180 [PRE OF READONLY CALL]
1181 loadpre5.c
1182
1183 [TURN SELECT INTO BRANCH]
1184 loadpre14.c loadpre15.c
1185
1186 actually a conditional increment: loadpre18.c loadpre19.c
1187
1188
1189 //===---------------------------------------------------------------------===//
1190
1191 [SCALAR PRE]
1192 There are many PRE testcases in testsuite/gcc.dg/tree-ssa/ssa-pre-*.c in the
1193 GCC testsuite.
1194
1195 //===---------------------------------------------------------------------===//
1196
1197 There are some interesting cases in testsuite/gcc.dg/tree-ssa/pred-comm* in the
1198 GCC testsuite.  For example, we get the first example in predcom-1.c, but
1199 miss the second one:
1200
1201 unsigned fib[1000];
1202 unsigned avg[1000];
1203
1204 __attribute__ ((noinline))
1205 void count_averages(int n) {
1206   int i;
1207   for (i = 1; i < n; i++)
1208     avg[i] = (((unsigned long) fib[i - 1] + fib[i] + fib[i + 1]) / 3) & 0xffff;
1209 }
1210
1211 which compiles into two loads instead of one in the loop.
1212
1213 predcom-2.c is the same as predcom-1.c
1214
1215 predcom-3.c is very similar but needs loads feeding each other instead of
1216 store->load.
1217
1218
1219 //===---------------------------------------------------------------------===//
1220
1221 Type based alias analysis:
1222 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=14705
1223
1224 //===---------------------------------------------------------------------===//
1225
1226 A/B get pinned to the stack because we turn an if/then into a select instead
1227 of PRE'ing the load/store.  This may be fixable in instcombine:
1228 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=37892
1229
1230 struct X { int i; };
1231 int foo (int x) {
1232   struct X a;
1233   struct X b;
1234   struct X *p;
1235   a.i = 1;
1236   b.i = 2;
1237   if (x)
1238     p = &a;
1239   else
1240     p = &b;
1241   return p->i;
1242 }
1243
1244 //===---------------------------------------------------------------------===//
1245
1246 Interesting missed case because of control flow flattening (should be 2 loads):
1247 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=26629
1248 With: llvm-gcc t2.c -S -o - -O0 -emit-llvm | llvm-as |
1249              opt -mem2reg -gvn -instcombine | llvm-dis
1250 we miss it because we need 1) CRIT EDGE 2) MULTIPLE DIFFERENT
1251 VALS PRODUCED BY ONE BLOCK OVER DIFFERENT PATHS
1252
1253 //===---------------------------------------------------------------------===//
1254
1255 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=19633
1256 We could eliminate the branch condition here, loading from null is undefined:
1257
1258 struct S { int w, x, y, z; };
1259 struct T { int r; struct S s; };
1260 void bar (struct S, int);
1261 void foo (int a, struct T b)
1262 {
1263   struct S *c = 0;
1264   if (a)
1265     c = &b.s;
1266   bar (*c, a);
1267 }
1268
1269 //===---------------------------------------------------------------------===//
1270
1271 simplifylibcalls should do several optimizations for strspn/strcspn:
1272
1273 strcspn(x, "") -> strlen(x)
1274 strcspn("", x) -> 0
1275 strspn("", x) -> 0
1276 strspn(x, "") -> strlen(x)
1277 strspn(x, "a") -> strchr(x, 'a')-x
1278
1279 strcspn(x, "a") -> inlined loop for up to 3 letters (similarly for strspn):
1280
1281 size_t __strcspn_c3 (__const char *__s, int __reject1, int __reject2,
1282                      int __reject3) {
1283   register size_t __result = 0;
1284   while (__s[__result] != '\0' && __s[__result] != __reject1 &&
1285          __s[__result] != __reject2 && __s[__result] != __reject3)
1286     ++__result;
1287   return __result;
1288 }
1289
1290 This should turn into a switch on the character.  See PR3253 for some notes on
1291 codegen.
1292
1293 456.hmmer apparently uses strcspn and strspn a lot.  471.omnetpp uses strspn.
1294
1295 //===---------------------------------------------------------------------===//
1296
1297 "gas" uses this idiom:
1298   else if (strchr ("+-/*%|&^:[]()~", *intel_parser.op_string))
1299 ..
1300   else if (strchr ("<>", *intel_parser.op_string)
1301
1302 Those should be turned into a switch.
1303
1304 //===---------------------------------------------------------------------===//
1305
1306 252.eon contains this interesting code:
1307
1308         %3072 = getelementptr [100 x i8]* %tempString, i32 0, i32 0
1309         %3073 = call i8* @strcpy(i8* %3072, i8* %3071) nounwind
1310         %strlen = call i32 @strlen(i8* %3072)    ; uses = 1
1311         %endptr = getelementptr [100 x i8]* %tempString, i32 0, i32 %strlen
1312         call void @llvm.memcpy.i32(i8* %endptr,
1313           i8* getelementptr ([5 x i8]* @"\01LC42", i32 0, i32 0), i32 5, i32 1)
1314         %3074 = call i32 @strlen(i8* %endptr) nounwind readonly
1315
1316 This is interesting for a couple reasons.  First, in this:
1317
1318         %3073 = call i8* @strcpy(i8* %3072, i8* %3071) nounwind
1319         %strlen = call i32 @strlen(i8* %3072)
1320
1321 The strlen could be replaced with: %strlen = sub %3072, %3073, because the
1322 strcpy call returns a pointer to the end of the string.  Based on that, the
1323 endptr GEP just becomes equal to 3073, which eliminates a strlen call and GEP.
1324
1325 Second, the memcpy+strlen strlen can be replaced with:
1326
1327         %3074 = call i32 @strlen([5 x i8]* @"\01LC42") nounwind readonly
1328
1329 Because the destination was just copied into the specified memory buffer.  This,
1330 in turn, can be constant folded to "4".
1331
1332 In other code, it contains:
1333
1334         %endptr6978 = bitcast i8* %endptr69 to i32*
1335         store i32 7107374, i32* %endptr6978, align 1
1336         %3167 = call i32 @strlen(i8* %endptr69) nounwind readonly
1337
1338 Which could also be constant folded.  Whatever is producing this should probably
1339 be fixed to leave this as a memcpy from a string.
1340
1341 Further, eon also has an interesting partially redundant strlen call:
1342
1343 bb8:            ; preds = %_ZN18eonImageCalculatorC1Ev.exit
1344         %682 = getelementptr i8** %argv, i32 6          ; <i8**> [#uses=2]
1345         %683 = load i8** %682, align 4          ; <i8*> [#uses=4]
1346         %684 = load i8* %683, align 1           ; <i8> [#uses=1]
1347         %685 = icmp eq i8 %684, 0               ; <i1> [#uses=1]
1348         br i1 %685, label %bb10, label %bb9
1349
1350 bb9:            ; preds = %bb8
1351         %686 = call i32 @strlen(i8* %683) nounwind readonly
1352         %687 = icmp ugt i32 %686, 254           ; <i1> [#uses=1]
1353         br i1 %687, label %bb10, label %bb11
1354
1355 bb10:           ; preds = %bb9, %bb8
1356         %688 = call i32 @strlen(i8* %683) nounwind readonly
1357
1358 This could be eliminated by doing the strlen once in bb8, saving code size and
1359 improving perf on the bb8->9->10 path.
1360
1361 //===---------------------------------------------------------------------===//
1362
1363 I see an interesting fully redundant call to strlen left in 186.crafty:InputMove
1364 which looks like:
1365        %movetext11 = getelementptr [128 x i8]* %movetext, i32 0, i32 0
1366
1367
1368 bb62:           ; preds = %bb55, %bb53
1369         %promote.0 = phi i32 [ %169, %bb55 ], [ 0, %bb53 ]
1370         %171 = call i32 @strlen(i8* %movetext11) nounwind readonly align 1
1371         %172 = add i32 %171, -1         ; <i32> [#uses=1]
1372         %173 = getelementptr [128 x i8]* %movetext, i32 0, i32 %172
1373
1374 ...  no stores ...
1375        br i1 %or.cond, label %bb65, label %bb72
1376
1377 bb65:           ; preds = %bb62
1378         store i8 0, i8* %173, align 1
1379         br label %bb72
1380
1381 bb72:           ; preds = %bb65, %bb62
1382         %trank.1 = phi i32 [ %176, %bb65 ], [ -1, %bb62 ]
1383         %177 = call i32 @strlen(i8* %movetext11) nounwind readonly align 1
1384
1385 Note that on the bb62->bb72 path, that the %177 strlen call is partially
1386 redundant with the %171 call.  At worst, we could shove the %177 strlen call
1387 up into the bb65 block moving it out of the bb62->bb72 path.   However, note
1388 that bb65 stores to the string, zeroing out the last byte.  This means that on
1389 that path the value of %177 is actually just %171-1.  A sub is cheaper than a
1390 strlen!
1391
1392 This pattern repeats several times, basically doing:
1393
1394   A = strlen(P);
1395   P[A-1] = 0;
1396   B = strlen(P);
1397   where it is "obvious" that B = A-1.
1398
1399 //===---------------------------------------------------------------------===//
1400
1401 186.crafty contains this interesting pattern:
1402
1403 %77 = call i8* @strstr(i8* getelementptr ([6 x i8]* @"\01LC5", i32 0, i32 0),
1404                        i8* %30)
1405 %phitmp648 = icmp eq i8* %77, getelementptr ([6 x i8]* @"\01LC5", i32 0, i32 0)
1406 br i1 %phitmp648, label %bb70, label %bb76
1407
1408 bb70:           ; preds = %OptionMatch.exit91, %bb69
1409         %78 = call i32 @strlen(i8* %30) nounwind readonly align 1               ; <i32> [#uses=1]
1410
1411 This is basically:
1412   cststr = "abcdef";
1413   if (strstr(cststr, P) == cststr) {
1414      x = strlen(P);
1415      ...
1416
1417 The strstr call would be significantly cheaper written as:
1418
1419 cststr = "abcdef";
1420 if (memcmp(P, str, strlen(P)))
1421   x = strlen(P);
1422
1423 This is memcmp+strlen instead of strstr.  This also makes the strlen fully
1424 redundant.
1425
1426 //===---------------------------------------------------------------------===//
1427
1428 186.crafty also contains this code:
1429
1430 %1906 = call i32 @strlen(i8* getelementptr ([32 x i8]* @pgn_event, i32 0,i32 0))
1431 %1907 = getelementptr [32 x i8]* @pgn_event, i32 0, i32 %1906
1432 %1908 = call i8* @strcpy(i8* %1907, i8* %1905) nounwind align 1
1433 %1909 = call i32 @strlen(i8* getelementptr ([32 x i8]* @pgn_event, i32 0,i32 0))
1434 %1910 = getelementptr [32 x i8]* @pgn_event, i32 0, i32 %1909
1435
1436 The last strlen is computable as 1908-@pgn_event, which means 1910=1908.
1437
1438 //===---------------------------------------------------------------------===//
1439
1440 186.crafty has this interesting pattern with the "out.4543" variable:
1441
1442 call void @llvm.memcpy.i32(
1443         i8* getelementptr ([10 x i8]* @out.4543, i32 0, i32 0),
1444        i8* getelementptr ([7 x i8]* @"\01LC28700", i32 0, i32 0), i32 7, i32 1)
1445 %101 = call@printf(i8* ...   @out.4543, i32 0, i32 0)) nounwind
1446
1447 It is basically doing:
1448
1449   memcpy(globalarray, "string");
1450   printf(...,  globalarray);
1451
1452 Anyway, by knowing that printf just reads the memory and forward substituting
1453 the string directly into the printf, this eliminates reads from globalarray.
1454 Since this pattern occurs frequently in crafty (due to the "DisplayTime" and
1455 other similar functions) there are many stores to "out".  Once all the printfs
1456 stop using "out", all that is left is the memcpy's into it.  This should allow
1457 globalopt to remove the "stored only" global.
1458
1459 //===---------------------------------------------------------------------===//
1460
1461 This code:
1462
1463 define inreg i32 @foo(i8* inreg %p) nounwind {
1464   %tmp0 = load i8* %p
1465   %tmp1 = ashr i8 %tmp0, 5
1466   %tmp2 = sext i8 %tmp1 to i32
1467   ret i32 %tmp2
1468 }
1469
1470 could be dagcombine'd to a sign-extending load with a shift.
1471 For example, on x86 this currently gets this:
1472
1473         movb    (%eax), %al
1474         sarb    $5, %al
1475         movsbl  %al, %eax
1476
1477 while it could get this:
1478
1479         movsbl  (%eax), %eax
1480         sarl    $5, %eax
1481
1482 //===---------------------------------------------------------------------===//
1483
1484 GCC PR31029:
1485
1486 int test(int x) { return 1-x == x; }     // --> return false
1487 int test2(int x) { return 2-x == x; }    // --> return x == 1 ?
1488
1489 Always foldable for odd constants, what is the rule for even?
1490
1491 //===---------------------------------------------------------------------===//
1492
1493 PR 3381: GEP to field of size 0 inside a struct could be turned into GEP
1494 for next field in struct (which is at same address).
1495
1496 For example: store of float into { {{}}, float } could be turned into a store to
1497 the float directly.
1498
1499 //===---------------------------------------------------------------------===//
1500
1501 #include <math.h>
1502 double foo(double a) {    return sin(a); }
1503
1504 This compiles into this on x86-64 Linux:
1505 foo:
1506         subq    $8, %rsp
1507         call    sin
1508         addq    $8, %rsp
1509         ret
1510 vs:
1511
1512 foo:
1513         jmp sin
1514
1515 //===---------------------------------------------------------------------===//
1516
1517 The arg promotion pass should make use of nocapture to make its alias analysis
1518 stuff much more precise.
1519
1520 //===---------------------------------------------------------------------===//
1521
1522 The following functions should be optimized to use a select instead of a
1523 branch (from gcc PR40072):
1524
1525 char char_int(int m) {if(m>7) return 0; return m;}
1526 int int_char(char m) {if(m>7) return 0; return m;}
1527
1528 //===---------------------------------------------------------------------===//
1529
1530 int func(int a, int b) { if (a & 0x80) b |= 0x80; else b &= ~0x80; return b; }
1531
1532 Generates this:
1533
1534 define i32 @func(i32 %a, i32 %b) nounwind readnone ssp {
1535 entry:
1536   %0 = and i32 %a, 128                            ; <i32> [#uses=1]
1537   %1 = icmp eq i32 %0, 0                          ; <i1> [#uses=1]
1538   %2 = or i32 %b, 128                             ; <i32> [#uses=1]
1539   %3 = and i32 %b, -129                           ; <i32> [#uses=1]
1540   %b_addr.0 = select i1 %1, i32 %3, i32 %2        ; <i32> [#uses=1]
1541   ret i32 %b_addr.0
1542 }
1543
1544 However, it's functionally equivalent to:
1545
1546          b = (b & ~0x80) | (a & 0x80);
1547
1548 Which generates this:
1549
1550 define i32 @func(i32 %a, i32 %b) nounwind readnone ssp {
1551 entry:
1552   %0 = and i32 %b, -129                           ; <i32> [#uses=1]
1553   %1 = and i32 %a, 128                            ; <i32> [#uses=1]
1554   %2 = or i32 %0, %1                              ; <i32> [#uses=1]
1555   ret i32 %2
1556 }
1557
1558 This can be generalized for other forms:
1559
1560      b = (b & ~0x80) | (a & 0x40) << 1;
1561
1562 //===---------------------------------------------------------------------===//
1563
1564 These two functions produce different code. They shouldn't:
1565
1566 #include <stdint.h>
1567
1568 uint8_t p1(uint8_t b, uint8_t a) {
1569   b = (b & ~0xc0) | (a & 0xc0);
1570   return (b);
1571 }
1572
1573 uint8_t p2(uint8_t b, uint8_t a) {
1574   b = (b & ~0x40) | (a & 0x40);
1575   b = (b & ~0x80) | (a & 0x80);
1576   return (b);
1577 }
1578
1579 define zeroext i8 @p1(i8 zeroext %b, i8 zeroext %a) nounwind readnone ssp {
1580 entry:
1581   %0 = and i8 %b, 63                              ; <i8> [#uses=1]
1582   %1 = and i8 %a, -64                             ; <i8> [#uses=1]
1583   %2 = or i8 %1, %0                               ; <i8> [#uses=1]
1584   ret i8 %2
1585 }
1586
1587 define zeroext i8 @p2(i8 zeroext %b, i8 zeroext %a) nounwind readnone ssp {
1588 entry:
1589   %0 = and i8 %b, 63                              ; <i8> [#uses=1]
1590   %.masked = and i8 %a, 64                        ; <i8> [#uses=1]
1591   %1 = and i8 %a, -128                            ; <i8> [#uses=1]
1592   %2 = or i8 %1, %0                               ; <i8> [#uses=1]
1593   %3 = or i8 %2, %.masked                         ; <i8> [#uses=1]
1594   ret i8 %3
1595 }
1596
1597 //===---------------------------------------------------------------------===//
1598
1599 IPSCCP does not currently propagate argument dependent constants through
1600 functions where it does not not all of the callers.  This includes functions
1601 with normal external linkage as well as templates, C99 inline functions etc.
1602 Specifically, it does nothing to:
1603
1604 define i32 @test(i32 %x, i32 %y, i32 %z) nounwind {
1605 entry:
1606   %0 = add nsw i32 %y, %z
1607   %1 = mul i32 %0, %x
1608   %2 = mul i32 %y, %z
1609   %3 = add nsw i32 %1, %2
1610   ret i32 %3
1611 }
1612
1613 define i32 @test2() nounwind {
1614 entry:
1615   %0 = call i32 @test(i32 1, i32 2, i32 4) nounwind
1616   ret i32 %0
1617 }
1618
1619 It would be interesting extend IPSCCP to be able to handle simple cases like
1620 this, where all of the arguments to a call are constant.  Because IPSCCP runs
1621 before inlining, trivial templates and inline functions are not yet inlined.
1622 The results for a function + set of constant arguments should be memoized in a
1623 map.
1624
1625 //===---------------------------------------------------------------------===//
1626
1627 The libcall constant folding stuff should be moved out of SimplifyLibcalls into
1628 libanalysis' constantfolding logic.  This would allow IPSCCP to be able to
1629 handle simple things like this:
1630
1631 static int foo(const char *X) { return strlen(X); }
1632 int bar() { return foo("abcd"); }
1633
1634 //===---------------------------------------------------------------------===//
1635
1636 InstCombine should use SimplifyDemandedBits to remove the or instruction:
1637
1638 define i1 @test(i8 %x, i8 %y) {
1639   %A = or i8 %x, 1
1640   %B = icmp ugt i8 %A, 3
1641   ret i1 %B
1642 }
1643
1644 Currently instcombine calls SimplifyDemandedBits with either all bits or just
1645 the sign bit, if the comparison is obviously a sign test. In this case, we only
1646 need all but the bottom two bits from %A, and if we gave that mask to SDB it
1647 would delete the or instruction for us.
1648
1649 //===---------------------------------------------------------------------===//
1650
1651 functionattrs doesn't know much about memcpy/memset.  This function should be
1652 marked readnone rather than readonly, since it only twiddles local memory, but
1653 functionattrs doesn't handle memset/memcpy/memmove aggressively:
1654
1655 struct X { int *p; int *q; };
1656 int foo() {
1657  int i = 0, j = 1;
1658  struct X x, y;
1659  int **p;
1660  y.p = &i;
1661  x.q = &j;
1662  p = __builtin_memcpy (&x, &y, sizeof (int *));
1663  return **p;
1664 }
1665
1666 //===---------------------------------------------------------------------===//
1667