lib/Target/README.txt

   1 Target Independent Opportunities:
   2
   3 //===---------------------------------------------------------------------===//
   4
   5 We should make the various targets' "IMPLICIT_DEF" instructions be a single
   6 target-independent opcode like TargetInstrInfo::INLINEASM.  This would allow
   7 us to eliminate the TargetInstrDesc::isImplicitDef() method, and would allow
   8 us to avoid having to define this for every target for every register class.
   9
  10 //===---------------------------------------------------------------------===//
  11
  12 With the recent changes to make the implicit def/use set explicit in
  13 machineinstrs, we should change the target descriptions for 'call' instructions
  14 so that the .td files don't list all the call-clobbered registers as implicit
  15 defs.  Instead, these should be added by the code generator (e.g. on the dag).
  16
  17 This has a number of uses:
  18
  19 1. PPC32/64 and X86 32/64 can avoid having multiple copies of call instructions
  20    for their different impdef sets.
  21 2. Targets with multiple calling convs (e.g. x86) which have different clobber
  22    sets don't need copies of call instructions.
  23 3. 'Interprocedural register allocation' can be done to reduce the clobber sets
  24    of calls.
  25
  26 //===---------------------------------------------------------------------===//
  27
  28 Make the PPC branch selector target independent
  29
  30 //===---------------------------------------------------------------------===//
  31
  32 Get the C front-end to expand hypot(x,y) -> llvm.sqrt(x*x+y*y) when errno and
  33 precision don't matter (-ffast-math).  Misc/mandel will like this. :)
  34
  35 //===---------------------------------------------------------------------===//
  36
  37 Solve this DAG isel folding deficiency:
  38
  39 int X, Y;
  40
  41 void fn1(void)
  42 {
  43   X = X | (Y << 3);
  44 }
  45
  46 compiles to
  47
  48 fn1:
  49         movl Y, %eax
  50         shll $3, %eax
  51         orl X, %eax
  52         movl %eax, X
  53         ret
  54
  55 The problem is the store's chain operand is not the load X but rather
  56 a TokenFactor of the load X and load Y, which prevents the folding.
  57
  58 There are two ways to fix this:
  59
  60 1. The dag combiner can start using alias analysis to realize that y/x
  61    don't alias, making the store to X not dependent on the load from Y.
  62 2. The generated isel could be made smarter in the case it can't
  63    disambiguate the pointers.
  64
  65 Number 1 is the preferred solution.
  66
  67 This has been "fixed" by a TableGen hack. But that is a short term workaround
  68 which will be removed once the proper fix is made.
  69
  70 //===---------------------------------------------------------------------===//
  71
  72 On targets with expensive 64-bit multiply, we could LSR this:
  73
  74 for (i = ...; ++i) {
  75    x = 1ULL << i;
  76
  77 into:
  78  long long tmp = 1;
  79  for (i = ...; ++i, tmp+=tmp)
  80    x = tmp;
  81
  82 This would be a win on ppc32, but not x86 or ppc64.
  83
  84 //===---------------------------------------------------------------------===//
  85
  86 Shrink: (setlt (loadi32 P), 0) -> (setlt (loadi8 Phi), 0)
  87
  88 //===---------------------------------------------------------------------===//
  89
  90 Reassociate should turn: X*X*X*X -> t=(X*X) (t*t) to eliminate a multiply.
  91
  92 //===---------------------------------------------------------------------===//
  93
  94 Interesting? testcase for add/shift/mul reassoc:
  95
  96 int bar(int x, int y) {
  97   return x*x*x+y+x*x*x*x*x*y*y*y*y;
  98 }
  99 int foo(int z, int n) {
 100   return bar(z, n) + bar(2*z, 2*n);
 101 }
 102
 103 Reassociate should handle the example in GCC PR16157.
 104
 105 //===---------------------------------------------------------------------===//
 106
 107 These two functions should generate the same code on big-endian systems:
 108
 109 int g(int *j,int *l)  {  return memcmp(j,l,4);  }
 110 int h(int *j, int *l) {  return *j - *l; }
 111
 112 this could be done in SelectionDAGISel.cpp, along with other special cases,
 113 for 1,2,4,8 bytes.
 114
 115 //===---------------------------------------------------------------------===//
 116
 117 It would be nice to revert this patch:
 118 http://lists.cs.uiuc.edu/pipermail/llvm-commits/Week-of-Mon-20060213/031986.html
 119
 120 And teach the dag combiner enough to simplify the code expanded before
 121 legalize.  It seems plausible that this knowledge would let it simplify other
 122 stuff too.
 123
 124 //===---------------------------------------------------------------------===//
 125
 126 For vector types, TargetData.cpp::getTypeInfo() returns alignment that is equal
 127 to the type size. It works but can be overly conservative as the alignment of
 128 specific vector types is target dependent.
 129
 130 //===---------------------------------------------------------------------===//
 131
 132 We should add 'unaligned load/store' nodes, and produce them from code like
 133 this:
 134
 135 v4sf example(float *P) {
 136   return (v4sf){P[0], P[1], P[2], P[3] };
 137 }
 138
 139 //===---------------------------------------------------------------------===//
 140
 141 Add support for conditional increments, and other related patterns.  Instead
 142 of:
 143
 144         movl 136(%esp), %eax
 145         cmpl $0, %eax
 146         je LBB16_2      #cond_next
 147 LBB16_1:        #cond_true
 148         incl _foo
 149 LBB16_2:        #cond_next
 150
 151 emit:
 152         movl    _foo, %eax
 153         cmpl    $1, %edi
 154         sbbl    $-1, %eax
 155         movl    %eax, _foo
 156
 157 //===---------------------------------------------------------------------===//
 158
 159 Combine: a = sin(x), b = cos(x) into a,b = sincos(x).
 160
 161 Expand these to calls of sin/cos and stores:
 162       double sincos(double x, double *sin, double *cos);
 163       float sincosf(float x, float *sin, float *cos);
 164       long double sincosl(long double x, long double *sin, long double *cos);
 165
 166 Doing so could allow SROA of the destination pointers.  See also:
 167 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=17687
 168
 169 //===---------------------------------------------------------------------===//
 170
 171 Scalar Repl cannot currently promote this testcase to 'ret long cst':
 172
 173         %struct.X = type { i32, i32 }
 174         %struct.Y = type { %struct.X }
 175
 176 define i64 @bar() {
 177         %retval = alloca %struct.Y, align 8
 178         %tmp12 = getelementptr %struct.Y* %retval, i32 0, i32 0, i32 0
 179         store i32 0, i32* %tmp12
 180         %tmp15 = getelementptr %struct.Y* %retval, i32 0, i32 0, i32 1
 181         store i32 1, i32* %tmp15
 182         %retval.upgrd.1 = bitcast %struct.Y* %retval to i64*
 183         %retval.upgrd.2 = load i64* %retval.upgrd.1
 184         ret i64 %retval.upgrd.2
 185 }
 186
 187 it should be extended to do so.
 188
 189 //===---------------------------------------------------------------------===//
 190
 191 -scalarrepl should promote this to be a vector scalar.
 192
 193         %struct..0anon = type { <4 x float> }
 194
 195 define void @test1(<4 x float> %V, float* %P) {
 196         %u = alloca %struct..0anon, align 16
 197         %tmp = getelementptr %struct..0anon* %u, i32 0, i32 0
 198         store <4 x float> %V, <4 x float>* %tmp
 199         %tmp1 = bitcast %struct..0anon* %u to [4 x float]*
 200         %tmp.upgrd.1 = getelementptr [4 x float]* %tmp1, i32 0, i32 1
 201         %tmp.upgrd.2 = load float* %tmp.upgrd.1
 202         %tmp3 = mul float %tmp.upgrd.2, 2.000000e+00
 203         store float %tmp3, float* %P
 204         ret void
 205 }
 206
 207 //===---------------------------------------------------------------------===//
 208
 209 Turn this into a single byte store with no load (the other 3 bytes are
 210 unmodified):
 211
 212 void %test(uint* %P) {
 213         %tmp = load uint* %P
 214         %tmp14 = or uint %tmp, 3305111552
 215         %tmp15 = and uint %tmp14, 3321888767
 216         store uint %tmp15, uint* %P
 217         ret void
 218 }
 219
 220 //===---------------------------------------------------------------------===//
 221
 222 dag/inst combine "clz(x)>>5 -> x==0" for 32-bit x.
 223
 224 Compile:
 225
 226 int bar(int x)
 227 {
 228   int t = __builtin_clz(x);
 229   return -(t>>5);
 230 }
 231
 232 to:
 233
 234 _bar:   addic r3,r3,-1
 235         subfe r3,r3,r3
 236         blr
 237
 238 //===---------------------------------------------------------------------===//
 239
 240 Legalize should lower cttz like this:
 241   cttz(x) = popcnt((x-1) & ~x)
 242
 243 on targets that have popcnt but not cttz.  itanium, what else?
 244
 245 //===---------------------------------------------------------------------===//
 246
 247 quantum_sigma_x in 462.libquantum contains the following loop:
 248
 249       for(i=0; i<reg->size; i++)
 250         {
 251           /* Flip the target bit of each basis state */
 252           reg->node[i].state ^= ((MAX_UNSIGNED) 1 << target);
 253         }
 254
 255 Where MAX_UNSIGNED/state is a 64-bit int.  On a 32-bit platform it would be just
 256 so cool to turn it into something like:
 257
 258    long long Res = ((MAX_UNSIGNED) 1 << target);
 259    if (target < 32) {
 260      for(i=0; i<reg->size; i++)
 261        reg->node[i].state ^= Res & 0xFFFFFFFFULL;
 262    } else {
 263      for(i=0; i<reg->size; i++)
 264        reg->node[i].state ^= Res & 0xFFFFFFFF00000000ULL;
 265    }
 266
 267 ... which would only do one 32-bit XOR per loop iteration instead of two.
 268
 269 It would also be nice to recognize that reg->size doesn't alias reg->node[i], but
 270 alas...
 271
 272 //===---------------------------------------------------------------------===//
 273
 274 This isn't recognized as bswap by instcombine (yes, it really is bswap):
 275
 276 unsigned long reverse(unsigned v) {
 277     unsigned t;
 278     t = v ^ ((v << 16) | (v >> 16));
 279     t &= ~0xff0000;
 280     v = (v << 24) | (v >> 8);
 281     return v ^ (t >> 8);
 282 }
 283
 284 //===---------------------------------------------------------------------===//
 285
 286 These idioms should be recognized as popcount (see PR1488):
 287
 288 unsigned countbits_slow(unsigned v) {
 289   unsigned c;
 290   for (c = 0; v; v >>= 1)
 291     c += v & 1;
 292   return c;
 293 }
 294 unsigned countbits_fast(unsigned v){
 295   unsigned c;
 296   for (c = 0; v; c++)
 297     v &= v - 1; // clear the least significant bit set
 298   return c;
 299 }
 300
 301 BITBOARD = unsigned long long
 302 int PopCnt(register BITBOARD a) {
 303   register int c=0;
 304   while(a) {
 305     c++;
 306     a &= a - 1;
 307   }
 308   return c;
 309 }
 310 unsigned int popcount(unsigned int input) {
 311   unsigned int count = 0;
 312   for (unsigned int i =  0; i < 4 * 8; i++)
 313     count += (input >> i) & 1;
 314   return count;
 315 }
 316
 317 //===---------------------------------------------------------------------===//
 318
 319 These should turn into single 16-bit (unaligned?) loads on little/big endian
 320 processors.
 321
 322 unsigned short read_16_le(const unsigned char *adr) {
 323   return adr[0] | (adr[1] << 8);
 324 }
 325 unsigned short read_16_be(const unsigned char *adr) {
 326   return (adr[0] << 8) | adr[1];
 327 }
 328
 329 //===---------------------------------------------------------------------===//
 330
 331 -instcombine should handle this transform:
 332    icmp pred (sdiv X / C1 ), C2
 333 when X, C1, and C2 are unsigned.  Similarly for udiv and signed operands.
 334
 335 Currently InstCombine avoids this transform but will do it when the signs of
 336 the operands and the sign of the divide match. See the FIXME in
 337 InstructionCombining.cpp in the visitSetCondInst method after the switch case
 338 for Instruction::UDiv (around line 4447) for more details.
 339
 340 The SingleSource/Benchmarks/Shootout-C++/hash and hash2 tests have examples of
 341 this construct.
 342
 343 //===---------------------------------------------------------------------===//
 344
 345 viterbi speeds up *significantly* if the various "history" related copy loops
 346 are turned into memcpy calls at the source level.  We need a "loops to memcpy"
 347 pass.
 348
 349 //===---------------------------------------------------------------------===//
 350
 351 Consider:
 352
 353 typedef unsigned U32;
 354 typedef unsigned long long U64;
 355 int test (U32 *inst, U64 *regs) {
 356     U64 effective_addr2;
 357     U32 temp = *inst;
 358     int r1 = (temp >> 20) & 0xf;
 359     int b2 = (temp >> 16) & 0xf;
 360     effective_addr2 = temp & 0xfff;
 361     if (b2) effective_addr2 += regs[b2];
 362     b2 = (temp >> 12) & 0xf;
 363     if (b2) effective_addr2 += regs[b2];
 364     effective_addr2 &= regs[4];
 365      if ((effective_addr2 & 3) == 0)
 366         return 1;
 367     return 0;
 368 }
 369
 370 Note that only the low 2 bits of effective_addr2 are used.  On 32-bit systems,
 371 we don't eliminate the computation of the top half of effective_addr2 because
 372 we don't have whole-function selection dags.  On x86, this means we use one
 373 extra register for the function when effective_addr2 is declared as U64 than
 374 when it is declared U32.
 375
 376 //===---------------------------------------------------------------------===//
 377
 378 Promote for i32 bswap can use i64 bswap + shr.  Useful on targets with 64-bit
 379 regs and bswap, like itanium.
 380
 381 //===---------------------------------------------------------------------===//
 382
 383 LSR should know what GPR types a target has.  This code:
 384
 385 volatile short X, Y; // globals
 386
 387 void foo(int N) {
 388   int i;
 389   for (i = 0; i < N; i++) { X = i; Y = i*4; }
 390 }
 391
 392 produces two identical IV's (after promotion) on PPC/ARM:
 393
 394 LBB1_1: @bb.preheader
 395         mov r3, #0
 396         mov r2, r3
 397         mov r1, r3
 398 LBB1_2: @bb
 399         ldr r12, LCPI1_0
 400         ldr r12, [r12]
 401         strh r2, [r12]
 402         ldr r12, LCPI1_1
 403         ldr r12, [r12]
 404         strh r3, [r12]
 405         add r1, r1, #1    <- [0,+,1]
 406         add r3, r3, #4
 407         add r2, r2, #1    <- [0,+,1]
 408         cmp r1, r0
 409         bne LBB1_2      @bb
 410
 411
 412 //===---------------------------------------------------------------------===//
 413
 414 Tail call elim should be more aggressive, checking to see if the call is
 415 followed by an uncond branch to an exit block.
 416
 417 ; This testcase is due to tail-duplication not wanting to copy the return
 418 ; instruction into the terminating blocks because there was other code
 419 ; optimized out of the function after the taildup happened.
 420 ; RUN: llvm-as < %s | opt -tailcallelim | llvm-dis | not grep call
 421
 422 define i32 @t4(i32 %a) {
 423 entry:
 424         %tmp.1 = and i32 %a, 1          ; <i32> [#uses=1]
 425         %tmp.2 = icmp ne i32 %tmp.1, 0          ; <i1> [#uses=1]
 426         br i1 %tmp.2, label %then.0, label %else.0
 427
 428 then.0:         ; preds = %entry
 429         %tmp.5 = add i32 %a, -1         ; <i32> [#uses=1]
 430         %tmp.3 = call i32 @t4( i32 %tmp.5 )             ; <i32> [#uses=1]
 431         br label %return
 432
 433 else.0:         ; preds = %entry
 434         %tmp.7 = icmp ne i32 %a, 0              ; <i1> [#uses=1]
 435         br i1 %tmp.7, label %then.1, label %return
 436
 437 then.1:         ; preds = %else.0
 438         %tmp.11 = add i32 %a, -2                ; <i32> [#uses=1]
 439         %tmp.9 = call i32 @t4( i32 %tmp.11 )            ; <i32> [#uses=1]
 440         br label %return
 441
 442 return:         ; preds = %then.1, %else.0, %then.0
 443         %result.0 = phi i32 [ 0, %else.0 ], [ %tmp.3, %then.0 ],
 444                             [ %tmp.9, %then.1 ]
 445         ret i32 %result.0
 446 }
 447
 448 //===---------------------------------------------------------------------===//
 449
 450 Tail recursion elimination is not transforming this function, because it is
 451 returning n, which fails the isDynamicConstant check in the accumulator
 452 recursion checks.
 453
 454 long long fib(const long long n) {
 455   switch(n) {
 456     case 0:
 457     case 1:
 458       return n;
 459     default:
 460       return fib(n-1) + fib(n-2);
 461   }
 462 }
 463
 464 //===---------------------------------------------------------------------===//
 465
 466 Tail recursion elimination should handle:
 467
 468 int pow2m1(int n) {
 469  if (n == 0)
 470    return 0;
 471  return 2 * pow2m1 (n - 1) + 1;
 472 }
 473
 474 Also, multiplies can be turned into SHL's, so they should be handled as if
 475 they were associative.  "return foo() << 1" can be tail recursion eliminated.
 476
 477 //===---------------------------------------------------------------------===//
 478
 479 Argument promotion should promote arguments for recursive functions, like
 480 this:
 481
 482 ; RUN: llvm-as < %s | opt -argpromotion | llvm-dis | grep x.val
 483
 484 define internal i32 @foo(i32* %x) {
 485 entry:
 486         %tmp = load i32* %x             ; <i32> [#uses=0]
 487         %tmp.foo = call i32 @foo( i32* %x )             ; <i32> [#uses=1]
 488         ret i32 %tmp.foo
 489 }
 490
 491 define i32 @bar(i32* %x) {
 492 entry:
 493         %tmp3 = call i32 @foo( i32* %x )                ; <i32> [#uses=1]
 494         ret i32 %tmp3
 495 }
 496
 497 //===---------------------------------------------------------------------===//
 498
 499 "basicaa" should know how to look through "or" instructions that act like add
 500 instructions.  For example in this code, the x*4+1 is turned into x*4 | 1, and
 501 basicaa can't analyze the array subscript, leading to duplicated loads in the
 502 generated code:
 503
 504 void test(int X, int Y, int a[]) {
 505 int i;
 506   for (i=2; i<1000; i+=4) {
 507   a[i+0] = a[i-1+0]*a[i-2+0];
 508   a[i+1] = a[i-1+1]*a[i-2+1];
 509   a[i+2] = a[i-1+2]*a[i-2+2];
 510   a[i+3] = a[i-1+3]*a[i-2+3];
 511   }
 512 }
 513
 514 //===---------------------------------------------------------------------===//
 515
 516 We should investigate an instruction sinking pass.  Consider this silly
 517 example in pic mode:
 518
 519 #include <assert.h>
 520 void foo(int x) {
 521   assert(x);
 522   //...
 523 }
 524
 525 we compile this to:
 526 _foo:
 527         subl    $28, %esp
 528         call    "L1$pb"
 529 "L1$pb":
 530         popl    %eax
 531         cmpl    $0, 32(%esp)
 532         je      LBB1_2  # cond_true
 533 LBB1_1: # return
 534         # ...
 535         addl    $28, %esp
 536         ret
 537 LBB1_2: # cond_true
 538 ...
 539
 540 The PIC base computation (call+popl) is only used on one path through the
 541 code, but is currently always computed in the entry block.  It would be
 542 better to sink the picbase computation down into the block for the
 543 assertion, as it is the only one that uses it.  This happens for a lot of
 544 code with early outs.
 545
 546 Another example is loads of arguments, which are usually emitted into the
 547 entry block on targets like x86.  If not used in all paths through a
 548 function, they should be sunk into the ones that do.
 549
 550 In this case, whole-function-isel would also handle this.
 551
 552 //===---------------------------------------------------------------------===//
 553
 554 Investigate lowering of sparse switch statements into perfect hash tables:
 555 http://burtleburtle.net/bob/hash/perfect.html
 556
 557 //===---------------------------------------------------------------------===//
 558
 559 We should turn things like "load+fabs+store" and "load+fneg+store" into the
 560 corresponding integer operations.  On a yonah, this loop:
 561
 562 double a[256];
 563 void foo() {
 564   int i, b;
 565   for (b = 0; b < 10000000; b++)
 566   for (i = 0; i < 256; i++)
 567     a[i] = -a[i];
 568 }
 569
 570 is twice as slow as this loop:
 571
 572 long long a[256];
 573 void foo() {
 574   int i, b;
 575   for (b = 0; b < 10000000; b++)
 576   for (i = 0; i < 256; i++)
 577     a[i] ^= (1ULL << 63);
 578 }
 579
 580 and I suspect other processors are similar.  On X86 in particular this is a
 581 big win because doing this with integers allows the use of read/modify/write
 582 instructions.
 583
 584 //===---------------------------------------------------------------------===//
 585
 586 DAG Combiner should try to combine small loads into larger loads when
 587 profitable.  For example, we compile this C++ example:
 588
 589 struct THotKey { short Key; bool Control; bool Shift; bool Alt; };
 590 extern THotKey m_HotKey;
 591 THotKey GetHotKey () { return m_HotKey; }
 592
 593 into (-O3 -fno-exceptions -static -fomit-frame-pointer):
 594
 595 __Z9GetHotKeyv:
 596         pushl   %esi
 597         movl    8(%esp), %eax
 598         movb    _m_HotKey+3, %cl
 599         movb    _m_HotKey+4, %dl
 600         movb    _m_HotKey+2, %ch
 601         movw    _m_HotKey, %si
 602         movw    %si, (%eax)
 603         movb    %ch, 2(%eax)
 604         movb    %cl, 3(%eax)
 605         movb    %dl, 4(%eax)
 606         popl    %esi
 607         ret     $4
 608
 609 GCC produces:
 610
 611 __Z9GetHotKeyv:
 612         movl    _m_HotKey, %edx
 613         movl    4(%esp), %eax
 614         movl    %edx, (%eax)
 615         movzwl  _m_HotKey+4, %edx
 616         movw    %dx, 4(%eax)
 617         ret     $4
 618
 619 The LLVM IR contains the needed alignment info, so we should be able to
 620 merge the loads and stores into 4-byte loads:
 621
 622         %struct.THotKey = type { i16, i8, i8, i8 }
 623 define void @_Z9GetHotKeyv(%struct.THotKey* sret  %agg.result) nounwind  {
 624 ...
 625         %tmp2 = load i16* getelementptr (@m_HotKey, i32 0, i32 0), align 8
 626         %tmp5 = load i8* getelementptr (@m_HotKey, i32 0, i32 1), align 2
 627         %tmp8 = load i8* getelementptr (@m_HotKey, i32 0, i32 2), align 1
 628         %tmp11 = load i8* getelementptr (@m_HotKey, i32 0, i32 3), align 2
 629
 630 Alternatively, we should use a small amount of base-offset alias analysis
 631 to make it so the scheduler doesn't need to hold all the loads in regs at
 632 once.
 633
 634 //===---------------------------------------------------------------------===//
 635
 636 We should extend parameter attributes to capture more information about
 637 pointer parameters for alias analysis.  Some ideas:
 638
 639 1. Add a "nocapture" attribute, which indicates that the callee does not store
 640    the address of the parameter into a global or any other memory location
 641    visible to the callee.  This can be used to make basicaa and other analyses
 642    more powerful.  It is true for things like memcpy, strcat, and many other
 643    things, including structs passed by value, most C++ references, etc.
 644 2. Generalize readonly to be set on parameters.  This is important mod/ref
 645    info for the function, which is important for basicaa and others.  It can
 646    also be used by the inliner to avoid inserting a memcpy for byval
 647    arguments when the function is inlined.
 648
 649 These attributes can be inferred by various analysis passes such as the
 650 globalsmodrefaa pass.  Note that getting #2 right is actually really tricky.
 651 Consider this code:
 652
 653 struct S;  S G;
 654 void caller(S byvalarg) { G.field = 1; ... }
 655 void callee() { caller(G); }
 656
 657 The fact that the caller does not modify byval arg is not enough, we need
 658 to know that it doesn't modify G either.  This is very tricky.
 659
 660 //===---------------------------------------------------------------------===//
 661
 662 We should add an FRINT node to the DAG to model targets that have legal
 663 implementations of ceil/floor/rint.
 664
 665 //===---------------------------------------------------------------------===//
 666
 667 This GCC bug: http://gcc.gnu.org/bugzilla/show_bug.cgi?id=34043
 668 contains a testcase that compiles down to:
 669
 670         %struct.XMM128 = type { <4 x float> }
 671 ..
 672         %src = alloca %struct.XMM128
 673 ..
 674         %tmp6263 = bitcast %struct.XMM128* %src to <2 x i64>*
 675         %tmp65 = getelementptr %struct.XMM128* %src, i32 0, i32 0
 676         store <2 x i64> %tmp5899, <2 x i64>* %tmp6263, align 16
 677         %tmp66 = load <4 x float>* %tmp65, align 16
 678         %tmp71 = add <4 x float> %tmp66, %tmp66
 679
 680 If the mid-level optimizer turned the bitcast of pointer + store of tmp5899
 681 into a bitcast of the vector value and a store to the pointer, then the
 682 store->load could be easily removed.
 683
 684 //===---------------------------------------------------------------------===//
 685
 686 Consider:
 687
 688 int test() {
 689   long long input[8] = {1,1,1,1,1,1,1,1};
 690   foo(input);
 691 }
 692
 693 We currently compile this into a memcpy from a global array since the
 694 initializer is fairly large and not memset'able.  This is good, but the memcpy
 695 gets lowered to load/stores in the code generator.  This is also ok, except
 696 that the codegen lowering for memcpy doesn't handle the case when the source
 697 is a constant global.  This gives us atrocious code like this:
 698
 699         call    "L1$pb"
 700 "L1$pb":
 701         popl    %eax
 702         movl    _C.0.1444-"L1$pb"+32(%eax), %ecx
 703         movl    %ecx, 40(%esp)
 704         movl    _C.0.1444-"L1$pb"+20(%eax), %ecx
 705         movl    %ecx, 28(%esp)
 706         movl    _C.0.1444-"L1$pb"+36(%eax), %ecx
 707         movl    %ecx, 44(%esp)
 708         movl    _C.0.1444-"L1$pb"+44(%eax), %ecx
 709         movl    %ecx, 52(%esp)
 710         movl    _C.0.1444-"L1$pb"+40(%eax), %ecx
 711         movl    %ecx, 48(%esp)
 712         movl    _C.0.1444-"L1$pb"+12(%eax), %ecx
 713         movl    %ecx, 20(%esp)
 714         movl    _C.0.1444-"L1$pb"+4(%eax), %ecx
 715 ...
 716
 717 instead of:
 718         movl    $1, 16(%esp)
 719         movl    $0, 20(%esp)
 720         movl    $1, 24(%esp)
 721         movl    $0, 28(%esp)
 722         movl    $1, 32(%esp)
 723         movl    $0, 36(%esp)
 724         ...
 725
 726 //===---------------------------------------------------------------------===//
 727
 728 http://llvm.org/PR717:
 729
 730 The following code should compile into "ret int undef". Instead, LLVM
 731 produces "ret int 0":
 732
 733 int f() {
 734   int x = 4;
 735   int y;
 736   if (x == 3) y = 0;
 737   return y;
 738 }
 739
 740 //===---------------------------------------------------------------------===//
 741
 742 The loop unroller should partially unroll loops (instead of peeling them)
 743 when code growth isn't too bad and when an unroll count allows simplification
 744 of some code within the loop.  One trivial example is:
 745
 746 #include <stdio.h>
 747 int main() {
 748     int nRet = 17;
 749     int nLoop;
 750     for ( nLoop = 0; nLoop < 1000; nLoop++ ) {
 751         if ( nLoop & 1 )
 752             nRet += 2;
 753         else
 754             nRet -= 1;
 755     }
 756     return nRet;
 757 }
 758
 759 Unrolling by 2 would eliminate the '&1' in both copies, leading to a net
 760 reduction in code size.  The resultant code would then also be suitable for
 761 exit value computation.
 762
 763 //===---------------------------------------------------------------------===//
 764
 765 We miss a bunch of rotate opportunities on various targets, including ppc, x86,
 766 etc.  On X86, we miss a bunch of 'rotate by variable' cases because the rotate
 767 matching code in dag combine doesn't look through truncates aggressively
 768 enough.  Here are some testcases reduced from GCC PR17886:
 769
 770 unsigned long long f(unsigned long long x, int y) {
 771   return (x << y) | (x >> 64-y);
 772 }
 773 unsigned f2(unsigned x, int y){
 774   return (x << y) | (x >> 32-y);
 775 }
 776 unsigned long long f3(unsigned long long x){
 777   int y = 9;
 778   return (x << y) | (x >> 64-y);
 779 }
 780 unsigned f4(unsigned x){
 781   int y = 10;
 782   return (x << y) | (x >> 32-y);
 783 }
 784 unsigned long long f5(unsigned long long x, unsigned long long y) {
 785   return (x << 8) | ((y >> 48) & 0xffull);
 786 }
 787 unsigned long long f6(unsigned long long x, unsigned long long y, int z) {
 788   switch(z) {
 789   case 1:
 790     return (x << 8) | ((y >> 48) & 0xffull);
 791   case 2:
 792     return (x << 16) | ((y >> 40) & 0xffffull);
 793   case 3:
 794     return (x << 24) | ((y >> 32) & 0xffffffull);
 795   case 4:
 796     return (x << 32) | ((y >> 24) & 0xffffffffull);
 797   default:
 798     return (x << 40) | ((y >> 16) & 0xffffffffffull);
 799   }
 800 }
 801
 802 On X86-64, we only handle f3/f4 right.  On x86-32, several of these
 803 generate truly horrible code, instead of using shld and friends.  On
 804 ARM, we end up with calls to L___lshrdi3/L___ashldi3 in f, which is
 805 badness.  PPC64 misses f, f5 and f6.  CellSPU aborts in isel.
 806
 807 //===---------------------------------------------------------------------===//
 808
 809 We do a number of simplifications in simplify libcalls to strength reduce
 810 standard library functions, but we don't currently merge them together.  For
 811 example, it is useful to merge memcpy(a,b,strlen(b)) -> strcpy.  This can only
 812 be done safely if "b" isn't modified between the strlen and memcpy of course.
 813
 814 //===---------------------------------------------------------------------===//
 815
 816 We should be able to evaluate this loop:
 817
 818 int test(int x_offs) {
 819   while (x_offs > 4)
 820      x_offs -= 4;
 821   return x_offs;
 822 }
 823
 824 //===---------------------------------------------------------------------===//
 825
 826 Reassociate should turn things like:
 827
 828 int factorial(int X) {
 829  return X*X*X*X*X*X*X*X;
 830 }
 831
 832 into llvm.powi calls, allowing the code generator to produce balanced
 833 multiplication trees.
 834
 835 //===---------------------------------------------------------------------===//
 836
 837 We generate a horrible libcall for llvm.powi.  For example, we compile:
 838
 839 #include <cmath>
 840 double f(double a) { return std::pow(a, 4); }
 841
 842 into:
 843
 844 __Z1fd:
 845         subl    $12, %esp
 846         movsd   16(%esp), %xmm0
 847         movsd   %xmm0, (%esp)
 848         movl    $4, 8(%esp)
 849         call    L___powidf2$stub
 850         addl    $12, %esp
 851         ret
 852
 853 GCC produces:
 854
 855 __Z1fd:
 856         subl    $12, %esp
 857         movsd   16(%esp), %xmm0
 858         mulsd   %xmm0, %xmm0
 859         mulsd   %xmm0, %xmm0
 860         movsd   %xmm0, (%esp)
 861         fldl    (%esp)
 862         addl    $12, %esp
 863         ret
 864
 865 //===---------------------------------------------------------------------===//
 866
 867 We compile this program: (from GCC PR11680)
 868 http://gcc.gnu.org/bugzilla/attachment.cgi?id=4487
 869
 870 Into code that runs the same speed in fast/slow modes, but both modes run 2x
 871 slower than when compiled with GCC (either 4.0 or 4.2):
 872
 873 $ llvm-g++ perf.cpp -O3 -fno-exceptions
 874 $ time ./a.out fast
 875 1.821u 0.003s 0:01.82 100.0%    0+0k 0+0io 0pf+0w
 876
 877 $ g++ perf.cpp -O3 -fno-exceptions
 878 $ time ./a.out fast
 879 0.821u 0.001s 0:00.82 100.0%    0+0k 0+0io 0pf+0w
 880
 881 It looks like we are making the same inlining decisions, so this may be raw
 882 codegen badness or something else (haven't investigated).
 883
 884 //===---------------------------------------------------------------------===//
 885
 886 We miss some instcombines for stuff like this:
 887 void bar (void);
 888 void foo (unsigned int a) {
 889   /* This one is equivalent to a >= (3 << 2).  */
 890   if ((a >> 2) >= 3)
 891     bar ();
 892 }
 893
 894 A few other related ones are in GCC PR14753.
 895
 896 //===---------------------------------------------------------------------===//
 897
 898 Divisibility by constant can be simplified (according to GCC PR12849) from
 899 being a mulhi to being a mul lo (cheaper).  Testcase:
 900
 901 void bar(unsigned n) {
 902   if (n % 3 == 0)
 903     true();
 904 }
 905
 906 I think this basically amounts to a dag combine to simplify comparisons against
 907 multiply hi's into a comparison against the mullo.
 908
 909 //===---------------------------------------------------------------------===//
 910
 911 SROA is not promoting the union on the stack in this example, we should end
 912 up with no allocas.
 913
 914 union vec2d {
 915     double e[2];
 916     double v __attribute__((vector_size(16)));
 917 };
 918 typedef union vec2d vec2d;
 919
 920 static vec2d a={{1,2}}, b={{3,4}};
 921
 922 vec2d foo () {
 923     return (vec2d){ .v = a.v + b.v * (vec2d){{5,5}}.v };
 924 }
 925
 926 //===---------------------------------------------------------------------===//
 927
 928 This C++ file:
 929 void g(); struct A { int n; int m; A& operator++(void) { ++n; if (n == m) g();
 930 return *this; }    A() : n(0), m(0) { } friend bool operator!=(A const& a1,
 931 A const& a2) { return a1.n != a2.n; } }; void testfunction(A& iter) { A const
 932 end; while (iter != end) ++iter; }
 933
 934 Compiles down to:
 935
 936 bb:             ; preds = %bb3.backedge, %bb.nph
 937         %.rle = phi i32 [ %1, %bb.nph ], [ %7, %bb3.backedge ]          ; <i32> [#uses=1]
 938         %4 = add i32 %.rle, 1           ; <i32> [#uses=2]
 939         store i32 %4, i32* %0, align 4
 940         %5 = load i32* %3, align 4              ; <i32> [#uses=1]
 941         %6 = icmp eq i32 %4, %5         ; <i1> [#uses=1]
 942         br i1 %6, label %bb1, label %bb3.backedge
 943
 944 bb1:            ; preds = %bb
 945         tail call void @_Z1gv()
 946         br label %bb3.backedge
 947
 948 bb3.backedge:           ; preds = %bb, %bb1
 949         %7 = load i32* %0, align 4              ; <i32> [#uses=2]
 950
 951
 952 The %7 load is partially redundant with the store of %4 to %0, GVN's PRE
 953 should remove it, but it doesn't apply to memory objects.
 954
 955 //===---------------------------------------------------------------------===//
 956
 957 Better mod/ref analysis for scanf would allow us to eliminate the vtable and a
 958 bunch of other stuff from this example (see PR1604):
 959
 960 #include <cstdio>
 961 struct test {
 962     int val;
 963     virtual ~test() {}
 964 };
 965
 966 int main() {
 967     test t;
 968     std::scanf("%d", &t.val);
 969     std::printf("%d\n", t.val);
 970 }
 971
 972 //===---------------------------------------------------------------------===//
 973
 974 Instcombine will merge comparisons like (x >= 10) && (x < 20) by producing (x -
 975 10) u< 10, but only when the comparisons have matching sign.
 976
 977 This could be converted with a similar technique. (PR1941)
 978
 979 define i1 @test(i8 %x) {
 980   %A = icmp uge i8 %x, 5
 981   %B = icmp slt i8 %x, 20
 982   %C = and i1 %A, %B
 983   ret i1 %C
 984 }
 985
 986 //===---------------------------------------------------------------------===//
 987