lib/Target/X86/README.txt

   1 //===---------------------------------------------------------------------===//
   2 // Random ideas for the X86 backend.
   3 //===---------------------------------------------------------------------===//
   4
   5 We should add support for the "movbe" instruction, which does a byte-swapping
   6 copy (3-addr bswap + memory support?)  This is available on Atom processors.
   7
   8 //===---------------------------------------------------------------------===//
   9
  10 This should be one DIV/IDIV instruction, not a libcall:
  11
  12 unsigned test(unsigned long long X, unsigned Y) {
  13         return X/Y;
  14 }
  15
  16 This can be done trivially with a custom legalizer.  What about overflow
  17 though?  http://gcc.gnu.org/bugzilla/show_bug.cgi?id=14224
  18
  19 //===---------------------------------------------------------------------===//
  20
  21 Improvements to the multiply -> shift/add algorithm:
  22 http://gcc.gnu.org/ml/gcc-patches/2004-08/msg01590.html
  23
  24 //===---------------------------------------------------------------------===//
  25
  26 Improve code like this (occurs fairly frequently, e.g. in LLVM):
  27 long long foo(int x) { return 1LL << x; }
  28
  29 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01109.html
  30 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01128.html
  31 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html
  32
  33 Another useful one would be  ~0ULL >> X and ~0ULL << X.
  34
  35 One better solution for 1LL << x is:
  36         xorl    %eax, %eax
  37         xorl    %edx, %edx
  38         testb   $32, %cl
  39         sete    %al
  40         setne   %dl
  41         sall    %cl, %eax
  42         sall    %cl, %edx
  43
  44 But that requires good 8-bit subreg support.
  45
  46 Also, this might be better.  It's an extra shift, but it's one instruction
  47 shorter, and doesn't stress 8-bit subreg support.
  48 (From http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01148.html,
  49 but without the unnecessary and.)
  50         movl %ecx, %eax
  51         shrl $5, %eax
  52         movl %eax, %edx
  53         xorl $1, %edx
  54         sall %cl, %eax
  55         sall %cl. %edx
  56
  57 64-bit shifts (in general) expand to really bad code.  Instead of using
  58 cmovs, we should expand to a conditional branch like GCC produces.
  59
  60 //===---------------------------------------------------------------------===//
  61
  62 Some isel ideas:
  63
  64 1. Dynamic programming based approach when compile time if not an
  65    issue.
  66 2. Code duplication (addressing mode) during isel.
  67 3. Other ideas from "Register-Sensitive Selection, Duplication, and
  68    Sequencing of Instructions".
  69 4. Scheduling for reduced register pressure.  E.g. "Minimum Register
  70    Instruction Sequence Problem: Revisiting Optimal Code Generation for DAGs"
  71    and other related papers.
  72    http://citeseer.ist.psu.edu/govindarajan01minimum.html
  73
  74 //===---------------------------------------------------------------------===//
  75
  76 Should we promote i16 to i32 to avoid partial register update stalls?
  77
  78 //===---------------------------------------------------------------------===//
  79
  80 Leave any_extend as pseudo instruction and hint to register
  81 allocator. Delay codegen until post register allocation.
  82 Note. any_extend is now turned into an INSERT_SUBREG. We still need to teach
  83 the coalescer how to deal with it though.
  84
  85 //===---------------------------------------------------------------------===//
  86
  87 It appears icc use push for parameter passing. Need to investigate.
  88
  89 //===---------------------------------------------------------------------===//
  90
  91 This:
  92
  93 void foo(void);
  94 void bar(int x, int *P) {
  95   x >>= 2;
  96   if (x)
  97     foo();
  98   *P = x;
  99 }
 100
 101 compiles into:
 102
 103         movq    %rsi, %rbx
 104         movl    %edi, %r14d
 105         sarl    $2, %r14d
 106         testl   %r14d, %r14d
 107         je      LBB0_2
 108
 109 Instead of doing an explicit test, we can use the flags off the sar.  This
 110 occurs in a bigger testcase like this, which is pretty common:
 111
 112 #include <vector>
 113 int test1(std::vector<int> &X) {
 114   int Sum = 0;
 115   for (long i = 0, e = X.size(); i != e; ++i)
 116     X[i] = 0;
 117   return Sum;
 118 }
 119
 120 //===---------------------------------------------------------------------===//
 121
 122 Only use inc/neg/not instructions on processors where they are faster than
 123 add/sub/xor.  They are slower on the P4 due to only updating some processor
 124 flags.
 125
 126 //===---------------------------------------------------------------------===//
 127
 128 The instruction selector sometimes misses folding a load into a compare.  The
 129 pattern is written as (cmp reg, (load p)).  Because the compare isn't
 130 commutative, it is not matched with the load on both sides.  The dag combiner
 131 should be made smart enough to cannonicalize the load into the RHS of a compare
 132 when it can invert the result of the compare for free.
 133
 134 //===---------------------------------------------------------------------===//
 135
 136 In many cases, LLVM generates code like this:
 137
 138 _test:
 139         movl 8(%esp), %eax
 140         cmpl %eax, 4(%esp)
 141         setl %al
 142         movzbl %al, %eax
 143         ret
 144
 145 on some processors (which ones?), it is more efficient to do this:
 146
 147 _test:
 148         movl 8(%esp), %ebx
 149         xor  %eax, %eax
 150         cmpl %ebx, 4(%esp)
 151         setl %al
 152         ret
 153
 154 Doing this correctly is tricky though, as the xor clobbers the flags.
 155
 156 //===---------------------------------------------------------------------===//
 157
 158 We should generate bts/btr/etc instructions on targets where they are cheap or
 159 when codesize is important.  e.g., for:
 160
 161 void setbit(int *target, int bit) {
 162     *target |= (1 << bit);
 163 }
 164 void clearbit(int *target, int bit) {
 165     *target &= ~(1 << bit);
 166 }
 167
 168 //===---------------------------------------------------------------------===//
 169
 170 Instead of the following for memset char*, 1, 10:
 171
 172         movl $16843009, 4(%edx)
 173         movl $16843009, (%edx)
 174         movw $257, 8(%edx)
 175
 176 It might be better to generate
 177
 178         movl $16843009, %eax
 179         movl %eax, 4(%edx)
 180         movl %eax, (%edx)
 181         movw al, 8(%edx)
 182
 183 when we can spare a register. It reduces code size.
 184
 185 //===---------------------------------------------------------------------===//
 186
 187 Evaluate what the best way to codegen sdiv X, (2^C) is.  For X/8, we currently
 188 get this:
 189
 190 define i32 @test1(i32 %X) {
 191     %Y = sdiv i32 %X, 8
 192     ret i32 %Y
 193 }
 194
 195 _test1:
 196         movl 4(%esp), %eax
 197         movl %eax, %ecx
 198         sarl $31, %ecx
 199         shrl $29, %ecx
 200         addl %ecx, %eax
 201         sarl $3, %eax
 202         ret
 203
 204 GCC knows several different ways to codegen it, one of which is this:
 205
 206 _test1:
 207         movl    4(%esp), %eax
 208         cmpl    $-1, %eax
 209         leal    7(%eax), %ecx
 210         cmovle  %ecx, %eax
 211         sarl    $3, %eax
 212         ret
 213
 214 which is probably slower, but it's interesting at least :)
 215
 216 //===---------------------------------------------------------------------===//
 217
 218 We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and rep/movsl
 219 We should leave these as libcalls for everything over a much lower threshold,
 220 since libc is hand tuned for medium and large mem ops (avoiding RFO for large
 221 stores, TLB preheating, etc)
 222
 223 //===---------------------------------------------------------------------===//
 224
 225 Optimize this into something reasonable:
 226  x * copysign(1.0, y) * copysign(1.0, z)
 227
 228 //===---------------------------------------------------------------------===//
 229
 230 Optimize copysign(x, *y) to use an integer load from y.
 231
 232 //===---------------------------------------------------------------------===//
 233
 234 The following tests perform worse with LSR:
 235
 236 lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesor.
 237
 238 //===---------------------------------------------------------------------===//
 239
 240 Adding to the list of cmp / test poor codegen issues:
 241
 242 int test(__m128 *A, __m128 *B) {
 243   if (_mm_comige_ss(*A, *B))
 244     return 3;
 245   else
 246     return 4;
 247 }
 248
 249 _test:
 250         movl 8(%esp), %eax
 251         movaps (%eax), %xmm0
 252         movl 4(%esp), %eax
 253         movaps (%eax), %xmm1
 254         comiss %xmm0, %xmm1
 255         setae %al
 256         movzbl %al, %ecx
 257         movl $3, %eax
 258         movl $4, %edx
 259         cmpl $0, %ecx
 260         cmove %edx, %eax
 261         ret
 262
 263 Note the setae, movzbl, cmpl, cmove can be replaced with a single cmovae. There
 264 are a number of issues. 1) We are introducing a setcc between the result of the
 265 intrisic call and select. 2) The intrinsic is expected to produce a i32 value
 266 so a any extend (which becomes a zero extend) is added.
 267
 268 We probably need some kind of target DAG combine hook to fix this.
 269
 270 //===---------------------------------------------------------------------===//
 271
 272 We generate significantly worse code for this than GCC:
 273 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150
 274 http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701
 275
 276 There is also one case we do worse on PPC.
 277
 278 //===---------------------------------------------------------------------===//
 279
 280 For this:
 281
 282 int test(int a)
 283 {
 284   return a * 3;
 285 }
 286
 287 We currently emits
 288         imull $3, 4(%esp), %eax
 289
 290 Perhaps this is what we really should generate is? Is imull three or four
 291 cycles? Note: ICC generates this:
 292         movl    4(%esp), %eax
 293         leal    (%eax,%eax,2), %eax
 294
 295 The current instruction priority is based on pattern complexity. The former is
 296 more "complex" because it folds a load so the latter will not be emitted.
 297
 298 Perhaps we should use AddedComplexity to give LEA32r a higher priority? We
 299 should always try to match LEA first since the LEA matching code does some
 300 estimate to determine whether the match is profitable.
 301
 302 However, if we care more about code size, then imull is better. It's two bytes
 303 shorter than movl + leal.
 304
 305 On a Pentium M, both variants have the same characteristics with regard
 306 to throughput; however, the multiplication has a latency of four cycles, as
 307 opposed to two cycles for the movl+lea variant.
 308
 309 //===---------------------------------------------------------------------===//
 310
 311 __builtin_ffs codegen is messy.
 312
 313 int ffs_(unsigned X) { return __builtin_ffs(X); }
 314
 315 llvm produces:
 316 ffs_:
 317         movl    4(%esp), %ecx
 318         bsfl    %ecx, %eax
 319         movl    $32, %edx
 320         cmove   %edx, %eax
 321         incl    %eax
 322         xorl    %edx, %edx
 323         testl   %ecx, %ecx
 324         cmove   %edx, %eax
 325         ret
 326
 327 vs gcc:
 328
 329 _ffs_:
 330         movl    $-1, %edx
 331         bsfl    4(%esp), %eax
 332         cmove   %edx, %eax
 333         addl    $1, %eax
 334         ret
 335
 336 Another example of __builtin_ffs (use predsimplify to eliminate a select):
 337
 338 int foo (unsigned long j) {
 339   if (j)
 340     return __builtin_ffs (j) - 1;
 341   else
 342     return 0;
 343 }
 344
 345 //===---------------------------------------------------------------------===//
 346
 347 It appears gcc place string data with linkonce linkage in
 348 .section __TEXT,__const_coal,coalesced instead of
 349 .section __DATA,__const_coal,coalesced.
 350 Take a look at darwin.h, there are other Darwin assembler directives that we
 351 do not make use of.
 352
 353 //===---------------------------------------------------------------------===//
 354
 355 define i32 @foo(i32* %a, i32 %t) {
 356 entry:
 357         br label %cond_true
 358
 359 cond_true:              ; preds = %cond_true, %entry
 360         %x.0.0 = phi i32 [ 0, %entry ], [ %tmp9, %cond_true ]           ; <i32> [#uses=3]
 361         %t_addr.0.0 = phi i32 [ %t, %entry ], [ %tmp7, %cond_true ]             ; <i32> [#uses=1]
 362         %tmp2 = getelementptr i32* %a, i32 %x.0.0               ; <i32*> [#uses=1]
 363         %tmp3 = load i32* %tmp2         ; <i32> [#uses=1]
 364         %tmp5 = add i32 %t_addr.0.0, %x.0.0             ; <i32> [#uses=1]
 365         %tmp7 = add i32 %tmp5, %tmp3            ; <i32> [#uses=2]
 366         %tmp9 = add i32 %x.0.0, 1               ; <i32> [#uses=2]
 367         %tmp = icmp sgt i32 %tmp9, 39           ; <i1> [#uses=1]
 368         br i1 %tmp, label %bb12, label %cond_true
 369
 370 bb12:           ; preds = %cond_true
 371         ret i32 %tmp7
 372 }
 373 is pessimized by -loop-reduce and -indvars
 374
 375 //===---------------------------------------------------------------------===//
 376
 377 u32 to float conversion improvement:
 378
 379 float uint32_2_float( unsigned u ) {
 380   float fl = (int) (u & 0xffff);
 381   float fh = (int) (u >> 16);
 382   fh *= 0x1.0p16f;
 383   return fh + fl;
 384 }
 385
 386 00000000        subl    $0x04,%esp
 387 00000003        movl    0x08(%esp,1),%eax
 388 00000007        movl    %eax,%ecx
 389 00000009        shrl    $0x10,%ecx
 390 0000000c        cvtsi2ss        %ecx,%xmm0
 391 00000010        andl    $0x0000ffff,%eax
 392 00000015        cvtsi2ss        %eax,%xmm1
 393 00000019        mulss   0x00000078,%xmm0
 394 00000021        addss   %xmm1,%xmm0
 395 00000025        movss   %xmm0,(%esp,1)
 396 0000002a        flds    (%esp,1)
 397 0000002d        addl    $0x04,%esp
 398 00000030        ret
 399
 400 //===---------------------------------------------------------------------===//
 401
 402 When using fastcc abi, align stack slot of argument of type double on 8 byte
 403 boundary to improve performance.
 404
 405 //===---------------------------------------------------------------------===//
 406
 407 GCC's ix86_expand_int_movcc function (in i386.c) has a ton of interesting
 408 simplifications for integer "x cmp y ? a : b".
 409
 410 //===---------------------------------------------------------------------===//
 411
 412 Consider the expansion of:
 413
 414 define i32 @test3(i32 %X) {
 415         %tmp1 = urem i32 %X, 255
 416         ret i32 %tmp1
 417 }
 418
 419 Currently it compiles to:
 420
 421 ...
 422         movl $2155905153, %ecx
 423         movl 8(%esp), %esi
 424         movl %esi, %eax
 425         mull %ecx
 426 ...
 427
 428 This could be "reassociated" into:
 429
 430         movl $2155905153, %eax
 431         movl 8(%esp), %ecx
 432         mull %ecx
 433
 434 to avoid the copy.  In fact, the existing two-address stuff would do this
 435 except that mul isn't a commutative 2-addr instruction.  I guess this has
 436 to be done at isel time based on the #uses to mul?
 437
 438 //===---------------------------------------------------------------------===//
 439
 440 Make sure the instruction which starts a loop does not cross a cacheline
 441 boundary. This requires knowning the exact length of each machine instruction.
 442 That is somewhat complicated, but doable. Example 256.bzip2:
 443
 444 In the new trace, the hot loop has an instruction which crosses a cacheline
 445 boundary.  In addition to potential cache misses, this can't help decoding as I
 446 imagine there has to be some kind of complicated decoder reset and realignment
 447 to grab the bytes from the next cacheline.
 448
 449 532  532 0x3cfc movb     (1809(%esp, %esi), %bl   <<<--- spans 2 64 byte lines
 450 942  942 0x3d03 movl     %dh, (1809(%esp, %esi)
 451 937  937 0x3d0a incl     %esi
 452 3    3   0x3d0b cmpb     %bl, %dl
 453 27   27  0x3d0d jnz      0x000062db <main+11707>
 454
 455 //===---------------------------------------------------------------------===//
 456
 457 In c99 mode, the preprocessor doesn't like assembly comments like #TRUNCATE.
 458
 459 //===---------------------------------------------------------------------===//
 460
 461 This could be a single 16-bit load.
 462
 463 int f(char *p) {
 464     if ((p[0] == 1) & (p[1] == 2)) return 1;
 465     return 0;
 466 }
 467
 468 //===---------------------------------------------------------------------===//
 469
 470 We should inline lrintf and probably other libc functions.
 471
 472 //===---------------------------------------------------------------------===//
 473
 474 Use the FLAGS values from arithmetic instructions more.  For example, compile:
 475
 476 int add_zf(int *x, int y, int a, int b) {
 477      if ((*x += y) == 0)
 478           return a;
 479      else
 480           return b;
 481 }
 482
 483 to:
 484        addl    %esi, (%rdi)
 485        movl    %edx, %eax
 486        cmovne  %ecx, %eax
 487        ret
 488 instead of:
 489
 490 _add_zf:
 491         addl (%rdi), %esi
 492         movl %esi, (%rdi)
 493         testl %esi, %esi
 494         cmove %edx, %ecx
 495         movl %ecx, %eax
 496         ret
 497
 498 As another example, compile function f2 in test/CodeGen/X86/cmp-test.ll
 499 without a test instruction.
 500
 501 //===---------------------------------------------------------------------===//
 502
 503 These two functions have identical effects:
 504
 505 unsigned int f(unsigned int i, unsigned int n) {++i; if (i == n) ++i; return i;}
 506 unsigned int f2(unsigned int i, unsigned int n) {++i; i += i == n; return i;}
 507
 508 We currently compile them to:
 509
 510 _f:
 511         movl 4(%esp), %eax
 512         movl %eax, %ecx
 513         incl %ecx
 514         movl 8(%esp), %edx
 515         cmpl %edx, %ecx
 516         jne LBB1_2      #UnifiedReturnBlock
 517 LBB1_1: #cond_true
 518         addl $2, %eax
 519         ret
 520 LBB1_2: #UnifiedReturnBlock
 521         movl %ecx, %eax
 522         ret
 523 _f2:
 524         movl 4(%esp), %eax
 525         movl %eax, %ecx
 526         incl %ecx
 527         cmpl 8(%esp), %ecx
 528         sete %cl
 529         movzbl %cl, %ecx
 530         leal 1(%ecx,%eax), %eax
 531         ret
 532
 533 both of which are inferior to GCC's:
 534
 535 _f:
 536         movl    4(%esp), %edx
 537         leal    1(%edx), %eax
 538         addl    $2, %edx
 539         cmpl    8(%esp), %eax
 540         cmove   %edx, %eax
 541         ret
 542 _f2:
 543         movl    4(%esp), %eax
 544         addl    $1, %eax
 545         xorl    %edx, %edx
 546         cmpl    8(%esp), %eax
 547         sete    %dl
 548         addl    %edx, %eax
 549         ret
 550
 551 //===---------------------------------------------------------------------===//
 552
 553 This code:
 554
 555 void test(int X) {
 556   if (X) abort();
 557 }
 558
 559 is currently compiled to:
 560
 561 _test:
 562         subl $12, %esp
 563         cmpl $0, 16(%esp)
 564         jne LBB1_1
 565         addl $12, %esp
 566         ret
 567 LBB1_1:
 568         call L_abort$stub
 569
 570 It would be better to produce:
 571
 572 _test:
 573         subl $12, %esp
 574         cmpl $0, 16(%esp)
 575         jne L_abort$stub
 576         addl $12, %esp
 577         ret
 578
 579 This can be applied to any no-return function call that takes no arguments etc.
 580 Alternatively, the stack save/restore logic could be shrink-wrapped, producing
 581 something like this:
 582
 583 _test:
 584         cmpl $0, 4(%esp)
 585         jne LBB1_1
 586         ret
 587 LBB1_1:
 588         subl $12, %esp
 589         call L_abort$stub
 590
 591 Both are useful in different situations.  Finally, it could be shrink-wrapped
 592 and tail called, like this:
 593
 594 _test:
 595         cmpl $0, 4(%esp)
 596         jne LBB1_1
 597         ret
 598 LBB1_1:
 599         pop %eax   # realign stack.
 600         call L_abort$stub
 601
 602 Though this probably isn't worth it.
 603
 604 //===---------------------------------------------------------------------===//
 605
 606 Sometimes it is better to codegen subtractions from a constant (e.g. 7-x) with
 607 a neg instead of a sub instruction.  Consider:
 608
 609 int test(char X) { return 7-X; }
 610
 611 we currently produce:
 612 _test:
 613         movl $7, %eax
 614         movsbl 4(%esp), %ecx
 615         subl %ecx, %eax
 616         ret
 617
 618 We would use one fewer register if codegen'd as:
 619
 620         movsbl 4(%esp), %eax
 621         neg %eax
 622         add $7, %eax
 623         ret
 624
 625 Note that this isn't beneficial if the load can be folded into the sub.  In
 626 this case, we want a sub:
 627
 628 int test(int X) { return 7-X; }
 629 _test:
 630         movl $7, %eax
 631         subl 4(%esp), %eax
 632         ret
 633
 634 //===---------------------------------------------------------------------===//
 635
 636 Leaf functions that require one 4-byte spill slot have a prolog like this:
 637
 638 _foo:
 639         pushl   %esi
 640         subl    $4, %esp
 641 ...
 642 and an epilog like this:
 643         addl    $4, %esp
 644         popl    %esi
 645         ret
 646
 647 It would be smaller, and potentially faster, to push eax on entry and to
 648 pop into a dummy register instead of using addl/subl of esp.  Just don't pop
 649 into any return registers :)
 650
 651 //===---------------------------------------------------------------------===//
 652
 653 The X86 backend should fold (branch (or (setcc, setcc))) into multiple
 654 branches.  We generate really poor code for:
 655
 656 double testf(double a) {
 657        return a == 0.0 ? 0.0 : (a > 0.0 ? 1.0 : -1.0);
 658 }
 659
 660 For example, the entry BB is:
 661
 662 _testf:
 663         subl    $20, %esp
 664         pxor    %xmm0, %xmm0
 665         movsd   24(%esp), %xmm1
 666         ucomisd %xmm0, %xmm1
 667         setnp   %al
 668         sete    %cl
 669         testb   %cl, %al
 670         jne     LBB1_5  # UnifiedReturnBlock
 671 LBB1_1: # cond_true
 672
 673
 674 it would be better to replace the last four instructions with:
 675
 676         jp LBB1_1
 677         je LBB1_5
 678 LBB1_1:
 679
 680 We also codegen the inner ?: into a diamond:
 681
 682        cvtss2sd        LCPI1_0(%rip), %xmm2
 683         cvtss2sd        LCPI1_1(%rip), %xmm3
 684         ucomisd %xmm1, %xmm0
 685         ja      LBB1_3  # cond_true
 686 LBB1_2: # cond_true
 687         movapd  %xmm3, %xmm2
 688 LBB1_3: # cond_true
 689         movapd  %xmm2, %xmm0
 690         ret
 691
 692 We should sink the load into xmm3 into the LBB1_2 block.  This should
 693 be pretty easy, and will nuke all the copies.
 694
 695 //===---------------------------------------------------------------------===//
 696
 697 This:
 698         #include <algorithm>
 699         inline std::pair<unsigned, bool> full_add(unsigned a, unsigned b)
 700         { return std::make_pair(a + b, a + b < a); }
 701         bool no_overflow(unsigned a, unsigned b)
 702         { return !full_add(a, b).second; }
 703
 704 Should compile to:
 705         addl    %esi, %edi
 706         setae   %al
 707         movzbl  %al, %eax
 708         ret
 709
 710 on x86-64, instead of the rather stupid-looking:
 711         addl    %esi, %edi
 712         setb    %al
 713         xorb    $1, %al
 714         movzbl  %al, %eax
 715         ret
 716
 717
 718 //===---------------------------------------------------------------------===//
 719
 720 The following code:
 721
 722 bb114.preheader:                ; preds = %cond_next94
 723         %tmp231232 = sext i16 %tmp62 to i32             ; <i32> [#uses=1]
 724         %tmp233 = sub i32 32, %tmp231232                ; <i32> [#uses=1]
 725         %tmp245246 = sext i16 %tmp65 to i32             ; <i32> [#uses=1]
 726         %tmp252253 = sext i16 %tmp68 to i32             ; <i32> [#uses=1]
 727         %tmp254 = sub i32 32, %tmp252253                ; <i32> [#uses=1]
 728         %tmp553554 = bitcast i16* %tmp37 to i8*         ; <i8*> [#uses=2]
 729         %tmp583584 = sext i16 %tmp98 to i32             ; <i32> [#uses=1]
 730         %tmp585 = sub i32 32, %tmp583584                ; <i32> [#uses=1]
 731         %tmp614615 = sext i16 %tmp101 to i32            ; <i32> [#uses=1]
 732         %tmp621622 = sext i16 %tmp104 to i32            ; <i32> [#uses=1]
 733         %tmp623 = sub i32 32, %tmp621622                ; <i32> [#uses=1]
 734         br label %bb114
 735
 736 produces:
 737
 738 LBB3_5: # bb114.preheader
 739         movswl  -68(%ebp), %eax
 740         movl    $32, %ecx
 741         movl    %ecx, -80(%ebp)
 742         subl    %eax, -80(%ebp)
 743         movswl  -52(%ebp), %eax
 744         movl    %ecx, -84(%ebp)
 745         subl    %eax, -84(%ebp)
 746         movswl  -70(%ebp), %eax
 747         movl    %ecx, -88(%ebp)
 748         subl    %eax, -88(%ebp)
 749         movswl  -50(%ebp), %eax
 750         subl    %eax, %ecx
 751         movl    %ecx, -76(%ebp)
 752         movswl  -42(%ebp), %eax
 753         movl    %eax, -92(%ebp)
 754         movswl  -66(%ebp), %eax
 755         movl    %eax, -96(%ebp)
 756         movw    $0, -98(%ebp)
 757
 758 This appears to be bad because the RA is not folding the store to the stack
 759 slot into the movl.  The above instructions could be:
 760         movl    $32, -80(%ebp)
 761 ...
 762         movl    $32, -84(%ebp)
 763 ...
 764 This seems like a cross between remat and spill folding.
 765
 766 This has redundant subtractions of %eax from a stack slot. However, %ecx doesn't
 767 change, so we could simply subtract %eax from %ecx first and then use %ecx (or
 768 vice-versa).
 769
 770 //===---------------------------------------------------------------------===//
 771
 772 This code:
 773
 774         %tmp659 = icmp slt i16 %tmp654, 0               ; <i1> [#uses=1]
 775         br i1 %tmp659, label %cond_true662, label %cond_next715
 776
 777 produces this:
 778
 779         testw   %cx, %cx
 780         movswl  %cx, %esi
 781         jns     LBB4_109        # cond_next715
 782
 783 Shark tells us that using %cx in the testw instruction is sub-optimal. It
 784 suggests using the 32-bit register (which is what ICC uses).
 785
 786 //===---------------------------------------------------------------------===//
 787
 788 We compile this:
 789
 790 void compare (long long foo) {
 791   if (foo < 4294967297LL)
 792     abort();
 793 }
 794
 795 to:
 796
 797 compare:
 798         subl    $4, %esp
 799         cmpl    $0, 8(%esp)
 800         setne   %al
 801         movzbw  %al, %ax
 802         cmpl    $1, 12(%esp)
 803         setg    %cl
 804         movzbw  %cl, %cx
 805         cmove   %ax, %cx
 806         testb   $1, %cl
 807         jne     .LBB1_2 # UnifiedReturnBlock
 808 .LBB1_1:        # ifthen
 809         call    abort
 810 .LBB1_2:        # UnifiedReturnBlock
 811         addl    $4, %esp
 812         ret
 813
 814 (also really horrible code on ppc).  This is due to the expand code for 64-bit
 815 compares.  GCC produces multiple branches, which is much nicer:
 816
 817 compare:
 818         subl    $12, %esp
 819         movl    20(%esp), %edx
 820         movl    16(%esp), %eax
 821         decl    %edx
 822         jle     .L7
 823 .L5:
 824         addl    $12, %esp
 825         ret
 826         .p2align 4,,7
 827 .L7:
 828         jl      .L4
 829         cmpl    $0, %eax
 830         .p2align 4,,8
 831         ja      .L5
 832 .L4:
 833         .p2align 4,,9
 834         call    abort
 835
 836 //===---------------------------------------------------------------------===//
 837
 838 Tail call optimization improvements: Tail call optimization currently
 839 pushes all arguments on the top of the stack (their normal place for
 840 non-tail call optimized calls) that source from the callers arguments
 841 or  that source from a virtual register (also possibly sourcing from
 842 callers arguments).
 843 This is done to prevent overwriting of parameters (see example
 844 below) that might be used later.
 845
 846 example:
 847
 848 int callee(int32, int64);
 849 int caller(int32 arg1, int32 arg2) {
 850   int64 local = arg2 * 2;
 851   return callee(arg2, (int64)local);
 852 }
 853
 854 [arg1]          [!arg2 no longer valid since we moved local onto it]
 855 [arg2]      ->  [(int64)
 856 [RETADDR]        local  ]
 857
 858 Moving arg1 onto the stack slot of callee function would overwrite
 859 arg2 of the caller.
 860
 861 Possible optimizations:
 862
 863
 864  - Analyse the actual parameters of the callee to see which would
 865    overwrite a caller parameter which is used by the callee and only
 866    push them onto the top of the stack.
 867
 868    int callee (int32 arg1, int32 arg2);
 869    int caller (int32 arg1, int32 arg2) {
 870        return callee(arg1,arg2);
 871    }
 872
 873    Here we don't need to write any variables to the top of the stack
 874    since they don't overwrite each other.
 875
 876    int callee (int32 arg1, int32 arg2);
 877    int caller (int32 arg1, int32 arg2) {
 878        return callee(arg2,arg1);
 879    }
 880
 881    Here we need to push the arguments because they overwrite each
 882    other.
 883
 884 //===---------------------------------------------------------------------===//
 885
 886 main ()
 887 {
 888   int i = 0;
 889   unsigned long int z = 0;
 890
 891   do {
 892     z -= 0x00004000;
 893     i++;
 894     if (i > 0x00040000)
 895       abort ();
 896   } while (z > 0);
 897   exit (0);
 898 }
 899
 900 gcc compiles this to:
 901
 902 _main:
 903         subl    $28, %esp
 904         xorl    %eax, %eax
 905         jmp     L2
 906 L3:
 907         cmpl    $262144, %eax
 908         je      L10
 909 L2:
 910         addl    $1, %eax
 911         cmpl    $262145, %eax
 912         jne     L3
 913         call    L_abort$stub
 914 L10:
 915         movl    $0, (%esp)
 916         call    L_exit$stub
 917
 918 llvm:
 919
 920 _main:
 921         subl    $12, %esp
 922         movl    $1, %eax
 923         movl    $16384, %ecx
 924 LBB1_1: # bb
 925         cmpl    $262145, %eax
 926         jge     LBB1_4  # cond_true
 927 LBB1_2: # cond_next
 928         incl    %eax
 929         addl    $4294950912, %ecx
 930         cmpl    $16384, %ecx
 931         jne     LBB1_1  # bb
 932 LBB1_3: # bb11
 933         xorl    %eax, %eax
 934         addl    $12, %esp
 935         ret
 936 LBB1_4: # cond_true
 937         call    L_abort$stub
 938
 939 1. LSR should rewrite the first cmp with induction variable %ecx.
 940 2. DAG combiner should fold
 941         leal    1(%eax), %edx
 942         cmpl    $262145, %edx
 943    =>
 944         cmpl    $262144, %eax
 945
 946 //===---------------------------------------------------------------------===//
 947
 948 define i64 @test(double %X) {
 949         %Y = fptosi double %X to i64
 950         ret i64 %Y
 951 }
 952
 953 compiles to:
 954
 955 _test:
 956         subl    $20, %esp
 957         movsd   24(%esp), %xmm0
 958         movsd   %xmm0, 8(%esp)
 959         fldl    8(%esp)
 960         fisttpll        (%esp)
 961         movl    4(%esp), %edx
 962         movl    (%esp), %eax
 963         addl    $20, %esp
 964         #FP_REG_KILL
 965         ret
 966
 967 This should just fldl directly from the input stack slot.
 968
 969 //===---------------------------------------------------------------------===//
 970
 971 This code:
 972 int foo (int x) { return (x & 65535) | 255; }
 973
 974 Should compile into:
 975
 976 _foo:
 977         movzwl  4(%esp), %eax
 978         orl     $255, %eax
 979         ret
 980
 981 instead of:
 982 _foo:
 983         movl    $65280, %eax
 984         andl    4(%esp), %eax
 985         orl     $255, %eax
 986         ret
 987
 988 //===---------------------------------------------------------------------===//
 989
 990 We're codegen'ing multiply of long longs inefficiently:
 991
 992 unsigned long long LLM(unsigned long long arg1, unsigned long long arg2) {
 993   return arg1 *  arg2;
 994 }
 995
 996 We compile to (fomit-frame-pointer):
 997
 998 _LLM:
 999         pushl   %esi
1000         movl    8(%esp), %ecx
1001         movl    16(%esp), %esi
1002         movl    %esi, %eax
1003         mull    %ecx
1004         imull   12(%esp), %esi
1005         addl    %edx, %esi
1006         imull   20(%esp), %ecx
1007         movl    %esi, %edx
1008         addl    %ecx, %edx
1009         popl    %esi
1010         ret
1011
1012 This looks like a scheduling deficiency and lack of remat of the load from
1013 the argument area.  ICC apparently produces:
1014
1015         movl      8(%esp), %ecx
1016         imull     12(%esp), %ecx
1017         movl      16(%esp), %eax
1018         imull     4(%esp), %eax
1019         addl      %eax, %ecx
1020         movl      4(%esp), %eax
1021         mull      12(%esp)
1022         addl      %ecx, %edx
1023         ret
1024
1025 Note that it remat'd loads from 4(esp) and 12(esp).  See this GCC PR:
1026 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=17236
1027
1028 //===---------------------------------------------------------------------===//
1029
1030 We can fold a store into "zeroing a reg".  Instead of:
1031
1032 xorl    %eax, %eax
1033 movl    %eax, 124(%esp)
1034
1035 we should get:
1036
1037 movl    $0, 124(%esp)
1038
1039 if the flags of the xor are dead.
1040
1041 Likewise, we isel "x<<1" into "add reg,reg".  If reg is spilled, this should
1042 be folded into: shl [mem], 1
1043
1044 //===---------------------------------------------------------------------===//
1045
1046 In SSE mode, we turn abs and neg into a load from the constant pool plus a xor
1047 or and instruction, for example:
1048
1049         xorpd   LCPI1_0, %xmm2
1050
1051 However, if xmm2 gets spilled, we end up with really ugly code like this:
1052
1053         movsd   (%esp), %xmm0
1054         xorpd   LCPI1_0, %xmm0
1055         movsd   %xmm0, (%esp)
1056
1057 Since we 'know' that this is a 'neg', we can actually "fold" the spill into
1058 the neg/abs instruction, turning it into an *integer* operation, like this:
1059
1060         xorl 2147483648, [mem+4]     ## 2147483648 = (1 << 31)
1061
1062 you could also use xorb, but xorl is less likely to lead to a partial register
1063 stall.  Here is a contrived testcase:
1064
1065 double a, b, c;
1066 void test(double *P) {
1067   double X = *P;
1068   a = X;
1069   bar();
1070   X = -X;
1071   b = X;
1072   bar();
1073   c = X;
1074 }
1075
1076 //===---------------------------------------------------------------------===//
1077
1078 The generated code on x86 for checking for signed overflow on a multiply the
1079 obvious way is much longer than it needs to be.
1080
1081 int x(int a, int b) {
1082   long long prod = (long long)a*b;
1083   return  prod > 0x7FFFFFFF || prod < (-0x7FFFFFFF-1);
1084 }
1085
1086 See PR2053 for more details.
1087
1088 //===---------------------------------------------------------------------===//
1089
1090 We should investigate using cdq/ctld (effect: edx = sar eax, 31)
1091 more aggressively; it should cost the same as a move+shift on any modern
1092 processor, but it's a lot shorter. Downside is that it puts more
1093 pressure on register allocation because it has fixed operands.
1094
1095 Example:
1096 int abs(int x) {return x < 0 ? -x : x;}
1097
1098 gcc compiles this to the following when using march/mtune=pentium2/3/4/m/etc.:
1099 abs:
1100         movl    4(%esp), %eax
1101         cltd
1102         xorl    %edx, %eax
1103         subl    %edx, %eax
1104         ret
1105
1106 //===---------------------------------------------------------------------===//
1107
1108 Take the following code (from
1109 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=16541):
1110
1111 extern unsigned char first_one[65536];
1112 int FirstOnet(unsigned long long arg1)
1113 {
1114   if (arg1 >> 48)
1115     return (first_one[arg1 >> 48]);
1116   return 0;
1117 }
1118
1119
1120 The following code is currently generated:
1121 FirstOnet:
1122         movl    8(%esp), %eax
1123         cmpl    $65536, %eax
1124         movl    4(%esp), %ecx
1125         jb      .LBB1_2 # UnifiedReturnBlock
1126 .LBB1_1:        # ifthen
1127         shrl    $16, %eax
1128         movzbl  first_one(%eax), %eax
1129         ret
1130 .LBB1_2:        # UnifiedReturnBlock
1131         xorl    %eax, %eax
1132         ret
1133
1134 We could change the "movl 8(%esp), %eax" into "movzwl 10(%esp), %eax"; this
1135 lets us change the cmpl into a testl, which is shorter, and eliminate the shift.
1136
1137 //===---------------------------------------------------------------------===//
1138
1139 We compile this function:
1140
1141 define i32 @foo(i32 %a, i32 %b, i32 %c, i8 zeroext  %d) nounwind  {
1142 entry:
1143         %tmp2 = icmp eq i8 %d, 0                ; <i1> [#uses=1]
1144         br i1 %tmp2, label %bb7, label %bb
1145
1146 bb:             ; preds = %entry
1147         %tmp6 = add i32 %b, %a          ; <i32> [#uses=1]
1148         ret i32 %tmp6
1149
1150 bb7:            ; preds = %entry
1151         %tmp10 = sub i32 %a, %c         ; <i32> [#uses=1]
1152         ret i32 %tmp10
1153 }
1154
1155 to:
1156
1157 foo:                                    # @foo
1158 # BB#0:                                 # %entry
1159         movl    4(%esp), %ecx
1160         cmpb    $0, 16(%esp)
1161         je      .LBB0_2
1162 # BB#1:                                 # %bb
1163         movl    8(%esp), %eax
1164         addl    %ecx, %eax
1165         ret
1166 .LBB0_2:                                # %bb7
1167         movl    12(%esp), %edx
1168         movl    %ecx, %eax
1169         subl    %edx, %eax
1170         ret
1171
1172 There's an obviously unnecessary movl in .LBB0_2, and we could eliminate a
1173 couple more movls by putting 4(%esp) into %eax instead of %ecx.
1174
1175 //===---------------------------------------------------------------------===//
1176
1177 See rdar://4653682.
1178
1179 From flops:
1180
1181 LBB1_15:        # bb310
1182         cvtss2sd        LCPI1_0, %xmm1
1183         addsd   %xmm1, %xmm0
1184         movsd   176(%esp), %xmm2
1185         mulsd   %xmm0, %xmm2
1186         movapd  %xmm2, %xmm3
1187         mulsd   %xmm3, %xmm3
1188         movapd  %xmm3, %xmm4
1189         mulsd   LCPI1_23, %xmm4
1190         addsd   LCPI1_24, %xmm4
1191         mulsd   %xmm3, %xmm4
1192         addsd   LCPI1_25, %xmm4
1193         mulsd   %xmm3, %xmm4
1194         addsd   LCPI1_26, %xmm4
1195         mulsd   %xmm3, %xmm4
1196         addsd   LCPI1_27, %xmm4
1197         mulsd   %xmm3, %xmm4
1198         addsd   LCPI1_28, %xmm4
1199         mulsd   %xmm3, %xmm4
1200         addsd   %xmm1, %xmm4
1201         mulsd   %xmm2, %xmm4
1202         movsd   152(%esp), %xmm1
1203         addsd   %xmm4, %xmm1
1204         movsd   %xmm1, 152(%esp)
1205         incl    %eax
1206         cmpl    %eax, %esi
1207         jge     LBB1_15 # bb310
1208 LBB1_16:        # bb358.loopexit
1209         movsd   152(%esp), %xmm0
1210         addsd   %xmm0, %xmm0
1211         addsd   LCPI1_22, %xmm0
1212         movsd   %xmm0, 152(%esp)
1213
1214 Rather than spilling the result of the last addsd in the loop, we should have
1215 insert a copy to split the interval (one for the duration of the loop, one
1216 extending to the fall through). The register pressure in the loop isn't high
1217 enough to warrant the spill.
1218
1219 Also check why xmm7 is not used at all in the function.
1220
1221 //===---------------------------------------------------------------------===//
1222
1223 Take the following:
1224
1225 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
1226 target triple = "i386-apple-darwin8"
1227 @in_exit.4870.b = internal global i1 false              ; <i1*> [#uses=2]
1228 define fastcc void @abort_gzip() noreturn nounwind  {
1229 entry:
1230         %tmp.b.i = load i1* @in_exit.4870.b             ; <i1> [#uses=1]
1231         br i1 %tmp.b.i, label %bb.i, label %bb4.i
1232 bb.i:           ; preds = %entry
1233         tail call void @exit( i32 1 ) noreturn nounwind
1234         unreachable
1235 bb4.i:          ; preds = %entry
1236         store i1 true, i1* @in_exit.4870.b
1237         tail call void @exit( i32 1 ) noreturn nounwind
1238         unreachable
1239 }
1240 declare void @exit(i32) noreturn nounwind
1241
1242 This compiles into:
1243 _abort_gzip:                            ## @abort_gzip
1244 ## BB#0:                                ## %entry
1245         subl    $12, %esp
1246         movb    _in_exit.4870.b, %al
1247         cmpb    $1, %al
1248         jne     LBB0_2
1249
1250 We somehow miss folding the movb into the cmpb.
1251
1252 //===---------------------------------------------------------------------===//
1253
1254 We compile:
1255
1256 int test(int x, int y) {
1257   return x-y-1;
1258 }
1259
1260 into (-m64):
1261
1262 _test:
1263         decl    %edi
1264         movl    %edi, %eax
1265         subl    %esi, %eax
1266         ret
1267
1268 it would be better to codegen as: x+~y  (notl+addl)
1269
1270 //===---------------------------------------------------------------------===//
1271
1272 This code:
1273
1274 int foo(const char *str,...)
1275 {
1276  __builtin_va_list a; int x;
1277  __builtin_va_start(a,str); x = __builtin_va_arg(a,int); __builtin_va_end(a);
1278  return x;
1279 }
1280
1281 gets compiled into this on x86-64:
1282         subq    $200, %rsp
1283         movaps  %xmm7, 160(%rsp)
1284         movaps  %xmm6, 144(%rsp)
1285         movaps  %xmm5, 128(%rsp)
1286         movaps  %xmm4, 112(%rsp)
1287         movaps  %xmm3, 96(%rsp)
1288         movaps  %xmm2, 80(%rsp)
1289         movaps  %xmm1, 64(%rsp)
1290         movaps  %xmm0, 48(%rsp)
1291         movq    %r9, 40(%rsp)
1292         movq    %r8, 32(%rsp)
1293         movq    %rcx, 24(%rsp)
1294         movq    %rdx, 16(%rsp)
1295         movq    %rsi, 8(%rsp)
1296         leaq    (%rsp), %rax
1297         movq    %rax, 192(%rsp)
1298         leaq    208(%rsp), %rax
1299         movq    %rax, 184(%rsp)
1300         movl    $48, 180(%rsp)
1301         movl    $8, 176(%rsp)
1302         movl    176(%rsp), %eax
1303         cmpl    $47, %eax
1304         jbe     .LBB1_3 # bb
1305 .LBB1_1:        # bb3
1306         movq    184(%rsp), %rcx
1307         leaq    8(%rcx), %rax
1308         movq    %rax, 184(%rsp)
1309 .LBB1_2:        # bb4
1310         movl    (%rcx), %eax
1311         addq    $200, %rsp
1312         ret
1313 .LBB1_3:        # bb
1314         movl    %eax, %ecx
1315         addl    $8, %eax
1316         addq    192(%rsp), %rcx
1317         movl    %eax, 176(%rsp)
1318         jmp     .LBB1_2 # bb4
1319
1320 gcc 4.3 generates:
1321         subq    $96, %rsp
1322 .LCFI0:
1323         leaq    104(%rsp), %rax
1324         movq    %rsi, -80(%rsp)
1325         movl    $8, -120(%rsp)
1326         movq    %rax, -112(%rsp)
1327         leaq    -88(%rsp), %rax
1328         movq    %rax, -104(%rsp)
1329         movl    $8, %eax
1330         cmpl    $48, %eax
1331         jb      .L6
1332         movq    -112(%rsp), %rdx
1333         movl    (%rdx), %eax
1334         addq    $96, %rsp
1335         ret
1336         .p2align 4,,10
1337         .p2align 3
1338 .L6:
1339         mov     %eax, %edx
1340         addq    -104(%rsp), %rdx
1341         addl    $8, %eax
1342         movl    %eax, -120(%rsp)
1343         movl    (%rdx), %eax
1344         addq    $96, %rsp
1345         ret
1346
1347 and it gets compiled into this on x86:
1348         pushl   %ebp
1349         movl    %esp, %ebp
1350         subl    $4, %esp
1351         leal    12(%ebp), %eax
1352         movl    %eax, -4(%ebp)
1353         leal    16(%ebp), %eax
1354         movl    %eax, -4(%ebp)
1355         movl    12(%ebp), %eax
1356         addl    $4, %esp
1357         popl    %ebp
1358         ret
1359
1360 gcc 4.3 generates:
1361         pushl   %ebp
1362         movl    %esp, %ebp
1363         movl    12(%ebp), %eax
1364         popl    %ebp
1365         ret
1366
1367 //===---------------------------------------------------------------------===//
1368
1369 Teach tblgen not to check bitconvert source type in some cases. This allows us
1370 to consolidate the following patterns in X86InstrMMX.td:
1371
1372 def : Pat<(v2i32 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
1373                                                   (iPTR 0))))),
1374           (v2i32 (MMX_MOVDQ2Qrr VR128:$src))>;
1375 def : Pat<(v4i16 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
1376                                                   (iPTR 0))))),
1377           (v4i16 (MMX_MOVDQ2Qrr VR128:$src))>;
1378 def : Pat<(v8i8 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
1379                                                   (iPTR 0))))),
1380           (v8i8 (MMX_MOVDQ2Qrr VR128:$src))>;
1381
1382 There are other cases in various td files.
1383
1384 //===---------------------------------------------------------------------===//
1385
1386 Take something like the following on x86-32:
1387 unsigned a(unsigned long long x, unsigned y) {return x % y;}
1388
1389 We currently generate a libcall, but we really shouldn't: the expansion is
1390 shorter and likely faster than the libcall.  The expected code is something
1391 like the following:
1392
1393         movl    12(%ebp), %eax
1394         movl    16(%ebp), %ecx
1395         xorl    %edx, %edx
1396         divl    %ecx
1397         movl    8(%ebp), %eax
1398         divl    %ecx
1399         movl    %edx, %eax
1400         ret
1401
1402 A similar code sequence works for division.
1403
1404 //===---------------------------------------------------------------------===//
1405
1406 These should compile to the same code, but the later codegen's to useless
1407 instructions on X86. This may be a trivial dag combine (GCC PR7061):
1408
1409 struct s1 { unsigned char a, b; };
1410 unsigned long f1(struct s1 x) {
1411     return x.a + x.b;
1412 }
1413 struct s2 { unsigned a: 8, b: 8; };
1414 unsigned long f2(struct s2 x) {
1415     return x.a + x.b;
1416 }
1417
1418 //===---------------------------------------------------------------------===//
1419
1420 We currently compile this:
1421
1422 define i32 @func1(i32 %v1, i32 %v2) nounwind {
1423 entry:
1424   %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
1425   %sum = extractvalue {i32, i1} %t, 0
1426   %obit = extractvalue {i32, i1} %t, 1
1427   br i1 %obit, label %overflow, label %normal
1428 normal:
1429   ret i32 %sum
1430 overflow:
1431   call void @llvm.trap()
1432   unreachable
1433 }
1434 declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32)
1435 declare void @llvm.trap()
1436
1437 to:
1438
1439 _func1:
1440         movl    4(%esp), %eax
1441         addl    8(%esp), %eax
1442         jo      LBB1_2  ## overflow
1443 LBB1_1: ## normal
1444         ret
1445 LBB1_2: ## overflow
1446         ud2
1447
1448 it would be nice to produce "into" someday.
1449
1450 //===---------------------------------------------------------------------===//
1451
1452 This code:
1453
1454 void vec_mpys1(int y[], const int x[], int scaler) {
1455 int i;
1456 for (i = 0; i < 150; i++)
1457  y[i] += (((long long)scaler * (long long)x[i]) >> 31);
1458 }
1459
1460 Compiles to this loop with GCC 3.x:
1461
1462 .L5:
1463         movl    %ebx, %eax
1464         imull   (%edi,%ecx,4)
1465         shrdl   $31, %edx, %eax
1466         addl    %eax, (%esi,%ecx,4)
1467         incl    %ecx
1468         cmpl    $149, %ecx
1469         jle     .L5
1470
1471 llvm-gcc compiles it to the much uglier:
1472
1473 LBB1_1: ## bb1
1474         movl    24(%esp), %eax
1475         movl    (%eax,%edi,4), %ebx
1476         movl    %ebx, %ebp
1477         imull   %esi, %ebp
1478         movl    %ebx, %eax
1479         mull    %ecx
1480         addl    %ebp, %edx
1481         sarl    $31, %ebx
1482         imull   %ecx, %ebx
1483         addl    %edx, %ebx
1484         shldl   $1, %eax, %ebx
1485         movl    20(%esp), %eax
1486         addl    %ebx, (%eax,%edi,4)
1487         incl    %edi
1488         cmpl    $150, %edi
1489         jne     LBB1_1  ## bb1
1490
1491 The issue is that we hoist the cast of "scaler" to long long outside of the
1492 loop, the value comes into the loop as two values, and
1493 RegsForValue::getCopyFromRegs doesn't know how to put an AssertSext on the
1494 constructed BUILD_PAIR which represents the cast value.
1495
1496 This can be handled by making CodeGenPrepare sink the cast.
1497
1498 //===---------------------------------------------------------------------===//
1499
1500 Test instructions can be eliminated by using EFLAGS values from arithmetic
1501 instructions. This is currently not done for mul, and, or, xor, neg, shl,
1502 sra, srl, shld, shrd, atomic ops, and others. It is also currently not done
1503 for read-modify-write instructions. It is also current not done if the
1504 OF or CF flags are needed.
1505
1506 The shift operators have the complication that when the shift count is
1507 zero, EFLAGS is not set, so they can only subsume a test instruction if
1508 the shift count is known to be non-zero. Also, using the EFLAGS value
1509 from a shift is apparently very slow on some x86 implementations.
1510
1511 In read-modify-write instructions, the root node in the isel match is
1512 the store, and isel has no way for the use of the EFLAGS result of the
1513 arithmetic to be remapped to the new node.
1514
1515 Add and subtract instructions set OF on signed overflow and CF on unsiged
1516 overflow, while test instructions always clear OF and CF. In order to
1517 replace a test with an add or subtract in a situation where OF or CF is
1518 needed, codegen must be able to prove that the operation cannot see
1519 signed or unsigned overflow, respectively.
1520
1521 //===---------------------------------------------------------------------===//
1522
1523 memcpy/memmove do not lower to SSE copies when possible.  A silly example is:
1524 define <16 x float> @foo(<16 x float> %A) nounwind {
1525         %tmp = alloca <16 x float>, align 16
1526         %tmp2 = alloca <16 x float>, align 16
1527         store <16 x float> %A, <16 x float>* %tmp
1528         %s = bitcast <16 x float>* %tmp to i8*
1529         %s2 = bitcast <16 x float>* %tmp2 to i8*
1530         call void @llvm.memcpy.i64(i8* %s, i8* %s2, i64 64, i32 16)
1531         %R = load <16 x float>* %tmp2
1532         ret <16 x float> %R
1533 }
1534
1535 declare void @llvm.memcpy.i64(i8* nocapture, i8* nocapture, i64, i32) nounwind
1536
1537 which compiles to:
1538
1539 _foo:
1540         subl    $140, %esp
1541         movaps  %xmm3, 112(%esp)
1542         movaps  %xmm2, 96(%esp)
1543         movaps  %xmm1, 80(%esp)
1544         movaps  %xmm0, 64(%esp)
1545         movl    60(%esp), %eax
1546         movl    %eax, 124(%esp)
1547         movl    56(%esp), %eax
1548         movl    %eax, 120(%esp)
1549         movl    52(%esp), %eax
1550         <many many more 32-bit copies>
1551         movaps  (%esp), %xmm0
1552         movaps  16(%esp), %xmm1
1553         movaps  32(%esp), %xmm2
1554         movaps  48(%esp), %xmm3
1555         addl    $140, %esp
1556         ret
1557
1558 On Nehalem, it may even be cheaper to just use movups when unaligned than to
1559 fall back to lower-granularity chunks.
1560
1561 //===---------------------------------------------------------------------===//
1562
1563 Implement processor-specific optimizations for parity with GCC on these
1564 processors.  GCC does two optimizations:
1565
1566 1. ix86_pad_returns inserts a noop before ret instructions if immediately
1567    preceded by a conditional branch or is the target of a jump.
1568 2. ix86_avoid_jump_misspredicts inserts noops in cases where a 16-byte block of
1569    code contains more than 3 branches.
1570
1571 The first one is done for all AMDs, Core2, and "Generic"
1572 The second one is done for: Atom, Pentium Pro, all AMDs, Pentium 4, Nocona,
1573   Core 2, and "Generic"
1574
1575 //===---------------------------------------------------------------------===//
1576
1577 Testcase:
1578 int a(int x) { return (x & 127) > 31; }
1579
1580 Current output:
1581         movl    4(%esp), %eax
1582         andl    $127, %eax
1583         cmpl    $31, %eax
1584         seta    %al
1585         movzbl  %al, %eax
1586         ret
1587
1588 Ideal output:
1589         xorl    %eax, %eax
1590         testl   $96, 4(%esp)
1591         setne   %al
1592         ret
1593
1594 This should definitely be done in instcombine, canonicalizing the range
1595 condition into a != condition.  We get this IR:
1596
1597 define i32 @a(i32 %x) nounwind readnone {
1598 entry:
1599         %0 = and i32 %x, 127            ; <i32> [#uses=1]
1600         %1 = icmp ugt i32 %0, 31                ; <i1> [#uses=1]
1601         %2 = zext i1 %1 to i32          ; <i32> [#uses=1]
1602         ret i32 %2
1603 }
1604
1605 Instcombine prefers to strength reduce relational comparisons to equality
1606 comparisons when possible, this should be another case of that.  This could
1607 be handled pretty easily in InstCombiner::visitICmpInstWithInstAndIntCst, but it
1608 looks like InstCombiner::visitICmpInstWithInstAndIntCst should really already
1609 be redesigned to use ComputeMaskedBits and friends.
1610
1611
1612 //===---------------------------------------------------------------------===//
1613 Testcase:
1614 int x(int a) { return (a&0xf0)>>4; }
1615
1616 Current output:
1617         movl    4(%esp), %eax
1618         shrl    $4, %eax
1619         andl    $15, %eax
1620         ret
1621
1622 Ideal output:
1623         movzbl  4(%esp), %eax
1624         shrl    $4, %eax
1625         ret
1626
1627 //===---------------------------------------------------------------------===//
1628
1629 Re-implement atomic builtins __sync_add_and_fetch() and __sync_sub_and_fetch
1630 properly.
1631
1632 When the return value is not used (i.e. only care about the value in the
1633 memory), x86 does not have to use add to implement these. Instead, it can use
1634 add, sub, inc, dec instructions with the "lock" prefix.
1635
1636 This is currently implemented using a bit of instruction selection trick. The
1637 issue is the target independent pattern produces one output and a chain and we
1638 want to map it into one that just output a chain. The current trick is to select
1639 it into a MERGE_VALUES with the first definition being an implicit_def. The
1640 proper solution is to add new ISD opcodes for the no-output variant. DAG
1641 combiner can then transform the node before it gets to target node selection.
1642
1643 Problem #2 is we are adding a whole bunch of x86 atomic instructions when in
1644 fact these instructions are identical to the non-lock versions. We need a way to
1645 add target specific information to target nodes and have this information
1646 carried over to machine instructions. Asm printer (or JIT) can use this
1647 information to add the "lock" prefix.
1648
1649 //===---------------------------------------------------------------------===//
1650
1651 struct B {
1652   unsigned char y0 : 1;
1653 };
1654
1655 int bar(struct B* a) { return a->y0; }
1656
1657 define i32 @bar(%struct.B* nocapture %a) nounwind readonly optsize {
1658   %1 = getelementptr inbounds %struct.B* %a, i64 0, i32 0
1659   %2 = load i8* %1, align 1
1660   %3 = and i8 %2, 1
1661   %4 = zext i8 %3 to i32
1662   ret i32 %4
1663 }
1664
1665 bar:                                    # @bar
1666 # BB#0:
1667         movb    (%rdi), %al
1668         andb    $1, %al
1669         movzbl  %al, %eax
1670         ret
1671
1672 Missed optimization: should be movl+andl.
1673
1674 //===---------------------------------------------------------------------===//
1675
1676 The x86_64 abi says:
1677
1678 Booleans, when stored in a memory object, are stored as single byte objects the
1679 value of which is always 0 (false) or 1 (true).
1680
1681 We are not using this fact:
1682
1683 int bar(_Bool *a) { return *a; }
1684
1685 define i32 @bar(i8* nocapture %a) nounwind readonly optsize {
1686   %1 = load i8* %a, align 1, !tbaa !0
1687   %tmp = and i8 %1, 1
1688   %2 = zext i8 %tmp to i32
1689   ret i32 %2
1690 }
1691
1692 bar:
1693         movb    (%rdi), %al
1694         andb    $1, %al
1695         movzbl  %al, %eax
1696         ret
1697
1698 GCC produces
1699
1700 bar:
1701         movzbl  (%rdi), %eax
1702         ret
1703
1704 //===---------------------------------------------------------------------===//
1705
1706 Consider the following two functions compiled with clang:
1707 _Bool foo(int *x) { return !(*x & 4); }
1708 unsigned bar(int *x) { return !(*x & 4); }
1709
1710 foo:
1711         movl    4(%esp), %eax
1712         testb   $4, (%eax)
1713         sete    %al
1714         movzbl  %al, %eax
1715         ret
1716
1717 bar:
1718         movl    4(%esp), %eax
1719         movl    (%eax), %eax
1720         shrl    $2, %eax
1721         andl    $1, %eax
1722         xorl    $1, %eax
1723         ret
1724
1725 The second function generates more code even though the two functions are
1726 are functionally identical.
1727
1728 //===---------------------------------------------------------------------===//
1729
1730 Take the following C code:
1731 int f(int a, int b) { return (unsigned char)a == (unsigned char)b; }
1732
1733 We generate the following IR with clang:
1734 define i32 @f(i32 %a, i32 %b) nounwind readnone {
1735 entry:
1736   %tmp = xor i32 %b, %a                           ; <i32> [#uses=1]
1737   %tmp6 = and i32 %tmp, 255                       ; <i32> [#uses=1]
1738   %cmp = icmp eq i32 %tmp6, 0                     ; <i1> [#uses=1]
1739   %conv5 = zext i1 %cmp to i32                    ; <i32> [#uses=1]
1740   ret i32 %conv5
1741 }
1742
1743 And the following x86 code:
1744         xorl    %esi, %edi
1745         testb   $-1, %dil
1746         sete    %al
1747         movzbl  %al, %eax
1748         ret
1749
1750 A cmpb instead of the xorl+testb would be one instruction shorter.
1751
1752 //===---------------------------------------------------------------------===//
1753
1754 Given the following C code:
1755 int f(int a, int b) { return (signed char)a == (signed char)b; }
1756
1757 We generate the following IR with clang:
1758 define i32 @f(i32 %a, i32 %b) nounwind readnone {
1759 entry:
1760   %sext = shl i32 %a, 24                          ; <i32> [#uses=1]
1761   %conv1 = ashr i32 %sext, 24                     ; <i32> [#uses=1]
1762   %sext6 = shl i32 %b, 24                         ; <i32> [#uses=1]
1763   %conv4 = ashr i32 %sext6, 24                    ; <i32> [#uses=1]
1764   %cmp = icmp eq i32 %conv1, %conv4               ; <i1> [#uses=1]
1765   %conv5 = zext i1 %cmp to i32                    ; <i32> [#uses=1]
1766   ret i32 %conv5
1767 }
1768
1769 And the following x86 code:
1770         movsbl  %sil, %eax
1771         movsbl  %dil, %ecx
1772         cmpl    %eax, %ecx
1773         sete    %al
1774         movzbl  %al, %eax
1775         ret
1776
1777
1778 It should be possible to eliminate the sign extensions.
1779
1780 //===---------------------------------------------------------------------===//
1781
1782 LLVM misses a load+store narrowing opportunity in this code:
1783
1784 %struct.bf = type { i64, i16, i16, i32 }
1785
1786 @bfi = external global %struct.bf*                ; <%struct.bf**> [#uses=2]
1787
1788 define void @t1() nounwind ssp {
1789 entry:
1790   %0 = load %struct.bf** @bfi, align 8            ; <%struct.bf*> [#uses=1]
1791   %1 = getelementptr %struct.bf* %0, i64 0, i32 1 ; <i16*> [#uses=1]
1792   %2 = bitcast i16* %1 to i32*                    ; <i32*> [#uses=2]
1793   %3 = load i32* %2, align 1                      ; <i32> [#uses=1]
1794   %4 = and i32 %3, -65537                         ; <i32> [#uses=1]
1795   store i32 %4, i32* %2, align 1
1796   %5 = load %struct.bf** @bfi, align 8            ; <%struct.bf*> [#uses=1]
1797   %6 = getelementptr %struct.bf* %5, i64 0, i32 1 ; <i16*> [#uses=1]
1798   %7 = bitcast i16* %6 to i32*                    ; <i32*> [#uses=2]
1799   %8 = load i32* %7, align 1                      ; <i32> [#uses=1]
1800   %9 = and i32 %8, -131073                        ; <i32> [#uses=1]
1801   store i32 %9, i32* %7, align 1
1802   ret void
1803 }
1804
1805 LLVM currently emits this:
1806
1807   movq  bfi(%rip), %rax
1808   andl  $-65537, 8(%rax)
1809   movq  bfi(%rip), %rax
1810   andl  $-131073, 8(%rax)
1811   ret
1812
1813 It could narrow the loads and stores to emit this:
1814
1815   movq  bfi(%rip), %rax
1816   andb  $-2, 10(%rax)
1817   movq  bfi(%rip), %rax
1818   andb  $-3, 10(%rax)
1819   ret
1820
1821 The trouble is that there is a TokenFactor between the store and the
1822 load, making it non-trivial to determine if there's anything between
1823 the load and the store which would prohibit narrowing.
1824
1825 //===---------------------------------------------------------------------===//
1826
1827 This code:
1828 void foo(unsigned x) {
1829   if (x == 0) bar();
1830   else if (x == 1) qux();
1831 }
1832
1833 currently compiles into:
1834 _foo:
1835         movl    4(%esp), %eax
1836         cmpl    $1, %eax
1837         je      LBB0_3
1838         testl   %eax, %eax
1839         jne     LBB0_4
1840
1841 the testl could be removed:
1842 _foo:
1843         movl    4(%esp), %eax
1844         cmpl    $1, %eax
1845         je      LBB0_3
1846         jb      LBB0_4
1847
1848 0 is the only unsigned number < 1.
1849
1850 //===---------------------------------------------------------------------===//
1851
1852 This code:
1853
1854 %0 = type { i32, i1 }
1855
1856 define i32 @add32carry(i32 %sum, i32 %x) nounwind readnone ssp {
1857 entry:
1858   %uadd = tail call %0 @llvm.uadd.with.overflow.i32(i32 %sum, i32 %x)
1859   %cmp = extractvalue %0 %uadd, 1
1860   %inc = zext i1 %cmp to i32
1861   %add = add i32 %x, %sum
1862   %z.0 = add i32 %add, %inc
1863   ret i32 %z.0
1864 }
1865
1866 declare %0 @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone
1867
1868 compiles to:
1869
1870 _add32carry:                            ## @add32carry
1871         addl    %esi, %edi
1872         sbbl    %ecx, %ecx
1873         movl    %edi, %eax
1874         subl    %ecx, %eax
1875         ret
1876
1877 But it could be:
1878
1879 _add32carry:
1880         leal    (%rsi,%rdi), %eax
1881         cmpl    %esi, %eax
1882         adcl    $0, %eax
1883         ret
1884
1885 //===---------------------------------------------------------------------===//
1886
1887 The hot loop of 256.bzip2 contains code that looks a bit like this:
1888
1889 int foo(char *P, char *Q, int x, int y) {
1890   if (P[0] != Q[0])
1891      return P[0] < Q[0];
1892   if (P[1] != Q[1])
1893      return P[1] < Q[1];
1894   if (P[2] != Q[2])
1895      return P[2] < Q[2];
1896    return P[3] < Q[3];
1897 }
1898
1899 In the real code, we get a lot more wrong than this.  However, even in this
1900 code we generate:
1901
1902 _foo:                                   ## @foo
1903 ## BB#0:                                ## %entry
1904         movb    (%rsi), %al
1905         movb    (%rdi), %cl
1906         cmpb    %al, %cl
1907         je      LBB0_2
1908 LBB0_1:                                 ## %if.then
1909         cmpb    %al, %cl
1910         jmp     LBB0_5
1911 LBB0_2:                                 ## %if.end
1912         movb    1(%rsi), %al
1913         movb    1(%rdi), %cl
1914         cmpb    %al, %cl
1915         jne     LBB0_1
1916 ## BB#3:                                ## %if.end38
1917         movb    2(%rsi), %al
1918         movb    2(%rdi), %cl
1919         cmpb    %al, %cl
1920         jne     LBB0_1
1921 ## BB#4:                                ## %if.end60
1922         movb    3(%rdi), %al
1923         cmpb    3(%rsi), %al
1924 LBB0_5:                                 ## %if.end60
1925         setl    %al
1926         movzbl  %al, %eax
1927         ret
1928
1929 Note that we generate jumps to LBB0_1 which does a redundant compare.  The
1930 redundant compare also forces the register values to be live, which prevents
1931 folding one of the loads into the compare.  In contrast, GCC 4.2 produces:
1932
1933 _foo:
1934         movzbl  (%rsi), %eax
1935         cmpb    %al, (%rdi)
1936         jne     L10
1937 L12:
1938         movzbl  1(%rsi), %eax
1939         cmpb    %al, 1(%rdi)
1940         jne     L10
1941         movzbl  2(%rsi), %eax
1942         cmpb    %al, 2(%rdi)
1943         jne     L10
1944         movzbl  3(%rdi), %eax
1945         cmpb    3(%rsi), %al
1946 L10:
1947         setl    %al
1948         movzbl  %al, %eax
1949         ret
1950
1951 which is "perfect".
1952
1953 //===---------------------------------------------------------------------===//
1954
1955 For the branch in the following code:
1956 int a();
1957 int b(int x, int y) {
1958   if (x & (1<<(y&7)))
1959     return a();
1960   return y;
1961 }
1962
1963 We currently generate:
1964         movb    %sil, %al
1965         andb    $7, %al
1966         movzbl  %al, %eax
1967         btl     %eax, %edi
1968         jae     .LBB0_2
1969
1970 movl+andl would be shorter than the movb+andb+movzbl sequence.
1971
1972 //===---------------------------------------------------------------------===//
1973
1974 For the following:
1975 struct u1 {
1976     float x, y;
1977 };
1978 float foo(struct u1 u) {
1979     return u.x + u.y;
1980 }
1981
1982 We currently generate:
1983         movdqa  %xmm0, %xmm1
1984         pshufd  $1, %xmm0, %xmm0        # xmm0 = xmm0[1,0,0,0]
1985         addss   %xmm1, %xmm0
1986         ret
1987
1988 We could save an instruction here by commuting the addss.
1989
1990 //===---------------------------------------------------------------------===//
1991
1992 This (from PR9661):
1993
1994 float clamp_float(float a) {
1995         if (a > 1.0f)
1996                 return 1.0f;
1997         else if (a < 0.0f)
1998                 return 0.0f;
1999         else
2000                 return a;
2001 }
2002
2003 Could compile to:
2004
2005 clamp_float:                            # @clamp_float
2006         movss   .LCPI0_0(%rip), %xmm1
2007         minss   %xmm1, %xmm0
2008         pxor    %xmm1, %xmm1
2009         maxss   %xmm1, %xmm0
2010         ret
2011
2012 with -ffast-math.
2013
2014 //===---------------------------------------------------------------------===//
2015
2016 This function (from PR9803):
2017
2018 int clamp2(int a) {
2019         if (a > 5)
2020                 a = 5;
2021         if (a < 0)
2022                 return 0;
2023         return a;
2024 }
2025
2026 Compiles to:
2027
2028 _clamp2:                                ## @clamp2
2029         pushq   %rbp
2030         movq    %rsp, %rbp
2031         cmpl    $5, %edi
2032         movl    $5, %ecx
2033         cmovlel %edi, %ecx
2034         testl   %ecx, %ecx
2035         movl    $0, %eax
2036         cmovnsl %ecx, %eax
2037         popq    %rbp
2038         ret
2039
2040 The move of 0 could be scheduled above the test to make it is xor reg,reg.
2041
2042 //===---------------------------------------------------------------------===//
2043
2044 GCC PR48986.  We currently compile this:
2045
2046 void bar(void);
2047 void yyy(int* p) {
2048     if (__sync_fetch_and_add(p, -1) == 1)
2049       bar();
2050 }
2051
2052 into:
2053         movl    $-1, %eax
2054         lock
2055         xaddl   %eax, (%rdi)
2056         cmpl    $1, %eax
2057         je      LBB0_2
2058
2059 Instead we could generate:
2060
2061         lock
2062         dec %rdi
2063         je LBB0_2
2064
2065 The trick is to match "fetch_and_add(X, -C) == C".
2066
2067 //===---------------------------------------------------------------------===//
2068