lib/Target/X86/README.txt

   1 //===---------------------------------------------------------------------===//
   2 // Random ideas for the X86 backend.
   3 //===---------------------------------------------------------------------===//
   4
   5 We should add support for the "movbe" instruction, which does a byte-swapping
   6 copy (3-addr bswap + memory support?)  This is available on Atom processors.
   7
   8 //===---------------------------------------------------------------------===//
   9
  10 CodeGen/X86/lea-3.ll:test3 should be a single LEA, not a shift/move.  The X86
  11 backend knows how to three-addressify this shift, but it appears the register
  12 allocator isn't even asking it to do so in this case.  We should investigate
  13 why this isn't happening, it could have significant impact on other important
  14 cases for X86 as well.
  15
  16 //===---------------------------------------------------------------------===//
  17
  18 This should be one DIV/IDIV instruction, not a libcall:
  19
  20 unsigned test(unsigned long long X, unsigned Y) {
  21         return X/Y;
  22 }
  23
  24 This can be done trivially with a custom legalizer.  What about overflow
  25 though?  http://gcc.gnu.org/bugzilla/show_bug.cgi?id=14224
  26
  27 //===---------------------------------------------------------------------===//
  28
  29 Improvements to the multiply -> shift/add algorithm:
  30 http://gcc.gnu.org/ml/gcc-patches/2004-08/msg01590.html
  31
  32 //===---------------------------------------------------------------------===//
  33
  34 Improve code like this (occurs fairly frequently, e.g. in LLVM):
  35 long long foo(int x) { return 1LL << x; }
  36
  37 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01109.html
  38 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01128.html
  39 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html
  40
  41 Another useful one would be  ~0ULL >> X and ~0ULL << X.
  42
  43 One better solution for 1LL << x is:
  44         xorl    %eax, %eax
  45         xorl    %edx, %edx
  46         testb   $32, %cl
  47         sete    %al
  48         setne   %dl
  49         sall    %cl, %eax
  50         sall    %cl, %edx
  51
  52 But that requires good 8-bit subreg support.
  53
  54 Also, this might be better.  It's an extra shift, but it's one instruction
  55 shorter, and doesn't stress 8-bit subreg support.
  56 (From http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01148.html,
  57 but without the unnecessary and.)
  58         movl %ecx, %eax
  59         shrl $5, %eax
  60         movl %eax, %edx
  61         xorl $1, %edx
  62         sall %cl, %eax
  63         sall %cl. %edx
  64
  65 64-bit shifts (in general) expand to really bad code.  Instead of using
  66 cmovs, we should expand to a conditional branch like GCC produces.
  67
  68 //===---------------------------------------------------------------------===//
  69
  70 Some isel ideas:
  71
  72 1. Dynamic programming based approach when compile time if not an
  73    issue.
  74 2. Code duplication (addressing mode) during isel.
  75 3. Other ideas from "Register-Sensitive Selection, Duplication, and
  76    Sequencing of Instructions".
  77 4. Scheduling for reduced register pressure.  E.g. "Minimum Register
  78    Instruction Sequence Problem: Revisiting Optimal Code Generation for DAGs"
  79    and other related papers.
  80    http://citeseer.ist.psu.edu/govindarajan01minimum.html
  81
  82 //===---------------------------------------------------------------------===//
  83
  84 Should we promote i16 to i32 to avoid partial register update stalls?
  85
  86 //===---------------------------------------------------------------------===//
  87
  88 Leave any_extend as pseudo instruction and hint to register
  89 allocator. Delay codegen until post register allocation.
  90 Note. any_extend is now turned into an INSERT_SUBREG. We still need to teach
  91 the coalescer how to deal with it though.
  92
  93 //===---------------------------------------------------------------------===//
  94
  95 It appears icc use push for parameter passing. Need to investigate.
  96
  97 //===---------------------------------------------------------------------===//
  98
  99 Only use inc/neg/not instructions on processors where they are faster than
 100 add/sub/xor.  They are slower on the P4 due to only updating some processor
 101 flags.
 102
 103 //===---------------------------------------------------------------------===//
 104
 105 The instruction selector sometimes misses folding a load into a compare.  The
 106 pattern is written as (cmp reg, (load p)).  Because the compare isn't
 107 commutative, it is not matched with the load on both sides.  The dag combiner
 108 should be made smart enough to cannonicalize the load into the RHS of a compare
 109 when it can invert the result of the compare for free.
 110
 111 //===---------------------------------------------------------------------===//
 112
 113 In many cases, LLVM generates code like this:
 114
 115 _test:
 116         movl 8(%esp), %eax
 117         cmpl %eax, 4(%esp)
 118         setl %al
 119         movzbl %al, %eax
 120         ret
 121
 122 on some processors (which ones?), it is more efficient to do this:
 123
 124 _test:
 125         movl 8(%esp), %ebx
 126         xor  %eax, %eax
 127         cmpl %ebx, 4(%esp)
 128         setl %al
 129         ret
 130
 131 Doing this correctly is tricky though, as the xor clobbers the flags.
 132
 133 //===---------------------------------------------------------------------===//
 134
 135 We should generate bts/btr/etc instructions on targets where they are cheap or
 136 when codesize is important.  e.g., for:
 137
 138 void setbit(int *target, int bit) {
 139     *target |= (1 << bit);
 140 }
 141 void clearbit(int *target, int bit) {
 142     *target &= ~(1 << bit);
 143 }
 144
 145 //===---------------------------------------------------------------------===//
 146
 147 Instead of the following for memset char*, 1, 10:
 148
 149         movl $16843009, 4(%edx)
 150         movl $16843009, (%edx)
 151         movw $257, 8(%edx)
 152
 153 It might be better to generate
 154
 155         movl $16843009, %eax
 156         movl %eax, 4(%edx)
 157         movl %eax, (%edx)
 158         movw al, 8(%edx)
 159
 160 when we can spare a register. It reduces code size.
 161
 162 //===---------------------------------------------------------------------===//
 163
 164 Evaluate what the best way to codegen sdiv X, (2^C) is.  For X/8, we currently
 165 get this:
 166
 167 define i32 @test1(i32 %X) {
 168     %Y = sdiv i32 %X, 8
 169     ret i32 %Y
 170 }
 171
 172 _test1:
 173         movl 4(%esp), %eax
 174         movl %eax, %ecx
 175         sarl $31, %ecx
 176         shrl $29, %ecx
 177         addl %ecx, %eax
 178         sarl $3, %eax
 179         ret
 180
 181 GCC knows several different ways to codegen it, one of which is this:
 182
 183 _test1:
 184         movl    4(%esp), %eax
 185         cmpl    $-1, %eax
 186         leal    7(%eax), %ecx
 187         cmovle  %ecx, %eax
 188         sarl    $3, %eax
 189         ret
 190
 191 which is probably slower, but it's interesting at least :)
 192
 193 //===---------------------------------------------------------------------===//
 194
 195 We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and rep/movsl
 196 We should leave these as libcalls for everything over a much lower threshold,
 197 since libc is hand tuned for medium and large mem ops (avoiding RFO for large
 198 stores, TLB preheating, etc)
 199
 200 //===---------------------------------------------------------------------===//
 201
 202 Optimize this into something reasonable:
 203  x * copysign(1.0, y) * copysign(1.0, z)
 204
 205 //===---------------------------------------------------------------------===//
 206
 207 Optimize copysign(x, *y) to use an integer load from y.
 208
 209 //===---------------------------------------------------------------------===//
 210
 211 The following tests perform worse with LSR:
 212
 213 lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesor.
 214
 215 //===---------------------------------------------------------------------===//
 216
 217 Adding to the list of cmp / test poor codegen issues:
 218
 219 int test(__m128 *A, __m128 *B) {
 220   if (_mm_comige_ss(*A, *B))
 221     return 3;
 222   else
 223     return 4;
 224 }
 225
 226 _test:
 227         movl 8(%esp), %eax
 228         movaps (%eax), %xmm0
 229         movl 4(%esp), %eax
 230         movaps (%eax), %xmm1
 231         comiss %xmm0, %xmm1
 232         setae %al
 233         movzbl %al, %ecx
 234         movl $3, %eax
 235         movl $4, %edx
 236         cmpl $0, %ecx
 237         cmove %edx, %eax
 238         ret
 239
 240 Note the setae, movzbl, cmpl, cmove can be replaced with a single cmovae. There
 241 are a number of issues. 1) We are introducing a setcc between the result of the
 242 intrisic call and select. 2) The intrinsic is expected to produce a i32 value
 243 so a any extend (which becomes a zero extend) is added.
 244
 245 We probably need some kind of target DAG combine hook to fix this.
 246
 247 //===---------------------------------------------------------------------===//
 248
 249 We generate significantly worse code for this than GCC:
 250 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150
 251 http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701
 252
 253 There is also one case we do worse on PPC.
 254
 255 //===---------------------------------------------------------------------===//
 256
 257 For this:
 258
 259 int test(int a)
 260 {
 261   return a * 3;
 262 }
 263
 264 We currently emits
 265         imull $3, 4(%esp), %eax
 266
 267 Perhaps this is what we really should generate is? Is imull three or four
 268 cycles? Note: ICC generates this:
 269         movl    4(%esp), %eax
 270         leal    (%eax,%eax,2), %eax
 271
 272 The current instruction priority is based on pattern complexity. The former is
 273 more "complex" because it folds a load so the latter will not be emitted.
 274
 275 Perhaps we should use AddedComplexity to give LEA32r a higher priority? We
 276 should always try to match LEA first since the LEA matching code does some
 277 estimate to determine whether the match is profitable.
 278
 279 However, if we care more about code size, then imull is better. It's two bytes
 280 shorter than movl + leal.
 281
 282 On a Pentium M, both variants have the same characteristics with regard
 283 to throughput; however, the multiplication has a latency of four cycles, as
 284 opposed to two cycles for the movl+lea variant.
 285
 286 //===---------------------------------------------------------------------===//
 287
 288 __builtin_ffs codegen is messy.
 289
 290 int ffs_(unsigned X) { return __builtin_ffs(X); }
 291
 292 llvm produces:
 293 ffs_:
 294         movl    4(%esp), %ecx
 295         bsfl    %ecx, %eax
 296         movl    $32, %edx
 297         cmove   %edx, %eax
 298         incl    %eax
 299         xorl    %edx, %edx
 300         testl   %ecx, %ecx
 301         cmove   %edx, %eax
 302         ret
 303
 304 vs gcc:
 305
 306 _ffs_:
 307         movl    $-1, %edx
 308         bsfl    4(%esp), %eax
 309         cmove   %edx, %eax
 310         addl    $1, %eax
 311         ret
 312
 313 Another example of __builtin_ffs (use predsimplify to eliminate a select):
 314
 315 int foo (unsigned long j) {
 316   if (j)
 317     return __builtin_ffs (j) - 1;
 318   else
 319     return 0;
 320 }
 321
 322 //===---------------------------------------------------------------------===//
 323
 324 It appears gcc place string data with linkonce linkage in
 325 .section __TEXT,__const_coal,coalesced instead of
 326 .section __DATA,__const_coal,coalesced.
 327 Take a look at darwin.h, there are other Darwin assembler directives that we
 328 do not make use of.
 329
 330 //===---------------------------------------------------------------------===//
 331
 332 define i32 @foo(i32* %a, i32 %t) {
 333 entry:
 334         br label %cond_true
 335
 336 cond_true:              ; preds = %cond_true, %entry
 337         %x.0.0 = phi i32 [ 0, %entry ], [ %tmp9, %cond_true ]           ; <i32> [#uses=3]
 338         %t_addr.0.0 = phi i32 [ %t, %entry ], [ %tmp7, %cond_true ]             ; <i32> [#uses=1]
 339         %tmp2 = getelementptr i32* %a, i32 %x.0.0               ; <i32*> [#uses=1]
 340         %tmp3 = load i32* %tmp2         ; <i32> [#uses=1]
 341         %tmp5 = add i32 %t_addr.0.0, %x.0.0             ; <i32> [#uses=1]
 342         %tmp7 = add i32 %tmp5, %tmp3            ; <i32> [#uses=2]
 343         %tmp9 = add i32 %x.0.0, 1               ; <i32> [#uses=2]
 344         %tmp = icmp sgt i32 %tmp9, 39           ; <i1> [#uses=1]
 345         br i1 %tmp, label %bb12, label %cond_true
 346
 347 bb12:           ; preds = %cond_true
 348         ret i32 %tmp7
 349 }
 350 is pessimized by -loop-reduce and -indvars
 351
 352 //===---------------------------------------------------------------------===//
 353
 354 u32 to float conversion improvement:
 355
 356 float uint32_2_float( unsigned u ) {
 357   float fl = (int) (u & 0xffff);
 358   float fh = (int) (u >> 16);
 359   fh *= 0x1.0p16f;
 360   return fh + fl;
 361 }
 362
 363 00000000        subl    $0x04,%esp
 364 00000003        movl    0x08(%esp,1),%eax
 365 00000007        movl    %eax,%ecx
 366 00000009        shrl    $0x10,%ecx
 367 0000000c        cvtsi2ss        %ecx,%xmm0
 368 00000010        andl    $0x0000ffff,%eax
 369 00000015        cvtsi2ss        %eax,%xmm1
 370 00000019        mulss   0x00000078,%xmm0
 371 00000021        addss   %xmm1,%xmm0
 372 00000025        movss   %xmm0,(%esp,1)
 373 0000002a        flds    (%esp,1)
 374 0000002d        addl    $0x04,%esp
 375 00000030        ret
 376
 377 //===---------------------------------------------------------------------===//
 378
 379 When using fastcc abi, align stack slot of argument of type double on 8 byte
 380 boundary to improve performance.
 381
 382 //===---------------------------------------------------------------------===//
 383
 384 GCC's ix86_expand_int_movcc function (in i386.c) has a ton of interesting
 385 simplifications for integer "x cmp y ? a : b".
 386
 387 //===---------------------------------------------------------------------===//
 388
 389 Consider the expansion of:
 390
 391 define i32 @test3(i32 %X) {
 392         %tmp1 = urem i32 %X, 255
 393         ret i32 %tmp1
 394 }
 395
 396 Currently it compiles to:
 397
 398 ...
 399         movl $2155905153, %ecx
 400         movl 8(%esp), %esi
 401         movl %esi, %eax
 402         mull %ecx
 403 ...
 404
 405 This could be "reassociated" into:
 406
 407         movl $2155905153, %eax
 408         movl 8(%esp), %ecx
 409         mull %ecx
 410
 411 to avoid the copy.  In fact, the existing two-address stuff would do this
 412 except that mul isn't a commutative 2-addr instruction.  I guess this has
 413 to be done at isel time based on the #uses to mul?
 414
 415 //===---------------------------------------------------------------------===//
 416
 417 Make sure the instruction which starts a loop does not cross a cacheline
 418 boundary. This requires knowning the exact length of each machine instruction.
 419 That is somewhat complicated, but doable. Example 256.bzip2:
 420
 421 In the new trace, the hot loop has an instruction which crosses a cacheline
 422 boundary.  In addition to potential cache misses, this can't help decoding as I
 423 imagine there has to be some kind of complicated decoder reset and realignment
 424 to grab the bytes from the next cacheline.
 425
 426 532  532 0x3cfc movb     (1809(%esp, %esi), %bl   <<<--- spans 2 64 byte lines
 427 942  942 0x3d03 movl     %dh, (1809(%esp, %esi)
 428 937  937 0x3d0a incl     %esi
 429 3    3   0x3d0b cmpb     %bl, %dl
 430 27   27  0x3d0d jnz      0x000062db <main+11707>
 431
 432 //===---------------------------------------------------------------------===//
 433
 434 In c99 mode, the preprocessor doesn't like assembly comments like #TRUNCATE.
 435
 436 //===---------------------------------------------------------------------===//
 437
 438 This could be a single 16-bit load.
 439
 440 int f(char *p) {
 441     if ((p[0] == 1) & (p[1] == 2)) return 1;
 442     return 0;
 443 }
 444
 445 //===---------------------------------------------------------------------===//
 446
 447 We should inline lrintf and probably other libc functions.
 448
 449 //===---------------------------------------------------------------------===//
 450
 451 Use the FLAGS values from arithmetic instructions more.  For example, compile:
 452
 453 int add_zf(int *x, int y, int a, int b) {
 454      if ((*x += y) == 0)
 455           return a;
 456      else
 457           return b;
 458 }
 459
 460 to:
 461        addl    %esi, (%rdi)
 462        movl    %edx, %eax
 463        cmovne  %ecx, %eax
 464        ret
 465 instead of:
 466
 467 _add_zf:
 468         addl (%rdi), %esi
 469         movl %esi, (%rdi)
 470         testl %esi, %esi
 471         cmove %edx, %ecx
 472         movl %ecx, %eax
 473         ret
 474
 475 As another example, compile function f2 in test/CodeGen/X86/cmp-test.ll
 476 without a test instruction.
 477
 478 //===---------------------------------------------------------------------===//
 479
 480 These two functions have identical effects:
 481
 482 unsigned int f(unsigned int i, unsigned int n) {++i; if (i == n) ++i; return i;}
 483 unsigned int f2(unsigned int i, unsigned int n) {++i; i += i == n; return i;}
 484
 485 We currently compile them to:
 486
 487 _f:
 488         movl 4(%esp), %eax
 489         movl %eax, %ecx
 490         incl %ecx
 491         movl 8(%esp), %edx
 492         cmpl %edx, %ecx
 493         jne LBB1_2      #UnifiedReturnBlock
 494 LBB1_1: #cond_true
 495         addl $2, %eax
 496         ret
 497 LBB1_2: #UnifiedReturnBlock
 498         movl %ecx, %eax
 499         ret
 500 _f2:
 501         movl 4(%esp), %eax
 502         movl %eax, %ecx
 503         incl %ecx
 504         cmpl 8(%esp), %ecx
 505         sete %cl
 506         movzbl %cl, %ecx
 507         leal 1(%ecx,%eax), %eax
 508         ret
 509
 510 both of which are inferior to GCC's:
 511
 512 _f:
 513         movl    4(%esp), %edx
 514         leal    1(%edx), %eax
 515         addl    $2, %edx
 516         cmpl    8(%esp), %eax
 517         cmove   %edx, %eax
 518         ret
 519 _f2:
 520         movl    4(%esp), %eax
 521         addl    $1, %eax
 522         xorl    %edx, %edx
 523         cmpl    8(%esp), %eax
 524         sete    %dl
 525         addl    %edx, %eax
 526         ret
 527
 528 //===---------------------------------------------------------------------===//
 529
 530 This code:
 531
 532 void test(int X) {
 533   if (X) abort();
 534 }
 535
 536 is currently compiled to:
 537
 538 _test:
 539         subl $12, %esp
 540         cmpl $0, 16(%esp)
 541         jne LBB1_1
 542         addl $12, %esp
 543         ret
 544 LBB1_1:
 545         call L_abort$stub
 546
 547 It would be better to produce:
 548
 549 _test:
 550         subl $12, %esp
 551         cmpl $0, 16(%esp)
 552         jne L_abort$stub
 553         addl $12, %esp
 554         ret
 555
 556 This can be applied to any no-return function call that takes no arguments etc.
 557 Alternatively, the stack save/restore logic could be shrink-wrapped, producing
 558 something like this:
 559
 560 _test:
 561         cmpl $0, 4(%esp)
 562         jne LBB1_1
 563         ret
 564 LBB1_1:
 565         subl $12, %esp
 566         call L_abort$stub
 567
 568 Both are useful in different situations.  Finally, it could be shrink-wrapped
 569 and tail called, like this:
 570
 571 _test:
 572         cmpl $0, 4(%esp)
 573         jne LBB1_1
 574         ret
 575 LBB1_1:
 576         pop %eax   # realign stack.
 577         call L_abort$stub
 578
 579 Though this probably isn't worth it.
 580
 581 //===---------------------------------------------------------------------===//
 582
 583 Sometimes it is better to codegen subtractions from a constant (e.g. 7-x) with
 584 a neg instead of a sub instruction.  Consider:
 585
 586 int test(char X) { return 7-X; }
 587
 588 we currently produce:
 589 _test:
 590         movl $7, %eax
 591         movsbl 4(%esp), %ecx
 592         subl %ecx, %eax
 593         ret
 594
 595 We would use one fewer register if codegen'd as:
 596
 597         movsbl 4(%esp), %eax
 598         neg %eax
 599         add $7, %eax
 600         ret
 601
 602 Note that this isn't beneficial if the load can be folded into the sub.  In
 603 this case, we want a sub:
 604
 605 int test(int X) { return 7-X; }
 606 _test:
 607         movl $7, %eax
 608         subl 4(%esp), %eax
 609         ret
 610
 611 //===---------------------------------------------------------------------===//
 612
 613 Leaf functions that require one 4-byte spill slot have a prolog like this:
 614
 615 _foo:
 616         pushl   %esi
 617         subl    $4, %esp
 618 ...
 619 and an epilog like this:
 620         addl    $4, %esp
 621         popl    %esi
 622         ret
 623
 624 It would be smaller, and potentially faster, to push eax on entry and to
 625 pop into a dummy register instead of using addl/subl of esp.  Just don't pop
 626 into any return registers :)
 627
 628 //===---------------------------------------------------------------------===//
 629
 630 The X86 backend should fold (branch (or (setcc, setcc))) into multiple
 631 branches.  We generate really poor code for:
 632
 633 double testf(double a) {
 634        return a == 0.0 ? 0.0 : (a > 0.0 ? 1.0 : -1.0);
 635 }
 636
 637 For example, the entry BB is:
 638
 639 _testf:
 640         subl    $20, %esp
 641         pxor    %xmm0, %xmm0
 642         movsd   24(%esp), %xmm1
 643         ucomisd %xmm0, %xmm1
 644         setnp   %al
 645         sete    %cl
 646         testb   %cl, %al
 647         jne     LBB1_5  # UnifiedReturnBlock
 648 LBB1_1: # cond_true
 649
 650
 651 it would be better to replace the last four instructions with:
 652
 653         jp LBB1_1
 654         je LBB1_5
 655 LBB1_1:
 656
 657 We also codegen the inner ?: into a diamond:
 658
 659        cvtss2sd        LCPI1_0(%rip), %xmm2
 660         cvtss2sd        LCPI1_1(%rip), %xmm3
 661         ucomisd %xmm1, %xmm0
 662         ja      LBB1_3  # cond_true
 663 LBB1_2: # cond_true
 664         movapd  %xmm3, %xmm2
 665 LBB1_3: # cond_true
 666         movapd  %xmm2, %xmm0
 667         ret
 668
 669 We should sink the load into xmm3 into the LBB1_2 block.  This should
 670 be pretty easy, and will nuke all the copies.
 671
 672 //===---------------------------------------------------------------------===//
 673
 674 This:
 675         #include <algorithm>
 676         inline std::pair<unsigned, bool> full_add(unsigned a, unsigned b)
 677         { return std::make_pair(a + b, a + b < a); }
 678         bool no_overflow(unsigned a, unsigned b)
 679         { return !full_add(a, b).second; }
 680
 681 Should compile to:
 682
 683
 684         _Z11no_overflowjj:
 685                 addl    %edi, %esi
 686                 setae   %al
 687                 ret
 688
 689 FIXME: That code looks wrong; bool return is normally defined as zext.
 690
 691 on x86-64, not:
 692
 693 __Z11no_overflowjj:
 694         addl    %edi, %esi
 695         cmpl    %edi, %esi
 696         setae   %al
 697         movzbl  %al, %eax
 698         ret
 699
 700
 701 //===---------------------------------------------------------------------===//
 702
 703 The following code:
 704
 705 bb114.preheader:                ; preds = %cond_next94
 706         %tmp231232 = sext i16 %tmp62 to i32             ; <i32> [#uses=1]
 707         %tmp233 = sub i32 32, %tmp231232                ; <i32> [#uses=1]
 708         %tmp245246 = sext i16 %tmp65 to i32             ; <i32> [#uses=1]
 709         %tmp252253 = sext i16 %tmp68 to i32             ; <i32> [#uses=1]
 710         %tmp254 = sub i32 32, %tmp252253                ; <i32> [#uses=1]
 711         %tmp553554 = bitcast i16* %tmp37 to i8*         ; <i8*> [#uses=2]
 712         %tmp583584 = sext i16 %tmp98 to i32             ; <i32> [#uses=1]
 713         %tmp585 = sub i32 32, %tmp583584                ; <i32> [#uses=1]
 714         %tmp614615 = sext i16 %tmp101 to i32            ; <i32> [#uses=1]
 715         %tmp621622 = sext i16 %tmp104 to i32            ; <i32> [#uses=1]
 716         %tmp623 = sub i32 32, %tmp621622                ; <i32> [#uses=1]
 717         br label %bb114
 718
 719 produces:
 720
 721 LBB3_5: # bb114.preheader
 722         movswl  -68(%ebp), %eax
 723         movl    $32, %ecx
 724         movl    %ecx, -80(%ebp)
 725         subl    %eax, -80(%ebp)
 726         movswl  -52(%ebp), %eax
 727         movl    %ecx, -84(%ebp)
 728         subl    %eax, -84(%ebp)
 729         movswl  -70(%ebp), %eax
 730         movl    %ecx, -88(%ebp)
 731         subl    %eax, -88(%ebp)
 732         movswl  -50(%ebp), %eax
 733         subl    %eax, %ecx
 734         movl    %ecx, -76(%ebp)
 735         movswl  -42(%ebp), %eax
 736         movl    %eax, -92(%ebp)
 737         movswl  -66(%ebp), %eax
 738         movl    %eax, -96(%ebp)
 739         movw    $0, -98(%ebp)
 740
 741 This appears to be bad because the RA is not folding the store to the stack
 742 slot into the movl.  The above instructions could be:
 743         movl    $32, -80(%ebp)
 744 ...
 745         movl    $32, -84(%ebp)
 746 ...
 747 This seems like a cross between remat and spill folding.
 748
 749 This has redundant subtractions of %eax from a stack slot. However, %ecx doesn't
 750 change, so we could simply subtract %eax from %ecx first and then use %ecx (or
 751 vice-versa).
 752
 753 //===---------------------------------------------------------------------===//
 754
 755 This code:
 756
 757         %tmp659 = icmp slt i16 %tmp654, 0               ; <i1> [#uses=1]
 758         br i1 %tmp659, label %cond_true662, label %cond_next715
 759
 760 produces this:
 761
 762         testw   %cx, %cx
 763         movswl  %cx, %esi
 764         jns     LBB4_109        # cond_next715
 765
 766 Shark tells us that using %cx in the testw instruction is sub-optimal. It
 767 suggests using the 32-bit register (which is what ICC uses).
 768
 769 //===---------------------------------------------------------------------===//
 770
 771 We compile this:
 772
 773 void compare (long long foo) {
 774   if (foo < 4294967297LL)
 775     abort();
 776 }
 777
 778 to:
 779
 780 compare:
 781         subl    $4, %esp
 782         cmpl    $0, 8(%esp)
 783         setne   %al
 784         movzbw  %al, %ax
 785         cmpl    $1, 12(%esp)
 786         setg    %cl
 787         movzbw  %cl, %cx
 788         cmove   %ax, %cx
 789         testb   $1, %cl
 790         jne     .LBB1_2 # UnifiedReturnBlock
 791 .LBB1_1:        # ifthen
 792         call    abort
 793 .LBB1_2:        # UnifiedReturnBlock
 794         addl    $4, %esp
 795         ret
 796
 797 (also really horrible code on ppc).  This is due to the expand code for 64-bit
 798 compares.  GCC produces multiple branches, which is much nicer:
 799
 800 compare:
 801         subl    $12, %esp
 802         movl    20(%esp), %edx
 803         movl    16(%esp), %eax
 804         decl    %edx
 805         jle     .L7
 806 .L5:
 807         addl    $12, %esp
 808         ret
 809         .p2align 4,,7
 810 .L7:
 811         jl      .L4
 812         cmpl    $0, %eax
 813         .p2align 4,,8
 814         ja      .L5
 815 .L4:
 816         .p2align 4,,9
 817         call    abort
 818
 819 //===---------------------------------------------------------------------===//
 820
 821 Tail call optimization improvements: Tail call optimization currently
 822 pushes all arguments on the top of the stack (their normal place for
 823 non-tail call optimized calls) that source from the callers arguments
 824 or  that source from a virtual register (also possibly sourcing from
 825 callers arguments).
 826 This is done to prevent overwriting of parameters (see example
 827 below) that might be used later.
 828
 829 example:
 830
 831 int callee(int32, int64);
 832 int caller(int32 arg1, int32 arg2) {
 833   int64 local = arg2 * 2;
 834   return callee(arg2, (int64)local);
 835 }
 836
 837 [arg1]          [!arg2 no longer valid since we moved local onto it]
 838 [arg2]      ->  [(int64)
 839 [RETADDR]        local  ]
 840
 841 Moving arg1 onto the stack slot of callee function would overwrite
 842 arg2 of the caller.
 843
 844 Possible optimizations:
 845
 846
 847  - Analyse the actual parameters of the callee to see which would
 848    overwrite a caller parameter which is used by the callee and only
 849    push them onto the top of the stack.
 850
 851    int callee (int32 arg1, int32 arg2);
 852    int caller (int32 arg1, int32 arg2) {
 853        return callee(arg1,arg2);
 854    }
 855
 856    Here we don't need to write any variables to the top of the stack
 857    since they don't overwrite each other.
 858
 859    int callee (int32 arg1, int32 arg2);
 860    int caller (int32 arg1, int32 arg2) {
 861        return callee(arg2,arg1);
 862    }
 863
 864    Here we need to push the arguments because they overwrite each
 865    other.
 866
 867 //===---------------------------------------------------------------------===//
 868
 869 main ()
 870 {
 871   int i = 0;
 872   unsigned long int z = 0;
 873
 874   do {
 875     z -= 0x00004000;
 876     i++;
 877     if (i > 0x00040000)
 878       abort ();
 879   } while (z > 0);
 880   exit (0);
 881 }
 882
 883 gcc compiles this to:
 884
 885 _main:
 886         subl    $28, %esp
 887         xorl    %eax, %eax
 888         jmp     L2
 889 L3:
 890         cmpl    $262144, %eax
 891         je      L10
 892 L2:
 893         addl    $1, %eax
 894         cmpl    $262145, %eax
 895         jne     L3
 896         call    L_abort$stub
 897 L10:
 898         movl    $0, (%esp)
 899         call    L_exit$stub
 900
 901 llvm:
 902
 903 _main:
 904         subl    $12, %esp
 905         movl    $1, %eax
 906         movl    $16384, %ecx
 907 LBB1_1: # bb
 908         cmpl    $262145, %eax
 909         jge     LBB1_4  # cond_true
 910 LBB1_2: # cond_next
 911         incl    %eax
 912         addl    $4294950912, %ecx
 913         cmpl    $16384, %ecx
 914         jne     LBB1_1  # bb
 915 LBB1_3: # bb11
 916         xorl    %eax, %eax
 917         addl    $12, %esp
 918         ret
 919 LBB1_4: # cond_true
 920         call    L_abort$stub
 921
 922 1. LSR should rewrite the first cmp with induction variable %ecx.
 923 2. DAG combiner should fold
 924         leal    1(%eax), %edx
 925         cmpl    $262145, %edx
 926    =>
 927         cmpl    $262144, %eax
 928
 929 //===---------------------------------------------------------------------===//
 930
 931 define i64 @test(double %X) {
 932         %Y = fptosi double %X to i64
 933         ret i64 %Y
 934 }
 935
 936 compiles to:
 937
 938 _test:
 939         subl    $20, %esp
 940         movsd   24(%esp), %xmm0
 941         movsd   %xmm0, 8(%esp)
 942         fldl    8(%esp)
 943         fisttpll        (%esp)
 944         movl    4(%esp), %edx
 945         movl    (%esp), %eax
 946         addl    $20, %esp
 947         #FP_REG_KILL
 948         ret
 949
 950 This should just fldl directly from the input stack slot.
 951
 952 //===---------------------------------------------------------------------===//
 953
 954 This code:
 955 int foo (int x) { return (x & 65535) | 255; }
 956
 957 Should compile into:
 958
 959 _foo:
 960         movzwl  4(%esp), %eax
 961         orl     $255, %eax
 962         ret
 963
 964 instead of:
 965 _foo:
 966         movl    $255, %eax
 967         orl     4(%esp), %eax
 968         andl    $65535, %eax
 969         ret
 970
 971 //===---------------------------------------------------------------------===//
 972
 973 We're codegen'ing multiply of long longs inefficiently:
 974
 975 unsigned long long LLM(unsigned long long arg1, unsigned long long arg2) {
 976   return arg1 *  arg2;
 977 }
 978
 979 We compile to (fomit-frame-pointer):
 980
 981 _LLM:
 982         pushl   %esi
 983         movl    8(%esp), %ecx
 984         movl    16(%esp), %esi
 985         movl    %esi, %eax
 986         mull    %ecx
 987         imull   12(%esp), %esi
 988         addl    %edx, %esi
 989         imull   20(%esp), %ecx
 990         movl    %esi, %edx
 991         addl    %ecx, %edx
 992         popl    %esi
 993         ret
 994
 995 This looks like a scheduling deficiency and lack of remat of the load from
 996 the argument area.  ICC apparently produces:
 997
 998         movl      8(%esp), %ecx
 999         imull     12(%esp), %ecx
1000         movl      16(%esp), %eax
1001         imull     4(%esp), %eax
1002         addl      %eax, %ecx
1003         movl      4(%esp), %eax
1004         mull      12(%esp)
1005         addl      %ecx, %edx
1006         ret
1007
1008 Note that it remat'd loads from 4(esp) and 12(esp).  See this GCC PR:
1009 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=17236
1010
1011 //===---------------------------------------------------------------------===//
1012
1013 We can fold a store into "zeroing a reg".  Instead of:
1014
1015 xorl    %eax, %eax
1016 movl    %eax, 124(%esp)
1017
1018 we should get:
1019
1020 movl    $0, 124(%esp)
1021
1022 if the flags of the xor are dead.
1023
1024 Likewise, we isel "x<<1" into "add reg,reg".  If reg is spilled, this should
1025 be folded into: shl [mem], 1
1026
1027 //===---------------------------------------------------------------------===//
1028
1029 In SSE mode, we turn abs and neg into a load from the constant pool plus a xor
1030 or and instruction, for example:
1031
1032         xorpd   LCPI1_0, %xmm2
1033
1034 However, if xmm2 gets spilled, we end up with really ugly code like this:
1035
1036         movsd   (%esp), %xmm0
1037         xorpd   LCPI1_0, %xmm0
1038         movsd   %xmm0, (%esp)
1039
1040 Since we 'know' that this is a 'neg', we can actually "fold" the spill into
1041 the neg/abs instruction, turning it into an *integer* operation, like this:
1042
1043         xorl 2147483648, [mem+4]     ## 2147483648 = (1 << 31)
1044
1045 you could also use xorb, but xorl is less likely to lead to a partial register
1046 stall.  Here is a contrived testcase:
1047
1048 double a, b, c;
1049 void test(double *P) {
1050   double X = *P;
1051   a = X;
1052   bar();
1053   X = -X;
1054   b = X;
1055   bar();
1056   c = X;
1057 }
1058
1059 //===---------------------------------------------------------------------===//
1060
1061 The generated code on x86 for checking for signed overflow on a multiply the
1062 obvious way is much longer than it needs to be.
1063
1064 int x(int a, int b) {
1065   long long prod = (long long)a*b;
1066   return  prod > 0x7FFFFFFF || prod < (-0x7FFFFFFF-1);
1067 }
1068
1069 See PR2053 for more details.
1070
1071 //===---------------------------------------------------------------------===//
1072
1073 We should investigate using cdq/ctld (effect: edx = sar eax, 31)
1074 more aggressively; it should cost the same as a move+shift on any modern
1075 processor, but it's a lot shorter. Downside is that it puts more
1076 pressure on register allocation because it has fixed operands.
1077
1078 Example:
1079 int abs(int x) {return x < 0 ? -x : x;}
1080
1081 gcc compiles this to the following when using march/mtune=pentium2/3/4/m/etc.:
1082 abs:
1083         movl    4(%esp), %eax
1084         cltd
1085         xorl    %edx, %eax
1086         subl    %edx, %eax
1087         ret
1088
1089 //===---------------------------------------------------------------------===//
1090
1091 Take the following code (from
1092 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=16541):
1093
1094 extern unsigned char first_one[65536];
1095 int FirstOnet(unsigned long long arg1)
1096 {
1097   if (arg1 >> 48)
1098     return (first_one[arg1 >> 48]);
1099   return 0;
1100 }
1101
1102
1103 The following code is currently generated:
1104 FirstOnet:
1105         movl    8(%esp), %eax
1106         cmpl    $65536, %eax
1107         movl    4(%esp), %ecx
1108         jb      .LBB1_2 # UnifiedReturnBlock
1109 .LBB1_1:        # ifthen
1110         shrl    $16, %eax
1111         movzbl  first_one(%eax), %eax
1112         ret
1113 .LBB1_2:        # UnifiedReturnBlock
1114         xorl    %eax, %eax
1115         ret
1116
1117 We could change the "movl 8(%esp), %eax" into "movzwl 10(%esp), %eax"; this
1118 lets us change the cmpl into a testl, which is shorter, and eliminate the shift.
1119
1120 //===---------------------------------------------------------------------===//
1121
1122 We compile this function:
1123
1124 define i32 @foo(i32 %a, i32 %b, i32 %c, i8 zeroext  %d) nounwind  {
1125 entry:
1126         %tmp2 = icmp eq i8 %d, 0                ; <i1> [#uses=1]
1127         br i1 %tmp2, label %bb7, label %bb
1128
1129 bb:             ; preds = %entry
1130         %tmp6 = add i32 %b, %a          ; <i32> [#uses=1]
1131         ret i32 %tmp6
1132
1133 bb7:            ; preds = %entry
1134         %tmp10 = sub i32 %a, %c         ; <i32> [#uses=1]
1135         ret i32 %tmp10
1136 }
1137
1138 to:
1139
1140 foo:                                    # @foo
1141 # BB#0:                                 # %entry
1142         movl    4(%esp), %ecx
1143         cmpb    $0, 16(%esp)
1144         je      .LBB0_2
1145 # BB#1:                                 # %bb
1146         movl    8(%esp), %eax
1147         addl    %ecx, %eax
1148         ret
1149 .LBB0_2:                                # %bb7
1150         movl    12(%esp), %edx
1151         movl    %ecx, %eax
1152         subl    %edx, %eax
1153         ret
1154
1155 There's an obviously unnecessary movl in .LBB0_2, and we could eliminate a
1156 couple more movls by putting 4(%esp) into %eax instead of %ecx.
1157
1158 //===---------------------------------------------------------------------===//
1159
1160 See rdar://4653682.
1161
1162 From flops:
1163
1164 LBB1_15:        # bb310
1165         cvtss2sd        LCPI1_0, %xmm1
1166         addsd   %xmm1, %xmm0
1167         movsd   176(%esp), %xmm2
1168         mulsd   %xmm0, %xmm2
1169         movapd  %xmm2, %xmm3
1170         mulsd   %xmm3, %xmm3
1171         movapd  %xmm3, %xmm4
1172         mulsd   LCPI1_23, %xmm4
1173         addsd   LCPI1_24, %xmm4
1174         mulsd   %xmm3, %xmm4
1175         addsd   LCPI1_25, %xmm4
1176         mulsd   %xmm3, %xmm4
1177         addsd   LCPI1_26, %xmm4
1178         mulsd   %xmm3, %xmm4
1179         addsd   LCPI1_27, %xmm4
1180         mulsd   %xmm3, %xmm4
1181         addsd   LCPI1_28, %xmm4
1182         mulsd   %xmm3, %xmm4
1183         addsd   %xmm1, %xmm4
1184         mulsd   %xmm2, %xmm4
1185         movsd   152(%esp), %xmm1
1186         addsd   %xmm4, %xmm1
1187         movsd   %xmm1, 152(%esp)
1188         incl    %eax
1189         cmpl    %eax, %esi
1190         jge     LBB1_15 # bb310
1191 LBB1_16:        # bb358.loopexit
1192         movsd   152(%esp), %xmm0
1193         addsd   %xmm0, %xmm0
1194         addsd   LCPI1_22, %xmm0
1195         movsd   %xmm0, 152(%esp)
1196
1197 Rather than spilling the result of the last addsd in the loop, we should have
1198 insert a copy to split the interval (one for the duration of the loop, one
1199 extending to the fall through). The register pressure in the loop isn't high
1200 enough to warrant the spill.
1201
1202 Also check why xmm7 is not used at all in the function.
1203
1204 //===---------------------------------------------------------------------===//
1205
1206 Take the following:
1207
1208 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
1209 target triple = "i386-apple-darwin8"
1210 @in_exit.4870.b = internal global i1 false              ; <i1*> [#uses=2]
1211 define fastcc void @abort_gzip() noreturn nounwind  {
1212 entry:
1213         %tmp.b.i = load i1* @in_exit.4870.b             ; <i1> [#uses=1]
1214         br i1 %tmp.b.i, label %bb.i, label %bb4.i
1215 bb.i:           ; preds = %entry
1216         tail call void @exit( i32 1 ) noreturn nounwind
1217         unreachable
1218 bb4.i:          ; preds = %entry
1219         store i1 true, i1* @in_exit.4870.b
1220         tail call void @exit( i32 1 ) noreturn nounwind
1221         unreachable
1222 }
1223 declare void @exit(i32) noreturn nounwind
1224
1225 This compiles into:
1226 _abort_gzip:                            ## @abort_gzip
1227 ## BB#0:                                ## %entry
1228         subl    $12, %esp
1229         movb    _in_exit.4870.b, %al
1230         cmpb    $1, %al
1231         jne     LBB0_2
1232
1233 We somehow miss folding the movb into the cmpb.
1234
1235 //===---------------------------------------------------------------------===//
1236
1237 We compile:
1238
1239 int test(int x, int y) {
1240   return x-y-1;
1241 }
1242
1243 into (-m64):
1244
1245 _test:
1246         decl    %edi
1247         movl    %edi, %eax
1248         subl    %esi, %eax
1249         ret
1250
1251 it would be better to codegen as: x+~y  (notl+addl)
1252
1253 //===---------------------------------------------------------------------===//
1254
1255 This code:
1256
1257 int foo(const char *str,...)
1258 {
1259  __builtin_va_list a; int x;
1260  __builtin_va_start(a,str); x = __builtin_va_arg(a,int); __builtin_va_end(a);
1261  return x;
1262 }
1263
1264 gets compiled into this on x86-64:
1265         subq    $200, %rsp
1266         movaps  %xmm7, 160(%rsp)
1267         movaps  %xmm6, 144(%rsp)
1268         movaps  %xmm5, 128(%rsp)
1269         movaps  %xmm4, 112(%rsp)
1270         movaps  %xmm3, 96(%rsp)
1271         movaps  %xmm2, 80(%rsp)
1272         movaps  %xmm1, 64(%rsp)
1273         movaps  %xmm0, 48(%rsp)
1274         movq    %r9, 40(%rsp)
1275         movq    %r8, 32(%rsp)
1276         movq    %rcx, 24(%rsp)
1277         movq    %rdx, 16(%rsp)
1278         movq    %rsi, 8(%rsp)
1279         leaq    (%rsp), %rax
1280         movq    %rax, 192(%rsp)
1281         leaq    208(%rsp), %rax
1282         movq    %rax, 184(%rsp)
1283         movl    $48, 180(%rsp)
1284         movl    $8, 176(%rsp)
1285         movl    176(%rsp), %eax
1286         cmpl    $47, %eax
1287         jbe     .LBB1_3 # bb
1288 .LBB1_1:        # bb3
1289         movq    184(%rsp), %rcx
1290         leaq    8(%rcx), %rax
1291         movq    %rax, 184(%rsp)
1292 .LBB1_2:        # bb4
1293         movl    (%rcx), %eax
1294         addq    $200, %rsp
1295         ret
1296 .LBB1_3:        # bb
1297         movl    %eax, %ecx
1298         addl    $8, %eax
1299         addq    192(%rsp), %rcx
1300         movl    %eax, 176(%rsp)
1301         jmp     .LBB1_2 # bb4
1302
1303 gcc 4.3 generates:
1304         subq    $96, %rsp
1305 .LCFI0:
1306         leaq    104(%rsp), %rax
1307         movq    %rsi, -80(%rsp)
1308         movl    $8, -120(%rsp)
1309         movq    %rax, -112(%rsp)
1310         leaq    -88(%rsp), %rax
1311         movq    %rax, -104(%rsp)
1312         movl    $8, %eax
1313         cmpl    $48, %eax
1314         jb      .L6
1315         movq    -112(%rsp), %rdx
1316         movl    (%rdx), %eax
1317         addq    $96, %rsp
1318         ret
1319         .p2align 4,,10
1320         .p2align 3
1321 .L6:
1322         mov     %eax, %edx
1323         addq    -104(%rsp), %rdx
1324         addl    $8, %eax
1325         movl    %eax, -120(%rsp)
1326         movl    (%rdx), %eax
1327         addq    $96, %rsp
1328         ret
1329
1330 and it gets compiled into this on x86:
1331         pushl   %ebp
1332         movl    %esp, %ebp
1333         subl    $4, %esp
1334         leal    12(%ebp), %eax
1335         movl    %eax, -4(%ebp)
1336         leal    16(%ebp), %eax
1337         movl    %eax, -4(%ebp)
1338         movl    12(%ebp), %eax
1339         addl    $4, %esp
1340         popl    %ebp
1341         ret
1342
1343 gcc 4.3 generates:
1344         pushl   %ebp
1345         movl    %esp, %ebp
1346         movl    12(%ebp), %eax
1347         popl    %ebp
1348         ret
1349
1350 //===---------------------------------------------------------------------===//
1351
1352 Teach tblgen not to check bitconvert source type in some cases. This allows us
1353 to consolidate the following patterns in X86InstrMMX.td:
1354
1355 def : Pat<(v2i32 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
1356                                                   (iPTR 0))))),
1357           (v2i32 (MMX_MOVDQ2Qrr VR128:$src))>;
1358 def : Pat<(v4i16 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
1359                                                   (iPTR 0))))),
1360           (v4i16 (MMX_MOVDQ2Qrr VR128:$src))>;
1361 def : Pat<(v8i8 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
1362                                                   (iPTR 0))))),
1363           (v8i8 (MMX_MOVDQ2Qrr VR128:$src))>;
1364
1365 There are other cases in various td files.
1366
1367 //===---------------------------------------------------------------------===//
1368
1369 Take something like the following on x86-32:
1370 unsigned a(unsigned long long x, unsigned y) {return x % y;}
1371
1372 We currently generate a libcall, but we really shouldn't: the expansion is
1373 shorter and likely faster than the libcall.  The expected code is something
1374 like the following:
1375
1376         movl    12(%ebp), %eax
1377         movl    16(%ebp), %ecx
1378         xorl    %edx, %edx
1379         divl    %ecx
1380         movl    8(%ebp), %eax
1381         divl    %ecx
1382         movl    %edx, %eax
1383         ret
1384
1385 A similar code sequence works for division.
1386
1387 //===---------------------------------------------------------------------===//
1388
1389 These should compile to the same code, but the later codegen's to useless
1390 instructions on X86. This may be a trivial dag combine (GCC PR7061):
1391
1392 struct s1 { unsigned char a, b; };
1393 unsigned long f1(struct s1 x) {
1394     return x.a + x.b;
1395 }
1396 struct s2 { unsigned a: 8, b: 8; };
1397 unsigned long f2(struct s2 x) {
1398     return x.a + x.b;
1399 }
1400
1401 //===---------------------------------------------------------------------===//
1402
1403 We currently compile this:
1404
1405 define i32 @func1(i32 %v1, i32 %v2) nounwind {
1406 entry:
1407   %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
1408   %sum = extractvalue {i32, i1} %t, 0
1409   %obit = extractvalue {i32, i1} %t, 1
1410   br i1 %obit, label %overflow, label %normal
1411 normal:
1412   ret i32 %sum
1413 overflow:
1414   call void @llvm.trap()
1415   unreachable
1416 }
1417 declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32)
1418 declare void @llvm.trap()
1419
1420 to:
1421
1422 _func1:
1423         movl    4(%esp), %eax
1424         addl    8(%esp), %eax
1425         jo      LBB1_2  ## overflow
1426 LBB1_1: ## normal
1427         ret
1428 LBB1_2: ## overflow
1429         ud2
1430
1431 it would be nice to produce "into" someday.
1432
1433 //===---------------------------------------------------------------------===//
1434
1435 This code:
1436
1437 void vec_mpys1(int y[], const int x[], int scaler) {
1438 int i;
1439 for (i = 0; i < 150; i++)
1440  y[i] += (((long long)scaler * (long long)x[i]) >> 31);
1441 }
1442
1443 Compiles to this loop with GCC 3.x:
1444
1445 .L5:
1446         movl    %ebx, %eax
1447         imull   (%edi,%ecx,4)
1448         shrdl   $31, %edx, %eax
1449         addl    %eax, (%esi,%ecx,4)
1450         incl    %ecx
1451         cmpl    $149, %ecx
1452         jle     .L5
1453
1454 llvm-gcc compiles it to the much uglier:
1455
1456 LBB1_1: ## bb1
1457         movl    24(%esp), %eax
1458         movl    (%eax,%edi,4), %ebx
1459         movl    %ebx, %ebp
1460         imull   %esi, %ebp
1461         movl    %ebx, %eax
1462         mull    %ecx
1463         addl    %ebp, %edx
1464         sarl    $31, %ebx
1465         imull   %ecx, %ebx
1466         addl    %edx, %ebx
1467         shldl   $1, %eax, %ebx
1468         movl    20(%esp), %eax
1469         addl    %ebx, (%eax,%edi,4)
1470         incl    %edi
1471         cmpl    $150, %edi
1472         jne     LBB1_1  ## bb1
1473
1474 The issue is that we hoist the cast of "scaler" to long long outside of the
1475 loop, the value comes into the loop as two values, and
1476 RegsForValue::getCopyFromRegs doesn't know how to put an AssertSext on the
1477 constructed BUILD_PAIR which represents the cast value.
1478
1479 //===---------------------------------------------------------------------===//
1480
1481 Test instructions can be eliminated by using EFLAGS values from arithmetic
1482 instructions. This is currently not done for mul, and, or, xor, neg, shl,
1483 sra, srl, shld, shrd, atomic ops, and others. It is also currently not done
1484 for read-modify-write instructions. It is also current not done if the
1485 OF or CF flags are needed.
1486
1487 The shift operators have the complication that when the shift count is
1488 zero, EFLAGS is not set, so they can only subsume a test instruction if
1489 the shift count is known to be non-zero. Also, using the EFLAGS value
1490 from a shift is apparently very slow on some x86 implementations.
1491
1492 In read-modify-write instructions, the root node in the isel match is
1493 the store, and isel has no way for the use of the EFLAGS result of the
1494 arithmetic to be remapped to the new node.
1495
1496 Add and subtract instructions set OF on signed overflow and CF on unsiged
1497 overflow, while test instructions always clear OF and CF. In order to
1498 replace a test with an add or subtract in a situation where OF or CF is
1499 needed, codegen must be able to prove that the operation cannot see
1500 signed or unsigned overflow, respectively.
1501
1502 //===---------------------------------------------------------------------===//
1503
1504 memcpy/memmove do not lower to SSE copies when possible.  A silly example is:
1505 define <16 x float> @foo(<16 x float> %A) nounwind {
1506         %tmp = alloca <16 x float>, align 16
1507         %tmp2 = alloca <16 x float>, align 16
1508         store <16 x float> %A, <16 x float>* %tmp
1509         %s = bitcast <16 x float>* %tmp to i8*
1510         %s2 = bitcast <16 x float>* %tmp2 to i8*
1511         call void @llvm.memcpy.i64(i8* %s, i8* %s2, i64 64, i32 16)
1512         %R = load <16 x float>* %tmp2
1513         ret <16 x float> %R
1514 }
1515
1516 declare void @llvm.memcpy.i64(i8* nocapture, i8* nocapture, i64, i32) nounwind
1517
1518 which compiles to:
1519
1520 _foo:
1521         subl    $140, %esp
1522         movaps  %xmm3, 112(%esp)
1523         movaps  %xmm2, 96(%esp)
1524         movaps  %xmm1, 80(%esp)
1525         movaps  %xmm0, 64(%esp)
1526         movl    60(%esp), %eax
1527         movl    %eax, 124(%esp)
1528         movl    56(%esp), %eax
1529         movl    %eax, 120(%esp)
1530         movl    52(%esp), %eax
1531         <many many more 32-bit copies>
1532         movaps  (%esp), %xmm0
1533         movaps  16(%esp), %xmm1
1534         movaps  32(%esp), %xmm2
1535         movaps  48(%esp), %xmm3
1536         addl    $140, %esp
1537         ret
1538
1539 On Nehalem, it may even be cheaper to just use movups when unaligned than to
1540 fall back to lower-granularity chunks.
1541
1542 //===---------------------------------------------------------------------===//
1543
1544 Implement processor-specific optimizations for parity with GCC on these
1545 processors.  GCC does two optimizations:
1546
1547 1. ix86_pad_returns inserts a noop before ret instructions if immediately
1548    preceeded by a conditional branch or is the target of a jump.
1549 2. ix86_avoid_jump_misspredicts inserts noops in cases where a 16-byte block of
1550    code contains more than 3 branches.
1551
1552 The first one is done for all AMDs, Core2, and "Generic"
1553 The second one is done for: Atom, Pentium Pro, all AMDs, Pentium 4, Nocona,
1554   Core 2, and "Generic"
1555
1556 //===---------------------------------------------------------------------===//
1557
1558 Testcase:
1559 int a(int x) { return (x & 127) > 31; }
1560
1561 Current output:
1562         movl    4(%esp), %eax
1563         andl    $127, %eax
1564         cmpl    $31, %eax
1565         seta    %al
1566         movzbl  %al, %eax
1567         ret
1568
1569 Ideal output:
1570         xorl    %eax, %eax
1571         testl   $96, 4(%esp)
1572         setne   %al
1573         ret
1574
1575 This should definitely be done in instcombine, canonicalizing the range
1576 condition into a != condition.  We get this IR:
1577
1578 define i32 @a(i32 %x) nounwind readnone {
1579 entry:
1580         %0 = and i32 %x, 127            ; <i32> [#uses=1]
1581         %1 = icmp ugt i32 %0, 31                ; <i1> [#uses=1]
1582         %2 = zext i1 %1 to i32          ; <i32> [#uses=1]
1583         ret i32 %2
1584 }
1585
1586 Instcombine prefers to strength reduce relational comparisons to equality
1587 comparisons when possible, this should be another case of that.  This could
1588 be handled pretty easily in InstCombiner::visitICmpInstWithInstAndIntCst, but it
1589 looks like InstCombiner::visitICmpInstWithInstAndIntCst should really already
1590 be redesigned to use ComputeMaskedBits and friends.
1591
1592
1593 //===---------------------------------------------------------------------===//
1594 Testcase:
1595 int x(int a) { return (a&0xf0)>>4; }
1596
1597 Current output:
1598         movl    4(%esp), %eax
1599         shrl    $4, %eax
1600         andl    $15, %eax
1601         ret
1602
1603 Ideal output:
1604         movzbl  4(%esp), %eax
1605         shrl    $4, %eax
1606         ret
1607
1608 //===---------------------------------------------------------------------===//
1609
1610 Re-implement atomic builtins __sync_add_and_fetch() and __sync_sub_and_fetch
1611 properly.
1612
1613 When the return value is not used (i.e. only care about the value in the
1614 memory), x86 does not have to use add to implement these. Instead, it can use
1615 add, sub, inc, dec instructions with the "lock" prefix.
1616
1617 This is currently implemented using a bit of instruction selection trick. The
1618 issue is the target independent pattern produces one output and a chain and we
1619 want to map it into one that just output a chain. The current trick is to select
1620 it into a MERGE_VALUES with the first definition being an implicit_def. The
1621 proper solution is to add new ISD opcodes for the no-output variant. DAG
1622 combiner can then transform the node before it gets to target node selection.
1623
1624 Problem #2 is we are adding a whole bunch of x86 atomic instructions when in
1625 fact these instructions are identical to the non-lock versions. We need a way to
1626 add target specific information to target nodes and have this information
1627 carried over to machine instructions. Asm printer (or JIT) can use this
1628 information to add the "lock" prefix.
1629
1630 //===---------------------------------------------------------------------===//
1631
1632 _Bool bar(int *x) { return *x & 1; }
1633
1634 define zeroext i1 @bar(i32* nocapture %x) nounwind readonly {
1635 entry:
1636   %tmp1 = load i32* %x                            ; <i32> [#uses=1]
1637   %and = and i32 %tmp1, 1                         ; <i32> [#uses=1]
1638   %tobool = icmp ne i32 %and, 0                   ; <i1> [#uses=1]
1639   ret i1 %tobool
1640 }
1641
1642 bar:                                                        # @bar
1643 # BB#0:                                                     # %entry
1644         movl    4(%esp), %eax
1645         movb    (%eax), %al
1646         andb    $1, %al
1647         movzbl  %al, %eax
1648         ret
1649
1650 Missed optimization: should be movl+andl.
1651
1652 //===---------------------------------------------------------------------===//
1653
1654 Consider the following two functions compiled with clang:
1655 _Bool foo(int *x) { return !(*x & 4); }
1656 unsigned bar(int *x) { return !(*x & 4); }
1657
1658 foo:
1659         movl    4(%esp), %eax
1660         testb   $4, (%eax)
1661         sete    %al
1662         movzbl  %al, %eax
1663         ret
1664
1665 bar:
1666         movl    4(%esp), %eax
1667         movl    (%eax), %eax
1668         shrl    $2, %eax
1669         andl    $1, %eax
1670         xorl    $1, %eax
1671         ret
1672
1673 The second function generates more code even though the two functions are
1674 are functionally identical.
1675
1676 //===---------------------------------------------------------------------===//
1677
1678 Take the following C code:
1679 int x(int y) { return (y & 63) << 14; }
1680
1681 Code produced by gcc:
1682         andl    $63, %edi
1683         sall    $14, %edi
1684         movl    %edi, %eax
1685         ret
1686
1687 Code produced by clang:
1688         shll    $14, %edi
1689         movl    %edi, %eax
1690         andl    $1032192, %eax
1691         ret
1692
1693 The code produced by gcc is 3 bytes shorter.  This sort of construct often
1694 shows up with bitfields.
1695
1696 //===---------------------------------------------------------------------===//
1697
1698 Take the following C code:
1699 int f(int a, int b) { return (unsigned char)a == (unsigned char)b; }
1700
1701 We generate the following IR with clang:
1702 define i32 @f(i32 %a, i32 %b) nounwind readnone {
1703 entry:
1704   %tmp = xor i32 %b, %a                           ; <i32> [#uses=1]
1705   %tmp6 = and i32 %tmp, 255                       ; <i32> [#uses=1]
1706   %cmp = icmp eq i32 %tmp6, 0                     ; <i1> [#uses=1]
1707   %conv5 = zext i1 %cmp to i32                    ; <i32> [#uses=1]
1708   ret i32 %conv5
1709 }
1710
1711 And the following x86 code:
1712         xorl    %esi, %edi
1713         testb   $-1, %dil
1714         sete    %al
1715         movzbl  %al, %eax
1716         ret
1717
1718 A cmpb instead of the xorl+testb would be one instruction shorter.
1719
1720 //===---------------------------------------------------------------------===//
1721
1722 Given the following C code:
1723 int f(int a, int b) { return (signed char)a == (signed char)b; }
1724
1725 We generate the following IR with clang:
1726 define i32 @f(i32 %a, i32 %b) nounwind readnone {
1727 entry:
1728   %sext = shl i32 %a, 24                          ; <i32> [#uses=1]
1729   %conv1 = ashr i32 %sext, 24                     ; <i32> [#uses=1]
1730   %sext6 = shl i32 %b, 24                         ; <i32> [#uses=1]
1731   %conv4 = ashr i32 %sext6, 24                    ; <i32> [#uses=1]
1732   %cmp = icmp eq i32 %conv1, %conv4               ; <i1> [#uses=1]
1733   %conv5 = zext i1 %cmp to i32                    ; <i32> [#uses=1]
1734   ret i32 %conv5
1735 }
1736
1737 And the following x86 code:
1738         movsbl  %sil, %eax
1739         movsbl  %dil, %ecx
1740         cmpl    %eax, %ecx
1741         sete    %al
1742         movzbl  %al, %eax
1743         ret
1744
1745
1746 It should be possible to eliminate the sign extensions.
1747
1748 //===---------------------------------------------------------------------===//
1749
1750 LLVM misses a load+store narrowing opportunity in this code:
1751
1752 %struct.bf = type { i64, i16, i16, i32 }
1753
1754 @bfi = external global %struct.bf*                ; <%struct.bf**> [#uses=2]
1755
1756 define void @t1() nounwind ssp {
1757 entry:
1758   %0 = load %struct.bf** @bfi, align 8            ; <%struct.bf*> [#uses=1]
1759   %1 = getelementptr %struct.bf* %0, i64 0, i32 1 ; <i16*> [#uses=1]
1760   %2 = bitcast i16* %1 to i32*                    ; <i32*> [#uses=2]
1761   %3 = load i32* %2, align 1                      ; <i32> [#uses=1]
1762   %4 = and i32 %3, -65537                         ; <i32> [#uses=1]
1763   store i32 %4, i32* %2, align 1
1764   %5 = load %struct.bf** @bfi, align 8            ; <%struct.bf*> [#uses=1]
1765   %6 = getelementptr %struct.bf* %5, i64 0, i32 1 ; <i16*> [#uses=1]
1766   %7 = bitcast i16* %6 to i32*                    ; <i32*> [#uses=2]
1767   %8 = load i32* %7, align 1                      ; <i32> [#uses=1]
1768   %9 = and i32 %8, -131073                        ; <i32> [#uses=1]
1769   store i32 %9, i32* %7, align 1
1770   ret void
1771 }
1772
1773 LLVM currently emits this:
1774
1775   movq  bfi(%rip), %rax
1776   andl  $-65537, 8(%rax)
1777   movq  bfi(%rip), %rax
1778   andl  $-131073, 8(%rax)
1779   ret
1780
1781 It could narrow the loads and stores to emit this:
1782
1783   movq  bfi(%rip), %rax
1784   andb  $-2, 10(%rax)
1785   movq  bfi(%rip), %rax
1786   andb  $-3, 10(%rax)
1787   ret
1788
1789 The trouble is that there is a TokenFactor between the store and the
1790 load, making it non-trivial to determine if there's anything between
1791 the load and the store which would prohibit narrowing.
1792
1793 //===---------------------------------------------------------------------===//
1794
1795 This code:
1796 void foo(unsigned x) {
1797   if (x == 0) bar();
1798   else if (x == 1) qux();
1799 }
1800
1801 currently compiles into:
1802 _foo:
1803         movl    4(%esp), %eax
1804         cmpl    $1, %eax
1805         je      LBB0_3
1806         testl   %eax, %eax
1807         jne     LBB0_4
1808
1809 the testl could be removed:
1810 _foo:
1811         movl    4(%esp), %eax
1812         cmpl    $1, %eax
1813         je      LBB0_3
1814         jb      LBB0_4
1815
1816 0 is the only unsigned number < 1.
1817
1818 //===---------------------------------------------------------------------===//