lib/Target/X86/README.txt

   1 //===---------------------------------------------------------------------===//
   2 // Random ideas for the X86 backend.
   3 //===---------------------------------------------------------------------===//
   4
   5
   6 //===---------------------------------------------------------------------===//
   7
   8 CodeGen/X86/lea-3.ll:test3 should be a single LEA, not a shift/move.  The X86
   9 backend knows how to three-addressify this shift, but it appears the register
  10 allocator isn't even asking it to do so in this case.  We should investigate
  11 why this isn't happening, it could have significant impact on other important
  12 cases for X86 as well.
  13
  14 //===---------------------------------------------------------------------===//
  15
  16 This should be one DIV/IDIV instruction, not a libcall:
  17
  18 unsigned test(unsigned long long X, unsigned Y) {
  19         return X/Y;
  20 }
  21
  22 This can be done trivially with a custom legalizer.  What about overflow
  23 though?  http://gcc.gnu.org/bugzilla/show_bug.cgi?id=14224
  24
  25 //===---------------------------------------------------------------------===//
  26
  27 Improvements to the multiply -> shift/add algorithm:
  28 http://gcc.gnu.org/ml/gcc-patches/2004-08/msg01590.html
  29
  30 //===---------------------------------------------------------------------===//
  31
  32 Improve code like this (occurs fairly frequently, e.g. in LLVM):
  33 long long foo(int x) { return 1LL << x; }
  34
  35 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01109.html
  36 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01128.html
  37 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html
  38
  39 Another useful one would be  ~0ULL >> X and ~0ULL << X.
  40
  41 One better solution for 1LL << x is:
  42         xorl    %eax, %eax
  43         xorl    %edx, %edx
  44         testb   $32, %cl
  45         sete    %al
  46         setne   %dl
  47         sall    %cl, %eax
  48         sall    %cl, %edx
  49
  50 But that requires good 8-bit subreg support.
  51
  52 Also, this might be better.  It's an extra shift, but it's one instruction
  53 shorter, and doesn't stress 8-bit subreg support.
  54 (From http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01148.html,
  55 but without the unnecessary and.)
  56         movl %ecx, %eax
  57         shrl $5, %eax
  58         movl %eax, %edx
  59         xorl $1, %edx
  60         sall %cl, %eax
  61         sall %cl, %edx
  62
  63 64-bit shifts (in general) expand to really bad code.  Instead of using
  64 cmovs, we should expand to a conditional branch like GCC produces.
  65
  66 //===---------------------------------------------------------------------===//
  67
  68 Compile this:
  69 _Bool f(_Bool a) { return a!=1; }
  70
  71 into:
  72         movzbl  %dil, %eax
  73         xorl    $1, %eax
  74         ret
  75
  76 (Although note that this isn't a legal way to express the code that llvm-gcc
  77 currently generates for that function.)
  78
  79 //===---------------------------------------------------------------------===//
  80
  81 Some isel ideas:
  82
  83 1. Dynamic programming based approach when compile time is not an
  84    issue.
  85 2. Code duplication (addressing mode) during isel.
  86 3. Other ideas from "Register-Sensitive Selection, Duplication, and
  87    Sequencing of Instructions".
  88 4. Scheduling for reduced register pressure.  E.g. "Minimum Register
  89    Instruction Sequence Problem: Revisiting Optimal Code Generation for DAGs"
  90    and other related papers.
  91    http://citeseer.ist.psu.edu/govindarajan01minimum.html
  92
  93 //===---------------------------------------------------------------------===//
  94
  95 Should we promote i16 to i32 to avoid partial register update stalls?
  96
  97 //===---------------------------------------------------------------------===//
  98
  99 Leave any_extend as pseudo instruction and hint to register
 100 allocator. Delay codegen until post register allocation.
 101 Note. any_extend is now turned into an INSERT_SUBREG. We still need to teach
 102 the coalescer how to deal with it though.
 103
 104 //===---------------------------------------------------------------------===//
 105
 106 It appears icc uses push for parameter passing. Need to investigate.
 107
 108 //===---------------------------------------------------------------------===//
 109
 110 Only use inc/neg/not instructions on processors where they are faster than
 111 add/sub/xor.  They are slower on the P4 due to only updating some processor
 112 flags.
 113
 114 //===---------------------------------------------------------------------===//
 115
 116 The instruction selector sometimes misses folding a load into a compare.  The
 117 pattern is written as (cmp reg, (load p)).  Because the compare isn't
 118 commutative, it is not matched with the load on both sides.  The dag combiner
 119 should be made smart enough to canonicalize the load into the RHS of a compare
 120 when it can invert the result of the compare for free.
 121
 122 //===---------------------------------------------------------------------===//
 123
 124 How about intrinsics? An example is:
 125   *res = _mm_mulhi_epu16(*A, _mm_mul_epu32(*B, *C));
 126
 127 compiles to
 128         pmuludq (%eax), %xmm0
 129         movl 8(%esp), %eax
 130         movdqa (%eax), %xmm1
 131         pmulhuw %xmm0, %xmm1
 132
 133 The transformation probably requires a X86 specific pass or a DAG combiner
 134 target specific hook.
 135
 136 //===---------------------------------------------------------------------===//
 137
 138 In many cases, LLVM generates code like this:
 139
 140 _test:
 141         movl 8(%esp), %eax
 142         cmpl %eax, 4(%esp)
 143         setl %al
 144         movzbl %al, %eax
 145         ret
 146
 147 on some processors (which ones?), it is more efficient to do this:
 148
 149 _test:
 150         movl 8(%esp), %ebx
 151         xor  %eax, %eax
 152         cmpl %ebx, 4(%esp)
 153         setl %al
 154         ret
 155
 156 Doing this correctly is tricky though, as the xor clobbers the flags.
 157
 158 //===---------------------------------------------------------------------===//
 159
 160 We should generate bts/btr/etc instructions on targets where they are cheap or
 161 when codesize is important.  e.g., for:
 162
 163 void setbit(int *target, int bit) {
 164     *target |= (1 << bit);
 165 }
 166 void clearbit(int *target, int bit) {
 167     *target &= ~(1 << bit);
 168 }
 169
 170 //===---------------------------------------------------------------------===//
 171
 172 Instead of the following for memset char*, 1, 10:
 173
 174         movl $16843009, 4(%edx)
 175         movl $16843009, (%edx)
 176         movw $257, 8(%edx)
 177
 178 It might be better to generate
 179
 180         movl $16843009, %eax
 181         movl %eax, 4(%edx)
 182         movl %eax, (%edx)
 183         movw %ax, 8(%edx)
 184
 185 when we can spare a register. It reduces code size.
 186
 187 //===---------------------------------------------------------------------===//
 188
 189 Evaluate what the best way to codegen sdiv X, (2^C) is.  For X/8, we currently
 190 get this:
 191
 192 define i32 @test1(i32 %X) {
 193     %Y = sdiv i32 %X, 8
 194     ret i32 %Y
 195 }
 196
 197 _test1:
 198         movl 4(%esp), %eax
 199         movl %eax, %ecx
 200         sarl $31, %ecx
 201         shrl $29, %ecx
 202         addl %ecx, %eax
 203         sarl $3, %eax
 204         ret
 205
 206 GCC knows several different ways to codegen it, one of which is this:
 207
 208 _test1:
 209         movl    4(%esp), %eax
 210         cmpl    $-1, %eax
 211         leal    7(%eax), %ecx
 212         cmovle  %ecx, %eax
 213         sarl    $3, %eax
 214         ret
 215
 216 which is probably slower, but it's interesting at least :)
 217
 218 //===---------------------------------------------------------------------===//
 219
 220 We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and rep/movsl.
 221 We should leave these as libcalls for everything over a much lower threshold,
 222 since libc is hand tuned for medium and large mem ops (avoiding RFO for large
 223 stores, TLB preheating, etc)
 224
 225 //===---------------------------------------------------------------------===//
 226
 227 Optimize this into something reasonable:
 228  x * copysign(1.0, y) * copysign(1.0, z)
 229
 230 //===---------------------------------------------------------------------===//
 231
 232 Optimize copysign(x, *y) to use an integer load from y.
 233
 234 //===---------------------------------------------------------------------===//
 235
 236 %X = weak global int 0
 237
 238 void %foo(int %N) {
 239         %N = cast int %N to uint
 240         %tmp.24 = setgt int %N, 0
 241         br bool %tmp.24, label %no_exit, label %return
 242
 243 no_exit:
 244         %indvar = phi uint [ 0, %entry ], [ %indvar.next, %no_exit ]
 245         %i.0.0 = cast uint %indvar to int
 246         volatile store int %i.0.0, int* %X
 247         %indvar.next = add uint %indvar, 1
 248         %exitcond = seteq uint %indvar.next, %N
 249         br bool %exitcond, label %return, label %no_exit
 250
 251 return:
 252         ret void
 253 }
 254
 255 compiles into:
 256
 257         .text
 258         .align  4
 259         .globl  _foo
 260 _foo:
 261         movl 4(%esp), %eax
 262         cmpl $1, %eax
 263         jl LBB_foo_4    # return
 264 LBB_foo_1:      # no_exit.preheader
 265         xorl %ecx, %ecx
 266 LBB_foo_2:      # no_exit
 267         movl L_X$non_lazy_ptr, %edx
 268         movl %ecx, (%edx)
 269         incl %ecx
 270         cmpl %eax, %ecx
 271         jne LBB_foo_2   # no_exit
 272 LBB_foo_3:      # return.loopexit
 273 LBB_foo_4:      # return
 274         ret
 275
 276 We should hoist "movl L_X$non_lazy_ptr, %edx" out of the loop after
 277 rematerialization is implemented. This can be accomplished with 1) a target
 278 dependent LICM pass or 2) making SelectionDAG represent the whole function.
 279
 280 //===---------------------------------------------------------------------===//
 281
 282 The following tests perform worse with LSR:
 283
 284 lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesort.
 285
 286 //===---------------------------------------------------------------------===//
 287
 288 We are generating far worse code than gcc:
 289
 290 volatile short X, Y;
 291
 292 void foo(int N) {
 293   int i;
 294   for (i = 0; i < N; i++) { X = i; Y = i*4; }
 295 }
 296
 297 LBB1_1: # entry.bb_crit_edge
 298         xorl    %ecx, %ecx
 299         xorw    %dx, %dx
 300 LBB1_2: # bb
 301         movl    L_X$non_lazy_ptr, %esi
 302         movw    %cx, (%esi)
 303         movl    L_Y$non_lazy_ptr, %esi
 304         movw    %dx, (%esi)
 305         addw    $4, %dx
 306         incl    %ecx
 307         cmpl    %eax, %ecx
 308         jne     LBB1_2  # bb
 309
 310 vs.
 311
 312         xorl    %edx, %edx
 313         movl    L_X$non_lazy_ptr-"L00000000001$pb"(%ebx), %esi
 314         movl    L_Y$non_lazy_ptr-"L00000000001$pb"(%ebx), %ecx
 315 L4:
 316         movw    %dx, (%esi)
 317         leal    0(,%edx,4), %eax
 318         movw    %ax, (%ecx)
 319         addl    $1, %edx
 320         cmpl    %edx, %edi
 321         jne     L4
 322
 323 This is due to the lack of post regalloc LICM.
 324
 325 //===---------------------------------------------------------------------===//
 326
 327 Teach the coalescer to coalesce vregs of different register classes. e.g. FR32 /
 328 FR64 to VR128.
 329
 330 //===---------------------------------------------------------------------===//
 331
 332 Adding to the list of cmp / test poor codegen issues:
 333
 334 int test(__m128 *A, __m128 *B) {
 335   if (_mm_comige_ss(*A, *B))
 336     return 3;
 337   else
 338     return 4;
 339 }
 340
 341 _test:
 342         movl 8(%esp), %eax
 343         movaps (%eax), %xmm0
 344         movl 4(%esp), %eax
 345         movaps (%eax), %xmm1
 346         comiss %xmm0, %xmm1
 347         setae %al
 348         movzbl %al, %ecx
 349         movl $3, %eax
 350         movl $4, %edx
 351         cmpl $0, %ecx
 352         cmove %edx, %eax
 353         ret
 354
 355 Note the setae, movzbl, cmpl, cmove can be replaced with a single cmovae. There
 356 are a number of issues. 1) We are introducing a setcc between the result of the
 357 intrinsic call and select. 2) The intrinsic is expected to produce an i32 value
 358 so an any extend (which becomes a zero extend) is added.
 359
 360 We probably need some kind of target DAG combine hook to fix this.
 361
 362 //===---------------------------------------------------------------------===//
 363
 364 We generate significantly worse code for this than GCC:
 365 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150
 366 http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701
 367
 368 There is also one case we do worse on PPC.
 369
 370 //===---------------------------------------------------------------------===//
 371
 372 For this:
 373
 374 int test(int a)
 375 {
 376   return a * 3;
 377 }
 378
 379 We currently emit
 380         imull $3, 4(%esp), %eax
 381
 382 Perhaps this is what we really should generate? Is imull three or four
 383 cycles? Note: ICC generates this:
 384         movl    4(%esp), %eax
 385         leal    (%eax,%eax,2), %eax
 386
 387 The current instruction priority is based on pattern complexity. The former is
 388 more "complex" because it folds a load so the latter will not be emitted.
 389
 390 Perhaps we should use AddedComplexity to give LEA32r a higher priority? We
 391 should always try to match LEA first since the LEA matching code does some
 392 estimate to determine whether the match is profitable.
 393
 394 However, if we care more about code size, then imull is better. It's two bytes
 395 shorter than movl + leal.
 396
 397 //===---------------------------------------------------------------------===//
 398
 399 __builtin_ffs codegen is messy.
 400
 401 int ffs_(unsigned X) { return __builtin_ffs(X); }
 402
 403 llvm produces:
 404 ffs_:
 405         movl    4(%esp), %ecx
 406         bsfl    %ecx, %eax
 407         movl    $32, %edx
 408         cmove   %edx, %eax
 409         incl    %eax
 410         xorl    %edx, %edx
 411         testl   %ecx, %ecx
 412         cmove   %edx, %eax
 413         ret
 414
 415 vs gcc:
 416
 417 _ffs_:
 418         movl    $-1, %edx
 419         bsfl    4(%esp), %eax
 420         cmove   %edx, %eax
 421         addl    $1, %eax
 422         ret
 423
 424 Another example of __builtin_ffs (use predsimplify to eliminate a select):
 425
 426 int foo (unsigned long j) {
 427   if (j)
 428     return __builtin_ffs (j) - 1;
 429   else
 430     return 0;
 431 }
 432
 433 //===---------------------------------------------------------------------===//
 434
 435 It appears gcc places string data with linkonce linkage in
 436 .section __TEXT,__const_coal,coalesced instead of
 437 .section __DATA,__const_coal,coalesced.
 438 Take a look at darwin.h, there are other Darwin assembler directives that we
 439 do not make use of.
 440
 441 //===---------------------------------------------------------------------===//
 442
 443 define i32 @foo(i32* %a, i32 %t) {
 444 entry:
 445         br label %cond_true
 446
 447 cond_true:              ; preds = %cond_true, %entry
 448         %x.0.0 = phi i32 [ 0, %entry ], [ %tmp9, %cond_true ]           ; <i32> [#uses=3]
 449         %t_addr.0.0 = phi i32 [ %t, %entry ], [ %tmp7, %cond_true ]             ; <i32> [#uses=1]
 450         %tmp2 = getelementptr i32* %a, i32 %x.0.0               ; <i32*> [#uses=1]
 451         %tmp3 = load i32* %tmp2         ; <i32> [#uses=1]
 452         %tmp5 = add i32 %t_addr.0.0, %x.0.0             ; <i32> [#uses=1]
 453         %tmp7 = add i32 %tmp5, %tmp3            ; <i32> [#uses=2]
 454         %tmp9 = add i32 %x.0.0, 1               ; <i32> [#uses=2]
 455         %tmp = icmp sgt i32 %tmp9, 39           ; <i1> [#uses=1]
 456         br i1 %tmp, label %bb12, label %cond_true
 457
 458 bb12:           ; preds = %cond_true
 459         ret i32 %tmp7
 460 }
 461 is pessimized by -loop-reduce and -indvars
 462
 463 //===---------------------------------------------------------------------===//
 464
 465 u32 to float conversion improvement:
 466
 467 float uint32_2_float( unsigned u ) {
 468   float fl = (int) (u & 0xffff);
 469   float fh = (int) (u >> 16);
 470   fh *= 0x1.0p16f;
 471   return fh + fl;
 472 }
 473
 474 00000000        subl    $0x04,%esp
 475 00000003        movl    0x08(%esp,1),%eax
 476 00000007        movl    %eax,%ecx
 477 00000009        shrl    $0x10,%ecx
 478 0000000c        cvtsi2ss        %ecx,%xmm0
 479 00000010        andl    $0x0000ffff,%eax
 480 00000015        cvtsi2ss        %eax,%xmm1
 481 00000019        mulss   0x00000078,%xmm0
 482 00000021        addss   %xmm1,%xmm0
 483 00000025        movss   %xmm0,(%esp,1)
 484 0000002a        flds    (%esp,1)
 485 0000002d        addl    $0x04,%esp
 486 00000030        ret
 487
 488 //===---------------------------------------------------------------------===//
 489
 490 When using fastcc abi, align stack slot of argument of type double on 8 byte
 491 boundary to improve performance.
 492
 493 //===---------------------------------------------------------------------===//
 494
 495 Codegen:
 496
 497 int f(int a, int b) {
 498   if (a == 4 || a == 6)
 499     b++;
 500   return b;
 501 }
 502
 503
 504 as:
 505
 506 or eax, 2
 507 cmp eax, 6
 508 jz label
 509
 510 //===---------------------------------------------------------------------===//
 511
 512 GCC's ix86_expand_int_movcc function (in i386.c) has a ton of interesting
 513 simplifications for integer "x cmp y ? a : b".  For example, instead of:
 514
 515 int G;
 516 void f(int X, int Y) {
 517   G = X < 0 ? 14 : 13;
 518 }
 519
 520 compiling to:
 521
 522 _f:
 523         movl $14, %eax
 524         movl $13, %ecx
 525         movl 4(%esp), %edx
 526         testl %edx, %edx
 527         cmovl %eax, %ecx
 528         movl %ecx, _G
 529         ret
 530
 531 it could be:
 532 _f:
 533         movl    4(%esp), %eax
 534         sarl    $31, %eax
 535         notl    %eax
 536         addl    $14, %eax
 537         movl    %eax, _G
 538         ret
 539
 540 etc.
 541
 542 Another is:
 543 int usesbb(unsigned int a, unsigned int b) {
 544        return (a < b ? -1 : 0);
 545 }
 546 to:
 547 _usesbb:
 548         movl    8(%esp), %eax
 549         cmpl    %eax, 4(%esp)
 550         sbbl    %eax, %eax
 551         ret
 552
 553 instead of:
 554 _usesbb:
 555         xorl    %eax, %eax
 556         movl    8(%esp), %ecx
 557         cmpl    %ecx, 4(%esp)
 558         movl    $4294967295, %ecx
 559         cmovb   %ecx, %eax
 560         ret
 561
 562 //===---------------------------------------------------------------------===//
 563
 564 Currently we don't have elimination of redundant stack manipulations. Consider
 565 the code:
 566
 567 int %main() {
 568 entry:
 569         call fastcc void %test1( )
 570         call fastcc void %test2( sbyte* cast (void ()* %test1 to sbyte*) )
 571         ret int 0
 572 }
 573
 574 declare fastcc void %test1()
 575
 576 declare fastcc void %test2(sbyte*)
 577
 578
 579 This currently compiles to:
 580
 581         subl $16, %esp
 582         call _test5
 583         addl $12, %esp
 584         subl $16, %esp
 585         movl $_test5, (%esp)
 586         call _test6
 587         addl $12, %esp
 588
 589 The add/sub pair is really unneeded here.
 590
 591 //===---------------------------------------------------------------------===//
 592
 593 Consider the expansion of:
 594
 595 define i32 @test3(i32 %X) {
 596         %tmp1 = urem i32 %X, 255
 597         ret i32 %tmp1
 598 }
 599
 600 Currently it compiles to:
 601
 602 ...
 603         movl $2155905153, %ecx
 604         movl 8(%esp), %esi
 605         movl %esi, %eax
 606         mull %ecx
 607 ...
 608
 609 This could be "reassociated" into:
 610
 611         movl $2155905153, %eax
 612         movl 8(%esp), %ecx
 613         mull %ecx
 614
 615 to avoid the copy.  In fact, the existing two-address stuff would do this
 616 except that mul isn't a commutative 2-addr instruction.  I guess this has
 617 to be done at isel time based on the #uses to mul?
 618
 619 //===---------------------------------------------------------------------===//
 620
 621 Make sure the instruction which starts a loop does not cross a cacheline
 622 boundary. This requires knowing the exact length of each machine instruction.
 623 That is somewhat complicated, but doable. Example 256.bzip2:
 624
 625 In the new trace, the hot loop has an instruction which crosses a cacheline
 626 boundary.  In addition to potential cache misses, this can't help decoding as I
 627 imagine there has to be some kind of complicated decoder reset and realignment
 628 to grab the bytes from the next cacheline.
 629
 630 532  532 0x3cfc movb     (1809(%esp, %esi), %bl   <<<--- spans 2 64 byte lines
 631 942  942 0x3d03 movl     %dh, (1809(%esp, %esi)
 632 937  937 0x3d0a incl     %esi
 633 3    3   0x3d0b cmpb     %bl, %dl
 634 27   27  0x3d0d jnz      0x000062db <main+11707>
 635
 636 //===---------------------------------------------------------------------===//
 637
 638 In c99 mode, the preprocessor doesn't like assembly comments like #TRUNCATE.
 639
 640 //===---------------------------------------------------------------------===//
 641
 642 This could be a single 16-bit load.
 643
 644 int f(char *p) {
 645     if ((p[0] == 1) & (p[1] == 2)) return 1;
 646     return 0;
 647 }
 648
 649 //===---------------------------------------------------------------------===//
 650
 651 We should inline lrintf and probably other libc functions.
 652
 653 //===---------------------------------------------------------------------===//
 654
 655 Start using the flags more.  For example, compile:
 656
 657 int add_zf(int *x, int y, int a, int b) {
 658      if ((*x += y) == 0)
 659           return a;
 660      else
 661           return b;
 662 }
 663
 664 to:
 665        addl    %esi, (%rdi)
 666        movl    %edx, %eax
 667        cmovne  %ecx, %eax
 668        ret
 669 instead of:
 670
 671 _add_zf:
 672         addl (%rdi), %esi
 673         movl %esi, (%rdi)
 674         testl %esi, %esi
 675         cmove %edx, %ecx
 676         movl %ecx, %eax
 677         ret
 678
 679 and:
 680
 681 int add_zf(int *x, int y, int a, int b) {
 682      if ((*x + y) < 0)
 683           return a;
 684      else
 685           return b;
 686 }
 687
 688 to:
 689
 690 add_zf:
 691         addl    (%rdi), %esi
 692         movl    %edx, %eax
 693         cmovns  %ecx, %eax
 694         ret
 695
 696 instead of:
 697
 698 _add_zf:
 699         addl (%rdi), %esi
 700         testl %esi, %esi
 701         cmovs %edx, %ecx
 702         movl %ecx, %eax
 703         ret
 704
 705 //===---------------------------------------------------------------------===//
 706
 707 These two functions have identical effects:
 708
 709 unsigned int f(unsigned int i, unsigned int n) {++i; if (i == n) ++i; return i;}
 710 unsigned int f2(unsigned int i, unsigned int n) {++i; i += i == n; return i;}
 711
 712 We currently compile them to:
 713
 714 _f:
 715         movl 4(%esp), %eax
 716         movl %eax, %ecx
 717         incl %ecx
 718         movl 8(%esp), %edx
 719         cmpl %edx, %ecx
 720         jne LBB1_2      #UnifiedReturnBlock
 721 LBB1_1: #cond_true
 722         addl $2, %eax
 723         ret
 724 LBB1_2: #UnifiedReturnBlock
 725         movl %ecx, %eax
 726         ret
 727 _f2:
 728         movl 4(%esp), %eax
 729         movl %eax, %ecx
 730         incl %ecx
 731         cmpl 8(%esp), %ecx
 732         sete %cl
 733         movzbl %cl, %ecx
 734         leal 1(%ecx,%eax), %eax
 735         ret
 736
 737 both of which are inferior to GCC's:
 738
 739 _f:
 740         movl    4(%esp), %edx
 741         leal    1(%edx), %eax
 742         addl    $2, %edx
 743         cmpl    8(%esp), %eax
 744         cmove   %edx, %eax
 745         ret
 746 _f2:
 747         movl    4(%esp), %eax
 748         addl    $1, %eax
 749         xorl    %edx, %edx
 750         cmpl    8(%esp), %eax
 751         sete    %dl
 752         addl    %edx, %eax
 753         ret
 754
 755 //===---------------------------------------------------------------------===//
 756
 757 This code:
 758
 759 void test(int X) {
 760   if (X) abort();
 761 }
 762
 763 is currently compiled to:
 764
 765 _test:
 766         subl $12, %esp
 767         cmpl $0, 16(%esp)
 768         jne LBB1_1
 769         addl $12, %esp
 770         ret
 771 LBB1_1:
 772         call L_abort$stub
 773
 774 It would be better to produce:
 775
 776 _test:
 777         subl $12, %esp
 778         cmpl $0, 16(%esp)
 779         jne L_abort$stub
 780         addl $12, %esp
 781         ret
 782
 783 This can be applied to any no-return function call that takes no arguments etc.
 784 Alternatively, the stack save/restore logic could be shrink-wrapped, producing
 785 something like this:
 786
 787 _test:
 788         cmpl $0, 4(%esp)
 789         jne LBB1_1
 790         ret
 791 LBB1_1:
 792         subl $12, %esp
 793         call L_abort$stub
 794
 795 Both are useful in different situations.  Finally, it could be shrink-wrapped
 796 and tail called, like this:
 797
 798 _test:
 799         cmpl $0, 4(%esp)
 800         jne LBB1_1
 801         ret
 802 LBB1_1:
 803         pop %eax   # realign stack.
 804         call L_abort$stub
 805
 806 Though this probably isn't worth it.
 807
 808 //===---------------------------------------------------------------------===//
 809
 810 We need to teach the codegen to convert two-address INC instructions to LEA
 811 when the flags are dead (likewise dec).  For example, on X86-64, compile:
 812
 813 int foo(int A, int B) {
 814   return A+1;
 815 }
 816
 817 to:
 818
 819 _foo:
 820         leal    1(%edi), %eax
 821         ret
 822
 823 instead of:
 824
 825 _foo:
 826         incl %edi
 827         movl %edi, %eax
 828         ret
 829
 830 Another example is:
 831
 832 ;; X's live range extends beyond the shift, so the register allocator
 833 ;; cannot coalesce it with Y.  Because of this, a copy needs to be
 834 ;; emitted before the shift to save the register value before it is
 835 ;; clobbered.  However, this copy is not needed if the register
 836 ;; allocator turns the shift into an LEA.  This also occurs for ADD.
 837
 838 ; Check that the shift gets turned into an LEA.
 839 ; RUN: llvm-as < %s | llc -march=x86 -x86-asm-syntax=intel | \
 840 ; RUN:   not grep {mov E.X, E.X}
 841
 842 @G = external global i32                ; <i32*> [#uses=3]
 843
 844 define i32 @test1(i32 %X, i32 %Y) {
 845         %Z = add i32 %X, %Y             ; <i32> [#uses=1]
 846         volatile store i32 %Y, i32* @G
 847         volatile store i32 %Z, i32* @G
 848         ret i32 %X
 849 }
 850
 851 define i32 @test2(i32 %X) {
 852         %Z = add i32 %X, 1              ; <i32> [#uses=1]
 853         volatile store i32 %Z, i32* @G
 854         ret i32 %X
 855 }
 856
 857 //===---------------------------------------------------------------------===//
 858
 859 Sometimes it is better to codegen subtractions from a constant (e.g. 7-x) with
 860 a neg instead of a sub instruction.  Consider:
 861
 862 int test(char X) { return 7-X; }
 863
 864 we currently produce:
 865 _test:
 866         movl $7, %eax
 867         movsbl 4(%esp), %ecx
 868         subl %ecx, %eax
 869         ret
 870
 871 We would use one fewer register if codegen'd as:
 872
 873         movsbl 4(%esp), %eax
 874         neg %eax
 875         add $7, %eax
 876         ret
 877
 878 Note that this isn't beneficial if the load can be folded into the sub.  In
 879 this case, we want a sub:
 880
 881 int test(int X) { return 7-X; }
 882 _test:
 883         movl $7, %eax
 884         subl 4(%esp), %eax
 885         ret
 886
 887 //===---------------------------------------------------------------------===//
 888
 889 Leaf functions that require one 4-byte spill slot have a prolog like this:
 890
 891 _foo:
 892         pushl   %esi
 893         subl    $4, %esp
 894 ...
 895 and an epilog like this:
 896         addl    $4, %esp
 897         popl    %esi
 898         ret
 899
 900 It would be smaller, and potentially faster, to push eax on entry and to
 901 pop into a dummy register instead of using addl/subl of esp.  Just don't pop
 902 into any return registers :)
 903
 904 //===---------------------------------------------------------------------===//
 905
 906 The X86 backend should fold (branch (or (setcc, setcc))) into multiple
 907 branches.  We generate really poor code for:
 908
 909 double testf(double a) {
 910        return a == 0.0 ? 0.0 : (a > 0.0 ? 1.0 : -1.0);
 911 }
 912
 913 For example, the entry BB is:
 914
 915 _testf:
 916         subl    $20, %esp
 917         pxor    %xmm0, %xmm0
 918         movsd   24(%esp), %xmm1
 919         ucomisd %xmm0, %xmm1
 920         setnp   %al
 921         sete    %cl
 922         testb   %cl, %al
 923         jne     LBB1_5  # UnifiedReturnBlock
 924 LBB1_1: # cond_true
 925
 926
 927 it would be better to replace the last four instructions with:
 928
 929         jp LBB1_1
 930         je LBB1_5
 931 LBB1_1:
 932
 933 We also codegen the inner ?: into a diamond:
 934
 935        cvtss2sd        LCPI1_0(%rip), %xmm2
 936         cvtss2sd        LCPI1_1(%rip), %xmm3
 937         ucomisd %xmm1, %xmm0
 938         ja      LBB1_3  # cond_true
 939 LBB1_2: # cond_true
 940         movapd  %xmm3, %xmm2
 941 LBB1_3: # cond_true
 942         movapd  %xmm2, %xmm0
 943         ret
 944
 945 We should sink the load into xmm3 into the LBB1_2 block.  This should
 946 be pretty easy, and will nuke all the copies.
 947
 948 //===---------------------------------------------------------------------===//
 949
 950 This:
 951         #include <algorithm>
 952         inline std::pair<unsigned, bool> full_add(unsigned a, unsigned b)
 953         { return std::make_pair(a + b, a + b < a); }
 954         bool no_overflow(unsigned a, unsigned b)
 955         { return !full_add(a, b).second; }
 956
 957 Should compile to:
 958
 959
 960         _Z11no_overflowjj:
 961                 addl    %edi, %esi
 962                 setae   %al
 963                 ret
 964
 965 FIXME: That code looks wrong; bool return is normally defined as zext.
 966
 967 on x86-64, not:
 968
 969 __Z11no_overflowjj:
 970         addl    %edi, %esi
 971         cmpl    %edi, %esi
 972         setae   %al
 973         movzbl  %al, %eax
 974         ret
 975
 976
 977 //===---------------------------------------------------------------------===//
 978
 979 Re-materialize MOV32r0 etc. with xor instead of changing them to moves if the
 980 condition register is dead. xor reg reg is shorter than mov reg, #0.
 981
 982 //===---------------------------------------------------------------------===//
 983
 984 We aren't matching RMW instructions aggressively
 985 enough.  Here's a reduced testcase (more in PR1160):
 986
 987 define void @test(i32* %huge_ptr, i32* %target_ptr) {
 988         %A = load i32* %huge_ptr                ; <i32> [#uses=1]
 989         %B = load i32* %target_ptr              ; <i32> [#uses=1]
 990         %C = or i32 %A, %B              ; <i32> [#uses=1]
 991         store i32 %C, i32* %target_ptr
 992         ret void
 993 }
 994
 995 $ llvm-as < t.ll | llc -march=x86-64
 996
 997 _test:
 998         movl (%rdi), %eax
 999         orl (%rsi), %eax
1000         movl %eax, (%rsi)
1001         ret
1002
1003 That should be something like:
1004
1005 _test:
1006         movl (%rdi), %eax
1007         orl %eax, (%rsi)
1008         ret
1009
1010 //===---------------------------------------------------------------------===//
1011
1012 The following code:
1013
1014 bb114.preheader:                ; preds = %cond_next94
1015         %tmp231232 = sext i16 %tmp62 to i32             ; <i32> [#uses=1]
1016         %tmp233 = sub i32 32, %tmp231232                ; <i32> [#uses=1]
1017         %tmp245246 = sext i16 %tmp65 to i32             ; <i32> [#uses=1]
1018         %tmp252253 = sext i16 %tmp68 to i32             ; <i32> [#uses=1]
1019         %tmp254 = sub i32 32, %tmp252253                ; <i32> [#uses=1]
1020         %tmp553554 = bitcast i16* %tmp37 to i8*         ; <i8*> [#uses=2]
1021         %tmp583584 = sext i16 %tmp98 to i32             ; <i32> [#uses=1]
1022         %tmp585 = sub i32 32, %tmp583584                ; <i32> [#uses=1]
1023         %tmp614615 = sext i16 %tmp101 to i32            ; <i32> [#uses=1]
1024         %tmp621622 = sext i16 %tmp104 to i32            ; <i32> [#uses=1]
1025         %tmp623 = sub i32 32, %tmp621622                ; <i32> [#uses=1]
1026         br label %bb114
1027
1028 produces:
1029
1030 LBB3_5: # bb114.preheader
1031         movswl  -68(%ebp), %eax
1032         movl    $32, %ecx
1033         movl    %ecx, -80(%ebp)
1034         subl    %eax, -80(%ebp)
1035         movswl  -52(%ebp), %eax
1036         movl    %ecx, -84(%ebp)
1037         subl    %eax, -84(%ebp)
1038         movswl  -70(%ebp), %eax
1039         movl    %ecx, -88(%ebp)
1040         subl    %eax, -88(%ebp)
1041         movswl  -50(%ebp), %eax
1042         subl    %eax, %ecx
1043         movl    %ecx, -76(%ebp)
1044         movswl  -42(%ebp), %eax
1045         movl    %eax, -92(%ebp)
1046         movswl  -66(%ebp), %eax
1047         movl    %eax, -96(%ebp)
1048         movw    $0, -98(%ebp)
1049
1050 This appears to be bad because the RA is not folding the store to the stack
1051 slot into the movl.  The above instructions could be:
1052         movl    $32, -80(%ebp)
1053 ...
1054         movl    $32, -84(%ebp)
1055 ...
1056 This seems like a cross between remat and spill folding.
1057
1058 This has redundant subtractions of %eax from a stack slot. However, %ecx doesn't
1059 change, so we could simply subtract %eax from %ecx first and then use %ecx (or
1060 vice-versa).
1061
1062 //===---------------------------------------------------------------------===//
1063
1064 This code:
1065
1066         %tmp659 = icmp slt i16 %tmp654, 0               ; <i1> [#uses=1]
1067         br i1 %tmp659, label %cond_true662, label %cond_next715
1068
1069 produces this:
1070
1071         testw   %cx, %cx
1072         movswl  %cx, %esi
1073         jns     LBB4_109        # cond_next715
1074
1075 Shark tells us that using %cx in the testw instruction is sub-optimal. It
1076 suggests using the 32-bit register (which is what ICC uses).
1077
1078 //===---------------------------------------------------------------------===//
1079
1080 We compile this:
1081
1082 void compare (long long foo) {
1083   if (foo < 4294967297LL)
1084     abort();
1085 }
1086
1087 to:
1088
1089 compare:
1090         subl    $4, %esp
1091         cmpl    $0, 8(%esp)
1092         setne   %al
1093         movzbw  %al, %ax
1094         cmpl    $1, 12(%esp)
1095         setg    %cl
1096         movzbw  %cl, %cx
1097         cmove   %ax, %cx
1098         testb   $1, %cl
1099         jne     .LBB1_2 # UnifiedReturnBlock
1100 .LBB1_1:        # ifthen
1101         call    abort
1102 .LBB1_2:        # UnifiedReturnBlock
1103         addl    $4, %esp
1104         ret
1105
1106 (also really horrible code on ppc).  This is due to the expand code for 64-bit
1107 compares.  GCC produces multiple branches, which is much nicer:
1108
1109 compare:
1110         subl    $12, %esp
1111         movl    20(%esp), %edx
1112         movl    16(%esp), %eax
1113         decl    %edx
1114         jle     .L7
1115 .L5:
1116         addl    $12, %esp
1117         ret
1118         .p2align 4,,7
1119 .L7:
1120         jl      .L4
1121         cmpl    $0, %eax
1122         .p2align 4,,8
1123         ja      .L5
1124 .L4:
1125         .p2align 4,,9
1126         call    abort
1127
1128 //===---------------------------------------------------------------------===//
1129
1130 Tail call optimization improvements: Tail call optimization currently
1131 pushes all arguments on the top of the stack (their normal place for
1132 non-tail call optimized calls) that source from the caller's arguments
1133 or that source from a virtual register (also possibly sourcing from
1134 the caller's arguments).
1135 This is done to prevent overwriting of parameters (see example
1136 below) that might be used later.
1137
1138 example:
1139
1140 int callee(int32, int64);
1141 int caller(int32 arg1, int32 arg2) {
1142   int64 local = arg2 * 2;
1143   return callee(arg2, (int64)local);
1144 }
1145
1146 [arg1]          [!arg2 no longer valid since we moved local onto it]
1147 [arg2]      ->  [(int64)
1148 [RETADDR]        local  ]
1149
1150 Moving arg1 onto the stack slot of callee function would overwrite
1151 arg2 of the caller.
1152
1153 Possible optimizations:
1154
1155
1156  - Analyse the actual parameters of the callee to see which would
1157    overwrite a caller parameter which is used by the callee and only
1158    push them onto the top of the stack.
1159
1160    int callee (int32 arg1, int32 arg2);
1161    int caller (int32 arg1, int32 arg2) {
1162        return callee(arg1,arg2);
1163    }
1164
1165    Here we don't need to write any variables to the top of the stack
1166    since they don't overwrite each other.
1167
1168    int callee (int32 arg1, int32 arg2);
1169    int caller (int32 arg1, int32 arg2) {
1170        return callee(arg2,arg1);
1171    }
1172
1173    Here we need to push the arguments because they overwrite each
1174    other.
1175
1176 //===---------------------------------------------------------------------===//
1177
1178 main ()
1179 {
1180   int i = 0;
1181   unsigned long int z = 0;
1182
1183   do {
1184     z -= 0x00004000;
1185     i++;
1186     if (i > 0x00040000)
1187       abort ();
1188   } while (z > 0);
1189   exit (0);
1190 }
1191
1192 gcc compiles this to:
1193
1194 _main:
1195         subl    $28, %esp
1196         xorl    %eax, %eax
1197         jmp     L2
1198 L3:
1199         cmpl    $262144, %eax
1200         je      L10
1201 L2:
1202         addl    $1, %eax
1203         cmpl    $262145, %eax
1204         jne     L3
1205         call    L_abort$stub
1206 L10:
1207         movl    $0, (%esp)
1208         call    L_exit$stub
1209
1210 llvm:
1211
1212 _main:
1213         subl    $12, %esp
1214         movl    $1, %eax
1215         movl    $16384, %ecx
1216 LBB1_1: # bb
1217         cmpl    $262145, %eax
1218         jge     LBB1_4  # cond_true
1219 LBB1_2: # cond_next
1220         incl    %eax
1221         addl    $4294950912, %ecx
1222         cmpl    $16384, %ecx
1223         jne     LBB1_1  # bb
1224 LBB1_3: # bb11
1225         xorl    %eax, %eax
1226         addl    $12, %esp
1227         ret
1228 LBB1_4: # cond_true
1229         call    L_abort$stub
1230
1231 1. LSR should rewrite the first cmp with induction variable %ecx.
1232 2. DAG combiner should fold
1233         leal    1(%eax), %edx
1234         cmpl    $262145, %edx
1235    =>
1236         cmpl    $262144, %eax
1237
1238 //===---------------------------------------------------------------------===//
1239
1240 define i64 @test(double %X) {
1241         %Y = fptosi double %X to i64
1242         ret i64 %Y
1243 }
1244
1245 compiles to:
1246
1247 _test:
1248         subl    $20, %esp
1249         movsd   24(%esp), %xmm0
1250         movsd   %xmm0, 8(%esp)
1251         fldl    8(%esp)
1252         fisttpll        (%esp)
1253         movl    4(%esp), %edx
1254         movl    (%esp), %eax
1255         addl    $20, %esp
1256         #FP_REG_KILL
1257         ret
1258
1259 This should just fldl directly from the input stack slot.
1260
1261 //===---------------------------------------------------------------------===//
1262
1263 This code:
1264 int foo (int x) { return (x & 65535) | 255; }
1265
1266 Should compile into:
1267
1268 _foo:
1269         movzwl  4(%esp), %eax
1270         orl     $255, %eax
1271         ret
1272
1273 instead of:
1274 _foo:
1275         movl    $255, %eax
1276         orl     4(%esp), %eax
1277         andl    $65535, %eax
1278         ret
1279
1280 //===---------------------------------------------------------------------===//
1281
1282 We're codegen'ing multiply of long longs inefficiently:
1283
1284 unsigned long long LLM(unsigned long long arg1, unsigned long long arg2) {
1285   return arg1 *  arg2;
1286 }
1287
1288 We compile to (-fomit-frame-pointer):
1289
1290 _LLM:
1291         pushl   %esi
1292         movl    8(%esp), %ecx
1293         movl    16(%esp), %esi
1294         movl    %esi, %eax
1295         mull    %ecx
1296         imull   12(%esp), %esi
1297         addl    %edx, %esi
1298         imull   20(%esp), %ecx
1299         movl    %esi, %edx
1300         addl    %ecx, %edx
1301         popl    %esi
1302         ret
1303
1304 This looks like a scheduling deficiency and lack of remat of the load from
1305 the argument area.  ICC apparently produces:
1306
1307         movl      8(%esp), %ecx
1308         imull     12(%esp), %ecx
1309         movl      16(%esp), %eax
1310         imull     4(%esp), %eax
1311         addl      %eax, %ecx
1312         movl      4(%esp), %eax
1313         mull      12(%esp)
1314         addl      %ecx, %edx
1315         ret
1316
1317 Note that it remat'd loads from 4(esp) and 12(esp).  See this GCC PR:
1318 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=17236
1319
1320 //===---------------------------------------------------------------------===//
1321
1322 We can fold a store into "zeroing a reg".  Instead of:
1323
1324 xorl    %eax, %eax
1325 movl    %eax, 124(%esp)
1326
1327 we should get:
1328
1329 movl    $0, 124(%esp)
1330
1331 if the flags of the xor are dead.
1332
1333 Likewise, we isel "x<<1" into "add reg,reg".  If reg is spilled, this should
1334 be folded into: shl [mem], 1
1335
1336 //===---------------------------------------------------------------------===//
1337
1338 This testcase misses a read/modify/write opportunity (from PR1425):
1339
1340 void vertical_decompose97iH1(int *b0, int *b1, int *b2, int width){
1341     int i;
1342     for(i=0; i<width; i++)
1343         b1[i] += (1*(b0[i] + b2[i])+0)>>0;
1344 }
1345
1346 We compile it down to:
1347
1348 LBB1_2: # bb
1349         movl    (%esi,%edi,4), %ebx
1350         addl    (%ecx,%edi,4), %ebx
1351         addl    (%edx,%edi,4), %ebx
1352         movl    %ebx, (%ecx,%edi,4)
1353         incl    %edi
1354         cmpl    %eax, %edi
1355         jne     LBB1_2  # bb
1356
1357 the inner loop should add to the memory location (%ecx,%edi,4), saving
1358 a mov.  Something like:
1359
1360         movl    (%esi,%edi,4), %ebx
1361         addl    (%edx,%edi,4), %ebx
1362         addl    %ebx, (%ecx,%edi,4)
1363
1364 Here is another interesting example:
1365
1366 void vertical_compose97iH1(int *b0, int *b1, int *b2, int width){
1367     int i;
1368     for(i=0; i<width; i++)
1369         b1[i] -= (1*(b0[i] + b2[i])+0)>>0;
1370 }
1371
1372 We miss the r/m/w opportunity here by using 2 subs instead of an add+sub[mem]:
1373
1374 LBB9_2: # bb
1375         movl    (%ecx,%edi,4), %ebx
1376         subl    (%esi,%edi,4), %ebx
1377         subl    (%edx,%edi,4), %ebx
1378         movl    %ebx, (%ecx,%edi,4)
1379         incl    %edi
1380         cmpl    %eax, %edi
1381         jne     LBB9_2  # bb
1382
1383 Additionally, LSR should rewrite the exit condition of these loops to use
1384 a stride-4 IV, which would allow all the scales in the loop to go away.
1385 This would result in smaller code and more efficient microops.
1386
1387 //===---------------------------------------------------------------------===//
1388
1389 In SSE mode, we turn abs and neg into a load from the constant pool plus an xor
1390 or an and instruction, for example:
1391
1392         xorpd   LCPI1_0, %xmm2
1393
1394 However, if xmm2 gets spilled, we end up with really ugly code like this:
1395
1396         movsd   (%esp), %xmm0
1397         xorpd   LCPI1_0, %xmm0
1398         movsd   %xmm0, (%esp)
1399
1400 Since we 'know' that this is a 'neg', we can actually "fold" the spill into
1401 the neg/abs instruction, turning it into an *integer* operation, like this:
1402
1403         xorl 2147483648, [mem+4]     ## 2147483648 = (1 << 31)
1404
1405 you could also use xorb, but xorl is less likely to lead to a partial register
1406 stall.  Here is a contrived testcase:
1407
1408 double a, b, c;
1409 void test(double *P) {
1410   double X = *P;
1411   a = X;
1412   bar();
1413   X = -X;
1414   b = X;
1415   bar();
1416   c = X;
1417 }
1418
1419 //===---------------------------------------------------------------------===//
1420
1421 Handling llvm.memory.barrier on pre-SSE2 CPUs
1422
1423 should generate:
1424 lock ; mov %esp, %esp
1425
1426 //===---------------------------------------------------------------------===//
1427
1428 The generated code on x86 for checking for signed overflow on a multiply the
1429 obvious way is much longer than it needs to be.
1430
1431 int x(int a, int b) {
1432   long long prod = (long long)a*b;
1433   return  prod > 0x7FFFFFFF || prod < (-0x7FFFFFFF-1);
1434 }
1435
1436 See PR2053 for more details.
1437
1438 //===---------------------------------------------------------------------===//
1439
1440 We should investigate using cdq/cltd (effect: edx = sar eax, 31)
1441 more aggressively; it should cost the same as a move+shift on any modern
1442 processor, but it's a lot shorter. Downside is that it puts more
1443 pressure on register allocation because it has fixed operands.
1444
1445 Example:
1446 int abs(int x) {return x < 0 ? -x : x;}
1447
1448 gcc compiles this to the following when using march/mtune=pentium2/3/4/m/etc.:
1449 abs:
1450         movl    4(%esp), %eax
1451         cltd
1452         xorl    %edx, %eax
1453         subl    %edx, %eax
1454         ret
1455
1456 //===---------------------------------------------------------------------===//
1457
1458 Consider:
1459 int test(unsigned long a, unsigned long b) { return -(a < b); }
1460
1461 We currently compile this to:
1462
1463 define i32 @test(i32 %a, i32 %b) nounwind  {
1464         %tmp3 = icmp ult i32 %a, %b             ; <i1> [#uses=1]
1465         %tmp34 = zext i1 %tmp3 to i32           ; <i32> [#uses=1]
1466         %tmp5 = sub i32 0, %tmp34               ; <i32> [#uses=1]
1467         ret i32 %tmp5
1468 }
1469
1470 and
1471
1472 _test:
1473         movl    8(%esp), %eax
1474         cmpl    %eax, 4(%esp)
1475         setb    %al
1476         movzbl  %al, %eax
1477         negl    %eax
1478         ret
1479
1480 Several deficiencies here.  First, we should instcombine zext+neg into sext:
1481
1482 define i32 @test2(i32 %a, i32 %b) nounwind  {
1483         %tmp3 = icmp ult i32 %a, %b             ; <i1> [#uses=1]
1484         %tmp34 = sext i1 %tmp3 to i32           ; <i32> [#uses=1]
1485         ret i32 %tmp34
1486 }
1487
1488 However, before we can do that, we have to fix the bad codegen that we get for
1489 sext from bool:
1490
1491 _test2:
1492         movl    8(%esp), %eax
1493         cmpl    %eax, 4(%esp)
1494         setb    %al
1495         movzbl  %al, %eax
1496         shll    $31, %eax
1497         sarl    $31, %eax
1498         ret
1499
1500 This code should be at least as good as the code above.  Once this is fixed, we
1501 can optimize this specific case even more to:
1502
1503         movl    8(%esp), %eax
1504         xorl    %ecx, %ecx
1505         cmpl    %eax, 4(%esp)
1506         sbbl    %ecx, %ecx
1507
1508 //===---------------------------------------------------------------------===//
1509
1510 Take the following code (from
1511 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=16541):
1512
1513 extern unsigned char first_one[65536];
1514 int FirstOnet(unsigned long long arg1)
1515 {
1516   if (arg1 >> 48)
1517     return (first_one[arg1 >> 48]);
1518   return 0;
1519 }
1520
1521
1522 The following code is currently generated:
1523 FirstOnet:
1524         movl    8(%esp), %eax
1525         cmpl    $65536, %eax
1526         movl    4(%esp), %ecx
1527         jb      .LBB1_2 # UnifiedReturnBlock
1528 .LBB1_1:        # ifthen
1529         shrl    $16, %eax
1530         movzbl  first_one(%eax), %eax
1531         ret
1532 .LBB1_2:        # UnifiedReturnBlock
1533         xorl    %eax, %eax
1534         ret
1535
1536 There are a few possible improvements here:
1537 1. We should be able to eliminate the dead load into %ecx
1538 2. We could change the "movl 8(%esp), %eax" into
1539    "movzwl 10(%esp), %eax"; this lets us change the cmpl
1540    into a testl, which is shorter, and eliminate the shift.
1541
1542 We could also in theory eliminate the branch by using a conditional
1543 for the address of the load, but that seems unlikely to be worthwhile
1544 in general.
1545
1546 //===---------------------------------------------------------------------===//
1547
1548 We compile this function:
1549
1550 define i32 @foo(i32 %a, i32 %b, i32 %c, i8 zeroext  %d) nounwind  {
1551 entry:
1552         %tmp2 = icmp eq i8 %d, 0                ; <i1> [#uses=1]
1553         br i1 %tmp2, label %bb7, label %bb
1554
1555 bb:             ; preds = %entry
1556         %tmp6 = add i32 %b, %a          ; <i32> [#uses=1]
1557         ret i32 %tmp6
1558
1559 bb7:            ; preds = %entry
1560         %tmp10 = sub i32 %a, %c         ; <i32> [#uses=1]
1561         ret i32 %tmp10
1562 }
1563
1564 to:
1565
1566 _foo:
1567         cmpb    $0, 16(%esp)
1568         movl    12(%esp), %ecx
1569         movl    8(%esp), %eax
1570         movl    4(%esp), %edx
1571         je      LBB1_2  # bb7
1572 LBB1_1: # bb
1573         addl    %edx, %eax
1574         ret
1575 LBB1_2: # bb7
1576         movl    %edx, %eax
1577         subl    %ecx, %eax
1578         ret
1579
1580 The coalescer could coalesce "edx" with "eax" to avoid the movl in LBB1_2
1581 if it commuted the addl in LBB1_1.
1582
1583 //===---------------------------------------------------------------------===//
1584
1585 See rdar://4653682.
1586
1587 From flops:
1588
1589 LBB1_15:        # bb310
1590         cvtss2sd        LCPI1_0, %xmm1
1591         addsd   %xmm1, %xmm0
1592         movsd   176(%esp), %xmm2
1593         mulsd   %xmm0, %xmm2
1594         movapd  %xmm2, %xmm3
1595         mulsd   %xmm3, %xmm3
1596         movapd  %xmm3, %xmm4
1597         mulsd   LCPI1_23, %xmm4
1598         addsd   LCPI1_24, %xmm4
1599         mulsd   %xmm3, %xmm4
1600         addsd   LCPI1_25, %xmm4
1601         mulsd   %xmm3, %xmm4
1602         addsd   LCPI1_26, %xmm4
1603         mulsd   %xmm3, %xmm4
1604         addsd   LCPI1_27, %xmm4
1605         mulsd   %xmm3, %xmm4
1606         addsd   LCPI1_28, %xmm4
1607         mulsd   %xmm3, %xmm4
1608         addsd   %xmm1, %xmm4
1609         mulsd   %xmm2, %xmm4
1610         movsd   152(%esp), %xmm1
1611         addsd   %xmm4, %xmm1
1612         movsd   %xmm1, 152(%esp)
1613         incl    %eax
1614         cmpl    %eax, %esi
1615         jge     LBB1_15 # bb310
1616 LBB1_16:        # bb358.loopexit
1617         movsd   152(%esp), %xmm0
1618         addsd   %xmm0, %xmm0
1619         addsd   LCPI1_22, %xmm0
1620         movsd   %xmm0, 152(%esp)
1621
1622 Rather than spilling the result of the last addsd in the loop, we should
1623 insert a copy to split the interval (one for the duration of the loop, one
1624 extending to the fall through). The register pressure in the loop isn't high
1625 enough to warrant the spill.
1626
1627 Also check why xmm7 is not used at all in the function.
1628
1629 //===---------------------------------------------------------------------===//
1630
1631 Legalize loses track of the fact that bools are always zero extended when in
1632 memory.  This causes us to compile abort_gzip (from 164.gzip) from:
1633
1634 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
1635 target triple = "i386-apple-darwin8"
1636 @in_exit.4870.b = internal global i1 false              ; <i1*> [#uses=2]
1637 define fastcc void @abort_gzip() noreturn nounwind  {
1638 entry:
1639         %tmp.b.i = load i1* @in_exit.4870.b             ; <i1> [#uses=1]
1640         br i1 %tmp.b.i, label %bb.i, label %bb4.i
1641 bb.i:           ; preds = %entry
1642         tail call void @exit( i32 1 ) noreturn nounwind
1643         unreachable
1644 bb4.i:          ; preds = %entry
1645         store i1 true, i1* @in_exit.4870.b
1646         tail call void @exit( i32 1 ) noreturn nounwind
1647         unreachable
1648 }
1649 declare void @exit(i32) noreturn nounwind
1650
1651 into:
1652
1653 _abort_gzip:
1654         subl    $12, %esp
1655         movb    _in_exit.4870.b, %al
1656         notb    %al
1657         testb   $1, %al
1658         jne     LBB1_2  ## bb4.i
1659 LBB1_1: ## bb.i
1660   ...
1661
1662 //===---------------------------------------------------------------------===//
1663
1664 We compile:
1665
1666 int test(int x, int y) {
1667   return x-y-1;
1668 }
1669
1670 into (-m64):
1671
1672 _test:
1673         decl    %edi
1674         movl    %edi, %eax
1675         subl    %esi, %eax
1676         ret
1677
1678 it would be better to codegen as: x+~y  (notl+addl)
1679
1680 //===---------------------------------------------------------------------===//
1681
1682 This code:
1683
1684 int foo(const char *str,...)
1685 {
1686  __builtin_va_list a; int x;
1687  __builtin_va_start(a,str); x = __builtin_va_arg(a,int); __builtin_va_end(a);
1688  return x;
1689 }
1690
1691 gets compiled into this on x86-64:
1692         subq    $200, %rsp
1693         movaps  %xmm7, 160(%rsp)
1694         movaps  %xmm6, 144(%rsp)
1695         movaps  %xmm5, 128(%rsp)
1696         movaps  %xmm4, 112(%rsp)
1697         movaps  %xmm3, 96(%rsp)
1698         movaps  %xmm2, 80(%rsp)
1699         movaps  %xmm1, 64(%rsp)
1700         movaps  %xmm0, 48(%rsp)
1701         movq    %r9, 40(%rsp)
1702         movq    %r8, 32(%rsp)
1703         movq    %rcx, 24(%rsp)
1704         movq    %rdx, 16(%rsp)
1705         movq    %rsi, 8(%rsp)
1706         leaq    (%rsp), %rax
1707         movq    %rax, 192(%rsp)
1708         leaq    208(%rsp), %rax
1709         movq    %rax, 184(%rsp)
1710         movl    $48, 180(%rsp)
1711         movl    $8, 176(%rsp)
1712         movl    176(%rsp), %eax
1713         cmpl    $47, %eax
1714         jbe     .LBB1_3 # bb
1715 .LBB1_1:        # bb3
1716         movq    184(%rsp), %rcx
1717         leaq    8(%rcx), %rax
1718         movq    %rax, 184(%rsp)
1719 .LBB1_2:        # bb4
1720         movl    (%rcx), %eax
1721         addq    $200, %rsp
1722         ret
1723 .LBB1_3:        # bb
1724         movl    %eax, %ecx
1725         addl    $8, %eax
1726         addq    192(%rsp), %rcx
1727         movl    %eax, 176(%rsp)
1728         jmp     .LBB1_2 # bb4
1729
1730 gcc 4.3 generates:
1731         subq    $96, %rsp
1732 .LCFI0:
1733         leaq    104(%rsp), %rax
1734         movq    %rsi, -80(%rsp)
1735         movl    $8, -120(%rsp)
1736         movq    %rax, -112(%rsp)
1737         leaq    -88(%rsp), %rax
1738         movq    %rax, -104(%rsp)
1739         movl    $8, %eax
1740         cmpl    $48, %eax
1741         jb      .L6
1742         movq    -112(%rsp), %rdx
1743         movl    (%rdx), %eax
1744         addq    $96, %rsp
1745         ret
1746         .p2align 4,,10
1747         .p2align 3
1748 .L6:
1749         mov     %eax, %edx
1750         addq    -104(%rsp), %rdx
1751         addl    $8, %eax
1752         movl    %eax, -120(%rsp)
1753         movl    (%rdx), %eax
1754         addq    $96, %rsp
1755         ret
1756
1757 and it gets compiled into this on x86:
1758         pushl   %ebp
1759         movl    %esp, %ebp
1760         subl    $4, %esp
1761         leal    12(%ebp), %eax
1762         movl    %eax, -4(%ebp)
1763         leal    16(%ebp), %eax
1764         movl    %eax, -4(%ebp)
1765         movl    12(%ebp), %eax
1766         addl    $4, %esp
1767         popl    %ebp
1768         ret
1769
1770 gcc 4.3 generates:
1771         pushl   %ebp
1772         movl    %esp, %ebp
1773         movl    12(%ebp), %eax
1774         popl    %ebp
1775         ret