lib/Target/X86/README.txt

   1 //===---------------------------------------------------------------------===//
   2 // Random ideas for the X86 backend.
   3 //===---------------------------------------------------------------------===//
   4
   5 Missing features:
   6   - Support for SSE4: http://www.intel.com/software/penryn
   7 http://softwarecommunity.intel.com/isn/Downloads/Intel%20SSE4%20Programming%20Reference.pdf
   8   - support for 3DNow!
   9   - weird abis?
  10
  11 //===---------------------------------------------------------------------===//
  12
  13 CodeGen/X86/lea-3.ll:test3 should be a single LEA, not a shift/move.  The X86
  14 backend knows how to three-addressify this shift, but it appears the register
  15 allocator isn't even asking it to do so in this case.  We should investigate
  16 why this isn't happening, it could have significant impact on other important
  17 cases for X86 as well.
  18
  19 //===---------------------------------------------------------------------===//
  20
  21 This should be one DIV/IDIV instruction, not a libcall:
  22
  23 unsigned test(unsigned long long X, unsigned Y) {
  24         return X/Y;
  25 }
  26
  27 This can be done trivially with a custom legalizer.  What about overflow
  28 though?  http://gcc.gnu.org/bugzilla/show_bug.cgi?id=14224
  29
  30 //===---------------------------------------------------------------------===//
  31
  32 Improvements to the multiply -> shift/add algorithm:
  33 http://gcc.gnu.org/ml/gcc-patches/2004-08/msg01590.html
  34
  35 //===---------------------------------------------------------------------===//
  36
  37 Improve code like this (occurs fairly frequently, e.g. in LLVM):
  38 long long foo(int x) { return 1LL << x; }
  39
  40 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01109.html
  41 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01128.html
  42 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html
  43
  44 Another useful one would be  ~0ULL >> X and ~0ULL << X.
  45
  46 One better solution for 1LL << x is:
  47         xorl    %eax, %eax
  48         xorl    %edx, %edx
  49         testb   $32, %cl
  50         sete    %al
  51         setne   %dl
  52         sall    %cl, %eax
  53         sall    %cl, %edx
  54
  55 But that requires good 8-bit subreg support.
  56
  57 Also, this might be better.  It's an extra shift, but it's one instruction
  58 shorter, and doesn't stress 8-bit subreg support.
  59 (From http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01148.html,
  60 but without the unnecessary and.)
  61         movl %ecx, %eax
  62         shrl $5, %eax
  63         movl %eax, %edx
  64         xorl $1, %edx
  65         sall %cl, %eax
  66         sall %cl. %edx
  67
  68 64-bit shifts (in general) expand to really bad code.  Instead of using
  69 cmovs, we should expand to a conditional branch like GCC produces.
  70
  71 //===---------------------------------------------------------------------===//
  72
  73 Compile this:
  74 _Bool f(_Bool a) { return a!=1; }
  75
  76 into:
  77         movzbl  %dil, %eax
  78         xorl    $1, %eax
  79         ret
  80
  81 (Although note that this isn't a legal way to express the code that llvm-gcc
  82 currently generates for that function.)
  83
  84 //===---------------------------------------------------------------------===//
  85
  86 Some isel ideas:
  87
  88 1. Dynamic programming based approach when compile time if not an
  89    issue.
  90 2. Code duplication (addressing mode) during isel.
  91 3. Other ideas from "Register-Sensitive Selection, Duplication, and
  92    Sequencing of Instructions".
  93 4. Scheduling for reduced register pressure.  E.g. "Minimum Register
  94    Instruction Sequence Problem: Revisiting Optimal Code Generation for DAGs"
  95    and other related papers.
  96    http://citeseer.ist.psu.edu/govindarajan01minimum.html
  97
  98 //===---------------------------------------------------------------------===//
  99
 100 Should we promote i16 to i32 to avoid partial register update stalls?
 101
 102 //===---------------------------------------------------------------------===//
 103
 104 Leave any_extend as pseudo instruction and hint to register
 105 allocator. Delay codegen until post register allocation.
 106 Note. any_extend is now turned into an INSERT_SUBREG. We still need to teach
 107 the coalescer how to deal with it though.
 108
 109 //===---------------------------------------------------------------------===//
 110
 111 It appears icc use push for parameter passing. Need to investigate.
 112
 113 //===---------------------------------------------------------------------===//
 114
 115 Only use inc/neg/not instructions on processors where they are faster than
 116 add/sub/xor.  They are slower on the P4 due to only updating some processor
 117 flags.
 118
 119 //===---------------------------------------------------------------------===//
 120
 121 The instruction selector sometimes misses folding a load into a compare.  The
 122 pattern is written as (cmp reg, (load p)).  Because the compare isn't
 123 commutative, it is not matched with the load on both sides.  The dag combiner
 124 should be made smart enough to cannonicalize the load into the RHS of a compare
 125 when it can invert the result of the compare for free.
 126
 127 //===---------------------------------------------------------------------===//
 128
 129 How about intrinsics? An example is:
 130   *res = _mm_mulhi_epu16(*A, _mm_mul_epu32(*B, *C));
 131
 132 compiles to
 133         pmuludq (%eax), %xmm0
 134         movl 8(%esp), %eax
 135         movdqa (%eax), %xmm1
 136         pmulhuw %xmm0, %xmm1
 137
 138 The transformation probably requires a X86 specific pass or a DAG combiner
 139 target specific hook.
 140
 141 //===---------------------------------------------------------------------===//
 142
 143 In many cases, LLVM generates code like this:
 144
 145 _test:
 146         movl 8(%esp), %eax
 147         cmpl %eax, 4(%esp)
 148         setl %al
 149         movzbl %al, %eax
 150         ret
 151
 152 on some processors (which ones?), it is more efficient to do this:
 153
 154 _test:
 155         movl 8(%esp), %ebx
 156         xor  %eax, %eax
 157         cmpl %ebx, 4(%esp)
 158         setl %al
 159         ret
 160
 161 Doing this correctly is tricky though, as the xor clobbers the flags.
 162
 163 //===---------------------------------------------------------------------===//
 164
 165 We should generate bts/btr/etc instructions on targets where they are cheap or
 166 when codesize is important.  e.g., for:
 167
 168 void setbit(int *target, int bit) {
 169     *target |= (1 << bit);
 170 }
 171 void clearbit(int *target, int bit) {
 172     *target &= ~(1 << bit);
 173 }
 174
 175 //===---------------------------------------------------------------------===//
 176
 177 Instead of the following for memset char*, 1, 10:
 178
 179         movl $16843009, 4(%edx)
 180         movl $16843009, (%edx)
 181         movw $257, 8(%edx)
 182
 183 It might be better to generate
 184
 185         movl $16843009, %eax
 186         movl %eax, 4(%edx)
 187         movl %eax, (%edx)
 188         movw al, 8(%edx)
 189
 190 when we can spare a register. It reduces code size.
 191
 192 //===---------------------------------------------------------------------===//
 193
 194 Evaluate what the best way to codegen sdiv X, (2^C) is.  For X/8, we currently
 195 get this:
 196
 197 define i32 @test1(i32 %X) {
 198     %Y = sdiv i32 %X, 8
 199     ret i32 %Y
 200 }
 201
 202 _test1:
 203         movl 4(%esp), %eax
 204         movl %eax, %ecx
 205         sarl $31, %ecx
 206         shrl $29, %ecx
 207         addl %ecx, %eax
 208         sarl $3, %eax
 209         ret
 210
 211 GCC knows several different ways to codegen it, one of which is this:
 212
 213 _test1:
 214         movl    4(%esp), %eax
 215         cmpl    $-1, %eax
 216         leal    7(%eax), %ecx
 217         cmovle  %ecx, %eax
 218         sarl    $3, %eax
 219         ret
 220
 221 which is probably slower, but it's interesting at least :)
 222
 223 //===---------------------------------------------------------------------===//
 224
 225 We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and rep/movsl
 226 We should leave these as libcalls for everything over a much lower threshold,
 227 since libc is hand tuned for medium and large mem ops (avoiding RFO for large
 228 stores, TLB preheating, etc)
 229
 230 //===---------------------------------------------------------------------===//
 231
 232 Optimize this into something reasonable:
 233  x * copysign(1.0, y) * copysign(1.0, z)
 234
 235 //===---------------------------------------------------------------------===//
 236
 237 Optimize copysign(x, *y) to use an integer load from y.
 238
 239 //===---------------------------------------------------------------------===//
 240
 241 %X = weak global int 0
 242
 243 void %foo(int %N) {
 244         %N = cast int %N to uint
 245         %tmp.24 = setgt int %N, 0
 246         br bool %tmp.24, label %no_exit, label %return
 247
 248 no_exit:
 249         %indvar = phi uint [ 0, %entry ], [ %indvar.next, %no_exit ]
 250         %i.0.0 = cast uint %indvar to int
 251         volatile store int %i.0.0, int* %X
 252         %indvar.next = add uint %indvar, 1
 253         %exitcond = seteq uint %indvar.next, %N
 254         br bool %exitcond, label %return, label %no_exit
 255
 256 return:
 257         ret void
 258 }
 259
 260 compiles into:
 261
 262         .text
 263         .align  4
 264         .globl  _foo
 265 _foo:
 266         movl 4(%esp), %eax
 267         cmpl $1, %eax
 268         jl LBB_foo_4    # return
 269 LBB_foo_1:      # no_exit.preheader
 270         xorl %ecx, %ecx
 271 LBB_foo_2:      # no_exit
 272         movl L_X$non_lazy_ptr, %edx
 273         movl %ecx, (%edx)
 274         incl %ecx
 275         cmpl %eax, %ecx
 276         jne LBB_foo_2   # no_exit
 277 LBB_foo_3:      # return.loopexit
 278 LBB_foo_4:      # return
 279         ret
 280
 281 We should hoist "movl L_X$non_lazy_ptr, %edx" out of the loop after
 282 remateralization is implemented. This can be accomplished with 1) a target
 283 dependent LICM pass or 2) makeing SelectDAG represent the whole function.
 284
 285 //===---------------------------------------------------------------------===//
 286
 287 The following tests perform worse with LSR:
 288
 289 lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesor.
 290
 291 //===---------------------------------------------------------------------===//
 292
 293 We are generating far worse code than gcc:
 294
 295 volatile short X, Y;
 296
 297 void foo(int N) {
 298   int i;
 299   for (i = 0; i < N; i++) { X = i; Y = i*4; }
 300 }
 301
 302 LBB1_1: # entry.bb_crit_edge
 303         xorl    %ecx, %ecx
 304         xorw    %dx, %dx
 305 LBB1_2: # bb
 306         movl    L_X$non_lazy_ptr, %esi
 307         movw    %cx, (%esi)
 308         movl    L_Y$non_lazy_ptr, %esi
 309         movw    %dx, (%esi)
 310         addw    $4, %dx
 311         incl    %ecx
 312         cmpl    %eax, %ecx
 313         jne     LBB1_2  # bb
 314
 315 vs.
 316
 317         xorl    %edx, %edx
 318         movl    L_X$non_lazy_ptr-"L00000000001$pb"(%ebx), %esi
 319         movl    L_Y$non_lazy_ptr-"L00000000001$pb"(%ebx), %ecx
 320 L4:
 321         movw    %dx, (%esi)
 322         leal    0(,%edx,4), %eax
 323         movw    %ax, (%ecx)
 324         addl    $1, %edx
 325         cmpl    %edx, %edi
 326         jne     L4
 327
 328 This is due to the lack of post regalloc LICM.
 329
 330 //===---------------------------------------------------------------------===//
 331
 332 Teach the coalescer to coalesce vregs of different register classes. e.g. FR32 /
 333 FR64 to VR128.
 334
 335 //===---------------------------------------------------------------------===//
 336
 337 mov $reg, 48(%esp)
 338 ...
 339 leal 48(%esp), %eax
 340 mov %eax, (%esp)
 341 call _foo
 342
 343 Obviously it would have been better for the first mov (or any op) to store
 344 directly %esp[0] if there are no other uses.
 345
 346 //===---------------------------------------------------------------------===//
 347
 348 Adding to the list of cmp / test poor codegen issues:
 349
 350 int test(__m128 *A, __m128 *B) {
 351   if (_mm_comige_ss(*A, *B))
 352     return 3;
 353   else
 354     return 4;
 355 }
 356
 357 _test:
 358         movl 8(%esp), %eax
 359         movaps (%eax), %xmm0
 360         movl 4(%esp), %eax
 361         movaps (%eax), %xmm1
 362         comiss %xmm0, %xmm1
 363         setae %al
 364         movzbl %al, %ecx
 365         movl $3, %eax
 366         movl $4, %edx
 367         cmpl $0, %ecx
 368         cmove %edx, %eax
 369         ret
 370
 371 Note the setae, movzbl, cmpl, cmove can be replaced with a single cmovae. There
 372 are a number of issues. 1) We are introducing a setcc between the result of the
 373 intrisic call and select. 2) The intrinsic is expected to produce a i32 value
 374 so a any extend (which becomes a zero extend) is added.
 375
 376 We probably need some kind of target DAG combine hook to fix this.
 377
 378 //===---------------------------------------------------------------------===//
 379
 380 We generate significantly worse code for this than GCC:
 381 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150
 382 http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701
 383
 384 There is also one case we do worse on PPC.
 385
 386 //===---------------------------------------------------------------------===//
 387
 388 If shorter, we should use things like:
 389 movzwl %ax, %eax
 390 instead of:
 391 andl $65535, %EAX
 392
 393 The former can also be used when the two-addressy nature of the 'and' would
 394 require a copy to be inserted (in X86InstrInfo::convertToThreeAddress).
 395
 396 //===---------------------------------------------------------------------===//
 397
 398 Another instruction selector deficiency:
 399
 400 void %bar() {
 401         %tmp = load int (int)** %foo
 402         %tmp = tail call int %tmp( int 3 )
 403         ret void
 404 }
 405
 406 _bar:
 407         subl $12, %esp
 408         movl L_foo$non_lazy_ptr, %eax
 409         movl (%eax), %eax
 410         call *%eax
 411         addl $12, %esp
 412         ret
 413
 414 The current isel scheme will not allow the load to be folded in the call since
 415 the load's chain result is read by the callseq_start.
 416
 417 //===---------------------------------------------------------------------===//
 418
 419 For this:
 420
 421 int test(int a)
 422 {
 423   return a * 3;
 424 }
 425
 426 We currently emits
 427         imull $3, 4(%esp), %eax
 428
 429 Perhaps this is what we really should generate is? Is imull three or four
 430 cycles? Note: ICC generates this:
 431         movl    4(%esp), %eax
 432         leal    (%eax,%eax,2), %eax
 433
 434 The current instruction priority is based on pattern complexity. The former is
 435 more "complex" because it folds a load so the latter will not be emitted.
 436
 437 Perhaps we should use AddedComplexity to give LEA32r a higher priority? We
 438 should always try to match LEA first since the LEA matching code does some
 439 estimate to determine whether the match is profitable.
 440
 441 However, if we care more about code size, then imull is better. It's two bytes
 442 shorter than movl + leal.
 443
 444 //===---------------------------------------------------------------------===//
 445
 446 __builtin_ffs codegen is messy.
 447
 448 int ffs_(unsigned X) { return __builtin_ffs(X); }
 449
 450 llvm produces:
 451 ffs_:
 452         movl    4(%esp), %ecx
 453         bsfl    %ecx, %eax
 454         movl    $32, %edx
 455         cmove   %edx, %eax
 456         incl    %eax
 457         xorl    %edx, %edx
 458         testl   %ecx, %ecx
 459         cmove   %edx, %eax
 460         ret
 461
 462 vs gcc:
 463
 464 _ffs_:
 465         movl    $-1, %edx
 466         bsfl    4(%esp), %eax
 467         cmove   %edx, %eax
 468         addl    $1, %eax
 469         ret
 470
 471 Another example of __builtin_ffs (use predsimplify to eliminate a select):
 472
 473 int foo (unsigned long j) {
 474   if (j)
 475     return __builtin_ffs (j) - 1;
 476   else
 477     return 0;
 478 }
 479
 480 //===---------------------------------------------------------------------===//
 481
 482 It appears gcc place string data with linkonce linkage in
 483 .section __TEXT,__const_coal,coalesced instead of
 484 .section __DATA,__const_coal,coalesced.
 485 Take a look at darwin.h, there are other Darwin assembler directives that we
 486 do not make use of.
 487
 488 //===---------------------------------------------------------------------===//
 489
 490 define i32 @foo(i32* %a, i32 %t) {
 491 entry:
 492         br label %cond_true
 493
 494 cond_true:              ; preds = %cond_true, %entry
 495         %x.0.0 = phi i32 [ 0, %entry ], [ %tmp9, %cond_true ]           ; <i32> [#uses=3]
 496         %t_addr.0.0 = phi i32 [ %t, %entry ], [ %tmp7, %cond_true ]             ; <i32> [#uses=1]
 497         %tmp2 = getelementptr i32* %a, i32 %x.0.0               ; <i32*> [#uses=1]
 498         %tmp3 = load i32* %tmp2         ; <i32> [#uses=1]
 499         %tmp5 = add i32 %t_addr.0.0, %x.0.0             ; <i32> [#uses=1]
 500         %tmp7 = add i32 %tmp5, %tmp3            ; <i32> [#uses=2]
 501         %tmp9 = add i32 %x.0.0, 1               ; <i32> [#uses=2]
 502         %tmp = icmp sgt i32 %tmp9, 39           ; <i1> [#uses=1]
 503         br i1 %tmp, label %bb12, label %cond_true
 504
 505 bb12:           ; preds = %cond_true
 506         ret i32 %tmp7
 507 }
 508 is pessimized by -loop-reduce and -indvars
 509
 510 //===---------------------------------------------------------------------===//
 511
 512 u32 to float conversion improvement:
 513
 514 float uint32_2_float( unsigned u ) {
 515   float fl = (int) (u & 0xffff);
 516   float fh = (int) (u >> 16);
 517   fh *= 0x1.0p16f;
 518   return fh + fl;
 519 }
 520
 521 00000000        subl    $0x04,%esp
 522 00000003        movl    0x08(%esp,1),%eax
 523 00000007        movl    %eax,%ecx
 524 00000009        shrl    $0x10,%ecx
 525 0000000c        cvtsi2ss        %ecx,%xmm0
 526 00000010        andl    $0x0000ffff,%eax
 527 00000015        cvtsi2ss        %eax,%xmm1
 528 00000019        mulss   0x00000078,%xmm0
 529 00000021        addss   %xmm1,%xmm0
 530 00000025        movss   %xmm0,(%esp,1)
 531 0000002a        flds    (%esp,1)
 532 0000002d        addl    $0x04,%esp
 533 00000030        ret
 534
 535 //===---------------------------------------------------------------------===//
 536
 537 When using fastcc abi, align stack slot of argument of type double on 8 byte
 538 boundary to improve performance.
 539
 540 //===---------------------------------------------------------------------===//
 541
 542 Codegen:
 543
 544 int f(int a, int b) {
 545   if (a == 4 || a == 6)
 546     b++;
 547   return b;
 548 }
 549
 550
 551 as:
 552
 553 or eax, 2
 554 cmp eax, 6
 555 jz label
 556
 557 //===---------------------------------------------------------------------===//
 558
 559 GCC's ix86_expand_int_movcc function (in i386.c) has a ton of interesting
 560 simplifications for integer "x cmp y ? a : b".  For example, instead of:
 561
 562 int G;
 563 void f(int X, int Y) {
 564   G = X < 0 ? 14 : 13;
 565 }
 566
 567 compiling to:
 568
 569 _f:
 570         movl $14, %eax
 571         movl $13, %ecx
 572         movl 4(%esp), %edx
 573         testl %edx, %edx
 574         cmovl %eax, %ecx
 575         movl %ecx, _G
 576         ret
 577
 578 it could be:
 579 _f:
 580         movl    4(%esp), %eax
 581         sarl    $31, %eax
 582         notl    %eax
 583         addl    $14, %eax
 584         movl    %eax, _G
 585         ret
 586
 587 etc.
 588
 589 Another is:
 590 int usesbb(unsigned int a, unsigned int b) {
 591        return (a < b ? -1 : 0);
 592 }
 593 to:
 594 _usesbb:
 595         movl    8(%esp), %eax
 596         cmpl    %eax, 4(%esp)
 597         sbbl    %eax, %eax
 598         ret
 599
 600 instead of:
 601 _usesbb:
 602         xorl    %eax, %eax
 603         movl    8(%esp), %ecx
 604         cmpl    %ecx, 4(%esp)
 605         movl    $4294967295, %ecx
 606         cmovb   %ecx, %eax
 607         ret
 608
 609 //===---------------------------------------------------------------------===//
 610
 611 Currently we don't have elimination of redundant stack manipulations. Consider
 612 the code:
 613
 614 int %main() {
 615 entry:
 616         call fastcc void %test1( )
 617         call fastcc void %test2( sbyte* cast (void ()* %test1 to sbyte*) )
 618         ret int 0
 619 }
 620
 621 declare fastcc void %test1()
 622
 623 declare fastcc void %test2(sbyte*)
 624
 625
 626 This currently compiles to:
 627
 628         subl $16, %esp
 629         call _test5
 630         addl $12, %esp
 631         subl $16, %esp
 632         movl $_test5, (%esp)
 633         call _test6
 634         addl $12, %esp
 635
 636 The add\sub pair is really unneeded here.
 637
 638 //===---------------------------------------------------------------------===//
 639
 640 Consider the expansion of:
 641
 642 define i32 @test3(i32 %X) {
 643         %tmp1 = urem i32 %X, 255
 644         ret i32 %tmp1
 645 }
 646
 647 Currently it compiles to:
 648
 649 ...
 650         movl $2155905153, %ecx
 651         movl 8(%esp), %esi
 652         movl %esi, %eax
 653         mull %ecx
 654 ...
 655
 656 This could be "reassociated" into:
 657
 658         movl $2155905153, %eax
 659         movl 8(%esp), %ecx
 660         mull %ecx
 661
 662 to avoid the copy.  In fact, the existing two-address stuff would do this
 663 except that mul isn't a commutative 2-addr instruction.  I guess this has
 664 to be done at isel time based on the #uses to mul?
 665
 666 //===---------------------------------------------------------------------===//
 667
 668 Make sure the instruction which starts a loop does not cross a cacheline
 669 boundary. This requires knowning the exact length of each machine instruction.
 670 That is somewhat complicated, but doable. Example 256.bzip2:
 671
 672 In the new trace, the hot loop has an instruction which crosses a cacheline
 673 boundary.  In addition to potential cache misses, this can't help decoding as I
 674 imagine there has to be some kind of complicated decoder reset and realignment
 675 to grab the bytes from the next cacheline.
 676
 677 532  532 0x3cfc movb     (1809(%esp, %esi), %bl   <<<--- spans 2 64 byte lines
 678 942  942 0x3d03 movl     %dh, (1809(%esp, %esi)
 679 937  937 0x3d0a incl     %esi
 680 3    3   0x3d0b cmpb     %bl, %dl
 681 27   27  0x3d0d jnz      0x000062db <main+11707>
 682
 683 //===---------------------------------------------------------------------===//
 684
 685 In c99 mode, the preprocessor doesn't like assembly comments like #TRUNCATE.
 686
 687 //===---------------------------------------------------------------------===//
 688
 689 This could be a single 16-bit load.
 690
 691 int f(char *p) {
 692     if ((p[0] == 1) & (p[1] == 2)) return 1;
 693     return 0;
 694 }
 695
 696 //===---------------------------------------------------------------------===//
 697
 698 We should inline lrintf and probably other libc functions.
 699
 700 //===---------------------------------------------------------------------===//
 701
 702 Start using the flags more.  For example, compile:
 703
 704 int add_zf(int *x, int y, int a, int b) {
 705      if ((*x += y) == 0)
 706           return a;
 707      else
 708           return b;
 709 }
 710
 711 to:
 712        addl    %esi, (%rdi)
 713        movl    %edx, %eax
 714        cmovne  %ecx, %eax
 715        ret
 716 instead of:
 717
 718 _add_zf:
 719         addl (%rdi), %esi
 720         movl %esi, (%rdi)
 721         testl %esi, %esi
 722         cmove %edx, %ecx
 723         movl %ecx, %eax
 724         ret
 725
 726 and:
 727
 728 int add_zf(int *x, int y, int a, int b) {
 729      if ((*x + y) < 0)
 730           return a;
 731      else
 732           return b;
 733 }
 734
 735 to:
 736
 737 add_zf:
 738         addl    (%rdi), %esi
 739         movl    %edx, %eax
 740         cmovns  %ecx, %eax
 741         ret
 742
 743 instead of:
 744
 745 _add_zf:
 746         addl (%rdi), %esi
 747         testl %esi, %esi
 748         cmovs %edx, %ecx
 749         movl %ecx, %eax
 750         ret
 751
 752 //===---------------------------------------------------------------------===//
 753
 754 These two functions have identical effects:
 755
 756 unsigned int f(unsigned int i, unsigned int n) {++i; if (i == n) ++i; return i;}
 757 unsigned int f2(unsigned int i, unsigned int n) {++i; i += i == n; return i;}
 758
 759 We currently compile them to:
 760
 761 _f:
 762         movl 4(%esp), %eax
 763         movl %eax, %ecx
 764         incl %ecx
 765         movl 8(%esp), %edx
 766         cmpl %edx, %ecx
 767         jne LBB1_2      #UnifiedReturnBlock
 768 LBB1_1: #cond_true
 769         addl $2, %eax
 770         ret
 771 LBB1_2: #UnifiedReturnBlock
 772         movl %ecx, %eax
 773         ret
 774 _f2:
 775         movl 4(%esp), %eax
 776         movl %eax, %ecx
 777         incl %ecx
 778         cmpl 8(%esp), %ecx
 779         sete %cl
 780         movzbl %cl, %ecx
 781         leal 1(%ecx,%eax), %eax
 782         ret
 783
 784 both of which are inferior to GCC's:
 785
 786 _f:
 787         movl    4(%esp), %edx
 788         leal    1(%edx), %eax
 789         addl    $2, %edx
 790         cmpl    8(%esp), %eax
 791         cmove   %edx, %eax
 792         ret
 793 _f2:
 794         movl    4(%esp), %eax
 795         addl    $1, %eax
 796         xorl    %edx, %edx
 797         cmpl    8(%esp), %eax
 798         sete    %dl
 799         addl    %edx, %eax
 800         ret
 801
 802 //===---------------------------------------------------------------------===//
 803
 804 This code:
 805
 806 void test(int X) {
 807   if (X) abort();
 808 }
 809
 810 is currently compiled to:
 811
 812 _test:
 813         subl $12, %esp
 814         cmpl $0, 16(%esp)
 815         jne LBB1_1
 816         addl $12, %esp
 817         ret
 818 LBB1_1:
 819         call L_abort$stub
 820
 821 It would be better to produce:
 822
 823 _test:
 824         subl $12, %esp
 825         cmpl $0, 16(%esp)
 826         jne L_abort$stub
 827         addl $12, %esp
 828         ret
 829
 830 This can be applied to any no-return function call that takes no arguments etc.
 831 Alternatively, the stack save/restore logic could be shrink-wrapped, producing
 832 something like this:
 833
 834 _test:
 835         cmpl $0, 4(%esp)
 836         jne LBB1_1
 837         ret
 838 LBB1_1:
 839         subl $12, %esp
 840         call L_abort$stub
 841
 842 Both are useful in different situations.  Finally, it could be shrink-wrapped
 843 and tail called, like this:
 844
 845 _test:
 846         cmpl $0, 4(%esp)
 847         jne LBB1_1
 848         ret
 849 LBB1_1:
 850         pop %eax   # realign stack.
 851         call L_abort$stub
 852
 853 Though this probably isn't worth it.
 854
 855 //===---------------------------------------------------------------------===//
 856
 857 We need to teach the codegen to convert two-address INC instructions to LEA
 858 when the flags are dead (likewise dec).  For example, on X86-64, compile:
 859
 860 int foo(int A, int B) {
 861   return A+1;
 862 }
 863
 864 to:
 865
 866 _foo:
 867         leal    1(%edi), %eax
 868         ret
 869
 870 instead of:
 871
 872 _foo:
 873         incl %edi
 874         movl %edi, %eax
 875         ret
 876
 877 Another example is:
 878
 879 ;; X's live range extends beyond the shift, so the register allocator
 880 ;; cannot coalesce it with Y.  Because of this, a copy needs to be
 881 ;; emitted before the shift to save the register value before it is
 882 ;; clobbered.  However, this copy is not needed if the register
 883 ;; allocator turns the shift into an LEA.  This also occurs for ADD.
 884
 885 ; Check that the shift gets turned into an LEA.
 886 ; RUN: llvm-as < %s | llc -march=x86 -x86-asm-syntax=intel | \
 887 ; RUN:   not grep {mov E.X, E.X}
 888
 889 @G = external global i32                ; <i32*> [#uses=3]
 890
 891 define i32 @test1(i32 %X, i32 %Y) {
 892         %Z = add i32 %X, %Y             ; <i32> [#uses=1]
 893         volatile store i32 %Y, i32* @G
 894         volatile store i32 %Z, i32* @G
 895         ret i32 %X
 896 }
 897
 898 define i32 @test2(i32 %X) {
 899         %Z = add i32 %X, 1              ; <i32> [#uses=1]
 900         volatile store i32 %Z, i32* @G
 901         ret i32 %X
 902 }
 903
 904 //===---------------------------------------------------------------------===//
 905
 906 Sometimes it is better to codegen subtractions from a constant (e.g. 7-x) with
 907 a neg instead of a sub instruction.  Consider:
 908
 909 int test(char X) { return 7-X; }
 910
 911 we currently produce:
 912 _test:
 913         movl $7, %eax
 914         movsbl 4(%esp), %ecx
 915         subl %ecx, %eax
 916         ret
 917
 918 We would use one fewer register if codegen'd as:
 919
 920         movsbl 4(%esp), %eax
 921         neg %eax
 922         add $7, %eax
 923         ret
 924
 925 Note that this isn't beneficial if the load can be folded into the sub.  In
 926 this case, we want a sub:
 927
 928 int test(int X) { return 7-X; }
 929 _test:
 930         movl $7, %eax
 931         subl 4(%esp), %eax
 932         ret
 933
 934 //===---------------------------------------------------------------------===//
 935
 936 Leaf functions that require one 4-byte spill slot have a prolog like this:
 937
 938 _foo:
 939         pushl   %esi
 940         subl    $4, %esp
 941 ...
 942 and an epilog like this:
 943         addl    $4, %esp
 944         popl    %esi
 945         ret
 946
 947 It would be smaller, and potentially faster, to push eax on entry and to
 948 pop into a dummy register instead of using addl/subl of esp.  Just don't pop
 949 into any return registers :)
 950
 951 //===---------------------------------------------------------------------===//
 952
 953 The X86 backend should fold (branch (or (setcc, setcc))) into multiple
 954 branches.  We generate really poor code for:
 955
 956 double testf(double a) {
 957        return a == 0.0 ? 0.0 : (a > 0.0 ? 1.0 : -1.0);
 958 }
 959
 960 For example, the entry BB is:
 961
 962 _testf:
 963         subl    $20, %esp
 964         pxor    %xmm0, %xmm0
 965         movsd   24(%esp), %xmm1
 966         ucomisd %xmm0, %xmm1
 967         setnp   %al
 968         sete    %cl
 969         testb   %cl, %al
 970         jne     LBB1_5  # UnifiedReturnBlock
 971 LBB1_1: # cond_true
 972
 973
 974 it would be better to replace the last four instructions with:
 975
 976         jp LBB1_1
 977         je LBB1_5
 978 LBB1_1:
 979
 980 We also codegen the inner ?: into a diamond:
 981
 982        cvtss2sd        LCPI1_0(%rip), %xmm2
 983         cvtss2sd        LCPI1_1(%rip), %xmm3
 984         ucomisd %xmm1, %xmm0
 985         ja      LBB1_3  # cond_true
 986 LBB1_2: # cond_true
 987         movapd  %xmm3, %xmm2
 988 LBB1_3: # cond_true
 989         movapd  %xmm2, %xmm0
 990         ret
 991
 992 We should sink the load into xmm3 into the LBB1_2 block.  This should
 993 be pretty easy, and will nuke all the copies.
 994
 995 //===---------------------------------------------------------------------===//
 996
 997 This:
 998         #include <algorithm>
 999         inline std::pair<unsigned, bool> full_add(unsigned a, unsigned b)
1000         { return std::make_pair(a + b, a + b < a); }
1001         bool no_overflow(unsigned a, unsigned b)
1002         { return !full_add(a, b).second; }
1003
1004 Should compile to:
1005
1006
1007         _Z11no_overflowjj:
1008                 addl    %edi, %esi
1009                 setae   %al
1010                 ret
1011
1012 FIXME: That code looks wrong; bool return is normally defined as zext.
1013
1014 on x86-64, not:
1015
1016 __Z11no_overflowjj:
1017         addl    %edi, %esi
1018         cmpl    %edi, %esi
1019         setae   %al
1020         movzbl  %al, %eax
1021         ret
1022
1023
1024 //===---------------------------------------------------------------------===//
1025
1026 Re-materialize MOV32r0 etc. with xor instead of changing them to moves if the
1027 condition register is dead. xor reg reg is shorter than mov reg, #0.
1028
1029 //===---------------------------------------------------------------------===//
1030
1031 We aren't matching RMW instructions aggressively
1032 enough.  Here's a reduced testcase (more in PR1160):
1033
1034 define void @test(i32* %huge_ptr, i32* %target_ptr) {
1035         %A = load i32* %huge_ptr                ; <i32> [#uses=1]
1036         %B = load i32* %target_ptr              ; <i32> [#uses=1]
1037         %C = or i32 %A, %B              ; <i32> [#uses=1]
1038         store i32 %C, i32* %target_ptr
1039         ret void
1040 }
1041
1042 $ llvm-as < t.ll | llc -march=x86-64
1043
1044 _test:
1045         movl (%rdi), %eax
1046         orl (%rsi), %eax
1047         movl %eax, (%rsi)
1048         ret
1049
1050 That should be something like:
1051
1052 _test:
1053         movl (%rdi), %eax
1054         orl %eax, (%rsi)
1055         ret
1056
1057 //===---------------------------------------------------------------------===//
1058
1059 The following code:
1060
1061 bb114.preheader:                ; preds = %cond_next94
1062         %tmp231232 = sext i16 %tmp62 to i32             ; <i32> [#uses=1]
1063         %tmp233 = sub i32 32, %tmp231232                ; <i32> [#uses=1]
1064         %tmp245246 = sext i16 %tmp65 to i32             ; <i32> [#uses=1]
1065         %tmp252253 = sext i16 %tmp68 to i32             ; <i32> [#uses=1]
1066         %tmp254 = sub i32 32, %tmp252253                ; <i32> [#uses=1]
1067         %tmp553554 = bitcast i16* %tmp37 to i8*         ; <i8*> [#uses=2]
1068         %tmp583584 = sext i16 %tmp98 to i32             ; <i32> [#uses=1]
1069         %tmp585 = sub i32 32, %tmp583584                ; <i32> [#uses=1]
1070         %tmp614615 = sext i16 %tmp101 to i32            ; <i32> [#uses=1]
1071         %tmp621622 = sext i16 %tmp104 to i32            ; <i32> [#uses=1]
1072         %tmp623 = sub i32 32, %tmp621622                ; <i32> [#uses=1]
1073         br label %bb114
1074
1075 produces:
1076
1077 LBB3_5: # bb114.preheader
1078         movswl  -68(%ebp), %eax
1079         movl    $32, %ecx
1080         movl    %ecx, -80(%ebp)
1081         subl    %eax, -80(%ebp)
1082         movswl  -52(%ebp), %eax
1083         movl    %ecx, -84(%ebp)
1084         subl    %eax, -84(%ebp)
1085         movswl  -70(%ebp), %eax
1086         movl    %ecx, -88(%ebp)
1087         subl    %eax, -88(%ebp)
1088         movswl  -50(%ebp), %eax
1089         subl    %eax, %ecx
1090         movl    %ecx, -76(%ebp)
1091         movswl  -42(%ebp), %eax
1092         movl    %eax, -92(%ebp)
1093         movswl  -66(%ebp), %eax
1094         movl    %eax, -96(%ebp)
1095         movw    $0, -98(%ebp)
1096
1097 This appears to be bad because the RA is not folding the store to the stack
1098 slot into the movl.  The above instructions could be:
1099         movl    $32, -80(%ebp)
1100 ...
1101         movl    $32, -84(%ebp)
1102 ...
1103 This seems like a cross between remat and spill folding.
1104
1105 This has redundant subtractions of %eax from a stack slot. However, %ecx doesn't
1106 change, so we could simply subtract %eax from %ecx first and then use %ecx (or
1107 vice-versa).
1108
1109 //===---------------------------------------------------------------------===//
1110
1111 For this code:
1112
1113 cond_next603:           ; preds = %bb493, %cond_true336, %cond_next599
1114         %v.21050.1 = phi i32 [ %v.21050.0, %cond_next599 ], [ %tmp344, %cond_true336 ], [ %v.2, %bb493 ]                ; <i32> [#uses=1]
1115         %maxz.21051.1 = phi i32 [ %maxz.21051.0, %cond_next599 ], [ 0, %cond_true336 ], [ %maxz.2, %bb493 ]             ; <i32> [#uses=2]
1116         %cnt.01055.1 = phi i32 [ %cnt.01055.0, %cond_next599 ], [ 0, %cond_true336 ], [ %cnt.0, %bb493 ]                ; <i32> [#uses=2]
1117         %byteptr.9 = phi i8* [ %byteptr.12, %cond_next599 ], [ %byteptr.0, %cond_true336 ], [ %byteptr.10, %bb493 ]             ; <i8*> [#uses=9]
1118         %bitptr.6 = phi i32 [ %tmp5571104.1, %cond_next599 ], [ %tmp4921049, %cond_true336 ], [ %bitptr.7, %bb493 ]             ; <i32> [#uses=4]
1119         %source.5 = phi i32 [ %tmp602, %cond_next599 ], [ %source.0, %cond_true336 ], [ %source.6, %bb493 ]             ; <i32> [#uses=7]
1120         %tmp606 = getelementptr %struct.const_tables* @tables, i32 0, i32 0, i32 %cnt.01055.1           ; <i8*> [#uses=1]
1121         %tmp607 = load i8* %tmp606, align 1             ; <i8> [#uses=1]
1122
1123 We produce this:
1124
1125 LBB4_70:        # cond_next603
1126         movl    -20(%ebp), %esi
1127         movl    L_tables$non_lazy_ptr-"L4$pb"(%esi), %esi
1128
1129 However, ICC caches this information before the loop and produces this:
1130
1131         movl      88(%esp), %eax                                #481.12
1132
1133 //===---------------------------------------------------------------------===//
1134
1135 This code:
1136
1137         %tmp659 = icmp slt i16 %tmp654, 0               ; <i1> [#uses=1]
1138         br i1 %tmp659, label %cond_true662, label %cond_next715
1139
1140 produces this:
1141
1142         testw   %cx, %cx
1143         movswl  %cx, %esi
1144         jns     LBB4_109        # cond_next715
1145
1146 Shark tells us that using %cx in the testw instruction is sub-optimal. It
1147 suggests using the 32-bit register (which is what ICC uses).
1148
1149 //===---------------------------------------------------------------------===//
1150
1151 We compile this:
1152
1153 void compare (long long foo) {
1154   if (foo < 4294967297LL)
1155     abort();
1156 }
1157
1158 to:
1159
1160 compare:
1161         subl    $4, %esp
1162         cmpl    $0, 8(%esp)
1163         setne   %al
1164         movzbw  %al, %ax
1165         cmpl    $1, 12(%esp)
1166         setg    %cl
1167         movzbw  %cl, %cx
1168         cmove   %ax, %cx
1169         testb   $1, %cl
1170         jne     .LBB1_2 # UnifiedReturnBlock
1171 .LBB1_1:        # ifthen
1172         call    abort
1173 .LBB1_2:        # UnifiedReturnBlock
1174         addl    $4, %esp
1175         ret
1176
1177 (also really horrible code on ppc).  This is due to the expand code for 64-bit
1178 compares.  GCC produces multiple branches, which is much nicer:
1179
1180 compare:
1181         subl    $12, %esp
1182         movl    20(%esp), %edx
1183         movl    16(%esp), %eax
1184         decl    %edx
1185         jle     .L7
1186 .L5:
1187         addl    $12, %esp
1188         ret
1189         .p2align 4,,7
1190 .L7:
1191         jl      .L4
1192         cmpl    $0, %eax
1193         .p2align 4,,8
1194         ja      .L5
1195 .L4:
1196         .p2align 4,,9
1197         call    abort
1198
1199 //===---------------------------------------------------------------------===//
1200
1201 Tail call optimization improvements: Tail call optimization currently
1202 pushes all arguments on the top of the stack (their normal place for
1203 non-tail call optimized calls) that source from the callers arguments
1204 or  that source from a virtual register (also possibly sourcing from
1205 callers arguments).
1206 This is done to prevent overwriting of parameters (see example
1207 below) that might be used later.
1208
1209 example:
1210
1211 int callee(int32, int64);
1212 int caller(int32 arg1, int32 arg2) {
1213   int64 local = arg2 * 2;
1214   return callee(arg2, (int64)local);
1215 }
1216
1217 [arg1]          [!arg2 no longer valid since we moved local onto it]
1218 [arg2]      ->  [(int64)
1219 [RETADDR]        local  ]
1220
1221 Moving arg1 onto the stack slot of callee function would overwrite
1222 arg2 of the caller.
1223
1224 Possible optimizations:
1225
1226
1227  - Analyse the actual parameters of the callee to see which would
1228    overwrite a caller parameter which is used by the callee and only
1229    push them onto the top of the stack.
1230
1231    int callee (int32 arg1, int32 arg2);
1232    int caller (int32 arg1, int32 arg2) {
1233        return callee(arg1,arg2);
1234    }
1235
1236    Here we don't need to write any variables to the top of the stack
1237    since they don't overwrite each other.
1238
1239    int callee (int32 arg1, int32 arg2);
1240    int caller (int32 arg1, int32 arg2) {
1241        return callee(arg2,arg1);
1242    }
1243
1244    Here we need to push the arguments because they overwrite each
1245    other.
1246
1247 //===---------------------------------------------------------------------===//
1248
1249 main ()
1250 {
1251   int i = 0;
1252   unsigned long int z = 0;
1253
1254   do {
1255     z -= 0x00004000;
1256     i++;
1257     if (i > 0x00040000)
1258       abort ();
1259   } while (z > 0);
1260   exit (0);
1261 }
1262
1263 gcc compiles this to:
1264
1265 _main:
1266         subl    $28, %esp
1267         xorl    %eax, %eax
1268         jmp     L2
1269 L3:
1270         cmpl    $262144, %eax
1271         je      L10
1272 L2:
1273         addl    $1, %eax
1274         cmpl    $262145, %eax
1275         jne     L3
1276         call    L_abort$stub
1277 L10:
1278         movl    $0, (%esp)
1279         call    L_exit$stub
1280
1281 llvm:
1282
1283 _main:
1284         subl    $12, %esp
1285         movl    $1, %eax
1286         movl    $16384, %ecx
1287 LBB1_1: # bb
1288         cmpl    $262145, %eax
1289         jge     LBB1_4  # cond_true
1290 LBB1_2: # cond_next
1291         incl    %eax
1292         addl    $4294950912, %ecx
1293         cmpl    $16384, %ecx
1294         jne     LBB1_1  # bb
1295 LBB1_3: # bb11
1296         xorl    %eax, %eax
1297         addl    $12, %esp
1298         ret
1299 LBB1_4: # cond_true
1300         call    L_abort$stub
1301
1302 1. LSR should rewrite the first cmp with induction variable %ecx.
1303 2. DAG combiner should fold
1304         leal    1(%eax), %edx
1305         cmpl    $262145, %edx
1306    =>
1307         cmpl    $262144, %eax
1308
1309 //===---------------------------------------------------------------------===//
1310
1311 define i64 @test(double %X) {
1312         %Y = fptosi double %X to i64
1313         ret i64 %Y
1314 }
1315
1316 compiles to:
1317
1318 _test:
1319         subl    $20, %esp
1320         movsd   24(%esp), %xmm0
1321         movsd   %xmm0, 8(%esp)
1322         fldl    8(%esp)
1323         fisttpll        (%esp)
1324         movl    4(%esp), %edx
1325         movl    (%esp), %eax
1326         addl    $20, %esp
1327         #FP_REG_KILL
1328         ret
1329
1330 This should just fldl directly from the input stack slot.
1331
1332 //===---------------------------------------------------------------------===//
1333
1334 This code:
1335 int foo (int x) { return (x & 65535) | 255; }
1336
1337 Should compile into:
1338
1339 _foo:
1340         movzwl  4(%esp), %eax
1341         orl     $255, %eax
1342         ret
1343
1344 instead of:
1345 _foo:
1346         movl    $255, %eax
1347         orl     4(%esp), %eax
1348         andl    $65535, %eax
1349         ret
1350
1351 //===---------------------------------------------------------------------===//
1352
1353 We're codegen'ing multiply of long longs inefficiently:
1354
1355 unsigned long long LLM(unsigned long long arg1, unsigned long long arg2) {
1356   return arg1 *  arg2;
1357 }
1358
1359 We compile to (fomit-frame-pointer):
1360
1361 _LLM:
1362         pushl   %esi
1363         movl    8(%esp), %ecx
1364         movl    16(%esp), %esi
1365         movl    %esi, %eax
1366         mull    %ecx
1367         imull   12(%esp), %esi
1368         addl    %edx, %esi
1369         imull   20(%esp), %ecx
1370         movl    %esi, %edx
1371         addl    %ecx, %edx
1372         popl    %esi
1373         ret
1374
1375 This looks like a scheduling deficiency and lack of remat of the load from
1376 the argument area.  ICC apparently produces:
1377
1378         movl      8(%esp), %ecx
1379         imull     12(%esp), %ecx
1380         movl      16(%esp), %eax
1381         imull     4(%esp), %eax
1382         addl      %eax, %ecx
1383         movl      4(%esp), %eax
1384         mull      12(%esp)
1385         addl      %ecx, %edx
1386         ret
1387
1388 Note that it remat'd loads from 4(esp) and 12(esp).  See this GCC PR:
1389 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=17236
1390
1391 //===---------------------------------------------------------------------===//
1392
1393 We can fold a store into "zeroing a reg".  Instead of:
1394
1395 xorl    %eax, %eax
1396 movl    %eax, 124(%esp)
1397
1398 we should get:
1399
1400 movl    $0, 124(%esp)
1401
1402 if the flags of the xor are dead.
1403
1404 Likewise, we isel "x<<1" into "add reg,reg".  If reg is spilled, this should
1405 be folded into: shl [mem], 1
1406
1407 //===---------------------------------------------------------------------===//
1408
1409 This testcase misses a read/modify/write opportunity (from PR1425):
1410
1411 void vertical_decompose97iH1(int *b0, int *b1, int *b2, int width){
1412     int i;
1413     for(i=0; i<width; i++)
1414         b1[i] += (1*(b0[i] + b2[i])+0)>>0;
1415 }
1416
1417 We compile it down to:
1418
1419 LBB1_2: # bb
1420         movl    (%esi,%edi,4), %ebx
1421         addl    (%ecx,%edi,4), %ebx
1422         addl    (%edx,%edi,4), %ebx
1423         movl    %ebx, (%ecx,%edi,4)
1424         incl    %edi
1425         cmpl    %eax, %edi
1426         jne     LBB1_2  # bb
1427
1428 the inner loop should add to the memory location (%ecx,%edi,4), saving
1429 a mov.  Something like:
1430
1431         movl    (%esi,%edi,4), %ebx
1432         addl    (%edx,%edi,4), %ebx
1433         addl    %ebx, (%ecx,%edi,4)
1434
1435 Here is another interesting example:
1436
1437 void vertical_compose97iH1(int *b0, int *b1, int *b2, int width){
1438     int i;
1439     for(i=0; i<width; i++)
1440         b1[i] -= (1*(b0[i] + b2[i])+0)>>0;
1441 }
1442
1443 We miss the r/m/w opportunity here by using 2 subs instead of an add+sub[mem]:
1444
1445 LBB9_2: # bb
1446         movl    (%ecx,%edi,4), %ebx
1447         subl    (%esi,%edi,4), %ebx
1448         subl    (%edx,%edi,4), %ebx
1449         movl    %ebx, (%ecx,%edi,4)
1450         incl    %edi
1451         cmpl    %eax, %edi
1452         jne     LBB9_2  # bb
1453
1454 Additionally, LSR should rewrite the exit condition of these loops to use
1455 a stride-4 IV, would would allow all the scales in the loop to go away.
1456 This would result in smaller code and more efficient microops.
1457
1458 //===---------------------------------------------------------------------===//
1459
1460 In SSE mode, we turn abs and neg into a load from the constant pool plus a xor
1461 or and instruction, for example:
1462
1463         xorpd   LCPI1_0, %xmm2
1464
1465 However, if xmm2 gets spilled, we end up with really ugly code like this:
1466
1467         movsd   (%esp), %xmm0
1468         xorpd   LCPI1_0, %xmm0
1469         movsd   %xmm0, (%esp)
1470
1471 Since we 'know' that this is a 'neg', we can actually "fold" the spill into
1472 the neg/abs instruction, turning it into an *integer* operation, like this:
1473
1474         xorl 2147483648, [mem+4]     ## 2147483648 = (1 << 31)
1475
1476 you could also use xorb, but xorl is less likely to lead to a partial register
1477 stall.  Here is a contrived testcase:
1478
1479 double a, b, c;
1480 void test(double *P) {
1481   double X = *P;
1482   a = X;
1483   bar();
1484   X = -X;
1485   b = X;
1486   bar();
1487   c = X;
1488 }
1489
1490 //===---------------------------------------------------------------------===//
1491
1492 handling llvm.memory.barrier on pre SSE2 cpus
1493
1494 should generate:
1495 lock ; mov %esp, %esp
1496
1497 //===---------------------------------------------------------------------===//
1498
1499 The generated code on x86 for checking for signed overflow on a multiply the
1500 obvious way is much longer than it needs to be.
1501
1502 int x(int a, int b) {
1503   long long prod = (long long)a*b;
1504   return  prod > 0x7FFFFFFF || prod < (-0x7FFFFFFF-1);
1505 }
1506
1507 See PR2053 for more details.
1508
1509 //===---------------------------------------------------------------------===//
1510
1511 We should investigate using cdq/ctld (effect: edx = sar eax, 31)
1512 more aggressively; it should cost the same as a move+shift on any modern
1513 processor, but it's a lot shorter. Downside is that it puts more
1514 pressure on register allocation because it has fixed operands.
1515
1516 Example:
1517 int abs(int x) {return x < 0 ? -x : x;}
1518
1519 gcc compiles this to the following when using march/mtune=pentium2/3/4/m/etc.:
1520 abs:
1521         movl    4(%esp), %eax
1522         cltd
1523         xorl    %edx, %eax
1524         subl    %edx, %eax
1525         ret
1526
1527 //===---------------------------------------------------------------------===//
1528
1529 Consider:
1530 int test(unsigned long a, unsigned long b) { return -(a < b); }
1531
1532 We currently compile this to:
1533
1534 define i32 @test(i32 %a, i32 %b) nounwind  {
1535         %tmp3 = icmp ult i32 %a, %b             ; <i1> [#uses=1]
1536         %tmp34 = zext i1 %tmp3 to i32           ; <i32> [#uses=1]
1537         %tmp5 = sub i32 0, %tmp34               ; <i32> [#uses=1]
1538         ret i32 %tmp5
1539 }
1540
1541 and
1542
1543 _test:
1544         movl    8(%esp), %eax
1545         cmpl    %eax, 4(%esp)
1546         setb    %al
1547         movzbl  %al, %eax
1548         negl    %eax
1549         ret
1550
1551 Several deficiencies here.  First, we should instcombine zext+neg into sext:
1552
1553 define i32 @test2(i32 %a, i32 %b) nounwind  {
1554         %tmp3 = icmp ult i32 %a, %b             ; <i1> [#uses=1]
1555         %tmp34 = sext i1 %tmp3 to i32           ; <i32> [#uses=1]
1556         ret i32 %tmp34
1557 }
1558
1559 However, before we can do that, we have to fix the bad codegen that we get for
1560 sext from bool:
1561
1562 _test2:
1563         movl    8(%esp), %eax
1564         cmpl    %eax, 4(%esp)
1565         setb    %al
1566         movzbl  %al, %eax
1567         shll    $31, %eax
1568         sarl    $31, %eax
1569         ret
1570
1571 This code should be at least as good as the code above.  Once this is fixed, we
1572 can optimize this specific case even more to:
1573
1574         movl    8(%esp), %eax
1575         xorl    %ecx, %ecx
1576         cmpl    %eax, 4(%esp)
1577         sbbl    %ecx, %ecx
1578
1579 //===---------------------------------------------------------------------===//
1580
1581 Take the following code (from
1582 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=16541):
1583
1584 extern unsigned char first_one[65536];
1585 int FirstOnet(unsigned long long arg1)
1586 {
1587   if (arg1 >> 48)
1588     return (first_one[arg1 >> 48]);
1589   return 0;
1590 }
1591
1592
1593 The following code is currently generated:
1594 FirstOnet:
1595         movl    8(%esp), %eax
1596         cmpl    $65536, %eax
1597         movl    4(%esp), %ecx
1598         jb      .LBB1_2 # UnifiedReturnBlock
1599 .LBB1_1:        # ifthen
1600         shrl    $16, %eax
1601         movzbl  first_one(%eax), %eax
1602         ret
1603 .LBB1_2:        # UnifiedReturnBlock
1604         xorl    %eax, %eax
1605         ret
1606
1607 There are a few possible improvements here:
1608 1. We should be able to eliminate the dead load into %ecx
1609 2. We could change the "movl 8(%esp), %eax" into
1610    "movzwl 10(%esp), %eax"; this lets us change the cmpl
1611    into a testl, which is shorter, and eliminate the shift.
1612
1613 We could also in theory eliminate the branch by using a conditional
1614 for the address of the load, but that seems unlikely to be worthwhile
1615 in general.
1616
1617 //===---------------------------------------------------------------------===//
1618
1619 We compile this function:
1620
1621 define i32 @foo(i32 %a, i32 %b, i32 %c, i8 zeroext  %d) nounwind  {
1622 entry:
1623         %tmp2 = icmp eq i8 %d, 0                ; <i1> [#uses=1]
1624         br i1 %tmp2, label %bb7, label %bb
1625
1626 bb:             ; preds = %entry
1627         %tmp6 = add i32 %b, %a          ; <i32> [#uses=1]
1628         ret i32 %tmp6
1629
1630 bb7:            ; preds = %entry
1631         %tmp10 = sub i32 %a, %c         ; <i32> [#uses=1]
1632         ret i32 %tmp10
1633 }
1634
1635 to:
1636
1637 _foo:
1638         cmpb    $0, 16(%esp)
1639         movl    12(%esp), %ecx
1640         movl    8(%esp), %eax
1641         movl    4(%esp), %edx
1642         je      LBB1_2  # bb7
1643 LBB1_1: # bb
1644         addl    %edx, %eax
1645         ret
1646 LBB1_2: # bb7
1647         movl    %edx, %eax
1648         subl    %ecx, %eax
1649         ret
1650
1651 The coalescer could coalesce "edx" with "eax" to avoid the movl in LBB1_2
1652 if it commuted the addl in LBB1_1.
1653
1654 //===---------------------------------------------------------------------===//
1655