lib/Target/X86/README.txt

   1 //===---------------------------------------------------------------------===//
   2 // Random ideas for the X86 backend.
   3 //===---------------------------------------------------------------------===//
   4
   5 Missing features:
   6   - Support for SSE4: http://www.intel.com/software/penryn
   7 http://softwarecommunity.intel.com/isn/Downloads/Intel%20SSE4%20Programming%20Reference.pdf
   8   - support for 3DNow!
   9   - weird abis?
  10
  11 //===---------------------------------------------------------------------===//
  12
  13 Add a MUL2U and MUL2S nodes to represent a multiply that returns both the
  14 Hi and Lo parts (combination of MUL and MULH[SU] into one node).  Add this to
  15 X86, & make the dag combiner produce it when needed.  This will eliminate one
  16 imul from the code generated for:
  17
  18 long long test(long long X, long long Y) { return X*Y; }
  19
  20 by using the EAX result from the mul.  We should add a similar node for
  21 DIVREM.
  22
  23 another case is:
  24
  25 long long test(int X, int Y) { return (long long)X*Y; }
  26
  27 ... which should only be one imul instruction.
  28
  29 or:
  30
  31 unsigned long long int t2(unsigned int a, unsigned int b) {
  32        return (unsigned long long)a * b;
  33 }
  34
  35 ... which should be one mul instruction.
  36
  37
  38 This can be done with a custom expander, but it would be nice to move this to
  39 generic code.
  40
  41 //===---------------------------------------------------------------------===//
  42
  43 CodeGen/X86/lea-3.ll:test3 should be a single LEA, not a shift/move.  The X86
  44 backend knows how to three-addressify this shift, but it appears the register
  45 allocator isn't even asking it to do so in this case.  We should investigate
  46 why this isn't happening, it could have significant impact on other important
  47 cases for X86 as well.
  48
  49 //===---------------------------------------------------------------------===//
  50
  51 This should be one DIV/IDIV instruction, not a libcall:
  52
  53 unsigned test(unsigned long long X, unsigned Y) {
  54         return X/Y;
  55 }
  56
  57 This can be done trivially with a custom legalizer.  What about overflow
  58 though?  http://gcc.gnu.org/bugzilla/show_bug.cgi?id=14224
  59
  60 //===---------------------------------------------------------------------===//
  61
  62 Improvements to the multiply -> shift/add algorithm:
  63 http://gcc.gnu.org/ml/gcc-patches/2004-08/msg01590.html
  64
  65 //===---------------------------------------------------------------------===//
  66
  67 Improve code like this (occurs fairly frequently, e.g. in LLVM):
  68 long long foo(int x) { return 1LL << x; }
  69
  70 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01109.html
  71 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01128.html
  72 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html
  73
  74 Another useful one would be  ~0ULL >> X and ~0ULL << X.
  75
  76 One better solution for 1LL << x is:
  77         xorl    %eax, %eax
  78         xorl    %edx, %edx
  79         testb   $32, %cl
  80         sete    %al
  81         setne   %dl
  82         sall    %cl, %eax
  83         sall    %cl, %edx
  84
  85 But that requires good 8-bit subreg support.
  86
  87 64-bit shifts (in general) expand to really bad code.  Instead of using
  88 cmovs, we should expand to a conditional branch like GCC produces.
  89
  90 //===---------------------------------------------------------------------===//
  91
  92 Compile this:
  93 _Bool f(_Bool a) { return a!=1; }
  94
  95 into:
  96         movzbl  %dil, %eax
  97         xorl    $1, %eax
  98         ret
  99
 100 //===---------------------------------------------------------------------===//
 101
 102 Some isel ideas:
 103
 104 1. Dynamic programming based approach when compile time if not an
 105    issue.
 106 2. Code duplication (addressing mode) during isel.
 107 3. Other ideas from "Register-Sensitive Selection, Duplication, and
 108    Sequencing of Instructions".
 109 4. Scheduling for reduced register pressure.  E.g. "Minimum Register
 110    Instruction Sequence Problem: Revisiting Optimal Code Generation for DAGs"
 111    and other related papers.
 112    http://citeseer.ist.psu.edu/govindarajan01minimum.html
 113
 114 //===---------------------------------------------------------------------===//
 115
 116 Should we promote i16 to i32 to avoid partial register update stalls?
 117
 118 //===---------------------------------------------------------------------===//
 119
 120 Leave any_extend as pseudo instruction and hint to register
 121 allocator. Delay codegen until post register allocation.
 122
 123 //===---------------------------------------------------------------------===//
 124
 125 Count leading zeros and count trailing zeros:
 126
 127 int clz(int X) { return __builtin_clz(X); }
 128 int ctz(int X) { return __builtin_ctz(X); }
 129
 130 $ gcc t.c -S -o - -O3  -fomit-frame-pointer -masm=intel
 131 clz:
 132         bsr     %eax, DWORD PTR [%esp+4]
 133         xor     %eax, 31
 134         ret
 135 ctz:
 136         bsf     %eax, DWORD PTR [%esp+4]
 137         ret
 138
 139 however, check that these are defined for 0 and 32.  Our intrinsics are, GCC's
 140 aren't.
 141
 142 Another example (use predsimplify to eliminate a select):
 143
 144 int foo (unsigned long j) {
 145   if (j)
 146     return __builtin_ffs (j) - 1;
 147   else
 148     return 0;
 149 }
 150
 151 //===---------------------------------------------------------------------===//
 152
 153 It appears icc use push for parameter passing. Need to investigate.
 154
 155 //===---------------------------------------------------------------------===//
 156
 157 Only use inc/neg/not instructions on processors where they are faster than
 158 add/sub/xor.  They are slower on the P4 due to only updating some processor
 159 flags.
 160
 161 //===---------------------------------------------------------------------===//
 162
 163 The instruction selector sometimes misses folding a load into a compare.  The
 164 pattern is written as (cmp reg, (load p)).  Because the compare isn't
 165 commutative, it is not matched with the load on both sides.  The dag combiner
 166 should be made smart enough to cannonicalize the load into the RHS of a compare
 167 when it can invert the result of the compare for free.
 168
 169 //===---------------------------------------------------------------------===//
 170
 171 How about intrinsics? An example is:
 172   *res = _mm_mulhi_epu16(*A, _mm_mul_epu32(*B, *C));
 173
 174 compiles to
 175         pmuludq (%eax), %xmm0
 176         movl 8(%esp), %eax
 177         movdqa (%eax), %xmm1
 178         pmulhuw %xmm0, %xmm1
 179
 180 The transformation probably requires a X86 specific pass or a DAG combiner
 181 target specific hook.
 182
 183 //===---------------------------------------------------------------------===//
 184
 185 In many cases, LLVM generates code like this:
 186
 187 _test:
 188         movl 8(%esp), %eax
 189         cmpl %eax, 4(%esp)
 190         setl %al
 191         movzbl %al, %eax
 192         ret
 193
 194 on some processors (which ones?), it is more efficient to do this:
 195
 196 _test:
 197         movl 8(%esp), %ebx
 198         xor  %eax, %eax
 199         cmpl %ebx, 4(%esp)
 200         setl %al
 201         ret
 202
 203 Doing this correctly is tricky though, as the xor clobbers the flags.
 204
 205 //===---------------------------------------------------------------------===//
 206
 207 We should generate bts/btr/etc instructions on targets where they are cheap or
 208 when codesize is important.  e.g., for:
 209
 210 void setbit(int *target, int bit) {
 211     *target |= (1 << bit);
 212 }
 213 void clearbit(int *target, int bit) {
 214     *target &= ~(1 << bit);
 215 }
 216
 217 //===---------------------------------------------------------------------===//
 218
 219 Instead of the following for memset char*, 1, 10:
 220
 221         movl $16843009, 4(%edx)
 222         movl $16843009, (%edx)
 223         movw $257, 8(%edx)
 224
 225 It might be better to generate
 226
 227         movl $16843009, %eax
 228         movl %eax, 4(%edx)
 229         movl %eax, (%edx)
 230         movw al, 8(%edx)
 231
 232 when we can spare a register. It reduces code size.
 233
 234 //===---------------------------------------------------------------------===//
 235
 236 Evaluate what the best way to codegen sdiv X, (2^C) is.  For X/8, we currently
 237 get this:
 238
 239 int %test1(int %X) {
 240         %Y = div int %X, 8
 241         ret int %Y
 242 }
 243
 244 _test1:
 245         movl 4(%esp), %eax
 246         movl %eax, %ecx
 247         sarl $31, %ecx
 248         shrl $29, %ecx
 249         addl %ecx, %eax
 250         sarl $3, %eax
 251         ret
 252
 253 GCC knows several different ways to codegen it, one of which is this:
 254
 255 _test1:
 256         movl    4(%esp), %eax
 257         cmpl    $-1, %eax
 258         leal    7(%eax), %ecx
 259         cmovle  %ecx, %eax
 260         sarl    $3, %eax
 261         ret
 262
 263 which is probably slower, but it's interesting at least :)
 264
 265 //===---------------------------------------------------------------------===//
 266
 267 The first BB of this code:
 268
 269 declare bool %foo()
 270 int %bar() {
 271         %V = call bool %foo()
 272         br bool %V, label %T, label %F
 273 T:
 274         ret int 1
 275 F:
 276         call bool %foo()
 277         ret int 12
 278 }
 279
 280 compiles to:
 281
 282 _bar:
 283         subl $12, %esp
 284         call L_foo$stub
 285         xorb $1, %al
 286         testb %al, %al
 287         jne LBB_bar_2   # F
 288
 289 It would be better to emit "cmp %al, 1" than a xor and test.
 290
 291 //===---------------------------------------------------------------------===//
 292
 293 We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and rep/movsl
 294 We should leave these as libcalls for everything over a much lower threshold,
 295 since libc is hand tuned for medium and large mem ops (avoiding RFO for large
 296 stores, TLB preheating, etc)
 297
 298 //===---------------------------------------------------------------------===//
 299
 300 Optimize this into something reasonable:
 301  x * copysign(1.0, y) * copysign(1.0, z)
 302
 303 //===---------------------------------------------------------------------===//
 304
 305 Optimize copysign(x, *y) to use an integer load from y.
 306
 307 //===---------------------------------------------------------------------===//
 308
 309 %X = weak global int 0
 310
 311 void %foo(int %N) {
 312         %N = cast int %N to uint
 313         %tmp.24 = setgt int %N, 0
 314         br bool %tmp.24, label %no_exit, label %return
 315
 316 no_exit:
 317         %indvar = phi uint [ 0, %entry ], [ %indvar.next, %no_exit ]
 318         %i.0.0 = cast uint %indvar to int
 319         volatile store int %i.0.0, int* %X
 320         %indvar.next = add uint %indvar, 1
 321         %exitcond = seteq uint %indvar.next, %N
 322         br bool %exitcond, label %return, label %no_exit
 323
 324 return:
 325         ret void
 326 }
 327
 328 compiles into:
 329
 330         .text
 331         .align  4
 332         .globl  _foo
 333 _foo:
 334         movl 4(%esp), %eax
 335         cmpl $1, %eax
 336         jl LBB_foo_4    # return
 337 LBB_foo_1:      # no_exit.preheader
 338         xorl %ecx, %ecx
 339 LBB_foo_2:      # no_exit
 340         movl L_X$non_lazy_ptr, %edx
 341         movl %ecx, (%edx)
 342         incl %ecx
 343         cmpl %eax, %ecx
 344         jne LBB_foo_2   # no_exit
 345 LBB_foo_3:      # return.loopexit
 346 LBB_foo_4:      # return
 347         ret
 348
 349 We should hoist "movl L_X$non_lazy_ptr, %edx" out of the loop after
 350 remateralization is implemented. This can be accomplished with 1) a target
 351 dependent LICM pass or 2) makeing SelectDAG represent the whole function.
 352
 353 //===---------------------------------------------------------------------===//
 354
 355 The following tests perform worse with LSR:
 356
 357 lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesor.
 358
 359 //===---------------------------------------------------------------------===//
 360
 361 We are generating far worse code than gcc:
 362
 363 volatile short X, Y;
 364
 365 void foo(int N) {
 366   int i;
 367   for (i = 0; i < N; i++) { X = i; Y = i*4; }
 368 }
 369
 370 LBB1_1: #bb.preheader
 371         xorl %ecx, %ecx
 372         xorw %dx, %dx
 373 LBB1_2: #bb
 374         movl L_X$non_lazy_ptr, %esi
 375         movw %dx, (%esi)
 376         movw %dx, %si
 377         shlw $2, %si
 378         movl L_Y$non_lazy_ptr, %edi
 379         movw %si, (%edi)
 380         incl %ecx
 381         incw %dx
 382         cmpl %eax, %ecx
 383         jne LBB1_2      #bb
 384
 385 vs.
 386
 387         xorl    %edx, %edx
 388         movl    L_X$non_lazy_ptr-"L00000000001$pb"(%ebx), %esi
 389         movl    L_Y$non_lazy_ptr-"L00000000001$pb"(%ebx), %ecx
 390 L4:
 391         movw    %dx, (%esi)
 392         leal    0(,%edx,4), %eax
 393         movw    %ax, (%ecx)
 394         addl    $1, %edx
 395         cmpl    %edx, %edi
 396         jne     L4
 397
 398 There are 3 issues:
 399
 400 1. Lack of post regalloc LICM.
 401 2. LSR unable to reused IV for a different type (i16 vs. i32) even though
 402    the cast would be free.
 403
 404 //===---------------------------------------------------------------------===//
 405
 406 Teach the coalescer to coalesce vregs of different register classes. e.g. FR32 /
 407 FR64 to VR128.
 408
 409 //===---------------------------------------------------------------------===//
 410
 411 mov $reg, 48(%esp)
 412 ...
 413 leal 48(%esp), %eax
 414 mov %eax, (%esp)
 415 call _foo
 416
 417 Obviously it would have been better for the first mov (or any op) to store
 418 directly %esp[0] if there are no other uses.
 419
 420 //===---------------------------------------------------------------------===//
 421
 422 Adding to the list of cmp / test poor codegen issues:
 423
 424 int test(__m128 *A, __m128 *B) {
 425   if (_mm_comige_ss(*A, *B))
 426     return 3;
 427   else
 428     return 4;
 429 }
 430
 431 _test:
 432         movl 8(%esp), %eax
 433         movaps (%eax), %xmm0
 434         movl 4(%esp), %eax
 435         movaps (%eax), %xmm1
 436         comiss %xmm0, %xmm1
 437         setae %al
 438         movzbl %al, %ecx
 439         movl $3, %eax
 440         movl $4, %edx
 441         cmpl $0, %ecx
 442         cmove %edx, %eax
 443         ret
 444
 445 Note the setae, movzbl, cmpl, cmove can be replaced with a single cmovae. There
 446 are a number of issues. 1) We are introducing a setcc between the result of the
 447 intrisic call and select. 2) The intrinsic is expected to produce a i32 value
 448 so a any extend (which becomes a zero extend) is added.
 449
 450 We probably need some kind of target DAG combine hook to fix this.
 451
 452 //===---------------------------------------------------------------------===//
 453
 454 We generate significantly worse code for this than GCC:
 455 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150
 456 http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701
 457
 458 There is also one case we do worse on PPC.
 459
 460 //===---------------------------------------------------------------------===//
 461
 462 If shorter, we should use things like:
 463 movzwl %ax, %eax
 464 instead of:
 465 andl $65535, %EAX
 466
 467 The former can also be used when the two-addressy nature of the 'and' would
 468 require a copy to be inserted (in X86InstrInfo::convertToThreeAddress).
 469
 470 //===---------------------------------------------------------------------===//
 471
 472 Consider this:
 473
 474 typedef struct pair { float A, B; } pair;
 475 void pairtest(pair P, float *FP) {
 476         *FP = P.A+P.B;
 477 }
 478
 479 We currently generate this code with llvmgcc4:
 480
 481 _pairtest:
 482         movl 8(%esp), %eax
 483         movl 4(%esp), %ecx
 484         movd %eax, %xmm0
 485         movd %ecx, %xmm1
 486         addss %xmm0, %xmm1
 487         movl 12(%esp), %eax
 488         movss %xmm1, (%eax)
 489         ret
 490
 491 we should be able to generate:
 492 _pairtest:
 493         movss 4(%esp), %xmm0
 494         movl 12(%esp), %eax
 495         addss 8(%esp), %xmm0
 496         movss %xmm0, (%eax)
 497         ret
 498
 499 The issue is that llvmgcc4 is forcing the struct to memory, then passing it as
 500 integer chunks.  It does this so that structs like {short,short} are passed in
 501 a single 32-bit integer stack slot.  We should handle the safe cases above much
 502 nicer, while still handling the hard cases.
 503
 504 While true in general, in this specific case we could do better by promoting
 505 load int + bitcast to float -> load fload.  This basically needs alignment info,
 506 the code is already implemented (but disabled) in dag combine).
 507
 508 //===---------------------------------------------------------------------===//
 509
 510 Another instruction selector deficiency:
 511
 512 void %bar() {
 513         %tmp = load int (int)** %foo
 514         %tmp = tail call int %tmp( int 3 )
 515         ret void
 516 }
 517
 518 _bar:
 519         subl $12, %esp
 520         movl L_foo$non_lazy_ptr, %eax
 521         movl (%eax), %eax
 522         call *%eax
 523         addl $12, %esp
 524         ret
 525
 526 The current isel scheme will not allow the load to be folded in the call since
 527 the load's chain result is read by the callseq_start.
 528
 529 //===---------------------------------------------------------------------===//
 530
 531 For this:
 532
 533 int test(int a)
 534 {
 535   return a * 3;
 536 }
 537
 538 We currently emits
 539         imull $3, 4(%esp), %eax
 540
 541 Perhaps this is what we really should generate is? Is imull three or four
 542 cycles? Note: ICC generates this:
 543         movl    4(%esp), %eax
 544         leal    (%eax,%eax,2), %eax
 545
 546 The current instruction priority is based on pattern complexity. The former is
 547 more "complex" because it folds a load so the latter will not be emitted.
 548
 549 Perhaps we should use AddedComplexity to give LEA32r a higher priority? We
 550 should always try to match LEA first since the LEA matching code does some
 551 estimate to determine whether the match is profitable.
 552
 553 However, if we care more about code size, then imull is better. It's two bytes
 554 shorter than movl + leal.
 555
 556 //===---------------------------------------------------------------------===//
 557
 558 Implement CTTZ, CTLZ with bsf and bsr. GCC produces:
 559
 560 int ctz_(unsigned X) { return __builtin_ctz(X); }
 561 int clz_(unsigned X) { return __builtin_clz(X); }
 562 int ffs_(unsigned X) { return __builtin_ffs(X); }
 563
 564 _ctz_:
 565         bsfl    4(%esp), %eax
 566         ret
 567 _clz_:
 568         bsrl    4(%esp), %eax
 569         xorl    $31, %eax
 570         ret
 571 _ffs_:
 572         movl    $-1, %edx
 573         bsfl    4(%esp), %eax
 574         cmove   %edx, %eax
 575         addl    $1, %eax
 576         ret
 577
 578 //===---------------------------------------------------------------------===//
 579
 580 It appears gcc place string data with linkonce linkage in
 581 .section __TEXT,__const_coal,coalesced instead of
 582 .section __DATA,__const_coal,coalesced.
 583 Take a look at darwin.h, there are other Darwin assembler directives that we
 584 do not make use of.
 585
 586 //===---------------------------------------------------------------------===//
 587
 588 int %foo(int* %a, int %t) {
 589 entry:
 590         br label %cond_true
 591
 592 cond_true:              ; preds = %cond_true, %entry
 593         %x.0.0 = phi int [ 0, %entry ], [ %tmp9, %cond_true ]
 594         %t_addr.0.0 = phi int [ %t, %entry ], [ %tmp7, %cond_true ]
 595         %tmp2 = getelementptr int* %a, int %x.0.0
 596         %tmp3 = load int* %tmp2         ; <int> [#uses=1]
 597         %tmp5 = add int %t_addr.0.0, %x.0.0             ; <int> [#uses=1]
 598         %tmp7 = add int %tmp5, %tmp3            ; <int> [#uses=2]
 599         %tmp9 = add int %x.0.0, 1               ; <int> [#uses=2]
 600         %tmp = setgt int %tmp9, 39              ; <bool> [#uses=1]
 601         br bool %tmp, label %bb12, label %cond_true
 602
 603 bb12:           ; preds = %cond_true
 604         ret int %tmp7
 605 }
 606
 607 is pessimized by -loop-reduce and -indvars
 608
 609 //===---------------------------------------------------------------------===//
 610
 611 u32 to float conversion improvement:
 612
 613 float uint32_2_float( unsigned u ) {
 614   float fl = (int) (u & 0xffff);
 615   float fh = (int) (u >> 16);
 616   fh *= 0x1.0p16f;
 617   return fh + fl;
 618 }
 619
 620 00000000        subl    $0x04,%esp
 621 00000003        movl    0x08(%esp,1),%eax
 622 00000007        movl    %eax,%ecx
 623 00000009        shrl    $0x10,%ecx
 624 0000000c        cvtsi2ss        %ecx,%xmm0
 625 00000010        andl    $0x0000ffff,%eax
 626 00000015        cvtsi2ss        %eax,%xmm1
 627 00000019        mulss   0x00000078,%xmm0
 628 00000021        addss   %xmm1,%xmm0
 629 00000025        movss   %xmm0,(%esp,1)
 630 0000002a        flds    (%esp,1)
 631 0000002d        addl    $0x04,%esp
 632 00000030        ret
 633
 634 //===---------------------------------------------------------------------===//
 635
 636 When using fastcc abi, align stack slot of argument of type double on 8 byte
 637 boundary to improve performance.
 638
 639 //===---------------------------------------------------------------------===//
 640
 641 Codegen:
 642
 643 int f(int a, int b) {
 644   if (a == 4 || a == 6)
 645     b++;
 646   return b;
 647 }
 648
 649
 650 as:
 651
 652 or eax, 2
 653 cmp eax, 6
 654 jz label
 655
 656 //===---------------------------------------------------------------------===//
 657
 658 GCC's ix86_expand_int_movcc function (in i386.c) has a ton of interesting
 659 simplifications for integer "x cmp y ? a : b".  For example, instead of:
 660
 661 int G;
 662 void f(int X, int Y) {
 663   G = X < 0 ? 14 : 13;
 664 }
 665
 666 compiling to:
 667
 668 _f:
 669         movl $14, %eax
 670         movl $13, %ecx
 671         movl 4(%esp), %edx
 672         testl %edx, %edx
 673         cmovl %eax, %ecx
 674         movl %ecx, _G
 675         ret
 676
 677 it could be:
 678 _f:
 679         movl    4(%esp), %eax
 680         sarl    $31, %eax
 681         notl    %eax
 682         addl    $14, %eax
 683         movl    %eax, _G
 684         ret
 685
 686 etc.
 687
 688 //===---------------------------------------------------------------------===//
 689
 690 Currently we don't have elimination of redundant stack manipulations. Consider
 691 the code:
 692
 693 int %main() {
 694 entry:
 695         call fastcc void %test1( )
 696         call fastcc void %test2( sbyte* cast (void ()* %test1 to sbyte*) )
 697         ret int 0
 698 }
 699
 700 declare fastcc void %test1()
 701
 702 declare fastcc void %test2(sbyte*)
 703
 704
 705 This currently compiles to:
 706
 707         subl $16, %esp
 708         call _test5
 709         addl $12, %esp
 710         subl $16, %esp
 711         movl $_test5, (%esp)
 712         call _test6
 713         addl $12, %esp
 714
 715 The add\sub pair is really unneeded here.
 716
 717 //===---------------------------------------------------------------------===//
 718
 719 We currently compile sign_extend_inreg into two shifts:
 720
 721 long foo(long X) {
 722   return (long)(signed char)X;
 723 }
 724
 725 becomes:
 726
 727 _foo:
 728         movl 4(%esp), %eax
 729         shll $24, %eax
 730         sarl $24, %eax
 731         ret
 732
 733 This could be:
 734
 735 _foo:
 736         movsbl  4(%esp),%eax
 737         ret
 738
 739 //===---------------------------------------------------------------------===//
 740
 741 Consider the expansion of:
 742
 743 uint %test3(uint %X) {
 744         %tmp1 = rem uint %X, 255
 745         ret uint %tmp1
 746 }
 747
 748 Currently it compiles to:
 749
 750 ...
 751         movl $2155905153, %ecx
 752         movl 8(%esp), %esi
 753         movl %esi, %eax
 754         mull %ecx
 755 ...
 756
 757 This could be "reassociated" into:
 758
 759         movl $2155905153, %eax
 760         movl 8(%esp), %ecx
 761         mull %ecx
 762
 763 to avoid the copy.  In fact, the existing two-address stuff would do this
 764 except that mul isn't a commutative 2-addr instruction.  I guess this has
 765 to be done at isel time based on the #uses to mul?
 766
 767 //===---------------------------------------------------------------------===//
 768
 769 Make sure the instruction which starts a loop does not cross a cacheline
 770 boundary. This requires knowning the exact length of each machine instruction.
 771 That is somewhat complicated, but doable. Example 256.bzip2:
 772
 773 In the new trace, the hot loop has an instruction which crosses a cacheline
 774 boundary.  In addition to potential cache misses, this can't help decoding as I
 775 imagine there has to be some kind of complicated decoder reset and realignment
 776 to grab the bytes from the next cacheline.
 777
 778 532  532 0x3cfc movb     (1809(%esp, %esi), %bl   <<<--- spans 2 64 byte lines
 779 942  942 0x3d03 movl     %dh, (1809(%esp, %esi)
 780 937  937 0x3d0a incl     %esi
 781 3    3   0x3d0b cmpb     %bl, %dl
 782 27   27  0x3d0d jnz      0x000062db <main+11707>
 783
 784 //===---------------------------------------------------------------------===//
 785
 786 In c99 mode, the preprocessor doesn't like assembly comments like #TRUNCATE.
 787
 788 //===---------------------------------------------------------------------===//
 789
 790 This could be a single 16-bit load.
 791
 792 int f(char *p) {
 793     if ((p[0] == 1) & (p[1] == 2)) return 1;
 794     return 0;
 795 }
 796
 797 //===---------------------------------------------------------------------===//
 798
 799 We should inline lrintf and probably other libc functions.
 800
 801 //===---------------------------------------------------------------------===//
 802
 803 Start using the flags more.  For example, compile:
 804
 805 int add_zf(int *x, int y, int a, int b) {
 806      if ((*x += y) == 0)
 807           return a;
 808      else
 809           return b;
 810 }
 811
 812 to:
 813        addl    %esi, (%rdi)
 814        movl    %edx, %eax
 815        cmovne  %ecx, %eax
 816        ret
 817 instead of:
 818
 819 _add_zf:
 820         addl (%rdi), %esi
 821         movl %esi, (%rdi)
 822         testl %esi, %esi
 823         cmove %edx, %ecx
 824         movl %ecx, %eax
 825         ret
 826
 827 and:
 828
 829 int add_zf(int *x, int y, int a, int b) {
 830      if ((*x + y) < 0)
 831           return a;
 832      else
 833           return b;
 834 }
 835
 836 to:
 837
 838 add_zf:
 839         addl    (%rdi), %esi
 840         movl    %edx, %eax
 841         cmovns  %ecx, %eax
 842         ret
 843
 844 instead of:
 845
 846 _add_zf:
 847         addl (%rdi), %esi
 848         testl %esi, %esi
 849         cmovs %edx, %ecx
 850         movl %ecx, %eax
 851         ret
 852
 853 //===---------------------------------------------------------------------===//
 854
 855 This:
 856 #include <math.h>
 857 int foo(double X) { return isnan(X); }
 858
 859 compiles to (-m64):
 860
 861 _foo:
 862         pxor %xmm1, %xmm1
 863         ucomisd %xmm1, %xmm0
 864         setp %al
 865         movzbl %al, %eax
 866         ret
 867
 868 the pxor is not needed, we could compare the value against itself.
 869
 870 //===---------------------------------------------------------------------===//
 871
 872 These two functions have identical effects:
 873
 874 unsigned int f(unsigned int i, unsigned int n) {++i; if (i == n) ++i; return i;}
 875 unsigned int f2(unsigned int i, unsigned int n) {++i; i += i == n; return i;}
 876
 877 We currently compile them to:
 878
 879 _f:
 880         movl 4(%esp), %eax
 881         movl %eax, %ecx
 882         incl %ecx
 883         movl 8(%esp), %edx
 884         cmpl %edx, %ecx
 885         jne LBB1_2      #UnifiedReturnBlock
 886 LBB1_1: #cond_true
 887         addl $2, %eax
 888         ret
 889 LBB1_2: #UnifiedReturnBlock
 890         movl %ecx, %eax
 891         ret
 892 _f2:
 893         movl 4(%esp), %eax
 894         movl %eax, %ecx
 895         incl %ecx
 896         cmpl 8(%esp), %ecx
 897         sete %cl
 898         movzbl %cl, %ecx
 899         leal 1(%ecx,%eax), %eax
 900         ret
 901
 902 both of which are inferior to GCC's:
 903
 904 _f:
 905         movl    4(%esp), %edx
 906         leal    1(%edx), %eax
 907         addl    $2, %edx
 908         cmpl    8(%esp), %eax
 909         cmove   %edx, %eax
 910         ret
 911 _f2:
 912         movl    4(%esp), %eax
 913         addl    $1, %eax
 914         xorl    %edx, %edx
 915         cmpl    8(%esp), %eax
 916         sete    %dl
 917         addl    %edx, %eax
 918         ret
 919
 920 //===---------------------------------------------------------------------===//
 921
 922 This code:
 923
 924 void test(int X) {
 925   if (X) abort();
 926 }
 927
 928 is currently compiled to:
 929
 930 _test:
 931         subl $12, %esp
 932         cmpl $0, 16(%esp)
 933         jne LBB1_1
 934         addl $12, %esp
 935         ret
 936 LBB1_1:
 937         call L_abort$stub
 938
 939 It would be better to produce:
 940
 941 _test:
 942         subl $12, %esp
 943         cmpl $0, 16(%esp)
 944         jne L_abort$stub
 945         addl $12, %esp
 946         ret
 947
 948 This can be applied to any no-return function call that takes no arguments etc.
 949 Alternatively, the stack save/restore logic could be shrink-wrapped, producing
 950 something like this:
 951
 952 _test:
 953         cmpl $0, 4(%esp)
 954         jne LBB1_1
 955         ret
 956 LBB1_1:
 957         subl $12, %esp
 958         call L_abort$stub
 959
 960 Both are useful in different situations.  Finally, it could be shrink-wrapped
 961 and tail called, like this:
 962
 963 _test:
 964         cmpl $0, 4(%esp)
 965         jne LBB1_1
 966         ret
 967 LBB1_1:
 968         pop %eax   # realign stack.
 969         call L_abort$stub
 970
 971 Though this probably isn't worth it.
 972
 973 //===---------------------------------------------------------------------===//
 974
 975 We need to teach the codegen to convert two-address INC instructions to LEA
 976 when the flags are dead (likewise dec).  For example, on X86-64, compile:
 977
 978 int foo(int A, int B) {
 979   return A+1;
 980 }
 981
 982 to:
 983
 984 _foo:
 985         leal    1(%edi), %eax
 986         ret
 987
 988 instead of:
 989
 990 _foo:
 991         incl %edi
 992         movl %edi, %eax
 993         ret
 994
 995 Another example is:
 996
 997 ;; X's live range extends beyond the shift, so the register allocator
 998 ;; cannot coalesce it with Y.  Because of this, a copy needs to be
 999 ;; emitted before the shift to save the register value before it is
1000 ;; clobbered.  However, this copy is not needed if the register
1001 ;; allocator turns the shift into an LEA.  This also occurs for ADD.
1002
1003 ; Check that the shift gets turned into an LEA.
1004 ; RUN: llvm-upgrade < %s | llvm-as | llc -march=x86 -x86-asm-syntax=intel | \
1005 ; RUN:   not grep {mov E.X, E.X}
1006
1007 %G = external global int
1008
1009 int %test1(int %X, int %Y) {
1010         %Z = add int %X, %Y
1011         volatile store int %Y, int* %G
1012         volatile store int %Z, int* %G
1013         ret int %X
1014 }
1015
1016 int %test2(int %X) {
1017         %Z = add int %X, 1  ;; inc
1018         volatile store int %Z, int* %G
1019         ret int %X
1020 }
1021
1022 //===---------------------------------------------------------------------===//
1023
1024 This:
1025 #include <xmmintrin.h>
1026 unsigned test(float f) {
1027  return _mm_cvtsi128_si32( (__m128i) _mm_set_ss( f ));
1028 }
1029
1030 Compiles to:
1031 _test:
1032         movss 4(%esp), %xmm0
1033         movd %xmm0, %eax
1034         ret
1035
1036 it should compile to a move from the stack slot directly into eax.  DAGCombine
1037 has this xform, but it is currently disabled until the alignment fields of
1038 the load/store nodes are trustworthy.
1039
1040 //===---------------------------------------------------------------------===//
1041
1042 Sometimes it is better to codegen subtractions from a constant (e.g. 7-x) with
1043 a neg instead of a sub instruction.  Consider:
1044
1045 int test(char X) { return 7-X; }
1046
1047 we currently produce:
1048 _test:
1049         movl $7, %eax
1050         movsbl 4(%esp), %ecx
1051         subl %ecx, %eax
1052         ret
1053
1054 We would use one fewer register if codegen'd as:
1055
1056         movsbl 4(%esp), %eax
1057         neg %eax
1058         add $7, %eax
1059         ret
1060
1061 Note that this isn't beneficial if the load can be folded into the sub.  In
1062 this case, we want a sub:
1063
1064 int test(int X) { return 7-X; }
1065 _test:
1066         movl $7, %eax
1067         subl 4(%esp), %eax
1068         ret
1069
1070 //===---------------------------------------------------------------------===//
1071
1072 For code like:
1073 phi (undef, x)
1074
1075 We get an implicit def on the undef side. If the phi is spilled, we then get:
1076 implicitdef xmm1
1077 store xmm1 -> stack
1078
1079 It should be possible to teach the x86 backend to "fold" the store into the
1080 implicitdef, which just deletes the implicit def.
1081
1082 These instructions should go away:
1083 #IMPLICIT_DEF %xmm1
1084 movaps %xmm1, 192(%esp)
1085 movaps %xmm1, 224(%esp)
1086 movaps %xmm1, 176(%esp)
1087
1088 //===---------------------------------------------------------------------===//
1089
1090 This is a "commutable two-address" register coallescing deficiency:
1091
1092 define <4 x float> @test1(<4 x float> %V) {
1093 entry:
1094         %tmp8 = shufflevector <4 x float> %V, <4 x float> undef,
1095                                         <4 x i32> < i32 3, i32 2, i32 1, i32 0 >
1096         %add = add <4 x float> %tmp8, %V
1097         ret <4 x float> %add
1098 }
1099
1100 this codegens to:
1101
1102 _test1:
1103         pshufd  $27, %xmm0, %xmm1
1104         addps   %xmm0, %xmm1
1105         movaps  %xmm1, %xmm0
1106         ret
1107
1108 instead of:
1109
1110 _test1:
1111         pshufd  $27, %xmm0, %xmm1
1112         addps   %xmm1, %xmm0
1113         ret
1114
1115 //===---------------------------------------------------------------------===//
1116
1117 Leaf functions that require one 4-byte spill slot have a prolog like this:
1118
1119 _foo:
1120         pushl   %esi
1121         subl    $4, %esp
1122 ...
1123 and an epilog like this:
1124         addl    $4, %esp
1125         popl    %esi
1126         ret
1127
1128 It would be smaller, and potentially faster, to push eax on entry and to
1129 pop into a dummy register instead of using addl/subl of esp.  Just don't pop
1130 into any return registers :)
1131
1132 //===---------------------------------------------------------------------===//
1133
1134 The X86 backend should fold (branch (or (setcc, setcc))) into multiple
1135 branches.  We generate really poor code for:
1136
1137 double testf(double a) {
1138        return a == 0.0 ? 0.0 : (a > 0.0 ? 1.0 : -1.0);
1139 }
1140
1141 For example, the entry BB is:
1142
1143 _testf:
1144         subl    $20, %esp
1145         pxor    %xmm0, %xmm0
1146         movsd   24(%esp), %xmm1
1147         ucomisd %xmm0, %xmm1
1148         setnp   %al
1149         sete    %cl
1150         testb   %cl, %al
1151         jne     LBB1_5  # UnifiedReturnBlock
1152 LBB1_1: # cond_true
1153
1154
1155 it would be better to replace the last four instructions with:
1156
1157         jp LBB1_1
1158         je LBB1_5
1159 LBB1_1:
1160
1161 We also codegen the inner ?: into a diamond:
1162
1163        cvtss2sd        LCPI1_0(%rip), %xmm2
1164         cvtss2sd        LCPI1_1(%rip), %xmm3
1165         ucomisd %xmm1, %xmm0
1166         ja      LBB1_3  # cond_true
1167 LBB1_2: # cond_true
1168         movapd  %xmm3, %xmm2
1169 LBB1_3: # cond_true
1170         movapd  %xmm2, %xmm0
1171         ret
1172
1173 We should sink the load into xmm3 into the LBB1_2 block.  This should
1174 be pretty easy, and will nuke all the copies.
1175
1176 //===---------------------------------------------------------------------===//
1177
1178 This:
1179         #include <algorithm>
1180         inline std::pair<unsigned, bool> full_add(unsigned a, unsigned b)
1181         { return std::make_pair(a + b, a + b < a); }
1182         bool no_overflow(unsigned a, unsigned b)
1183         { return !full_add(a, b).second; }
1184
1185 Should compile to:
1186
1187
1188         _Z11no_overflowjj:
1189                 addl    %edi, %esi
1190                 setae   %al
1191                 ret
1192
1193 on x86-64, not:
1194
1195 __Z11no_overflowjj:
1196         addl    %edi, %esi
1197         cmpl    %edi, %esi
1198         setae   %al
1199         movzbl  %al, %eax
1200         ret
1201
1202
1203 //===---------------------------------------------------------------------===//
1204
1205 Re-materialize MOV32r0 etc. with xor instead of changing them to moves if the
1206 condition register is dead. xor reg reg is shorter than mov reg, #0.
1207
1208 //===---------------------------------------------------------------------===//
1209
1210 We aren't matching RMW instructions aggressively
1211 enough.  Here's a reduced testcase (more in PR1160):
1212
1213 define void @test(i32* %huge_ptr, i32* %target_ptr) {
1214         %A = load i32* %huge_ptr                ; <i32> [#uses=1]
1215         %B = load i32* %target_ptr              ; <i32> [#uses=1]
1216         %C = or i32 %A, %B              ; <i32> [#uses=1]
1217         store i32 %C, i32* %target_ptr
1218         ret void
1219 }
1220
1221 $ llvm-as < t.ll | llc -march=x86-64
1222
1223 _test:
1224         movl (%rdi), %eax
1225         orl (%rsi), %eax
1226         movl %eax, (%rsi)
1227         ret
1228
1229 That should be something like:
1230
1231 _test:
1232         movl (%rdi), %eax
1233         orl %eax, (%rsi)
1234         ret
1235
1236 //===---------------------------------------------------------------------===//
1237
1238 The following code:
1239
1240 bb114.preheader:                ; preds = %cond_next94
1241         %tmp231232 = sext i16 %tmp62 to i32             ; <i32> [#uses=1]
1242         %tmp233 = sub i32 32, %tmp231232                ; <i32> [#uses=1]
1243         %tmp245246 = sext i16 %tmp65 to i32             ; <i32> [#uses=1]
1244         %tmp252253 = sext i16 %tmp68 to i32             ; <i32> [#uses=1]
1245         %tmp254 = sub i32 32, %tmp252253                ; <i32> [#uses=1]
1246         %tmp553554 = bitcast i16* %tmp37 to i8*         ; <i8*> [#uses=2]
1247         %tmp583584 = sext i16 %tmp98 to i32             ; <i32> [#uses=1]
1248         %tmp585 = sub i32 32, %tmp583584                ; <i32> [#uses=1]
1249         %tmp614615 = sext i16 %tmp101 to i32            ; <i32> [#uses=1]
1250         %tmp621622 = sext i16 %tmp104 to i32            ; <i32> [#uses=1]
1251         %tmp623 = sub i32 32, %tmp621622                ; <i32> [#uses=1]
1252         br label %bb114
1253
1254 produces:
1255
1256 LBB3_5: # bb114.preheader
1257         movswl  -68(%ebp), %eax
1258         movl    $32, %ecx
1259         movl    %ecx, -80(%ebp)
1260         subl    %eax, -80(%ebp)
1261         movswl  -52(%ebp), %eax
1262         movl    %ecx, -84(%ebp)
1263         subl    %eax, -84(%ebp)
1264         movswl  -70(%ebp), %eax
1265         movl    %ecx, -88(%ebp)
1266         subl    %eax, -88(%ebp)
1267         movswl  -50(%ebp), %eax
1268         subl    %eax, %ecx
1269         movl    %ecx, -76(%ebp)
1270         movswl  -42(%ebp), %eax
1271         movl    %eax, -92(%ebp)
1272         movswl  -66(%ebp), %eax
1273         movl    %eax, -96(%ebp)
1274         movw    $0, -98(%ebp)
1275
1276 This appears to be bad because the RA is not folding the store to the stack
1277 slot into the movl.  The above instructions could be:
1278         movl    $32, -80(%ebp)
1279 ...
1280         movl    $32, -84(%ebp)
1281 ...
1282 This seems like a cross between remat and spill folding.
1283
1284 This has redundant subtractions of %eax from a stack slot. However, %ecx doesn't
1285 change, so we could simply subtract %eax from %ecx first and then use %ecx (or
1286 vice-versa).
1287
1288 //===---------------------------------------------------------------------===//
1289
1290 For this code:
1291
1292 cond_next603:           ; preds = %bb493, %cond_true336, %cond_next599
1293         %v.21050.1 = phi i32 [ %v.21050.0, %cond_next599 ], [ %tmp344, %cond_true336 ], [ %v.2, %bb493 ]                ; <i32> [#uses=1]
1294         %maxz.21051.1 = phi i32 [ %maxz.21051.0, %cond_next599 ], [ 0, %cond_true336 ], [ %maxz.2, %bb493 ]             ; <i32> [#uses=2]
1295         %cnt.01055.1 = phi i32 [ %cnt.01055.0, %cond_next599 ], [ 0, %cond_true336 ], [ %cnt.0, %bb493 ]                ; <i32> [#uses=2]
1296         %byteptr.9 = phi i8* [ %byteptr.12, %cond_next599 ], [ %byteptr.0, %cond_true336 ], [ %byteptr.10, %bb493 ]             ; <i8*> [#uses=9]
1297         %bitptr.6 = phi i32 [ %tmp5571104.1, %cond_next599 ], [ %tmp4921049, %cond_true336 ], [ %bitptr.7, %bb493 ]             ; <i32> [#uses=4]
1298         %source.5 = phi i32 [ %tmp602, %cond_next599 ], [ %source.0, %cond_true336 ], [ %source.6, %bb493 ]             ; <i32> [#uses=7]
1299         %tmp606 = getelementptr %struct.const_tables* @tables, i32 0, i32 0, i32 %cnt.01055.1           ; <i8*> [#uses=1]
1300         %tmp607 = load i8* %tmp606, align 1             ; <i8> [#uses=1]
1301
1302 We produce this:
1303
1304 LBB4_70:        # cond_next603
1305         movl    -20(%ebp), %esi
1306         movl    L_tables$non_lazy_ptr-"L4$pb"(%esi), %esi
1307
1308 However, ICC caches this information before the loop and produces this:
1309
1310         movl      88(%esp), %eax                                #481.12
1311
1312 //===---------------------------------------------------------------------===//
1313
1314 This code:
1315
1316         %tmp659 = icmp slt i16 %tmp654, 0               ; <i1> [#uses=1]
1317         br i1 %tmp659, label %cond_true662, label %cond_next715
1318
1319 produces this:
1320
1321         testw   %cx, %cx
1322         movswl  %cx, %esi
1323         jns     LBB4_109        # cond_next715
1324
1325 Shark tells us that using %cx in the testw instruction is sub-optimal. It
1326 suggests using the 32-bit register (which is what ICC uses).
1327
1328 //===---------------------------------------------------------------------===//
1329
1330 rdar://5506677 - We compile this:
1331
1332 define i32 @foo(double %x) {
1333         %x14 = bitcast double %x to i64         ; <i64> [#uses=1]
1334         %tmp713 = trunc i64 %x14 to i32         ; <i32> [#uses=1]
1335         %tmp8 = and i32 %tmp713, 2147483647             ; <i32> [#uses=1]
1336         ret i32 %tmp8
1337 }
1338
1339 to:
1340
1341 _foo:
1342         subl    $12, %esp
1343         fldl    16(%esp)
1344         fstpl   (%esp)
1345         movl    $2147483647, %eax
1346         andl    (%esp), %eax
1347         addl    $12, %esp
1348         #FP_REG_KILL
1349         ret
1350
1351 It would be much better to eliminate the fldl/fstpl by folding the bitcast
1352 into the load SDNode.  That would give us:
1353
1354 _foo:
1355         movl    $2147483647, %eax
1356         andl    4(%esp), %eax
1357         ret
1358
1359 //===---------------------------------------------------------------------===//
1360
1361 We compile this:
1362
1363 void compare (long long foo) {
1364   if (foo < 4294967297LL)
1365     abort();
1366 }
1367
1368 to:
1369
1370 _compare:
1371         subl    $12, %esp
1372         cmpl    $0, 16(%esp)
1373         setne   %al
1374         movzbw  %al, %ax
1375         cmpl    $1, 20(%esp)
1376         setg    %cl
1377         movzbw  %cl, %cx
1378         cmove   %ax, %cx
1379         movw    %cx, %ax
1380         testb   $1, %al
1381         je      LBB1_2  # cond_true
1382
1383 (also really horrible code on ppc).  This is due to the expand code for 64-bit
1384 compares.  GCC produces multiple branches, which is much nicer:
1385
1386 _compare:
1387         pushl   %ebp
1388         movl    %esp, %ebp
1389         subl    $8, %esp
1390         movl    8(%ebp), %eax
1391         movl    12(%ebp), %edx
1392         subl    $1, %edx
1393         jg     L5
1394 L7:
1395         jl      L4
1396         cmpl    $0, %eax
1397         jbe      L4
1398 L5:
1399
1400 //===---------------------------------------------------------------------===//