lib/Target/X86/README.txt

   1 //===---------------------------------------------------------------------===//
   2 // Random ideas for the X86 backend.
   3 //===---------------------------------------------------------------------===//
   4
   5 Missing features:
   6   - Support for SSE4: http://www.intel.com/software/penryn
   7 http://softwarecommunity.intel.com/isn/Downloads/Intel%20SSE4%20Programming%20Reference.pdf
   8   - support for 3DNow!
   9   - weird abis?
  10
  11 //===---------------------------------------------------------------------===//
  12
  13 CodeGen/X86/lea-3.ll:test3 should be a single LEA, not a shift/move.  The X86
  14 backend knows how to three-addressify this shift, but it appears the register
  15 allocator isn't even asking it to do so in this case.  We should investigate
  16 why this isn't happening, it could have significant impact on other important
  17 cases for X86 as well.
  18
  19 //===---------------------------------------------------------------------===//
  20
  21 This should be one DIV/IDIV instruction, not a libcall:
  22
  23 unsigned test(unsigned long long X, unsigned Y) {
  24         return X/Y;
  25 }
  26
  27 This can be done trivially with a custom legalizer.  What about overflow
  28 though?  http://gcc.gnu.org/bugzilla/show_bug.cgi?id=14224
  29
  30 //===---------------------------------------------------------------------===//
  31
  32 Improvements to the multiply -> shift/add algorithm:
  33 http://gcc.gnu.org/ml/gcc-patches/2004-08/msg01590.html
  34
  35 //===---------------------------------------------------------------------===//
  36
  37 Improve code like this (occurs fairly frequently, e.g. in LLVM):
  38 long long foo(int x) { return 1LL << x; }
  39
  40 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01109.html
  41 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01128.html
  42 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html
  43
  44 Another useful one would be  ~0ULL >> X and ~0ULL << X.
  45
  46 One better solution for 1LL << x is:
  47         xorl    %eax, %eax
  48         xorl    %edx, %edx
  49         testb   $32, %cl
  50         sete    %al
  51         setne   %dl
  52         sall    %cl, %eax
  53         sall    %cl, %edx
  54
  55 But that requires good 8-bit subreg support.
  56
  57 64-bit shifts (in general) expand to really bad code.  Instead of using
  58 cmovs, we should expand to a conditional branch like GCC produces.
  59
  60 //===---------------------------------------------------------------------===//
  61
  62 Compile this:
  63 _Bool f(_Bool a) { return a!=1; }
  64
  65 into:
  66         movzbl  %dil, %eax
  67         xorl    $1, %eax
  68         ret
  69
  70 //===---------------------------------------------------------------------===//
  71
  72 Some isel ideas:
  73
  74 1. Dynamic programming based approach when compile time if not an
  75    issue.
  76 2. Code duplication (addressing mode) during isel.
  77 3. Other ideas from "Register-Sensitive Selection, Duplication, and
  78    Sequencing of Instructions".
  79 4. Scheduling for reduced register pressure.  E.g. "Minimum Register
  80    Instruction Sequence Problem: Revisiting Optimal Code Generation for DAGs"
  81    and other related papers.
  82    http://citeseer.ist.psu.edu/govindarajan01minimum.html
  83
  84 //===---------------------------------------------------------------------===//
  85
  86 Should we promote i16 to i32 to avoid partial register update stalls?
  87
  88 //===---------------------------------------------------------------------===//
  89
  90 Leave any_extend as pseudo instruction and hint to register
  91 allocator. Delay codegen until post register allocation.
  92
  93 //===---------------------------------------------------------------------===//
  94
  95 Count leading zeros and count trailing zeros:
  96
  97 int clz(int X) { return __builtin_clz(X); }
  98 int ctz(int X) { return __builtin_ctz(X); }
  99
 100 $ gcc t.c -S -o - -O3  -fomit-frame-pointer -masm=intel
 101 clz:
 102         bsr     %eax, DWORD PTR [%esp+4]
 103         xor     %eax, 31
 104         ret
 105 ctz:
 106         bsf     %eax, DWORD PTR [%esp+4]
 107         ret
 108
 109 however, check that these are defined for 0 and 32.  Our intrinsics are, GCC's
 110 aren't.
 111
 112 Another example (use predsimplify to eliminate a select):
 113
 114 int foo (unsigned long j) {
 115   if (j)
 116     return __builtin_ffs (j) - 1;
 117   else
 118     return 0;
 119 }
 120
 121 //===---------------------------------------------------------------------===//
 122
 123 It appears icc use push for parameter passing. Need to investigate.
 124
 125 //===---------------------------------------------------------------------===//
 126
 127 Only use inc/neg/not instructions on processors where they are faster than
 128 add/sub/xor.  They are slower on the P4 due to only updating some processor
 129 flags.
 130
 131 //===---------------------------------------------------------------------===//
 132
 133 The instruction selector sometimes misses folding a load into a compare.  The
 134 pattern is written as (cmp reg, (load p)).  Because the compare isn't
 135 commutative, it is not matched with the load on both sides.  The dag combiner
 136 should be made smart enough to cannonicalize the load into the RHS of a compare
 137 when it can invert the result of the compare for free.
 138
 139 //===---------------------------------------------------------------------===//
 140
 141 How about intrinsics? An example is:
 142   *res = _mm_mulhi_epu16(*A, _mm_mul_epu32(*B, *C));
 143
 144 compiles to
 145         pmuludq (%eax), %xmm0
 146         movl 8(%esp), %eax
 147         movdqa (%eax), %xmm1
 148         pmulhuw %xmm0, %xmm1
 149
 150 The transformation probably requires a X86 specific pass or a DAG combiner
 151 target specific hook.
 152
 153 //===---------------------------------------------------------------------===//
 154
 155 In many cases, LLVM generates code like this:
 156
 157 _test:
 158         movl 8(%esp), %eax
 159         cmpl %eax, 4(%esp)
 160         setl %al
 161         movzbl %al, %eax
 162         ret
 163
 164 on some processors (which ones?), it is more efficient to do this:
 165
 166 _test:
 167         movl 8(%esp), %ebx
 168         xor  %eax, %eax
 169         cmpl %ebx, 4(%esp)
 170         setl %al
 171         ret
 172
 173 Doing this correctly is tricky though, as the xor clobbers the flags.
 174
 175 //===---------------------------------------------------------------------===//
 176
 177 We should generate bts/btr/etc instructions on targets where they are cheap or
 178 when codesize is important.  e.g., for:
 179
 180 void setbit(int *target, int bit) {
 181     *target |= (1 << bit);
 182 }
 183 void clearbit(int *target, int bit) {
 184     *target &= ~(1 << bit);
 185 }
 186
 187 //===---------------------------------------------------------------------===//
 188
 189 Instead of the following for memset char*, 1, 10:
 190
 191         movl $16843009, 4(%edx)
 192         movl $16843009, (%edx)
 193         movw $257, 8(%edx)
 194
 195 It might be better to generate
 196
 197         movl $16843009, %eax
 198         movl %eax, 4(%edx)
 199         movl %eax, (%edx)
 200         movw al, 8(%edx)
 201
 202 when we can spare a register. It reduces code size.
 203
 204 //===---------------------------------------------------------------------===//
 205
 206 Evaluate what the best way to codegen sdiv X, (2^C) is.  For X/8, we currently
 207 get this:
 208
 209 int %test1(int %X) {
 210         %Y = div int %X, 8
 211         ret int %Y
 212 }
 213
 214 _test1:
 215         movl 4(%esp), %eax
 216         movl %eax, %ecx
 217         sarl $31, %ecx
 218         shrl $29, %ecx
 219         addl %ecx, %eax
 220         sarl $3, %eax
 221         ret
 222
 223 GCC knows several different ways to codegen it, one of which is this:
 224
 225 _test1:
 226         movl    4(%esp), %eax
 227         cmpl    $-1, %eax
 228         leal    7(%eax), %ecx
 229         cmovle  %ecx, %eax
 230         sarl    $3, %eax
 231         ret
 232
 233 which is probably slower, but it's interesting at least :)
 234
 235 //===---------------------------------------------------------------------===//
 236
 237 The first BB of this code:
 238
 239 declare bool %foo()
 240 int %bar() {
 241         %V = call bool %foo()
 242         br bool %V, label %T, label %F
 243 T:
 244         ret int 1
 245 F:
 246         call bool %foo()
 247         ret int 12
 248 }
 249
 250 compiles to:
 251
 252 _bar:
 253         subl $12, %esp
 254         call L_foo$stub
 255         xorb $1, %al
 256         testb %al, %al
 257         jne LBB_bar_2   # F
 258
 259 It would be better to emit "cmp %al, 1" than a xor and test.
 260
 261 //===---------------------------------------------------------------------===//
 262
 263 We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and rep/movsl
 264 We should leave these as libcalls for everything over a much lower threshold,
 265 since libc is hand tuned for medium and large mem ops (avoiding RFO for large
 266 stores, TLB preheating, etc)
 267
 268 //===---------------------------------------------------------------------===//
 269
 270 Optimize this into something reasonable:
 271  x * copysign(1.0, y) * copysign(1.0, z)
 272
 273 //===---------------------------------------------------------------------===//
 274
 275 Optimize copysign(x, *y) to use an integer load from y.
 276
 277 //===---------------------------------------------------------------------===//
 278
 279 %X = weak global int 0
 280
 281 void %foo(int %N) {
 282         %N = cast int %N to uint
 283         %tmp.24 = setgt int %N, 0
 284         br bool %tmp.24, label %no_exit, label %return
 285
 286 no_exit:
 287         %indvar = phi uint [ 0, %entry ], [ %indvar.next, %no_exit ]
 288         %i.0.0 = cast uint %indvar to int
 289         volatile store int %i.0.0, int* %X
 290         %indvar.next = add uint %indvar, 1
 291         %exitcond = seteq uint %indvar.next, %N
 292         br bool %exitcond, label %return, label %no_exit
 293
 294 return:
 295         ret void
 296 }
 297
 298 compiles into:
 299
 300         .text
 301         .align  4
 302         .globl  _foo
 303 _foo:
 304         movl 4(%esp), %eax
 305         cmpl $1, %eax
 306         jl LBB_foo_4    # return
 307 LBB_foo_1:      # no_exit.preheader
 308         xorl %ecx, %ecx
 309 LBB_foo_2:      # no_exit
 310         movl L_X$non_lazy_ptr, %edx
 311         movl %ecx, (%edx)
 312         incl %ecx
 313         cmpl %eax, %ecx
 314         jne LBB_foo_2   # no_exit
 315 LBB_foo_3:      # return.loopexit
 316 LBB_foo_4:      # return
 317         ret
 318
 319 We should hoist "movl L_X$non_lazy_ptr, %edx" out of the loop after
 320 remateralization is implemented. This can be accomplished with 1) a target
 321 dependent LICM pass or 2) makeing SelectDAG represent the whole function.
 322
 323 //===---------------------------------------------------------------------===//
 324
 325 The following tests perform worse with LSR:
 326
 327 lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesor.
 328
 329 //===---------------------------------------------------------------------===//
 330
 331 We are generating far worse code than gcc:
 332
 333 volatile short X, Y;
 334
 335 void foo(int N) {
 336   int i;
 337   for (i = 0; i < N; i++) { X = i; Y = i*4; }
 338 }
 339
 340 LBB1_1: #bb.preheader
 341         xorl %ecx, %ecx
 342         xorw %dx, %dx
 343 LBB1_2: #bb
 344         movl L_X$non_lazy_ptr, %esi
 345         movw %dx, (%esi)
 346         movw %dx, %si
 347         shlw $2, %si
 348         movl L_Y$non_lazy_ptr, %edi
 349         movw %si, (%edi)
 350         incl %ecx
 351         incw %dx
 352         cmpl %eax, %ecx
 353         jne LBB1_2      #bb
 354
 355 vs.
 356
 357         xorl    %edx, %edx
 358         movl    L_X$non_lazy_ptr-"L00000000001$pb"(%ebx), %esi
 359         movl    L_Y$non_lazy_ptr-"L00000000001$pb"(%ebx), %ecx
 360 L4:
 361         movw    %dx, (%esi)
 362         leal    0(,%edx,4), %eax
 363         movw    %ax, (%ecx)
 364         addl    $1, %edx
 365         cmpl    %edx, %edi
 366         jne     L4
 367
 368 There are 3 issues:
 369
 370 1. Lack of post regalloc LICM.
 371 2. LSR unable to reused IV for a different type (i16 vs. i32) even though
 372    the cast would be free.
 373
 374 //===---------------------------------------------------------------------===//
 375
 376 Teach the coalescer to coalesce vregs of different register classes. e.g. FR32 /
 377 FR64 to VR128.
 378
 379 //===---------------------------------------------------------------------===//
 380
 381 mov $reg, 48(%esp)
 382 ...
 383 leal 48(%esp), %eax
 384 mov %eax, (%esp)
 385 call _foo
 386
 387 Obviously it would have been better for the first mov (or any op) to store
 388 directly %esp[0] if there are no other uses.
 389
 390 //===---------------------------------------------------------------------===//
 391
 392 Adding to the list of cmp / test poor codegen issues:
 393
 394 int test(__m128 *A, __m128 *B) {
 395   if (_mm_comige_ss(*A, *B))
 396     return 3;
 397   else
 398     return 4;
 399 }
 400
 401 _test:
 402         movl 8(%esp), %eax
 403         movaps (%eax), %xmm0
 404         movl 4(%esp), %eax
 405         movaps (%eax), %xmm1
 406         comiss %xmm0, %xmm1
 407         setae %al
 408         movzbl %al, %ecx
 409         movl $3, %eax
 410         movl $4, %edx
 411         cmpl $0, %ecx
 412         cmove %edx, %eax
 413         ret
 414
 415 Note the setae, movzbl, cmpl, cmove can be replaced with a single cmovae. There
 416 are a number of issues. 1) We are introducing a setcc between the result of the
 417 intrisic call and select. 2) The intrinsic is expected to produce a i32 value
 418 so a any extend (which becomes a zero extend) is added.
 419
 420 We probably need some kind of target DAG combine hook to fix this.
 421
 422 //===---------------------------------------------------------------------===//
 423
 424 We generate significantly worse code for this than GCC:
 425 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150
 426 http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701
 427
 428 There is also one case we do worse on PPC.
 429
 430 //===---------------------------------------------------------------------===//
 431
 432 If shorter, we should use things like:
 433 movzwl %ax, %eax
 434 instead of:
 435 andl $65535, %EAX
 436
 437 The former can also be used when the two-addressy nature of the 'and' would
 438 require a copy to be inserted (in X86InstrInfo::convertToThreeAddress).
 439
 440 //===---------------------------------------------------------------------===//
 441
 442 Consider this:
 443
 444 typedef struct pair { float A, B; } pair;
 445 void pairtest(pair P, float *FP) {
 446         *FP = P.A+P.B;
 447 }
 448
 449 We currently generate this code with llvmgcc4:
 450
 451 _pairtest:
 452         movl 8(%esp), %eax
 453         movl 4(%esp), %ecx
 454         movd %eax, %xmm0
 455         movd %ecx, %xmm1
 456         addss %xmm0, %xmm1
 457         movl 12(%esp), %eax
 458         movss %xmm1, (%eax)
 459         ret
 460
 461 we should be able to generate:
 462 _pairtest:
 463         movss 4(%esp), %xmm0
 464         movl 12(%esp), %eax
 465         addss 8(%esp), %xmm0
 466         movss %xmm0, (%eax)
 467         ret
 468
 469 The issue is that llvmgcc4 is forcing the struct to memory, then passing it as
 470 integer chunks.  It does this so that structs like {short,short} are passed in
 471 a single 32-bit integer stack slot.  We should handle the safe cases above much
 472 nicer, while still handling the hard cases.
 473
 474 While true in general, in this specific case we could do better by promoting
 475 load int + bitcast to float -> load fload.  This basically needs alignment info,
 476 the code is already implemented (but disabled) in dag combine).
 477
 478 //===---------------------------------------------------------------------===//
 479
 480 Another instruction selector deficiency:
 481
 482 void %bar() {
 483         %tmp = load int (int)** %foo
 484         %tmp = tail call int %tmp( int 3 )
 485         ret void
 486 }
 487
 488 _bar:
 489         subl $12, %esp
 490         movl L_foo$non_lazy_ptr, %eax
 491         movl (%eax), %eax
 492         call *%eax
 493         addl $12, %esp
 494         ret
 495
 496 The current isel scheme will not allow the load to be folded in the call since
 497 the load's chain result is read by the callseq_start.
 498
 499 //===---------------------------------------------------------------------===//
 500
 501 For this:
 502
 503 int test(int a)
 504 {
 505   return a * 3;
 506 }
 507
 508 We currently emits
 509         imull $3, 4(%esp), %eax
 510
 511 Perhaps this is what we really should generate is? Is imull three or four
 512 cycles? Note: ICC generates this:
 513         movl    4(%esp), %eax
 514         leal    (%eax,%eax,2), %eax
 515
 516 The current instruction priority is based on pattern complexity. The former is
 517 more "complex" because it folds a load so the latter will not be emitted.
 518
 519 Perhaps we should use AddedComplexity to give LEA32r a higher priority? We
 520 should always try to match LEA first since the LEA matching code does some
 521 estimate to determine whether the match is profitable.
 522
 523 However, if we care more about code size, then imull is better. It's two bytes
 524 shorter than movl + leal.
 525
 526 //===---------------------------------------------------------------------===//
 527
 528 Implement CTTZ, CTLZ with bsf and bsr. GCC produces:
 529
 530 int ctz_(unsigned X) { return __builtin_ctz(X); }
 531 int clz_(unsigned X) { return __builtin_clz(X); }
 532 int ffs_(unsigned X) { return __builtin_ffs(X); }
 533
 534 _ctz_:
 535         bsfl    4(%esp), %eax
 536         ret
 537 _clz_:
 538         bsrl    4(%esp), %eax
 539         xorl    $31, %eax
 540         ret
 541 _ffs_:
 542         movl    $-1, %edx
 543         bsfl    4(%esp), %eax
 544         cmove   %edx, %eax
 545         addl    $1, %eax
 546         ret
 547
 548 //===---------------------------------------------------------------------===//
 549
 550 It appears gcc place string data with linkonce linkage in
 551 .section __TEXT,__const_coal,coalesced instead of
 552 .section __DATA,__const_coal,coalesced.
 553 Take a look at darwin.h, there are other Darwin assembler directives that we
 554 do not make use of.
 555
 556 //===---------------------------------------------------------------------===//
 557
 558 int %foo(int* %a, int %t) {
 559 entry:
 560         br label %cond_true
 561
 562 cond_true:              ; preds = %cond_true, %entry
 563         %x.0.0 = phi int [ 0, %entry ], [ %tmp9, %cond_true ]
 564         %t_addr.0.0 = phi int [ %t, %entry ], [ %tmp7, %cond_true ]
 565         %tmp2 = getelementptr int* %a, int %x.0.0
 566         %tmp3 = load int* %tmp2         ; <int> [#uses=1]
 567         %tmp5 = add int %t_addr.0.0, %x.0.0             ; <int> [#uses=1]
 568         %tmp7 = add int %tmp5, %tmp3            ; <int> [#uses=2]
 569         %tmp9 = add int %x.0.0, 1               ; <int> [#uses=2]
 570         %tmp = setgt int %tmp9, 39              ; <bool> [#uses=1]
 571         br bool %tmp, label %bb12, label %cond_true
 572
 573 bb12:           ; preds = %cond_true
 574         ret int %tmp7
 575 }
 576
 577 is pessimized by -loop-reduce and -indvars
 578
 579 //===---------------------------------------------------------------------===//
 580
 581 u32 to float conversion improvement:
 582
 583 float uint32_2_float( unsigned u ) {
 584   float fl = (int) (u & 0xffff);
 585   float fh = (int) (u >> 16);
 586   fh *= 0x1.0p16f;
 587   return fh + fl;
 588 }
 589
 590 00000000        subl    $0x04,%esp
 591 00000003        movl    0x08(%esp,1),%eax
 592 00000007        movl    %eax,%ecx
 593 00000009        shrl    $0x10,%ecx
 594 0000000c        cvtsi2ss        %ecx,%xmm0
 595 00000010        andl    $0x0000ffff,%eax
 596 00000015        cvtsi2ss        %eax,%xmm1
 597 00000019        mulss   0x00000078,%xmm0
 598 00000021        addss   %xmm1,%xmm0
 599 00000025        movss   %xmm0,(%esp,1)
 600 0000002a        flds    (%esp,1)
 601 0000002d        addl    $0x04,%esp
 602 00000030        ret
 603
 604 //===---------------------------------------------------------------------===//
 605
 606 When using fastcc abi, align stack slot of argument of type double on 8 byte
 607 boundary to improve performance.
 608
 609 //===---------------------------------------------------------------------===//
 610
 611 Codegen:
 612
 613 int f(int a, int b) {
 614   if (a == 4 || a == 6)
 615     b++;
 616   return b;
 617 }
 618
 619
 620 as:
 621
 622 or eax, 2
 623 cmp eax, 6
 624 jz label
 625
 626 //===---------------------------------------------------------------------===//
 627
 628 GCC's ix86_expand_int_movcc function (in i386.c) has a ton of interesting
 629 simplifications for integer "x cmp y ? a : b".  For example, instead of:
 630
 631 int G;
 632 void f(int X, int Y) {
 633   G = X < 0 ? 14 : 13;
 634 }
 635
 636 compiling to:
 637
 638 _f:
 639         movl $14, %eax
 640         movl $13, %ecx
 641         movl 4(%esp), %edx
 642         testl %edx, %edx
 643         cmovl %eax, %ecx
 644         movl %ecx, _G
 645         ret
 646
 647 it could be:
 648 _f:
 649         movl    4(%esp), %eax
 650         sarl    $31, %eax
 651         notl    %eax
 652         addl    $14, %eax
 653         movl    %eax, _G
 654         ret
 655
 656 etc.
 657
 658 //===---------------------------------------------------------------------===//
 659
 660 Currently we don't have elimination of redundant stack manipulations. Consider
 661 the code:
 662
 663 int %main() {
 664 entry:
 665         call fastcc void %test1( )
 666         call fastcc void %test2( sbyte* cast (void ()* %test1 to sbyte*) )
 667         ret int 0
 668 }
 669
 670 declare fastcc void %test1()
 671
 672 declare fastcc void %test2(sbyte*)
 673
 674
 675 This currently compiles to:
 676
 677         subl $16, %esp
 678         call _test5
 679         addl $12, %esp
 680         subl $16, %esp
 681         movl $_test5, (%esp)
 682         call _test6
 683         addl $12, %esp
 684
 685 The add\sub pair is really unneeded here.
 686
 687 //===---------------------------------------------------------------------===//
 688
 689 We currently compile sign_extend_inreg into two shifts:
 690
 691 long foo(long X) {
 692   return (long)(signed char)X;
 693 }
 694
 695 becomes:
 696
 697 _foo:
 698         movl 4(%esp), %eax
 699         shll $24, %eax
 700         sarl $24, %eax
 701         ret
 702
 703 This could be:
 704
 705 _foo:
 706         movsbl  4(%esp),%eax
 707         ret
 708
 709 //===---------------------------------------------------------------------===//
 710
 711 Consider the expansion of:
 712
 713 uint %test3(uint %X) {
 714         %tmp1 = rem uint %X, 255
 715         ret uint %tmp1
 716 }
 717
 718 Currently it compiles to:
 719
 720 ...
 721         movl $2155905153, %ecx
 722         movl 8(%esp), %esi
 723         movl %esi, %eax
 724         mull %ecx
 725 ...
 726
 727 This could be "reassociated" into:
 728
 729         movl $2155905153, %eax
 730         movl 8(%esp), %ecx
 731         mull %ecx
 732
 733 to avoid the copy.  In fact, the existing two-address stuff would do this
 734 except that mul isn't a commutative 2-addr instruction.  I guess this has
 735 to be done at isel time based on the #uses to mul?
 736
 737 //===---------------------------------------------------------------------===//
 738
 739 Make sure the instruction which starts a loop does not cross a cacheline
 740 boundary. This requires knowning the exact length of each machine instruction.
 741 That is somewhat complicated, but doable. Example 256.bzip2:
 742
 743 In the new trace, the hot loop has an instruction which crosses a cacheline
 744 boundary.  In addition to potential cache misses, this can't help decoding as I
 745 imagine there has to be some kind of complicated decoder reset and realignment
 746 to grab the bytes from the next cacheline.
 747
 748 532  532 0x3cfc movb     (1809(%esp, %esi), %bl   <<<--- spans 2 64 byte lines
 749 942  942 0x3d03 movl     %dh, (1809(%esp, %esi)
 750 937  937 0x3d0a incl     %esi
 751 3    3   0x3d0b cmpb     %bl, %dl
 752 27   27  0x3d0d jnz      0x000062db <main+11707>
 753
 754 //===---------------------------------------------------------------------===//
 755
 756 In c99 mode, the preprocessor doesn't like assembly comments like #TRUNCATE.
 757
 758 //===---------------------------------------------------------------------===//
 759
 760 This could be a single 16-bit load.
 761
 762 int f(char *p) {
 763     if ((p[0] == 1) & (p[1] == 2)) return 1;
 764     return 0;
 765 }
 766
 767 //===---------------------------------------------------------------------===//
 768
 769 We should inline lrintf and probably other libc functions.
 770
 771 //===---------------------------------------------------------------------===//
 772
 773 Start using the flags more.  For example, compile:
 774
 775 int add_zf(int *x, int y, int a, int b) {
 776      if ((*x += y) == 0)
 777           return a;
 778      else
 779           return b;
 780 }
 781
 782 to:
 783        addl    %esi, (%rdi)
 784        movl    %edx, %eax
 785        cmovne  %ecx, %eax
 786        ret
 787 instead of:
 788
 789 _add_zf:
 790         addl (%rdi), %esi
 791         movl %esi, (%rdi)
 792         testl %esi, %esi
 793         cmove %edx, %ecx
 794         movl %ecx, %eax
 795         ret
 796
 797 and:
 798
 799 int add_zf(int *x, int y, int a, int b) {
 800      if ((*x + y) < 0)
 801           return a;
 802      else
 803           return b;
 804 }
 805
 806 to:
 807
 808 add_zf:
 809         addl    (%rdi), %esi
 810         movl    %edx, %eax
 811         cmovns  %ecx, %eax
 812         ret
 813
 814 instead of:
 815
 816 _add_zf:
 817         addl (%rdi), %esi
 818         testl %esi, %esi
 819         cmovs %edx, %ecx
 820         movl %ecx, %eax
 821         ret
 822
 823 //===---------------------------------------------------------------------===//
 824
 825 This:
 826 #include <math.h>
 827 int foo(double X) { return isnan(X); }
 828
 829 compiles to (-m64):
 830
 831 _foo:
 832         pxor %xmm1, %xmm1
 833         ucomisd %xmm1, %xmm0
 834         setp %al
 835         movzbl %al, %eax
 836         ret
 837
 838 the pxor is not needed, we could compare the value against itself.
 839
 840 //===---------------------------------------------------------------------===//
 841
 842 These two functions have identical effects:
 843
 844 unsigned int f(unsigned int i, unsigned int n) {++i; if (i == n) ++i; return i;}
 845 unsigned int f2(unsigned int i, unsigned int n) {++i; i += i == n; return i;}
 846
 847 We currently compile them to:
 848
 849 _f:
 850         movl 4(%esp), %eax
 851         movl %eax, %ecx
 852         incl %ecx
 853         movl 8(%esp), %edx
 854         cmpl %edx, %ecx
 855         jne LBB1_2      #UnifiedReturnBlock
 856 LBB1_1: #cond_true
 857         addl $2, %eax
 858         ret
 859 LBB1_2: #UnifiedReturnBlock
 860         movl %ecx, %eax
 861         ret
 862 _f2:
 863         movl 4(%esp), %eax
 864         movl %eax, %ecx
 865         incl %ecx
 866         cmpl 8(%esp), %ecx
 867         sete %cl
 868         movzbl %cl, %ecx
 869         leal 1(%ecx,%eax), %eax
 870         ret
 871
 872 both of which are inferior to GCC's:
 873
 874 _f:
 875         movl    4(%esp), %edx
 876         leal    1(%edx), %eax
 877         addl    $2, %edx
 878         cmpl    8(%esp), %eax
 879         cmove   %edx, %eax
 880         ret
 881 _f2:
 882         movl    4(%esp), %eax
 883         addl    $1, %eax
 884         xorl    %edx, %edx
 885         cmpl    8(%esp), %eax
 886         sete    %dl
 887         addl    %edx, %eax
 888         ret
 889
 890 //===---------------------------------------------------------------------===//
 891
 892 This code:
 893
 894 void test(int X) {
 895   if (X) abort();
 896 }
 897
 898 is currently compiled to:
 899
 900 _test:
 901         subl $12, %esp
 902         cmpl $0, 16(%esp)
 903         jne LBB1_1
 904         addl $12, %esp
 905         ret
 906 LBB1_1:
 907         call L_abort$stub
 908
 909 It would be better to produce:
 910
 911 _test:
 912         subl $12, %esp
 913         cmpl $0, 16(%esp)
 914         jne L_abort$stub
 915         addl $12, %esp
 916         ret
 917
 918 This can be applied to any no-return function call that takes no arguments etc.
 919 Alternatively, the stack save/restore logic could be shrink-wrapped, producing
 920 something like this:
 921
 922 _test:
 923         cmpl $0, 4(%esp)
 924         jne LBB1_1
 925         ret
 926 LBB1_1:
 927         subl $12, %esp
 928         call L_abort$stub
 929
 930 Both are useful in different situations.  Finally, it could be shrink-wrapped
 931 and tail called, like this:
 932
 933 _test:
 934         cmpl $0, 4(%esp)
 935         jne LBB1_1
 936         ret
 937 LBB1_1:
 938         pop %eax   # realign stack.
 939         call L_abort$stub
 940
 941 Though this probably isn't worth it.
 942
 943 //===---------------------------------------------------------------------===//
 944
 945 We need to teach the codegen to convert two-address INC instructions to LEA
 946 when the flags are dead (likewise dec).  For example, on X86-64, compile:
 947
 948 int foo(int A, int B) {
 949   return A+1;
 950 }
 951
 952 to:
 953
 954 _foo:
 955         leal    1(%edi), %eax
 956         ret
 957
 958 instead of:
 959
 960 _foo:
 961         incl %edi
 962         movl %edi, %eax
 963         ret
 964
 965 Another example is:
 966
 967 ;; X's live range extends beyond the shift, so the register allocator
 968 ;; cannot coalesce it with Y.  Because of this, a copy needs to be
 969 ;; emitted before the shift to save the register value before it is
 970 ;; clobbered.  However, this copy is not needed if the register
 971 ;; allocator turns the shift into an LEA.  This also occurs for ADD.
 972
 973 ; Check that the shift gets turned into an LEA.
 974 ; RUN: llvm-upgrade < %s | llvm-as | llc -march=x86 -x86-asm-syntax=intel | \
 975 ; RUN:   not grep {mov E.X, E.X}
 976
 977 %G = external global int
 978
 979 int %test1(int %X, int %Y) {
 980         %Z = add int %X, %Y
 981         volatile store int %Y, int* %G
 982         volatile store int %Z, int* %G
 983         ret int %X
 984 }
 985
 986 int %test2(int %X) {
 987         %Z = add int %X, 1  ;; inc
 988         volatile store int %Z, int* %G
 989         ret int %X
 990 }
 991
 992 //===---------------------------------------------------------------------===//
 993
 994 This:
 995 #include <xmmintrin.h>
 996 unsigned test(float f) {
 997  return _mm_cvtsi128_si32( (__m128i) _mm_set_ss( f ));
 998 }
 999
1000 Compiles to:
1001 _test:
1002         movss 4(%esp), %xmm0
1003         movd %xmm0, %eax
1004         ret
1005
1006 it should compile to a move from the stack slot directly into eax.  DAGCombine
1007 has this xform, but it is currently disabled until the alignment fields of
1008 the load/store nodes are trustworthy.
1009
1010 //===---------------------------------------------------------------------===//
1011
1012 Sometimes it is better to codegen subtractions from a constant (e.g. 7-x) with
1013 a neg instead of a sub instruction.  Consider:
1014
1015 int test(char X) { return 7-X; }
1016
1017 we currently produce:
1018 _test:
1019         movl $7, %eax
1020         movsbl 4(%esp), %ecx
1021         subl %ecx, %eax
1022         ret
1023
1024 We would use one fewer register if codegen'd as:
1025
1026         movsbl 4(%esp), %eax
1027         neg %eax
1028         add $7, %eax
1029         ret
1030
1031 Note that this isn't beneficial if the load can be folded into the sub.  In
1032 this case, we want a sub:
1033
1034 int test(int X) { return 7-X; }
1035 _test:
1036         movl $7, %eax
1037         subl 4(%esp), %eax
1038         ret
1039
1040 //===---------------------------------------------------------------------===//
1041
1042 For code like:
1043 phi (undef, x)
1044
1045 We get an implicit def on the undef side. If the phi is spilled, we then get:
1046 implicitdef xmm1
1047 store xmm1 -> stack
1048
1049 It should be possible to teach the x86 backend to "fold" the store into the
1050 implicitdef, which just deletes the implicit def.
1051
1052 These instructions should go away:
1053 #IMPLICIT_DEF %xmm1
1054 movaps %xmm1, 192(%esp)
1055 movaps %xmm1, 224(%esp)
1056 movaps %xmm1, 176(%esp)
1057
1058 //===---------------------------------------------------------------------===//
1059
1060 This is a "commutable two-address" register coallescing deficiency:
1061
1062 define <4 x float> @test1(<4 x float> %V) {
1063 entry:
1064         %tmp8 = shufflevector <4 x float> %V, <4 x float> undef,
1065                                         <4 x i32> < i32 3, i32 2, i32 1, i32 0 >
1066         %add = add <4 x float> %tmp8, %V
1067         ret <4 x float> %add
1068 }
1069
1070 this codegens to:
1071
1072 _test1:
1073         pshufd  $27, %xmm0, %xmm1
1074         addps   %xmm0, %xmm1
1075         movaps  %xmm1, %xmm0
1076         ret
1077
1078 instead of:
1079
1080 _test1:
1081         pshufd  $27, %xmm0, %xmm1
1082         addps   %xmm1, %xmm0
1083         ret
1084
1085 //===---------------------------------------------------------------------===//
1086
1087 Leaf functions that require one 4-byte spill slot have a prolog like this:
1088
1089 _foo:
1090         pushl   %esi
1091         subl    $4, %esp
1092 ...
1093 and an epilog like this:
1094         addl    $4, %esp
1095         popl    %esi
1096         ret
1097
1098 It would be smaller, and potentially faster, to push eax on entry and to
1099 pop into a dummy register instead of using addl/subl of esp.  Just don't pop
1100 into any return registers :)
1101
1102 //===---------------------------------------------------------------------===//
1103
1104 The X86 backend should fold (branch (or (setcc, setcc))) into multiple
1105 branches.  We generate really poor code for:
1106
1107 double testf(double a) {
1108        return a == 0.0 ? 0.0 : (a > 0.0 ? 1.0 : -1.0);
1109 }
1110
1111 For example, the entry BB is:
1112
1113 _testf:
1114         subl    $20, %esp
1115         pxor    %xmm0, %xmm0
1116         movsd   24(%esp), %xmm1
1117         ucomisd %xmm0, %xmm1
1118         setnp   %al
1119         sete    %cl
1120         testb   %cl, %al
1121         jne     LBB1_5  # UnifiedReturnBlock
1122 LBB1_1: # cond_true
1123
1124
1125 it would be better to replace the last four instructions with:
1126
1127         jp LBB1_1
1128         je LBB1_5
1129 LBB1_1:
1130
1131 We also codegen the inner ?: into a diamond:
1132
1133        cvtss2sd        LCPI1_0(%rip), %xmm2
1134         cvtss2sd        LCPI1_1(%rip), %xmm3
1135         ucomisd %xmm1, %xmm0
1136         ja      LBB1_3  # cond_true
1137 LBB1_2: # cond_true
1138         movapd  %xmm3, %xmm2
1139 LBB1_3: # cond_true
1140         movapd  %xmm2, %xmm0
1141         ret
1142
1143 We should sink the load into xmm3 into the LBB1_2 block.  This should
1144 be pretty easy, and will nuke all the copies.
1145
1146 //===---------------------------------------------------------------------===//
1147
1148 This:
1149         #include <algorithm>
1150         inline std::pair<unsigned, bool> full_add(unsigned a, unsigned b)
1151         { return std::make_pair(a + b, a + b < a); }
1152         bool no_overflow(unsigned a, unsigned b)
1153         { return !full_add(a, b).second; }
1154
1155 Should compile to:
1156
1157
1158         _Z11no_overflowjj:
1159                 addl    %edi, %esi
1160                 setae   %al
1161                 ret
1162
1163 on x86-64, not:
1164
1165 __Z11no_overflowjj:
1166         addl    %edi, %esi
1167         cmpl    %edi, %esi
1168         setae   %al
1169         movzbl  %al, %eax
1170         ret
1171
1172
1173 //===---------------------------------------------------------------------===//
1174
1175 Re-materialize MOV32r0 etc. with xor instead of changing them to moves if the
1176 condition register is dead. xor reg reg is shorter than mov reg, #0.
1177
1178 //===---------------------------------------------------------------------===//
1179
1180 We aren't matching RMW instructions aggressively
1181 enough.  Here's a reduced testcase (more in PR1160):
1182
1183 define void @test(i32* %huge_ptr, i32* %target_ptr) {
1184         %A = load i32* %huge_ptr                ; <i32> [#uses=1]
1185         %B = load i32* %target_ptr              ; <i32> [#uses=1]
1186         %C = or i32 %A, %B              ; <i32> [#uses=1]
1187         store i32 %C, i32* %target_ptr
1188         ret void
1189 }
1190
1191 $ llvm-as < t.ll | llc -march=x86-64
1192
1193 _test:
1194         movl (%rdi), %eax
1195         orl (%rsi), %eax
1196         movl %eax, (%rsi)
1197         ret
1198
1199 That should be something like:
1200
1201 _test:
1202         movl (%rdi), %eax
1203         orl %eax, (%rsi)
1204         ret
1205
1206 //===---------------------------------------------------------------------===//
1207
1208 The following code:
1209
1210 bb114.preheader:                ; preds = %cond_next94
1211         %tmp231232 = sext i16 %tmp62 to i32             ; <i32> [#uses=1]
1212         %tmp233 = sub i32 32, %tmp231232                ; <i32> [#uses=1]
1213         %tmp245246 = sext i16 %tmp65 to i32             ; <i32> [#uses=1]
1214         %tmp252253 = sext i16 %tmp68 to i32             ; <i32> [#uses=1]
1215         %tmp254 = sub i32 32, %tmp252253                ; <i32> [#uses=1]
1216         %tmp553554 = bitcast i16* %tmp37 to i8*         ; <i8*> [#uses=2]
1217         %tmp583584 = sext i16 %tmp98 to i32             ; <i32> [#uses=1]
1218         %tmp585 = sub i32 32, %tmp583584                ; <i32> [#uses=1]
1219         %tmp614615 = sext i16 %tmp101 to i32            ; <i32> [#uses=1]
1220         %tmp621622 = sext i16 %tmp104 to i32            ; <i32> [#uses=1]
1221         %tmp623 = sub i32 32, %tmp621622                ; <i32> [#uses=1]
1222         br label %bb114
1223
1224 produces:
1225
1226 LBB3_5: # bb114.preheader
1227         movswl  -68(%ebp), %eax
1228         movl    $32, %ecx
1229         movl    %ecx, -80(%ebp)
1230         subl    %eax, -80(%ebp)
1231         movswl  -52(%ebp), %eax
1232         movl    %ecx, -84(%ebp)
1233         subl    %eax, -84(%ebp)
1234         movswl  -70(%ebp), %eax
1235         movl    %ecx, -88(%ebp)
1236         subl    %eax, -88(%ebp)
1237         movswl  -50(%ebp), %eax
1238         subl    %eax, %ecx
1239         movl    %ecx, -76(%ebp)
1240         movswl  -42(%ebp), %eax
1241         movl    %eax, -92(%ebp)
1242         movswl  -66(%ebp), %eax
1243         movl    %eax, -96(%ebp)
1244         movw    $0, -98(%ebp)
1245
1246 This appears to be bad because the RA is not folding the store to the stack
1247 slot into the movl.  The above instructions could be:
1248         movl    $32, -80(%ebp)
1249 ...
1250         movl    $32, -84(%ebp)
1251 ...
1252 This seems like a cross between remat and spill folding.
1253
1254 This has redundant subtractions of %eax from a stack slot. However, %ecx doesn't
1255 change, so we could simply subtract %eax from %ecx first and then use %ecx (or
1256 vice-versa).
1257
1258 //===---------------------------------------------------------------------===//
1259
1260 For this code:
1261
1262 cond_next603:           ; preds = %bb493, %cond_true336, %cond_next599
1263         %v.21050.1 = phi i32 [ %v.21050.0, %cond_next599 ], [ %tmp344, %cond_true336 ], [ %v.2, %bb493 ]                ; <i32> [#uses=1]
1264         %maxz.21051.1 = phi i32 [ %maxz.21051.0, %cond_next599 ], [ 0, %cond_true336 ], [ %maxz.2, %bb493 ]             ; <i32> [#uses=2]
1265         %cnt.01055.1 = phi i32 [ %cnt.01055.0, %cond_next599 ], [ 0, %cond_true336 ], [ %cnt.0, %bb493 ]                ; <i32> [#uses=2]
1266         %byteptr.9 = phi i8* [ %byteptr.12, %cond_next599 ], [ %byteptr.0, %cond_true336 ], [ %byteptr.10, %bb493 ]             ; <i8*> [#uses=9]
1267         %bitptr.6 = phi i32 [ %tmp5571104.1, %cond_next599 ], [ %tmp4921049, %cond_true336 ], [ %bitptr.7, %bb493 ]             ; <i32> [#uses=4]
1268         %source.5 = phi i32 [ %tmp602, %cond_next599 ], [ %source.0, %cond_true336 ], [ %source.6, %bb493 ]             ; <i32> [#uses=7]
1269         %tmp606 = getelementptr %struct.const_tables* @tables, i32 0, i32 0, i32 %cnt.01055.1           ; <i8*> [#uses=1]
1270         %tmp607 = load i8* %tmp606, align 1             ; <i8> [#uses=1]
1271
1272 We produce this:
1273
1274 LBB4_70:        # cond_next603
1275         movl    -20(%ebp), %esi
1276         movl    L_tables$non_lazy_ptr-"L4$pb"(%esi), %esi
1277
1278 However, ICC caches this information before the loop and produces this:
1279
1280         movl      88(%esp), %eax                                #481.12
1281
1282 //===---------------------------------------------------------------------===//
1283
1284 This code:
1285
1286         %tmp659 = icmp slt i16 %tmp654, 0               ; <i1> [#uses=1]
1287         br i1 %tmp659, label %cond_true662, label %cond_next715
1288
1289 produces this:
1290
1291         testw   %cx, %cx
1292         movswl  %cx, %esi
1293         jns     LBB4_109        # cond_next715
1294
1295 Shark tells us that using %cx in the testw instruction is sub-optimal. It
1296 suggests using the 32-bit register (which is what ICC uses).
1297
1298 //===---------------------------------------------------------------------===//
1299
1300 rdar://5506677 - We compile this:
1301
1302 define i32 @foo(double %x) {
1303         %x14 = bitcast double %x to i64         ; <i64> [#uses=1]
1304         %tmp713 = trunc i64 %x14 to i32         ; <i32> [#uses=1]
1305         %tmp8 = and i32 %tmp713, 2147483647             ; <i32> [#uses=1]
1306         ret i32 %tmp8
1307 }
1308
1309 to:
1310
1311 _foo:
1312         subl    $12, %esp
1313         fldl    16(%esp)
1314         fstpl   (%esp)
1315         movl    $2147483647, %eax
1316         andl    (%esp), %eax
1317         addl    $12, %esp
1318         #FP_REG_KILL
1319         ret
1320
1321 It would be much better to eliminate the fldl/fstpl by folding the bitcast
1322 into the load SDNode.  That would give us:
1323
1324 _foo:
1325         movl    $2147483647, %eax
1326         andl    4(%esp), %eax
1327         ret
1328
1329 //===---------------------------------------------------------------------===//
1330
1331 We compile this:
1332
1333 void compare (long long foo) {
1334   if (foo < 4294967297LL)
1335     abort();
1336 }
1337
1338 to:
1339
1340 _compare:
1341         subl    $12, %esp
1342         cmpl    $0, 16(%esp)
1343         setne   %al
1344         movzbw  %al, %ax
1345         cmpl    $1, 20(%esp)
1346         setg    %cl
1347         movzbw  %cl, %cx
1348         cmove   %ax, %cx
1349         movw    %cx, %ax
1350         testb   $1, %al
1351         je      LBB1_2  # cond_true
1352
1353 (also really horrible code on ppc).  This is due to the expand code for 64-bit
1354 compares.  GCC produces multiple branches, which is much nicer:
1355
1356 _compare:
1357         pushl   %ebp
1358         movl    %esp, %ebp
1359         subl    $8, %esp
1360         movl    8(%ebp), %eax
1361         movl    12(%ebp), %edx
1362         subl    $1, %edx
1363         jg     L5
1364 L7:
1365         jl      L4
1366         cmpl    $0, %eax
1367         jbe      L4
1368 L5:
1369
1370 //===---------------------------------------------------------------------===//
1371 Tail call optimization improvements: Tail call optimization currently
1372 pushes all arguments on the top of the stack (their normal place if
1373 that was a not tail call optimized functiong call ) before moving them
1374 to actual stack slot. this is done to prevent overwriting of paramters
1375 (see example below) that might be used, since the arguments of the
1376 callee overwrites callers arguments.
1377
1378  example:
1379
1380 int callee(int32, int64);
1381 int caller(int32 arg1, int32 arg2) {
1382   int64 local = arg2 * 2;
1383   return callee(arg2, (int64)local);
1384 }
1385
1386 [arg1]          [!arg2 no longer valid since we moved local onto it]
1387 [arg2]      ->  [(int64)
1388 [RETADDR]        local  ]
1389
1390 moving arg1 onto the stack slot of callee function would overwrite
1391 arg2 of the caller.
1392
1393 Possible optimizations:
1394
1395  - only push those arguments to the top of the stack that are actual
1396    parameters of the caller function and have no local value in the
1397    caller
1398
1399    in above example local does not need to be pushed onto the top of
1400    the stack as it is definitetly not a caller's function parameter
1401
1402  - analyse the actual parameters of the callee to see which would
1403    overwrite a caller paramter which is used by the callee and only
1404    push them onto the top of the stack
1405
1406    int callee (int32 arg1, int32 arg2);
1407    int caller (int32 arg1, int32 arg2) {
1408        return callee(arg1,arg2);
1409    }
1410
1411    here we don't need to write any variables to the top of the stack
1412    since they don't overwrite each other
1413
1414    int callee (int32 arg1, int32 arg2);
1415    int caller (int32 arg1, int32 arg2) {
1416        return callee(arg2,arg1);
1417    }
1418
1419    here we need to push the arguments because they overwrite each other
1420
1421
1422    code for lowering directly onto callers arguments:
1423 +  SmallVector<std::pair<unsigned, SDOperand>, 8> RegsToPass;
1424 +  SmallVector<SDOperand, 8> MemOpChains;
1425 +
1426 +  SDOperand FramePtr;
1427 +  SDOperand PtrOff;
1428 +  SDOperand FIN;
1429 +  int FI = 0;
1430 +  // Walk the register/memloc assignments, inserting copies/loads.
1431 +  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
1432 +    CCValAssign &VA = ArgLocs[i];
1433 +    SDOperand Arg = Op.getOperand(5+2*VA.getValNo());
1434 +
1435 +    ....
1436 +
1437 +    if (VA.isRegLoc()) {
1438 +      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
1439 +    } else {
1440 +      assert(VA.isMemLoc());
1441 +      // create frame index
1442 +      int32_t Offset = VA.getLocMemOffset()+FPDiff;
1443 +      uint32_t OpSize = (MVT::getSizeInBits(VA.getLocVT())+7)/8;
1444 +      FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset);
1445 +      FIN = DAG.getFrameIndex(FI, MVT::i32);
1446 +      // store relative to framepointer
1447 +      MemOpChains.push_back(DAG.getStore(Chain, Arg, FIN, NULL, 0));
1448 +    }
1449 +  }
1450 //===---------------------------------------------------------------------===//