lib/Target/X86/README.txt

   1 //===---------------------------------------------------------------------===//
   2 // Random ideas for the X86 backend.
   3 //===---------------------------------------------------------------------===//
   4
   5 We should add support for the "movbe" instruction, which does a byte-swapping
   6 copy (3-addr bswap + memory support?)  This is available on Atom processors.
   7
   8 //===---------------------------------------------------------------------===//
   9
  10 CodeGen/X86/lea-3.ll:test3 should be a single LEA, not a shift/move.  The X86
  11 backend knows how to three-addressify this shift, but it appears the register
  12 allocator isn't even asking it to do so in this case.  We should investigate
  13 why this isn't happening, it could have significant impact on other important
  14 cases for X86 as well.
  15
  16 //===---------------------------------------------------------------------===//
  17
  18 This should be one DIV/IDIV instruction, not a libcall:
  19
  20 unsigned test(unsigned long long X, unsigned Y) {
  21         return X/Y;
  22 }
  23
  24 This can be done trivially with a custom legalizer.  What about overflow
  25 though?  http://gcc.gnu.org/bugzilla/show_bug.cgi?id=14224
  26
  27 //===---------------------------------------------------------------------===//
  28
  29 Improvements to the multiply -> shift/add algorithm:
  30 http://gcc.gnu.org/ml/gcc-patches/2004-08/msg01590.html
  31
  32 //===---------------------------------------------------------------------===//
  33
  34 Improve code like this (occurs fairly frequently, e.g. in LLVM):
  35 long long foo(int x) { return 1LL << x; }
  36
  37 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01109.html
  38 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01128.html
  39 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html
  40
  41 Another useful one would be  ~0ULL >> X and ~0ULL << X.
  42
  43 One better solution for 1LL << x is:
  44         xorl    %eax, %eax
  45         xorl    %edx, %edx
  46         testb   $32, %cl
  47         sete    %al
  48         setne   %dl
  49         sall    %cl, %eax
  50         sall    %cl, %edx
  51
  52 But that requires good 8-bit subreg support.
  53
  54 Also, this might be better.  It's an extra shift, but it's one instruction
  55 shorter, and doesn't stress 8-bit subreg support.
  56 (From http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01148.html,
  57 but without the unnecessary and.)
  58         movl %ecx, %eax
  59         shrl $5, %eax
  60         movl %eax, %edx
  61         xorl $1, %edx
  62         sall %cl, %eax
  63         sall %cl. %edx
  64
  65 64-bit shifts (in general) expand to really bad code.  Instead of using
  66 cmovs, we should expand to a conditional branch like GCC produces.
  67
  68 //===---------------------------------------------------------------------===//
  69
  70 Some isel ideas:
  71
  72 1. Dynamic programming based approach when compile time if not an
  73    issue.
  74 2. Code duplication (addressing mode) during isel.
  75 3. Other ideas from "Register-Sensitive Selection, Duplication, and
  76    Sequencing of Instructions".
  77 4. Scheduling for reduced register pressure.  E.g. "Minimum Register
  78    Instruction Sequence Problem: Revisiting Optimal Code Generation for DAGs"
  79    and other related papers.
  80    http://citeseer.ist.psu.edu/govindarajan01minimum.html
  81
  82 //===---------------------------------------------------------------------===//
  83
  84 Should we promote i16 to i32 to avoid partial register update stalls?
  85
  86 //===---------------------------------------------------------------------===//
  87
  88 Leave any_extend as pseudo instruction and hint to register
  89 allocator. Delay codegen until post register allocation.
  90 Note. any_extend is now turned into an INSERT_SUBREG. We still need to teach
  91 the coalescer how to deal with it though.
  92
  93 //===---------------------------------------------------------------------===//
  94
  95 It appears icc use push for parameter passing. Need to investigate.
  96
  97 //===---------------------------------------------------------------------===//
  98
  99 This:
 100
 101 void foo(void);
 102 void bar(int x, int *P) {
 103   x >>= 2;
 104   if (x)
 105     foo();
 106   *P = x;
 107 }
 108
 109 compiles into:
 110
 111         movq    %rsi, %rbx
 112         movl    %edi, %r14d
 113         sarl    $2, %r14d
 114         testl   %r14d, %r14d
 115         je      LBB0_2
 116
 117 Instead of doing an explicit test, we can use the flags off the sar.  This
 118 occurs in a bigger testcase like this, which is pretty common:
 119
 120 #include <vector>
 121 int test1(std::vector<int> &X) {
 122   int Sum = 0;
 123   for (long i = 0, e = X.size(); i != e; ++i)
 124     X[i] = 0;
 125   return Sum;
 126 }
 127
 128 //===---------------------------------------------------------------------===//
 129
 130 Only use inc/neg/not instructions on processors where they are faster than
 131 add/sub/xor.  They are slower on the P4 due to only updating some processor
 132 flags.
 133
 134 //===---------------------------------------------------------------------===//
 135
 136 The instruction selector sometimes misses folding a load into a compare.  The
 137 pattern is written as (cmp reg, (load p)).  Because the compare isn't
 138 commutative, it is not matched with the load on both sides.  The dag combiner
 139 should be made smart enough to cannonicalize the load into the RHS of a compare
 140 when it can invert the result of the compare for free.
 141
 142 //===---------------------------------------------------------------------===//
 143
 144 In many cases, LLVM generates code like this:
 145
 146 _test:
 147         movl 8(%esp), %eax
 148         cmpl %eax, 4(%esp)
 149         setl %al
 150         movzbl %al, %eax
 151         ret
 152
 153 on some processors (which ones?), it is more efficient to do this:
 154
 155 _test:
 156         movl 8(%esp), %ebx
 157         xor  %eax, %eax
 158         cmpl %ebx, 4(%esp)
 159         setl %al
 160         ret
 161
 162 Doing this correctly is tricky though, as the xor clobbers the flags.
 163
 164 //===---------------------------------------------------------------------===//
 165
 166 We should generate bts/btr/etc instructions on targets where they are cheap or
 167 when codesize is important.  e.g., for:
 168
 169 void setbit(int *target, int bit) {
 170     *target |= (1 << bit);
 171 }
 172 void clearbit(int *target, int bit) {
 173     *target &= ~(1 << bit);
 174 }
 175
 176 //===---------------------------------------------------------------------===//
 177
 178 Instead of the following for memset char*, 1, 10:
 179
 180         movl $16843009, 4(%edx)
 181         movl $16843009, (%edx)
 182         movw $257, 8(%edx)
 183
 184 It might be better to generate
 185
 186         movl $16843009, %eax
 187         movl %eax, 4(%edx)
 188         movl %eax, (%edx)
 189         movw al, 8(%edx)
 190
 191 when we can spare a register. It reduces code size.
 192
 193 //===---------------------------------------------------------------------===//
 194
 195 Evaluate what the best way to codegen sdiv X, (2^C) is.  For X/8, we currently
 196 get this:
 197
 198 define i32 @test1(i32 %X) {
 199     %Y = sdiv i32 %X, 8
 200     ret i32 %Y
 201 }
 202
 203 _test1:
 204         movl 4(%esp), %eax
 205         movl %eax, %ecx
 206         sarl $31, %ecx
 207         shrl $29, %ecx
 208         addl %ecx, %eax
 209         sarl $3, %eax
 210         ret
 211
 212 GCC knows several different ways to codegen it, one of which is this:
 213
 214 _test1:
 215         movl    4(%esp), %eax
 216         cmpl    $-1, %eax
 217         leal    7(%eax), %ecx
 218         cmovle  %ecx, %eax
 219         sarl    $3, %eax
 220         ret
 221
 222 which is probably slower, but it's interesting at least :)
 223
 224 //===---------------------------------------------------------------------===//
 225
 226 We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and rep/movsl
 227 We should leave these as libcalls for everything over a much lower threshold,
 228 since libc is hand tuned for medium and large mem ops (avoiding RFO for large
 229 stores, TLB preheating, etc)
 230
 231 //===---------------------------------------------------------------------===//
 232
 233 Optimize this into something reasonable:
 234  x * copysign(1.0, y) * copysign(1.0, z)
 235
 236 //===---------------------------------------------------------------------===//
 237
 238 Optimize copysign(x, *y) to use an integer load from y.
 239
 240 //===---------------------------------------------------------------------===//
 241
 242 The following tests perform worse with LSR:
 243
 244 lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesor.
 245
 246 //===---------------------------------------------------------------------===//
 247
 248 Adding to the list of cmp / test poor codegen issues:
 249
 250 int test(__m128 *A, __m128 *B) {
 251   if (_mm_comige_ss(*A, *B))
 252     return 3;
 253   else
 254     return 4;
 255 }
 256
 257 _test:
 258         movl 8(%esp), %eax
 259         movaps (%eax), %xmm0
 260         movl 4(%esp), %eax
 261         movaps (%eax), %xmm1
 262         comiss %xmm0, %xmm1
 263         setae %al
 264         movzbl %al, %ecx
 265         movl $3, %eax
 266         movl $4, %edx
 267         cmpl $0, %ecx
 268         cmove %edx, %eax
 269         ret
 270
 271 Note the setae, movzbl, cmpl, cmove can be replaced with a single cmovae. There
 272 are a number of issues. 1) We are introducing a setcc between the result of the
 273 intrisic call and select. 2) The intrinsic is expected to produce a i32 value
 274 so a any extend (which becomes a zero extend) is added.
 275
 276 We probably need some kind of target DAG combine hook to fix this.
 277
 278 //===---------------------------------------------------------------------===//
 279
 280 We generate significantly worse code for this than GCC:
 281 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150
 282 http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701
 283
 284 There is also one case we do worse on PPC.
 285
 286 //===---------------------------------------------------------------------===//
 287
 288 For this:
 289
 290 int test(int a)
 291 {
 292   return a * 3;
 293 }
 294
 295 We currently emits
 296         imull $3, 4(%esp), %eax
 297
 298 Perhaps this is what we really should generate is? Is imull three or four
 299 cycles? Note: ICC generates this:
 300         movl    4(%esp), %eax
 301         leal    (%eax,%eax,2), %eax
 302
 303 The current instruction priority is based on pattern complexity. The former is
 304 more "complex" because it folds a load so the latter will not be emitted.
 305
 306 Perhaps we should use AddedComplexity to give LEA32r a higher priority? We
 307 should always try to match LEA first since the LEA matching code does some
 308 estimate to determine whether the match is profitable.
 309
 310 However, if we care more about code size, then imull is better. It's two bytes
 311 shorter than movl + leal.
 312
 313 On a Pentium M, both variants have the same characteristics with regard
 314 to throughput; however, the multiplication has a latency of four cycles, as
 315 opposed to two cycles for the movl+lea variant.
 316
 317 //===---------------------------------------------------------------------===//
 318
 319 __builtin_ffs codegen is messy.
 320
 321 int ffs_(unsigned X) { return __builtin_ffs(X); }
 322
 323 llvm produces:
 324 ffs_:
 325         movl    4(%esp), %ecx
 326         bsfl    %ecx, %eax
 327         movl    $32, %edx
 328         cmove   %edx, %eax
 329         incl    %eax
 330         xorl    %edx, %edx
 331         testl   %ecx, %ecx
 332         cmove   %edx, %eax
 333         ret
 334
 335 vs gcc:
 336
 337 _ffs_:
 338         movl    $-1, %edx
 339         bsfl    4(%esp), %eax
 340         cmove   %edx, %eax
 341         addl    $1, %eax
 342         ret
 343
 344 Another example of __builtin_ffs (use predsimplify to eliminate a select):
 345
 346 int foo (unsigned long j) {
 347   if (j)
 348     return __builtin_ffs (j) - 1;
 349   else
 350     return 0;
 351 }
 352
 353 //===---------------------------------------------------------------------===//
 354
 355 It appears gcc place string data with linkonce linkage in
 356 .section __TEXT,__const_coal,coalesced instead of
 357 .section __DATA,__const_coal,coalesced.
 358 Take a look at darwin.h, there are other Darwin assembler directives that we
 359 do not make use of.
 360
 361 //===---------------------------------------------------------------------===//
 362
 363 define i32 @foo(i32* %a, i32 %t) {
 364 entry:
 365         br label %cond_true
 366
 367 cond_true:              ; preds = %cond_true, %entry
 368         %x.0.0 = phi i32 [ 0, %entry ], [ %tmp9, %cond_true ]           ; <i32> [#uses=3]
 369         %t_addr.0.0 = phi i32 [ %t, %entry ], [ %tmp7, %cond_true ]             ; <i32> [#uses=1]
 370         %tmp2 = getelementptr i32* %a, i32 %x.0.0               ; <i32*> [#uses=1]
 371         %tmp3 = load i32* %tmp2         ; <i32> [#uses=1]
 372         %tmp5 = add i32 %t_addr.0.0, %x.0.0             ; <i32> [#uses=1]
 373         %tmp7 = add i32 %tmp5, %tmp3            ; <i32> [#uses=2]
 374         %tmp9 = add i32 %x.0.0, 1               ; <i32> [#uses=2]
 375         %tmp = icmp sgt i32 %tmp9, 39           ; <i1> [#uses=1]
 376         br i1 %tmp, label %bb12, label %cond_true
 377
 378 bb12:           ; preds = %cond_true
 379         ret i32 %tmp7
 380 }
 381 is pessimized by -loop-reduce and -indvars
 382
 383 //===---------------------------------------------------------------------===//
 384
 385 u32 to float conversion improvement:
 386
 387 float uint32_2_float( unsigned u ) {
 388   float fl = (int) (u & 0xffff);
 389   float fh = (int) (u >> 16);
 390   fh *= 0x1.0p16f;
 391   return fh + fl;
 392 }
 393
 394 00000000        subl    $0x04,%esp
 395 00000003        movl    0x08(%esp,1),%eax
 396 00000007        movl    %eax,%ecx
 397 00000009        shrl    $0x10,%ecx
 398 0000000c        cvtsi2ss        %ecx,%xmm0
 399 00000010        andl    $0x0000ffff,%eax
 400 00000015        cvtsi2ss        %eax,%xmm1
 401 00000019        mulss   0x00000078,%xmm0
 402 00000021        addss   %xmm1,%xmm0
 403 00000025        movss   %xmm0,(%esp,1)
 404 0000002a        flds    (%esp,1)
 405 0000002d        addl    $0x04,%esp
 406 00000030        ret
 407
 408 //===---------------------------------------------------------------------===//
 409
 410 When using fastcc abi, align stack slot of argument of type double on 8 byte
 411 boundary to improve performance.
 412
 413 //===---------------------------------------------------------------------===//
 414
 415 GCC's ix86_expand_int_movcc function (in i386.c) has a ton of interesting
 416 simplifications for integer "x cmp y ? a : b".
 417
 418 //===---------------------------------------------------------------------===//
 419
 420 Consider the expansion of:
 421
 422 define i32 @test3(i32 %X) {
 423         %tmp1 = urem i32 %X, 255
 424         ret i32 %tmp1
 425 }
 426
 427 Currently it compiles to:
 428
 429 ...
 430         movl $2155905153, %ecx
 431         movl 8(%esp), %esi
 432         movl %esi, %eax
 433         mull %ecx
 434 ...
 435
 436 This could be "reassociated" into:
 437
 438         movl $2155905153, %eax
 439         movl 8(%esp), %ecx
 440         mull %ecx
 441
 442 to avoid the copy.  In fact, the existing two-address stuff would do this
 443 except that mul isn't a commutative 2-addr instruction.  I guess this has
 444 to be done at isel time based on the #uses to mul?
 445
 446 //===---------------------------------------------------------------------===//
 447
 448 Make sure the instruction which starts a loop does not cross a cacheline
 449 boundary. This requires knowning the exact length of each machine instruction.
 450 That is somewhat complicated, but doable. Example 256.bzip2:
 451
 452 In the new trace, the hot loop has an instruction which crosses a cacheline
 453 boundary.  In addition to potential cache misses, this can't help decoding as I
 454 imagine there has to be some kind of complicated decoder reset and realignment
 455 to grab the bytes from the next cacheline.
 456
 457 532  532 0x3cfc movb     (1809(%esp, %esi), %bl   <<<--- spans 2 64 byte lines
 458 942  942 0x3d03 movl     %dh, (1809(%esp, %esi)
 459 937  937 0x3d0a incl     %esi
 460 3    3   0x3d0b cmpb     %bl, %dl
 461 27   27  0x3d0d jnz      0x000062db <main+11707>
 462
 463 //===---------------------------------------------------------------------===//
 464
 465 In c99 mode, the preprocessor doesn't like assembly comments like #TRUNCATE.
 466
 467 //===---------------------------------------------------------------------===//
 468
 469 This could be a single 16-bit load.
 470
 471 int f(char *p) {
 472     if ((p[0] == 1) & (p[1] == 2)) return 1;
 473     return 0;
 474 }
 475
 476 //===---------------------------------------------------------------------===//
 477
 478 We should inline lrintf and probably other libc functions.
 479
 480 //===---------------------------------------------------------------------===//
 481
 482 Use the FLAGS values from arithmetic instructions more.  For example, compile:
 483
 484 int add_zf(int *x, int y, int a, int b) {
 485      if ((*x += y) == 0)
 486           return a;
 487      else
 488           return b;
 489 }
 490
 491 to:
 492        addl    %esi, (%rdi)
 493        movl    %edx, %eax
 494        cmovne  %ecx, %eax
 495        ret
 496 instead of:
 497
 498 _add_zf:
 499         addl (%rdi), %esi
 500         movl %esi, (%rdi)
 501         testl %esi, %esi
 502         cmove %edx, %ecx
 503         movl %ecx, %eax
 504         ret
 505
 506 As another example, compile function f2 in test/CodeGen/X86/cmp-test.ll
 507 without a test instruction.
 508
 509 //===---------------------------------------------------------------------===//
 510
 511 These two functions have identical effects:
 512
 513 unsigned int f(unsigned int i, unsigned int n) {++i; if (i == n) ++i; return i;}
 514 unsigned int f2(unsigned int i, unsigned int n) {++i; i += i == n; return i;}
 515
 516 We currently compile them to:
 517
 518 _f:
 519         movl 4(%esp), %eax
 520         movl %eax, %ecx
 521         incl %ecx
 522         movl 8(%esp), %edx
 523         cmpl %edx, %ecx
 524         jne LBB1_2      #UnifiedReturnBlock
 525 LBB1_1: #cond_true
 526         addl $2, %eax
 527         ret
 528 LBB1_2: #UnifiedReturnBlock
 529         movl %ecx, %eax
 530         ret
 531 _f2:
 532         movl 4(%esp), %eax
 533         movl %eax, %ecx
 534         incl %ecx
 535         cmpl 8(%esp), %ecx
 536         sete %cl
 537         movzbl %cl, %ecx
 538         leal 1(%ecx,%eax), %eax
 539         ret
 540
 541 both of which are inferior to GCC's:
 542
 543 _f:
 544         movl    4(%esp), %edx
 545         leal    1(%edx), %eax
 546         addl    $2, %edx
 547         cmpl    8(%esp), %eax
 548         cmove   %edx, %eax
 549         ret
 550 _f2:
 551         movl    4(%esp), %eax
 552         addl    $1, %eax
 553         xorl    %edx, %edx
 554         cmpl    8(%esp), %eax
 555         sete    %dl
 556         addl    %edx, %eax
 557         ret
 558
 559 //===---------------------------------------------------------------------===//
 560
 561 This code:
 562
 563 void test(int X) {
 564   if (X) abort();
 565 }
 566
 567 is currently compiled to:
 568
 569 _test:
 570         subl $12, %esp
 571         cmpl $0, 16(%esp)
 572         jne LBB1_1
 573         addl $12, %esp
 574         ret
 575 LBB1_1:
 576         call L_abort$stub
 577
 578 It would be better to produce:
 579
 580 _test:
 581         subl $12, %esp
 582         cmpl $0, 16(%esp)
 583         jne L_abort$stub
 584         addl $12, %esp
 585         ret
 586
 587 This can be applied to any no-return function call that takes no arguments etc.
 588 Alternatively, the stack save/restore logic could be shrink-wrapped, producing
 589 something like this:
 590
 591 _test:
 592         cmpl $0, 4(%esp)
 593         jne LBB1_1
 594         ret
 595 LBB1_1:
 596         subl $12, %esp
 597         call L_abort$stub
 598
 599 Both are useful in different situations.  Finally, it could be shrink-wrapped
 600 and tail called, like this:
 601
 602 _test:
 603         cmpl $0, 4(%esp)
 604         jne LBB1_1
 605         ret
 606 LBB1_1:
 607         pop %eax   # realign stack.
 608         call L_abort$stub
 609
 610 Though this probably isn't worth it.
 611
 612 //===---------------------------------------------------------------------===//
 613
 614 Sometimes it is better to codegen subtractions from a constant (e.g. 7-x) with
 615 a neg instead of a sub instruction.  Consider:
 616
 617 int test(char X) { return 7-X; }
 618
 619 we currently produce:
 620 _test:
 621         movl $7, %eax
 622         movsbl 4(%esp), %ecx
 623         subl %ecx, %eax
 624         ret
 625
 626 We would use one fewer register if codegen'd as:
 627
 628         movsbl 4(%esp), %eax
 629         neg %eax
 630         add $7, %eax
 631         ret
 632
 633 Note that this isn't beneficial if the load can be folded into the sub.  In
 634 this case, we want a sub:
 635
 636 int test(int X) { return 7-X; }
 637 _test:
 638         movl $7, %eax
 639         subl 4(%esp), %eax
 640         ret
 641
 642 //===---------------------------------------------------------------------===//
 643
 644 Leaf functions that require one 4-byte spill slot have a prolog like this:
 645
 646 _foo:
 647         pushl   %esi
 648         subl    $4, %esp
 649 ...
 650 and an epilog like this:
 651         addl    $4, %esp
 652         popl    %esi
 653         ret
 654
 655 It would be smaller, and potentially faster, to push eax on entry and to
 656 pop into a dummy register instead of using addl/subl of esp.  Just don't pop
 657 into any return registers :)
 658
 659 //===---------------------------------------------------------------------===//
 660
 661 The X86 backend should fold (branch (or (setcc, setcc))) into multiple
 662 branches.  We generate really poor code for:
 663
 664 double testf(double a) {
 665        return a == 0.0 ? 0.0 : (a > 0.0 ? 1.0 : -1.0);
 666 }
 667
 668 For example, the entry BB is:
 669
 670 _testf:
 671         subl    $20, %esp
 672         pxor    %xmm0, %xmm0
 673         movsd   24(%esp), %xmm1
 674         ucomisd %xmm0, %xmm1
 675         setnp   %al
 676         sete    %cl
 677         testb   %cl, %al
 678         jne     LBB1_5  # UnifiedReturnBlock
 679 LBB1_1: # cond_true
 680
 681
 682 it would be better to replace the last four instructions with:
 683
 684         jp LBB1_1
 685         je LBB1_5
 686 LBB1_1:
 687
 688 We also codegen the inner ?: into a diamond:
 689
 690        cvtss2sd        LCPI1_0(%rip), %xmm2
 691         cvtss2sd        LCPI1_1(%rip), %xmm3
 692         ucomisd %xmm1, %xmm0
 693         ja      LBB1_3  # cond_true
 694 LBB1_2: # cond_true
 695         movapd  %xmm3, %xmm2
 696 LBB1_3: # cond_true
 697         movapd  %xmm2, %xmm0
 698         ret
 699
 700 We should sink the load into xmm3 into the LBB1_2 block.  This should
 701 be pretty easy, and will nuke all the copies.
 702
 703 //===---------------------------------------------------------------------===//
 704
 705 This:
 706         #include <algorithm>
 707         inline std::pair<unsigned, bool> full_add(unsigned a, unsigned b)
 708         { return std::make_pair(a + b, a + b < a); }
 709         bool no_overflow(unsigned a, unsigned b)
 710         { return !full_add(a, b).second; }
 711
 712 Should compile to:
 713
 714
 715         _Z11no_overflowjj:
 716                 addl    %edi, %esi
 717                 setae   %al
 718                 ret
 719
 720 FIXME: That code looks wrong; bool return is normally defined as zext.
 721
 722 on x86-64, not:
 723
 724 __Z11no_overflowjj:
 725         addl    %edi, %esi
 726         cmpl    %edi, %esi
 727         setae   %al
 728         movzbl  %al, %eax
 729         ret
 730
 731
 732 //===---------------------------------------------------------------------===//
 733
 734 The following code:
 735
 736 bb114.preheader:                ; preds = %cond_next94
 737         %tmp231232 = sext i16 %tmp62 to i32             ; <i32> [#uses=1]
 738         %tmp233 = sub i32 32, %tmp231232                ; <i32> [#uses=1]
 739         %tmp245246 = sext i16 %tmp65 to i32             ; <i32> [#uses=1]
 740         %tmp252253 = sext i16 %tmp68 to i32             ; <i32> [#uses=1]
 741         %tmp254 = sub i32 32, %tmp252253                ; <i32> [#uses=1]
 742         %tmp553554 = bitcast i16* %tmp37 to i8*         ; <i8*> [#uses=2]
 743         %tmp583584 = sext i16 %tmp98 to i32             ; <i32> [#uses=1]
 744         %tmp585 = sub i32 32, %tmp583584                ; <i32> [#uses=1]
 745         %tmp614615 = sext i16 %tmp101 to i32            ; <i32> [#uses=1]
 746         %tmp621622 = sext i16 %tmp104 to i32            ; <i32> [#uses=1]
 747         %tmp623 = sub i32 32, %tmp621622                ; <i32> [#uses=1]
 748         br label %bb114
 749
 750 produces:
 751
 752 LBB3_5: # bb114.preheader
 753         movswl  -68(%ebp), %eax
 754         movl    $32, %ecx
 755         movl    %ecx, -80(%ebp)
 756         subl    %eax, -80(%ebp)
 757         movswl  -52(%ebp), %eax
 758         movl    %ecx, -84(%ebp)
 759         subl    %eax, -84(%ebp)
 760         movswl  -70(%ebp), %eax
 761         movl    %ecx, -88(%ebp)
 762         subl    %eax, -88(%ebp)
 763         movswl  -50(%ebp), %eax
 764         subl    %eax, %ecx
 765         movl    %ecx, -76(%ebp)
 766         movswl  -42(%ebp), %eax
 767         movl    %eax, -92(%ebp)
 768         movswl  -66(%ebp), %eax
 769         movl    %eax, -96(%ebp)
 770         movw    $0, -98(%ebp)
 771
 772 This appears to be bad because the RA is not folding the store to the stack
 773 slot into the movl.  The above instructions could be:
 774         movl    $32, -80(%ebp)
 775 ...
 776         movl    $32, -84(%ebp)
 777 ...
 778 This seems like a cross between remat and spill folding.
 779
 780 This has redundant subtractions of %eax from a stack slot. However, %ecx doesn't
 781 change, so we could simply subtract %eax from %ecx first and then use %ecx (or
 782 vice-versa).
 783
 784 //===---------------------------------------------------------------------===//
 785
 786 This code:
 787
 788         %tmp659 = icmp slt i16 %tmp654, 0               ; <i1> [#uses=1]
 789         br i1 %tmp659, label %cond_true662, label %cond_next715
 790
 791 produces this:
 792
 793         testw   %cx, %cx
 794         movswl  %cx, %esi
 795         jns     LBB4_109        # cond_next715
 796
 797 Shark tells us that using %cx in the testw instruction is sub-optimal. It
 798 suggests using the 32-bit register (which is what ICC uses).
 799
 800 //===---------------------------------------------------------------------===//
 801
 802 We compile this:
 803
 804 void compare (long long foo) {
 805   if (foo < 4294967297LL)
 806     abort();
 807 }
 808
 809 to:
 810
 811 compare:
 812         subl    $4, %esp
 813         cmpl    $0, 8(%esp)
 814         setne   %al
 815         movzbw  %al, %ax
 816         cmpl    $1, 12(%esp)
 817         setg    %cl
 818         movzbw  %cl, %cx
 819         cmove   %ax, %cx
 820         testb   $1, %cl
 821         jne     .LBB1_2 # UnifiedReturnBlock
 822 .LBB1_1:        # ifthen
 823         call    abort
 824 .LBB1_2:        # UnifiedReturnBlock
 825         addl    $4, %esp
 826         ret
 827
 828 (also really horrible code on ppc).  This is due to the expand code for 64-bit
 829 compares.  GCC produces multiple branches, which is much nicer:
 830
 831 compare:
 832         subl    $12, %esp
 833         movl    20(%esp), %edx
 834         movl    16(%esp), %eax
 835         decl    %edx
 836         jle     .L7
 837 .L5:
 838         addl    $12, %esp
 839         ret
 840         .p2align 4,,7
 841 .L7:
 842         jl      .L4
 843         cmpl    $0, %eax
 844         .p2align 4,,8
 845         ja      .L5
 846 .L4:
 847         .p2align 4,,9
 848         call    abort
 849
 850 //===---------------------------------------------------------------------===//
 851
 852 Tail call optimization improvements: Tail call optimization currently
 853 pushes all arguments on the top of the stack (their normal place for
 854 non-tail call optimized calls) that source from the callers arguments
 855 or  that source from a virtual register (also possibly sourcing from
 856 callers arguments).
 857 This is done to prevent overwriting of parameters (see example
 858 below) that might be used later.
 859
 860 example:
 861
 862 int callee(int32, int64);
 863 int caller(int32 arg1, int32 arg2) {
 864   int64 local = arg2 * 2;
 865   return callee(arg2, (int64)local);
 866 }
 867
 868 [arg1]          [!arg2 no longer valid since we moved local onto it]
 869 [arg2]      ->  [(int64)
 870 [RETADDR]        local  ]
 871
 872 Moving arg1 onto the stack slot of callee function would overwrite
 873 arg2 of the caller.
 874
 875 Possible optimizations:
 876
 877
 878  - Analyse the actual parameters of the callee to see which would
 879    overwrite a caller parameter which is used by the callee and only
 880    push them onto the top of the stack.
 881
 882    int callee (int32 arg1, int32 arg2);
 883    int caller (int32 arg1, int32 arg2) {
 884        return callee(arg1,arg2);
 885    }
 886
 887    Here we don't need to write any variables to the top of the stack
 888    since they don't overwrite each other.
 889
 890    int callee (int32 arg1, int32 arg2);
 891    int caller (int32 arg1, int32 arg2) {
 892        return callee(arg2,arg1);
 893    }
 894
 895    Here we need to push the arguments because they overwrite each
 896    other.
 897
 898 //===---------------------------------------------------------------------===//
 899
 900 main ()
 901 {
 902   int i = 0;
 903   unsigned long int z = 0;
 904
 905   do {
 906     z -= 0x00004000;
 907     i++;
 908     if (i > 0x00040000)
 909       abort ();
 910   } while (z > 0);
 911   exit (0);
 912 }
 913
 914 gcc compiles this to:
 915
 916 _main:
 917         subl    $28, %esp
 918         xorl    %eax, %eax
 919         jmp     L2
 920 L3:
 921         cmpl    $262144, %eax
 922         je      L10
 923 L2:
 924         addl    $1, %eax
 925         cmpl    $262145, %eax
 926         jne     L3
 927         call    L_abort$stub
 928 L10:
 929         movl    $0, (%esp)
 930         call    L_exit$stub
 931
 932 llvm:
 933
 934 _main:
 935         subl    $12, %esp
 936         movl    $1, %eax
 937         movl    $16384, %ecx
 938 LBB1_1: # bb
 939         cmpl    $262145, %eax
 940         jge     LBB1_4  # cond_true
 941 LBB1_2: # cond_next
 942         incl    %eax
 943         addl    $4294950912, %ecx
 944         cmpl    $16384, %ecx
 945         jne     LBB1_1  # bb
 946 LBB1_3: # bb11
 947         xorl    %eax, %eax
 948         addl    $12, %esp
 949         ret
 950 LBB1_4: # cond_true
 951         call    L_abort$stub
 952
 953 1. LSR should rewrite the first cmp with induction variable %ecx.
 954 2. DAG combiner should fold
 955         leal    1(%eax), %edx
 956         cmpl    $262145, %edx
 957    =>
 958         cmpl    $262144, %eax
 959
 960 //===---------------------------------------------------------------------===//
 961
 962 define i64 @test(double %X) {
 963         %Y = fptosi double %X to i64
 964         ret i64 %Y
 965 }
 966
 967 compiles to:
 968
 969 _test:
 970         subl    $20, %esp
 971         movsd   24(%esp), %xmm0
 972         movsd   %xmm0, 8(%esp)
 973         fldl    8(%esp)
 974         fisttpll        (%esp)
 975         movl    4(%esp), %edx
 976         movl    (%esp), %eax
 977         addl    $20, %esp
 978         #FP_REG_KILL
 979         ret
 980
 981 This should just fldl directly from the input stack slot.
 982
 983 //===---------------------------------------------------------------------===//
 984
 985 This code:
 986 int foo (int x) { return (x & 65535) | 255; }
 987
 988 Should compile into:
 989
 990 _foo:
 991         movzwl  4(%esp), %eax
 992         orl     $255, %eax
 993         ret
 994
 995 instead of:
 996 _foo:
 997         movl    $255, %eax
 998         orl     4(%esp), %eax
 999         andl    $65535, %eax
1000         ret
1001
1002 //===---------------------------------------------------------------------===//
1003
1004 We're codegen'ing multiply of long longs inefficiently:
1005
1006 unsigned long long LLM(unsigned long long arg1, unsigned long long arg2) {
1007   return arg1 *  arg2;
1008 }
1009
1010 We compile to (fomit-frame-pointer):
1011
1012 _LLM:
1013         pushl   %esi
1014         movl    8(%esp), %ecx
1015         movl    16(%esp), %esi
1016         movl    %esi, %eax
1017         mull    %ecx
1018         imull   12(%esp), %esi
1019         addl    %edx, %esi
1020         imull   20(%esp), %ecx
1021         movl    %esi, %edx
1022         addl    %ecx, %edx
1023         popl    %esi
1024         ret
1025
1026 This looks like a scheduling deficiency and lack of remat of the load from
1027 the argument area.  ICC apparently produces:
1028
1029         movl      8(%esp), %ecx
1030         imull     12(%esp), %ecx
1031         movl      16(%esp), %eax
1032         imull     4(%esp), %eax
1033         addl      %eax, %ecx
1034         movl      4(%esp), %eax
1035         mull      12(%esp)
1036         addl      %ecx, %edx
1037         ret
1038
1039 Note that it remat'd loads from 4(esp) and 12(esp).  See this GCC PR:
1040 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=17236
1041
1042 //===---------------------------------------------------------------------===//
1043
1044 We can fold a store into "zeroing a reg".  Instead of:
1045
1046 xorl    %eax, %eax
1047 movl    %eax, 124(%esp)
1048
1049 we should get:
1050
1051 movl    $0, 124(%esp)
1052
1053 if the flags of the xor are dead.
1054
1055 Likewise, we isel "x<<1" into "add reg,reg".  If reg is spilled, this should
1056 be folded into: shl [mem], 1
1057
1058 //===---------------------------------------------------------------------===//
1059
1060 In SSE mode, we turn abs and neg into a load from the constant pool plus a xor
1061 or and instruction, for example:
1062
1063         xorpd   LCPI1_0, %xmm2
1064
1065 However, if xmm2 gets spilled, we end up with really ugly code like this:
1066
1067         movsd   (%esp), %xmm0
1068         xorpd   LCPI1_0, %xmm0
1069         movsd   %xmm0, (%esp)
1070
1071 Since we 'know' that this is a 'neg', we can actually "fold" the spill into
1072 the neg/abs instruction, turning it into an *integer* operation, like this:
1073
1074         xorl 2147483648, [mem+4]     ## 2147483648 = (1 << 31)
1075
1076 you could also use xorb, but xorl is less likely to lead to a partial register
1077 stall.  Here is a contrived testcase:
1078
1079 double a, b, c;
1080 void test(double *P) {
1081   double X = *P;
1082   a = X;
1083   bar();
1084   X = -X;
1085   b = X;
1086   bar();
1087   c = X;
1088 }
1089
1090 //===---------------------------------------------------------------------===//
1091
1092 The generated code on x86 for checking for signed overflow on a multiply the
1093 obvious way is much longer than it needs to be.
1094
1095 int x(int a, int b) {
1096   long long prod = (long long)a*b;
1097   return  prod > 0x7FFFFFFF || prod < (-0x7FFFFFFF-1);
1098 }
1099
1100 See PR2053 for more details.
1101
1102 //===---------------------------------------------------------------------===//
1103
1104 We should investigate using cdq/ctld (effect: edx = sar eax, 31)
1105 more aggressively; it should cost the same as a move+shift on any modern
1106 processor, but it's a lot shorter. Downside is that it puts more
1107 pressure on register allocation because it has fixed operands.
1108
1109 Example:
1110 int abs(int x) {return x < 0 ? -x : x;}
1111
1112 gcc compiles this to the following when using march/mtune=pentium2/3/4/m/etc.:
1113 abs:
1114         movl    4(%esp), %eax
1115         cltd
1116         xorl    %edx, %eax
1117         subl    %edx, %eax
1118         ret
1119
1120 //===---------------------------------------------------------------------===//
1121
1122 Take the following code (from
1123 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=16541):
1124
1125 extern unsigned char first_one[65536];
1126 int FirstOnet(unsigned long long arg1)
1127 {
1128   if (arg1 >> 48)
1129     return (first_one[arg1 >> 48]);
1130   return 0;
1131 }
1132
1133
1134 The following code is currently generated:
1135 FirstOnet:
1136         movl    8(%esp), %eax
1137         cmpl    $65536, %eax
1138         movl    4(%esp), %ecx
1139         jb      .LBB1_2 # UnifiedReturnBlock
1140 .LBB1_1:        # ifthen
1141         shrl    $16, %eax
1142         movzbl  first_one(%eax), %eax
1143         ret
1144 .LBB1_2:        # UnifiedReturnBlock
1145         xorl    %eax, %eax
1146         ret
1147
1148 We could change the "movl 8(%esp), %eax" into "movzwl 10(%esp), %eax"; this
1149 lets us change the cmpl into a testl, which is shorter, and eliminate the shift.
1150
1151 //===---------------------------------------------------------------------===//
1152
1153 We compile this function:
1154
1155 define i32 @foo(i32 %a, i32 %b, i32 %c, i8 zeroext  %d) nounwind  {
1156 entry:
1157         %tmp2 = icmp eq i8 %d, 0                ; <i1> [#uses=1]
1158         br i1 %tmp2, label %bb7, label %bb
1159
1160 bb:             ; preds = %entry
1161         %tmp6 = add i32 %b, %a          ; <i32> [#uses=1]
1162         ret i32 %tmp6
1163
1164 bb7:            ; preds = %entry
1165         %tmp10 = sub i32 %a, %c         ; <i32> [#uses=1]
1166         ret i32 %tmp10
1167 }
1168
1169 to:
1170
1171 foo:                                    # @foo
1172 # BB#0:                                 # %entry
1173         movl    4(%esp), %ecx
1174         cmpb    $0, 16(%esp)
1175         je      .LBB0_2
1176 # BB#1:                                 # %bb
1177         movl    8(%esp), %eax
1178         addl    %ecx, %eax
1179         ret
1180 .LBB0_2:                                # %bb7
1181         movl    12(%esp), %edx
1182         movl    %ecx, %eax
1183         subl    %edx, %eax
1184         ret
1185
1186 There's an obviously unnecessary movl in .LBB0_2, and we could eliminate a
1187 couple more movls by putting 4(%esp) into %eax instead of %ecx.
1188
1189 //===---------------------------------------------------------------------===//
1190
1191 See rdar://4653682.
1192
1193 From flops:
1194
1195 LBB1_15:        # bb310
1196         cvtss2sd        LCPI1_0, %xmm1
1197         addsd   %xmm1, %xmm0
1198         movsd   176(%esp), %xmm2
1199         mulsd   %xmm0, %xmm2
1200         movapd  %xmm2, %xmm3
1201         mulsd   %xmm3, %xmm3
1202         movapd  %xmm3, %xmm4
1203         mulsd   LCPI1_23, %xmm4
1204         addsd   LCPI1_24, %xmm4
1205         mulsd   %xmm3, %xmm4
1206         addsd   LCPI1_25, %xmm4
1207         mulsd   %xmm3, %xmm4
1208         addsd   LCPI1_26, %xmm4
1209         mulsd   %xmm3, %xmm4
1210         addsd   LCPI1_27, %xmm4
1211         mulsd   %xmm3, %xmm4
1212         addsd   LCPI1_28, %xmm4
1213         mulsd   %xmm3, %xmm4
1214         addsd   %xmm1, %xmm4
1215         mulsd   %xmm2, %xmm4
1216         movsd   152(%esp), %xmm1
1217         addsd   %xmm4, %xmm1
1218         movsd   %xmm1, 152(%esp)
1219         incl    %eax
1220         cmpl    %eax, %esi
1221         jge     LBB1_15 # bb310
1222 LBB1_16:        # bb358.loopexit
1223         movsd   152(%esp), %xmm0
1224         addsd   %xmm0, %xmm0
1225         addsd   LCPI1_22, %xmm0
1226         movsd   %xmm0, 152(%esp)
1227
1228 Rather than spilling the result of the last addsd in the loop, we should have
1229 insert a copy to split the interval (one for the duration of the loop, one
1230 extending to the fall through). The register pressure in the loop isn't high
1231 enough to warrant the spill.
1232
1233 Also check why xmm7 is not used at all in the function.
1234
1235 //===---------------------------------------------------------------------===//
1236
1237 Take the following:
1238
1239 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
1240 target triple = "i386-apple-darwin8"
1241 @in_exit.4870.b = internal global i1 false              ; <i1*> [#uses=2]
1242 define fastcc void @abort_gzip() noreturn nounwind  {
1243 entry:
1244         %tmp.b.i = load i1* @in_exit.4870.b             ; <i1> [#uses=1]
1245         br i1 %tmp.b.i, label %bb.i, label %bb4.i
1246 bb.i:           ; preds = %entry
1247         tail call void @exit( i32 1 ) noreturn nounwind
1248         unreachable
1249 bb4.i:          ; preds = %entry
1250         store i1 true, i1* @in_exit.4870.b
1251         tail call void @exit( i32 1 ) noreturn nounwind
1252         unreachable
1253 }
1254 declare void @exit(i32) noreturn nounwind
1255
1256 This compiles into:
1257 _abort_gzip:                            ## @abort_gzip
1258 ## BB#0:                                ## %entry
1259         subl    $12, %esp
1260         movb    _in_exit.4870.b, %al
1261         cmpb    $1, %al
1262         jne     LBB0_2
1263
1264 We somehow miss folding the movb into the cmpb.
1265
1266 //===---------------------------------------------------------------------===//
1267
1268 We compile:
1269
1270 int test(int x, int y) {
1271   return x-y-1;
1272 }
1273
1274 into (-m64):
1275
1276 _test:
1277         decl    %edi
1278         movl    %edi, %eax
1279         subl    %esi, %eax
1280         ret
1281
1282 it would be better to codegen as: x+~y  (notl+addl)
1283
1284 //===---------------------------------------------------------------------===//
1285
1286 This code:
1287
1288 int foo(const char *str,...)
1289 {
1290  __builtin_va_list a; int x;
1291  __builtin_va_start(a,str); x = __builtin_va_arg(a,int); __builtin_va_end(a);
1292  return x;
1293 }
1294
1295 gets compiled into this on x86-64:
1296         subq    $200, %rsp
1297         movaps  %xmm7, 160(%rsp)
1298         movaps  %xmm6, 144(%rsp)
1299         movaps  %xmm5, 128(%rsp)
1300         movaps  %xmm4, 112(%rsp)
1301         movaps  %xmm3, 96(%rsp)
1302         movaps  %xmm2, 80(%rsp)
1303         movaps  %xmm1, 64(%rsp)
1304         movaps  %xmm0, 48(%rsp)
1305         movq    %r9, 40(%rsp)
1306         movq    %r8, 32(%rsp)
1307         movq    %rcx, 24(%rsp)
1308         movq    %rdx, 16(%rsp)
1309         movq    %rsi, 8(%rsp)
1310         leaq    (%rsp), %rax
1311         movq    %rax, 192(%rsp)
1312         leaq    208(%rsp), %rax
1313         movq    %rax, 184(%rsp)
1314         movl    $48, 180(%rsp)
1315         movl    $8, 176(%rsp)
1316         movl    176(%rsp), %eax
1317         cmpl    $47, %eax
1318         jbe     .LBB1_3 # bb
1319 .LBB1_1:        # bb3
1320         movq    184(%rsp), %rcx
1321         leaq    8(%rcx), %rax
1322         movq    %rax, 184(%rsp)
1323 .LBB1_2:        # bb4
1324         movl    (%rcx), %eax
1325         addq    $200, %rsp
1326         ret
1327 .LBB1_3:        # bb
1328         movl    %eax, %ecx
1329         addl    $8, %eax
1330         addq    192(%rsp), %rcx
1331         movl    %eax, 176(%rsp)
1332         jmp     .LBB1_2 # bb4
1333
1334 gcc 4.3 generates:
1335         subq    $96, %rsp
1336 .LCFI0:
1337         leaq    104(%rsp), %rax
1338         movq    %rsi, -80(%rsp)
1339         movl    $8, -120(%rsp)
1340         movq    %rax, -112(%rsp)
1341         leaq    -88(%rsp), %rax
1342         movq    %rax, -104(%rsp)
1343         movl    $8, %eax
1344         cmpl    $48, %eax
1345         jb      .L6
1346         movq    -112(%rsp), %rdx
1347         movl    (%rdx), %eax
1348         addq    $96, %rsp
1349         ret
1350         .p2align 4,,10
1351         .p2align 3
1352 .L6:
1353         mov     %eax, %edx
1354         addq    -104(%rsp), %rdx
1355         addl    $8, %eax
1356         movl    %eax, -120(%rsp)
1357         movl    (%rdx), %eax
1358         addq    $96, %rsp
1359         ret
1360
1361 and it gets compiled into this on x86:
1362         pushl   %ebp
1363         movl    %esp, %ebp
1364         subl    $4, %esp
1365         leal    12(%ebp), %eax
1366         movl    %eax, -4(%ebp)
1367         leal    16(%ebp), %eax
1368         movl    %eax, -4(%ebp)
1369         movl    12(%ebp), %eax
1370         addl    $4, %esp
1371         popl    %ebp
1372         ret
1373
1374 gcc 4.3 generates:
1375         pushl   %ebp
1376         movl    %esp, %ebp
1377         movl    12(%ebp), %eax
1378         popl    %ebp
1379         ret
1380
1381 //===---------------------------------------------------------------------===//
1382
1383 Teach tblgen not to check bitconvert source type in some cases. This allows us
1384 to consolidate the following patterns in X86InstrMMX.td:
1385
1386 def : Pat<(v2i32 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
1387                                                   (iPTR 0))))),
1388           (v2i32 (MMX_MOVDQ2Qrr VR128:$src))>;
1389 def : Pat<(v4i16 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
1390                                                   (iPTR 0))))),
1391           (v4i16 (MMX_MOVDQ2Qrr VR128:$src))>;
1392 def : Pat<(v8i8 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
1393                                                   (iPTR 0))))),
1394           (v8i8 (MMX_MOVDQ2Qrr VR128:$src))>;
1395
1396 There are other cases in various td files.
1397
1398 //===---------------------------------------------------------------------===//
1399
1400 Take something like the following on x86-32:
1401 unsigned a(unsigned long long x, unsigned y) {return x % y;}
1402
1403 We currently generate a libcall, but we really shouldn't: the expansion is
1404 shorter and likely faster than the libcall.  The expected code is something
1405 like the following:
1406
1407         movl    12(%ebp), %eax
1408         movl    16(%ebp), %ecx
1409         xorl    %edx, %edx
1410         divl    %ecx
1411         movl    8(%ebp), %eax
1412         divl    %ecx
1413         movl    %edx, %eax
1414         ret
1415
1416 A similar code sequence works for division.
1417
1418 //===---------------------------------------------------------------------===//
1419
1420 These should compile to the same code, but the later codegen's to useless
1421 instructions on X86. This may be a trivial dag combine (GCC PR7061):
1422
1423 struct s1 { unsigned char a, b; };
1424 unsigned long f1(struct s1 x) {
1425     return x.a + x.b;
1426 }
1427 struct s2 { unsigned a: 8, b: 8; };
1428 unsigned long f2(struct s2 x) {
1429     return x.a + x.b;
1430 }
1431
1432 //===---------------------------------------------------------------------===//
1433
1434 We currently compile this:
1435
1436 define i32 @func1(i32 %v1, i32 %v2) nounwind {
1437 entry:
1438   %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
1439   %sum = extractvalue {i32, i1} %t, 0
1440   %obit = extractvalue {i32, i1} %t, 1
1441   br i1 %obit, label %overflow, label %normal
1442 normal:
1443   ret i32 %sum
1444 overflow:
1445   call void @llvm.trap()
1446   unreachable
1447 }
1448 declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32)
1449 declare void @llvm.trap()
1450
1451 to:
1452
1453 _func1:
1454         movl    4(%esp), %eax
1455         addl    8(%esp), %eax
1456         jo      LBB1_2  ## overflow
1457 LBB1_1: ## normal
1458         ret
1459 LBB1_2: ## overflow
1460         ud2
1461
1462 it would be nice to produce "into" someday.
1463
1464 //===---------------------------------------------------------------------===//
1465
1466 This code:
1467
1468 void vec_mpys1(int y[], const int x[], int scaler) {
1469 int i;
1470 for (i = 0; i < 150; i++)
1471  y[i] += (((long long)scaler * (long long)x[i]) >> 31);
1472 }
1473
1474 Compiles to this loop with GCC 3.x:
1475
1476 .L5:
1477         movl    %ebx, %eax
1478         imull   (%edi,%ecx,4)
1479         shrdl   $31, %edx, %eax
1480         addl    %eax, (%esi,%ecx,4)
1481         incl    %ecx
1482         cmpl    $149, %ecx
1483         jle     .L5
1484
1485 llvm-gcc compiles it to the much uglier:
1486
1487 LBB1_1: ## bb1
1488         movl    24(%esp), %eax
1489         movl    (%eax,%edi,4), %ebx
1490         movl    %ebx, %ebp
1491         imull   %esi, %ebp
1492         movl    %ebx, %eax
1493         mull    %ecx
1494         addl    %ebp, %edx
1495         sarl    $31, %ebx
1496         imull   %ecx, %ebx
1497         addl    %edx, %ebx
1498         shldl   $1, %eax, %ebx
1499         movl    20(%esp), %eax
1500         addl    %ebx, (%eax,%edi,4)
1501         incl    %edi
1502         cmpl    $150, %edi
1503         jne     LBB1_1  ## bb1
1504
1505 The issue is that we hoist the cast of "scaler" to long long outside of the
1506 loop, the value comes into the loop as two values, and
1507 RegsForValue::getCopyFromRegs doesn't know how to put an AssertSext on the
1508 constructed BUILD_PAIR which represents the cast value.
1509
1510 This can be handled by making CodeGenPrepare sink the cast.
1511
1512 //===---------------------------------------------------------------------===//
1513
1514 Test instructions can be eliminated by using EFLAGS values from arithmetic
1515 instructions. This is currently not done for mul, and, or, xor, neg, shl,
1516 sra, srl, shld, shrd, atomic ops, and others. It is also currently not done
1517 for read-modify-write instructions. It is also current not done if the
1518 OF or CF flags are needed.
1519
1520 The shift operators have the complication that when the shift count is
1521 zero, EFLAGS is not set, so they can only subsume a test instruction if
1522 the shift count is known to be non-zero. Also, using the EFLAGS value
1523 from a shift is apparently very slow on some x86 implementations.
1524
1525 In read-modify-write instructions, the root node in the isel match is
1526 the store, and isel has no way for the use of the EFLAGS result of the
1527 arithmetic to be remapped to the new node.
1528
1529 Add and subtract instructions set OF on signed overflow and CF on unsiged
1530 overflow, while test instructions always clear OF and CF. In order to
1531 replace a test with an add or subtract in a situation where OF or CF is
1532 needed, codegen must be able to prove that the operation cannot see
1533 signed or unsigned overflow, respectively.
1534
1535 //===---------------------------------------------------------------------===//
1536
1537 memcpy/memmove do not lower to SSE copies when possible.  A silly example is:
1538 define <16 x float> @foo(<16 x float> %A) nounwind {
1539         %tmp = alloca <16 x float>, align 16
1540         %tmp2 = alloca <16 x float>, align 16
1541         store <16 x float> %A, <16 x float>* %tmp
1542         %s = bitcast <16 x float>* %tmp to i8*
1543         %s2 = bitcast <16 x float>* %tmp2 to i8*
1544         call void @llvm.memcpy.i64(i8* %s, i8* %s2, i64 64, i32 16)
1545         %R = load <16 x float>* %tmp2
1546         ret <16 x float> %R
1547 }
1548
1549 declare void @llvm.memcpy.i64(i8* nocapture, i8* nocapture, i64, i32) nounwind
1550
1551 which compiles to:
1552
1553 _foo:
1554         subl    $140, %esp
1555         movaps  %xmm3, 112(%esp)
1556         movaps  %xmm2, 96(%esp)
1557         movaps  %xmm1, 80(%esp)
1558         movaps  %xmm0, 64(%esp)
1559         movl    60(%esp), %eax
1560         movl    %eax, 124(%esp)
1561         movl    56(%esp), %eax
1562         movl    %eax, 120(%esp)
1563         movl    52(%esp), %eax
1564         <many many more 32-bit copies>
1565         movaps  (%esp), %xmm0
1566         movaps  16(%esp), %xmm1
1567         movaps  32(%esp), %xmm2
1568         movaps  48(%esp), %xmm3
1569         addl    $140, %esp
1570         ret
1571
1572 On Nehalem, it may even be cheaper to just use movups when unaligned than to
1573 fall back to lower-granularity chunks.
1574
1575 //===---------------------------------------------------------------------===//
1576
1577 Implement processor-specific optimizations for parity with GCC on these
1578 processors.  GCC does two optimizations:
1579
1580 1. ix86_pad_returns inserts a noop before ret instructions if immediately
1581    preceeded by a conditional branch or is the target of a jump.
1582 2. ix86_avoid_jump_misspredicts inserts noops in cases where a 16-byte block of
1583    code contains more than 3 branches.
1584
1585 The first one is done for all AMDs, Core2, and "Generic"
1586 The second one is done for: Atom, Pentium Pro, all AMDs, Pentium 4, Nocona,
1587   Core 2, and "Generic"
1588
1589 //===---------------------------------------------------------------------===//
1590
1591 Testcase:
1592 int a(int x) { return (x & 127) > 31; }
1593
1594 Current output:
1595         movl    4(%esp), %eax
1596         andl    $127, %eax
1597         cmpl    $31, %eax
1598         seta    %al
1599         movzbl  %al, %eax
1600         ret
1601
1602 Ideal output:
1603         xorl    %eax, %eax
1604         testl   $96, 4(%esp)
1605         setne   %al
1606         ret
1607
1608 This should definitely be done in instcombine, canonicalizing the range
1609 condition into a != condition.  We get this IR:
1610
1611 define i32 @a(i32 %x) nounwind readnone {
1612 entry:
1613         %0 = and i32 %x, 127            ; <i32> [#uses=1]
1614         %1 = icmp ugt i32 %0, 31                ; <i1> [#uses=1]
1615         %2 = zext i1 %1 to i32          ; <i32> [#uses=1]
1616         ret i32 %2
1617 }
1618
1619 Instcombine prefers to strength reduce relational comparisons to equality
1620 comparisons when possible, this should be another case of that.  This could
1621 be handled pretty easily in InstCombiner::visitICmpInstWithInstAndIntCst, but it
1622 looks like InstCombiner::visitICmpInstWithInstAndIntCst should really already
1623 be redesigned to use ComputeMaskedBits and friends.
1624
1625
1626 //===---------------------------------------------------------------------===//
1627 Testcase:
1628 int x(int a) { return (a&0xf0)>>4; }
1629
1630 Current output:
1631         movl    4(%esp), %eax
1632         shrl    $4, %eax
1633         andl    $15, %eax
1634         ret
1635
1636 Ideal output:
1637         movzbl  4(%esp), %eax
1638         shrl    $4, %eax
1639         ret
1640
1641 //===---------------------------------------------------------------------===//
1642
1643 Re-implement atomic builtins __sync_add_and_fetch() and __sync_sub_and_fetch
1644 properly.
1645
1646 When the return value is not used (i.e. only care about the value in the
1647 memory), x86 does not have to use add to implement these. Instead, it can use
1648 add, sub, inc, dec instructions with the "lock" prefix.
1649
1650 This is currently implemented using a bit of instruction selection trick. The
1651 issue is the target independent pattern produces one output and a chain and we
1652 want to map it into one that just output a chain. The current trick is to select
1653 it into a MERGE_VALUES with the first definition being an implicit_def. The
1654 proper solution is to add new ISD opcodes for the no-output variant. DAG
1655 combiner can then transform the node before it gets to target node selection.
1656
1657 Problem #2 is we are adding a whole bunch of x86 atomic instructions when in
1658 fact these instructions are identical to the non-lock versions. We need a way to
1659 add target specific information to target nodes and have this information
1660 carried over to machine instructions. Asm printer (or JIT) can use this
1661 information to add the "lock" prefix.
1662
1663 //===---------------------------------------------------------------------===//
1664
1665 _Bool bar(int *x) { return *x & 1; }
1666
1667 define zeroext i1 @bar(i32* nocapture %x) nounwind readonly {
1668 entry:
1669   %tmp1 = load i32* %x                            ; <i32> [#uses=1]
1670   %and = and i32 %tmp1, 1                         ; <i32> [#uses=1]
1671   %tobool = icmp ne i32 %and, 0                   ; <i1> [#uses=1]
1672   ret i1 %tobool
1673 }
1674
1675 bar:                                                        # @bar
1676 # BB#0:                                                     # %entry
1677         movl    4(%esp), %eax
1678         movb    (%eax), %al
1679         andb    $1, %al
1680         movzbl  %al, %eax
1681         ret
1682
1683 Missed optimization: should be movl+andl.
1684
1685 //===---------------------------------------------------------------------===//
1686
1687 Consider the following two functions compiled with clang:
1688 _Bool foo(int *x) { return !(*x & 4); }
1689 unsigned bar(int *x) { return !(*x & 4); }
1690
1691 foo:
1692         movl    4(%esp), %eax
1693         testb   $4, (%eax)
1694         sete    %al
1695         movzbl  %al, %eax
1696         ret
1697
1698 bar:
1699         movl    4(%esp), %eax
1700         movl    (%eax), %eax
1701         shrl    $2, %eax
1702         andl    $1, %eax
1703         xorl    $1, %eax
1704         ret
1705
1706 The second function generates more code even though the two functions are
1707 are functionally identical.
1708
1709 //===---------------------------------------------------------------------===//
1710
1711 Take the following C code:
1712 int x(int y) { return (y & 63) << 14; }
1713
1714 Code produced by gcc:
1715         andl    $63, %edi
1716         sall    $14, %edi
1717         movl    %edi, %eax
1718         ret
1719
1720 Code produced by clang:
1721         shll    $14, %edi
1722         movl    %edi, %eax
1723         andl    $1032192, %eax
1724         ret
1725
1726 The code produced by gcc is 3 bytes shorter.  This sort of construct often
1727 shows up with bitfields.
1728
1729 //===---------------------------------------------------------------------===//
1730
1731 Take the following C code:
1732 int f(int a, int b) { return (unsigned char)a == (unsigned char)b; }
1733
1734 We generate the following IR with clang:
1735 define i32 @f(i32 %a, i32 %b) nounwind readnone {
1736 entry:
1737   %tmp = xor i32 %b, %a                           ; <i32> [#uses=1]
1738   %tmp6 = and i32 %tmp, 255                       ; <i32> [#uses=1]
1739   %cmp = icmp eq i32 %tmp6, 0                     ; <i1> [#uses=1]
1740   %conv5 = zext i1 %cmp to i32                    ; <i32> [#uses=1]
1741   ret i32 %conv5
1742 }
1743
1744 And the following x86 code:
1745         xorl    %esi, %edi
1746         testb   $-1, %dil
1747         sete    %al
1748         movzbl  %al, %eax
1749         ret
1750
1751 A cmpb instead of the xorl+testb would be one instruction shorter.
1752
1753 //===---------------------------------------------------------------------===//
1754
1755 Given the following C code:
1756 int f(int a, int b) { return (signed char)a == (signed char)b; }
1757
1758 We generate the following IR with clang:
1759 define i32 @f(i32 %a, i32 %b) nounwind readnone {
1760 entry:
1761   %sext = shl i32 %a, 24                          ; <i32> [#uses=1]
1762   %conv1 = ashr i32 %sext, 24                     ; <i32> [#uses=1]
1763   %sext6 = shl i32 %b, 24                         ; <i32> [#uses=1]
1764   %conv4 = ashr i32 %sext6, 24                    ; <i32> [#uses=1]
1765   %cmp = icmp eq i32 %conv1, %conv4               ; <i1> [#uses=1]
1766   %conv5 = zext i1 %cmp to i32                    ; <i32> [#uses=1]
1767   ret i32 %conv5
1768 }
1769
1770 And the following x86 code:
1771         movsbl  %sil, %eax
1772         movsbl  %dil, %ecx
1773         cmpl    %eax, %ecx
1774         sete    %al
1775         movzbl  %al, %eax
1776         ret
1777
1778
1779 It should be possible to eliminate the sign extensions.
1780
1781 //===---------------------------------------------------------------------===//
1782
1783 LLVM misses a load+store narrowing opportunity in this code:
1784
1785 %struct.bf = type { i64, i16, i16, i32 }
1786
1787 @bfi = external global %struct.bf*                ; <%struct.bf**> [#uses=2]
1788
1789 define void @t1() nounwind ssp {
1790 entry:
1791   %0 = load %struct.bf** @bfi, align 8            ; <%struct.bf*> [#uses=1]
1792   %1 = getelementptr %struct.bf* %0, i64 0, i32 1 ; <i16*> [#uses=1]
1793   %2 = bitcast i16* %1 to i32*                    ; <i32*> [#uses=2]
1794   %3 = load i32* %2, align 1                      ; <i32> [#uses=1]
1795   %4 = and i32 %3, -65537                         ; <i32> [#uses=1]
1796   store i32 %4, i32* %2, align 1
1797   %5 = load %struct.bf** @bfi, align 8            ; <%struct.bf*> [#uses=1]
1798   %6 = getelementptr %struct.bf* %5, i64 0, i32 1 ; <i16*> [#uses=1]
1799   %7 = bitcast i16* %6 to i32*                    ; <i32*> [#uses=2]
1800   %8 = load i32* %7, align 1                      ; <i32> [#uses=1]
1801   %9 = and i32 %8, -131073                        ; <i32> [#uses=1]
1802   store i32 %9, i32* %7, align 1
1803   ret void
1804 }
1805
1806 LLVM currently emits this:
1807
1808   movq  bfi(%rip), %rax
1809   andl  $-65537, 8(%rax)
1810   movq  bfi(%rip), %rax
1811   andl  $-131073, 8(%rax)
1812   ret
1813
1814 It could narrow the loads and stores to emit this:
1815
1816   movq  bfi(%rip), %rax
1817   andb  $-2, 10(%rax)
1818   movq  bfi(%rip), %rax
1819   andb  $-3, 10(%rax)
1820   ret
1821
1822 The trouble is that there is a TokenFactor between the store and the
1823 load, making it non-trivial to determine if there's anything between
1824 the load and the store which would prohibit narrowing.
1825
1826 //===---------------------------------------------------------------------===//
1827
1828 This code:
1829 void foo(unsigned x) {
1830   if (x == 0) bar();
1831   else if (x == 1) qux();
1832 }
1833
1834 currently compiles into:
1835 _foo:
1836         movl    4(%esp), %eax
1837         cmpl    $1, %eax
1838         je      LBB0_3
1839         testl   %eax, %eax
1840         jne     LBB0_4
1841
1842 the testl could be removed:
1843 _foo:
1844         movl    4(%esp), %eax
1845         cmpl    $1, %eax
1846         je      LBB0_3
1847         jb      LBB0_4
1848
1849 0 is the only unsigned number < 1.
1850
1851 //===---------------------------------------------------------------------===//
1852
1853 This code:
1854
1855 %0 = type { i32, i1 }
1856
1857 define i32 @add32carry(i32 %sum, i32 %x) nounwind readnone ssp {
1858 entry:
1859   %uadd = tail call %0 @llvm.uadd.with.overflow.i32(i32 %sum, i32 %x)
1860   %cmp = extractvalue %0 %uadd, 1
1861   %inc = zext i1 %cmp to i32
1862   %add = add i32 %x, %sum
1863   %z.0 = add i32 %add, %inc
1864   ret i32 %z.0
1865 }
1866
1867 declare %0 @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone
1868
1869 compiles to:
1870
1871 _add32carry:                            ## @add32carry
1872         addl    %esi, %edi
1873         sbbl    %ecx, %ecx
1874         movl    %edi, %eax
1875         subl    %ecx, %eax
1876         ret
1877
1878 But it could be:
1879
1880 _add32carry:
1881         leal    (%rsi,%rdi), %eax
1882         cmpl    %esi, %eax
1883         adcl    $0, %eax
1884         ret
1885
1886 //===---------------------------------------------------------------------===//
1887
1888 This:
1889 char t(char c) {
1890   return c/3;
1891 }
1892
1893 Compiles to: $clang t.c -S -o - -O3 -mkernel -fomit-frame-pointer
1894
1895 _t:                                     ## @t
1896         movslq  %edi, %rax
1897         imulq   $-1431655765, %rax, %rcx ## imm = 0xFFFFFFFFAAAAAAAB
1898         shrq    $32, %rcx
1899         addl    %ecx, %eax
1900         movl    %eax, %ecx
1901         shrl    $31, %ecx
1902         shrl    %eax
1903         addl    %ecx, %eax
1904         movsbl  %al, %eax
1905         ret
1906
1907 GCC gets:
1908
1909 _t:
1910         movl    $86, %eax
1911         imulb   %dil
1912         shrw    $8, %ax
1913         sarb    $7, %dil
1914         subb    %dil, %al
1915         movsbl  %al,%eax
1916         ret
1917
1918 which is nicer.  This also happens for int, not just char.
1919
1920 //===---------------------------------------------------------------------===//
1921
1922
1923