lib/Target/X86/README.txt

   1 //===---------------------------------------------------------------------===//
   2 // Random ideas for the X86 backend.
   3 //===---------------------------------------------------------------------===//
   4
   5 Missing features:
   6   - Support for SSE4: http://www.intel.com/software/penryn
   7 http://softwarecommunity.intel.com/isn/Downloads/Intel%20SSE4%20Programming%20Reference.pdf
   8   - support for 3DNow!
   9   - weird abis?
  10
  11 //===---------------------------------------------------------------------===//
  12
  13 Add a MUL2U and MUL2S nodes to represent a multiply that returns both the
  14 Hi and Lo parts (combination of MUL and MULH[SU] into one node).  Add this to
  15 X86, & make the dag combiner produce it when needed.  This will eliminate one
  16 imul from the code generated for:
  17
  18 long long test(long long X, long long Y) { return X*Y; }
  19
  20 by using the EAX result from the mul.  We should add a similar node for
  21 DIVREM.
  22
  23 another case is:
  24
  25 long long test(int X, int Y) { return (long long)X*Y; }
  26
  27 ... which should only be one imul instruction.
  28
  29 or:
  30
  31 unsigned long long int t2(unsigned int a, unsigned int b) {
  32        return (unsigned long long)a * b;
  33 }
  34
  35 ... which should be one mul instruction.
  36
  37
  38 This can be done with a custom expander, but it would be nice to move this to
  39 generic code.
  40
  41 //===---------------------------------------------------------------------===//
  42
  43 CodeGen/X86/lea-3.ll:test3 should be a single LEA, not a shift/move.  The X86
  44 backend knows how to three-addressify this shift, but it appears the register
  45 allocator isn't even asking it to do so in this case.  We should investigate
  46 why this isn't happening, it could have significant impact on other important
  47 cases for X86 as well.
  48
  49 //===---------------------------------------------------------------------===//
  50
  51 This should be one DIV/IDIV instruction, not a libcall:
  52
  53 unsigned test(unsigned long long X, unsigned Y) {
  54         return X/Y;
  55 }
  56
  57 This can be done trivially with a custom legalizer.  What about overflow
  58 though?  http://gcc.gnu.org/bugzilla/show_bug.cgi?id=14224
  59
  60 //===---------------------------------------------------------------------===//
  61
  62 Improvements to the multiply -> shift/add algorithm:
  63 http://gcc.gnu.org/ml/gcc-patches/2004-08/msg01590.html
  64
  65 //===---------------------------------------------------------------------===//
  66
  67 Improve code like this (occurs fairly frequently, e.g. in LLVM):
  68 long long foo(int x) { return 1LL << x; }
  69
  70 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01109.html
  71 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01128.html
  72 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html
  73
  74 Another useful one would be  ~0ULL >> X and ~0ULL << X.
  75
  76 One better solution for 1LL << x is:
  77         xorl    %eax, %eax
  78         xorl    %edx, %edx
  79         testb   $32, %cl
  80         sete    %al
  81         setne   %dl
  82         sall    %cl, %eax
  83         sall    %cl, %edx
  84
  85 But that requires good 8-bit subreg support.
  86
  87 64-bit shifts (in general) expand to really bad code.  Instead of using
  88 cmovs, we should expand to a conditional branch like GCC produces.
  89
  90 //===---------------------------------------------------------------------===//
  91
  92 Compile this:
  93 _Bool f(_Bool a) { return a!=1; }
  94
  95 into:
  96         movzbl  %dil, %eax
  97         xorl    $1, %eax
  98         ret
  99
 100 //===---------------------------------------------------------------------===//
 101
 102 Some isel ideas:
 103
 104 1. Dynamic programming based approach when compile time if not an
 105    issue.
 106 2. Code duplication (addressing mode) during isel.
 107 3. Other ideas from "Register-Sensitive Selection, Duplication, and
 108    Sequencing of Instructions".
 109 4. Scheduling for reduced register pressure.  E.g. "Minimum Register
 110    Instruction Sequence Problem: Revisiting Optimal Code Generation for DAGs"
 111    and other related papers.
 112    http://citeseer.ist.psu.edu/govindarajan01minimum.html
 113
 114 //===---------------------------------------------------------------------===//
 115
 116 Should we promote i16 to i32 to avoid partial register update stalls?
 117
 118 //===---------------------------------------------------------------------===//
 119
 120 Leave any_extend as pseudo instruction and hint to register
 121 allocator. Delay codegen until post register allocation.
 122
 123 //===---------------------------------------------------------------------===//
 124
 125 Count leading zeros and count trailing zeros:
 126
 127 int clz(int X) { return __builtin_clz(X); }
 128 int ctz(int X) { return __builtin_ctz(X); }
 129
 130 $ gcc t.c -S -o - -O3  -fomit-frame-pointer -masm=intel
 131 clz:
 132         bsr     %eax, DWORD PTR [%esp+4]
 133         xor     %eax, 31
 134         ret
 135 ctz:
 136         bsf     %eax, DWORD PTR [%esp+4]
 137         ret
 138
 139 however, check that these are defined for 0 and 32.  Our intrinsics are, GCC's
 140 aren't.
 141
 142 Another example (use predsimplify to eliminate a select):
 143
 144 int foo (unsigned long j) {
 145   if (j)
 146     return __builtin_ffs (j) - 1;
 147   else
 148     return 0;
 149 }
 150
 151 //===---------------------------------------------------------------------===//
 152
 153 Use push/pop instructions in prolog/epilog sequences instead of stores off
 154 ESP (certain code size win, perf win on some [which?] processors).
 155 Also, it appears icc use push for parameter passing. Need to investigate.
 156
 157 //===---------------------------------------------------------------------===//
 158
 159 Only use inc/neg/not instructions on processors where they are faster than
 160 add/sub/xor.  They are slower on the P4 due to only updating some processor
 161 flags.
 162
 163 //===---------------------------------------------------------------------===//
 164
 165 The instruction selector sometimes misses folding a load into a compare.  The
 166 pattern is written as (cmp reg, (load p)).  Because the compare isn't
 167 commutative, it is not matched with the load on both sides.  The dag combiner
 168 should be made smart enough to cannonicalize the load into the RHS of a compare
 169 when it can invert the result of the compare for free.
 170
 171 //===---------------------------------------------------------------------===//
 172
 173 How about intrinsics? An example is:
 174   *res = _mm_mulhi_epu16(*A, _mm_mul_epu32(*B, *C));
 175
 176 compiles to
 177         pmuludq (%eax), %xmm0
 178         movl 8(%esp), %eax
 179         movdqa (%eax), %xmm1
 180         pmulhuw %xmm0, %xmm1
 181
 182 The transformation probably requires a X86 specific pass or a DAG combiner
 183 target specific hook.
 184
 185 //===---------------------------------------------------------------------===//
 186
 187 In many cases, LLVM generates code like this:
 188
 189 _test:
 190         movl 8(%esp), %eax
 191         cmpl %eax, 4(%esp)
 192         setl %al
 193         movzbl %al, %eax
 194         ret
 195
 196 on some processors (which ones?), it is more efficient to do this:
 197
 198 _test:
 199         movl 8(%esp), %ebx
 200         xor  %eax, %eax
 201         cmpl %ebx, 4(%esp)
 202         setl %al
 203         ret
 204
 205 Doing this correctly is tricky though, as the xor clobbers the flags.
 206
 207 //===---------------------------------------------------------------------===//
 208
 209 We should generate bts/btr/etc instructions on targets where they are cheap or
 210 when codesize is important.  e.g., for:
 211
 212 void setbit(int *target, int bit) {
 213     *target |= (1 << bit);
 214 }
 215 void clearbit(int *target, int bit) {
 216     *target &= ~(1 << bit);
 217 }
 218
 219 //===---------------------------------------------------------------------===//
 220
 221 Instead of the following for memset char*, 1, 10:
 222
 223         movl $16843009, 4(%edx)
 224         movl $16843009, (%edx)
 225         movw $257, 8(%edx)
 226
 227 It might be better to generate
 228
 229         movl $16843009, %eax
 230         movl %eax, 4(%edx)
 231         movl %eax, (%edx)
 232         movw al, 8(%edx)
 233
 234 when we can spare a register. It reduces code size.
 235
 236 //===---------------------------------------------------------------------===//
 237
 238 Evaluate what the best way to codegen sdiv X, (2^C) is.  For X/8, we currently
 239 get this:
 240
 241 int %test1(int %X) {
 242         %Y = div int %X, 8
 243         ret int %Y
 244 }
 245
 246 _test1:
 247         movl 4(%esp), %eax
 248         movl %eax, %ecx
 249         sarl $31, %ecx
 250         shrl $29, %ecx
 251         addl %ecx, %eax
 252         sarl $3, %eax
 253         ret
 254
 255 GCC knows several different ways to codegen it, one of which is this:
 256
 257 _test1:
 258         movl    4(%esp), %eax
 259         cmpl    $-1, %eax
 260         leal    7(%eax), %ecx
 261         cmovle  %ecx, %eax
 262         sarl    $3, %eax
 263         ret
 264
 265 which is probably slower, but it's interesting at least :)
 266
 267 //===---------------------------------------------------------------------===//
 268
 269 The first BB of this code:
 270
 271 declare bool %foo()
 272 int %bar() {
 273         %V = call bool %foo()
 274         br bool %V, label %T, label %F
 275 T:
 276         ret int 1
 277 F:
 278         call bool %foo()
 279         ret int 12
 280 }
 281
 282 compiles to:
 283
 284 _bar:
 285         subl $12, %esp
 286         call L_foo$stub
 287         xorb $1, %al
 288         testb %al, %al
 289         jne LBB_bar_2   # F
 290
 291 It would be better to emit "cmp %al, 1" than a xor and test.
 292
 293 //===---------------------------------------------------------------------===//
 294
 295 Enable X86InstrInfo::convertToThreeAddress().
 296
 297 //===---------------------------------------------------------------------===//
 298
 299 We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and rep/movsl
 300 We should leave these as libcalls for everything over a much lower threshold,
 301 since libc is hand tuned for medium and large mem ops (avoiding RFO for large
 302 stores, TLB preheating, etc)
 303
 304 //===---------------------------------------------------------------------===//
 305
 306 Optimize this into something reasonable:
 307  x * copysign(1.0, y) * copysign(1.0, z)
 308
 309 //===---------------------------------------------------------------------===//
 310
 311 Optimize copysign(x, *y) to use an integer load from y.
 312
 313 //===---------------------------------------------------------------------===//
 314
 315 %X = weak global int 0
 316
 317 void %foo(int %N) {
 318         %N = cast int %N to uint
 319         %tmp.24 = setgt int %N, 0
 320         br bool %tmp.24, label %no_exit, label %return
 321
 322 no_exit:
 323         %indvar = phi uint [ 0, %entry ], [ %indvar.next, %no_exit ]
 324         %i.0.0 = cast uint %indvar to int
 325         volatile store int %i.0.0, int* %X
 326         %indvar.next = add uint %indvar, 1
 327         %exitcond = seteq uint %indvar.next, %N
 328         br bool %exitcond, label %return, label %no_exit
 329
 330 return:
 331         ret void
 332 }
 333
 334 compiles into:
 335
 336         .text
 337         .align  4
 338         .globl  _foo
 339 _foo:
 340         movl 4(%esp), %eax
 341         cmpl $1, %eax
 342         jl LBB_foo_4    # return
 343 LBB_foo_1:      # no_exit.preheader
 344         xorl %ecx, %ecx
 345 LBB_foo_2:      # no_exit
 346         movl L_X$non_lazy_ptr, %edx
 347         movl %ecx, (%edx)
 348         incl %ecx
 349         cmpl %eax, %ecx
 350         jne LBB_foo_2   # no_exit
 351 LBB_foo_3:      # return.loopexit
 352 LBB_foo_4:      # return
 353         ret
 354
 355 We should hoist "movl L_X$non_lazy_ptr, %edx" out of the loop after
 356 remateralization is implemented. This can be accomplished with 1) a target
 357 dependent LICM pass or 2) makeing SelectDAG represent the whole function.
 358
 359 //===---------------------------------------------------------------------===//
 360
 361 The following tests perform worse with LSR:
 362
 363 lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesor.
 364
 365 //===---------------------------------------------------------------------===//
 366
 367 We are generating far worse code than gcc:
 368
 369 volatile short X, Y;
 370
 371 void foo(int N) {
 372   int i;
 373   for (i = 0; i < N; i++) { X = i; Y = i*4; }
 374 }
 375
 376 LBB1_1: #bb.preheader
 377         xorl %ecx, %ecx
 378         xorw %dx, %dx
 379 LBB1_2: #bb
 380         movl L_X$non_lazy_ptr, %esi
 381         movw %dx, (%esi)
 382         movw %dx, %si
 383         shlw $2, %si
 384         movl L_Y$non_lazy_ptr, %edi
 385         movw %si, (%edi)
 386         incl %ecx
 387         incw %dx
 388         cmpl %eax, %ecx
 389         jne LBB1_2      #bb
 390
 391 vs.
 392
 393         xorl    %edx, %edx
 394         movl    L_X$non_lazy_ptr-"L00000000001$pb"(%ebx), %esi
 395         movl    L_Y$non_lazy_ptr-"L00000000001$pb"(%ebx), %ecx
 396 L4:
 397         movw    %dx, (%esi)
 398         leal    0(,%edx,4), %eax
 399         movw    %ax, (%ecx)
 400         addl    $1, %edx
 401         cmpl    %edx, %edi
 402         jne     L4
 403
 404 There are 3 issues:
 405
 406 1. Lack of post regalloc LICM.
 407 2. Poor sub-regclass support. That leads to inability to promote the 16-bit
 408    arithmetic op to 32-bit and making use of leal.
 409 3. LSR unable to reused IV for a different type (i16 vs. i32) even though
 410    the cast would be free.
 411
 412 //===---------------------------------------------------------------------===//
 413
 414 Teach the coalescer to coalesce vregs of different register classes. e.g. FR32 /
 415 FR64 to VR128.
 416
 417 //===---------------------------------------------------------------------===//
 418
 419 mov $reg, 48(%esp)
 420 ...
 421 leal 48(%esp), %eax
 422 mov %eax, (%esp)
 423 call _foo
 424
 425 Obviously it would have been better for the first mov (or any op) to store
 426 directly %esp[0] if there are no other uses.
 427
 428 //===---------------------------------------------------------------------===//
 429
 430 Adding to the list of cmp / test poor codegen issues:
 431
 432 int test(__m128 *A, __m128 *B) {
 433   if (_mm_comige_ss(*A, *B))
 434     return 3;
 435   else
 436     return 4;
 437 }
 438
 439 _test:
 440         movl 8(%esp), %eax
 441         movaps (%eax), %xmm0
 442         movl 4(%esp), %eax
 443         movaps (%eax), %xmm1
 444         comiss %xmm0, %xmm1
 445         setae %al
 446         movzbl %al, %ecx
 447         movl $3, %eax
 448         movl $4, %edx
 449         cmpl $0, %ecx
 450         cmove %edx, %eax
 451         ret
 452
 453 Note the setae, movzbl, cmpl, cmove can be replaced with a single cmovae. There
 454 are a number of issues. 1) We are introducing a setcc between the result of the
 455 intrisic call and select. 2) The intrinsic is expected to produce a i32 value
 456 so a any extend (which becomes a zero extend) is added.
 457
 458 We probably need some kind of target DAG combine hook to fix this.
 459
 460 //===---------------------------------------------------------------------===//
 461
 462 We generate significantly worse code for this than GCC:
 463 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150
 464 http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701
 465
 466 There is also one case we do worse on PPC.
 467
 468 //===---------------------------------------------------------------------===//
 469
 470 If shorter, we should use things like:
 471 movzwl %ax, %eax
 472 instead of:
 473 andl $65535, %EAX
 474
 475 The former can also be used when the two-addressy nature of the 'and' would
 476 require a copy to be inserted (in X86InstrInfo::convertToThreeAddress).
 477
 478 //===---------------------------------------------------------------------===//
 479
 480 Bad codegen:
 481
 482 char foo(int x) { return x; }
 483
 484 _foo:
 485         movl 4(%esp), %eax
 486         shll $24, %eax
 487         sarl $24, %eax
 488         ret
 489
 490 SIGN_EXTEND_INREG can be implemented as (sext (trunc)) to take advantage of
 491 sub-registers.
 492
 493 //===---------------------------------------------------------------------===//
 494
 495 Consider this:
 496
 497 typedef struct pair { float A, B; } pair;
 498 void pairtest(pair P, float *FP) {
 499         *FP = P.A+P.B;
 500 }
 501
 502 We currently generate this code with llvmgcc4:
 503
 504 _pairtest:
 505         movl 8(%esp), %eax
 506         movl 4(%esp), %ecx
 507         movd %eax, %xmm0
 508         movd %ecx, %xmm1
 509         addss %xmm0, %xmm1
 510         movl 12(%esp), %eax
 511         movss %xmm1, (%eax)
 512         ret
 513
 514 we should be able to generate:
 515 _pairtest:
 516         movss 4(%esp), %xmm0
 517         movl 12(%esp), %eax
 518         addss 8(%esp), %xmm0
 519         movss %xmm0, (%eax)
 520         ret
 521
 522 The issue is that llvmgcc4 is forcing the struct to memory, then passing it as
 523 integer chunks.  It does this so that structs like {short,short} are passed in
 524 a single 32-bit integer stack slot.  We should handle the safe cases above much
 525 nicer, while still handling the hard cases.
 526
 527 While true in general, in this specific case we could do better by promoting
 528 load int + bitcast to float -> load fload.  This basically needs alignment info,
 529 the code is already implemented (but disabled) in dag combine).
 530
 531 //===---------------------------------------------------------------------===//
 532
 533 Another instruction selector deficiency:
 534
 535 void %bar() {
 536         %tmp = load int (int)** %foo
 537         %tmp = tail call int %tmp( int 3 )
 538         ret void
 539 }
 540
 541 _bar:
 542         subl $12, %esp
 543         movl L_foo$non_lazy_ptr, %eax
 544         movl (%eax), %eax
 545         call *%eax
 546         addl $12, %esp
 547         ret
 548
 549 The current isel scheme will not allow the load to be folded in the call since
 550 the load's chain result is read by the callseq_start.
 551
 552 //===---------------------------------------------------------------------===//
 553
 554 Don't forget to find a way to squash noop truncates in the JIT environment.
 555
 556 //===---------------------------------------------------------------------===//
 557
 558 Implement anyext in the same manner as truncate that would allow them to be
 559 eliminated.
 560
 561 //===---------------------------------------------------------------------===//
 562
 563 How about implementing truncate / anyext as a property of machine instruction
 564 operand? i.e. Print as 32-bit super-class register / 16-bit sub-class register.
 565 Do this for the cases where a truncate / anyext is guaranteed to be eliminated.
 566 For IA32 that is truncate from 32 to 16 and anyext from 16 to 32.
 567
 568 //===---------------------------------------------------------------------===//
 569
 570 For this:
 571
 572 int test(int a)
 573 {
 574   return a * 3;
 575 }
 576
 577 We currently emits
 578         imull $3, 4(%esp), %eax
 579
 580 Perhaps this is what we really should generate is? Is imull three or four
 581 cycles? Note: ICC generates this:
 582         movl    4(%esp), %eax
 583         leal    (%eax,%eax,2), %eax
 584
 585 The current instruction priority is based on pattern complexity. The former is
 586 more "complex" because it folds a load so the latter will not be emitted.
 587
 588 Perhaps we should use AddedComplexity to give LEA32r a higher priority? We
 589 should always try to match LEA first since the LEA matching code does some
 590 estimate to determine whether the match is profitable.
 591
 592 However, if we care more about code size, then imull is better. It's two bytes
 593 shorter than movl + leal.
 594
 595 //===---------------------------------------------------------------------===//
 596
 597 Implement CTTZ, CTLZ with bsf and bsr.
 598
 599 //===---------------------------------------------------------------------===//
 600
 601 It appears gcc place string data with linkonce linkage in
 602 .section __TEXT,__const_coal,coalesced instead of
 603 .section __DATA,__const_coal,coalesced.
 604 Take a look at darwin.h, there are other Darwin assembler directives that we
 605 do not make use of.
 606
 607 //===---------------------------------------------------------------------===//
 608
 609 int %foo(int* %a, int %t) {
 610 entry:
 611         br label %cond_true
 612
 613 cond_true:              ; preds = %cond_true, %entry
 614         %x.0.0 = phi int [ 0, %entry ], [ %tmp9, %cond_true ]
 615         %t_addr.0.0 = phi int [ %t, %entry ], [ %tmp7, %cond_true ]
 616         %tmp2 = getelementptr int* %a, int %x.0.0
 617         %tmp3 = load int* %tmp2         ; <int> [#uses=1]
 618         %tmp5 = add int %t_addr.0.0, %x.0.0             ; <int> [#uses=1]
 619         %tmp7 = add int %tmp5, %tmp3            ; <int> [#uses=2]
 620         %tmp9 = add int %x.0.0, 1               ; <int> [#uses=2]
 621         %tmp = setgt int %tmp9, 39              ; <bool> [#uses=1]
 622         br bool %tmp, label %bb12, label %cond_true
 623
 624 bb12:           ; preds = %cond_true
 625         ret int %tmp7
 626 }
 627
 628 is pessimized by -loop-reduce and -indvars
 629
 630 //===---------------------------------------------------------------------===//
 631
 632 u32 to float conversion improvement:
 633
 634 float uint32_2_float( unsigned u ) {
 635   float fl = (int) (u & 0xffff);
 636   float fh = (int) (u >> 16);
 637   fh *= 0x1.0p16f;
 638   return fh + fl;
 639 }
 640
 641 00000000        subl    $0x04,%esp
 642 00000003        movl    0x08(%esp,1),%eax
 643 00000007        movl    %eax,%ecx
 644 00000009        shrl    $0x10,%ecx
 645 0000000c        cvtsi2ss        %ecx,%xmm0
 646 00000010        andl    $0x0000ffff,%eax
 647 00000015        cvtsi2ss        %eax,%xmm1
 648 00000019        mulss   0x00000078,%xmm0
 649 00000021        addss   %xmm1,%xmm0
 650 00000025        movss   %xmm0,(%esp,1)
 651 0000002a        flds    (%esp,1)
 652 0000002d        addl    $0x04,%esp
 653 00000030        ret
 654
 655 //===---------------------------------------------------------------------===//
 656
 657 When using fastcc abi, align stack slot of argument of type double on 8 byte
 658 boundary to improve performance.
 659
 660 //===---------------------------------------------------------------------===//
 661
 662 Codegen:
 663
 664 int f(int a, int b) {
 665   if (a == 4 || a == 6)
 666     b++;
 667   return b;
 668 }
 669
 670
 671 as:
 672
 673 or eax, 2
 674 cmp eax, 6
 675 jz label
 676
 677 //===---------------------------------------------------------------------===//
 678
 679 GCC's ix86_expand_int_movcc function (in i386.c) has a ton of interesting
 680 simplifications for integer "x cmp y ? a : b".  For example, instead of:
 681
 682 int G;
 683 void f(int X, int Y) {
 684   G = X < 0 ? 14 : 13;
 685 }
 686
 687 compiling to:
 688
 689 _f:
 690         movl $14, %eax
 691         movl $13, %ecx
 692         movl 4(%esp), %edx
 693         testl %edx, %edx
 694         cmovl %eax, %ecx
 695         movl %ecx, _G
 696         ret
 697
 698 it could be:
 699 _f:
 700         movl    4(%esp), %eax
 701         sarl    $31, %eax
 702         notl    %eax
 703         addl    $14, %eax
 704         movl    %eax, _G
 705         ret
 706
 707 etc.
 708
 709 //===---------------------------------------------------------------------===//
 710
 711 Currently we don't have elimination of redundant stack manipulations. Consider
 712 the code:
 713
 714 int %main() {
 715 entry:
 716         call fastcc void %test1( )
 717         call fastcc void %test2( sbyte* cast (void ()* %test1 to sbyte*) )
 718         ret int 0
 719 }
 720
 721 declare fastcc void %test1()
 722
 723 declare fastcc void %test2(sbyte*)
 724
 725
 726 This currently compiles to:
 727
 728         subl $16, %esp
 729         call _test5
 730         addl $12, %esp
 731         subl $16, %esp
 732         movl $_test5, (%esp)
 733         call _test6
 734         addl $12, %esp
 735
 736 The add\sub pair is really unneeded here.
 737
 738 //===---------------------------------------------------------------------===//
 739
 740 We currently compile sign_extend_inreg into two shifts:
 741
 742 long foo(long X) {
 743   return (long)(signed char)X;
 744 }
 745
 746 becomes:
 747
 748 _foo:
 749         movl 4(%esp), %eax
 750         shll $24, %eax
 751         sarl $24, %eax
 752         ret
 753
 754 This could be:
 755
 756 _foo:
 757         movsbl  4(%esp),%eax
 758         ret
 759
 760 //===---------------------------------------------------------------------===//
 761
 762 Consider the expansion of:
 763
 764 uint %test3(uint %X) {
 765         %tmp1 = rem uint %X, 255
 766         ret uint %tmp1
 767 }
 768
 769 Currently it compiles to:
 770
 771 ...
 772         movl $2155905153, %ecx
 773         movl 8(%esp), %esi
 774         movl %esi, %eax
 775         mull %ecx
 776 ...
 777
 778 This could be "reassociated" into:
 779
 780         movl $2155905153, %eax
 781         movl 8(%esp), %ecx
 782         mull %ecx
 783
 784 to avoid the copy.  In fact, the existing two-address stuff would do this
 785 except that mul isn't a commutative 2-addr instruction.  I guess this has
 786 to be done at isel time based on the #uses to mul?
 787
 788 //===---------------------------------------------------------------------===//
 789
 790 Make sure the instruction which starts a loop does not cross a cacheline
 791 boundary. This requires knowning the exact length of each machine instruction.
 792 That is somewhat complicated, but doable. Example 256.bzip2:
 793
 794 In the new trace, the hot loop has an instruction which crosses a cacheline
 795 boundary.  In addition to potential cache misses, this can't help decoding as I
 796 imagine there has to be some kind of complicated decoder reset and realignment
 797 to grab the bytes from the next cacheline.
 798
 799 532  532 0x3cfc movb     (1809(%esp, %esi), %bl   <<<--- spans 2 64 byte lines
 800 942  942 0x3d03 movl     %dh, (1809(%esp, %esi)
 801 937  937 0x3d0a incl     %esi
 802 3    3   0x3d0b cmpb     %bl, %dl
 803 27   27  0x3d0d jnz      0x000062db <main+11707>
 804
 805 //===---------------------------------------------------------------------===//
 806
 807 In c99 mode, the preprocessor doesn't like assembly comments like #TRUNCATE.
 808
 809 //===---------------------------------------------------------------------===//
 810
 811 This could be a single 16-bit load.
 812
 813 int f(char *p) {
 814     if ((p[0] == 1) & (p[1] == 2)) return 1;
 815     return 0;
 816 }
 817
 818 //===---------------------------------------------------------------------===//
 819
 820 We should inline lrintf and probably other libc functions.
 821
 822 //===---------------------------------------------------------------------===//
 823
 824 Start using the flags more.  For example, compile:
 825
 826 int add_zf(int *x, int y, int a, int b) {
 827      if ((*x += y) == 0)
 828           return a;
 829      else
 830           return b;
 831 }
 832
 833 to:
 834        addl    %esi, (%rdi)
 835        movl    %edx, %eax
 836        cmovne  %ecx, %eax
 837        ret
 838 instead of:
 839
 840 _add_zf:
 841         addl (%rdi), %esi
 842         movl %esi, (%rdi)
 843         testl %esi, %esi
 844         cmove %edx, %ecx
 845         movl %ecx, %eax
 846         ret
 847
 848 and:
 849
 850 int add_zf(int *x, int y, int a, int b) {
 851      if ((*x + y) < 0)
 852           return a;
 853      else
 854           return b;
 855 }
 856
 857 to:
 858
 859 add_zf:
 860         addl    (%rdi), %esi
 861         movl    %edx, %eax
 862         cmovns  %ecx, %eax
 863         ret
 864
 865 instead of:
 866
 867 _add_zf:
 868         addl (%rdi), %esi
 869         testl %esi, %esi
 870         cmovs %edx, %ecx
 871         movl %ecx, %eax
 872         ret
 873
 874 //===---------------------------------------------------------------------===//
 875
 876 This:
 877 #include <math.h>
 878 int foo(double X) { return isnan(X); }
 879
 880 compiles to (-m64):
 881
 882 _foo:
 883         pxor %xmm1, %xmm1
 884         ucomisd %xmm1, %xmm0
 885         setp %al
 886         movzbl %al, %eax
 887         ret
 888
 889 the pxor is not needed, we could compare the value against itself.
 890
 891 //===---------------------------------------------------------------------===//
 892
 893 These two functions have identical effects:
 894
 895 unsigned int f(unsigned int i, unsigned int n) {++i; if (i == n) ++i; return i;}
 896 unsigned int f2(unsigned int i, unsigned int n) {++i; i += i == n; return i;}
 897
 898 We currently compile them to:
 899
 900 _f:
 901         movl 4(%esp), %eax
 902         movl %eax, %ecx
 903         incl %ecx
 904         movl 8(%esp), %edx
 905         cmpl %edx, %ecx
 906         jne LBB1_2      #UnifiedReturnBlock
 907 LBB1_1: #cond_true
 908         addl $2, %eax
 909         ret
 910 LBB1_2: #UnifiedReturnBlock
 911         movl %ecx, %eax
 912         ret
 913 _f2:
 914         movl 4(%esp), %eax
 915         movl %eax, %ecx
 916         incl %ecx
 917         cmpl 8(%esp), %ecx
 918         sete %cl
 919         movzbl %cl, %ecx
 920         leal 1(%ecx,%eax), %eax
 921         ret
 922
 923 both of which are inferior to GCC's:
 924
 925 _f:
 926         movl    4(%esp), %edx
 927         leal    1(%edx), %eax
 928         addl    $2, %edx
 929         cmpl    8(%esp), %eax
 930         cmove   %edx, %eax
 931         ret
 932 _f2:
 933         movl    4(%esp), %eax
 934         addl    $1, %eax
 935         xorl    %edx, %edx
 936         cmpl    8(%esp), %eax
 937         sete    %dl
 938         addl    %edx, %eax
 939         ret
 940
 941 //===---------------------------------------------------------------------===//
 942
 943 This code:
 944
 945 void test(int X) {
 946   if (X) abort();
 947 }
 948
 949 is currently compiled to:
 950
 951 _test:
 952         subl $12, %esp
 953         cmpl $0, 16(%esp)
 954         jne LBB1_1
 955         addl $12, %esp
 956         ret
 957 LBB1_1:
 958         call L_abort$stub
 959
 960 It would be better to produce:
 961
 962 _test:
 963         subl $12, %esp
 964         cmpl $0, 16(%esp)
 965         jne L_abort$stub
 966         addl $12, %esp
 967         ret
 968
 969 This can be applied to any no-return function call that takes no arguments etc.
 970 Alternatively, the stack save/restore logic could be shrink-wrapped, producing
 971 something like this:
 972
 973 _test:
 974         cmpl $0, 4(%esp)
 975         jne LBB1_1
 976         ret
 977 LBB1_1:
 978         subl $12, %esp
 979         call L_abort$stub
 980
 981 Both are useful in different situations.  Finally, it could be shrink-wrapped
 982 and tail called, like this:
 983
 984 _test:
 985         cmpl $0, 4(%esp)
 986         jne LBB1_1
 987         ret
 988 LBB1_1:
 989         pop %eax   # realign stack.
 990         call L_abort$stub
 991
 992 Though this probably isn't worth it.
 993
 994 //===---------------------------------------------------------------------===//
 995
 996 We need to teach the codegen to convert two-address INC instructions to LEA
 997 when the flags are dead.  For example, on X86-64, compile:
 998
 999 int foo(int A, int B) {
1000   return A+1;
1001 }
1002
1003 to:
1004
1005 _foo:
1006         leal    1(%edi), %eax
1007         ret
1008
1009 instead of:
1010
1011 _foo:
1012         incl %edi
1013         movl %edi, %eax
1014         ret
1015
1016 Another example is:
1017
1018 ;; X's live range extends beyond the shift, so the register allocator
1019 ;; cannot coalesce it with Y.  Because of this, a copy needs to be
1020 ;; emitted before the shift to save the register value before it is
1021 ;; clobbered.  However, this copy is not needed if the register
1022 ;; allocator turns the shift into an LEA.  This also occurs for ADD.
1023
1024 ; Check that the shift gets turned into an LEA.
1025 ; RUN: llvm-upgrade < %s | llvm-as | llc -march=x86 -x86-asm-syntax=intel | \
1026 ; RUN:   not grep {mov E.X, E.X}
1027
1028 %G = external global int
1029
1030 int %test1(int %X, int %Y) {
1031         %Z = add int %X, %Y
1032         volatile store int %Y, int* %G
1033         volatile store int %Z, int* %G
1034         ret int %X
1035 }
1036
1037 int %test2(int %X) {
1038         %Z = add int %X, 1  ;; inc
1039         volatile store int %Z, int* %G
1040         ret int %X
1041 }
1042
1043 //===---------------------------------------------------------------------===//
1044
1045 We use push/pop of stack space around calls in situations where we don't have to.
1046 Call to f below produces:
1047         subl $16, %esp      <<<<<
1048         movl %eax, (%esp)
1049         call L_f$stub
1050         addl $16, %esp     <<<<<
1051 The stack push/pop can be moved into the prolog/epilog.  It does this because it's
1052 building the frame pointer, but this should not be sufficient, only the use of alloca
1053 should cause it to do this.
1054 (There are other issues shown by this code, but this is one.)
1055
1056 typedef struct _range_t {
1057     float fbias;
1058     float fscale;
1059     int ibias;
1060     int iscale;
1061     int ishift;
1062     unsigned char lut[];
1063 } range_t;
1064
1065 struct _decode_t {
1066     int type:4;
1067     int unit:4;
1068     int alpha:8;
1069     int N:8;
1070     int bpc:8;
1071     int bpp:16;
1072     int skip:8;
1073     int swap:8;
1074     const range_t*const*range;
1075 };
1076
1077 typedef struct _decode_t decode_t;
1078
1079 extern int f(const decode_t* decode);
1080
1081 int decode_byte (const decode_t* decode) {
1082   if (decode->swap != 0)
1083     return f(decode);
1084   return 0;
1085 }
1086
1087
1088 //===---------------------------------------------------------------------===//
1089
1090 This:
1091 #include <xmmintrin.h>
1092 unsigned test(float f) {
1093  return _mm_cvtsi128_si32( (__m128i) _mm_set_ss( f ));
1094 }
1095
1096 Compiles to:
1097 _test:
1098         movss 4(%esp), %xmm0
1099         movd %xmm0, %eax
1100         ret
1101
1102 it should compile to a move from the stack slot directly into eax.  DAGCombine
1103 has this xform, but it is currently disabled until the alignment fields of
1104 the load/store nodes are trustworthy.
1105
1106 //===---------------------------------------------------------------------===//
1107
1108 Sometimes it is better to codegen subtractions from a constant (e.g. 7-x) with
1109 a neg instead of a sub instruction.  Consider:
1110
1111 int test(char X) { return 7-X; }
1112
1113 we currently produce:
1114 _test:
1115         movl $7, %eax
1116         movsbl 4(%esp), %ecx
1117         subl %ecx, %eax
1118         ret
1119
1120 We would use one fewer register if codegen'd as:
1121
1122         movsbl 4(%esp), %eax
1123         neg %eax
1124         add $7, %eax
1125         ret
1126
1127 Note that this isn't beneficial if the load can be folded into the sub.  In
1128 this case, we want a sub:
1129
1130 int test(int X) { return 7-X; }
1131 _test:
1132         movl $7, %eax
1133         subl 4(%esp), %eax
1134         ret
1135