lib/Target/X86/README.txt

   1 //===---------------------------------------------------------------------===//
   2 // Random ideas for the X86 backend.
   3 //===---------------------------------------------------------------------===//
   4
   5 Missing features:
   6   - Support for SSE4: http://www.intel.com/software/penryn
   7 http://softwarecommunity.intel.com/isn/Downloads/Intel%20SSE4%20Programming%20Reference.pdf
   8   - support for 3DNow!
   9   - weird abis?
  10
  11 //===---------------------------------------------------------------------===//
  12
  13 Add MUL2U and MUL2S nodes to represent a multiply that returns both the
  14 Hi and Lo parts (combination of MUL and MULH[SU] into one node).  Add this to
  15 X86, & make the dag combiner produce it when needed.  This will eliminate one
  16 imul from the code generated for:
  17
  18 long long test(long long X, long long Y) { return X*Y; }
  19
  20 by using the EAX result from the mul.  We should add a similar node for
  21 DIVREM.
  22
  23 another case is:
  24
  25 long long test(int X, int Y) { return (long long)X*Y; }
  26
  27 ... which should only be one imul instruction.
  28
  29 or:
  30
  31 unsigned long long int t2(unsigned int a, unsigned int b) {
  32        return (unsigned long long)a * b;
  33 }
  34
  35 ... which should be one mul instruction.
  36
  37
  38 This can be done with a custom expander, but it would be nice to move this to
  39 generic code.
  40
  41 //===---------------------------------------------------------------------===//
  42
  43 CodeGen/X86/lea-3.ll:test3 should be a single LEA, not a shift/move.  The X86
  44 backend knows how to three-addressify this shift, but it appears the register
  45 allocator isn't even asking it to do so in this case.  We should investigate
  46 why this isn't happening, it could have significant impact on other important
  47 cases for X86 as well.
  48
  49 //===---------------------------------------------------------------------===//
  50
  51 This should be one DIV/IDIV instruction, not a libcall:
  52
  53 unsigned test(unsigned long long X, unsigned Y) {
  54         return X/Y;
  55 }
  56
  57 This can be done trivially with a custom legalizer.  What about overflow
  58 though?  http://gcc.gnu.org/bugzilla/show_bug.cgi?id=14224
  59
  60 //===---------------------------------------------------------------------===//
  61
  62 Improvements to the multiply -> shift/add algorithm:
  63 http://gcc.gnu.org/ml/gcc-patches/2004-08/msg01590.html
  64
  65 //===---------------------------------------------------------------------===//
  66
  67 Improve code like this (occurs fairly frequently, e.g. in LLVM):
  68 long long foo(int x) { return 1LL << x; }
  69
  70 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01109.html
  71 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01128.html
  72 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html
  73
  74 Another useful one would be  ~0ULL >> X and ~0ULL << X.
  75
  76 One better solution for 1LL << x is:
  77         xorl    %eax, %eax
  78         xorl    %edx, %edx
  79         testb   $32, %cl
  80         sete    %al
  81         setne   %dl
  82         sall    %cl, %eax
  83         sall    %cl, %edx
  84
  85 But that requires good 8-bit subreg support.
  86
  87 64-bit shifts (in general) expand to really bad code.  Instead of using
  88 cmovs, we should expand to a conditional branch like GCC produces.
  89
  90 //===---------------------------------------------------------------------===//
  91
  92 Compile this:
  93 _Bool f(_Bool a) { return a!=1; }
  94
  95 into:
  96         movzbl  %dil, %eax
  97         xorl    $1, %eax
  98         ret
  99
 100 //===---------------------------------------------------------------------===//
 101
 102 Some isel ideas:
 103
 104 1. Dynamic programming based approach when compile time is not an
 105    issue.
 106 2. Code duplication (addressing mode) during isel.
 107 3. Other ideas from "Register-Sensitive Selection, Duplication, and
 108    Sequencing of Instructions".
 109 4. Scheduling for reduced register pressure.  E.g. "Minimum Register
 110    Instruction Sequence Problem: Revisiting Optimal Code Generation for DAGs"
 111    and other related papers.
 112    http://citeseer.ist.psu.edu/govindarajan01minimum.html
 113
 114 //===---------------------------------------------------------------------===//
 115
 116 Should we promote i16 to i32 to avoid partial register update stalls?
 117
 118 //===---------------------------------------------------------------------===//
 119
 120 Leave any_extend as pseudo instruction and hint to register
 121 allocator. Delay codegen until post register allocation.
 122
 123 //===---------------------------------------------------------------------===//
 124
 125 Count leading zeros and count trailing zeros:
 126
 127 int clz(int X) { return __builtin_clz(X); }
 128 int ctz(int X) { return __builtin_ctz(X); }
 129
 130 $ gcc t.c -S -o - -O3  -fomit-frame-pointer -masm=intel
 131 clz:
 132         bsr     %eax, DWORD PTR [%esp+4]
 133         xor     %eax, 31
 134         ret
 135 ctz:
 136         bsf     %eax, DWORD PTR [%esp+4]
 137         ret
 138
 139 however, check that these are defined for 0 and 32.  Our intrinsics are, GCC's
 140 aren't.
 141
 142 Another example (use predsimplify to eliminate a select):
 143
 144 int foo (unsigned long j) {
 145   if (j)
 146     return __builtin_ffs (j) - 1;
 147   else
 148     return 0;
 149 }
 150
 151 //===---------------------------------------------------------------------===//
 152
 153 It appears icc uses push for parameter passing. Need to investigate.
 154
 155 //===---------------------------------------------------------------------===//
 156
 157 Only use inc/neg/not instructions on processors where they are faster than
 158 add/sub/xor.  They are slower on the P4 due to only updating some processor
 159 flags.
 160
 161 //===---------------------------------------------------------------------===//
 162
 163 The instruction selector sometimes misses folding a load into a compare.  The
 164 pattern is written as (cmp reg, (load p)).  Because the compare isn't
 165 commutative, it is not matched with the load on both sides.  The dag combiner
 166 should be made smart enough to canonicalize the load into the RHS of a compare
 167 when it can invert the result of the compare for free.
 168
 169 //===---------------------------------------------------------------------===//
 170
 171 How about intrinsics? An example is:
 172   *res = _mm_mulhi_epu16(*A, _mm_mul_epu32(*B, *C));
 173
 174 compiles to
 175         pmuludq (%eax), %xmm0
 176         movl 8(%esp), %eax
 177         movdqa (%eax), %xmm1
 178         pmulhuw %xmm0, %xmm1
 179
 180 The transformation probably requires a X86 specific pass or a DAG combiner
 181 target specific hook.
 182
 183 //===---------------------------------------------------------------------===//
 184
 185 In many cases, LLVM generates code like this:
 186
 187 _test:
 188         movl 8(%esp), %eax
 189         cmpl %eax, 4(%esp)
 190         setl %al
 191         movzbl %al, %eax
 192         ret
 193
 194 on some processors (which ones?), it is more efficient to do this:
 195
 196 _test:
 197         movl 8(%esp), %ebx
 198         xor  %eax, %eax
 199         cmpl %ebx, 4(%esp)
 200         setl %al
 201         ret
 202
 203 Doing this correctly is tricky though, as the xor clobbers the flags.
 204
 205 //===---------------------------------------------------------------------===//
 206
 207 We should generate bts/btr/etc instructions on targets where they are cheap or
 208 when codesize is important.  e.g., for:
 209
 210 void setbit(int *target, int bit) {
 211     *target |= (1 << bit);
 212 }
 213 void clearbit(int *target, int bit) {
 214     *target &= ~(1 << bit);
 215 }
 216
 217 //===---------------------------------------------------------------------===//
 218
 219 Instead of the following for memset char*, 1, 10:
 220
 221         movl $16843009, 4(%edx)
 222         movl $16843009, (%edx)
 223         movw $257, 8(%edx)
 224
 225 It might be better to generate
 226
 227         movl $16843009, %eax
 228         movl %eax, 4(%edx)
 229         movl %eax, (%edx)
 230         movw %ax, 8(%edx)
 231
 232 when we can spare a register. It reduces code size.
 233
 234 //===---------------------------------------------------------------------===//
 235
 236 Evaluate what the best way to codegen sdiv X, (2^C) is.  For X/8, we currently
 237 get this:
 238
 239 int %test1(int %X) {
 240         %Y = div int %X, 8
 241         ret int %Y
 242 }
 243
 244 _test1:
 245         movl 4(%esp), %eax
 246         movl %eax, %ecx
 247         sarl $31, %ecx
 248         shrl $29, %ecx
 249         addl %ecx, %eax
 250         sarl $3, %eax
 251         ret
 252
 253 GCC knows several different ways to codegen it, one of which is this:
 254
 255 _test1:
 256         movl    4(%esp), %eax
 257         cmpl    $-1, %eax
 258         leal    7(%eax), %ecx
 259         cmovle  %ecx, %eax
 260         sarl    $3, %eax
 261         ret
 262
 263 which is probably slower, but it's interesting at least :)
 264
 265 //===---------------------------------------------------------------------===//
 266
 267 The first BB of this code:
 268
 269 declare bool %foo()
 270 int %bar() {
 271         %V = call bool %foo()
 272         br bool %V, label %T, label %F
 273 T:
 274         ret int 1
 275 F:
 276         call bool %foo()
 277         ret int 12
 278 }
 279
 280 compiles to:
 281
 282 _bar:
 283         subl $12, %esp
 284         call L_foo$stub
 285         xorb $1, %al
 286         testb %al, %al
 287         jne LBB_bar_2   # F
 288
 289 It would be better to emit "cmp %al, 1" than a xor and test.
 290
 291 //===---------------------------------------------------------------------===//
 292
 293 Enable X86InstrInfo::convertToThreeAddress().
 294
 295 //===---------------------------------------------------------------------===//
 296
 297 We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and rep/movsl
 298 We should leave these as libcalls for everything over a much lower threshold,
 299 since libc is hand tuned for medium and large mem ops (avoiding RFO for large
 300 stores, TLB preheating, etc)
 301
 302 //===---------------------------------------------------------------------===//
 303
 304 Optimize this into something reasonable:
 305  x * copysign(1.0, y) * copysign(1.0, z)
 306
 307 //===---------------------------------------------------------------------===//
 308
 309 Optimize copysign(x, *y) to use an integer load from y.
 310
 311 //===---------------------------------------------------------------------===//
 312
 313 %X = weak global int 0
 314
 315 void %foo(int %N) {
 316         %N = cast int %N to uint
 317         %tmp.24 = setgt int %N, 0
 318         br bool %tmp.24, label %no_exit, label %return
 319
 320 no_exit:
 321         %indvar = phi uint [ 0, %entry ], [ %indvar.next, %no_exit ]
 322         %i.0.0 = cast uint %indvar to int
 323         volatile store int %i.0.0, int* %X
 324         %indvar.next = add uint %indvar, 1
 325         %exitcond = seteq uint %indvar.next, %N
 326         br bool %exitcond, label %return, label %no_exit
 327
 328 return:
 329         ret void
 330 }
 331
 332 compiles into:
 333
 334         .text
 335         .align  4
 336         .globl  _foo
 337 _foo:
 338         movl 4(%esp), %eax
 339         cmpl $1, %eax
 340         jl LBB_foo_4    # return
 341 LBB_foo_1:      # no_exit.preheader
 342         xorl %ecx, %ecx
 343 LBB_foo_2:      # no_exit
 344         movl L_X$non_lazy_ptr, %edx
 345         movl %ecx, (%edx)
 346         incl %ecx
 347         cmpl %eax, %ecx
 348         jne LBB_foo_2   # no_exit
 349 LBB_foo_3:      # return.loopexit
 350 LBB_foo_4:      # return
 351         ret
 352
 353 We should hoist "movl L_X$non_lazy_ptr, %edx" out of the loop after
 354 rematerialization is implemented. This can be accomplished with 1) a target
 355 dependent LICM pass or 2) making SelectionDAG represent the whole function.
 356
 357 //===---------------------------------------------------------------------===//
 358
 359 The following tests perform worse with LSR:
 360
 361 lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesort.
 362
 363 //===---------------------------------------------------------------------===//
 364
 365 We are generating far worse code than gcc:
 366
 367 volatile short X, Y;
 368
 369 void foo(int N) {
 370   int i;
 371   for (i = 0; i < N; i++) { X = i; Y = i*4; }
 372 }
 373
 374 LBB1_1: #bb.preheader
 375         xorl %ecx, %ecx
 376         xorw %dx, %dx
 377 LBB1_2: #bb
 378         movl L_X$non_lazy_ptr, %esi
 379         movw %dx, (%esi)
 380         movw %dx, %si
 381         shlw $2, %si
 382         movl L_Y$non_lazy_ptr, %edi
 383         movw %si, (%edi)
 384         incl %ecx
 385         incw %dx
 386         cmpl %eax, %ecx
 387         jne LBB1_2      #bb
 388
 389 vs.
 390
 391         xorl    %edx, %edx
 392         movl    L_X$non_lazy_ptr-"L00000000001$pb"(%ebx), %esi
 393         movl    L_Y$non_lazy_ptr-"L00000000001$pb"(%ebx), %ecx
 394 L4:
 395         movw    %dx, (%esi)
 396         leal    0(,%edx,4), %eax
 397         movw    %ax, (%ecx)
 398         addl    $1, %edx
 399         cmpl    %edx, %edi
 400         jne     L4
 401
 402 There are 3 issues:
 403
 404 1. Lack of post regalloc LICM.
 405 2. Poor sub-regclass support. That leads to inability to promote the 16-bit
 406    arithmetic op to 32-bit and making use of leal.
 407 3. LSR is unable to reuse the IV for a different type (i16 vs. i32) even though
 408    the cast would be free.
 409
 410 //===---------------------------------------------------------------------===//
 411
 412 Teach the coalescer to coalesce vregs of different register classes. e.g. FR32 /
 413 FR64 to VR128.
 414
 415 //===---------------------------------------------------------------------===//
 416
 417 mov $reg, 48(%esp)
 418 ...
 419 leal 48(%esp), %eax
 420 mov %eax, (%esp)
 421 call _foo
 422
 423 Obviously it would have been better for the first mov (or any op) to store
 424 directly to %esp[0] if there are no other uses.
 425
 426 //===---------------------------------------------------------------------===//
 427
 428 Adding to the list of cmp / test poor codegen issues:
 429
 430 int test(__m128 *A, __m128 *B) {
 431   if (_mm_comige_ss(*A, *B))
 432     return 3;
 433   else
 434     return 4;
 435 }
 436
 437 _test:
 438         movl 8(%esp), %eax
 439         movaps (%eax), %xmm0
 440         movl 4(%esp), %eax
 441         movaps (%eax), %xmm1
 442         comiss %xmm0, %xmm1
 443         setae %al
 444         movzbl %al, %ecx
 445         movl $3, %eax
 446         movl $4, %edx
 447         cmpl $0, %ecx
 448         cmove %edx, %eax
 449         ret
 450
 451 Note the setae, movzbl, cmpl, cmove can be replaced with a single cmovae. There
 452 are a number of issues. 1) We are introducing a setcc between the result of the
 453 intrinsic call and select. 2) The intrinsic is expected to produce an i32 value
 454 so an any extend (which becomes a zero extend) is added.
 455
 456 We probably need some kind of target DAG combine hook to fix this.
 457
 458 //===---------------------------------------------------------------------===//
 459
 460 We generate significantly worse code for this than GCC:
 461 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150
 462 http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701
 463
 464 There is also one case we do worse on PPC.
 465
 466 //===---------------------------------------------------------------------===//
 467
 468 If shorter, we should use things like:
 469 movzwl %ax, %eax
 470 instead of:
 471 andl $65535, %eax
 472
 473 The former can also be used when the two-addressy nature of the 'and' would
 474 require a copy to be inserted (in X86InstrInfo::convertToThreeAddress).
 475
 476 //===---------------------------------------------------------------------===//
 477
 478 Bad codegen:
 479
 480 char foo(int x) { return x; }
 481
 482 _foo:
 483         movl 4(%esp), %eax
 484         shll $24, %eax
 485         sarl $24, %eax
 486         ret
 487
 488 SIGN_EXTEND_INREG can be implemented as (sext (trunc)) to take advantage of
 489 sub-registers.
 490
 491 //===---------------------------------------------------------------------===//
 492
 493 Consider this:
 494
 495 typedef struct pair { float A, B; } pair;
 496 void pairtest(pair P, float *FP) {
 497         *FP = P.A+P.B;
 498 }
 499
 500 We currently generate this code with llvmgcc4:
 501
 502 _pairtest:
 503         movl 8(%esp), %eax
 504         movl 4(%esp), %ecx
 505         movd %eax, %xmm0
 506         movd %ecx, %xmm1
 507         addss %xmm0, %xmm1
 508         movl 12(%esp), %eax
 509         movss %xmm1, (%eax)
 510         ret
 511
 512 we should be able to generate:
 513 _pairtest:
 514         movss 4(%esp), %xmm0
 515         movl 12(%esp), %eax
 516         addss 8(%esp), %xmm0
 517         movss %xmm0, (%eax)
 518         ret
 519
 520 The issue is that llvmgcc4 is forcing the struct to memory, then passing it as
 521 integer chunks.  It does this so that structs like {short,short} are passed in
 522 a single 32-bit integer stack slot.  We should handle the safe cases above much
 523 nicer, while still handling the hard cases.
 524
 525 While true in general, in this specific case we could do better by promoting
 526 load int + bitcast to float -> load float.  This basically needs alignment info,
 527 the code is already implemented (but disabled) in dag combine.
 528
 529 //===---------------------------------------------------------------------===//
 530
 531 Another instruction selector deficiency:
 532
 533 void %bar() {
 534         %tmp = load int (int)** %foo
 535         %tmp = tail call int %tmp( int 3 )
 536         ret void
 537 }
 538
 539 _bar:
 540         subl $12, %esp
 541         movl L_foo$non_lazy_ptr, %eax
 542         movl (%eax), %eax
 543         call *%eax
 544         addl $12, %esp
 545         ret
 546
 547 The current isel scheme will not allow the load to be folded in the call since
 548 the load's chain result is read by the callseq_start.
 549
 550 //===---------------------------------------------------------------------===//
 551
 552 Don't forget to find a way to squash noop truncates in the JIT environment.
 553
 554 //===---------------------------------------------------------------------===//
 555
 556 Implement anyext in the same manner as truncate that would allow them to be
 557 eliminated.
 558
 559 //===---------------------------------------------------------------------===//
 560
 561 How about implementing truncate / anyext as a property of machine instruction
 562 operand? i.e. Print as 32-bit super-class register / 16-bit sub-class register.
 563 Do this for the cases where a truncate / anyext is guaranteed to be eliminated.
 564 For IA32 that is truncate from 32 to 16 and anyext from 16 to 32.
 565
 566 //===---------------------------------------------------------------------===//
 567
 568 For this:
 569
 570 int test(int a)
 571 {
 572   return a * 3;
 573 }
 574
 575 We currently emit
 576         imull $3, 4(%esp), %eax
 577
 578 Perhaps this is what we really should generate? Is imull three or four
 579 cycles? Note: ICC generates this:
 580         movl    4(%esp), %eax
 581         leal    (%eax,%eax,2), %eax
 582
 583 The current instruction priority is based on pattern complexity. The former is
 584 more "complex" because it folds a load so the latter will not be emitted.
 585
 586 Perhaps we should use AddedComplexity to give LEA32r a higher priority? We
 587 should always try to match LEA first since the LEA matching code does some
 588 estimate to determine whether the match is profitable.
 589
 590 However, if we care more about code size, then imull is better. It's two bytes
 591 shorter than movl + leal.
 592
 593 //===---------------------------------------------------------------------===//
 594
 595 Implement CTTZ, CTLZ with bsf and bsr.
 596
 597 //===---------------------------------------------------------------------===//
 598
 599 It appears gcc places string data with linkonce linkage in
 600 .section __TEXT,__const_coal,coalesced instead of
 601 .section __DATA,__const_coal,coalesced.
 602 Take a look at darwin.h, there are other Darwin assembler directives that we
 603 do not make use of.
 604
 605 //===---------------------------------------------------------------------===//
 606
 607 int %foo(int* %a, int %t) {
 608 entry:
 609         br label %cond_true
 610
 611 cond_true:              ; preds = %cond_true, %entry
 612         %x.0.0 = phi int [ 0, %entry ], [ %tmp9, %cond_true ]
 613         %t_addr.0.0 = phi int [ %t, %entry ], [ %tmp7, %cond_true ]
 614         %tmp2 = getelementptr int* %a, int %x.0.0
 615         %tmp3 = load int* %tmp2         ; <int> [#uses=1]
 616         %tmp5 = add int %t_addr.0.0, %x.0.0             ; <int> [#uses=1]
 617         %tmp7 = add int %tmp5, %tmp3            ; <int> [#uses=2]
 618         %tmp9 = add int %x.0.0, 1               ; <int> [#uses=2]
 619         %tmp = setgt int %tmp9, 39              ; <bool> [#uses=1]
 620         br bool %tmp, label %bb12, label %cond_true
 621
 622 bb12:           ; preds = %cond_true
 623         ret int %tmp7
 624 }
 625
 626 is pessimized by -loop-reduce and -indvars
 627
 628 //===---------------------------------------------------------------------===//
 629
 630 u32 to float conversion improvement:
 631
 632 float uint32_2_float( unsigned u ) {
 633   float fl = (int) (u & 0xffff);
 634   float fh = (int) (u >> 16);
 635   fh *= 0x1.0p16f;
 636   return fh + fl;
 637 }
 638
 639 00000000        subl    $0x04,%esp
 640 00000003        movl    0x08(%esp,1),%eax
 641 00000007        movl    %eax,%ecx
 642 00000009        shrl    $0x10,%ecx
 643 0000000c        cvtsi2ss        %ecx,%xmm0
 644 00000010        andl    $0x0000ffff,%eax
 645 00000015        cvtsi2ss        %eax,%xmm1
 646 00000019        mulss   0x00000078,%xmm0
 647 00000021        addss   %xmm1,%xmm0
 648 00000025        movss   %xmm0,(%esp,1)
 649 0000002a        flds    (%esp,1)
 650 0000002d        addl    $0x04,%esp
 651 00000030        ret
 652
 653 //===---------------------------------------------------------------------===//
 654
 655 When using fastcc abi, align stack slot of argument of type double on 8 byte
 656 boundary to improve performance.
 657
 658 //===---------------------------------------------------------------------===//
 659
 660 Codegen:
 661
 662 int f(int a, int b) {
 663   if (a == 4 || a == 6)
 664     b++;
 665   return b;
 666 }
 667
 668
 669 as:
 670
 671 or eax, 2
 672 cmp eax, 6
 673 jz label
 674
 675 //===---------------------------------------------------------------------===//
 676
 677 GCC's ix86_expand_int_movcc function (in i386.c) has a ton of interesting
 678 simplifications for integer "x cmp y ? a : b".  For example, instead of:
 679
 680 int G;
 681 void f(int X, int Y) {
 682   G = X < 0 ? 14 : 13;
 683 }
 684
 685 compiling to:
 686
 687 _f:
 688         movl $14, %eax
 689         movl $13, %ecx
 690         movl 4(%esp), %edx
 691         testl %edx, %edx
 692         cmovl %eax, %ecx
 693         movl %ecx, _G
 694         ret
 695
 696 it could be:
 697 _f:
 698         movl    4(%esp), %eax
 699         sarl    $31, %eax
 700         notl    %eax
 701         addl    $14, %eax
 702         movl    %eax, _G
 703         ret
 704
 705 etc.
 706
 707 //===---------------------------------------------------------------------===//
 708
 709 Currently we don't have elimination of redundant stack manipulations. Consider
 710 the code:
 711
 712 int %main() {
 713 entry:
 714         call fastcc void %test1( )
 715         call fastcc void %test2( sbyte* cast (void ()* %test1 to sbyte*) )
 716         ret int 0
 717 }
 718
 719 declare fastcc void %test1()
 720
 721 declare fastcc void %test2(sbyte*)
 722
 723
 724 This currently compiles to:
 725
 726         subl $16, %esp
 727         call _test5
 728         addl $12, %esp
 729         subl $16, %esp
 730         movl $_test5, (%esp)
 731         call _test6
 732         addl $12, %esp
 733
 734 The add/sub pair is really unneeded here.
 735
 736 //===---------------------------------------------------------------------===//
 737
 738 We currently compile sign_extend_inreg into two shifts:
 739
 740 long foo(long X) {
 741   return (long)(signed char)X;
 742 }
 743
 744 becomes:
 745
 746 _foo:
 747         movl 4(%esp), %eax
 748         shll $24, %eax
 749         sarl $24, %eax
 750         ret
 751
 752 This could be:
 753
 754 _foo:
 755         movsbl  4(%esp),%eax
 756         ret
 757
 758 //===---------------------------------------------------------------------===//
 759
 760 Consider the expansion of:
 761
 762 uint %test3(uint %X) {
 763         %tmp1 = rem uint %X, 255
 764         ret uint %tmp1
 765 }
 766
 767 Currently it compiles to:
 768
 769 ...
 770         movl $2155905153, %ecx
 771         movl 8(%esp), %esi
 772         movl %esi, %eax
 773         mull %ecx
 774 ...
 775
 776 This could be "reassociated" into:
 777
 778         movl $2155905153, %eax
 779         movl 8(%esp), %ecx
 780         mull %ecx
 781
 782 to avoid the copy.  In fact, the existing two-address stuff would do this
 783 except that mul isn't a commutative 2-addr instruction.  I guess this has
 784 to be done at isel time based on the #uses to mul?
 785
 786 //===---------------------------------------------------------------------===//
 787
 788 Make sure the instruction which starts a loop does not cross a cacheline
 789 boundary. This requires knowing the exact length of each machine instruction.
 790 That is somewhat complicated, but doable. Example 256.bzip2:
 791
 792 In the new trace, the hot loop has an instruction which crosses a cacheline
 793 boundary.  In addition to potential cache misses, this can't help decoding as I
 794 imagine there has to be some kind of complicated decoder reset and realignment
 795 to grab the bytes from the next cacheline.
 796
 797 532  532 0x3cfc movb     (1809(%esp, %esi), %bl   <<<--- spans 2 64 byte lines
 798 942  942 0x3d03 movl     %dh, (1809(%esp, %esi)
 799 937  937 0x3d0a incl     %esi
 800 3    3   0x3d0b cmpb     %bl, %dl
 801 27   27  0x3d0d jnz      0x000062db <main+11707>
 802
 803 //===---------------------------------------------------------------------===//
 804
 805 In c99 mode, the preprocessor doesn't like assembly comments like #TRUNCATE.
 806
 807 //===---------------------------------------------------------------------===//
 808
 809 This could be a single 16-bit load.
 810
 811 int f(char *p) {
 812     if ((p[0] == 1) & (p[1] == 2)) return 1;
 813     return 0;
 814 }
 815
 816 //===---------------------------------------------------------------------===//
 817
 818 We should inline lrintf and probably other libc functions.
 819
 820 //===---------------------------------------------------------------------===//
 821
 822 Start using the flags more.  For example, compile:
 823
 824 int add_zf(int *x, int y, int a, int b) {
 825      if ((*x += y) == 0)
 826           return a;
 827      else
 828           return b;
 829 }
 830
 831 to:
 832        addl    %esi, (%rdi)
 833        movl    %edx, %eax
 834        cmovne  %ecx, %eax
 835        ret
 836 instead of:
 837
 838 _add_zf:
 839         addl (%rdi), %esi
 840         movl %esi, (%rdi)
 841         testl %esi, %esi
 842         cmove %edx, %ecx
 843         movl %ecx, %eax
 844         ret
 845
 846 and:
 847
 848 int add_zf(int *x, int y, int a, int b) {
 849      if ((*x + y) < 0)
 850           return a;
 851      else
 852           return b;
 853 }
 854
 855 to:
 856
 857 add_zf:
 858         addl    (%rdi), %esi
 859         movl    %edx, %eax
 860         cmovns  %ecx, %eax
 861         ret
 862
 863 instead of:
 864
 865 _add_zf:
 866         addl (%rdi), %esi
 867         testl %esi, %esi
 868         cmovs %edx, %ecx
 869         movl %ecx, %eax
 870         ret
 871
 872 //===---------------------------------------------------------------------===//
 873
 874 This:
 875 #include <math.h>
 876 int foo(double X) { return isnan(X); }
 877
 878 compiles to (-m64):
 879
 880 _foo:
 881         pxor %xmm1, %xmm1
 882         ucomisd %xmm1, %xmm0
 883         setp %al
 884         movzbl %al, %eax
 885         ret
 886
 887 the pxor is not needed, we could compare the value against itself.
 888
 889 //===---------------------------------------------------------------------===//
 890
 891 These two functions have identical effects:
 892
 893 unsigned int f(unsigned int i, unsigned int n) {++i; if (i == n) ++i; return i;}
 894 unsigned int f2(unsigned int i, unsigned int n) {++i; i += i == n; return i;}
 895
 896 We currently compile them to:
 897
 898 _f:
 899         movl 4(%esp), %eax
 900         movl %eax, %ecx
 901         incl %ecx
 902         movl 8(%esp), %edx
 903         cmpl %edx, %ecx
 904         jne LBB1_2      #UnifiedReturnBlock
 905 LBB1_1: #cond_true
 906         addl $2, %eax
 907         ret
 908 LBB1_2: #UnifiedReturnBlock
 909         movl %ecx, %eax
 910         ret
 911 _f2:
 912         movl 4(%esp), %eax
 913         movl %eax, %ecx
 914         incl %ecx
 915         cmpl 8(%esp), %ecx
 916         sete %cl
 917         movzbl %cl, %ecx
 918         leal 1(%ecx,%eax), %eax
 919         ret
 920
 921 both of which are inferior to GCC's:
 922
 923 _f:
 924         movl    4(%esp), %edx
 925         leal    1(%edx), %eax
 926         addl    $2, %edx
 927         cmpl    8(%esp), %eax
 928         cmove   %edx, %eax
 929         ret
 930 _f2:
 931         movl    4(%esp), %eax
 932         addl    $1, %eax
 933         xorl    %edx, %edx
 934         cmpl    8(%esp), %eax
 935         sete    %dl
 936         addl    %edx, %eax
 937         ret
 938
 939 //===---------------------------------------------------------------------===//
 940
 941 This code:
 942
 943 void test(int X) {
 944   if (X) abort();
 945 }
 946
 947 is currently compiled to:
 948
 949 _test:
 950         subl $12, %esp
 951         cmpl $0, 16(%esp)
 952         jne LBB1_1
 953         addl $12, %esp
 954         ret
 955 LBB1_1:
 956         call L_abort$stub
 957
 958 It would be better to produce:
 959
 960 _test:
 961         subl $12, %esp
 962         cmpl $0, 16(%esp)
 963         jne L_abort$stub
 964         addl $12, %esp
 965         ret
 966
 967 This can be applied to any no-return function call that takes no arguments etc.
 968 Alternatively, the stack save/restore logic could be shrink-wrapped, producing
 969 something like this:
 970
 971 _test:
 972         cmpl $0, 4(%esp)
 973         jne LBB1_1
 974         ret
 975 LBB1_1:
 976         subl $12, %esp
 977         call L_abort$stub
 978
 979 Both are useful in different situations.  Finally, it could be shrink-wrapped
 980 and tail called, like this:
 981
 982 _test:
 983         cmpl $0, 4(%esp)
 984         jne LBB1_1
 985         ret
 986 LBB1_1:
 987         pop %eax   # realign stack.
 988         call L_abort$stub
 989
 990 Though this probably isn't worth it.
 991
 992 //===---------------------------------------------------------------------===//
 993
 994 We need to teach the codegen to convert two-address INC instructions to LEA
 995 when the flags are dead.  For example, on X86-64, compile:
 996
 997 int foo(int A, int B) {
 998   return A+1;
 999 }
1000
1001 to:
1002
1003 _foo:
1004         leal    1(%edi), %eax
1005         ret
1006
1007 instead of:
1008
1009 _foo:
1010         incl %edi
1011         movl %edi, %eax
1012         ret
1013
1014 Another example is:
1015
1016 ;; X's live range extends beyond the shift, so the register allocator
1017 ;; cannot coalesce it with Y.  Because of this, a copy needs to be
1018 ;; emitted before the shift to save the register value before it is
1019 ;; clobbered.  However, this copy is not needed if the register
1020 ;; allocator turns the shift into an LEA.  This also occurs for ADD.
1021
1022 ; Check that the shift gets turned into an LEA.
1023 ; RUN: llvm-upgrade < %s | llvm-as | llc -march=x86 -x86-asm-syntax=intel | \
1024 ; RUN:   not grep {mov E.X, E.X}
1025
1026 %G = external global int
1027
1028 int %test1(int %X, int %Y) {
1029         %Z = add int %X, %Y
1030         volatile store int %Y, int* %G
1031         volatile store int %Z, int* %G
1032         ret int %X
1033 }
1034
1035 int %test2(int %X) {
1036         %Z = add int %X, 1  ;; inc
1037         volatile store int %Z, int* %G
1038         ret int %X
1039 }
1040
1041 //===---------------------------------------------------------------------===//
1042
1043 This:
1044 #include <xmmintrin.h>
1045 unsigned test(float f) {
1046  return _mm_cvtsi128_si32( (__m128i) _mm_set_ss( f ));
1047 }
1048
1049 Compiles to:
1050 _test:
1051         movss 4(%esp), %xmm0
1052         movd %xmm0, %eax
1053         ret
1054
1055 it should compile to a move from the stack slot directly into eax.  DAGCombine
1056 has this xform, but it is currently disabled until the alignment fields of
1057 the load/store nodes are trustworthy.
1058
1059 //===---------------------------------------------------------------------===//
1060
1061 Sometimes it is better to codegen subtractions from a constant (e.g. 7-x) with
1062 a neg instead of a sub instruction.  Consider:
1063
1064 int test(char X) { return 7-X; }
1065
1066 we currently produce:
1067 _test:
1068         movl $7, %eax
1069         movsbl 4(%esp), %ecx
1070         subl %ecx, %eax
1071         ret
1072
1073 We would use one fewer register if codegen'd as:
1074
1075         movsbl 4(%esp), %eax
1076         neg %eax
1077         add $7, %eax
1078         ret
1079
1080 Note that this isn't beneficial if the load can be folded into the sub.  In
1081 this case, we want a sub:
1082
1083 int test(int X) { return 7-X; }
1084 _test:
1085         movl $7, %eax
1086         subl 4(%esp), %eax
1087         ret
1088
1089 //===---------------------------------------------------------------------===//
1090
1091 For code like:
1092 phi (undef, x)
1093
1094 We get an implicit def on the undef side. If the phi is spilled, we then get:
1095 implicitdef xmm1
1096 store xmm1 -> stack
1097
1098 It should be possible to teach the x86 backend to "fold" the store into the
1099 implicitdef, which just deletes the implicit def.
1100
1101 These instructions should go away:
1102 #IMPLICIT_DEF %xmm1
1103 movaps %xmm1, 192(%esp)
1104 movaps %xmm1, 224(%esp)
1105 movaps %xmm1, 176(%esp)
1106
1107 //===---------------------------------------------------------------------===//
1108
1109 This is a "commutable two-address" register coalescing deficiency:
1110
1111 define <4 x float> @test1(<4 x float> %V) {
1112 entry:
1113         %tmp8 = shufflevector <4 x float> %V, <4 x float> undef, <4 x i32> < i32 3, i32 2, i32 1, i32 0 >               ; <<4 x float>> [#uses=1]
1114         %add = add <4 x float> %tmp8, %V                ; <<4 x float>> [#uses=1]
1115         ret <4 x float> %add
1116 }
1117
1118 this codegens to:
1119
1120 _test1:
1121         pshufd  $27, %xmm0, %xmm1
1122         addps   %xmm0, %xmm1
1123         movaps  %xmm1, %xmm0
1124         ret
1125
1126 instead of:
1127
1128 _test1:
1129         pshufd  $27, %xmm0, %xmm1
1130         addps   %xmm1, %xmm0
1131         ret
1132