lib/Target/X86/README.txt

   1 //===---------------------------------------------------------------------===//
   2 // Random ideas for the X86 backend.
   3 //===---------------------------------------------------------------------===//
   4
   5 Add MUL2U and MUL2S nodes to represent a multiply that returns both the
   6 Hi and Lo parts (combination of MUL and MULH[SU] into one node).  Add this to
   7 X86, & make the dag combiner produce it when needed.  This will eliminate one
   8 imul from the code generated for:
   9
  10 long long test(long long X, long long Y) { return X*Y; }
  11
  12 by using the EAX result from the mul.  We should add a similar node for
  13 DIVREM.
  14
  15 Another case is:
  16
  17 long long test(int X, int Y) { return (long long)X*Y; }
  18
  19 ... which should only be one imul instruction.
  20
  21 This can be done with a custom expander, but it would be nice to move this to
  22 generic code.
  23
  24 //===---------------------------------------------------------------------===//
  25
  26 This should be one DIV/IDIV instruction, not a libcall:
  27
  28 unsigned test(unsigned long long X, unsigned Y) {
  29         return X/Y;
  30 }
  31
  32 This can be done trivially with a custom legalizer.  What about overflow
  33 though?  http://gcc.gnu.org/bugzilla/show_bug.cgi?id=14224
  34
  35 //===---------------------------------------------------------------------===//
  36
  37 Improvements to the multiply -> shift/add algorithm:
  38 http://gcc.gnu.org/ml/gcc-patches/2004-08/msg01590.html
  39
  40 //===---------------------------------------------------------------------===//
  41
  42 Improve code like this (occurs fairly frequently, e.g. in LLVM):
  43 long long foo(int x) { return 1LL << x; }
  44
  45 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01109.html
  46 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01128.html
  47 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html
  48
  49 Another useful one would be  ~0ULL >> X and ~0ULL << X.
  50
  51 One better solution for 1LL << x is:
  52         xorl    %eax, %eax
  53         xorl    %edx, %edx
  54         testb   $32, %cl
  55         sete    %al
  56         setne   %dl
  57         sall    %cl, %eax
  58         sall    %cl, %edx
  59
  60 But that requires good 8-bit subreg support.
  61
  62 64-bit shifts (in general) expand to really bad code.  Instead of using
  63 cmovs, we should expand to a conditional branch like GCC produces.
  64
  65 //===---------------------------------------------------------------------===//
  66
  67 Compile this:
  68 _Bool f(_Bool a) { return a!=1; }
  69
  70 into:
  71         movzbl  %dil, %eax
  72         xorl    $1, %eax
  73         ret
  74
  75 //===---------------------------------------------------------------------===//
  76
  77 Some isel ideas:
  78
  79 1. Dynamic programming based approach when compile time is not an
  80    issue.
  81 2. Code duplication (addressing mode) during isel.
  82 3. Other ideas from "Register-Sensitive Selection, Duplication, and
  83    Sequencing of Instructions".
  84 4. Scheduling for reduced register pressure.  E.g. "Minimum Register
  85    Instruction Sequence Problem: Revisiting Optimal Code Generation for DAGs"
  86    and other related papers.
  87    http://citeseer.ist.psu.edu/govindarajan01minimum.html
  88
  89 //===---------------------------------------------------------------------===//
  90
  91 Should we promote i16 to i32 to avoid partial register update stalls?
  92
  93 //===---------------------------------------------------------------------===//
  94
  95 Leave any_extend as pseudo instruction and hint to register
  96 allocator. Delay codegen until post register allocation.
  97
  98 //===---------------------------------------------------------------------===//
  99
 100 Count leading zeros and count trailing zeros:
 101
 102 int clz(int X) { return __builtin_clz(X); }
 103 int ctz(int X) { return __builtin_ctz(X); }
 104
 105 $ gcc t.c -S -o - -O3  -fomit-frame-pointer -masm=intel
 106 clz:
 107         bsr     %eax, DWORD PTR [%esp+4]
 108         xor     %eax, 31
 109         ret
 110 ctz:
 111         bsf     %eax, DWORD PTR [%esp+4]
 112         ret
 113
 114 however, check that these are defined for 0 and 32.  Our intrinsics are, GCC's
 115 aren't.
 116
 117 //===---------------------------------------------------------------------===//
 118
 119 Use push/pop instructions in prolog/epilog sequences instead of stores off
 120 ESP (certain code size win, perf win on some [which?] processors).
 121 Also, it appears icc uses push for parameter passing. Need to investigate.
 122
 123 //===---------------------------------------------------------------------===//
 124
 125 Only use inc/neg/not instructions on processors where they are faster than
 126 add/sub/xor.  They are slower on the P4 due to only updating some processor
 127 flags.
 128
 129 //===---------------------------------------------------------------------===//
 130
 131 The instruction selector sometimes misses folding a load into a compare.  The
 132 pattern is written as (cmp reg, (load p)).  Because the compare isn't
 133 commutative, it is not matched with the load on both sides.  The dag combiner
 134 should be made smart enough to canonicalize the load into the RHS of a compare
 135 when it can invert the result of the compare for free.
 136
 137 //===---------------------------------------------------------------------===//
 138
 139 How about intrinsics? An example is:
 140   *res = _mm_mulhi_epu16(*A, _mm_mul_epu32(*B, *C));
 141
 142 compiles to
 143         pmuludq (%eax), %xmm0
 144         movl 8(%esp), %eax
 145         movdqa (%eax), %xmm1
 146         pmulhuw %xmm0, %xmm1
 147
 148 The transformation probably requires an X86-specific pass or a DAG combiner
 149 target specific hook.
 150
 151 //===---------------------------------------------------------------------===//
 152
 153 In many cases, LLVM generates code like this:
 154
 155 _test:
 156         movl 8(%esp), %eax
 157         cmpl %eax, 4(%esp)
 158         setl %al
 159         movzbl %al, %eax
 160         ret
 161
 162 on some processors (which ones?), it is more efficient to do this:
 163
 164 _test:
 165         movl 8(%esp), %ebx
 166         xor  %eax, %eax
 167         cmpl %ebx, 4(%esp)
 168         setl %al
 169         ret
 170
 171 Doing this correctly is tricky though, as the xor clobbers the flags.
 172
 173 //===---------------------------------------------------------------------===//
 174
 175 We should generate bts/btr/etc instructions on targets where they are cheap or
 176 when codesize is important.  e.g., for:
 177
 178 void setbit(int *target, int bit) {
 179     *target |= (1 << bit);
 180 }
 181 void clearbit(int *target, int bit) {
 182     *target &= ~(1 << bit);
 183 }
 184
 185 //===---------------------------------------------------------------------===//
 186
 187 Instead of the following for memset char*, 1, 10:
 188
 189         movl $16843009, 4(%edx)
 190         movl $16843009, (%edx)
 191         movw $257, 8(%edx)
 192
 193 It might be better to generate
 194
 195         movl $16843009, %eax
 196         movl %eax, 4(%edx)
 197         movl %eax, (%edx)
 198         movw %ax, 8(%edx)
 199
 200 when we can spare a register. It reduces code size.
 201
 202 //===---------------------------------------------------------------------===//
 203
 204 Evaluate what the best way to codegen sdiv X, (2^C) is.  For X/8, we currently
 205 get this:
 206
 207 int %test1(int %X) {
 208         %Y = div int %X, 8
 209         ret int %Y
 210 }
 211
 212 _test1:
 213         movl 4(%esp), %eax
 214         movl %eax, %ecx
 215         sarl $31, %ecx
 216         shrl $29, %ecx
 217         addl %ecx, %eax
 218         sarl $3, %eax
 219         ret
 220
 221 GCC knows several different ways to codegen it, one of which is this:
 222
 223 _test1:
 224         movl    4(%esp), %eax
 225         cmpl    $-1, %eax
 226         leal    7(%eax), %ecx
 227         cmovle  %ecx, %eax
 228         sarl    $3, %eax
 229         ret
 230
 231 which is probably slower, but it's interesting at least :)
 232
 233 //===---------------------------------------------------------------------===//
 234
 235 Should generate min/max for stuff like:
 236
 237 void minf(float a, float b, float *X) {
 238   *X = a <= b ? a : b;
 239 }
 240
 241 Make use of floating point min / max instructions. Perhaps introduce ISD::FMIN
 242 and ISD::FMAX node types?
 243
 244 //===---------------------------------------------------------------------===//
 245
 246 The first BB of this code:
 247
 248 declare bool %foo()
 249 int %bar() {
 250         %V = call bool %foo()
 251         br bool %V, label %T, label %F
 252 T:
 253         ret int 1
 254 F:
 255         call bool %foo()
 256         ret int 12
 257 }
 258
 259 compiles to:
 260
 261 _bar:
 262         subl $12, %esp
 263         call L_foo$stub
 264         xorb $1, %al
 265         testb %al, %al
 266         jne LBB_bar_2   # F
 267
 268 It would be better to emit "cmp %al, 1" than a xor and test.
 269
 270 //===---------------------------------------------------------------------===//
 271
 272 Enable X86InstrInfo::convertToThreeAddress().
 273
 274 //===---------------------------------------------------------------------===//
 275
 276 We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and rep/movsl.
 277 We should leave these as libcalls for everything over a much lower threshold,
 278 since libc is hand tuned for medium and large mem ops (avoiding RFO for large
 279 stores, TLB preheating, etc)
 280
 281 //===---------------------------------------------------------------------===//
 282
 283 Optimize this into something reasonable:
 284  x * copysign(1.0, y) * copysign(1.0, z)
 285
 286 //===---------------------------------------------------------------------===//
 287
 288 Optimize copysign(x, *y) to use an integer load from y.
 289
 290 //===---------------------------------------------------------------------===//
 291
 292 %X = weak global int 0
 293
 294 void %foo(int %N) {
 295         %N = cast int %N to uint
 296         %tmp.24 = setgt int %N, 0
 297         br bool %tmp.24, label %no_exit, label %return
 298
 299 no_exit:
 300         %indvar = phi uint [ 0, %entry ], [ %indvar.next, %no_exit ]
 301         %i.0.0 = cast uint %indvar to int
 302         volatile store int %i.0.0, int* %X
 303         %indvar.next = add uint %indvar, 1
 304         %exitcond = seteq uint %indvar.next, %N
 305         br bool %exitcond, label %return, label %no_exit
 306
 307 return:
 308         ret void
 309 }
 310
 311 compiles into:
 312
 313         .text
 314         .align  4
 315         .globl  _foo
 316 _foo:
 317         movl 4(%esp), %eax
 318         cmpl $1, %eax
 319         jl LBB_foo_4    # return
 320 LBB_foo_1:      # no_exit.preheader
 321         xorl %ecx, %ecx
 322 LBB_foo_2:      # no_exit
 323         movl L_X$non_lazy_ptr, %edx
 324         movl %ecx, (%edx)
 325         incl %ecx
 326         cmpl %eax, %ecx
 327         jne LBB_foo_2   # no_exit
 328 LBB_foo_3:      # return.loopexit
 329 LBB_foo_4:      # return
 330         ret
 331
 332 We should hoist "movl L_X$non_lazy_ptr, %edx" out of the loop after
 333 rematerialization is implemented. This can be accomplished with 1) a target
 334 dependent LICM pass or 2) making SelectionDAG represent the whole function.
 335
 336 //===---------------------------------------------------------------------===//
 337
 338 The following tests perform worse with LSR:
 339
 340 lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesort.
 341
 342 //===---------------------------------------------------------------------===//
 343
 344 Teach the coalescer to coalesce vregs of different register classes. e.g. FR32 /
 345 FR64 to VR128.
 346
 347 //===---------------------------------------------------------------------===//
 348
 349 mov $reg, 48(%esp)
 350 ...
 351 leal 48(%esp), %eax
 352 mov %eax, (%esp)
 353 call _foo
 354
 355 Obviously it would have been better for the first mov (or any op) to store
 356 directly to %esp[0] if there are no other uses.
 357
 358 //===---------------------------------------------------------------------===//
 359
 360 Adding to the list of cmp / test poor codegen issues:
 361
 362 int test(__m128 *A, __m128 *B) {
 363   if (_mm_comige_ss(*A, *B))
 364     return 3;
 365   else
 366     return 4;
 367 }
 368
 369 _test:
 370         movl 8(%esp), %eax
 371         movaps (%eax), %xmm0
 372         movl 4(%esp), %eax
 373         movaps (%eax), %xmm1
 374         comiss %xmm0, %xmm1
 375         setae %al
 376         movzbl %al, %ecx
 377         movl $3, %eax
 378         movl $4, %edx
 379         cmpl $0, %ecx
 380         cmove %edx, %eax
 381         ret
 382
 383 Note the setae, movzbl, cmpl, cmove can be replaced with a single cmovae. There
 384 are a number of issues. 1) We are introducing a setcc between the result of the
 385 intrinsic call and select. 2) The intrinsic is expected to produce an i32 value
 386 so an any extend (which becomes a zero extend) is added.
 387
 388 We probably need some kind of target DAG combine hook to fix this.
 389
 390 //===---------------------------------------------------------------------===//
 391
 392 We generate significantly worse code for this than GCC:
 393 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150
 394 http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701
 395
 396 There is also one case we do worse on PPC.
 397
 398 //===---------------------------------------------------------------------===//
 399
 400 If shorter, we should use things like:
 401 movzwl %ax, %eax
 402 instead of:
 403 andl $65535, %eax
 404
 405 The former can also be used when the two-addressy nature of the 'and' would
 406 require a copy to be inserted (in X86InstrInfo::convertToThreeAddress).
 407
 408 //===---------------------------------------------------------------------===//
 409
 410 Bad codegen:
 411
 412 char foo(int x) { return x; }
 413
 414 _foo:
 415         movl 4(%esp), %eax
 416         shll $24, %eax
 417         sarl $24, %eax
 418         ret
 419
 420 SIGN_EXTEND_INREG can be implemented as (sext (trunc)) to take advantage of
 421 sub-registers.
 422
 423 //===---------------------------------------------------------------------===//
 424
 425 Consider this:
 426
 427 typedef struct pair { float A, B; } pair;
 428 void pairtest(pair P, float *FP) {
 429         *FP = P.A+P.B;
 430 }
 431
 432 We currently generate this code with llvmgcc4:
 433
 434 _pairtest:
 435         subl $12, %esp
 436         movl 20(%esp), %eax
 437         movl %eax, 4(%esp)
 438         movl 16(%esp), %eax
 439         movl %eax, (%esp)
 440         movss (%esp), %xmm0
 441         addss 4(%esp), %xmm0
 442         movl 24(%esp), %eax
 443         movss %xmm0, (%eax)
 444         addl $12, %esp
 445         ret
 446
 447 we should be able to generate:
 448 _pairtest:
 449         movss 4(%esp), %xmm0
 450         movl 12(%esp), %eax
 451         addss 8(%esp), %xmm0
 452         movss %xmm0, (%eax)
 453         ret
 454
 455 The issue is that llvmgcc4 is forcing the struct to memory, then passing it as
 456 integer chunks.  It does this so that structs like {short,short} are passed in
 457 a single 32-bit integer stack slot.  We should handle the safe cases above much
 458 nicer, while still handling the hard cases.
 459
 460 //===---------------------------------------------------------------------===//
 461
 462 Another instruction selector deficiency:
 463
 464 void %bar() {
 465         %tmp = load int (int)** %foo
 466         %tmp = tail call int %tmp( int 3 )
 467         ret void
 468 }
 469
 470 _bar:
 471         subl $12, %esp
 472         movl L_foo$non_lazy_ptr, %eax
 473         movl (%eax), %eax
 474         call *%eax
 475         addl $12, %esp
 476         ret
 477
 478 The current isel scheme will not allow the load to be folded in the call since
 479 the load's chain result is read by the callseq_start.
 480
 481 //===---------------------------------------------------------------------===//
 482
 483 Don't forget to find a way to squash noop truncates in the JIT environment.
 484
 485 //===---------------------------------------------------------------------===//
 486
 487 Implement anyext in the same manner as truncate that would allow them to be
 488 eliminated.
 489
 490 //===---------------------------------------------------------------------===//
 491
 492 How about implementing truncate / anyext as a property of machine instruction
 493 operand? i.e. Print as 32-bit super-class register / 16-bit sub-class register.
 494 Do this for the cases where a truncate / anyext is guaranteed to be eliminated.
 495 For IA32 that is truncate from 32 to 16 and anyext from 16 to 32.
 496
 497 //===---------------------------------------------------------------------===//
 498
 499 For this:
 500
 501 int test(int a)
 502 {
 503   return a * 3;
 504 }
 505
 506 We currently emit
 507         imull $3, 4(%esp), %eax
 508
 509 Perhaps this is what we really should generate? Is imull three or four
 510 cycles? Note: ICC generates this:
 511         movl    4(%esp), %eax
 512         leal    (%eax,%eax,2), %eax
 513
 514 The current instruction priority is based on pattern complexity. The former is
 515 more "complex" because it folds a load so the latter will not be emitted.
 516
 517 Perhaps we should use AddedComplexity to give LEA32r a higher priority? We
 518 should always try to match LEA first since the LEA matching code does some
 519 estimate to determine whether the match is profitable.
 520
 521 However, if we care more about code size, then imull is better. It's two bytes
 522 shorter than movl + leal.
 523
 524 //===---------------------------------------------------------------------===//
 525
 526 Implement CTTZ, CTLZ with bsf and bsr.
 527
 528 //===---------------------------------------------------------------------===//
 529
 530 It appears gcc places string data with linkonce linkage in
 531 .section __TEXT,__const_coal,coalesced instead of
 532 .section __DATA,__const_coal,coalesced.
 533 Take a look at darwin.h, there are other Darwin assembler directives that we
 534 do not make use of.
 535
 536 //===---------------------------------------------------------------------===//
 537
 538 We should handle __attribute__ ((__visibility__ ("hidden"))).
 539
 540 //===---------------------------------------------------------------------===//
 541
 542 int %foo(int* %a, int %t) {
 543 entry:
 544         br label %cond_true
 545
 546 cond_true:              ; preds = %cond_true, %entry
 547         %x.0.0 = phi int [ 0, %entry ], [ %tmp9, %cond_true ]
 548         %t_addr.0.0 = phi int [ %t, %entry ], [ %tmp7, %cond_true ]
 549         %tmp2 = getelementptr int* %a, int %x.0.0
 550         %tmp3 = load int* %tmp2         ; <int> [#uses=1]
 551         %tmp5 = add int %t_addr.0.0, %x.0.0             ; <int> [#uses=1]
 552         %tmp7 = add int %tmp5, %tmp3            ; <int> [#uses=2]
 553         %tmp9 = add int %x.0.0, 1               ; <int> [#uses=2]
 554         %tmp = setgt int %tmp9, 39              ; <bool> [#uses=1]
 555         br bool %tmp, label %bb12, label %cond_true
 556
 557 bb12:           ; preds = %cond_true
 558         ret int %tmp7
 559 }
 560
 561 is pessimized by -loop-reduce and -indvars
 562
 563 //===---------------------------------------------------------------------===//
 564
 565 Use cpuid to auto-detect CPU features such as SSE, SSE2, and SSE3.
 566
 567 //===---------------------------------------------------------------------===//
 568
 569 u32 to float conversion improvement:
 570
 571 float uint32_2_float( unsigned u ) {
 572   float fl = (int) (u & 0xffff);
 573   float fh = (int) (u >> 16);
 574   fh *= 0x1.0p16f;
 575   return fh + fl;
 576 }
 577
 578 00000000        subl    $0x04,%esp
 579 00000003        movl    0x08(%esp,1),%eax
 580 00000007        movl    %eax,%ecx
 581 00000009        shrl    $0x10,%ecx
 582 0000000c        cvtsi2ss        %ecx,%xmm0
 583 00000010        andl    $0x0000ffff,%eax
 584 00000015        cvtsi2ss        %eax,%xmm1
 585 00000019        mulss   0x00000078,%xmm0
 586 00000021        addss   %xmm1,%xmm0
 587 00000025        movss   %xmm0,(%esp,1)
 588 0000002a        flds    (%esp,1)
 589 0000002d        addl    $0x04,%esp
 590 00000030        ret
 591
 592 //===---------------------------------------------------------------------===//
 593
 594 When using fastcc abi, align stack slot of argument of type double on 8 byte
 595 boundary to improve performance.
 596
 597 //===---------------------------------------------------------------------===//
 598
 599 Codegen:
 600
 601 int f(int a, int b) {
 602   if (a == 4 || a == 6)
 603     b++;
 604   return b;
 605 }
 606
 607
 608 as:
 609
 610 or eax, 2
 611 cmp eax, 6
 612 jz label
 613
 614 If we aren't going to do this, we should lower the switch better.  We compile
 615 the code to:
 616
 617 _f:
 618         movl 8(%esp), %eax
 619         movl 4(%esp), %ecx
 620         cmpl $6, %ecx
 621         jl LBB1_4       #entry
 622         jmp LBB1_3      #entry
 623 LBB1_3: #entry
 624         cmpl $6, %ecx
 625         je LBB1_1       #bb
 626         jmp LBB1_2      #UnifiedReturnBlock
 627 LBB1_4: #entry
 628         cmpl $4, %ecx
 629         jne LBB1_2      #UnifiedReturnBlock
 630 LBB1_1: #bb
 631         incl %eax
 632         ret
 633 LBB1_2: #UnifiedReturnBlock
 634         ret
 635
 636 In the code above, the 'if' is turned into a 'switch' at the mid-level.  It
 637 looks  like the 'lower to branches' mode could be improved a little here.  In
 638 particular, the fall-through to LBB1_3 doesn't need a branch.  It would also be
 639 nice to eliminate the redundant "cmp 6", maybe by lowering to a linear sequence
 640 of compares if there are fewer than a certain number of cases (instead of a binary
 641 sequence)?
 642
 643 //===---------------------------------------------------------------------===//
 644
 645 GCC's ix86_expand_int_movcc function (in i386.c) has a ton of interesting
 646 simplifications for integer "x cmp y ? a : b".  For example, instead of:
 647
 648 int G;
 649 void f(int X, int Y) {
 650   G = X < 0 ? 14 : 13;
 651 }
 652
 653 compiling to:
 654
 655 _f:
 656         movl $14, %eax
 657         movl $13, %ecx
 658         movl 4(%esp), %edx
 659         testl %edx, %edx
 660         cmovl %eax, %ecx
 661         movl %ecx, _G
 662         ret
 663
 664 it could be:
 665 _f:
 666         movl    4(%esp), %eax
 667         sarl    $31, %eax
 668         notl    %eax
 669         addl    $14, %eax
 670         movl    %eax, _G
 671         ret
 672
 673 etc.
 674
 675 //===---------------------------------------------------------------------===//
 676
 677 Currently we don't have elimination of redundant stack manipulations. Consider
 678 the code:
 679
 680 int %main() {
 681 entry:
 682         call fastcc void %test1( )
 683         call fastcc void %test2( sbyte* cast (void ()* %test1 to sbyte*) )
 684         ret int 0
 685 }
 686
 687 declare fastcc void %test1()
 688
 689 declare fastcc void %test2(sbyte*)
 690
 691
 692 This currently compiles to:
 693
 694         subl $16, %esp
 695         call _test5
 696         addl $12, %esp
 697         subl $16, %esp
 698         movl $_test5, (%esp)
 699         call _test6
 700         addl $12, %esp
 701
 702 The add/sub pair is really unneeded here.
 703
 704 //===---------------------------------------------------------------------===//
 705
 706 We generate really bad code in some cases due to lowering SETCC/SELECT at
 707 legalize time, which prevents the post-legalize dag combine pass from
 708 understanding the code.  As a silly example, this prevents us from folding
 709 stuff like this:
 710
 711 bool %test(ulong %x) {
 712   %tmp = setlt ulong %x, 4294967296
 713   ret bool %tmp
 714 }
 715
 716 into x.h == 0
 717
 718