lib/Target/X86/README.txt

   1 //===---------------------------------------------------------------------===//
   2 // Random ideas for the X86 backend.
   3 //===---------------------------------------------------------------------===//
   4
   5 Add MUL2U and MUL2S nodes to represent a multiply that returns both the
   6 Hi and Lo parts (combination of MUL and MULH[SU] into one node).  Add this to
   7 X86, & make the dag combiner produce it when needed.  This will eliminate one
   8 imul from the code generated for:
   9
  10 long long test(long long X, long long Y) { return X*Y; }
  11
  12 by using the EAX result from the mul.  We should add a similar node for
  13 DIVREM.
  14
  15 another case is:
  16
  17 long long test(int X, int Y) { return (long long)X*Y; }
  18
  19 ... which should only be one imul instruction.
  20
  21 //===---------------------------------------------------------------------===//
  22
  23 This should be one DIV/IDIV instruction, not a libcall:
  24
  25 unsigned test(unsigned long long X, unsigned Y) {
  26         return X/Y;
  27 }
  28
  29 This can be done trivially with a custom legalizer.  What about overflow
  30 though?  http://gcc.gnu.org/bugzilla/show_bug.cgi?id=14224
  31
  32 //===---------------------------------------------------------------------===//
  33
  34 Improvements to the multiply -> shift/add algorithm:
  35 http://gcc.gnu.org/ml/gcc-patches/2004-08/msg01590.html
  36
  37 //===---------------------------------------------------------------------===//
  38
  39 Improve code like this (occurs fairly frequently, e.g. in LLVM):
  40 long long foo(int x) { return 1LL << x; }
  41
  42 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01109.html
  43 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01128.html
  44 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html
  45
  46 Another useful one would be  ~0ULL >> X and ~0ULL << X.
  47
  48 One better solution for 1LL << x is:
  49         xorl    %eax, %eax
  50         xorl    %edx, %edx
  51         testb   $32, %cl
  52         sete    %al
  53         setne   %dl
  54         sall    %cl, %eax
  55         sall    %cl, %edx
  56
  57 But that requires good 8-bit subreg support.
  58
  59
  60
  61 //===---------------------------------------------------------------------===//
  62
  63 Compile this:
  64 _Bool f(_Bool a) { return a!=1; }
  65
  66 into:
  67         movzbl  %dil, %eax
  68         xorl    $1, %eax
  69         ret
  70
  71 //===---------------------------------------------------------------------===//
  72
  73 Some isel ideas:
  74
  75 1. Dynamic programming based approach when compile time is not an
  76    issue.
  77 2. Code duplication (addressing mode) during isel.
  78 3. Other ideas from "Register-Sensitive Selection, Duplication, and
  79    Sequencing of Instructions".
  80 4. Scheduling for reduced register pressure.  E.g. "Minimum Register
  81    Instruction Sequence Problem: Revisiting Optimal Code Generation for DAGs"
  82    and other related papers.
  83    http://citeseer.ist.psu.edu/govindarajan01minimum.html
  84
  85 //===---------------------------------------------------------------------===//
  86
  87 Should we promote i16 to i32 to avoid partial register update stalls?
  88
  89 //===---------------------------------------------------------------------===//
  90
  91 Leave any_extend as pseudo instruction and hint to register
  92 allocator. Delay codegen until post register allocation.
  93
  94 //===---------------------------------------------------------------------===//
  95
  96 Count leading zeros and count trailing zeros:
  97
  98 int clz(int X) { return __builtin_clz(X); }
  99 int ctz(int X) { return __builtin_ctz(X); }
 100
 101 $ gcc t.c -S -o - -O3  -fomit-frame-pointer -masm=intel
 102 clz:
 103         bsr     %eax, DWORD PTR [%esp+4]
 104         xor     %eax, 31
 105         ret
 106 ctz:
 107         bsf     %eax, DWORD PTR [%esp+4]
 108         ret
 109
 110 however, check that these are defined for 0 and 32.  Our intrinsics are, GCC's
 111 aren't.
 112
 113 //===---------------------------------------------------------------------===//
 114
 115 Use push/pop instructions in prolog/epilog sequences instead of stores off
 116 ESP (certain code size win, perf win on some [which?] processors).
 117 Also, it appears icc uses push for parameter passing. Need to investigate.
 118
 119 //===---------------------------------------------------------------------===//
 120
 121 Only use inc/neg/not instructions on processors where they are faster than
 122 add/sub/xor.  They are slower on the P4 due to only updating some processor
 123 flags.
 124
 125 //===---------------------------------------------------------------------===//
 126
 127 The instruction selector sometimes misses folding a load into a compare.  The
 128 pattern is written as (cmp reg, (load p)).  Because the compare isn't
 129 commutative, it is not matched with the load on both sides.  The dag combiner
 130 should be made smart enough to canonicalize the load into the RHS of a compare
 131 when it can invert the result of the compare for free.
 132
 133 //===---------------------------------------------------------------------===//
 134
 135 How about intrinsics? An example is:
 136   *res = _mm_mulhi_epu16(*A, _mm_mul_epu32(*B, *C));
 137
 138 compiles to
 139         pmuludq (%eax), %xmm0
 140         movl 8(%esp), %eax
 141         movdqa (%eax), %xmm1
 142         pmulhuw %xmm0, %xmm1
 143
 144 The transformation probably requires a X86 specific pass or a DAG combiner
 145 target specific hook.
 146
 147 //===---------------------------------------------------------------------===//
 148
 149 In many cases, LLVM generates code like this:
 150
 151 _test:
 152         movl 8(%esp), %eax
 153         cmpl %eax, 4(%esp)
 154         setl %al
 155         movzbl %al, %eax
 156         ret
 157
 158 on some processors (which ones?), it is more efficient to do this:
 159
 160 _test:
 161         movl 8(%esp), %ebx
 162         xor  %eax, %eax
 163         cmpl %ebx, 4(%esp)
 164         setl %al
 165         ret
 166
 167 Doing this correctly is tricky though, as the xor clobbers the flags.
 168
 169 //===---------------------------------------------------------------------===//
 170
 171 We should generate bts/btr/etc instructions on targets where they are cheap or
 172 when codesize is important.  e.g., for:
 173
 174 void setbit(int *target, int bit) {
 175     *target |= (1 << bit);
 176 }
 177 void clearbit(int *target, int bit) {
 178     *target &= ~(1 << bit);
 179 }
 180
 181 //===---------------------------------------------------------------------===//
 182
 183 Instead of the following for memset char*, 1, 10:
 184
 185         movl $16843009, 4(%edx)
 186         movl $16843009, (%edx)
 187         movw $257, 8(%edx)
 188
 189 It might be better to generate
 190
 191         movl $16843009, %eax
 192         movl %eax, 4(%edx)
 193         movl %eax, (%edx)
 194         movw %ax, 8(%edx)
 195
 196 when we can spare a register. It reduces code size.
 197
 198 //===---------------------------------------------------------------------===//
 199
 200 Evaluate what the best way to codegen sdiv X, (2^C) is.  For X/8, we currently
 201 get this:
 202
 203 int %test1(int %X) {
 204         %Y = div int %X, 8
 205         ret int %Y
 206 }
 207
 208 _test1:
 209         movl 4(%esp), %eax
 210         movl %eax, %ecx
 211         sarl $31, %ecx
 212         shrl $29, %ecx
 213         addl %ecx, %eax
 214         sarl $3, %eax
 215         ret
 216
 217 GCC knows several different ways to codegen it, one of which is this:
 218
 219 _test1:
 220         movl    4(%esp), %eax
 221         cmpl    $-1, %eax
 222         leal    7(%eax), %ecx
 223         cmovle  %ecx, %eax
 224         sarl    $3, %eax
 225         ret
 226
 227 which is probably slower, but it's interesting at least :)
 228
 229 //===---------------------------------------------------------------------===//
 230
 231 Should generate min/max for stuff like:
 232
 233 void minf(float a, float b, float *X) {
 234   *X = a <= b ? a : b;
 235 }
 236
 237 Make use of floating point min / max instructions. Perhaps introduce ISD::FMIN
 238 and ISD::FMAX node types?
 239
 240 //===---------------------------------------------------------------------===//
 241
 242 The first BB of this code:
 243
 244 declare bool %foo()
 245 int %bar() {
 246         %V = call bool %foo()
 247         br bool %V, label %T, label %F
 248 T:
 249         ret int 1
 250 F:
 251         call bool %foo()
 252         ret int 12
 253 }
 254
 255 compiles to:
 256
 257 _bar:
 258         subl $12, %esp
 259         call L_foo$stub
 260         xorb $1, %al
 261         testb %al, %al
 262         jne LBB_bar_2   # F
 263
 264 It would be better to emit "cmp %al, 1" than a xor and test.
 265
 266 //===---------------------------------------------------------------------===//
 267
 268 Enable X86InstrInfo::convertToThreeAddress().
 269
 270 //===---------------------------------------------------------------------===//
 271
 272 We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and rep/movsl.
 273 We should leave these as libcalls for everything over a much lower threshold,
 274 since libc is hand tuned for medium and large mem ops (avoiding RFO for large
 275 stores, TLB preheating, etc.).
 276
 277 //===---------------------------------------------------------------------===//
 278
 279 Optimize this into something reasonable:
 280  x * copysign(1.0, y) * copysign(1.0, z)
 281
 282 //===---------------------------------------------------------------------===//
 283
 284 Optimize copysign(x, *y) to use an integer load from y.
 285
 286 //===---------------------------------------------------------------------===//
 287
 288 %X = weak global int 0
 289
 290 void %foo(int %N) {
 291         %N = cast int %N to uint
 292         %tmp.24 = setgt int %N, 0
 293         br bool %tmp.24, label %no_exit, label %return
 294
 295 no_exit:
 296         %indvar = phi uint [ 0, %entry ], [ %indvar.next, %no_exit ]
 297         %i.0.0 = cast uint %indvar to int
 298         volatile store int %i.0.0, int* %X
 299         %indvar.next = add uint %indvar, 1
 300         %exitcond = seteq uint %indvar.next, %N
 301         br bool %exitcond, label %return, label %no_exit
 302
 303 return:
 304         ret void
 305 }
 306
 307 compiles into:
 308
 309         .text
 310         .align  4
 311         .globl  _foo
 312 _foo:
 313         movl 4(%esp), %eax
 314         cmpl $1, %eax
 315         jl LBB_foo_4    # return
 316 LBB_foo_1:      # no_exit.preheader
 317         xorl %ecx, %ecx
 318 LBB_foo_2:      # no_exit
 319         movl L_X$non_lazy_ptr, %edx
 320         movl %ecx, (%edx)
 321         incl %ecx
 322         cmpl %eax, %ecx
 323         jne LBB_foo_2   # no_exit
 324 LBB_foo_3:      # return.loopexit
 325 LBB_foo_4:      # return
 326         ret
 327
 328 We should hoist "movl L_X$non_lazy_ptr, %edx" out of the loop after
 329 rematerialization is implemented. This can be accomplished with 1) a target
 330 dependent LICM pass or 2) making SelectionDAG represent the whole function.
 331
 332 //===---------------------------------------------------------------------===//
 333
 334 The following tests perform worse with LSR:
 335
 336 lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesort.
 337
 338 //===---------------------------------------------------------------------===//
 339
 340 Teach the coalescer to coalesce vregs of different register classes. e.g. FR32 /
 341 FR64 to VR128.
 342
 343 //===---------------------------------------------------------------------===//
 344
 345 mov $reg, 48(%esp)
 346 ...
 347 leal 48(%esp), %eax
 348 mov %eax, (%esp)
 349 call _foo
 350
 351 Obviously it would have been better for the first mov (or any op) to store
 352 directly to %esp[0] if there are no other uses.
 353
 354 //===---------------------------------------------------------------------===//
 355
 356 Adding to the list of cmp / test poor codegen issues:
 357
 358 int test(__m128 *A, __m128 *B) {
 359   if (_mm_comige_ss(*A, *B))
 360     return 3;
 361   else
 362     return 4;
 363 }
 364
 365 _test:
 366         movl 8(%esp), %eax
 367         movaps (%eax), %xmm0
 368         movl 4(%esp), %eax
 369         movaps (%eax), %xmm1
 370         comiss %xmm0, %xmm1
 371         setae %al
 372         movzbl %al, %ecx
 373         movl $3, %eax
 374         movl $4, %edx
 375         cmpl $0, %ecx
 376         cmove %edx, %eax
 377         ret
 378
 379 Note the setae, movzbl, cmpl, cmove can be replaced with a single cmovae. There
 380 are a number of issues. 1) We are introducing a setcc between the result of the
 381 intrinsic call and select. 2) The intrinsic is expected to produce an i32 value
 382 so an any extend (which becomes a zero extend) is added.
 383
 384 We probably need some kind of target DAG combine hook to fix this.
 385
 386 //===---------------------------------------------------------------------===//
 387
 388 We generate significantly worse code for this than GCC:
 389 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150
 390 http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701
 391
 392 There is also one case we do worse on PPC.
 393
 394 //===---------------------------------------------------------------------===//
 395
 396 If shorter, we should use things like:
 397 movzwl %ax, %eax
 398 instead of:
 399 andl $65535, %EAX
 400
 401 The former can also be used when the two-addressy nature of the 'and' would
 402 require a copy to be inserted (in X86InstrInfo::convertToThreeAddress).
 403
 404 //===---------------------------------------------------------------------===//
 405
 406 Bad codegen:
 407
 408 char foo(int x) { return x; }
 409
 410 _foo:
 411         movl 4(%esp), %eax
 412         shll $24, %eax
 413         sarl $24, %eax
 414         ret
 415
 416 SIGN_EXTEND_INREG can be implemented as (sext (trunc)) to take advantage of
 417 sub-registers.
 418
 419 //===---------------------------------------------------------------------===//
 420
 421 Consider this:
 422
 423 typedef struct pair { float A, B; } pair;
 424 void pairtest(pair P, float *FP) {
 425         *FP = P.A+P.B;
 426 }
 427
 428 We currently generate this code with llvmgcc4:
 429
 430 _pairtest:
 431         subl $12, %esp
 432         movl 20(%esp), %eax
 433         movl %eax, 4(%esp)
 434         movl 16(%esp), %eax
 435         movl %eax, (%esp)
 436         movss (%esp), %xmm0
 437         addss 4(%esp), %xmm0
 438         movl 24(%esp), %eax
 439         movss %xmm0, (%eax)
 440         addl $12, %esp
 441         ret
 442
 443 we should be able to generate:
 444 _pairtest:
 445         movss 4(%esp), %xmm0
 446         movl 12(%esp), %eax
 447         addss 8(%esp), %xmm0
 448         movss %xmm0, (%eax)
 449         ret
 450
 451 The issue is that llvmgcc4 is forcing the struct to memory, then passing it as
 452 integer chunks.  It does this so that structs like {short,short} are passed in
 453 a single 32-bit integer stack slot.  We should handle the safe cases above much
 454 nicer, while still handling the hard cases.
 455
 456 //===---------------------------------------------------------------------===//
 457
 458 Another instruction selector deficiency:
 459
 460 void %bar() {
 461         %tmp = load int (int)** %foo
 462         %tmp = tail call int %tmp( int 3 )
 463         ret void
 464 }
 465
 466 _bar:
 467         subl $12, %esp
 468         movl L_foo$non_lazy_ptr, %eax
 469         movl (%eax), %eax
 470         call *%eax
 471         addl $12, %esp
 472         ret
 473
 474 The current isel scheme will not allow the load to be folded in the call since
 475 the load's chain result is read by the callseq_start.
 476
 477 //===---------------------------------------------------------------------===//
 478
 479 Don't forget to find a way to squash noop truncates in the JIT environment.
 480
 481 //===---------------------------------------------------------------------===//
 482
 483 Implement anyext in the same manner as truncate that would allow them to be
 484 eliminated.
 485
 486 //===---------------------------------------------------------------------===//
 487
 488 How about implementing truncate / anyext as a property of machine instruction
 489 operand? i.e. Print as 32-bit super-class register / 16-bit sub-class register.
 490 Do this for the cases where a truncate / anyext is guaranteed to be eliminated.
 491 For IA32 that is truncate from 32 to 16 and anyext from 16 to 32.
 492
 493 //===---------------------------------------------------------------------===//
 494
 495 For this:
 496
 497 int test(int a)
 498 {
 499   return a * 3;
 500 }
 501
 502 We currently emit
 503         imull $3, 4(%esp), %eax
 504
 505 Perhaps this is what we really should generate? Is imull three or four
 506 cycles? Note: ICC generates this:
 507         movl    4(%esp), %eax
 508         leal    (%eax,%eax,2), %eax
 509
 510 The current instruction priority is based on pattern complexity. The former is
 511 more "complex" because it folds a load so the latter will not be emitted.
 512
 513 Perhaps we should use AddedComplexity to give LEA32r a higher priority? We
 514 should always try to match LEA first since the LEA matching code does some
 515 estimate to determine whether the match is profitable.
 516
 517 However, if we care more about code size, then imull is better. It's two bytes
 518 shorter than movl + leal.
 519
 520 //===---------------------------------------------------------------------===//
 521
 522 Implement CTTZ, CTLZ with bsf and bsr.
 523
 524 //===---------------------------------------------------------------------===//
 525
 526 It appears gcc places string data with linkonce linkage in
 527 .section __TEXT,__const_coal,coalesced instead of
 528 .section __DATA,__const_coal,coalesced.
 529 Take a look at darwin.h, there are other Darwin assembler directives that we
 530 do not make use of.
 531
 532 //===---------------------------------------------------------------------===//
 533
 534 We should handle __attribute__ ((__visibility__ ("hidden"))).
 535
 536 //===---------------------------------------------------------------------===//
 537
 538 int %foo(int* %a, int %t) {
 539 entry:
 540         br label %cond_true
 541
 542 cond_true:              ; preds = %cond_true, %entry
 543         %x.0.0 = phi int [ 0, %entry ], [ %tmp9, %cond_true ]           ; <int> [#uses=3]
 544         %t_addr.0.0 = phi int [ %t, %entry ], [ %tmp7, %cond_true ]             ; <int> [#uses=1]
 545         %tmp2 = getelementptr int* %a, int %x.0.0               ; <int*> [#uses=1]
 546         %tmp3 = load int* %tmp2         ; <int> [#uses=1]
 547         %tmp5 = add int %t_addr.0.0, %x.0.0             ; <int> [#uses=1]
 548         %tmp7 = add int %tmp5, %tmp3            ; <int> [#uses=2]
 549         %tmp9 = add int %x.0.0, 1               ; <int> [#uses=2]
 550         %tmp = setgt int %tmp9, 39              ; <bool> [#uses=1]
 551         br bool %tmp, label %bb12, label %cond_true
 552
 553 bb12:           ; preds = %cond_true
 554         ret int %tmp7
 555 }
 556
 557 is pessimized by -loop-reduce and -indvars
 558
 559 //===---------------------------------------------------------------------===//
 560
 561 Use cpuid to auto-detect CPU features such as SSE, SSE2, and SSE3.
 562
 563 //===---------------------------------------------------------------------===//
 564
 565 u32 to float conversion improvement:
 566
 567 float uint32_2_float( unsigned u ) {
 568   float fl = (int) (u & 0xffff);
 569   float fh = (int) (u >> 16);
 570   fh *= 0x1.0p16f;
 571   return fh + fl;
 572 }
 573
 574 00000000        subl    $0x04,%esp
 575 00000003        movl    0x08(%esp,1),%eax
 576 00000007        movl    %eax,%ecx
 577 00000009        shrl    $0x10,%ecx
 578 0000000c        cvtsi2ss        %ecx,%xmm0
 579 00000010        andl    $0x0000ffff,%eax
 580 00000015        cvtsi2ss        %eax,%xmm1
 581 00000019        mulss   0x00000078,%xmm0
 582 00000021        addss   %xmm1,%xmm0
 583 00000025        movss   %xmm0,(%esp,1)
 584 0000002a        flds    (%esp,1)
 585 0000002d        addl    $0x04,%esp
 586 00000030        ret
 587
 588 //===---------------------------------------------------------------------===//
 589
 590 When using fastcc abi, align stack slot of argument of type double on 8 byte
 591 boundary to improve performance.
 592
 593 //===---------------------------------------------------------------------===//
 594
 595 Codegen:
 596
 597 int f(int a, int b) {
 598   if (a == 4 || a == 6)
 599     b++;
 600   return b;
 601 }
 602
 603
 604 as:
 605
 606 or eax, 2
 607 cmp eax, 6
 608 jz label
 609
 610 If we aren't going to do this, we should lower the switch better.  We compile
 611 the code to:
 612
 613 _f:
 614         movl 8(%esp), %eax
 615         movl 4(%esp), %ecx
 616         cmpl $6, %ecx
 617         jl LBB1_4       #entry
 618         jmp LBB1_3      #entry
 619 LBB1_3: #entry
 620         cmpl $6, %ecx
 621         je LBB1_1       #bb
 622         jmp LBB1_2      #UnifiedReturnBlock
 623 LBB1_4: #entry
 624         cmpl $4, %ecx
 625         jne LBB1_2      #UnifiedReturnBlock
 626 LBB1_1: #bb
 627         incl %eax
 628         ret
 629 LBB1_2: #UnifiedReturnBlock
 630         ret
 631
 632 In the code above, the 'if' is turned into a 'switch' at the mid-level.  It looks
 633 like the 'lower to branches' mode could be improved a little here.  In particular,
 634 the fall-through to LBB1_3 doesn't need a branch.  It would also be nice to
 635 eliminate the redundant "cmp 6", maybe by lowering to a linear sequence of
 636 compares if there are fewer than a certain number of cases (instead of a binary sequence)?
 637
 638 //===---------------------------------------------------------------------===//
 639
 640 Compile:
 641 int %test(ulong *%tmp) {
 642         %tmp = load ulong* %tmp         ; <ulong> [#uses=1]
 643         %tmp.mask = shr ulong %tmp, ubyte 50            ; <ulong> [#uses=1]
 644         %tmp.mask = cast ulong %tmp.mask to ubyte               ; <ubyte> [#uses=1]
 645         %tmp2 = and ubyte %tmp.mask, 3          ; <ubyte> [#uses=1]
 646         %tmp2 = cast ubyte %tmp2 to int         ; <int> [#uses=1]
 647         ret int %tmp2
 648 }
 649
 650 to:
 651
 652 _test:
 653         movl 4(%esp), %eax
 654         movl 4(%eax), %eax
 655         shrl $18, %eax
 656         andl $3, %eax
 657         ret
 658
 659 instead of:
 660
 661 _test:
 662         movl 4(%esp), %eax
 663         movl 4(%eax), %eax
 664         shrl $18, %eax
 665         # TRUNCATE movb %al, %al
 666         andb $3, %al
 667         movzbl %al, %eax
 668         ret
 669
 670 This saves a movzbl, and saves a truncate if it doesn't get coalesced right.
 671 This is a simple DAGCombine to propagate the zext through the and.
 672
 673 //===---------------------------------------------------------------------===//
 674
 675 GCC's ix86_expand_int_movcc function (in i386.c) has a ton of interesting
 676 simplifications for integer "x cmp y ? a : b".  For example, instead of:
 677
 678 int G;
 679 void f(int X, int Y) {
 680   G = X < 0 ? 14 : 13;
 681 }
 682
 683 compiling to:
 684
 685 _f:
 686         movl $14, %eax
 687         movl $13, %ecx
 688         movl 4(%esp), %edx
 689         testl %edx, %edx
 690         cmovl %eax, %ecx
 691         movl %ecx, _G
 692         ret
 693
 694 it could be:
 695 _f:
 696         movl    4(%esp), %eax
 697         sarl    $31, %eax
 698         notl    %eax
 699         addl    $14, %eax
 700         movl    %eax, _G
 701         ret
 702
 703 etc.
 704
 705 //===---------------------------------------------------------------------===//
 706