lib/Target/PowerPC/README.txt

   1 //===- README.txt - Notes for improving PowerPC-specific code gen ---------===//
   2
   3 TODO:
   4 * gpr0 allocation
   5 * implement do-loop -> bdnz transform
   6
   7 ===-------------------------------------------------------------------------===
   8
   9 Support 'update' load/store instructions.  These are cracked on the G5, but are
  10 still a codesize win.
  11
  12 With preinc enabled, this:
  13
  14 long *%test4(long *%X, long *%dest) {
  15         %Y = getelementptr long* %X, int 4
  16         %A = load long* %Y
  17         store long %A, long* %dest
  18         ret long* %Y
  19 }
  20
  21 compiles to:
  22
  23 _test4:
  24         mr r2, r3
  25         lwzu r5, 32(r2)
  26         lwz r3, 36(r3)
  27         stw r5, 0(r4)
  28         stw r3, 4(r4)
  29         mr r3, r2
  30         blr
  31
  32 with -sched=list-burr, I get:
  33
  34 _test4:
  35         lwz r2, 36(r3)
  36         lwzu r5, 32(r3)
  37         stw r2, 4(r4)
  38         stw r5, 0(r4)
  39         blr
  40
  41 ===-------------------------------------------------------------------------===
  42
  43 We compile the hottest inner loop of viterbi to:
  44
  45         li r6, 0
  46         b LBB1_84       ;bb432.i
  47 LBB1_83:        ;bb420.i
  48         lbzx r8, r5, r7
  49         addi r6, r7, 1
  50         stbx r8, r4, r7
  51 LBB1_84:        ;bb432.i
  52         mr r7, r6
  53         cmplwi cr0, r7, 143
  54         bne cr0, LBB1_83        ;bb420.i
  55
  56 The CBE manages to produce:
  57
  58         li r0, 143
  59         mtctr r0
  60 loop:
  61         lbzx r2, r2, r11
  62         stbx r0, r2, r9
  63         addi r2, r2, 1
  64         bdz later
  65         b loop
  66
  67 This could be much better (bdnz instead of bdz) but it still beats us.  If we
  68 produced this with bdnz, the loop would be a single dispatch group.
  69
  70 ===-------------------------------------------------------------------------===
  71
  72 Compile:
  73
  74 void foo(int *P) {
  75  if (P)  *P = 0;
  76 }
  77
  78 into:
  79
  80 _foo:
  81         cmpwi cr0,r3,0
  82         beqlr cr0
  83         li r0,0
  84         stw r0,0(r3)
  85         blr
  86
  87 This is effectively a simple form of predication.
  88
  89 ===-------------------------------------------------------------------------===
  90
  91 Lump the constant pool for each function into ONE pic object, and reference
  92 pieces of it as offsets from the start.  For functions like this (contrived
  93 to have lots of constants obviously):
  94
  95 double X(double Y) { return (Y*1.23 + 4.512)*2.34 + 14.38; }
  96
  97 We generate:
  98
  99 _X:
 100         lis r2, ha16(.CPI_X_0)
 101         lfd f0, lo16(.CPI_X_0)(r2)
 102         lis r2, ha16(.CPI_X_1)
 103         lfd f2, lo16(.CPI_X_1)(r2)
 104         fmadd f0, f1, f0, f2
 105         lis r2, ha16(.CPI_X_2)
 106         lfd f1, lo16(.CPI_X_2)(r2)
 107         lis r2, ha16(.CPI_X_3)
 108         lfd f2, lo16(.CPI_X_3)(r2)
 109         fmadd f1, f0, f1, f2
 110         blr
 111
 112 It would be better to materialize .CPI_X into a register, then use immediates
 113 off of the register to avoid the lis's.  This is even more important in PIC
 114 mode.
 115
 116 Note that this (and the static variable version) is discussed here for GCC:
 117 http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
 118
 119 ===-------------------------------------------------------------------------===
 120
 121 PIC Code Gen IPO optimization:
 122
 123 Squish small scalar globals together into a single global struct, allowing the
 124 address of the struct to be CSE'd, avoiding PIC accesses (also reduces the size
 125 of the GOT on targets with one).
 126
 127 Note that this is discussed here for GCC:
 128 http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
 129
 130 ===-------------------------------------------------------------------------===
 131
 132 Implement Newton-Rhapson method for improving estimate instructions to the
 133 correct accuracy, and implementing divide as multiply by reciprocal when it has
 134 more than one use.  Itanium will want this too.
 135
 136 ===-------------------------------------------------------------------------===
 137
 138 Compile this:
 139
 140 int %f1(int %a, int %b) {
 141         %tmp.1 = and int %a, 15         ; <int> [#uses=1]
 142         %tmp.3 = and int %b, 240                ; <int> [#uses=1]
 143         %tmp.4 = or int %tmp.3, %tmp.1          ; <int> [#uses=1]
 144         ret int %tmp.4
 145 }
 146
 147 without a copy.  We make this currently:
 148
 149 _f1:
 150         rlwinm r2, r4, 0, 24, 27
 151         rlwimi r2, r3, 0, 28, 31
 152         or r3, r2, r2
 153         blr
 154
 155 The two-addr pass or RA needs to learn when it is profitable to commute an
 156 instruction to avoid a copy AFTER the 2-addr instruction.  The 2-addr pass
 157 currently only commutes to avoid inserting a copy BEFORE the two addr instr.
 158
 159 ===-------------------------------------------------------------------------===
 160
 161 Compile offsets from allocas:
 162
 163 int *%test() {
 164         %X = alloca { int, int }
 165         %Y = getelementptr {int,int}* %X, int 0, uint 1
 166         ret int* %Y
 167 }
 168
 169 into a single add, not two:
 170
 171 _test:
 172         addi r2, r1, -8
 173         addi r3, r2, 4
 174         blr
 175
 176 --> important for C++.
 177
 178 ===-------------------------------------------------------------------------===
 179
 180 No loads or stores of the constants should be needed:
 181
 182 struct foo { double X, Y; };
 183 void xxx(struct foo F);
 184 void bar() { struct foo R = { 1.0, 2.0 }; xxx(R); }
 185
 186 ===-------------------------------------------------------------------------===
 187
 188 Darwin Stub LICM optimization:
 189
 190 Loops like this:
 191
 192   for (...)  bar();
 193
 194 Have to go through an indirect stub if bar is external or linkonce.  It would
 195 be better to compile it as:
 196
 197      fp = &bar;
 198      for (...)  fp();
 199
 200 which only computes the address of bar once (instead of each time through the
 201 stub).  This is Darwin specific and would have to be done in the code generator.
 202 Probably not a win on x86.
 203
 204 ===-------------------------------------------------------------------------===
 205
 206 Simple IPO for argument passing, change:
 207   void foo(int X, double Y, int Z) -> void foo(int X, int Z, double Y)
 208
 209 the Darwin ABI specifies that any integer arguments in the first 32 bytes worth
 210 of arguments get assigned to r3 through r10. That is, if you have a function
 211 foo(int, double, int) you get r3, f1, r6, since the 64 bit double ate up the
 212 argument bytes for r4 and r5. The trick then would be to shuffle the argument
 213 order for functions we can internalize so that the maximum number of
 214 integers/pointers get passed in regs before you see any of the fp arguments.
 215
 216 Instead of implementing this, it would actually probably be easier to just
 217 implement a PPC fastcc, where we could do whatever we wanted to the CC,
 218 including having this work sanely.
 219
 220 ===-------------------------------------------------------------------------===
 221
 222 Fix Darwin FP-In-Integer Registers ABI
 223
 224 Darwin passes doubles in structures in integer registers, which is very very
 225 bad.  Add something like a BIT_CONVERT to LLVM, then do an i-p transformation
 226 that percolates these things out of functions.
 227
 228 Check out how horrible this is:
 229 http://gcc.gnu.org/ml/gcc/2005-10/msg01036.html
 230
 231 This is an extension of "interprocedural CC unmunging" that can't be done with
 232 just fastcc.
 233
 234 ===-------------------------------------------------------------------------===
 235
 236 Compile this:
 237
 238 int foo(int a) {
 239   int b = (a < 8);
 240   if (b) {
 241     return b * 3;     // ignore the fact that this is always 3.
 242   } else {
 243     return 2;
 244   }
 245 }
 246
 247 into something not this:
 248
 249 _foo:
 250 1)      cmpwi cr7, r3, 8
 251         mfcr r2, 1
 252         rlwinm r2, r2, 29, 31, 31
 253 1)      cmpwi cr0, r3, 7
 254         bgt cr0, LBB1_2 ; UnifiedReturnBlock
 255 LBB1_1: ; then
 256         rlwinm r2, r2, 0, 31, 31
 257         mulli r3, r2, 3
 258         blr
 259 LBB1_2: ; UnifiedReturnBlock
 260         li r3, 2
 261         blr
 262
 263 In particular, the two compares (marked 1) could be shared by reversing one.
 264 This could be done in the dag combiner, by swapping a BR_CC when a SETCC of the
 265 same operands (but backwards) exists.  In this case, this wouldn't save us
 266 anything though, because the compares still wouldn't be shared.
 267
 268 ===-------------------------------------------------------------------------===
 269
 270 The legalizer should lower this:
 271
 272 bool %test(ulong %x) {
 273   %tmp = setlt ulong %x, 4294967296
 274   ret bool %tmp
 275 }
 276
 277 into "if x.high == 0", not:
 278
 279 _test:
 280         addi r2, r3, -1
 281         cntlzw r2, r2
 282         cntlzw r3, r3
 283         srwi r2, r2, 5
 284         srwi r4, r3, 5
 285         li r3, 0
 286         cmpwi cr0, r2, 0
 287         bne cr0, LBB1_2 ;
 288 LBB1_1:
 289         or r3, r4, r4
 290 LBB1_2:
 291         blr
 292
 293 noticed in 2005-05-11-Popcount-ffs-fls.c.
 294
 295
 296 ===-------------------------------------------------------------------------===
 297
 298 We should custom expand setcc instead of pretending that we have it.  That
 299 would allow us to expose the access of the crbit after the mfcr, allowing
 300 that access to be trivially folded into other ops.  A simple example:
 301
 302 int foo(int a, int b) { return (a < b) << 4; }
 303
 304 compiles into:
 305
 306 _foo:
 307         cmpw cr7, r3, r4
 308         mfcr r2, 1
 309         rlwinm r2, r2, 29, 31, 31
 310         slwi r3, r2, 4
 311         blr
 312
 313 ===-------------------------------------------------------------------------===
 314
 315 Fold add and sub with constant into non-extern, non-weak addresses so this:
 316
 317 static int a;
 318 void bar(int b) { a = b; }
 319 void foo(unsigned char *c) {
 320   *c = a;
 321 }
 322
 323 So that
 324
 325 _foo:
 326         lis r2, ha16(_a)
 327         la r2, lo16(_a)(r2)
 328         lbz r2, 3(r2)
 329         stb r2, 0(r3)
 330         blr
 331
 332 Becomes
 333
 334 _foo:
 335         lis r2, ha16(_a+3)
 336         lbz r2, lo16(_a+3)(r2)
 337         stb r2, 0(r3)
 338         blr
 339
 340 ===-------------------------------------------------------------------------===
 341
 342 We generate really bad code for this:
 343
 344 int f(signed char *a, _Bool b, _Bool c) {
 345    signed char t = 0;
 346   if (b)  t = *a;
 347   if (c)  *a = t;
 348 }
 349
 350 ===-------------------------------------------------------------------------===
 351
 352 This:
 353 int test(unsigned *P) { return *P >> 24; }
 354
 355 Should compile to:
 356
 357 _test:
 358         lbz r3,0(r3)
 359         blr
 360
 361 not:
 362
 363 _test:
 364         lwz r2, 0(r3)
 365         srwi r3, r2, 24
 366         blr
 367
 368 ===-------------------------------------------------------------------------===
 369
 370 On the G5, logical CR operations are more expensive in their three
 371 address form: ops that read/write the same register are half as expensive as
 372 those that read from two registers that are different from their destination.
 373
 374 We should model this with two separate instructions.  The isel should generate
 375 the "two address" form of the instructions.  When the register allocator
 376 detects that it needs to insert a copy due to the two-addresness of the CR
 377 logical op, it will invoke PPCInstrInfo::convertToThreeAddress.  At this point
 378 we can convert to the "three address" instruction, to save code space.
 379
 380 This only matters when we start generating cr logical ops.
 381
 382 ===-------------------------------------------------------------------------===
 383
 384 We should compile these two functions to the same thing:
 385
 386 #include <stdlib.h>
 387 void f(int a, int b, int *P) {
 388   *P = (a-b)>=0?(a-b):(b-a);
 389 }
 390 void g(int a, int b, int *P) {
 391   *P = abs(a-b);
 392 }
 393
 394 Further, they should compile to something better than:
 395
 396 _g:
 397         subf r2, r4, r3
 398         subfic r3, r2, 0
 399         cmpwi cr0, r2, -1
 400         bgt cr0, LBB2_2 ; entry
 401 LBB2_1: ; entry
 402         mr r2, r3
 403 LBB2_2: ; entry
 404         stw r2, 0(r5)
 405         blr
 406
 407 GCC produces:
 408
 409 _g:
 410         subf r4,r4,r3
 411         srawi r2,r4,31
 412         xor r0,r2,r4
 413         subf r0,r2,r0
 414         stw r0,0(r5)
 415         blr
 416
 417 ... which is much nicer.
 418
 419 This theoretically may help improve twolf slightly (used in dimbox.c:142?).
 420
 421 ===-------------------------------------------------------------------------===
 422
 423 int foo(int N, int ***W, int **TK, int X) {
 424   int t, i;
 425
 426   for (t = 0; t < N; ++t)
 427     for (i = 0; i < 4; ++i)
 428       W[t / X][i][t % X] = TK[i][t];
 429
 430   return 5;
 431 }
 432
 433 We generate relatively atrocious code for this loop compared to gcc.
 434
 435 We could also strength reduce the rem and the div:
 436 http://www.lcs.mit.edu/pubs/pdf/MIT-LCS-TM-600.pdf
 437
 438 ===-------------------------------------------------------------------------===
 439
 440 float foo(float X) { return (int)(X); }
 441
 442 Currently produces:
 443
 444 _foo:
 445         fctiwz f0, f1
 446         stfd f0, -8(r1)
 447         lwz r2, -4(r1)
 448         extsw r2, r2
 449         std r2, -16(r1)
 450         lfd f0, -16(r1)
 451         fcfid f0, f0
 452         frsp f1, f0
 453         blr
 454
 455 We could use a target dag combine to turn the lwz/extsw into an lwa when the
 456 lwz has a single use.  Since LWA is cracked anyway, this would be a codesize
 457 win only.
 458
 459 ===-------------------------------------------------------------------------===
 460
 461 We generate ugly code for this:
 462
 463 void func(unsigned int *ret, float dx, float dy, float dz, float dw) {
 464   unsigned code = 0;
 465   if(dx < -dw) code |= 1;
 466   if(dx > dw)  code |= 2;
 467   if(dy < -dw) code |= 4;
 468   if(dy > dw)  code |= 8;
 469   if(dz < -dw) code |= 16;
 470   if(dz > dw)  code |= 32;
 471   *ret = code;
 472 }
 473
 474 ===-------------------------------------------------------------------------===
 475
 476 Complete the signed i32 to FP conversion code using 64-bit registers
 477 transformation, good for PI.  See PPCISelLowering.cpp, this comment:
 478
 479      // FIXME: disable this lowered code.  This generates 64-bit register values,
 480      // and we don't model the fact that the top part is clobbered by calls.  We
 481      // need to flag these together so that the value isn't live across a call.
 482      //setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
 483
 484 Also, if the registers are spilled to the stack, we have to ensure that all
 485 64-bits of them are save/restored, otherwise we will miscompile the code.  It
 486 sounds like we need to get the 64-bit register classes going.
 487
 488 ===-------------------------------------------------------------------------===
 489
 490 %struct.B = type { ubyte, [3 x ubyte] }
 491
 492 void %foo(%struct.B* %b) {
 493 entry:
 494         %tmp = cast %struct.B* %b to uint*              ; <uint*> [#uses=1]
 495         %tmp = load uint* %tmp          ; <uint> [#uses=1]
 496         %tmp3 = cast %struct.B* %b to uint*             ; <uint*> [#uses=1]
 497         %tmp4 = load uint* %tmp3                ; <uint> [#uses=1]
 498         %tmp8 = cast %struct.B* %b to uint*             ; <uint*> [#uses=2]
 499         %tmp9 = load uint* %tmp8                ; <uint> [#uses=1]
 500         %tmp4.mask17 = shl uint %tmp4, ubyte 1          ; <uint> [#uses=1]
 501         %tmp1415 = and uint %tmp4.mask17, 2147483648            ; <uint> [#uses=1]
 502         %tmp.masked = and uint %tmp, 2147483648         ; <uint> [#uses=1]
 503         %tmp11 = or uint %tmp1415, %tmp.masked          ; <uint> [#uses=1]
 504         %tmp12 = and uint %tmp9, 2147483647             ; <uint> [#uses=1]
 505         %tmp13 = or uint %tmp12, %tmp11         ; <uint> [#uses=1]
 506         store uint %tmp13, uint* %tmp8
 507         ret void
 508 }
 509
 510 We emit:
 511
 512 _foo:
 513         lwz r2, 0(r3)
 514         slwi r4, r2, 1
 515         or r4, r4, r2
 516         rlwimi r2, r4, 0, 0, 0
 517         stw r2, 0(r3)
 518         blr
 519
 520 We could collapse a bunch of those ORs and ANDs and generate the following
 521 equivalent code:
 522
 523 _foo:
 524         lwz r2, 0(r3)
 525         rlwinm r4, r2, 1, 0, 0
 526         or r2, r2, r4
 527         stw r2, 0(r3)
 528         blr
 529
 530 ===-------------------------------------------------------------------------===
 531
 532 We compile:
 533
 534 unsigned test6(unsigned x) {
 535   return ((x & 0x00FF0000) >> 16) | ((x & 0x000000FF) << 16);
 536 }
 537
 538 into:
 539
 540 _test6:
 541         lis r2, 255
 542         rlwinm r3, r3, 16, 0, 31
 543         ori r2, r2, 255
 544         and r3, r3, r2
 545         blr
 546
 547 GCC gets it down to:
 548
 549 _test6:
 550         rlwinm r0,r3,16,8,15
 551         rlwinm r3,r3,16,24,31
 552         or r3,r3,r0
 553         blr
 554
 555
 556 ===-------------------------------------------------------------------------===
 557
 558 Consider a function like this:
 559
 560 float foo(float X) { return X + 1234.4123f; }
 561
 562 The FP constant ends up in the constant pool, so we need to get the LR register.
 563  This ends up producing code like this:
 564
 565 _foo:
 566 .LBB_foo_0:     ; entry
 567         mflr r11
 568 ***     stw r11, 8(r1)
 569         bl "L00000$pb"
 570 "L00000$pb":
 571         mflr r2
 572         addis r2, r2, ha16(.CPI_foo_0-"L00000$pb")
 573         lfs f0, lo16(.CPI_foo_0-"L00000$pb")(r2)
 574         fadds f1, f1, f0
 575 ***     lwz r11, 8(r1)
 576         mtlr r11
 577         blr
 578
 579 This is functional, but there is no reason to spill the LR register all the way
 580 to the stack (the two marked instrs): spilling it to a GPR is quite enough.
 581
 582 Implementing this will require some codegen improvements.  Nate writes:
 583
 584 "So basically what we need to support the "no stack frame save and restore" is a
 585 generalization of the LR optimization to "callee-save regs".
 586
 587 Currently, we have LR marked as a callee-save reg.  The register allocator sees
 588 that it's callee save, and spills it directly to the stack.
 589
 590 Ideally, something like this would happen:
 591
 592 LR would be in a separate register class from the GPRs. The class of LR would be
 593 marked "unspillable".  When the register allocator came across an unspillable
 594 reg, it would ask "what is the best class to copy this into that I *can* spill"
 595 If it gets a class back, which it will in this case (the gprs), it grabs a free
 596 register of that class.  If it is then later necessary to spill that reg, so be
 597 it.
 598
 599 ===-------------------------------------------------------------------------===