lib/Target/PowerPC/README.txt

   1 //===- README.txt - Notes for improving PowerPC-specific code gen ---------===//
   2
   3 TODO:
   4 * gpr0 allocation
   5 * implement do-loop -> bdnz transform
   6 * implement powerpc-64 for darwin
   7
   8 ===-------------------------------------------------------------------------===
   9
  10 Support 'update' load/store instructions.  These are cracked on the G5, but are
  11 still a codesize win.
  12
  13 ===-------------------------------------------------------------------------===
  14
  15 Teach the .td file to pattern match PPC::BR_COND to appropriate bc variant, so
  16 we don't have to always run the branch selector for small functions.
  17
  18 ===-------------------------------------------------------------------------===
  19
  20 * Codegen this:
  21
  22    void test2(int X) {
  23      if (X == 0x12345678) bar();
  24    }
  25
  26     as:
  27
  28        xoris r0,r3,0x1234
  29        cmplwi cr0,r0,0x5678
  30        beq cr0,L6
  31
  32     not:
  33
  34         lis r2, 4660
  35         ori r2, r2, 22136
  36         cmpw cr0, r3, r2
  37         bne .LBB_test2_2
  38
  39 ===-------------------------------------------------------------------------===
  40
  41 Lump the constant pool for each function into ONE pic object, and reference
  42 pieces of it as offsets from the start.  For functions like this (contrived
  43 to have lots of constants obviously):
  44
  45 double X(double Y) { return (Y*1.23 + 4.512)*2.34 + 14.38; }
  46
  47 We generate:
  48
  49 _X:
  50         lis r2, ha16(.CPI_X_0)
  51         lfd f0, lo16(.CPI_X_0)(r2)
  52         lis r2, ha16(.CPI_X_1)
  53         lfd f2, lo16(.CPI_X_1)(r2)
  54         fmadd f0, f1, f0, f2
  55         lis r2, ha16(.CPI_X_2)
  56         lfd f1, lo16(.CPI_X_2)(r2)
  57         lis r2, ha16(.CPI_X_3)
  58         lfd f2, lo16(.CPI_X_3)(r2)
  59         fmadd f1, f0, f1, f2
  60         blr
  61
  62 It would be better to materialize .CPI_X into a register, then use immediates
  63 off of the register to avoid the lis's.  This is even more important in PIC
  64 mode.
  65
  66 Note that this (and the static variable version) is discussed here for GCC:
  67 http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
  68
  69 ===-------------------------------------------------------------------------===
  70
  71 PIC Code Gen IPO optimization:
  72
  73 Squish small scalar globals together into a single global struct, allowing the
  74 address of the struct to be CSE'd, avoiding PIC accesses (also reduces the size
  75 of the GOT on targets with one).
  76
  77 Note that this is discussed here for GCC:
  78 http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
  79
  80 ===-------------------------------------------------------------------------===
  81
  82 Implement Newton-Rhapson method for improving estimate instructions to the
  83 correct accuracy, and implementing divide as multiply by reciprocal when it has
  84 more than one use.  Itanium will want this too.
  85
  86 ===-------------------------------------------------------------------------===
  87
  88 Compile this:
  89
  90 int %f1(int %a, int %b) {
  91         %tmp.1 = and int %a, 15         ; <int> [#uses=1]
  92         %tmp.3 = and int %b, 240                ; <int> [#uses=1]
  93         %tmp.4 = or int %tmp.3, %tmp.1          ; <int> [#uses=1]
  94         ret int %tmp.4
  95 }
  96
  97 without a copy.  We make this currently:
  98
  99 _f1:
 100         rlwinm r2, r4, 0, 24, 27
 101         rlwimi r2, r3, 0, 28, 31
 102         or r3, r2, r2
 103         blr
 104
 105 The two-addr pass or RA needs to learn when it is profitable to commute an
 106 instruction to avoid a copy AFTER the 2-addr instruction.  The 2-addr pass
 107 currently only commutes to avoid inserting a copy BEFORE the two addr instr.
 108
 109 ===-------------------------------------------------------------------------===
 110
 111 Compile offsets from allocas:
 112
 113 int *%test() {
 114         %X = alloca { int, int }
 115         %Y = getelementptr {int,int}* %X, int 0, uint 1
 116         ret int* %Y
 117 }
 118
 119 into a single add, not two:
 120
 121 _test:
 122         addi r2, r1, -8
 123         addi r3, r2, 4
 124         blr
 125
 126 --> important for C++.
 127
 128 ===-------------------------------------------------------------------------===
 129
 130 int test3(int a, int b) { return (a < 0) ? a : 0; }
 131
 132 should be branch free code.  LLVM is turning it into < 1 because of the RHS.
 133
 134 ===-------------------------------------------------------------------------===
 135
 136 No loads or stores of the constants should be needed:
 137
 138 struct foo { double X, Y; };
 139 void xxx(struct foo F);
 140 void bar() { struct foo R = { 1.0, 2.0 }; xxx(R); }
 141
 142 ===-------------------------------------------------------------------------===
 143
 144 Darwin Stub LICM optimization:
 145
 146 Loops like this:
 147
 148   for (...)  bar();
 149
 150 Have to go through an indirect stub if bar is external or linkonce.  It would
 151 be better to compile it as:
 152
 153      fp = &bar;
 154      for (...)  fp();
 155
 156 which only computes the address of bar once (instead of each time through the
 157 stub).  This is Darwin specific and would have to be done in the code generator.
 158 Probably not a win on x86.
 159
 160 ===-------------------------------------------------------------------------===
 161
 162 PowerPC i1/setcc stuff (depends on subreg stuff):
 163
 164 Check out the PPC code we get for 'compare' in this testcase:
 165 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=19672
 166
 167 oof.  on top of not doing the logical crnand instead of (mfcr, mfcr,
 168 invert, invert, or), we then have to compare it against zero instead of
 169 using the value already in a CR!
 170
 171 that should be something like
 172         cmpw cr7, r8, r5
 173         cmpw cr0, r7, r3
 174         crnand cr0, cr0, cr7
 175         bne cr0, LBB_compare_4
 176
 177 instead of
 178         cmpw cr7, r8, r5
 179         cmpw cr0, r7, r3
 180         mfcr r7, 1
 181         mcrf cr7, cr0
 182         mfcr r8, 1
 183         rlwinm r7, r7, 30, 31, 31
 184         rlwinm r8, r8, 30, 31, 31
 185         xori r7, r7, 1
 186         xori r8, r8, 1
 187         addi r2, r2, 1
 188         or r7, r8, r7
 189         cmpwi cr0, r7, 0
 190         bne cr0, LBB_compare_4  ; loopexit
 191
 192 FreeBench/mason has a basic block that looks like this:
 193
 194          %tmp.130 = seteq int %p.0__, 5          ; <bool> [#uses=1]
 195          %tmp.134 = seteq int %p.1__, 6          ; <bool> [#uses=1]
 196          %tmp.139 = seteq int %p.2__, 12         ; <bool> [#uses=1]
 197          %tmp.144 = seteq int %p.3__, 13         ; <bool> [#uses=1]
 198          %tmp.149 = seteq int %p.4__, 14         ; <bool> [#uses=1]
 199          %tmp.154 = seteq int %p.5__, 15         ; <bool> [#uses=1]
 200          %bothcond = and bool %tmp.134, %tmp.130         ; <bool> [#uses=1]
 201          %bothcond123 = and bool %bothcond, %tmp.139             ; <bool>
 202          %bothcond124 = and bool %bothcond123, %tmp.144          ; <bool>
 203          %bothcond125 = and bool %bothcond124, %tmp.149          ; <bool>
 204          %bothcond126 = and bool %bothcond125, %tmp.154          ; <bool>
 205          br bool %bothcond126, label %shortcirc_next.5, label %else.0
 206
 207 This is a particularly important case where handling CRs better will help.
 208
 209 ===-------------------------------------------------------------------------===
 210
 211 Simple IPO for argument passing, change:
 212   void foo(int X, double Y, int Z) -> void foo(int X, int Z, double Y)
 213
 214 the Darwin ABI specifies that any integer arguments in the first 32 bytes worth
 215 of arguments get assigned to r3 through r10. That is, if you have a function
 216 foo(int, double, int) you get r3, f1, r6, since the 64 bit double ate up the
 217 argument bytes for r4 and r5. The trick then would be to shuffle the argument
 218 order for functions we can internalize so that the maximum number of
 219 integers/pointers get passed in regs before you see any of the fp arguments.
 220
 221 Instead of implementing this, it would actually probably be easier to just
 222 implement a PPC fastcc, where we could do whatever we wanted to the CC,
 223 including having this work sanely.
 224
 225 ===-------------------------------------------------------------------------===
 226
 227 Fix Darwin FP-In-Integer Registers ABI
 228
 229 Darwin passes doubles in structures in integer registers, which is very very
 230 bad.  Add something like a BIT_CONVERT to LLVM, then do an i-p transformation
 231 that percolates these things out of functions.
 232
 233 Check out how horrible this is:
 234 http://gcc.gnu.org/ml/gcc/2005-10/msg01036.html
 235
 236 This is an extension of "interprocedural CC unmunging" that can't be done with
 237 just fastcc.
 238
 239 ===-------------------------------------------------------------------------===
 240
 241 Generate lwbrx and other byteswapping load/store instructions when reasonable.
 242
 243 ===-------------------------------------------------------------------------===
 244
 245 Compile this:
 246
 247 int foo(int a) {
 248   int b = (a < 8);
 249   if (b) {
 250     return b * 3;     // ignore the fact that this is always 3.
 251   } else {
 252     return 2;
 253   }
 254 }
 255
 256 into something not this:
 257
 258 _foo:
 259 1)      cmpwi cr7, r3, 8
 260         mfcr r2, 1
 261         rlwinm r2, r2, 29, 31, 31
 262 1)      cmpwi cr0, r3, 7
 263         bgt cr0, LBB1_2 ; UnifiedReturnBlock
 264 LBB1_1: ; then
 265         rlwinm r2, r2, 0, 31, 31
 266         mulli r3, r2, 3
 267         blr
 268 LBB1_2: ; UnifiedReturnBlock
 269         li r3, 2
 270         blr
 271
 272 In particular, the two compares (marked 1) could be shared by reversing one.
 273 This could be done in the dag combiner, by swapping a BR_CC when a SETCC of the
 274 same operands (but backwards) exists.  In this case, this wouldn't save us
 275 anything though, because the compares still wouldn't be shared.
 276
 277 ===-------------------------------------------------------------------------===
 278
 279 The legalizer should lower this:
 280
 281 bool %test(ulong %x) {
 282   %tmp = setlt ulong %x, 4294967296
 283   ret bool %tmp
 284 }
 285
 286 into "if x.high == 0", not:
 287
 288 _test:
 289         addi r2, r3, -1
 290         cntlzw r2, r2
 291         cntlzw r3, r3
 292         srwi r2, r2, 5
 293         srwi r4, r3, 5
 294         li r3, 0
 295         cmpwi cr0, r2, 0
 296         bne cr0, LBB1_2 ;
 297 LBB1_1:
 298         or r3, r4, r4
 299 LBB1_2:
 300         blr
 301
 302 noticed in 2005-05-11-Popcount-ffs-fls.c.
 303
 304
 305 ===-------------------------------------------------------------------------===
 306
 307 We should custom expand setcc instead of pretending that we have it.  That
 308 would allow us to expose the access of the crbit after the mfcr, allowing
 309 that access to be trivially folded into other ops.  A simple example:
 310
 311 int foo(int a, int b) { return (a < b) << 4; }
 312
 313 compiles into:
 314
 315 _foo:
 316         cmpw cr7, r3, r4
 317         mfcr r2, 1
 318         rlwinm r2, r2, 29, 31, 31
 319         slwi r3, r2, 4
 320         blr
 321
 322 ===-------------------------------------------------------------------------===
 323
 324 Fold add and sub with constant into non-extern, non-weak addresses so this:
 325
 326 static int a;
 327 void bar(int b) { a = b; }
 328 void foo(unsigned char *c) {
 329   *c = a;
 330 }
 331
 332 So that
 333
 334 _foo:
 335         lis r2, ha16(_a)
 336         la r2, lo16(_a)(r2)
 337         lbz r2, 3(r2)
 338         stb r2, 0(r3)
 339         blr
 340
 341 Becomes
 342
 343 _foo:
 344         lis r2, ha16(_a+3)
 345         lbz r2, lo16(_a+3)(r2)
 346         stb r2, 0(r3)
 347         blr
 348
 349 ===-------------------------------------------------------------------------===
 350
 351 We generate really bad code for this:
 352
 353 int f(signed char *a, _Bool b, _Bool c) {
 354    signed char t = 0;
 355   if (b)  t = *a;
 356   if (c)  *a = t;
 357 }
 358
 359 ===-------------------------------------------------------------------------===
 360
 361 This:
 362 int test(unsigned *P) { return *P >> 24; }
 363
 364 Should compile to:
 365
 366 _test:
 367         lbz r3,0(r3)
 368         blr
 369
 370 not:
 371
 372 _test:
 373         lwz r2, 0(r3)
 374         srwi r3, r2, 24
 375         blr
 376
 377 ===-------------------------------------------------------------------------===
 378
 379 On the G5, logical CR operations are more expensive in their three
 380 address form: ops that read/write the same register are half as expensive as
 381 those that read from two registers that are different from their destination.
 382
 383 We should model this with two separate instructions.  The isel should generate
 384 the "two address" form of the instructions.  When the register allocator
 385 detects that it needs to insert a copy due to the two-addresness of the CR
 386 logical op, it will invoke PPCInstrInfo::convertToThreeAddress.  At this point
 387 we can convert to the "three address" instruction, to save code space.
 388
 389 This only matters when we start generating cr logical ops.
 390
 391 ===-------------------------------------------------------------------------===
 392
 393 We should compile these two functions to the same thing:
 394
 395 #include <stdlib.h>
 396 void f(int a, int b, int *P) {
 397   *P = (a-b)>=0?(a-b):(b-a);
 398 }
 399 void g(int a, int b, int *P) {
 400   *P = abs(a-b);
 401 }
 402
 403 Further, they should compile to something better than:
 404
 405 _g:
 406         subf r2, r4, r3
 407         subfic r3, r2, 0
 408         cmpwi cr0, r2, -1
 409         bgt cr0, LBB2_2 ; entry
 410 LBB2_1: ; entry
 411         mr r2, r3
 412 LBB2_2: ; entry
 413         stw r2, 0(r5)
 414         blr
 415
 416 GCC produces:
 417
 418 _g:
 419         subf r4,r4,r3
 420         srawi r2,r4,31
 421         xor r0,r2,r4
 422         subf r0,r2,r0
 423         stw r0,0(r5)
 424         blr
 425
 426 ... which is much nicer.
 427
 428 This theoretically may help improve twolf slightly (used in dimbox.c:142?).
 429
 430 ===-------------------------------------------------------------------------===
 431
 432 int foo(int N, int ***W, int **TK, int X) {
 433   int t, i;
 434
 435   for (t = 0; t < N; ++t)
 436     for (i = 0; i < 4; ++i)
 437       W[t / X][i][t % X] = TK[i][t];
 438
 439   return 5;
 440 }
 441
 442 We generate relatively atrocious code for this loop compared to gcc.
 443
 444 We could also strength reduce the rem and the div:
 445 http://www.lcs.mit.edu/pubs/pdf/MIT-LCS-TM-600.pdf
 446
 447 ===-------------------------------------------------------------------------===
 448
 449 float foo(float X) { return (int)(X); }
 450
 451 Currently produces:
 452
 453 _foo:
 454         fctiwz f0, f1
 455         stfd f0, -8(r1)
 456         lwz r2, -4(r1)
 457         extsw r2, r2
 458         std r2, -16(r1)
 459         lfd f0, -16(r1)
 460         fcfid f0, f0
 461         frsp f1, f0
 462         blr
 463
 464 We could use a target dag combine to turn the lwz/extsw into an lwa when the
 465 lwz has a single use.  Since LWA is cracked anyway, this would be a codesize
 466 win only.
 467
 468 ===-------------------------------------------------------------------------===
 469
 470 We generate ugly code for this:
 471
 472 void func(unsigned int *ret, float dx, float dy, float dz, float dw) {
 473   unsigned code = 0;
 474   if(dx < -dw) code |= 1;
 475   if(dx > dw)  code |= 2;
 476   if(dy < -dw) code |= 4;
 477   if(dy > dw)  code |= 8;
 478   if(dz < -dw) code |= 16;
 479   if(dz > dw)  code |= 32;
 480   *ret = code;
 481 }
 482
 483 ===-------------------------------------------------------------------------===
 484
 485 Complete the signed i32 to FP conversion code using 64-bit registers
 486 transformation, good for PI.  See PPCISelLowering.cpp, this comment:
 487
 488      // FIXME: disable this lowered code.  This generates 64-bit register values,
 489      // and we don't model the fact that the top part is clobbered by calls.  We
 490      // need to flag these together so that the value isn't live across a call.
 491      //setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
 492
 493 ===-------------------------------------------------------------------------===
 494
 495 Another missed rlwimi case:
 496
 497 void %foo(uint *%tmp) {
 498         %tmp = load uint* %tmp          ; <uint> [#uses=3]
 499         %tmp1 = shr uint %tmp, ubyte 31         ; <uint> [#uses=1]
 500         %tmp1 = cast uint %tmp1 to ubyte                ; <ubyte> [#uses=1]
 501         %tmp4.mask = shr uint %tmp, ubyte 30            ; <uint> [#uses=1]
 502         %tmp4.mask = cast uint %tmp4.mask to ubyte              ; <ubyte> [#uses=1]
 503         %tmp = or ubyte %tmp4.mask, %tmp1               ; <ubyte> [#uses=1]
 504         %tmp10 = cast ubyte %tmp to uint                ; <uint> [#uses=1]
 505         %tmp11 = shl uint %tmp10, ubyte 31              ; <uint> [#uses=1]
 506         %tmp12 = and uint %tmp, 2147483647              ; <uint> [#uses=1]
 507         %tmp13 = or uint %tmp11, %tmp12         ; <uint> [#uses=1]
 508         store uint %tmp13, uint* %tmp
 509         ret void
 510 }
 511
 512 We emit:
 513
 514 _foo:
 515         lwz r2, 0(r3)
 516         srwi r4, r2, 30
 517         srwi r5, r2, 31
 518         or r4, r4, r5
 519         rlwimi r2, r4, 31, 0, 0
 520         stw r2, 0(r3)
 521         blr
 522
 523 What this code is really doing is ORing bit 0 with bit 1.  We could codegen this
 524 as:
 525
 526 _foo:
 527         lwz r2, 0(r3)
 528         slwi r4, r2, 1
 529         rlwinm r4, r4, 0, 0, 0
 530         or r2, r2, r4
 531         stw r2, 0(r3)
 532         blr