lib/Target/PowerPC/README.txt

   1 //===- README.txt - Notes for improving PowerPC-specific code gen ---------===//
   2
   3 TODO:
   4 * gpr0 allocation
   5 * implement do-loop -> bdnz transform
   6
   7 ===-------------------------------------------------------------------------===
   8
   9 We only produce the rlwnm instruction for rotate instructions.  We should
  10 at least match stuff like:
  11
  12 unsigned rot_and(unsigned X, int Y) {
  13   unsigned T = (X << Y) | (X >> (32-Y));
  14   T &= 127;
  15   return T;
  16 }
  17
  18 _foo3:
  19         rlwnm r2, r3, r4, 0, 31
  20         rlwinm r3, r2, 0, 25, 31
  21         blr
  22
  23 ... which is the basic pattern that should be written in the instr.  It may
  24 also be useful for stuff like:
  25
  26 long long foo2(long long X, int C) {
  27   return X << (C&~32);
  28 }
  29
  30 which currently produces:
  31
  32 _foo2:
  33         rlwinm r2, r5, 0, 27, 25
  34         subfic r5, r2, 32
  35         slw r3, r3, r2
  36         srw r5, r4, r5
  37         or r3, r3, r5
  38         slw r4, r4, r2
  39         blr
  40
  41 ===-------------------------------------------------------------------------===
  42
  43 Support 'update' load/store instructions.  These are cracked on the G5, but are
  44 still a codesize win.
  45
  46 ===-------------------------------------------------------------------------===
  47
  48 Teach the .td file to pattern match PPC::BR_COND to appropriate bc variant, so
  49 we don't have to always run the branch selector for small functions.
  50
  51 ===-------------------------------------------------------------------------===
  52
  53 Lump the constant pool for each function into ONE pic object, and reference
  54 pieces of it as offsets from the start.  For functions like this (contrived
  55 to have lots of constants obviously):
  56
  57 double X(double Y) { return (Y*1.23 + 4.512)*2.34 + 14.38; }
  58
  59 We generate:
  60
  61 _X:
  62         lis r2, ha16(.CPI_X_0)
  63         lfd f0, lo16(.CPI_X_0)(r2)
  64         lis r2, ha16(.CPI_X_1)
  65         lfd f2, lo16(.CPI_X_1)(r2)
  66         fmadd f0, f1, f0, f2
  67         lis r2, ha16(.CPI_X_2)
  68         lfd f1, lo16(.CPI_X_2)(r2)
  69         lis r2, ha16(.CPI_X_3)
  70         lfd f2, lo16(.CPI_X_3)(r2)
  71         fmadd f1, f0, f1, f2
  72         blr
  73
  74 It would be better to materialize .CPI_X into a register, then use immediates
  75 off of the register to avoid the lis's.  This is even more important in PIC
  76 mode.
  77
  78 Note that this (and the static variable version) is discussed here for GCC:
  79 http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
  80
  81 ===-------------------------------------------------------------------------===
  82
  83 PIC Code Gen IPO optimization:
  84
  85 Squish small scalar globals together into a single global struct, allowing the
  86 address of the struct to be CSE'd, avoiding PIC accesses (also reduces the size
  87 of the GOT on targets with one).
  88
  89 Note that this is discussed here for GCC:
  90 http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
  91
  92 ===-------------------------------------------------------------------------===
  93
  94 Implement Newton-Rhapson method for improving estimate instructions to the
  95 correct accuracy, and implementing divide as multiply by reciprocal when it has
  96 more than one use.  Itanium will want this too.
  97
  98 ===-------------------------------------------------------------------------===
  99
 100 Compile this:
 101
 102 int %f1(int %a, int %b) {
 103         %tmp.1 = and int %a, 15         ; <int> [#uses=1]
 104         %tmp.3 = and int %b, 240                ; <int> [#uses=1]
 105         %tmp.4 = or int %tmp.3, %tmp.1          ; <int> [#uses=1]
 106         ret int %tmp.4
 107 }
 108
 109 without a copy.  We make this currently:
 110
 111 _f1:
 112         rlwinm r2, r4, 0, 24, 27
 113         rlwimi r2, r3, 0, 28, 31
 114         or r3, r2, r2
 115         blr
 116
 117 The two-addr pass or RA needs to learn when it is profitable to commute an
 118 instruction to avoid a copy AFTER the 2-addr instruction.  The 2-addr pass
 119 currently only commutes to avoid inserting a copy BEFORE the two addr instr.
 120
 121 ===-------------------------------------------------------------------------===
 122
 123 Compile offsets from allocas:
 124
 125 int *%test() {
 126         %X = alloca { int, int }
 127         %Y = getelementptr {int,int}* %X, int 0, uint 1
 128         ret int* %Y
 129 }
 130
 131 into a single add, not two:
 132
 133 _test:
 134         addi r2, r1, -8
 135         addi r3, r2, 4
 136         blr
 137
 138 --> important for C++.
 139
 140 ===-------------------------------------------------------------------------===
 141
 142 No loads or stores of the constants should be needed:
 143
 144 struct foo { double X, Y; };
 145 void xxx(struct foo F);
 146 void bar() { struct foo R = { 1.0, 2.0 }; xxx(R); }
 147
 148 ===-------------------------------------------------------------------------===
 149
 150 Darwin Stub LICM optimization:
 151
 152 Loops like this:
 153
 154   for (...)  bar();
 155
 156 Have to go through an indirect stub if bar is external or linkonce.  It would
 157 be better to compile it as:
 158
 159      fp = &bar;
 160      for (...)  fp();
 161
 162 which only computes the address of bar once (instead of each time through the
 163 stub).  This is Darwin specific and would have to be done in the code generator.
 164 Probably not a win on x86.
 165
 166 ===-------------------------------------------------------------------------===
 167
 168 PowerPC i1/setcc stuff (depends on subreg stuff):
 169
 170 Check out the PPC code we get for 'compare' in this testcase:
 171 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=19672
 172
 173 oof.  on top of not doing the logical crnand instead of (mfcr, mfcr,
 174 invert, invert, or), we then have to compare it against zero instead of
 175 using the value already in a CR!
 176
 177 that should be something like
 178         cmpw cr7, r8, r5
 179         cmpw cr0, r7, r3
 180         crnand cr0, cr0, cr7
 181         bne cr0, LBB_compare_4
 182
 183 instead of
 184         cmpw cr7, r8, r5
 185         cmpw cr0, r7, r3
 186         mfcr r7, 1
 187         mcrf cr7, cr0
 188         mfcr r8, 1
 189         rlwinm r7, r7, 30, 31, 31
 190         rlwinm r8, r8, 30, 31, 31
 191         xori r7, r7, 1
 192         xori r8, r8, 1
 193         addi r2, r2, 1
 194         or r7, r8, r7
 195         cmpwi cr0, r7, 0
 196         bne cr0, LBB_compare_4  ; loopexit
 197
 198 FreeBench/mason has a basic block that looks like this:
 199
 200          %tmp.130 = seteq int %p.0__, 5          ; <bool> [#uses=1]
 201          %tmp.134 = seteq int %p.1__, 6          ; <bool> [#uses=1]
 202          %tmp.139 = seteq int %p.2__, 12         ; <bool> [#uses=1]
 203          %tmp.144 = seteq int %p.3__, 13         ; <bool> [#uses=1]
 204          %tmp.149 = seteq int %p.4__, 14         ; <bool> [#uses=1]
 205          %tmp.154 = seteq int %p.5__, 15         ; <bool> [#uses=1]
 206          %bothcond = and bool %tmp.134, %tmp.130         ; <bool> [#uses=1]
 207          %bothcond123 = and bool %bothcond, %tmp.139             ; <bool>
 208          %bothcond124 = and bool %bothcond123, %tmp.144          ; <bool>
 209          %bothcond125 = and bool %bothcond124, %tmp.149          ; <bool>
 210          %bothcond126 = and bool %bothcond125, %tmp.154          ; <bool>
 211          br bool %bothcond126, label %shortcirc_next.5, label %else.0
 212
 213 This is a particularly important case where handling CRs better will help.
 214
 215 ===-------------------------------------------------------------------------===
 216
 217 Simple IPO for argument passing, change:
 218   void foo(int X, double Y, int Z) -> void foo(int X, int Z, double Y)
 219
 220 the Darwin ABI specifies that any integer arguments in the first 32 bytes worth
 221 of arguments get assigned to r3 through r10. That is, if you have a function
 222 foo(int, double, int) you get r3, f1, r6, since the 64 bit double ate up the
 223 argument bytes for r4 and r5. The trick then would be to shuffle the argument
 224 order for functions we can internalize so that the maximum number of
 225 integers/pointers get passed in regs before you see any of the fp arguments.
 226
 227 Instead of implementing this, it would actually probably be easier to just
 228 implement a PPC fastcc, where we could do whatever we wanted to the CC,
 229 including having this work sanely.
 230
 231 ===-------------------------------------------------------------------------===
 232
 233 Fix Darwin FP-In-Integer Registers ABI
 234
 235 Darwin passes doubles in structures in integer registers, which is very very
 236 bad.  Add something like a BIT_CONVERT to LLVM, then do an i-p transformation
 237 that percolates these things out of functions.
 238
 239 Check out how horrible this is:
 240 http://gcc.gnu.org/ml/gcc/2005-10/msg01036.html
 241
 242 This is an extension of "interprocedural CC unmunging" that can't be done with
 243 just fastcc.
 244
 245 ===-------------------------------------------------------------------------===
 246
 247 Compile this:
 248
 249 int foo(int a) {
 250   int b = (a < 8);
 251   if (b) {
 252     return b * 3;     // ignore the fact that this is always 3.
 253   } else {
 254     return 2;
 255   }
 256 }
 257
 258 into something not this:
 259
 260 _foo:
 261 1)      cmpwi cr7, r3, 8
 262         mfcr r2, 1
 263         rlwinm r2, r2, 29, 31, 31
 264 1)      cmpwi cr0, r3, 7
 265         bgt cr0, LBB1_2 ; UnifiedReturnBlock
 266 LBB1_1: ; then
 267         rlwinm r2, r2, 0, 31, 31
 268         mulli r3, r2, 3
 269         blr
 270 LBB1_2: ; UnifiedReturnBlock
 271         li r3, 2
 272         blr
 273
 274 In particular, the two compares (marked 1) could be shared by reversing one.
 275 This could be done in the dag combiner, by swapping a BR_CC when a SETCC of the
 276 same operands (but backwards) exists.  In this case, this wouldn't save us
 277 anything though, because the compares still wouldn't be shared.
 278
 279 ===-------------------------------------------------------------------------===
 280
 281 The legalizer should lower this:
 282
 283 bool %test(ulong %x) {
 284   %tmp = setlt ulong %x, 4294967296
 285   ret bool %tmp
 286 }
 287
 288 into "if x.high == 0", not:
 289
 290 _test:
 291         addi r2, r3, -1
 292         cntlzw r2, r2
 293         cntlzw r3, r3
 294         srwi r2, r2, 5
 295         srwi r4, r3, 5
 296         li r3, 0
 297         cmpwi cr0, r2, 0
 298         bne cr0, LBB1_2 ;
 299 LBB1_1:
 300         or r3, r4, r4
 301 LBB1_2:
 302         blr
 303
 304 noticed in 2005-05-11-Popcount-ffs-fls.c.
 305
 306
 307 ===-------------------------------------------------------------------------===
 308
 309 We should custom expand setcc instead of pretending that we have it.  That
 310 would allow us to expose the access of the crbit after the mfcr, allowing
 311 that access to be trivially folded into other ops.  A simple example:
 312
 313 int foo(int a, int b) { return (a < b) << 4; }
 314
 315 compiles into:
 316
 317 _foo:
 318         cmpw cr7, r3, r4
 319         mfcr r2, 1
 320         rlwinm r2, r2, 29, 31, 31
 321         slwi r3, r2, 4
 322         blr
 323
 324 ===-------------------------------------------------------------------------===
 325
 326 Fold add and sub with constant into non-extern, non-weak addresses so this:
 327
 328 static int a;
 329 void bar(int b) { a = b; }
 330 void foo(unsigned char *c) {
 331   *c = a;
 332 }
 333
 334 So that
 335
 336 _foo:
 337         lis r2, ha16(_a)
 338         la r2, lo16(_a)(r2)
 339         lbz r2, 3(r2)
 340         stb r2, 0(r3)
 341         blr
 342
 343 Becomes
 344
 345 _foo:
 346         lis r2, ha16(_a+3)
 347         lbz r2, lo16(_a+3)(r2)
 348         stb r2, 0(r3)
 349         blr
 350
 351 ===-------------------------------------------------------------------------===
 352
 353 We generate really bad code for this:
 354
 355 int f(signed char *a, _Bool b, _Bool c) {
 356    signed char t = 0;
 357   if (b)  t = *a;
 358   if (c)  *a = t;
 359 }
 360
 361 ===-------------------------------------------------------------------------===
 362
 363 This:
 364 int test(unsigned *P) { return *P >> 24; }
 365
 366 Should compile to:
 367
 368 _test:
 369         lbz r3,0(r3)
 370         blr
 371
 372 not:
 373
 374 _test:
 375         lwz r2, 0(r3)
 376         srwi r3, r2, 24
 377         blr
 378
 379 ===-------------------------------------------------------------------------===
 380
 381 On the G5, logical CR operations are more expensive in their three
 382 address form: ops that read/write the same register are half as expensive as
 383 those that read from two registers that are different from their destination.
 384
 385 We should model this with two separate instructions.  The isel should generate
 386 the "two address" form of the instructions.  When the register allocator
 387 detects that it needs to insert a copy due to the two-addresness of the CR
 388 logical op, it will invoke PPCInstrInfo::convertToThreeAddress.  At this point
 389 we can convert to the "three address" instruction, to save code space.
 390
 391 This only matters when we start generating cr logical ops.
 392
 393 ===-------------------------------------------------------------------------===
 394
 395 We should compile these two functions to the same thing:
 396
 397 #include <stdlib.h>
 398 void f(int a, int b, int *P) {
 399   *P = (a-b)>=0?(a-b):(b-a);
 400 }
 401 void g(int a, int b, int *P) {
 402   *P = abs(a-b);
 403 }
 404
 405 Further, they should compile to something better than:
 406
 407 _g:
 408         subf r2, r4, r3
 409         subfic r3, r2, 0
 410         cmpwi cr0, r2, -1
 411         bgt cr0, LBB2_2 ; entry
 412 LBB2_1: ; entry
 413         mr r2, r3
 414 LBB2_2: ; entry
 415         stw r2, 0(r5)
 416         blr
 417
 418 GCC produces:
 419
 420 _g:
 421         subf r4,r4,r3
 422         srawi r2,r4,31
 423         xor r0,r2,r4
 424         subf r0,r2,r0
 425         stw r0,0(r5)
 426         blr
 427
 428 ... which is much nicer.
 429
 430 This theoretically may help improve twolf slightly (used in dimbox.c:142?).
 431
 432 ===-------------------------------------------------------------------------===
 433
 434 int foo(int N, int ***W, int **TK, int X) {
 435   int t, i;
 436
 437   for (t = 0; t < N; ++t)
 438     for (i = 0; i < 4; ++i)
 439       W[t / X][i][t % X] = TK[i][t];
 440
 441   return 5;
 442 }
 443
 444 We generate relatively atrocious code for this loop compared to gcc.
 445
 446 We could also strength reduce the rem and the div:
 447 http://www.lcs.mit.edu/pubs/pdf/MIT-LCS-TM-600.pdf
 448
 449 ===-------------------------------------------------------------------------===
 450
 451 float foo(float X) { return (int)(X); }
 452
 453 Currently produces:
 454
 455 _foo:
 456         fctiwz f0, f1
 457         stfd f0, -8(r1)
 458         lwz r2, -4(r1)
 459         extsw r2, r2
 460         std r2, -16(r1)
 461         lfd f0, -16(r1)
 462         fcfid f0, f0
 463         frsp f1, f0
 464         blr
 465
 466 We could use a target dag combine to turn the lwz/extsw into an lwa when the
 467 lwz has a single use.  Since LWA is cracked anyway, this would be a codesize
 468 win only.
 469
 470 ===-------------------------------------------------------------------------===
 471
 472 We generate ugly code for this:
 473
 474 void func(unsigned int *ret, float dx, float dy, float dz, float dw) {
 475   unsigned code = 0;
 476   if(dx < -dw) code |= 1;
 477   if(dx > dw)  code |= 2;
 478   if(dy < -dw) code |= 4;
 479   if(dy > dw)  code |= 8;
 480   if(dz < -dw) code |= 16;
 481   if(dz > dw)  code |= 32;
 482   *ret = code;
 483 }
 484
 485 ===-------------------------------------------------------------------------===
 486
 487 Complete the signed i32 to FP conversion code using 64-bit registers
 488 transformation, good for PI.  See PPCISelLowering.cpp, this comment:
 489
 490      // FIXME: disable this lowered code.  This generates 64-bit register values,
 491      // and we don't model the fact that the top part is clobbered by calls.  We
 492      // need to flag these together so that the value isn't live across a call.
 493      //setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
 494
 495 Also, if the registers are spilled to the stack, we have to ensure that all
 496 64-bits of them are save/restored, otherwise we will miscompile the code.  It
 497 sounds like we need to get the 64-bit register classes going.
 498
 499 ===-------------------------------------------------------------------------===
 500
 501 %struct.B = type { ubyte, [3 x ubyte] }
 502
 503 void %foo(%struct.B* %b) {
 504 entry:
 505         %tmp = cast %struct.B* %b to uint*              ; <uint*> [#uses=1]
 506         %tmp = load uint* %tmp          ; <uint> [#uses=1]
 507         %tmp3 = cast %struct.B* %b to uint*             ; <uint*> [#uses=1]
 508         %tmp4 = load uint* %tmp3                ; <uint> [#uses=1]
 509         %tmp8 = cast %struct.B* %b to uint*             ; <uint*> [#uses=2]
 510         %tmp9 = load uint* %tmp8                ; <uint> [#uses=1]
 511         %tmp4.mask17 = shl uint %tmp4, ubyte 1          ; <uint> [#uses=1]
 512         %tmp1415 = and uint %tmp4.mask17, 2147483648            ; <uint> [#uses=1]
 513         %tmp.masked = and uint %tmp, 2147483648         ; <uint> [#uses=1]
 514         %tmp11 = or uint %tmp1415, %tmp.masked          ; <uint> [#uses=1]
 515         %tmp12 = and uint %tmp9, 2147483647             ; <uint> [#uses=1]
 516         %tmp13 = or uint %tmp12, %tmp11         ; <uint> [#uses=1]
 517         store uint %tmp13, uint* %tmp8
 518         ret void
 519 }
 520
 521 We emit:
 522
 523 _foo:
 524         lwz r2, 0(r3)
 525         slwi r4, r2, 1
 526         or r4, r4, r2
 527         rlwimi r2, r4, 0, 0, 0
 528         stw r2, 0(r3)
 529         blr
 530
 531 We could collapse a bunch of those ORs and ANDs and generate the following
 532 equivalent code:
 533
 534 _foo:
 535         lwz r2, 0(r3)
 536         rlwinm r4, r2, 1, 0, 0
 537         or r2, r2, r4
 538         stw r2, 0(r3)
 539         blr
 540
 541 ===-------------------------------------------------------------------------===
 542
 543 We compile:
 544
 545 unsigned test6(unsigned x) {
 546   return ((x & 0x00FF0000) >> 16) | ((x & 0x000000FF) << 16);
 547 }
 548
 549 into:
 550
 551 _test6:
 552         lis r2, 255
 553         rlwinm r3, r3, 16, 0, 31
 554         ori r2, r2, 255
 555         and r3, r3, r2
 556         blr
 557
 558 GCC gets it down to:
 559
 560 _test6:
 561         rlwinm r0,r3,16,8,15
 562         rlwinm r3,r3,16,24,31
 563         or r3,r3,r0
 564         blr
 565