lib/Target/PowerPC/README.txt

   1 TODO:
   2 * gpr0 allocation
   3 * implement do-loop -> bdnz transform
   4 * implement powerpc-64 for darwin
   5
   6 ===-------------------------------------------------------------------------===
   7
   8 Support 'update' load/store instructions.  These are cracked on the G5, but are
   9 still a codesize win.
  10
  11 ===-------------------------------------------------------------------------===
  12
  13 Teach the .td file to pattern match PPC::BR_COND to appropriate bc variant, so
  14 we don't have to always run the branch selector for small functions.
  15
  16 ===-------------------------------------------------------------------------===
  17
  18 * Codegen this:
  19
  20    void test2(int X) {
  21      if (X == 0x12345678) bar();
  22    }
  23
  24     as:
  25
  26        xoris r0,r3,0x1234
  27        cmplwi cr0,r0,0x5678
  28        beq cr0,L6
  29
  30     not:
  31
  32         lis r2, 4660
  33         ori r2, r2, 22136
  34         cmpw cr0, r3, r2
  35         bne .LBB_test2_2
  36
  37 ===-------------------------------------------------------------------------===
  38
  39 Lump the constant pool for each function into ONE pic object, and reference
  40 pieces of it as offsets from the start.  For functions like this (contrived
  41 to have lots of constants obviously):
  42
  43 double X(double Y) { return (Y*1.23 + 4.512)*2.34 + 14.38; }
  44
  45 We generate:
  46
  47 _X:
  48         lis r2, ha16(.CPI_X_0)
  49         lfd f0, lo16(.CPI_X_0)(r2)
  50         lis r2, ha16(.CPI_X_1)
  51         lfd f2, lo16(.CPI_X_1)(r2)
  52         fmadd f0, f1, f0, f2
  53         lis r2, ha16(.CPI_X_2)
  54         lfd f1, lo16(.CPI_X_2)(r2)
  55         lis r2, ha16(.CPI_X_3)
  56         lfd f2, lo16(.CPI_X_3)(r2)
  57         fmadd f1, f0, f1, f2
  58         blr
  59
  60 It would be better to materialize .CPI_X into a register, then use immediates
  61 off of the register to avoid the lis's.  This is even more important in PIC
  62 mode.
  63
  64 Note that this (and the static variable version) is discussed here for GCC:
  65 http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
  66
  67 ===-------------------------------------------------------------------------===
  68
  69 PIC Code Gen IPO optimization:
  70
  71 Squish small scalar globals together into a single global struct, allowing the
  72 address of the struct to be CSE'd, avoiding PIC accesses (also reduces the size
  73 of the GOT on targets with one).
  74
  75 Note that this is discussed here for GCC:
  76 http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
  77
  78 ===-------------------------------------------------------------------------===
  79
  80 Implement Newton-Raphson method for improving estimate instructions to the
  81 correct accuracy, and implement divide as multiply by reciprocal when it has
  82 more than one use.  Itanium will want this too.
  83
  84 ===-------------------------------------------------------------------------===
  85
  86 #define  ARRAY_LENGTH  16
  87
  88 union bitfield {
  89         struct {
  90 #ifndef __ppc__
  91                 unsigned int                       field0 : 6;
  92                 unsigned int                       field1 : 6;
  93                 unsigned int                       field2 : 6;
  94                 unsigned int                       field3 : 6;
  95                 unsigned int                       field4 : 3;
  96                 unsigned int                       field5 : 4;
  97                 unsigned int                       field6 : 1;
  98 #else
  99                 unsigned int                       field6 : 1;
 100                 unsigned int                       field5 : 4;
 101                 unsigned int                       field4 : 3;
 102                 unsigned int                       field3 : 6;
 103                 unsigned int                       field2 : 6;
 104                 unsigned int                       field1 : 6;
 105                 unsigned int                       field0 : 6;
 106 #endif
 107         } bitfields, bits;
 108         unsigned int    u32All;
 109         signed int      i32All;
 110         float   f32All;
 111 };
 112
 113
 114 typedef struct program_t {
 115         union bitfield    array[ARRAY_LENGTH];
 116     int               size;
 117     int               loaded;
 118 } program;
 119
 120
 121 void AdjustBitfields(program* prog, unsigned int fmt1)
 122 {
 123         prog->array[0].bitfields.field0 = fmt1;
 124         prog->array[0].bitfields.field1 = fmt1 + 1;
 125 }
 126
 127 We currently generate:
 128
 129 _AdjustBitfields:
 130         lwz r2, 0(r3)
 131         addi r5, r4, 1
 132         rlwinm r2, r2, 0, 0, 19
 133         rlwinm r5, r5, 6, 20, 25
 134         rlwimi r2, r4, 0, 26, 31
 135         or r2, r2, r5
 136         stw r2, 0(r3)
 137         blr
 138
 139 We should teach someone that or (rlwimi, rlwinm) with disjoint masks can be
 140 turned into rlwimi (rlwimi).
 141
 142 The better codegen would be:
 143
 144 _AdjustBitfields:
 145         lwz r0,0(r3)
 146         rlwinm r4,r4,0,0xff
 147         rlwimi r0,r4,0,26,31
 148         addi r4,r4,1
 149         rlwimi r0,r4,6,20,25
 150         stw r0,0(r3)
 151         blr
 152
 153 ===-------------------------------------------------------------------------===
 154
 155 Compile this:
 156
 157 int %f1(int %a, int %b) {
 158         %tmp.1 = and int %a, 15         ; <int> [#uses=1]
 159         %tmp.3 = and int %b, 240                ; <int> [#uses=1]
 160         %tmp.4 = or int %tmp.3, %tmp.1          ; <int> [#uses=1]
 161         ret int %tmp.4
 162 }
 163
 164 without a copy.  We make this currently:
 165
 166 _f1:
 167         rlwinm r2, r4, 0, 24, 27
 168         rlwimi r2, r3, 0, 28, 31
 169         or r3, r2, r2
 170         blr
 171
 172 The two-addr pass or RA needs to learn when it is profitable to commute an
 173 instruction to avoid a copy AFTER the 2-addr instruction.  The 2-addr pass
 174 currently only commutes to avoid inserting a copy BEFORE the two addr instr.
 175
 176 ===-------------------------------------------------------------------------===
 177
 178 Compile offsets from allocas:
 179
 180 int *%test() {
 181         %X = alloca { int, int }
 182         %Y = getelementptr {int,int}* %X, int 0, uint 1
 183         ret int* %Y
 184 }
 185
 186 into a single add, not two:
 187
 188 _test:
 189         addi r2, r1, -8
 190         addi r3, r2, 4
 191         blr
 192
 193 --> important for C++.
 194
 195 ===-------------------------------------------------------------------------===
 196
 197 int test3(int a, int b) { return (a < 0) ? a : 0; }
 198
 199 should be branch free code.  LLVM is turning it into < 1 because of the RHS.
 200
 201 ===-------------------------------------------------------------------------===
 202
 203 No loads or stores of the constants should be needed:
 204
 205 struct foo { double X, Y; };
 206 void xxx(struct foo F);
 207 void bar() { struct foo R = { 1.0, 2.0 }; xxx(R); }
 208
 209 ===-------------------------------------------------------------------------===
 210
 211 Darwin Stub LICM optimization:
 212
 213 Loops like this:
 214
 215   for (...)  bar();
 216
 217 Have to go through an indirect stub if bar is external or linkonce.  It would
 218 be better to compile it as:
 219
 220      fp = &bar;
 221      for (...)  fp();
 222
 223 which only computes the address of bar once (instead of each time through the
 224 stub).  This is Darwin specific and would have to be done in the code generator.
 225 Probably not a win on x86.
 226
 227 ===-------------------------------------------------------------------------===
 228
 229 PowerPC i1/setcc stuff (depends on subreg stuff):
 230
 231 Check out the PPC code we get for 'compare' in this testcase:
 232 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=19672
 233
 234 Oof.  On top of not doing the logical crnand instead of (mfcr, mfcr,
 235 invert, invert, or), we then have to compare it against zero instead of
 236 using the value already in a CR!
 237
 238 that should be something like
 239         cmpw cr7, r8, r5
 240         cmpw cr0, r7, r3
 241         crnand cr0, cr0, cr7
 242         bne cr0, LBB_compare_4
 243
 244 instead of
 245         cmpw cr7, r8, r5
 246         cmpw cr0, r7, r3
 247         mfcr r7, 1
 248         mcrf cr7, cr0
 249         mfcr r8, 1
 250         rlwinm r7, r7, 30, 31, 31
 251         rlwinm r8, r8, 30, 31, 31
 252         xori r7, r7, 1
 253         xori r8, r8, 1
 254         addi r2, r2, 1
 255         or r7, r8, r7
 256         cmpwi cr0, r7, 0
 257         bne cr0, LBB_compare_4  ; loopexit
 258
 259 FreeBench/mason has a basic block that looks like this:
 260
 261          %tmp.130 = seteq int %p.0__, 5          ; <bool> [#uses=1]
 262          %tmp.134 = seteq int %p.1__, 6          ; <bool> [#uses=1]
 263          %tmp.139 = seteq int %p.2__, 12         ; <bool> [#uses=1]
 264          %tmp.144 = seteq int %p.3__, 13         ; <bool> [#uses=1]
 265          %tmp.149 = seteq int %p.4__, 14         ; <bool> [#uses=1]
 266          %tmp.154 = seteq int %p.5__, 15         ; <bool> [#uses=1]
 267          %bothcond = and bool %tmp.134, %tmp.130         ; <bool> [#uses=1]
 268          %bothcond123 = and bool %bothcond, %tmp.139             ; <bool>
 269          %bothcond124 = and bool %bothcond123, %tmp.144          ; <bool>
 270          %bothcond125 = and bool %bothcond124, %tmp.149          ; <bool>
 271          %bothcond126 = and bool %bothcond125, %tmp.154          ; <bool>
 272          br bool %bothcond126, label %shortcirc_next.5, label %else.0
 273
 274 This is a particularly important case where handling CRs better will help.
 275
 276 ===-------------------------------------------------------------------------===
 277
 278 Simple IPO for argument passing, change:
 279   void foo(int X, double Y, int Z) -> void foo(int X, int Z, double Y)
 280
 281 The Darwin ABI specifies that any integer arguments in the first 32 bytes worth
 282 of arguments get assigned to r3 through r10. That is, if you have a function
 283 foo(int, double, int) you get r3, f1, r6, since the 64 bit double ate up the
 284 argument bytes for r4 and r5. The trick then would be to shuffle the argument
 285 order for functions we can internalize so that the maximum number of
 286 integers/pointers get passed in regs before you see any of the fp arguments.
 287
 288 Instead of implementing this, it would actually probably be easier to just
 289 implement a PPC fastcc, where we could do whatever we wanted to the CC,
 290 including having this work sanely.
 291
 292 ===-------------------------------------------------------------------------===
 293
 294 Fix Darwin FP-In-Integer Registers ABI
 295
 296 Darwin passes doubles in structures in integer registers, which is very very
 297 bad.  Add something like a BIT_CONVERT to LLVM, then do an interprocedural
 298 transformation that percolates these things out of functions.
 299
 300 Check out how horrible this is:
 301 http://gcc.gnu.org/ml/gcc/2005-10/msg01036.html
 302
 303 This is an extension of "interprocedural CC unmunging" that can't be done with
 304 just fastcc.
 305
 306 ===-------------------------------------------------------------------------===
 307
 308 Generate lwbrx and other byteswapping load/store instructions when reasonable.
 309
 310 ===-------------------------------------------------------------------------===
 311
 312 Implement TargetConstantVec, and set up PPC to custom lower ConstantVec into
 313 TargetConstantVec's if it's one of the many forms that are algorithmically
 314 computable using the spiffy altivec instructions.
 315
 316 ===-------------------------------------------------------------------------===
 317
 318 Compile this:
 319
 320 int foo(int a) {
 321   int b = (a < 8);
 322   if (b) {
 323     return b * 3;     // ignore the fact that this is always 3.
 324   } else {
 325     return 2;
 326   }
 327 }
 328
 329 into something not this:
 330
 331 _foo:
 332 1)      cmpwi cr7, r3, 8
 333         mfcr r2, 1
 334         rlwinm r2, r2, 29, 31, 31
 335 1)      cmpwi cr0, r3, 7
 336         bgt cr0, LBB1_2 ; UnifiedReturnBlock
 337 LBB1_1: ; then
 338         rlwinm r2, r2, 0, 31, 31
 339         mulli r3, r2, 3
 340         blr
 341 LBB1_2: ; UnifiedReturnBlock
 342         li r3, 2
 343         blr
 344
 345 In particular, the two compares (marked 1) could be shared by reversing one.
 346 This could be done in the dag combiner, by swapping a BR_CC when a SETCC of the
 347 same operands (but backwards) exists.  In this case, this wouldn't save us
 348 anything though, because the compares still wouldn't be shared.
 349
 350 ===-------------------------------------------------------------------------===
 351
 352 The legalizer should lower this:
 353
 354 bool %test(ulong %x) {
 355   %tmp = setlt ulong %x, 4294967296
 356   ret bool %tmp
 357 }
 358
 359 into "if x.high == 0", not:
 360
 361 _test:
 362         addi r2, r3, -1
 363         cntlzw r2, r2
 364         cntlzw r3, r3
 365         srwi r2, r2, 5
 366         srwi r4, r3, 5
 367         li r3, 0
 368         cmpwi cr0, r2, 0
 369         bne cr0, LBB1_2 ;
 370 LBB1_1:
 371         or r3, r4, r4
 372 LBB1_2:
 373         blr
 374
 375 noticed in 2005-05-11-Popcount-ffs-fls.c.
 376
 377
 378 ===-------------------------------------------------------------------------===
 379
 380 We should custom expand setcc instead of pretending that we have it.  That
 381 would allow us to expose the access of the crbit after the mfcr, allowing
 382 that access to be trivially folded into other ops.  A simple example:
 383
 384 int foo(int a, int b) { return (a < b) << 4; }
 385
 386 compiles into:
 387
 388 _foo:
 389         cmpw cr7, r3, r4
 390         mfcr r2, 1
 391         rlwinm r2, r2, 29, 31, 31
 392         slwi r3, r2, 4
 393         blr
 394
 395 ===-------------------------------------------------------------------------===
 396
 397 Fold add and sub with constant into non-extern, non-weak addresses so this:
 398
 399 static int a;
 400 void bar(int b) { a = b; }
 401 void foo(unsigned char *c) {
 402   *c = a;
 403 }
 404
 405 So that
 406
 407 _foo:
 408         lis r2, ha16(_a)
 409         la r2, lo16(_a)(r2)
 410         lbz r2, 3(r2)
 411         stb r2, 0(r3)
 412         blr
 413
 414 Becomes
 415
 416 _foo:
 417         lis r2, ha16(_a+3)
 418         lbz r2, lo16(_a+3)(r2)
 419         stb r2, 0(r3)
 420         blr
 421
 422 ===-------------------------------------------------------------------------===
 423
 424 We generate really bad code for this:
 425
 426 int f(signed char *a, _Bool b, _Bool c) {
 427    signed char t = 0;
 428   if (b)  t = *a;
 429   if (c)  *a = t;
 430 }
 431
 432 ===-------------------------------------------------------------------------===
 433
 434 This:
 435 int test(unsigned *P) { return *P >> 24; }
 436
 437 Should compile to:
 438
 439 _test:
 440         lbz r3,0(r3)
 441         blr
 442
 443 not:
 444
 445 _test:
 446         lwz r2, 0(r3)
 447         srwi r3, r2, 24
 448         blr
 449
 450 ===-------------------------------------------------------------------------===
 451
 452 On the G5, logical CR operations are more expensive in their three
 453 address form: ops that read/write the same register are half as expensive as
 454 those that read from two registers that are different from their destination.
 455
 456 We should model this with two separate instructions.  The isel should generate
 457 the "two address" form of the instructions.  When the register allocator
 458 detects that it needs to insert a copy due to the two-addressness of the CR
 459 logical op, it will invoke PPCInstrInfo::convertToThreeAddress.  At this point
 460 we can convert to the "three address" instruction, to save code space.
 461
 462 This only matters when we start generating cr logical ops.
 463
 464 ===-------------------------------------------------------------------------===
 465
 466 We should compile these two functions to the same thing:
 467
 468 #include <stdlib.h>
 469 void f(int a, int b, int *P) {
 470   *P = (a-b)>=0?(a-b):(b-a);
 471 }
 472 void g(int a, int b, int *P) {
 473   *P = abs(a-b);
 474 }
 475
 476 Further, they should compile to something better than:
 477
 478 _g:
 479         subf r2, r4, r3
 480         subfic r3, r2, 0
 481         cmpwi cr0, r2, -1
 482         bgt cr0, LBB2_2 ; entry
 483 LBB2_1: ; entry
 484         mr r2, r3
 485 LBB2_2: ; entry
 486         stw r2, 0(r5)
 487         blr
 488
 489 GCC produces:
 490
 491 _g:
 492         subf r4,r4,r3
 493         srawi r2,r4,31
 494         xor r0,r2,r4
 495         subf r0,r2,r0
 496         stw r0,0(r5)
 497         blr
 498
 499 ... which is much nicer.
 500
 501 This theoretically may help improve twolf slightly (used in dimbox.c:142?).
 502
 503 ===-------------------------------------------------------------------------===
 504
 505 Implement PPCInstrInfo::isLoadFromStackSlot/isStoreToStackSlot for vector
 506 registers, to generate better spill code.
 507
 508 ===-------------------------------------------------------------------------===
 509
 510 int foo(int N, int ***W, int **TK, int X) {
 511   int t, i;
 512
 513   for (t = 0; t < N; ++t)
 514     for (i = 0; i < 4; ++i)
 515       W[t / X][i][t % X] = TK[i][t];
 516
 517   return 5;
 518 }
 519
 520 We generate relatively atrocious code for this loop compared to gcc.
 521
 522 We could also strength reduce the rem and the div:
 523 http://www.lcs.mit.edu/pubs/pdf/MIT-LCS-TM-600.pdf
 524
 525 ===-------------------------------------------------------------------------===
 526
 527 Altivec support.  The first should be a single lvx from the constant pool, the
 528 second should be an xor/stvx:
 529
 530 void foo(void) {
 531   int x[8] __attribute__((aligned(128))) = { 1, 1, 1, 1, 1, 1, 1, 1 };
 532   bar (x);
 533 }
 534
 535 #include <string.h>
 536 void foo(void) {
 537   int x[8] __attribute__((aligned(128)));
 538   memset (x, 0, sizeof (x));
 539   bar (x);
 540 }
 541
 542 ===-------------------------------------------------------------------------===
 543
 544 Altivec: Codegen'ing MUL with vector FMADD should add -0.0, not 0.0:
 545 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=8763
 546
 547 We need to codegen -0.0 vector efficiently (no constant pool load).
 548
 549 When -ffast-math is on, we can use 0.0.
 550
 551 ===-------------------------------------------------------------------------===