lib/Target/PowerPC/README.txt

   1 TODO:
   2 * gpr0 allocation
   3 * implement do-loop -> bdnz transform
   4 * implement powerpc-64 for darwin
   5
   6 ===-------------------------------------------------------------------------===
   7
   8 Support 'update' load/store instructions.  These are cracked on the G5, but are
   9 still a codesize win.
  10
  11 ===-------------------------------------------------------------------------===
  12
  13 Should hint to the branch select pass that it doesn't need to print the second
  14 unconditional branch, so we don't end up with things like:
  15         b .LBBl42__2E_expand_function_8_674     ; loopentry.24
  16         b .LBBl42__2E_expand_function_8_42      ; NewDefault
  17         b .LBBl42__2E_expand_function_8_42      ; NewDefault
  18
  19 This occurs in SPASS.
  20
  21 ===-------------------------------------------------------------------------===
  22
  23 * Codegen this:
  24
  25    void test2(int X) {
  26      if (X == 0x12345678) bar();
  27    }
  28
  29     as:
  30
  31        xoris r0,r3,0x1234
  32        cmplwi cr0,r0,0x5678
  33        beq cr0,L6
  34
  35     not:
  36
  37         lis r2, 4660
  38         ori r2, r2, 22136
  39         cmpw cr0, r3, r2
  40         bne .LBB_test2_2
  41
  42 ===-------------------------------------------------------------------------===
  43
  44 Lump the constant pool for each function into ONE pic object, and reference
  45 pieces of it as offsets from the start.  For functions like this (contrived
  46 to have lots of constants obviously):
  47
  48 double X(double Y) { return (Y*1.23 + 4.512)*2.34 + 14.38; }
  49
  50 We generate:
  51
  52 _X:
  53         lis r2, ha16(.CPI_X_0)
  54         lfd f0, lo16(.CPI_X_0)(r2)
  55         lis r2, ha16(.CPI_X_1)
  56         lfd f2, lo16(.CPI_X_1)(r2)
  57         fmadd f0, f1, f0, f2
  58         lis r2, ha16(.CPI_X_2)
  59         lfd f1, lo16(.CPI_X_2)(r2)
  60         lis r2, ha16(.CPI_X_3)
  61         lfd f2, lo16(.CPI_X_3)(r2)
  62         fmadd f1, f0, f1, f2
  63         blr
  64
  65 It would be better to materialize .CPI_X into a register, then use immediates
  66 off of the register to avoid the lis's.  This is even more important in PIC
  67 mode.
  68
  69 Note that this (and the static variable version) is discussed here for GCC:
  70 http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
  71
  72 ===-------------------------------------------------------------------------===
  73
  74 PIC Code Gen IPO optimization:
  75
  76 Squish small scalar globals together into a single global struct, allowing the
  77 address of the struct to be CSE'd, avoiding PIC accesses (also reduces the size
  78 of the GOT on targets with one).
  79
  80 Note that this is discussed here for GCC:
  81 http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
  82
  83 ===-------------------------------------------------------------------------===
  84
  85 Implement Newton-Rhapson method for improving estimate instructions to the
  86 correct accuracy, and implementing divide as multiply by reciprocal when it has
  87 more than one use.  Itanium will want this too.
  88
  89 ===-------------------------------------------------------------------------===
  90
  91 #define  ARRAY_LENGTH  16
  92
  93 union bitfield {
  94         struct {
  95 #ifndef __ppc__
  96                 unsigned int                       field0 : 6;
  97                 unsigned int                       field1 : 6;
  98                 unsigned int                       field2 : 6;
  99                 unsigned int                       field3 : 6;
 100                 unsigned int                       field4 : 3;
 101                 unsigned int                       field5 : 4;
 102                 unsigned int                       field6 : 1;
 103 #else
 104                 unsigned int                       field6 : 1;
 105                 unsigned int                       field5 : 4;
 106                 unsigned int                       field4 : 3;
 107                 unsigned int                       field3 : 6;
 108                 unsigned int                       field2 : 6;
 109                 unsigned int                       field1 : 6;
 110                 unsigned int                       field0 : 6;
 111 #endif
 112         } bitfields, bits;
 113         unsigned int    u32All;
 114         signed int      i32All;
 115         float   f32All;
 116 };
 117
 118
 119 typedef struct program_t {
 120         union bitfield    array[ARRAY_LENGTH];
 121     int               size;
 122     int               loaded;
 123 } program;
 124
 125
 126 void AdjustBitfields(program* prog, unsigned int fmt1)
 127 {
 128         prog->array[0].bitfields.field0 = fmt1;
 129         prog->array[0].bitfields.field1 = fmt1 + 1;
 130 }
 131
 132 We currently generate:
 133
 134 _AdjustBitfields:
 135         lwz r2, 0(r3)
 136         addi r5, r4, 1
 137         rlwinm r2, r2, 0, 0, 19
 138         rlwinm r5, r5, 6, 20, 25
 139         rlwimi r2, r4, 0, 26, 31
 140         or r2, r2, r5
 141         stw r2, 0(r3)
 142         blr
 143
 144 We should teach someone that or (rlwimi, rlwinm) with disjoint masks can be
 145 turned into rlwimi (rlwimi)
 146
 147 The better codegen would be:
 148
 149 _AdjustBitfields:
 150         lwz r0,0(r3)
 151         rlwinm r4,r4,0,0xff
 152         rlwimi r0,r4,0,26,31
 153         addi r4,r4,1
 154         rlwimi r0,r4,6,20,25
 155         stw r0,0(r3)
 156         blr
 157
 158 ===-------------------------------------------------------------------------===
 159
 160 Compile this:
 161
 162 int %f1(int %a, int %b) {
 163         %tmp.1 = and int %a, 15         ; <int> [#uses=1]
 164         %tmp.3 = and int %b, 240                ; <int> [#uses=1]
 165         %tmp.4 = or int %tmp.3, %tmp.1          ; <int> [#uses=1]
 166         ret int %tmp.4
 167 }
 168
 169 without a copy.  We make this currently:
 170
 171 _f1:
 172         rlwinm r2, r4, 0, 24, 27
 173         rlwimi r2, r3, 0, 28, 31
 174         or r3, r2, r2
 175         blr
 176
 177 The two-addr pass or RA needs to learn when it is profitable to commute an
 178 instruction to avoid a copy AFTER the 2-addr instruction.  The 2-addr pass
 179 currently only commutes to avoid inserting a copy BEFORE the two addr instr.
 180
 181 ===-------------------------------------------------------------------------===
 182
 183 Compile offsets from allocas:
 184
 185 int *%test() {
 186         %X = alloca { int, int }
 187         %Y = getelementptr {int,int}* %X, int 0, uint 1
 188         ret int* %Y
 189 }
 190
 191 into a single add, not two:
 192
 193 _test:
 194         addi r2, r1, -8
 195         addi r3, r2, 4
 196         blr
 197
 198 --> important for C++.
 199
 200 ===-------------------------------------------------------------------------===
 201
 202 int test3(int a, int b) { return (a < 0) ? a : 0; }
 203
 204 should be branch free code.  LLVM is turning it into < 1 because of the RHS.
 205
 206 ===-------------------------------------------------------------------------===
 207
 208 No loads or stores of the constants should be needed:
 209
 210 struct foo { double X, Y; };
 211 void xxx(struct foo F);
 212 void bar() { struct foo R = { 1.0, 2.0 }; xxx(R); }
 213
 214 ===-------------------------------------------------------------------------===
 215
 216 Darwin Stub LICM optimization:
 217
 218 Loops like this:
 219
 220   for (...)  bar();
 221
 222 Have to go through an indirect stub if bar is external or linkonce.  It would
 223 be better to compile it as:
 224
 225      fp = &bar;
 226      for (...)  fp();
 227
 228 which only computes the address of bar once (instead of each time through the
 229 stub).  This is Darwin specific and would have to be done in the code generator.
 230 Probably not a win on x86.
 231
 232 ===-------------------------------------------------------------------------===
 233
 234 PowerPC i1/setcc stuff (depends on subreg stuff):
 235
 236 Check out the PPC code we get for 'compare' in this testcase:
 237 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=19672
 238
 239 oof.  on top of not doing the logical crnand instead of (mfcr, mfcr,
 240 invert, invert, or), we then have to compare it against zero instead of
 241 using the value already in a CR!
 242
 243 that should be something like
 244         cmpw cr7, r8, r5
 245         cmpw cr0, r7, r3
 246         crnand cr0, cr0, cr7
 247         bne cr0, LBB_compare_4
 248
 249 instead of
 250         cmpw cr7, r8, r5
 251         cmpw cr0, r7, r3
 252         mfcr r7, 1
 253         mcrf cr7, cr0
 254         mfcr r8, 1
 255         rlwinm r7, r7, 30, 31, 31
 256         rlwinm r8, r8, 30, 31, 31
 257         xori r7, r7, 1
 258         xori r8, r8, 1
 259         addi r2, r2, 1
 260         or r7, r8, r7
 261         cmpwi cr0, r7, 0
 262         bne cr0, LBB_compare_4  ; loopexit
 263
 264 FreeBench/mason has a basic block that looks like this:
 265
 266          %tmp.130 = seteq int %p.0__, 5          ; <bool> [#uses=1]
 267          %tmp.134 = seteq int %p.1__, 6          ; <bool> [#uses=1]
 268          %tmp.139 = seteq int %p.2__, 12         ; <bool> [#uses=1]
 269          %tmp.144 = seteq int %p.3__, 13         ; <bool> [#uses=1]
 270          %tmp.149 = seteq int %p.4__, 14         ; <bool> [#uses=1]
 271          %tmp.154 = seteq int %p.5__, 15         ; <bool> [#uses=1]
 272          %bothcond = and bool %tmp.134, %tmp.130         ; <bool> [#uses=1]
 273          %bothcond123 = and bool %bothcond, %tmp.139             ; <bool>
 274          %bothcond124 = and bool %bothcond123, %tmp.144          ; <bool>
 275          %bothcond125 = and bool %bothcond124, %tmp.149          ; <bool>
 276          %bothcond126 = and bool %bothcond125, %tmp.154          ; <bool>
 277          br bool %bothcond126, label %shortcirc_next.5, label %else.0
 278
 279 This is a particularly important case where handling CRs better will help.
 280
 281 ===-------------------------------------------------------------------------===
 282
 283 Simple IPO for argument passing, change:
 284   void foo(int X, double Y, int Z) -> void foo(int X, int Z, double Y)
 285
 286 the Darwin ABI specifies that any integer arguments in the first 32 bytes worth
 287 of arguments get assigned to r3 through r10. That is, if you have a function
 288 foo(int, double, int) you get r3, f1, r6, since the 64 bit double ate up the
 289 argument bytes for r4 and r5. The trick then would be to shuffle the argument
 290 order for functions we can internalize so that the maximum number of
 291 integers/pointers get passed in regs before you see any of the fp arguments.
 292
 293 Instead of implementing this, it would actually probably be easier to just
 294 implement a PPC fastcc, where we could do whatever we wanted to the CC,
 295 including having this work sanely.
 296
 297 ===-------------------------------------------------------------------------===
 298
 299 Fix Darwin FP-In-Integer Registers ABI
 300
 301 Darwin passes doubles in structures in integer registers, which is very very
 302 bad.  Add something like a BIT_CONVERT to LLVM, then do an i-p transformation
 303 that percolates these things out of functions.
 304
 305 Check out how horrible this is:
 306 http://gcc.gnu.org/ml/gcc/2005-10/msg01036.html
 307
 308 This is an extension of "interprocedural CC unmunging" that can't be done with
 309 just fastcc.
 310
 311 ===-------------------------------------------------------------------------===
 312
 313 Generate lwbrx and other byteswapping load/store instructions when reasonable.
 314
 315 ===-------------------------------------------------------------------------===
 316
 317 Implement TargetConstantVec, and set up PPC to custom lower ConstantVec into
 318 TargetConstantVec's if it's one of the many forms that are algorithmically
 319 computable using the spiffy altivec instructions.
 320
 321 ===-------------------------------------------------------------------------===
 322
 323 Compile this:
 324
 325 int foo(int a) {
 326   int b = (a < 8);
 327   if (b) {
 328     return b * 3;     // ignore the fact that this is always 3.
 329   } else {
 330     return 2;
 331   }
 332 }
 333
 334 into something not this:
 335
 336 _foo:
 337 1)      cmpwi cr7, r3, 8
 338         mfcr r2, 1
 339         rlwinm r2, r2, 29, 31, 31
 340 1)      cmpwi cr0, r3, 7
 341         bgt cr0, LBB1_2 ; UnifiedReturnBlock
 342 LBB1_1: ; then
 343         rlwinm r2, r2, 0, 31, 31
 344         mulli r3, r2, 3
 345         blr
 346 LBB1_2: ; UnifiedReturnBlock
 347         li r3, 2
 348         blr
 349
 350 In particular, the two compares (marked 1) could be shared by reversing one.
 351 This could be done in the dag combiner, by swapping a BR_CC when a SETCC of the
 352 same operands (but backwards) exists.  In this case, this wouldn't save us
 353 anything though, because the compares still wouldn't be shared.
 354
 355 ===-------------------------------------------------------------------------===
 356
 357 The legalizer should lower this:
 358
 359 bool %test(ulong %x) {
 360   %tmp = setlt ulong %x, 4294967296
 361   ret bool %tmp
 362 }
 363
 364 into "if x.high == 0", not:
 365
 366 _test:
 367         addi r2, r3, -1
 368         cntlzw r2, r2
 369         cntlzw r3, r3
 370         srwi r2, r2, 5
 371         srwi r4, r3, 5
 372         li r3, 0
 373         cmpwi cr0, r2, 0
 374         bne cr0, LBB1_2 ;
 375 LBB1_1:
 376         or r3, r4, r4
 377 LBB1_2:
 378         blr
 379
 380 noticed in 2005-05-11-Popcount-ffs-fls.c.
 381
 382
 383 ===-------------------------------------------------------------------------===
 384
 385 We should custom expand setcc instead of pretending that we have it.  That
 386 would allow us to expose the access of the crbit after the mfcr, allowing
 387 that access to be trivially folded into other ops.  A simple example:
 388
 389 int foo(int a, int b) { return (a < b) << 4; }
 390
 391 compiles into:
 392
 393 _foo:
 394         cmpw cr7, r3, r4
 395         mfcr r2, 1
 396         rlwinm r2, r2, 29, 31, 31
 397         slwi r3, r2, 4
 398         blr
 399
 400 ===-------------------------------------------------------------------------===
 401
 402 Fold add and sub with constant into non-extern, non-weak addresses so this:
 403
 404 static int a;
 405 void bar(int b) { a = b; }
 406 void foo(unsigned char *c) {
 407   *c = a;
 408 }
 409
 410 So that
 411
 412 _foo:
 413         lis r2, ha16(_a)
 414         la r2, lo16(_a)(r2)
 415         lbz r2, 3(r2)
 416         stb r2, 0(r3)
 417         blr
 418
 419 Becomes
 420
 421 _foo:
 422         lis r2, ha16(_a+3)
 423         lbz r2, lo16(_a+3)(r2)
 424         stb r2, 0(r3)
 425         blr
 426
 427 ===-------------------------------------------------------------------------===
 428
 429 We generate really bad code for this:
 430
 431 int f(signed char *a, _Bool b, _Bool c) {
 432    signed char t = 0;
 433   if (b)  t = *a;
 434   if (c)  *a = t;
 435 }
 436
 437 ===-------------------------------------------------------------------------===
 438
 439 This:
 440 int test(unsigned *P) { return *P >> 24; }
 441
 442 Should compile to:
 443
 444 _test:
 445         lbz r3,0(r3)
 446         blr
 447
 448 not:
 449
 450 _test:
 451         lwz r2, 0(r3)
 452         srwi r3, r2, 24
 453         blr
 454
 455 ===-------------------------------------------------------------------------===
 456
 457 On the G5, logical CR operations are more expensive in their three
 458 address form: ops that read/write the same register are half as expensive as
 459 those that read from two registers that are different from their destination.
 460
 461 We should model this with two separate instructions.  The isel should generate
 462 the "two address" form of the instructions.  When the register allocator
 463 detects that it needs to insert a copy due to the two-addresness of the CR
 464 logical op, it will invoke PPCInstrInfo::convertToThreeAddress.  At this point
 465 we can convert to the "three address" instruction, to save code space.
 466
 467 This only matters when we start generating cr logical ops.
 468
 469 ===-------------------------------------------------------------------------===
 470
 471 We should compile these two functions to the same thing:
 472
 473 #include <stdlib.h>
 474 void f(int a, int b, int *P) {
 475   *P = (a-b)>=0?(a-b):(b-a);
 476 }
 477 void g(int a, int b, int *P) {
 478   *P = abs(a-b);
 479 }
 480
 481 Further, they should compile to something better than:
 482
 483 _g:
 484         subf r2, r4, r3
 485         subfic r3, r2, 0
 486         cmpwi cr0, r2, -1
 487         bgt cr0, LBB2_2 ; entry
 488 LBB2_1: ; entry
 489         mr r2, r3
 490 LBB2_2: ; entry
 491         stw r2, 0(r5)
 492         blr
 493
 494 GCC produces:
 495
 496 _g:
 497         subf r4,r4,r3
 498         srawi r2,r4,31
 499         xor r0,r2,r4
 500         subf r0,r2,r0
 501         stw r0,0(r5)
 502         blr
 503
 504 ... which is much nicer.
 505
 506 This theoretically may help improve twolf slightly (used in dimbox.c:142?).
 507
 508 ===-------------------------------------------------------------------------===
 509
 510 int foo(int N, int ***W, int **TK, int X) {
 511   int t, i;
 512
 513   for (t = 0; t < N; ++t)
 514     for (i = 0; i < 4; ++i)
 515       W[t / X][i][t % X] = TK[i][t];
 516
 517   return 5;
 518 }
 519
 520 We generate relatively atrocious code for this loop compared to gcc.