lib/Target/PowerPC/README.txt

   1 TODO:
   2 * gpr0 allocation
   3 * implement do-loop -> bdnz transform
   4 * implement powerpc-64 for darwin
   5
   6 ===-------------------------------------------------------------------------===
   7
   8 Use the stfiwx instruction for:
   9
  10 void foo(float a, int *b) { *b = a; }
  11
  12 ===-------------------------------------------------------------------------===
  13
  14 Support 'update' load/store instructions.  These are cracked on the G5, but are
  15 still a codesize win.
  16
  17 ===-------------------------------------------------------------------------===
  18
  19 Should hint to the branch select pass that it doesn't need to print the second
  20 unconditional branch, so we don't end up with things like:
  21         b .LBBl42__2E_expand_function_8_674     ; loopentry.24
  22         b .LBBl42__2E_expand_function_8_42      ; NewDefault
  23         b .LBBl42__2E_expand_function_8_42      ; NewDefault
  24
  25 This occurs in SPASS.
  26
  27 ===-------------------------------------------------------------------------===
  28
  29 * Codegen this:
  30
  31    void test2(int X) {
  32      if (X == 0x12345678) bar();
  33    }
  34
  35     as:
  36
  37        xoris r0,r3,0x1234
  38        cmplwi cr0,r0,0x5678
  39        beq cr0,L6
  40
  41     not:
  42
  43         lis r2, 4660
  44         ori r2, r2, 22136
  45         cmpw cr0, r3, r2
  46         bne .LBB_test2_2
  47
  48 ===-------------------------------------------------------------------------===
  49
  50 Lump the constant pool for each function into ONE pic object, and reference
  51 pieces of it as offsets from the start.  For functions like this (contrived
  52 to have lots of constants obviously):
  53
  54 double X(double Y) { return (Y*1.23 + 4.512)*2.34 + 14.38; }
  55
  56 We generate:
  57
  58 _X:
  59         lis r2, ha16(.CPI_X_0)
  60         lfd f0, lo16(.CPI_X_0)(r2)
  61         lis r2, ha16(.CPI_X_1)
  62         lfd f2, lo16(.CPI_X_1)(r2)
  63         fmadd f0, f1, f0, f2
  64         lis r2, ha16(.CPI_X_2)
  65         lfd f1, lo16(.CPI_X_2)(r2)
  66         lis r2, ha16(.CPI_X_3)
  67         lfd f2, lo16(.CPI_X_3)(r2)
  68         fmadd f1, f0, f1, f2
  69         blr
  70
  71 It would be better to materialize .CPI_X into a register, then use immediates
  72 off of the register to avoid the lis's.  This is even more important in PIC
  73 mode.
  74
  75 Note that this (and the static variable version) is discussed here for GCC:
  76 http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
  77
  78 ===-------------------------------------------------------------------------===
  79
  80 PIC Code Gen IPO optimization:
  81
  82 Squish small scalar globals together into a single global struct, allowing the
  83 address of the struct to be CSE'd, avoiding PIC accesses (also reduces the size
  84 of the GOT on targets with one).
  85
  86 Note that this is discussed here for GCC:
  87 http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
  88
  89 ===-------------------------------------------------------------------------===
  90
   91 Implement Newton-Raphson method for improving estimate instructions to the
  92 correct accuracy, and implementing divide as multiply by reciprocal when it has
  93 more than one use.  Itanium will want this too.
  94
  95 ===-------------------------------------------------------------------------===
  96
  97 #define  ARRAY_LENGTH  16
  98
  99 union bitfield {
 100         struct {
 101 #ifndef __ppc__
 102                 unsigned int                       field0 : 6;
 103                 unsigned int                       field1 : 6;
 104                 unsigned int                       field2 : 6;
 105                 unsigned int                       field3 : 6;
 106                 unsigned int                       field4 : 3;
 107                 unsigned int                       field5 : 4;
 108                 unsigned int                       field6 : 1;
 109 #else
 110                 unsigned int                       field6 : 1;
 111                 unsigned int                       field5 : 4;
 112                 unsigned int                       field4 : 3;
 113                 unsigned int                       field3 : 6;
 114                 unsigned int                       field2 : 6;
 115                 unsigned int                       field1 : 6;
 116                 unsigned int                       field0 : 6;
 117 #endif
 118         } bitfields, bits;
 119         unsigned int    u32All;
 120         signed int      i32All;
 121         float   f32All;
 122 };
 123
 124
 125 typedef struct program_t {
 126         union bitfield    array[ARRAY_LENGTH];
 127     int               size;
 128     int               loaded;
 129 } program;
 130
 131
 132 void AdjustBitfields(program* prog, unsigned int fmt1)
 133 {
 134         prog->array[0].bitfields.field0 = fmt1;
 135         prog->array[0].bitfields.field1 = fmt1 + 1;
 136 }
 137
 138 We currently generate:
 139
 140 _AdjustBitfields:
 141         lwz r2, 0(r3)
 142         addi r5, r4, 1
 143         rlwinm r2, r2, 0, 0, 19
 144         rlwinm r5, r5, 6, 20, 25
 145         rlwimi r2, r4, 0, 26, 31
 146         or r2, r2, r5
 147         stw r2, 0(r3)
 148         blr
 149
  150 We should teach the compiler that an or of (rlwimi, rlwinm) with disjoint
  151 masks can be turned into rlwimi (rlwimi).
 152
 153 The better codegen would be:
 154
 155 _AdjustBitfields:
 156         lwz r0,0(r3)
 157         rlwinm r4,r4,0,0xff
 158         rlwimi r0,r4,0,26,31
 159         addi r4,r4,1
 160         rlwimi r0,r4,6,20,25
 161         stw r0,0(r3)
 162         blr
 163
 164 ===-------------------------------------------------------------------------===
 165
 166 Compile this:
 167
 168 int %f1(int %a, int %b) {
 169         %tmp.1 = and int %a, 15         ; <int> [#uses=1]
 170         %tmp.3 = and int %b, 240                ; <int> [#uses=1]
 171         %tmp.4 = or int %tmp.3, %tmp.1          ; <int> [#uses=1]
 172         ret int %tmp.4
 173 }
 174
 175 without a copy.  We make this currently:
 176
 177 _f1:
 178         rlwinm r2, r4, 0, 24, 27
 179         rlwimi r2, r3, 0, 28, 31
 180         or r3, r2, r2
 181         blr
 182
 183 The two-addr pass or RA needs to learn when it is profitable to commute an
 184 instruction to avoid a copy AFTER the 2-addr instruction.  The 2-addr pass
 185 currently only commutes to avoid inserting a copy BEFORE the two addr instr.
 186
 187 ===-------------------------------------------------------------------------===
 188
 189 Compile offsets from allocas:
 190
 191 int *%test() {
 192         %X = alloca { int, int }
 193         %Y = getelementptr {int,int}* %X, int 0, uint 1
 194         ret int* %Y
 195 }
 196
 197 into a single add, not two:
 198
 199 _test:
 200         addi r2, r1, -8
 201         addi r3, r2, 4
 202         blr
 203
 204 --> important for C++.
 205
 206 ===-------------------------------------------------------------------------===
 207
 208 int test3(int a, int b) { return (a < 0) ? a : 0; }
 209
  210 should be branch-free code.  LLVM is turning it into < 1 because of the RHS.
 211
 212 ===-------------------------------------------------------------------------===
 213
 214 No loads or stores of the constants should be needed:
 215
 216 struct foo { double X, Y; };
 217 void xxx(struct foo F);
 218 void bar() { struct foo R = { 1.0, 2.0 }; xxx(R); }
 219
 220 ===-------------------------------------------------------------------------===
 221
 222 Darwin Stub LICM optimization:
 223
 224 Loops like this:
 225
 226   for (...)  bar();
 227
 228 Have to go through an indirect stub if bar is external or linkonce.  It would
 229 be better to compile it as:
 230
 231      fp = &bar;
 232      for (...)  fp();
 233
 234 which only computes the address of bar once (instead of each time through the
 235 stub).  This is Darwin specific and would have to be done in the code generator.
 236 Probably not a win on x86.
 237
 238 ===-------------------------------------------------------------------------===
 239
 240 PowerPC i1/setcc stuff (depends on subreg stuff):
 241
 242 Check out the PPC code we get for 'compare' in this testcase:
 243 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=19672
 244
  245 Oof.  On top of using (mfcr, mfcr, invert, invert, or) instead of a single
  246 logical crnand, we then have to compare the result against zero instead of
  247 using the value already in a CR!
 248
  249 That should be something like:
 250         cmpw cr7, r8, r5
 251         cmpw cr0, r7, r3
 252         crnand cr0, cr0, cr7
 253         bne cr0, LBB_compare_4
 254
 255 instead of
 256         cmpw cr7, r8, r5
 257         cmpw cr0, r7, r3
 258         mfcr r7, 1
 259         mcrf cr7, cr0
 260         mfcr r8, 1
 261         rlwinm r7, r7, 30, 31, 31
 262         rlwinm r8, r8, 30, 31, 31
 263         xori r7, r7, 1
 264         xori r8, r8, 1
 265         addi r2, r2, 1
 266         or r7, r8, r7
 267         cmpwi cr0, r7, 0
 268         bne cr0, LBB_compare_4  ; loopexit
 269
 270 FreeBench/mason has a basic block that looks like this:
 271
 272          %tmp.130 = seteq int %p.0__, 5          ; <bool> [#uses=1]
 273          %tmp.134 = seteq int %p.1__, 6          ; <bool> [#uses=1]
 274          %tmp.139 = seteq int %p.2__, 12         ; <bool> [#uses=1]
 275          %tmp.144 = seteq int %p.3__, 13         ; <bool> [#uses=1]
 276          %tmp.149 = seteq int %p.4__, 14         ; <bool> [#uses=1]
 277          %tmp.154 = seteq int %p.5__, 15         ; <bool> [#uses=1]
 278          %bothcond = and bool %tmp.134, %tmp.130         ; <bool> [#uses=1]
 279          %bothcond123 = and bool %bothcond, %tmp.139             ; <bool>
 280          %bothcond124 = and bool %bothcond123, %tmp.144          ; <bool>
 281          %bothcond125 = and bool %bothcond124, %tmp.149          ; <bool>
 282          %bothcond126 = and bool %bothcond125, %tmp.154          ; <bool>
 283          br bool %bothcond126, label %shortcirc_next.5, label %else.0
 284
 285 This is a particularly important case where handling CRs better will help.
 286
 287 ===-------------------------------------------------------------------------===
 288
 289 Simple IPO for argument passing, change:
 290   void foo(int X, double Y, int Z) -> void foo(int X, int Z, double Y)
 291
  292 The Darwin ABI specifies that any integer arguments in the first 32 bytes' worth
 293 of arguments get assigned to r3 through r10. That is, if you have a function
 294 foo(int, double, int) you get r3, f1, r6, since the 64 bit double ate up the
 295 argument bytes for r4 and r5. The trick then would be to shuffle the argument
 296 order for functions we can internalize so that the maximum number of
 297 integers/pointers get passed in regs before you see any of the fp arguments.
 298
 299 Instead of implementing this, it would actually probably be easier to just
 300 implement a PPC fastcc, where we could do whatever we wanted to the CC,
 301 including having this work sanely.
 302
 303 ===-------------------------------------------------------------------------===
 304
 305 Fix Darwin FP-In-Integer Registers ABI
 306
  307 Darwin passes doubles in structures in integer registers, which is very very
  308 bad.  Add something like a BIT_CONVERT to LLVM, then do an interprocedural
  309 transformation that percolates these things out of functions.
 310
 311 Check out how horrible this is:
 312 http://gcc.gnu.org/ml/gcc/2005-10/msg01036.html
 313
 314 This is an extension of "interprocedural CC unmunging" that can't be done with
 315 just fastcc.
 316
 317 ===-------------------------------------------------------------------------===
 318
 319 Generate lwbrx and other byteswapping load/store instructions when reasonable.
 320
 321 ===-------------------------------------------------------------------------===
 322
 323 Implement TargetConstantVec, and set up PPC to custom lower ConstantVec into
 324 TargetConstantVec's if it's one of the many forms that are algorithmically
 325 computable using the spiffy altivec instructions.
 326
 327 ===-------------------------------------------------------------------------===
 328
 329 Compile this:
 330
 331 double %test(double %X) {
 332         %Y = cast double %X to long
 333         %Z = cast long %Y to double
 334         ret double %Z
 335 }
 336
 337 to this:
 338
 339 _test:
 340         fctidz f0, f1
 341         stfd f0, -8(r1)
 342         lwz r2, -4(r1)
 343         lwz r3, -8(r1)
 344         stw r2, -12(r1)
 345         stw r3, -16(r1)
 346         lfd f0, -16(r1)
 347         fcfid f1, f0
 348         blr
 349
 350 without the lwz/stw's.
 351
 352 ===-------------------------------------------------------------------------===
 353
 354 Compile this:
 355
 356 int foo(int a) {
 357   int b = (a < 8);
 358   if (b) {
 359     return b * 3;     // ignore the fact that this is always 3.
 360   } else {
 361     return 2;
 362   }
 363 }
 364
  365 into something other than this:
 366
 367 _foo:
 368 1)      cmpwi cr7, r3, 8
 369         mfcr r2, 1
 370         rlwinm r2, r2, 29, 31, 31
 371 1)      cmpwi cr0, r3, 7
 372         bgt cr0, LBB1_2 ; UnifiedReturnBlock
 373 LBB1_1: ; then
 374         rlwinm r2, r2, 0, 31, 31
 375         mulli r3, r2, 3
 376         blr
 377 LBB1_2: ; UnifiedReturnBlock
 378         li r3, 2
 379         blr
 380
 381 In particular, the two compares (marked 1) could be shared by reversing one.
 382 This could be done in the dag combiner, by swapping a BR_CC when a SETCC of the
 383 same operands (but backwards) exists.  In this case, this wouldn't save us
 384 anything though, because the compares still wouldn't be shared.
 385
 386 ===-------------------------------------------------------------------------===
 387
 388 The legalizer should lower this:
 389
 390 bool %test(ulong %x) {
 391   %tmp = setlt ulong %x, 4294967296
 392   ret bool %tmp
 393 }
 394
 395 into "if x.high == 0", not:
 396
 397 _test:
 398         addi r2, r3, -1
 399         cntlzw r2, r2
 400         cntlzw r3, r3
 401         srwi r2, r2, 5
 402         srwi r4, r3, 5
 403         li r3, 0
 404         cmpwi cr0, r2, 0
 405         bne cr0, LBB1_2 ;
 406 LBB1_1:
 407         or r3, r4, r4
 408 LBB1_2:
 409         blr
 410
  411 Noticed in 2005-05-11-Popcount-ffs-fls.c.
 412
 413
 414 ===-------------------------------------------------------------------------===
 415
 416 We should custom expand setcc instead of pretending that we have it.  That
 417 would allow us to expose the access of the crbit after the mfcr, allowing
 418 that access to be trivially folded into other ops.  A simple example:
 419
 420 int foo(int a, int b) { return (a < b) << 4; }
 421
 422 compiles into:
 423
 424 _foo:
 425         cmpw cr7, r3, r4
 426         mfcr r2, 1
 427         rlwinm r2, r2, 29, 31, 31
 428         slwi r3, r2, 4
 429         blr
 430
 431 ===-------------------------------------------------------------------------===
 432
 433 Fold add and sub with constant into non-extern, non-weak addresses so this:
 434
 435 static int a;
 436 void bar(int b) { a = b; }
 437 void foo(unsigned char *c) {
 438   *c = a;
 439 }
 440
 441 So that
 442
 443 _foo:
 444         lis r2, ha16(_a)
 445         la r2, lo16(_a)(r2)
 446         lbz r2, 3(r2)
 447         stb r2, 0(r3)
 448         blr
 449
 450 Becomes
 451
 452 _foo:
 453         lis r2, ha16(_a+3)
 454         lbz r2, lo16(_a+3)(r2)
 455         stb r2, 0(r3)
 456         blr
 457
 458 ===-------------------------------------------------------------------------===
 459
 460 We generate really bad code for this:
 461
 462 int f(signed char *a, _Bool b, _Bool c) {
 463    signed char t = 0;
 464   if (b)  t = *a;
 465   if (c)  *a = t;
 466 }
 467