lib/Target/PowerPC/README.txt

   1 TODO:
   2 * gpr0 allocation
   3 * implement do-loop -> bdnz transform
   4 * implement powerpc-64 for darwin
   5
   6 ===-------------------------------------------------------------------------===
   7
   8 Use the stfiwx instruction for:
   9
  10 void foo(float a, int *b) { *b = a; }
  11
  12 ===-------------------------------------------------------------------------===
  13
  14 Support 'update' load/store instructions.  These are cracked on the G5, but are
  15 still a codesize win.
  16
  17 ===-------------------------------------------------------------------------===
  18
  19 Should hint to the branch select pass that it doesn't need to print the second
  20 unconditional branch, so we don't end up with things like:
  21         b .LBBl42__2E_expand_function_8_674     ; loopentry.24
  22         b .LBBl42__2E_expand_function_8_42      ; NewDefault
  23         b .LBBl42__2E_expand_function_8_42      ; NewDefault
  24
  25 This occurs in SPASS.
  26
  27 ===-------------------------------------------------------------------------===
  28
  29 * Codegen this:
  30
  31    void test2(int X) {
  32      if (X == 0x12345678) bar();
  33    }
  34
  35     as:
  36
  37        xoris r0,r3,0x1234
  38        cmplwi cr0,r0,0x5678
  39        beq cr0,L6
  40
  41     not:
  42
  43         lis r2, 4660
  44         ori r2, r2, 22136
  45         cmpw cr0, r3, r2
  46         bne .LBB_test2_2
  47
  48 ===-------------------------------------------------------------------------===
  49
  50 Lump the constant pool for each function into ONE pic object, and reference
  51 pieces of it as offsets from the start.  For functions like this (contrived
  52 to have lots of constants obviously):
  53
  54 double X(double Y) { return (Y*1.23 + 4.512)*2.34 + 14.38; }
  55
  56 We generate:
  57
  58 _X:
  59         lis r2, ha16(.CPI_X_0)
  60         lfd f0, lo16(.CPI_X_0)(r2)
  61         lis r2, ha16(.CPI_X_1)
  62         lfd f2, lo16(.CPI_X_1)(r2)
  63         fmadd f0, f1, f0, f2
  64         lis r2, ha16(.CPI_X_2)
  65         lfd f1, lo16(.CPI_X_2)(r2)
  66         lis r2, ha16(.CPI_X_3)
  67         lfd f2, lo16(.CPI_X_3)(r2)
  68         fmadd f1, f0, f1, f2
  69         blr
  70
  71 It would be better to materialize .CPI_X into a register, then use immediates
  72 off of the register to avoid the lis's.  This is even more important in PIC
  73 mode.
  74
  75 Note that this (and the static variable version) is discussed here for GCC:
  76 http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
  77
  78 ===-------------------------------------------------------------------------===
  79
  80 PIC Code Gen IPO optimization:
  81
  82 Squish small scalar globals together into a single global struct, allowing the
  83 address of the struct to be CSE'd, avoiding PIC accesses (also reduces the size
  84 of the GOT on targets with one).
  85
  86 Note that this is discussed here for GCC:
  87 http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
  88
  89 ===-------------------------------------------------------------------------===
  90
  91 Implement Newton-Raphson method for improving estimate instructions to the
  92 correct accuracy, and implementing divide as multiply by reciprocal when it has
  93 more than one use.  Itanium will want this too.
  94
  95 ===-------------------------------------------------------------------------===
  96
  97 #define  ARRAY_LENGTH  16
  98
  99 union bitfield {
 100         struct {
 101 #ifndef __ppc__
 102                 unsigned int                       field0 : 6;
 103                 unsigned int                       field1 : 6;
 104                 unsigned int                       field2 : 6;
 105                 unsigned int                       field3 : 6;
 106                 unsigned int                       field4 : 3;
 107                 unsigned int                       field5 : 4;
 108                 unsigned int                       field6 : 1;
 109 #else
 110                 unsigned int                       field6 : 1;
 111                 unsigned int                       field5 : 4;
 112                 unsigned int                       field4 : 3;
 113                 unsigned int                       field3 : 6;
 114                 unsigned int                       field2 : 6;
 115                 unsigned int                       field1 : 6;
 116                 unsigned int                       field0 : 6;
 117 #endif
 118         } bitfields, bits;
 119         unsigned int    u32All;
 120         signed int      i32All;
 121         float   f32All;
 122 };
 123
 124
 125 typedef struct program_t {
 126         union bitfield    array[ARRAY_LENGTH];
 127     int               size;
 128     int               loaded;
 129 } program;
 130
 131
 132 void AdjustBitfields(program* prog, unsigned int fmt1)
 133 {
 134         prog->array[0].bitfields.field0 = fmt1;
 135         prog->array[0].bitfields.field1 = fmt1 + 1;
 136 }
 137
 138 We currently generate:
 139
 140 _AdjustBitfields:
 141         lwz r2, 0(r3)
 142         addi r5, r4, 1
 143         rlwinm r2, r2, 0, 0, 19
 144         rlwinm r5, r5, 6, 20, 25
 145         rlwimi r2, r4, 0, 26, 31
 146         or r2, r2, r5
 147         stw r2, 0(r3)
 148         blr
 149
 150 We should teach someone that or (rlwimi, rlwinm) with disjoint masks can be
 151 turned into rlwimi (rlwimi).
 152
 153 The better codegen would be:
 154
 155 _AdjustBitfields:
 156         lwz r0,0(r3)
 157         rlwinm r4,r4,0,0xff
 158         rlwimi r0,r4,0,26,31
 159         addi r4,r4,1
 160         rlwimi r0,r4,6,20,25
 161         stw r0,0(r3)
 162         blr
 163
 164 ===-------------------------------------------------------------------------===
 165
 166 Compile this:
 167
 168 int %f1(int %a, int %b) {
 169         %tmp.1 = and int %a, 15         ; <int> [#uses=1]
 170         %tmp.3 = and int %b, 240                ; <int> [#uses=1]
 171         %tmp.4 = or int %tmp.3, %tmp.1          ; <int> [#uses=1]
 172         ret int %tmp.4
 173 }
 174
 175 without a copy.  We make this currently:
 176
 177 _f1:
 178         rlwinm r2, r4, 0, 24, 27
 179         rlwimi r2, r3, 0, 28, 31
 180         or r3, r2, r2
 181         blr
 182
 183 The two-addr pass or RA needs to learn when it is profitable to commute an
 184 instruction to avoid a copy AFTER the 2-addr instruction.  The 2-addr pass
 185 currently only commutes to avoid inserting a copy BEFORE the two addr instr.
 186
 187 ===-------------------------------------------------------------------------===
 188
 189 Compile offsets from allocas:
 190
 191 int *%test() {
 192         %X = alloca { int, int }
 193         %Y = getelementptr {int,int}* %X, int 0, uint 1
 194         ret int* %Y
 195 }
 196
 197 into a single add, not two:
 198
 199 _test:
 200         addi r2, r1, -8
 201         addi r3, r2, 4
 202         blr
 203
 204 --> important for C++.
 205
 206 ===-------------------------------------------------------------------------===
 207
 208 int test3(int a, int b) { return (a < 0) ? a : 0; }
 209
 210 should be branch free code.  LLVM is turning it into < 1 because of the RHS.
 211
 212 ===-------------------------------------------------------------------------===
 213
 214 No loads or stores of the constants should be needed:
 215
 216 struct foo { double X, Y; };
 217 void xxx(struct foo F);
 218 void bar() { struct foo R = { 1.0, 2.0 }; xxx(R); }
 219
 220 ===-------------------------------------------------------------------------===
 221
 222 Darwin Stub LICM optimization:
 223
 224 Loops like this:
 225
 226   for (...)  bar();
 227
 228 have to go through an indirect stub if bar is external or linkonce.  It would
 229 be better to compile it as:
 230
 231      fp = &bar;
 232      for (...)  fp();
 233
 234 which only computes the address of bar once (instead of each time through the
 235 stub).  This is Darwin specific and would have to be done in the code generator.
 236 Probably not a win on x86.
 237
 238 ===-------------------------------------------------------------------------===
 239
 240 PowerPC i1/setcc stuff (depends on subreg stuff):
 241
 242 Check out the PPC code we get for 'compare' in this testcase:
 243 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=19672
 244
 245 Oof.  On top of not doing the logical crnand instead of (mfcr, mfcr,
 246 invert, invert, or), we then have to compare it against zero instead of
 247 using the value already in a CR!
 248
 249 That should be something like
 250         cmpw cr7, r8, r5
 251         cmpw cr0, r7, r3
 252         crnand cr0, cr0, cr7
 253         bne cr0, LBB_compare_4
 254
 255 instead of
 256         cmpw cr7, r8, r5
 257         cmpw cr0, r7, r3
 258         mfcr r7, 1
 259         mcrf cr7, cr0
 260         mfcr r8, 1
 261         rlwinm r7, r7, 30, 31, 31
 262         rlwinm r8, r8, 30, 31, 31
 263         xori r7, r7, 1
 264         xori r8, r8, 1
 265         addi r2, r2, 1
 266         or r7, r8, r7
 267         cmpwi cr0, r7, 0
 268         bne cr0, LBB_compare_4  ; loopexit
 269
 270 FreeBench/mason has a basic block that looks like this:
 271
 272          %tmp.130 = seteq int %p.0__, 5          ; <bool> [#uses=1]
 273          %tmp.134 = seteq int %p.1__, 6          ; <bool> [#uses=1]
 274          %tmp.139 = seteq int %p.2__, 12         ; <bool> [#uses=1]
 275          %tmp.144 = seteq int %p.3__, 13         ; <bool> [#uses=1]
 276          %tmp.149 = seteq int %p.4__, 14         ; <bool> [#uses=1]
 277          %tmp.154 = seteq int %p.5__, 15         ; <bool> [#uses=1]
 278          %bothcond = and bool %tmp.134, %tmp.130         ; <bool> [#uses=1]
 279          %bothcond123 = and bool %bothcond, %tmp.139             ; <bool>
 280          %bothcond124 = and bool %bothcond123, %tmp.144          ; <bool>
 281          %bothcond125 = and bool %bothcond124, %tmp.149          ; <bool>
 282          %bothcond126 = and bool %bothcond125, %tmp.154          ; <bool>
 283          br bool %bothcond126, label %shortcirc_next.5, label %else.0
 284
 285 This is a particularly important case where handling CRs better will help.
 286
 287 ===-------------------------------------------------------------------------===
 288
 289 Simple IPO for argument passing, change:
 290   void foo(int X, double Y, int Z) -> void foo(int X, int Z, double Y)
 291
 292 The Darwin ABI specifies that any integer arguments in the first 32 bytes worth
 293 of arguments get assigned to r3 through r10. That is, if you have a function
 294 foo(int, double, int) you get r3, f1, r6, since the 64 bit double ate up the
 295 argument bytes for r4 and r5. The trick then would be to shuffle the argument
 296 order for functions we can internalize so that the maximum number of
 297 integers/pointers get passed in regs before you see any of the fp arguments.
 298
 299 Instead of implementing this, it would actually probably be easier to just
 300 implement a PPC fastcc, where we could do whatever we wanted to the CC,
 301 including having this work sanely.
 302
 303 ===-------------------------------------------------------------------------===
 304
 305 Fix Darwin FP-In-Integer Registers ABI
 306
 307 Darwin passes doubles in structures in integer registers, which is very very
 308 bad.  Add something like a BIT_CONVERT to LLVM, then do an interprocedural
 309 transformation that percolates these things out of functions.
 310
 311 Check out how horrible this is:
 312 http://gcc.gnu.org/ml/gcc/2005-10/msg01036.html
 313
 314 This is an extension of "interprocedural CC unmunging" that can't be done with
 315 just fastcc.
 316
 317 ===-------------------------------------------------------------------------===
 318
 319 Generate lwbrx and other byteswapping load/store instructions when reasonable.
 320
 321 ===-------------------------------------------------------------------------===
 322
 323 Implement TargetConstantVec, and set up PPC to custom lower ConstantVec into
 324 TargetConstantVec's if it's one of the many forms that are algorithmically
 325 computable using the spiffy altivec instructions.
 326
 327 ===-------------------------------------------------------------------------===
 328
 329 Compile this:
 330
 331 int foo(int a) {
 332   int b = (a < 8);
 333   if (b) {
 334     return b * 3;     // ignore the fact that this is always 3.
 335   } else {
 336     return 2;
 337   }
 338 }
 339
 340 into something other than this:
 341
 342 _foo:
 343 1)      cmpwi cr7, r3, 8
 344         mfcr r2, 1
 345         rlwinm r2, r2, 29, 31, 31
 346 1)      cmpwi cr0, r3, 7
 347         bgt cr0, LBB1_2 ; UnifiedReturnBlock
 348 LBB1_1: ; then
 349         rlwinm r2, r2, 0, 31, 31
 350         mulli r3, r2, 3
 351         blr
 352 LBB1_2: ; UnifiedReturnBlock
 353         li r3, 2
 354         blr
 355
 356 In particular, the two compares (marked 1) could be shared by reversing one.
 357 This could be done in the dag combiner, by swapping a BR_CC when a SETCC of the
 358 same operands (but backwards) exists.  In this case, this wouldn't save us
 359 anything though, because the compares still wouldn't be shared.
 360
 361 ===-------------------------------------------------------------------------===
 362
 363 The legalizer should lower this:
 364
 365 bool %test(ulong %x) {
 366   %tmp = setlt ulong %x, 4294967296
 367   ret bool %tmp
 368 }
 369
 370 into "if x.high == 0", not:
 371
 372 _test:
 373         addi r2, r3, -1
 374         cntlzw r2, r2
 375         cntlzw r3, r3
 376         srwi r2, r2, 5
 377         srwi r4, r3, 5
 378         li r3, 0
 379         cmpwi cr0, r2, 0
 380         bne cr0, LBB1_2 ;
 381 LBB1_1:
 382         or r3, r4, r4
 383 LBB1_2:
 384         blr
 385
 386 Noticed in 2005-05-11-Popcount-ffs-fls.c.
 387
 388
 389 ===-------------------------------------------------------------------------===
 390
 391 We should custom expand setcc instead of pretending that we have it.  That
 392 would allow us to expose the access of the crbit after the mfcr, allowing
 393 that access to be trivially folded into other ops.  A simple example:
 394
 395 int foo(int a, int b) { return (a < b) << 4; }
 396
 397 compiles into:
 398
 399 _foo:
 400         cmpw cr7, r3, r4
 401         mfcr r2, 1
 402         rlwinm r2, r2, 29, 31, 31
 403         slwi r3, r2, 4
 404         blr
 405
 406 ===-------------------------------------------------------------------------===
 407
 408 Fold add and sub with constant into non-extern, non-weak addresses so this:
 409
 410 static int a;
 411 void bar(int b) { a = b; }
 412 void foo(unsigned char *c) {
 413   *c = a;
 414 }
 415
 416 So that
 417
 418 _foo:
 419         lis r2, ha16(_a)
 420         la r2, lo16(_a)(r2)
 421         lbz r2, 3(r2)
 422         stb r2, 0(r3)
 423         blr
 424
 425 Becomes
 426
 427 _foo:
 428         lis r2, ha16(_a+3)
 429         lbz r2, lo16(_a+3)(r2)
 430         stb r2, 0(r3)
 431         blr
 432
 433 ===-------------------------------------------------------------------------===
 434
 435 We generate really bad code for this:
 436
 437 int f(signed char *a, _Bool b, _Bool c) {
 438    signed char t = 0;
 439   if (b)  t = *a;
 440   if (c)  *a = t;
 441 }
 442