lib/Target/PowerPC/README.txt

   1 //===- README.txt - Notes for improving PowerPC-specific code gen ---------===//
   2
   3 TODO:
   4 * gpr0 allocation
   5 * implement do-loop -> bdnz transform
   6
   7 ===-------------------------------------------------------------------------===
   8
   9 Support 'update' load/store instructions.  These are cracked on the G5, but are
  10 still a codesize win.
  11
  12 ===-------------------------------------------------------------------------===
  13
  14 Teach the .td file to pattern match PPC::BR_COND to the appropriate bc variant,
  15 so we don't have to always run the branch selector for small functions.
  16
  17 ===-------------------------------------------------------------------------===
  18
  19 * Codegen this:
  20
  21    void test2(int X) {
  22      if (X == 0x12345678) bar();
  23    }
  24
  25     as:
  26
  27        xoris r0,r3,0x1234
  28        cmplwi cr0,r0,0x5678
  29        beq cr0,L6
  30
  31     not:
  32
  33         lis r2, 4660
  34         ori r2, r2, 22136
  35         cmpw cr0, r3, r2
  36         bne .LBB_test2_2
  37
  38 ===-------------------------------------------------------------------------===
  39
  40 Lump the constant pool for each function into ONE pic object, and reference
  41 pieces of it as offsets from the start.  For functions like this (contrived
  42 to have lots of constants obviously):
  43
  44 double X(double Y) { return (Y*1.23 + 4.512)*2.34 + 14.38; }
  45
  46 We generate:
  47
  48 _X:
  49         lis r2, ha16(.CPI_X_0)
  50         lfd f0, lo16(.CPI_X_0)(r2)
  51         lis r2, ha16(.CPI_X_1)
  52         lfd f2, lo16(.CPI_X_1)(r2)
  53         fmadd f0, f1, f0, f2
  54         lis r2, ha16(.CPI_X_2)
  55         lfd f1, lo16(.CPI_X_2)(r2)
  56         lis r2, ha16(.CPI_X_3)
  57         lfd f2, lo16(.CPI_X_3)(r2)
  58         fmadd f1, f0, f1, f2
  59         blr
  60
  61 It would be better to materialize .CPI_X into a register, then use immediates
  62 off of the register to avoid the lis's.  This is even more important in PIC
  63 mode.
  64
  65 Note that this (and the static variable version) is discussed here for GCC:
  66 http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
  67
  68 ===-------------------------------------------------------------------------===
  69
  70 PIC Code Gen IPO optimization:
  71
  72 Squish small scalar globals together into a single global struct, allowing the
  73 address of the struct to be CSE'd, avoiding PIC accesses (also reduces the size
  74 of the GOT on targets with one).
  75
  76 Note that this is discussed here for GCC:
  77 http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
  78
  79 ===-------------------------------------------------------------------------===
  80
  81 Implement Newton-Raphson method for improving estimate instructions to the
  82 correct accuracy, and implementing divide as multiply by reciprocal when it has
  83 more than one use.  Itanium will want this too.
  84
  85 ===-------------------------------------------------------------------------===
  86
  87 Compile this:
  88
  89 int %f1(int %a, int %b) {
  90         %tmp.1 = and int %a, 15         ; <int> [#uses=1]
  91         %tmp.3 = and int %b, 240                ; <int> [#uses=1]
  92         %tmp.4 = or int %tmp.3, %tmp.1          ; <int> [#uses=1]
  93         ret int %tmp.4
  94 }
  95
  96 without a copy.  We make this currently:
  97
  98 _f1:
  99         rlwinm r2, r4, 0, 24, 27
 100         rlwimi r2, r3, 0, 28, 31
 101         or r3, r2, r2
 102         blr
 103
 104 The two-addr pass or RA needs to learn when it is profitable to commute an
 105 instruction to avoid a copy AFTER the 2-addr instruction.  The 2-addr pass
 106 currently only commutes to avoid inserting a copy BEFORE the two addr instr.
 107
 108 ===-------------------------------------------------------------------------===
 109
 110 Compile offsets from allocas:
 111
 112 int *%test() {
 113         %X = alloca { int, int }
 114         %Y = getelementptr {int,int}* %X, int 0, uint 1
 115         ret int* %Y
 116 }
 117
 118 into a single add, not two:
 119
 120 _test:
 121         addi r2, r1, -8
 122         addi r3, r2, 4
 123         blr
 124
 125 --> important for C++.
 126
 127 ===-------------------------------------------------------------------------===
 128
 129 int test3(int a, int b) { return (a < 0) ? a : 0; }
 130
 131 should be branch-free code.  LLVM is turning it into < 1 because of the RHS.
 132
 133 ===-------------------------------------------------------------------------===
 134
 135 No loads or stores of the constants should be needed:
 136
 137 struct foo { double X, Y; };
 138 void xxx(struct foo F);
 139 void bar() { struct foo R = { 1.0, 2.0 }; xxx(R); }
 140
 141 ===-------------------------------------------------------------------------===
 142
 143 Darwin Stub LICM optimization:
 144
 145 Loops like this:
 146
 147   for (...)  bar();
 148
 149 Have to go through an indirect stub if bar is external or linkonce.  It would
 150 be better to compile it as:
 151
 152      fp = &bar;
 153      for (...)  fp();
 154
 155 which only computes the address of bar once (instead of each time through the
 156 stub).  This is Darwin specific and would have to be done in the code generator.
 157 Probably not a win on x86.
 158
 159 ===-------------------------------------------------------------------------===
 160
 161 PowerPC i1/setcc stuff (depends on subreg stuff):
 162
 163 Check out the PPC code we get for 'compare' in this testcase:
 164 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=19672
 165
 166 Oof.  On top of not using a single logical crnand in place of the (mfcr, mfcr,
 167 invert, invert, or) sequence, we then have to compare the result against zero
 168 instead of using the value already in a CR!
 169
 170 That should be something like
 171         cmpw cr7, r8, r5
 172         cmpw cr0, r7, r3
 173         crnand cr0, cr0, cr7
 174         bne cr0, LBB_compare_4
 175
 176 instead of
 177         cmpw cr7, r8, r5
 178         cmpw cr0, r7, r3
 179         mfcr r7, 1
 180         mcrf cr7, cr0
 181         mfcr r8, 1
 182         rlwinm r7, r7, 30, 31, 31
 183         rlwinm r8, r8, 30, 31, 31
 184         xori r7, r7, 1
 185         xori r8, r8, 1
 186         addi r2, r2, 1
 187         or r7, r8, r7
 188         cmpwi cr0, r7, 0
 189         bne cr0, LBB_compare_4  ; loopexit
 190
 191 FreeBench/mason has a basic block that looks like this:
 192
 193          %tmp.130 = seteq int %p.0__, 5          ; <bool> [#uses=1]
 194          %tmp.134 = seteq int %p.1__, 6          ; <bool> [#uses=1]
 195          %tmp.139 = seteq int %p.2__, 12         ; <bool> [#uses=1]
 196          %tmp.144 = seteq int %p.3__, 13         ; <bool> [#uses=1]
 197          %tmp.149 = seteq int %p.4__, 14         ; <bool> [#uses=1]
 198          %tmp.154 = seteq int %p.5__, 15         ; <bool> [#uses=1]
 199          %bothcond = and bool %tmp.134, %tmp.130         ; <bool> [#uses=1]
 200          %bothcond123 = and bool %bothcond, %tmp.139             ; <bool>
 201          %bothcond124 = and bool %bothcond123, %tmp.144          ; <bool>
 202          %bothcond125 = and bool %bothcond124, %tmp.149          ; <bool>
 203          %bothcond126 = and bool %bothcond125, %tmp.154          ; <bool>
 204          br bool %bothcond126, label %shortcirc_next.5, label %else.0
 205
 206 This is a particularly important case where handling CRs better will help.
 207
 208 ===-------------------------------------------------------------------------===
 209
 210 Simple IPO for argument passing, change:
 211   void foo(int X, double Y, int Z) -> void foo(int X, int Z, double Y)
 212
 213 The Darwin ABI specifies that any integer arguments in the first 32 bytes worth
 214 of arguments get assigned to r3 through r10. That is, if you have a function
 215 foo(int, double, int) you get r3, f1, r6, since the 64 bit double ate up the
 216 argument bytes for r4 and r5. The trick then would be to shuffle the argument
 217 order for functions we can internalize so that the maximum number of
 218 integers/pointers get passed in regs before you see any of the fp arguments.
 219
 220 Instead of implementing this, it would actually probably be easier to just
 221 implement a PPC fastcc, where we could do whatever we wanted to the CC,
 222 including having this work sanely.
 223
 224 ===-------------------------------------------------------------------------===
 225
 226 Fix Darwin FP-In-Integer Registers ABI
 227
 228 Darwin passes doubles in structures in integer registers, which is very very
 229 bad.  Add something like a BIT_CONVERT to LLVM, then do an i-p transformation
 230 that percolates these things out of functions.
 231
 232 Check out how horrible this is:
 233 http://gcc.gnu.org/ml/gcc/2005-10/msg01036.html
 234
 235 This is an extension of "interprocedural CC unmunging" that can't be done with
 236 just fastcc.
 237
 238 ===-------------------------------------------------------------------------===
 239
 240 Compile this:
 241
 242 int foo(int a) {
 243   int b = (a < 8);
 244   if (b) {
 245     return b * 3;     // ignore the fact that this is always 3.
 246   } else {
 247     return 2;
 248   }
 249 }
 250
 251 into something not this:
 252
 253 _foo:
 254 1)      cmpwi cr7, r3, 8
 255         mfcr r2, 1
 256         rlwinm r2, r2, 29, 31, 31
 257 1)      cmpwi cr0, r3, 7
 258         bgt cr0, LBB1_2 ; UnifiedReturnBlock
 259 LBB1_1: ; then
 260         rlwinm r2, r2, 0, 31, 31
 261         mulli r3, r2, 3
 262         blr
 263 LBB1_2: ; UnifiedReturnBlock
 264         li r3, 2
 265         blr
 266
 267 In particular, the two compares (marked 1) could be shared by reversing one.
 268 This could be done in the dag combiner, by swapping a BR_CC when a SETCC of the
 269 same operands (but backwards) exists.  In this case, this wouldn't save us
 270 anything though, because the compares still wouldn't be shared.
 271
 272 ===-------------------------------------------------------------------------===
 273
 274 The legalizer should lower this:
 275
 276 bool %test(ulong %x) {
 277   %tmp = setlt ulong %x, 4294967296
 278   ret bool %tmp
 279 }
 280
 281 into "if x.high == 0", not:
 282
 283 _test:
 284         addi r2, r3, -1
 285         cntlzw r2, r2
 286         cntlzw r3, r3
 287         srwi r2, r2, 5
 288         srwi r4, r3, 5
 289         li r3, 0
 290         cmpwi cr0, r2, 0
 291         bne cr0, LBB1_2 ;
 292 LBB1_1:
 293         or r3, r4, r4
 294 LBB1_2:
 295         blr
 296
 297 Noticed in 2005-05-11-Popcount-ffs-fls.c.
 298
 299
 300 ===-------------------------------------------------------------------------===
 301
 302 We should custom expand setcc instead of pretending that we have it.  That
 303 would allow us to expose the access of the crbit after the mfcr, allowing
 304 that access to be trivially folded into other ops.  A simple example:
 305
 306 int foo(int a, int b) { return (a < b) << 4; }
 307
 308 compiles into:
 309
 310 _foo:
 311         cmpw cr7, r3, r4
 312         mfcr r2, 1
 313         rlwinm r2, r2, 29, 31, 31
 314         slwi r3, r2, 4
 315         blr
 316
 317 ===-------------------------------------------------------------------------===
 318
 319 Fold add and sub with constant into non-extern, non-weak addresses so this:
 320
 321 static int a;
 322 void bar(int b) { a = b; }
 323 void foo(unsigned char *c) {
 324   *c = a;
 325 }
 326
 327 So that
 328
 329 _foo:
 330         lis r2, ha16(_a)
 331         la r2, lo16(_a)(r2)
 332         lbz r2, 3(r2)
 333         stb r2, 0(r3)
 334         blr
 335
 336 Becomes
 337
 338 _foo:
 339         lis r2, ha16(_a+3)
 340         lbz r2, lo16(_a+3)(r2)
 341         stb r2, 0(r3)
 342         blr
 343
 344 ===-------------------------------------------------------------------------===
 345
 346 We generate really bad code for this:
 347
 348 int f(signed char *a, _Bool b, _Bool c) {
 349    signed char t = 0;
 350   if (b)  t = *a;
 351   if (c)  *a = t;
 352 }
 353
 354 ===-------------------------------------------------------------------------===
 355
 356 This:
 357 int test(unsigned *P) { return *P >> 24; }
 358
 359 Should compile to:
 360
 361 _test:
 362         lbz r3,0(r3)
 363         blr
 364
 365 not:
 366
 367 _test:
 368         lwz r2, 0(r3)
 369         srwi r3, r2, 24
 370         blr
 371
 372 ===-------------------------------------------------------------------------===
 373
 374 On the G5, logical CR operations are more expensive in their three
 375 address form: ops that read/write the same register are half as expensive as
 376 those that read from two registers that are different from their destination.
 377
 378 We should model this with two separate instructions.  The isel should generate
 379 the "two address" form of the instructions.  When the register allocator
 380 detects that it needs to insert a copy due to the two-addressness of the CR
 381 logical op, it will invoke PPCInstrInfo::convertToThreeAddress.  At this point
 382 we can convert to the "three address" instruction, to save code space.
 383
 384 This only matters when we start generating cr logical ops.
 385
 386 ===-------------------------------------------------------------------------===
 387
 388 We should compile these two functions to the same thing:
 389
 390 #include <stdlib.h>
 391 void f(int a, int b, int *P) {
 392   *P = (a-b)>=0?(a-b):(b-a);
 393 }
 394 void g(int a, int b, int *P) {
 395   *P = abs(a-b);
 396 }
 397
 398 Further, they should compile to something better than:
 399
 400 _g:
 401         subf r2, r4, r3
 402         subfic r3, r2, 0
 403         cmpwi cr0, r2, -1
 404         bgt cr0, LBB2_2 ; entry
 405 LBB2_1: ; entry
 406         mr r2, r3
 407 LBB2_2: ; entry
 408         stw r2, 0(r5)
 409         blr
 410
 411 GCC produces:
 412
 413 _g:
 414         subf r4,r4,r3
 415         srawi r2,r4,31
 416         xor r0,r2,r4
 417         subf r0,r2,r0
 418         stw r0,0(r5)
 419         blr
 420
 421 ... which is much nicer.
 422
 423 This theoretically may help improve twolf slightly (used in dimbox.c:142?).
 424
 425 ===-------------------------------------------------------------------------===
 426
 427 int foo(int N, int ***W, int **TK, int X) {
 428   int t, i;
 429
 430   for (t = 0; t < N; ++t)
 431     for (i = 0; i < 4; ++i)
 432       W[t / X][i][t % X] = TK[i][t];
 433
 434   return 5;
 435 }
 436
 437 We generate relatively atrocious code for this loop compared to gcc.
 438
 439 We could also strength reduce the rem and the div:
 440 http://www.lcs.mit.edu/pubs/pdf/MIT-LCS-TM-600.pdf
 441
 442 ===-------------------------------------------------------------------------===
 443
 444 float foo(float X) { return (int)(X); }
 445
 446 Currently produces:
 447
 448 _foo:
 449         fctiwz f0, f1
 450         stfd f0, -8(r1)
 451         lwz r2, -4(r1)
 452         extsw r2, r2
 453         std r2, -16(r1)
 454         lfd f0, -16(r1)
 455         fcfid f0, f0
 456         frsp f1, f0
 457         blr
 458
 459 We could use a target dag combine to turn the lwz/extsw into an lwa when the
 460 lwz has a single use.  Since LWA is cracked anyway, this would be a codesize
 461 win only.
 462
 463 ===-------------------------------------------------------------------------===
 464
 465 We generate ugly code for this:
 466
 467 void func(unsigned int *ret, float dx, float dy, float dz, float dw) {
 468   unsigned code = 0;
 469   if(dx < -dw) code |= 1;
 470   if(dx > dw)  code |= 2;
 471   if(dy < -dw) code |= 4;
 472   if(dy > dw)  code |= 8;
 473   if(dz < -dw) code |= 16;
 474   if(dz > dw)  code |= 32;
 475   *ret = code;
 476 }
 477
 478 ===-------------------------------------------------------------------------===
 479
 480 Complete the signed i32 to FP conversion code using 64-bit registers
 481 transformation, good for PI.  See PPCISelLowering.cpp, this comment:
 482
 483      // FIXME: disable this lowered code.  This generates 64-bit register values,
 484      // and we don't model the fact that the top part is clobbered by calls.  We
 485      // need to flag these together so that the value isn't live across a call.
 486      //setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
 487
 488 Also, if the registers are spilled to the stack, we have to ensure that all
 489 64 bits of them are saved/restored; otherwise we will miscompile the code.  It
 490 sounds like we need to get the 64-bit register classes going.
 491
 492 ===-------------------------------------------------------------------------===
 493
 494 %struct.B = type { ubyte, [3 x ubyte] }
 495
 496 void %foo(%struct.B* %b) {
 497 entry:
 498         %tmp = cast %struct.B* %b to uint*              ; <uint*> [#uses=1]
 499         %tmp = load uint* %tmp          ; <uint> [#uses=1]
 500         %tmp3 = cast %struct.B* %b to uint*             ; <uint*> [#uses=1]
 501         %tmp4 = load uint* %tmp3                ; <uint> [#uses=1]
 502         %tmp8 = cast %struct.B* %b to uint*             ; <uint*> [#uses=2]
 503         %tmp9 = load uint* %tmp8                ; <uint> [#uses=1]
 504         %tmp4.mask17 = shl uint %tmp4, ubyte 1          ; <uint> [#uses=1]
 505         %tmp1415 = and uint %tmp4.mask17, 2147483648            ; <uint> [#uses=1]
 506         %tmp.masked = and uint %tmp, 2147483648         ; <uint> [#uses=1]
 507         %tmp11 = or uint %tmp1415, %tmp.masked          ; <uint> [#uses=1]
 508         %tmp12 = and uint %tmp9, 2147483647             ; <uint> [#uses=1]
 509         %tmp13 = or uint %tmp12, %tmp11         ; <uint> [#uses=1]
 510         store uint %tmp13, uint* %tmp8
 511         ret void
 512 }
 513
 514 We emit:
 515
 516 _foo:
 517         lwz r2, 0(r3)
 518         slwi r4, r2, 1
 519         or r4, r4, r2
 520         rlwimi r2, r4, 0, 0, 0
 521         stw r2, 0(r3)
 522         blr
 523
 524 We could collapse a bunch of those ORs and ANDs and generate the following
 525 equivalent code:
 526
 527 _foo:
 528         lwz r2, 0(r3)
 529         rlwinm r4, r2, 1, 0, 0
 530         or r2, r2, r4
 531         stw r2, 0(r3)
 532         blr
 533
 534 ===-------------------------------------------------------------------------===
 535
 536 On PPC64, this results in a truncate followed by a truncstore.  These should
 537 be folded together.
 538
 539 unsigned short G;
 540 void foo(unsigned long H) { G = H; }
 541