lib/Target/ARM/README.txt

   1 //===---------------------------------------------------------------------===//
   2 // Random ideas for the ARM backend.
   3 //===---------------------------------------------------------------------===//
   4
   5 Reimplement 'select' in terms of 'SEL'.
   6
   7 * We would really like to support UXTAB16, but we need to prove that the
   8   add doesn't need to overflow between the two 16-bit chunks.
   9
  10 * implement predication support
  11 * Implement pre/post increment support.  (e.g. PR935)
  12 * Coalesce stack slots!
  13 * Implement smarter constant generation for binops with large immediates.
  14
  15 * Consider materializing FP constants like 0.0f and 1.0f using integer
  16   immediate instructions then copy to FPU.  Slower than load into FPU?
  17
  18 //===---------------------------------------------------------------------===//
  19
  20 The constant island pass has been much improved; all the todo items in the
  21 previous version of this document have been addressed.  However, there are still
  22 things that can be done:
  23
  24 1.  When there isn't an existing water, the current MBB is split right after
  25 the use.  It would be profitable to look farther forward, especially on Thumb,
  26 where negative offsets won't work.
  27 Now it will put the island at the end of the block if that is in range.  If it
  28 is not in range things still work as above, which is poor on Thumb.
  29
  30 2.  There may be some advantage to trying to be smarter about the initial
  31 placement, rather than putting everything at the end.
  32
  33 3.  The handling of 2-byte padding for Thumb is overly conservative.  There
  34 would be a small gain to keeping accurate track of the padding (which would
  35 require aligning functions containing constant pools to 4-byte boundaries).
  36
  37 //===---------------------------------------------------------------------===//
  38
  39 We need to start generating predicated instructions.  The .td files have a way
  40 to express this now (see the PPC conditional return instruction), but the
  41 branch folding pass (or a new if-cvt pass) should start producing these, at
  42 least in the trivial case.
  43
  44 Among the obvious wins, doing so can eliminate the need to custom expand
  45 copysign (i.e. we won't need to custom expand it to get the conditional
  46 negate).
  47
  48 This allows us to eliminate one instruction from:
  49
  50 define i32 @_Z6slow4bii(i32 %x, i32 %y) {
  51         %tmp = icmp sgt i32 %x, %y
  52         %retval = select i1 %tmp, i32 %x, i32 %y
  53         ret i32 %retval
  54 }
  55
  56 __Z6slow4bii:
  57         cmp r0, r1
  58         movgt r1, r0
  59         mov r0, r1
  60         bx lr
  61
  62 //===---------------------------------------------------------------------===//
  63
  64 Implement long long "X-3" with instructions that fold the immediate in.  These
  65 were disabled due to badness with the ARM carry flag on subtracts.
  66
  67 //===---------------------------------------------------------------------===//
  68
  69 We currently compile abs:
  70 int foo(int p) { return p < 0 ? -p : p; }
  71
  72 into:
  73
  74 _foo:
  75         rsb r1, r0, #0
  76         cmn r0, #1
  77         movgt r1, r0
  78         mov r0, r1
  79         bx lr
  80
  81 This is very, uh, literal.  This could be a 3 operation sequence:
  82   t = (p sra 31);
  83   res = (p xor t)-t
  84
  85 Which would be better.  This occurs in png decode.
  86
  87 //===---------------------------------------------------------------------===//
  88
  89 More load / store optimizations:
  90 1) Look past instructions without side-effects (not load, store, branch, etc.)
  91    when forming the list of loads / stores to optimize.
  92
  93 2) Smarter register allocation?
  94 We are probably missing some opportunities to use ldm / stm. Consider:
  95
  96 ldr r5, [r0]
  97 ldr r4, [r0, #4]
  98
  99 This cannot be merged into a ldm. Perhaps we will need to do the transformation
 100 before register allocation. Then teach the register allocator to allocate a
 101 chunk of consecutive registers.
 102
 103 3) Better representation for block transfer? This is from Olden/power:
 104
 105         fldd d0, [r4]
 106         fstd d0, [r4, #+32]
 107         fldd d0, [r4, #+8]
 108         fstd d0, [r4, #+40]
 109         fldd d0, [r4, #+16]
 110         fstd d0, [r4, #+48]
 111         fldd d0, [r4, #+24]
 112         fstd d0, [r4, #+56]
 113
 114 If we can spare the registers, it would be better to use fldm and fstm here.
 115 Need major register allocator enhancement though.
 116
 117 4) Can we recognize the relative position of constantpool entries? i.e. Treat
 118
 119         ldr r0, LCPI17_3
 120         ldr r1, LCPI17_4
 121         ldr r2, LCPI17_5
 122
 123    as
 124         ldr r0, LCPI17
 125         ldr r1, LCPI17+4
 126         ldr r2, LCPI17+8
 127
 128    Then the ldr's can be combined into a single ldm. See Olden/power.
 129
 130 Note for ARM v4 gcc uses ldmia to load a pair of 32-bit values to represent a
 131 double 64-bit FP constant:
 132
 133         adr     r0, L6
 134         ldmia   r0, {r0-r1}
 135
 136         .align 2
 137 L6:
 138         .long   -858993459
 139         .long   1074318540
 140
 141 5) Can we make use of ldrd and strd? Instead of generating ldm / stm, use
 142 ldrd/strd instead if there are only two destination registers that form an
 143 odd/even pair. However, we probably would pay a penalty if the address is not
 144 aligned on 8-byte boundary. This requires more information on load / store
 145 nodes (and MI's?) then we currently carry.
 146
 147 //===---------------------------------------------------------------------===//
 148
 149 * Consider this silly example:
 150
 151 double bar(double x) {
 152   double r = foo(3.1);
 153   return x+r;
 154 }
 155
 156 _bar:
 157         sub sp, sp, #16
 158         str r4, [sp, #+12]
 159         str r5, [sp, #+8]
 160         str lr, [sp, #+4]
 161         mov r4, r0
 162         mov r5, r1
 163         ldr r0, LCPI2_0
 164         bl _foo
 165         fmsr f0, r0
 166         fcvtsd d0, f0
 167         fmdrr d1, r4, r5
 168         faddd d0, d0, d1
 169         fmrrd r0, r1, d0
 170         ldr lr, [sp, #+4]
 171         ldr r5, [sp, #+8]
 172         ldr r4, [sp, #+12]
 173         add sp, sp, #16
 174         bx lr
 175
 176 Ignore the prologue and epilogue stuff for a second. Note
 177         mov r4, r0
 178         mov r5, r1
 179 the copys to callee-save registers and the fact they are only being used by the
 180 fmdrr instruction. It would have been better had the fmdrr been scheduled
 181 before the call and place the result in a callee-save DPR register. The two
 182 mov ops would not have been necessary.
 183
 184 //===---------------------------------------------------------------------===//
 185
 186 Calling convention related stuff:
 187
 188 * gcc's parameter passing implementation is terrible and we suffer as a result:
 189
 190 e.g.
 191 struct s {
 192   double d1;
 193   int s1;
 194 };
 195
 196 void foo(struct s S) {
 197   printf("%g, %d\n", S.d1, S.s1);
 198 }
 199
 200 'S' is passed via registers r0, r1, r2. But gcc stores them to the stack, and
 201 then reload them to r1, r2, and r3 before issuing the call (r0 contains the
 202 address of the format string):
 203
 204         stmfd   sp!, {r7, lr}
 205         add     r7, sp, #0
 206         sub     sp, sp, #12
 207         stmia   sp, {r0, r1, r2}
 208         ldmia   sp, {r1-r2}
 209         ldr     r0, L5
 210         ldr     r3, [sp, #8]
 211 L2:
 212         add     r0, pc, r0
 213         bl      L_printf$stub
 214
 215 Instead of a stmia, ldmia, and a ldr, wouldn't it be better to do three moves?
 216
 217 * Return an aggregate type is even worse:
 218
 219 e.g.
 220 struct s foo(void) {
 221   struct s S = {1.1, 2};
 222   return S;
 223 }
 224
 225         mov     ip, r0
 226         ldr     r0, L5
 227         sub     sp, sp, #12
 228 L2:
 229         add     r0, pc, r0
 230         @ lr needed for prologue
 231         ldmia   r0, {r0, r1, r2}
 232         stmia   sp, {r0, r1, r2}
 233         stmia   ip, {r0, r1, r2}
 234         mov     r0, ip
 235         add     sp, sp, #12
 236         bx      lr
 237
 238 r0 (and later ip) is the hidden parameter from caller to store the value in. The
 239 first ldmia loads the constants into r0, r1, r2. The last stmia stores r0, r1,
 240 r2 into the address passed in. However, there is one additional stmia that
 241 stores r0, r1, and r2 to some stack location. The store is dead.
 242
 243 The llvm-gcc generated code looks like this:
 244
 245 csretcc void %foo(%struct.s* %agg.result) {
 246 entry:
 247         %S = alloca %struct.s, align 4          ; <%struct.s*> [#uses=1]
 248         %memtmp = alloca %struct.s              ; <%struct.s*> [#uses=1]
 249         cast %struct.s* %S to sbyte*            ; <sbyte*>:0 [#uses=2]
 250         call void %llvm.memcpy.i32( sbyte* %0, sbyte* cast ({ double, int }* %C.0.904 to sbyte*), uint 12, uint 4 )
 251         cast %struct.s* %agg.result to sbyte*           ; <sbyte*>:1 [#uses=2]
 252         call void %llvm.memcpy.i32( sbyte* %1, sbyte* %0, uint 12, uint 0 )
 253         cast %struct.s* %memtmp to sbyte*               ; <sbyte*>:2 [#uses=1]
 254         call void %llvm.memcpy.i32( sbyte* %2, sbyte* %1, uint 12, uint 0 )
 255         ret void
 256 }
 257
 258 llc ends up issuing two memcpy's (the first memcpy becomes 3 loads from
 259 constantpool). Perhaps we should 1) fix llvm-gcc so the memcpy is translated
 260 into a number of load and stores, or 2) custom lower memcpy (of small size) to
 261 be ldmia / stmia. I think option 2 is better but the current register
 262 allocator cannot allocate a chunk of registers at a time.
 263
 264 A feasible temporary solution is to use specific physical registers at the
 265 lowering time for small (<= 4 words?) transfer size.
 266
 267 * ARM CSRet calling convention requires the hidden argument to be returned by
 268 the callee.
 269
 270 //===---------------------------------------------------------------------===//
 271
 272 We can definitely do a better job on BB placements to eliminate some branches.
 273 It's very common to see llvm generated assembly code that looks like this:
 274
 275 LBB3:
 276  ...
 277 LBB4:
 278 ...
 279   beq LBB3
 280   b LBB2
 281
 282 If BB4 is the only predecessor of BB3, then we can emit BB3 after BB4. We can
 283 then eliminate beq and and turn the unconditional branch to LBB2 to a bne.
 284
 285 See McCat/18-imp/ComputeBoundingBoxes for an example.
 286
 287 //===---------------------------------------------------------------------===//
 288
 289 We need register scavenging.  Currently, the 'ip' register is reserved in case
 290 frame indexes are too big.  This means that we generate extra code for stuff
 291 like this:
 292
 293 void foo(unsigned x, unsigned y, unsigned z, unsigned *a, unsigned *b, unsigned *c) {
 294    short Rconst = (short) (16384.0f * 1.40200 + 0.5 );
 295    *a = x * Rconst;
 296    *b = y * Rconst;
 297    *c = z * Rconst;
 298 }
 299
 300 we compile it to:
 301
 302 _foo:
 303 ***     stmfd sp!, {r4, r7}
 304 ***     add r7, sp, #4
 305         mov r4, #186
 306         orr r4, r4, #89, 24 @ 22784
 307         mul r0, r0, r4
 308         str r0, [r3]
 309         mul r0, r1, r4
 310         ldr r1, [sp, #+8]
 311         str r0, [r1]
 312         mul r0, r2, r4
 313         ldr r1, [sp, #+12]
 314         str r0, [r1]
 315 ***     sub sp, r7, #4
 316 ***     ldmfd sp!, {r4, r7}
 317         bx lr
 318
 319 GCC produces:
 320
 321 _foo:
 322         ldr     ip, L4
 323         mul     r0, ip, r0
 324         mul     r1, ip, r1
 325         str     r0, [r3, #0]
 326         ldr     r3, [sp, #0]
 327         mul     r2, ip, r2
 328         str     r1, [r3, #0]
 329         ldr     r3, [sp, #4]
 330         str     r2, [r3, #0]
 331         bx      lr
 332 L4:
 333         .long   22970
 334
 335 This is apparently all because we couldn't use ip here.
 336
 337 //===---------------------------------------------------------------------===//
 338
 339 Pre-/post- indexed load / stores:
 340
 341 1) We should not make the pre/post- indexed load/store transform if the base ptr
 342 is guaranteed to be live beyond the load/store. This can happen if the base
 343 ptr is live out of the block we are performing the optimization. e.g.
 344
 345 mov r1, r2
 346 ldr r3, [r1], #4
 347 ...
 348
 349 vs.
 350
 351 ldr r3, [r2]
 352 add r1, r2, #4
 353 ...
 354
 355 In most cases, this is just a wasted optimization. However, sometimes it can
 356 negatively impact the performance because two-address code is more restrictive
 357 when it comes to scheduling.
 358
 359 Unfortunately, liveout information is currently unavailable during DAG combine
 360 time.
 361
 362 2) Consider spliting a indexed load / store into a pair of add/sub + load/store
 363    to solve #1 (in TwoAddressInstructionPass.cpp).
 364
 365 3) Enhance LSR to generate more opportunities for indexed ops.
 366
 367 4) Once we added support for multiple result patterns, write indexed loads
 368    patterns instead of C++ instruction selection code.
 369
 370 5) Use FLDM / FSTM to emulate indexed FP load / store.
 371
 372 //===---------------------------------------------------------------------===//
 373
 374 We should add i64 support to take advantage of the 64-bit load / stores.
 375 We can add a pseudo i64 register class containing pseudo registers that are
 376 register pairs. All other ops (e.g. add, sub) would be expanded as usual.
 377
 378 We need to add pseudo instructions (i.e. gethi / getlo) to extract i32 registers
 379 from the i64 register. These are single moves which can be eliminated if the
 380 destination register is a sub-register of the source. We should implement proper
 381 subreg support in the register allocator to coalesce these away.
 382
 383 There are other minor issues such as multiple instructions for a spill / restore
 384 / move.
 385
 386 //===---------------------------------------------------------------------===//
 387
 388 Implement support for some more tricky ways to materialize immediates.  For
 389 example, to get 0xffff8000, we can use:
 390
 391 mov r9, #&3f8000
 392 sub r9, r9, #&400000
 393
 394 //===---------------------------------------------------------------------===//
 395
 396 We sometimes generate multiple add / sub instructions to update sp in prologue
 397 and epilogue if the inc / dec value is too large to fit in a single immediate
 398 operand. In some cases, perhaps it might be better to load the value from a
 399 constantpool instead.
 400
 401 //===---------------------------------------------------------------------===//
 402
 403 GCC generates significantly better code for this function.
 404
 405 int foo(int StackPtr, unsigned char *Line, unsigned char *Stack, int LineLen) {
 406     int i = 0;
 407
 408     if (StackPtr != 0) {
 409        while (StackPtr != 0 && i < (((LineLen) < (32768))? (LineLen) : (32768)))
 410           Line[i++] = Stack[--StackPtr];
 411         if (LineLen > 32768)
 412         {
 413             while (StackPtr != 0 && i < LineLen)
 414             {
 415                 i++;
 416                 --StackPtr;
 417             }
 418         }
 419     }
 420     return StackPtr;
 421 }
 422
 423 //===---------------------------------------------------------------------===//
 424
 425 This should compile to the mlas instruction:
 426 int mlas(int x, int y, int z) { return ((x * y + z) < 0) ? 7 : 13; }
 427
 428 //===---------------------------------------------------------------------===//
 429
 430 At some point, we should triage these to see if they still apply to us:
 431
 432 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=19598
 433 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=18560
 434 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=27016
 435
 436 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=11831
 437 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=11826
 438 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=11825
 439 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=11824
 440 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=11823
 441 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=11820
 442 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=10982
 443
 444 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=10242
 445 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9831
 446 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9760
 447 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9759
 448 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9703
 449 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9702
 450 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9663
 451
 452 http://www.inf.u-szeged.hu/gcc-arm/
 453 http://citeseer.ist.psu.edu/debus04linktime.html
 454
 455 //===---------------------------------------------------------------------===//