lib/Target/ARM/README.txt

   1 //===---------------------------------------------------------------------===//
   2 // Random ideas for the ARM backend.
   3 //===---------------------------------------------------------------------===//
   4
   5 Reimplement 'select' in terms of 'SEL'.
   6
   7 * We would really like to support UXTAB16, but we need to prove that the
   8   add doesn't need to overflow between the two 16-bit chunks.
   9
  10 * implement predication support
  11 * Implement pre/post increment support.  (e.g. PR935)
  12 * Coalesce stack slots!
  13 * Implement smarter constant generation for binops with large immediates.
  14
  15 * Consider materializing FP constants like 0.0f and 1.0f using integer
  16   immediate instructions then copy to FPU.  Slower than load into FPU?
  17
  18 //===---------------------------------------------------------------------===//
  19
  20 The constant island pass is in good shape.  Some cleanups might be desirable,
  21 but there is unlikely to be much improvement in the generated code.
  22
  23 1.  There may be some advantage to trying to be smarter about the initial
  24 placement, rather than putting everything at the end.
  25
  26 2.  The handling of 2-byte padding for Thumb is overly conservative.  There
  27 would be a small gain to keeping accurate track of the padding (which would
  28 require aligning functions containing constant pools to 4-byte boundaries).
  29
  30 3.  There might be some compile-time efficiency to be had by representing
  31 consecutive islands as a single block rather than multiple blocks.
  32
  33 //===---------------------------------------------------------------------===//
  34
  35 We need to start generating predicated instructions.  The .td files have a way
  36 to express this now (see the PPC conditional return instruction), but the
  37 branch folding pass (or a new if-cvt pass) should start producing these, at
  38 least in the trivial case.
  39
  40 Among the obvious wins, doing so can eliminate the need to custom expand
  41 copysign (i.e. we won't need to custom expand it to get the conditional
  42 negate).
  43
  44 This allows us to eliminate one instruction from:
  45
  46 define i32 @_Z6slow4bii(i32 %x, i32 %y) {
  47         %tmp = icmp sgt i32 %x, %y
  48         %retval = select i1 %tmp, i32 %x, i32 %y
  49         ret i32 %retval
  50 }
  51
  52 __Z6slow4bii:
  53         cmp r0, r1
  54         movgt r1, r0
  55         mov r0, r1
  56         bx lr
  57
  58 //===---------------------------------------------------------------------===//
  59
  60 Implement long long "X-3" with instructions that fold the immediate in.  These
  61 were disabled due to badness with the ARM carry flag on subtracts.
  62
  63 //===---------------------------------------------------------------------===//
  64
  65 We currently compile abs:
  66 int foo(int p) { return p < 0 ? -p : p; }
  67
  68 into:
  69
  70 _foo:
  71         rsb r1, r0, #0
  72         cmn r0, #1
  73         movgt r1, r0
  74         mov r0, r1
  75         bx lr
  76
  77 This is very, uh, literal.  This could be a 3 operation sequence:
  78   t = (p sra 31);
  79   res = (p xor t)-t
  80
  81 Which would be better.  This occurs in png decode.
  82
  83 //===---------------------------------------------------------------------===//
  84
  85 More load / store optimizations:
  86 1) Look past instructions without side-effects (not load, store, branch, etc.)
  87    when forming the list of loads / stores to optimize.
  88
  89 2) Smarter register allocation?
  90 We are probably missing some opportunities to use ldm / stm. Consider:
  91
  92 ldr r5, [r0]
  93 ldr r4, [r0, #4]
  94
  95 This cannot be merged into a ldm. Perhaps we will need to do the transformation
  96 before register allocation. Then teach the register allocator to allocate a
  97 chunk of consecutive registers.
  98
  99 3) Better representation for block transfer? This is from Olden/power:
 100
 101         fldd d0, [r4]
 102         fstd d0, [r4, #+32]
 103         fldd d0, [r4, #+8]
 104         fstd d0, [r4, #+40]
 105         fldd d0, [r4, #+16]
 106         fstd d0, [r4, #+48]
 107         fldd d0, [r4, #+24]
 108         fstd d0, [r4, #+56]
 109
 110 If we can spare the registers, it would be better to use fldm and fstm here.
 111 Need major register allocator enhancement though.
 112
 113 4) Can we recognize the relative position of constantpool entries? i.e. Treat
 114
 115         ldr r0, LCPI17_3
 116         ldr r1, LCPI17_4
 117         ldr r2, LCPI17_5
 118
 119    as
 120         ldr r0, LCPI17
 121         ldr r1, LCPI17+4
 122         ldr r2, LCPI17+8
 123
 124    Then the ldr's can be combined into a single ldm. See Olden/power.
 125
 126 Note for ARM v4 gcc uses ldmia to load a pair of 32-bit values to represent a
 127 double 64-bit FP constant:
 128
 129         adr     r0, L6
 130         ldmia   r0, {r0-r1}
 131
 132         .align 2
 133 L6:
 134         .long   -858993459
 135         .long   1074318540
 136
 137 5) Can we make use of ldrd and strd? Instead of generating ldm / stm, use
 138 ldrd/strd instead if there are only two destination registers that form an
 139 odd/even pair. However, we probably would pay a penalty if the address is not
 140 aligned on 8-byte boundary. This requires more information on load / store
 141 nodes (and MI's?) then we currently carry.
 142
 143 //===---------------------------------------------------------------------===//
 144
 145 * Consider this silly example:
 146
 147 double bar(double x) {
 148   double r = foo(3.1);
 149   return x+r;
 150 }
 151
 152 _bar:
 153         sub sp, sp, #16
 154         str r4, [sp, #+12]
 155         str r5, [sp, #+8]
 156         str lr, [sp, #+4]
 157         mov r4, r0
 158         mov r5, r1
 159         ldr r0, LCPI2_0
 160         bl _foo
 161         fmsr f0, r0
 162         fcvtsd d0, f0
 163         fmdrr d1, r4, r5
 164         faddd d0, d0, d1
 165         fmrrd r0, r1, d0
 166         ldr lr, [sp, #+4]
 167         ldr r5, [sp, #+8]
 168         ldr r4, [sp, #+12]
 169         add sp, sp, #16
 170         bx lr
 171
 172 Ignore the prologue and epilogue stuff for a second. Note
 173         mov r4, r0
 174         mov r5, r1
 175 the copys to callee-save registers and the fact they are only being used by the
 176 fmdrr instruction. It would have been better had the fmdrr been scheduled
 177 before the call and place the result in a callee-save DPR register. The two
 178 mov ops would not have been necessary.
 179
 180 //===---------------------------------------------------------------------===//
 181
 182 Calling convention related stuff:
 183
 184 * gcc's parameter passing implementation is terrible and we suffer as a result:
 185
 186 e.g.
 187 struct s {
 188   double d1;
 189   int s1;
 190 };
 191
 192 void foo(struct s S) {
 193   printf("%g, %d\n", S.d1, S.s1);
 194 }
 195
 196 'S' is passed via registers r0, r1, r2. But gcc stores them to the stack, and
 197 then reload them to r1, r2, and r3 before issuing the call (r0 contains the
 198 address of the format string):
 199
 200         stmfd   sp!, {r7, lr}
 201         add     r7, sp, #0
 202         sub     sp, sp, #12
 203         stmia   sp, {r0, r1, r2}
 204         ldmia   sp, {r1-r2}
 205         ldr     r0, L5
 206         ldr     r3, [sp, #8]
 207 L2:
 208         add     r0, pc, r0
 209         bl      L_printf$stub
 210
 211 Instead of a stmia, ldmia, and a ldr, wouldn't it be better to do three moves?
 212
 213 * Return an aggregate type is even worse:
 214
 215 e.g.
 216 struct s foo(void) {
 217   struct s S = {1.1, 2};
 218   return S;
 219 }
 220
 221         mov     ip, r0
 222         ldr     r0, L5
 223         sub     sp, sp, #12
 224 L2:
 225         add     r0, pc, r0
 226         @ lr needed for prologue
 227         ldmia   r0, {r0, r1, r2}
 228         stmia   sp, {r0, r1, r2}
 229         stmia   ip, {r0, r1, r2}
 230         mov     r0, ip
 231         add     sp, sp, #12
 232         bx      lr
 233
 234 r0 (and later ip) is the hidden parameter from caller to store the value in. The
 235 first ldmia loads the constants into r0, r1, r2. The last stmia stores r0, r1,
 236 r2 into the address passed in. However, there is one additional stmia that
 237 stores r0, r1, and r2 to some stack location. The store is dead.
 238
 239 The llvm-gcc generated code looks like this:
 240
 241 csretcc void %foo(%struct.s* %agg.result) {
 242 entry:
 243         %S = alloca %struct.s, align 4          ; <%struct.s*> [#uses=1]
 244         %memtmp = alloca %struct.s              ; <%struct.s*> [#uses=1]
 245         cast %struct.s* %S to sbyte*            ; <sbyte*>:0 [#uses=2]
 246         call void %llvm.memcpy.i32( sbyte* %0, sbyte* cast ({ double, int }* %C.0.904 to sbyte*), uint 12, uint 4 )
 247         cast %struct.s* %agg.result to sbyte*           ; <sbyte*>:1 [#uses=2]
 248         call void %llvm.memcpy.i32( sbyte* %1, sbyte* %0, uint 12, uint 0 )
 249         cast %struct.s* %memtmp to sbyte*               ; <sbyte*>:2 [#uses=1]
 250         call void %llvm.memcpy.i32( sbyte* %2, sbyte* %1, uint 12, uint 0 )
 251         ret void
 252 }
 253
 254 llc ends up issuing two memcpy's (the first memcpy becomes 3 loads from
 255 constantpool). Perhaps we should 1) fix llvm-gcc so the memcpy is translated
 256 into a number of load and stores, or 2) custom lower memcpy (of small size) to
 257 be ldmia / stmia. I think option 2 is better but the current register
 258 allocator cannot allocate a chunk of registers at a time.
 259
 260 A feasible temporary solution is to use specific physical registers at the
 261 lowering time for small (<= 4 words?) transfer size.
 262
 263 * ARM CSRet calling convention requires the hidden argument to be returned by
 264 the callee.
 265
 266 //===---------------------------------------------------------------------===//
 267
 268 We can definitely do a better job on BB placements to eliminate some branches.
 269 It's very common to see llvm generated assembly code that looks like this:
 270
 271 LBB3:
 272  ...
 273 LBB4:
 274 ...
 275   beq LBB3
 276   b LBB2
 277
 278 If BB4 is the only predecessor of BB3, then we can emit BB3 after BB4. We can
 279 then eliminate beq and and turn the unconditional branch to LBB2 to a bne.
 280
 281 See McCat/18-imp/ComputeBoundingBoxes for an example.
 282
 283 //===---------------------------------------------------------------------===//
 284
 285 We need register scavenging.  Currently, the 'ip' register is reserved in case
 286 frame indexes are too big.  This means that we generate extra code for stuff
 287 like this:
 288
 289 void foo(unsigned x, unsigned y, unsigned z, unsigned *a, unsigned *b, unsigned *c) {
 290    short Rconst = (short) (16384.0f * 1.40200 + 0.5 );
 291    *a = x * Rconst;
 292    *b = y * Rconst;
 293    *c = z * Rconst;
 294 }
 295
 296 we compile it to:
 297
 298 _foo:
 299 ***     stmfd sp!, {r4, r7}
 300 ***     add r7, sp, #4
 301         mov r4, #186
 302         orr r4, r4, #89, 24 @ 22784
 303         mul r0, r0, r4
 304         str r0, [r3]
 305         mul r0, r1, r4
 306         ldr r1, [sp, #+8]
 307         str r0, [r1]
 308         mul r0, r2, r4
 309         ldr r1, [sp, #+12]
 310         str r0, [r1]
 311 ***     sub sp, r7, #4
 312 ***     ldmfd sp!, {r4, r7}
 313         bx lr
 314
 315 GCC produces:
 316
 317 _foo:
 318         ldr     ip, L4
 319         mul     r0, ip, r0
 320         mul     r1, ip, r1
 321         str     r0, [r3, #0]
 322         ldr     r3, [sp, #0]
 323         mul     r2, ip, r2
 324         str     r1, [r3, #0]
 325         ldr     r3, [sp, #4]
 326         str     r2, [r3, #0]
 327         bx      lr
 328 L4:
 329         .long   22970
 330
 331 This is apparently all because we couldn't use ip here.
 332
 333 //===---------------------------------------------------------------------===//
 334
 335 Pre-/post- indexed load / stores:
 336
 337 1) We should not make the pre/post- indexed load/store transform if the base ptr
 338 is guaranteed to be live beyond the load/store. This can happen if the base
 339 ptr is live out of the block we are performing the optimization. e.g.
 340
 341 mov r1, r2
 342 ldr r3, [r1], #4
 343 ...
 344
 345 vs.
 346
 347 ldr r3, [r2]
 348 add r1, r2, #4
 349 ...
 350
 351 In most cases, this is just a wasted optimization. However, sometimes it can
 352 negatively impact the performance because two-address code is more restrictive
 353 when it comes to scheduling.
 354
 355 Unfortunately, liveout information is currently unavailable during DAG combine
 356 time.
 357
 358 2) Consider spliting a indexed load / store into a pair of add/sub + load/store
 359    to solve #1 (in TwoAddressInstructionPass.cpp).
 360
 361 3) Enhance LSR to generate more opportunities for indexed ops.
 362
 363 4) Once we added support for multiple result patterns, write indexed loads
 364    patterns instead of C++ instruction selection code.
 365
 366 5) Use FLDM / FSTM to emulate indexed FP load / store.
 367
 368 //===---------------------------------------------------------------------===//
 369
 370 We should add i64 support to take advantage of the 64-bit load / stores.
 371 We can add a pseudo i64 register class containing pseudo registers that are
 372 register pairs. All other ops (e.g. add, sub) would be expanded as usual.
 373
 374 We need to add pseudo instructions (i.e. gethi / getlo) to extract i32 registers
 375 from the i64 register. These are single moves which can be eliminated if the
 376 destination register is a sub-register of the source. We should implement proper
 377 subreg support in the register allocator to coalesce these away.
 378
 379 There are other minor issues such as multiple instructions for a spill / restore
 380 / move.
 381
 382 //===---------------------------------------------------------------------===//
 383
 384 Implement support for some more tricky ways to materialize immediates.  For
 385 example, to get 0xffff8000, we can use:
 386
 387 mov r9, #&3f8000
 388 sub r9, r9, #&400000
 389
 390 //===---------------------------------------------------------------------===//
 391
 392 We sometimes generate multiple add / sub instructions to update sp in prologue
 393 and epilogue if the inc / dec value is too large to fit in a single immediate
 394 operand. In some cases, perhaps it might be better to load the value from a
 395 constantpool instead.
 396
 397 //===---------------------------------------------------------------------===//
 398
 399 GCC generates significantly better code for this function.
 400
 401 int foo(int StackPtr, unsigned char *Line, unsigned char *Stack, int LineLen) {
 402     int i = 0;
 403
 404     if (StackPtr != 0) {
 405        while (StackPtr != 0 && i < (((LineLen) < (32768))? (LineLen) : (32768)))
 406           Line[i++] = Stack[--StackPtr];
 407         if (LineLen > 32768)
 408         {
 409             while (StackPtr != 0 && i < LineLen)
 410             {
 411                 i++;
 412                 --StackPtr;
 413             }
 414         }
 415     }
 416     return StackPtr;
 417 }
 418
 419 //===---------------------------------------------------------------------===//
 420
 421 This should compile to the mlas instruction:
 422 int mlas(int x, int y, int z) { return ((x * y + z) < 0) ? 7 : 13; }
 423
 424 //===---------------------------------------------------------------------===//
 425
 426 At some point, we should triage these to see if they still apply to us:
 427
 428 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=19598
 429 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=18560
 430 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=27016
 431
 432 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=11831
 433 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=11826
 434 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=11825
 435 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=11824
 436 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=11823
 437 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=11820
 438 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=10982
 439
 440 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=10242
 441 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9831
 442 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9760
 443 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9759
 444 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9703
 445 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9702
 446 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9663
 447
 448 http://www.inf.u-szeged.hu/gcc-arm/
 449 http://citeseer.ist.psu.edu/debus04linktime.html
 450
 451 //===---------------------------------------------------------------------===//