lib/Target/ARM/README-Thumb.txt

   1 //===---------------------------------------------------------------------===//
   2 // Random ideas for the ARM backend (Thumb specific).
   3 //===---------------------------------------------------------------------===//
   4
   5 * Add support for compiling functions in both ARM and Thumb mode, then taking
   6   the smallest.
   7 * Add support for compiling individual basic blocks in thumb mode, when in a
   8   larger ARM function.  This can be used for presumed cold code, like paths
   9   to abort (failure path of asserts), EH handling code, etc.
  10
  11 * Thumb doesn't have normal pre/post increment addressing modes, but you can
  12   load/store 32-bit integers with pre/postinc by using load/store multiple
  13   instrs with a single register.
  14
  15 * Make better use of high registers r8, r10, r11, r12 (ip). Some variants of add
  16   and cmp instructions can use high registers. Also, we can use them as
  17   temporaries to spill values into.
  18
  19 * In thumb mode, short, byte, and bool preferred alignments are currently set
  20   to 4 to accommodate an ISA restriction (i.e. in add sp, #imm, imm must be a
  21   multiple of 4).
  22
  23 //===---------------------------------------------------------------------===//
  24
  25 Potential jumptable improvements:
  26
  27 * If we know function size is less than (1 << 16) * 2 bytes, we can use 16-bit
  28   jumptable entries (e.g. (L1 - L2) >> 1). Or even smaller entries if the
  29   function is even smaller. This also applies to ARM.
  30
  31 * Thumb jumptable codegen can improve given some help from the assembler. This
  32   is what we generate right now:
  33
  34         .set PCRELV0, (LJTI1_0_0-(LPCRELL0+4))
  35 LPCRELL0:
  36         mov r1, #PCRELV0
  37         add r1, pc
  38         ldr r0, [r0, r1]
  39         cpy pc, r0
  40         .align  2
  41 LJTI1_0_0:
  42         .long    LBB1_3
  43         ...
  44
  45 Note there is another pc relative add that we can take advantage of.
  46      add r1, pc, #imm_8 * 4
  47
  48 We should be able to generate:
  49
  50 LPCRELL0:
  51         add r1, LJTI1_0_0
  52         ldr r0, [r0, r1]
  53         cpy pc, r0
  54         .align  2
  55 LJTI1_0_0:
  56         .long    LBB1_3
  57
  58 if the assembler can translate the add to:
  59        add r1, pc, #((LJTI1_0_0-(LPCRELL0+4))&0xfffffffc)
  60
  61 Note the assembler also does something similar for constpool loads:
  62 LPCRELL0:
  63      ldr r0, LCPI1_0
  64 =>
  65      ldr r0, pc, #((LCPI1_0-(LPCRELL0+4))&0xfffffffc)
  66
  67
  68 //===---------------------------------------------------------------------===//
  69
  70 We compile the following:
  71
  72 define i16 @func_entry_2E_ce(i32 %i) {
  73         switch i32 %i, label %bb12.exitStub [
  74                  i32 0, label %bb4.exitStub
  75                  i32 1, label %bb9.exitStub
  76                  i32 2, label %bb4.exitStub
  77                  i32 3, label %bb4.exitStub
  78                  i32 7, label %bb9.exitStub
  79                  i32 8, label %bb.exitStub
  80                  i32 9, label %bb9.exitStub
  81         ]
  82
  83 bb12.exitStub:
  84         ret i16 0
  85
  86 bb4.exitStub:
  87         ret i16 1
  88
  89 bb9.exitStub:
  90         ret i16 2
  91
  92 bb.exitStub:
  93         ret i16 3
  94 }
  95
  96 into:
  97
  98 _func_entry_2E_ce:
  99         mov r2, #1
 100         lsl r2, r0
 101         cmp r0, #9
 102         bhi LBB1_4      @bb12.exitStub
 103 LBB1_1: @newFuncRoot
 104         mov r1, #13
 105         tst r2, r1
 106         bne LBB1_5      @bb4.exitStub
 107 LBB1_2: @newFuncRoot
 108         ldr r1, LCPI1_0
 109         tst r2, r1
 110         bne LBB1_6      @bb9.exitStub
 111 LBB1_3: @newFuncRoot
 112         mov r1, #1
 113         lsl r1, r1, #8
 114         tst r2, r1
 115         bne LBB1_7      @bb.exitStub
 116 LBB1_4: @bb12.exitStub
 117         mov r0, #0
 118         bx lr
 119 LBB1_5: @bb4.exitStub
 120         mov r0, #1
 121         bx lr
 122 LBB1_6: @bb9.exitStub
 123         mov r0, #2
 124         bx lr
 125 LBB1_7: @bb.exitStub
 126         mov r0, #3
 127         bx lr
 128 LBB1_8:
 129         .align  2
 130 LCPI1_0:
 131         .long   642
 132
 133
 134 gcc compiles to:
 135
 136         cmp     r0, #9
 137         @ lr needed for prologue
 138         bhi     L2
 139         ldr     r3, L11
 140         mov     r2, #1
 141         mov     r1, r2, asl r0
 142         ands    r0, r3, r2, asl r0
 143         movne   r0, #2
 144         bxne    lr
 145         tst     r1, #13
 146         beq     L9
 147 L3:
 148         mov     r0, r2
 149         bx      lr
 150 L9:
 151         tst     r1, #256
 152         movne   r0, #3
 153         bxne    lr
 154 L2:
 155         mov     r0, #0
 156         bx      lr
 157 L12:
 158         .align 2
 159 L11:
 160         .long   642
 161
 162
 163 GCC is doing a couple of clever things here:
 164   1. It is predicating one of the returns.  This isn't a clear win though: in
 165      cases where that return isn't taken, it is replacing one condbranch with
 166      two 'ne' predicated instructions.
 167   2. It is sinking the shift of "1 << i" into the tst, and using ands instead of
 168      tst.  This will probably require whole function isel.
 169   3. GCC emits:
 170         tst     r1, #256
 171      we emit:
 172         mov r1, #1
 173         lsl r1, r1, #8
 174         tst r2, r1
 175
 176
 177 //===---------------------------------------------------------------------===//
 178
 179 When spilling in thumb mode and the sp offset is too large to fit in the ldr /
 180 str offset field, we load the offset from a constpool entry and add it to sp:
 181
 182 ldr r2, LCPI
 183 add r2, sp
 184 ldr r2, [r2]
 185
 186 These instructions preserve the condition code, which is important if the spill
 187 is between a cmp and a bcc instruction. However, we can use the (potentially)
 188 cheaper sequence if we know it's ok to clobber the condition register.
 189
 190 add r2, sp, #255 * 4
 191 add r2, #132
 192 ldr r2, [r2, #7 * 4]
 193
 194 This is especially bad when dynamic alloca is used. All the fixed size stack
 195 objects are referenced off the frame pointer with negative offsets. See
 196 oggenc for an example.
 197
 198 //===---------------------------------------------------------------------===//
 199
 200 We are reserving R3 as a scratch register under thumb mode. So if it is live in
 201 to the function, we save / restore R3 to / from R12. Until register scavenging
 202 is done, we should save R3 to a high callee saved reg at emitPrologue time
 203 (when hasFP is true or stack size is large) and restore R3 from that register
 204 instead. This allows us to at least get rid of the save to r12 every time it is
 205 used.
 206
 207 //===---------------------------------------------------------------------===//
 208
 209 Poor codegen test/CodeGen/ARM/select.ll f7:
 210
 211         ldr r5, LCPI1_0
 212 LPC0:
 213         add r5, pc
 214         ldr r6, LCPI1_1
 215         ldr r2, LCPI1_2
 216         cpy r3, r6
 217         cpy lr, pc
 218         bx r5
 219
 220 //===---------------------------------------------------------------------===//
 221
 222 Make register allocator / spiller smarter so we can re-materialize "mov r, imm",
 223 etc. Almost all Thumb instructions clobber condition code.