lib/Target/ARM/README-Thumb.txt

   1 //===---------------------------------------------------------------------===//
   2 // Random ideas for the ARM backend (Thumb specific).
   3 //===---------------------------------------------------------------------===//
   4
   5 * Add support for compiling functions in both ARM and Thumb mode, then taking
   6   the smallest.
   7 * Add support for compiling individual basic blocks in thumb mode, when in a
   8   larger ARM function.  This can be used for presumed cold code, like paths
   9   to abort (failure path of asserts), EH handling code, etc.
  10
  11 * Thumb doesn't have normal pre/post increment addressing modes, but you can
  12   load/store 32-bit integers with pre/postinc by using load/store multiple
  13   instrs with a single register.
  14
  15 * Make better use of high registers r8, r10, r11, r12 (ip). Some variants of add
  16   and cmp instructions can use high registers. Also, we can use them as
  17   temporaries to spill values into.
  18
  19 * In thumb mode, short, byte, and bool preferred alignments are currently set
  20   to 4 to accommodate an ISA restriction (i.e. in add sp, #imm, imm must be a
  21   multiple of 4).
  22
  23 //===---------------------------------------------------------------------===//
  24
  25 Potential jumptable improvements:
  26
  27 * If we know function size is less than (1 << 16) * 2 bytes, we can use 16-bit
  28   jumptable entries (e.g. (L1 - L2) >> 1). Or even smaller entries if the
  29   function is even smaller. This also applies to ARM.
  30
  31 * Thumb jumptable codegen can improve given some help from the assembler. This
  32   is what we generate right now:
  33
  34         .set PCRELV0, (LJTI1_0_0-(LPCRELL0+4))
  35 LPCRELL0:
  36         mov r1, #PCRELV0
  37         add r1, pc
  38         ldr r0, [r0, r1]
  39         cpy pc, r0
  40         .align  2
  41 LJTI1_0_0:
  42         .long    LBB1_3
  43         ...
  44
  45 Note there is another pc relative add that we can take advantage of.
  46      add r1, pc, #imm_8 * 4
  47
  48 We should be able to generate:
  49
  50 LPCRELL0:
  51         add r1, LJTI1_0_0
  52         ldr r0, [r0, r1]
  53         cpy pc, r0
  54         .align  2
  55 LJTI1_0_0:
  56         .long    LBB1_3
  57
  58 if the assembler can translate the add to:
  59        add r1, pc, #((LJTI1_0_0-(LPCRELL0+4))&0xfffffffc)
  60
  61 Note the assembler also does something similar for constpool loads:
  62 LPCRELL0:
  63      ldr r0, LCPI1_0
  64 =>
  65      ldr r0, pc, #((LCPI1_0-(LPCRELL0+4))&0xfffffffc)
  66
  67
  68 //===---------------------------------------------------------------------===//
  69
  70 We compile the following using a jump table.
  71
  72 define i16 @func_entry_2E_ce(i32 %i) {
  73 newFuncRoot:
  74         br label %entry.ce
  75
  76 bb12.exitStub:          ; preds = %entry.ce
  77         ret i16 0
  78
  79 bb4.exitStub:           ; preds = %entry.ce, %entry.ce, %entry.ce
  80         ret i16 1
  81
  82 bb9.exitStub:           ; preds = %entry.ce, %entry.ce, %entry.ce
  83         ret i16 2
  84
  85 bb.exitStub:            ; preds = %entry.ce
  86         ret i16 3
  87
  88 entry.ce:               ; preds = %newFuncRoot
  89         switch i32 %i, label %bb12.exitStub [
  90                  i32 0, label %bb4.exitStub
  91                  i32 1, label %bb9.exitStub
  92                  i32 2, label %bb4.exitStub
  93                  i32 3, label %bb4.exitStub
  94                  i32 7, label %bb9.exitStub
  95                  i32 8, label %bb.exitStub
  96                  i32 9, label %bb9.exitStub
  97         ]
  98 }
  99
 100 gcc compiles to:
 101
 102         cmp     r0, #9
 103         @ lr needed for prologue
 104         bhi     L2
 105         ldr     r3, L11
 106         mov     r2, #1
 107         mov     r1, r2, asl r0
 108         ands    r0, r3, r2, asl r0
 109         movne   r0, #2
 110         bxne    lr
 111         tst     r1, #13
 112         beq     L9
 113 L3:
 114         mov     r0, r2
 115         bx      lr
 116 L9:
 117         tst     r1, #256
 118         movne   r0, #3
 119         bxne    lr
 120 L2:
 121         mov     r0, #0
 122         bx      lr
 123 L12:
 124         .align 2
 125 L11:
 126         .long   642
 127
 128 //===---------------------------------------------------------------------===//
 129
 130 When spilling in thumb mode and the sp offset is too large to fit in the ldr /
 131 str offset field, we load the offset from a constpool entry and add it to sp:
 132
 133 ldr r2, LCPI
 134 add r2, sp
 135 ldr r2, [r2]
 136
 137 These instructions preserve the condition code, which is important if the spill
 138 is between a cmp and a bcc instruction. However, we can use the (potentially)
 139 cheaper sequence if we know it's ok to clobber the condition register.
 140
 141 add r2, sp, #255 * 4
 142 add r2, #132
 143 ldr r2, [r2, #7 * 4]
 144
 145 This is especially bad when dynamic alloca is used. Then all fixed size stack
 146 objects are referenced off the frame pointer with negative offsets. See
 147 oggenc for an example.