lib/Target/ARM/README-Thumb.txt

* Add support for compiling functions in both ARM and Thumb mode, then taking
  the smallest.

* Add support for compiling individual basic blocks in Thumb mode, when in a
  larger ARM function.  This can be used for presumed cold code, like paths
  to abort (failure path of asserts), EH handling code, etc.

* Thumb jump table codegen currently produces sequences like:

LPCRELL0:
        mov r1, #PCRELV0
        add r1, pc
        ldr r0, [r0, r1]
        mov pc, r0
        .align 2
LJTI1_0_0:
        .long LBB1_3

We should be able to generate:

LPCRELL0:
        add r1, LJTI1_0_0
        ldr r0, [r0, r1]
        mov pc, r0
        .align 2
LJTI1_0_0:
        .long LBB1_3

//===---------------------------------------------------------------------===//

We compile the following:

define i16 @func_entry_2E_ce(i32 %i) {
        switch i32 %i, label %bb12.exitStub [
                 i32 0, label %bb4.exitStub
                 i32 1, label %bb9.exitStub
                 i32 2, label %bb4.exitStub
                 i32 3, label %bb4.exitStub
                 i32 7, label %bb9.exitStub
                 i32 8, label %bb.exitStub
                 i32 9, label %bb9.exitStub
        ]

bb12.exitStub:
        ret i16 0

bb4.exitStub:
        ret i16 1

bb9.exitStub:
        ret i16 2

bb.exitStub:
        ret i16 3
}

into:

_func_entry_2E_ce:
        mov r2, #1
        lsl r2, r0
        cmp r0, #9
        bhi LBB1_4      @bb12.exitStub
LBB1_1: @newFuncRoot
        mov r1, #13
        tst r2, r1
        bne LBB1_5      @bb4.exitStub
LBB1_2: @newFuncRoot
        ldr r1, LCPI1_0
        tst r2, r1
        bne LBB1_6      @bb9.exitStub
LBB1_3: @newFuncRoot
        mov r1, #1
        lsl r1, r1, #8
        tst r2, r1
        bne LBB1_7      @bb.exitStub
LBB1_4: @bb12.exitStub
        mov r0, #0
        bx lr
LBB1_5: @bb4.exitStub
        mov r0, #1
        bx lr
LBB1_6: @bb9.exitStub
        mov r0, #2
        bx lr
LBB1_7: @bb.exitStub
        mov r0, #3
        bx lr
LBB1_8:
        .align 2
LCPI1_0:
        .long 642

gcc compiles to:

        cmp r0, #9
        ...
L12:
        .align 2
L11:
        .long 642

GCC is doing a couple of clever things here:
  1. It is predicating one of the returns.  This isn't a clear win though: in
     cases where that return isn't taken, it is replacing one condbranch with
     two 'ne' predicated instructions.
  2. It is sinking the shift of "1 << i" into the tst, and using ands instead
     of tst.  This will probably require whole function isel.
  3. GCC emits:
        tst r1, #256
     we emit:
        mov r1, #1
        lsl r1, r1, #8
        tst r2, r1

//===---------------------------------------------------------------------===//

When a stack object's offset is too large to encode directly in the Thumb
load / store offset field, we can use a cheaper sequence if we know it's ok to
clobber the condition register:

        add r2, sp, #255 * 4
        add r2, #132
        ldr r2, [r2, #7 * 4]

This is especially bad when dynamic alloca is used.  All fixed-size stack
objects are then referenced off the frame pointer with negative offsets.  See
oggenc for an example.
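
A minimal C sketch of that situation (hypothetical code, not taken from
oggenc, and assuming a glibc-style <alloca.h>): the dynamic alloca forces the
function to keep a frame pointer, so the fixed-size buffer ends up being
addressed at negative offsets from the frame pointer, where Thumb1 has no
cheap load / store addressing mode.

  #include <alloca.h>
  #include <string.h>

  int consume(char *fixed, char *dyn, int n);   /* assumed external helper */

  int f(int n) {
    char fixed[1024];                /* fixed-size stack object                 */
    char *dyn = alloca(n);           /* dynamic alloca => frame-pointer frame   */
    memset(fixed, 0, sizeof fixed);
    memset(dyn, 0, (size_t)n);
    return consume(fixed, dyn, n);   /* keep both stack objects live            */
  }
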
+ +//===---------------------------------------------------------------------===// + +Poor codegen test/CodeGen/ARM/select.ll f7: + + ldr r5, LCPI1_0 +LPC0: + add r5, pc + ldr r6, LCPI1_1 + ldr r2, LCPI1_2 + mov r3, r6 + mov lr, pc + bx r5 + +//===---------------------------------------------------------------------===// + +Make register allocator / spiller smarter so we can re-materialize "mov r, imm", +etc. Almost all Thumb instructions clobber condition code. + +//===---------------------------------------------------------------------===// + +Thumb load / store address mode offsets are scaled. The values kept in the +instruction operands are pre-scale values. This probably ought to be changed +to avoid extra work when we convert Thumb2 instructions to Thumb1 instructions. + +//===---------------------------------------------------------------------===// + +We need to make (some of the) Thumb1 instructions predicable. That will allow +shrinking of predicated Thumb2 instructions. To allow this, we need to be able +to toggle the 's' bit since they do not set CPSR when they are inside IT blocks. + +//===---------------------------------------------------------------------===// + +Make use of hi register variants of cmp: tCMPhir / tCMPZhir. + +//===---------------------------------------------------------------------===// + +Thumb1 immediate field sometimes keep pre-scaled values. See +ThumbRegisterInfo::eliminateFrameIndex. This is inconsistent from ARM and +Thumb2. + +//===---------------------------------------------------------------------===// + +Rather than having tBR_JTr print a ".align 2" and constant island pass pad it, +add a target specific ALIGN instruction instead. That way, GetInstSizeInBytes +won't have to over-estimate. It can also be used for loop alignment pass. + +//===---------------------------------------------------------------------===// + +We generate conditional code for icmp when we don't need to. This code: + + int foo(int s) { + return s == 1; + } + +produces: + +foo: + cmp r0, #1 + mov.w r0, #0 + it eq + moveq r0, #1 + bx lr + +when it could use subs + adcs. This is GCC PR46975.