for (i = 0; i < N; i++) { X = i; Y = i*4; }
}
-LBB1_1: #bb.preheader
- xorl %ecx, %ecx
- xorw %dx, %dx
-LBB1_2: #bb
- movl L_X$non_lazy_ptr, %esi
- movw %dx, (%esi)
- movw %dx, %si
- shlw $2, %si
- movl L_Y$non_lazy_ptr, %edi
- movw %si, (%edi)
- incl %ecx
- incw %dx
- cmpl %eax, %ecx
- jne LBB1_2 #bb
+LBB1_1: # entry.bb_crit_edge
+ xorl %ecx, %ecx
+ xorw %dx, %dx
+LBB1_2: # bb
+ movl L_X$non_lazy_ptr, %esi
+ movw %cx, (%esi)
+ movl L_Y$non_lazy_ptr, %esi
+ movw %dx, (%esi)
+ addw $4, %dx
+ incl %ecx
+ cmpl %eax, %ecx
+ jne LBB1_2 # bb
vs.
cmpl %edx, %edi
jne L4
-There are 3 issues:
-
-1. Lack of post regalloc LICM.
-2. LSR unable to reused IV for a different type (i16 vs. i32) even though
- the cast would be free.
+This is due to the lack of post regalloc LICM.
//===---------------------------------------------------------------------===//
etc.
+Another is:
+int usesbb(unsigned int a, unsigned int b) {
+ return (a < b ? -1 : 0);
+}
+to:
+_usesbb:
+ movl 8(%esp), %eax
+ cmpl %eax, 4(%esp)
+ sbbl %eax, %eax
+ ret
+
+instead of:
+_usesbb:
+ xorl %eax, %eax
+ movl 8(%esp), %ecx
+ cmpl %ecx, 4(%esp)
+ movl $4294967295, %ecx
+ cmovb %ecx, %eax
+ ret
+
//===---------------------------------------------------------------------===//
Currently we don't have elimination of redundant stack manipulations. Consider
//===---------------------------------------------------------------------===//
-We currently compile sign_extend_inreg into two shifts:
-
-long foo(long X) {
- return (long)(signed char)X;
-}
-
-becomes:
-
-_foo:
- movl 4(%esp), %eax
- shll $24, %eax
- sarl $24, %eax
- ret
-
-This could be:
-
-_foo:
- movsbl 4(%esp),%eax
- ret
-
-//===---------------------------------------------------------------------===//
-
Consider the expansion of:
uint %test3(uint %X) {
//===---------------------------------------------------------------------===//
-This:
-#include <math.h>
-int foo(double X) { return isnan(X); }
-
-compiles to (-m64):
-
-_foo:
- pxor %xmm1, %xmm1
- ucomisd %xmm1, %xmm0
- setp %al
- movzbl %al, %eax
- ret
-
-the pxor is not needed, we could compare the value against itself.
-
-//===---------------------------------------------------------------------===//
-
These two functions have identical effects:
unsigned int f(unsigned int i, unsigned int n) {++i; if (i == n) ++i; return i;}
//===---------------------------------------------------------------------===//
-For code like:
-phi (undef, x)
-
-We get an implicit def on the undef side. If the phi is spilled, we then get:
-implicitdef xmm1
-store xmm1 -> stack
-
-It should be possible to teach the x86 backend to "fold" the store into the
-implicitdef, which just deletes the implicit def.
-
-These instructions should go away:
-#IMPLICIT_DEF %xmm1
-movaps %xmm1, 192(%esp)
-movaps %xmm1, 224(%esp)
-movaps %xmm1, 176(%esp)
-
-//===---------------------------------------------------------------------===//
-
This is a "commutable two-address" register coallescing deficiency:
define <4 x float> @test1(<4 x float> %V) {
L5:
//===---------------------------------------------------------------------===//
+
Tail call optimization improvements: Tail call optimization currently
-pushes all arguments on the top of the stack (their normal place if
-that was a not tail call optimized functiong call ) before moving them
-to actual stack slot. this is done to prevent overwriting of paramters
-(see example below) that might be used, since the arguments of the
-callee overwrites callers arguments.
+pushes all arguments on the top of the stack (their normal place for
+non-tail call optimized calls) that source from the caller's arguments
+or that source from a virtual register (also possibly sourcing from
+the caller's arguments).
+This is done to prevent overwriting of parameters (see example
+below) that might be used later.
- example:
+example:
int callee(int32, int64);
int caller(int32 arg1, int32 arg2) {
[arg2] -> [(int64)
[RETADDR] local ]
-moving arg1 onto the stack slot of callee function would overwrite
+Moving arg1 onto the stack slot of the callee function would overwrite
arg2 of the caller.
Possible optimizations:
- - only push those arguments to the top of the stack that are actual
- parameters of the caller function and have no local value in the
- caller
-
- in above example local does not need to be pushed onto the top of
- the stack as it is definitetly not a caller's function parameter
- - analyse the actual parameters of the callee to see which would
- overwrite a caller paramter which is used by the callee and only
- push them onto the top of the stack
+ - Analyse the actual parameters of the callee to see which would
+ overwrite a caller parameter which is used by the callee and only
+ push them onto the top of the stack.
int callee (int32 arg1, int32 arg2);
int caller (int32 arg1, int32 arg2) {
return callee(arg1,arg2);
}
- here we don't need to write any variables to the top of the stack
- since they don't overwrite each other
+ Here we don't need to write any variables to the top of the stack
+ since they don't overwrite each other.
int callee (int32 arg1, int32 arg2);
int caller (int32 arg1, int32 arg2) {
return callee(arg2,arg1);
}
- here we need to push the arguments because they overwrite each other
-
-
- code for lowering directly onto callers arguments:
-+ SmallVector<std::pair<unsigned, SDOperand>, 8> RegsToPass;
-+ SmallVector<SDOperand, 8> MemOpChains;
-+
-+ SDOperand FramePtr;
-+ SDOperand PtrOff;
-+ SDOperand FIN;
-+ int FI = 0;
-+ // Walk the register/memloc assignments, inserting copies/loads.
-+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
-+ CCValAssign &VA = ArgLocs[i];
-+ SDOperand Arg = Op.getOperand(5+2*VA.getValNo());
-+
-+ ....
-+
-+ if (VA.isRegLoc()) {
-+ RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
-+ } else {
-+ assert(VA.isMemLoc());
-+ // create frame index
-+ int32_t Offset = VA.getLocMemOffset()+FPDiff;
-+ uint32_t OpSize = (MVT::getSizeInBits(VA.getLocVT())+7)/8;
-+ FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset);
-+ FIN = DAG.getFrameIndex(FI, MVT::i32);
-+ // store relative to framepointer
-+ MemOpChains.push_back(DAG.getStore(Chain, Arg, FIN, NULL, 0));
-+ }
-+ }
+ Here we need to push the arguments because they overwrite each
+ other.
+
+//===---------------------------------------------------------------------===//
+
+main ()
+{
+ int i = 0;
+ unsigned long int z = 0;
+
+ do {
+ z -= 0x00004000;
+ i++;
+ if (i > 0x00040000)
+ abort ();
+ } while (z > 0);
+ exit (0);
+}
+
+gcc compiles this to:
+
+_main:
+ subl $28, %esp
+ xorl %eax, %eax
+ jmp L2
+L3:
+ cmpl $262144, %eax
+ je L10
+L2:
+ addl $1, %eax
+ cmpl $262145, %eax
+ jne L3
+ call L_abort$stub
+L10:
+ movl $0, (%esp)
+ call L_exit$stub
+
+llvm:
+
+_main:
+ subl $12, %esp
+ movl $1, %eax
+ movl $16384, %ecx
+LBB1_1: # bb
+ cmpl $262145, %eax
+ jge LBB1_4 # cond_true
+LBB1_2: # cond_next
+ incl %eax
+ addl $4294950912, %ecx
+ cmpl $16384, %ecx
+ jne LBB1_1 # bb
+LBB1_3: # bb11
+ xorl %eax, %eax
+ addl $12, %esp
+ ret
+LBB1_4: # cond_true
+ call L_abort$stub
+
+1. LSR should rewrite the first cmp with induction variable %ecx.
+2. DAG combiner should fold
+ leal 1(%eax), %edx
+ cmpl $262145, %edx
+ =>
+ cmpl $262144, %eax
+
+//===---------------------------------------------------------------------===//
+
+define i64 @test(double %X) {
+ %Y = fptosi double %X to i64
+ ret i64 %Y
+}
+
+compiles to:
+
+_test:
+ subl $20, %esp
+ movsd 24(%esp), %xmm0
+ movsd %xmm0, 8(%esp)
+ fldl 8(%esp)
+ fisttpll (%esp)
+ movl 4(%esp), %edx
+ movl (%esp), %eax
+ addl $20, %esp
+ #FP_REG_KILL
+ ret
+
+This should just fldl directly from the input stack slot.
+
+//===---------------------------------------------------------------------===//
+
+This code:
+int foo (int x) { return (x & 65535) | 255; }
+
+Should compile into:
+
+_foo:
+ movzwl 4(%esp), %eax
+ orb $-1, %al ;; 'orl 255' is also fine :)
+ ret
+
+instead of:
+_foo:
+ movl $255, %eax
+ orl 4(%esp), %eax
+ andl $65535, %eax
+ ret
+
+//===---------------------------------------------------------------------===//
+
+We're missing an obvious fold of a load into imul:
+
+int test(long a, long b) { return a * b; }
+
+LLVM produces:
+_test:
+ movl 4(%esp), %ecx
+ movl 8(%esp), %eax
+ imull %ecx, %eax
+ ret
+
+vs:
+_test:
+ movl 8(%esp), %eax
+ imull 4(%esp), %eax
+ ret
+
+//===---------------------------------------------------------------------===//
+
+We can fold a store into "zeroing a reg". Instead of:
+
+xorl %eax, %eax
+movl %eax, 124(%esp)
+
+we should get:
+
+movl $0, 124(%esp)
+
+if the flags of the xor are dead.
+
+Likewise, we isel "x<<1" into "add reg,reg". If reg is spilled, this should
+be folded into: shl [mem], 1
+
+//===---------------------------------------------------------------------===//
+
+This testcase misses a read/modify/write opportunity (from PR1425):
+
+void vertical_decompose97iH1(int *b0, int *b1, int *b2, int width){
+ int i;
+ for(i=0; i<width; i++)
+ b1[i] += (1*(b0[i] + b2[i])+0)>>0;
+}
+
+We compile it down to:
+
+LBB1_2: # bb
+ movl (%esi,%edi,4), %ebx
+ addl (%ecx,%edi,4), %ebx
+ addl (%edx,%edi,4), %ebx
+ movl %ebx, (%ecx,%edi,4)
+ incl %edi
+ cmpl %eax, %edi
+ jne LBB1_2 # bb
+
+the inner loop should add to the memory location (%ecx,%edi,4), saving
+a mov. Something like:
+
+ movl (%esi,%edi,4), %ebx
+ addl (%edx,%edi,4), %ebx
+ addl %ebx, (%ecx,%edi,4)
+
+Here is another interesting example:
+
+void vertical_compose97iH1(int *b0, int *b1, int *b2, int width){
+ int i;
+ for(i=0; i<width; i++)
+ b1[i] -= (1*(b0[i] + b2[i])+0)>>0;
+}
+
+We miss the r/m/w opportunity here by using 2 subs instead of an add+sub[mem]:
+
+LBB9_2: # bb
+ movl (%ecx,%edi,4), %ebx
+ subl (%esi,%edi,4), %ebx
+ subl (%edx,%edi,4), %ebx
+ movl %ebx, (%ecx,%edi,4)
+ incl %edi
+ cmpl %eax, %edi
+ jne LBB9_2 # bb
+
+Additionally, LSR should rewrite the exit condition of these loops to use
+a stride-4 IV, which would allow all the scales in the loop to go away.
+This would result in smaller code and more efficient microops.
+
+//===---------------------------------------------------------------------===//
+
+In SSE mode, we turn abs and neg into a load from the constant pool plus a xor
+or and instruction, for example:
+
+ xorpd LCPI1_0, %xmm2
+
+However, if xmm2 gets spilled, we end up with really ugly code like this:
+
+ movsd (%esp), %xmm0
+ xorpd LCPI1_0, %xmm0
+ movsd %xmm0, (%esp)
+
+Since we 'know' that this is a 'neg', we can actually "fold" the spill into
+the neg/abs instruction, turning it into an *integer* operation, like this:
+
+ xorl 2147483648, [mem+4] ## 2147483648 = (1 << 31)
+
+You could also use xorb, but xorl is less likely to lead to a partial register
+stall. Here is a contrived testcase:
+
+double a, b, c;
+void test(double *P) {
+ double X = *P;
+ a = X;
+ bar();
+ X = -X;
+ b = X;
+ bar();
+ c = X;
+}
+
//===---------------------------------------------------------------------===//