//===---------------------------------------------------------------------===//
-How about intrinsics? An example is:
- *res = _mm_mulhi_epu16(*A, _mm_mul_epu32(*B, *C));
-
-compiles to
- pmuludq (%eax), %xmm0
- movl 8(%esp), %eax
- movdqa (%eax), %xmm1
- pmulhuw %xmm0, %xmm1
-
-The transformation probably requires a X86 specific pass or a DAG combiner
-target specific hook.
-
-//===---------------------------------------------------------------------===//
-
In many cases, LLVM generates code like this:
_test:
//===---------------------------------------------------------------------===//
-Teach the coalescer to coalesce vregs of different register classes. e.g. FR32 /
-FR64 to VR128.
-
-//===---------------------------------------------------------------------===//
-
Adding to the list of cmp / test poor codegen issues:
int test(__m128 *A, __m128 *B) {
//===---------------------------------------------------------------------===//
-Start using the flags more. For example, compile:
+Use the FLAGS values from arithmetic instructions more. For example, compile:
int add_zf(int *x, int y, int a, int b) {
if ((*x += y) == 0)
movl %ecx, %eax
ret
-and:
-
-int add_zf(int *x, int y, int a, int b) {
- if ((*x + y) < 0)
- return a;
- else
- return b;
-}
-
-to:
-
-add_zf:
- addl (%rdi), %esi
- movl %edx, %eax
- cmovns %ecx, %eax
- ret
-
-instead of:
-
-_add_zf:
- addl (%rdi), %esi
- testl %esi, %esi
- cmovs %edx, %ecx
- movl %ecx, %eax
- ret
+As another example, compile function f2 in test/CodeGen/X86/cmp-test.ll
+without a test instruction.
//===---------------------------------------------------------------------===//
//===---------------------------------------------------------------------===//
-We need to teach the codegen to convert two-address INC instructions to LEA
-when the flags are dead (likewise dec). For example, on X86-64, compile:
-
-int foo(int A, int B) {
- return A+1;
-}
-
-to:
-
-_foo:
- leal 1(%edi), %eax
- ret
-
-instead of:
-
-_foo:
- incl %edi
- movl %edi, %eax
- ret
-
-Another example is:
-
-;; X's live range extends beyond the shift, so the register allocator
-;; cannot coalesce it with Y. Because of this, a copy needs to be
-;; emitted before the shift to save the register value before it is
-;; clobbered. However, this copy is not needed if the register
-;; allocator turns the shift into an LEA. This also occurs for ADD.
-
-; Check that the shift gets turned into an LEA.
-; RUN: llvm-as < %s | llc -march=x86 -x86-asm-syntax=intel | \
-; RUN: not grep {mov E.X, E.X}
-
-@G = external global i32 ; <i32*> [#uses=3]
-
-define i32 @test1(i32 %X, i32 %Y) {
- %Z = add i32 %X, %Y ; <i32> [#uses=1]
- volatile store i32 %Y, i32* @G
- volatile store i32 %Z, i32* @G
- ret i32 %X
-}
-
-define i32 @test2(i32 %X) {
- %Z = add i32 %X, 1 ; <i32> [#uses=1]
- volatile store i32 %Z, i32* @G
- ret i32 %X
-}
-
-//===---------------------------------------------------------------------===//
-
Sometimes it is better to codegen subtractions from a constant (e.g. 7-x) with
a neg instead of a sub instruction. Consider:
ret
-//===---------------------------------------------------------------------===//
-
-Re-materialize MOV32r0 etc. with xor instead of changing them to moves if the
-condition register is dead. xor reg reg is shorter than mov reg, #0.
-
//===---------------------------------------------------------------------===//
The following code:
//===---------------------------------------------------------------------===//
-This testcase misses a read/modify/write opportunity (from PR1425):
-
-void vertical_decompose97iH1(int *b0, int *b1, int *b2, int width){
- int i;
- for(i=0; i<width; i++)
- b1[i] += (1*(b0[i] + b2[i])+0)>>0;
-}
-
-We compile it down to:
-
-LBB1_2: # bb
- movl (%esi,%edi,4), %ebx
- addl (%ecx,%edi,4), %ebx
- addl (%edx,%edi,4), %ebx
- movl %ebx, (%ecx,%edi,4)
- incl %edi
- cmpl %eax, %edi
- jne LBB1_2 # bb
-
-the inner loop should add to the memory location (%ecx,%edi,4), saving
-a mov. Something like:
-
- movl (%esi,%edi,4), %ebx
- addl (%edx,%edi,4), %ebx
- addl %ebx, (%ecx,%edi,4)
-
-Here is another interesting example:
-
-void vertical_compose97iH1(int *b0, int *b1, int *b2, int width){
- int i;
- for(i=0; i<width; i++)
- b1[i] -= (1*(b0[i] + b2[i])+0)>>0;
-}
-
-We miss the r/m/w opportunity here by using 2 subs instead of an add+sub[mem]:
-
-LBB9_2: # bb
- movl (%ecx,%edi,4), %ebx
- subl (%esi,%edi,4), %ebx
- subl (%edx,%edi,4), %ebx
- movl %ebx, (%ecx,%edi,4)
- incl %edi
- cmpl %eax, %edi
- jne LBB9_2 # bb
-
-Additionally, LSR should rewrite the exit condition of these loops to use
-a stride-4 IV, would would allow all the scales in the loop to go away.
-This would result in smaller code and more efficient microops.
-
-//===---------------------------------------------------------------------===//
-
In SSE mode, we turn abs and neg into a load from the constant pool plus a xor
or and instruction, for example:
//===---------------------------------------------------------------------===//
-handling llvm.memory.barrier on pre SSE2 cpus
-
-should generate:
-lock ; mov %esp, %esp
-
-//===---------------------------------------------------------------------===//
-
The generated code on x86 for checking for signed overflow on a multiply the
obvious way is much longer than it needs to be.
xorl %eax, %eax
ret
-There are a few possible improvements here:
-1. We should be able to eliminate the dead load into %ecx
-2. We could change the "movl 8(%esp), %eax" into
- "movzwl 10(%esp), %eax"; this lets us change the cmpl
- into a testl, which is shorter, and eliminate the shift.
-
-We could also in theory eliminate the branch by using a conditional
-for the address of the load, but that seems unlikely to be worthwhile
-in general.
+We could change the "movl 8(%esp), %eax" into "movzwl 10(%esp), %eax"; this
+lets us change the cmpl into a testl, which is shorter, and eliminate the shift.
//===---------------------------------------------------------------------===//
to:
-_foo:
+foo: # @foo
+# BB#0: # %entry
+ movl 4(%esp), %ecx
cmpb $0, 16(%esp)
- movl 12(%esp), %ecx
+ je .LBB0_2
+# BB#1: # %bb
movl 8(%esp), %eax
- movl 4(%esp), %edx
- je LBB1_2 # bb7
-LBB1_1: # bb
- addl %edx, %eax
+ addl %ecx, %eax
ret
-LBB1_2: # bb7
- movl %edx, %eax
- subl %ecx, %eax
+.LBB0_2: # %bb7
+ movl 12(%esp), %edx
+ movl %ecx, %eax
+ subl %edx, %eax
ret
-The coalescer could coalesce "edx" with "eax" to avoid the movl in LBB1_2
-if it commuted the addl in LBB1_1.
+There's an obviously unnecessary movl in .LBB0_2, and we could eliminate a
+couple more movls by putting 4(%esp) into %eax instead of %ecx.
//===---------------------------------------------------------------------===//
//===---------------------------------------------------------------------===//
-Legalize loses track of the fact that bools are always zero extended when in
-memory. This causes us to compile abort_gzip (from 164.gzip) from:
+Take the following:
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
target triple = "i386-apple-darwin8"
}
declare void @exit(i32) noreturn nounwind
-into:
-
-_abort_gzip:
+This compiles into:
+_abort_gzip: ## @abort_gzip
+## BB#0: ## %entry
subl $12, %esp
movb _in_exit.4870.b, %al
- notb %al
- testb $1, %al
- jne LBB1_2 ## bb4.i
-LBB1_1: ## bb.i
- ...
+ cmpb $1, %al
+ jne LBB0_2
+
+We somehow miss folding the movb into the cmpb.
//===---------------------------------------------------------------------===//
cmpl $150, %edi
jne LBB1_1 ## bb1
+The issue is that we hoist the cast of "scaler" to long long outside of the
+loop, the value comes into the loop as two values, and
+RegsForValue::getCopyFromRegs doesn't know how to put an AssertSext on the
+constructed BUILD_PAIR which represents the cast value.
+
//===---------------------------------------------------------------------===//
Test instructions can be eliminated by using EFLAGS values from arithmetic
//===---------------------------------------------------------------------===//
-int func(int a, int b) { if (a & 0x80) b |= 0x80; else b &= ~0x80; return b; }
+_Bool bar(int *x) { return *x & 1; }
-Current:
+define zeroext i1 @bar(i32* nocapture %x) nounwind readonly {
+entry:
+ %tmp1 = load i32* %x ; <i32> [#uses=1]
+ %and = and i32 %tmp1, 1 ; <i32> [#uses=1]
+ %tobool = icmp ne i32 %and, 0 ; <i1> [#uses=1]
+ ret i1 %tobool
+}
+bar: # @bar
+# BB#0: # %entry
+ movl 4(%esp), %eax
+ movb (%eax), %al
+ andb $1, %al
+ movzbl %al, %eax
+ ret
- movb %sil, %al
- andb $127, %sil
- orb $-128, %al
- testb %dil, %dil
- js LBB1_2
- movb %sil, %al
-LBB1_2:
- movsbl %al, %eax
+Missed optimization: should be movl+andl.
+//===---------------------------------------------------------------------===//
-Better:
+Consider the following two functions compiled with clang:
+_Bool foo(int *x) { return !(*x & 4); }
+unsigned bar(int *x) { return !(*x & 4); }
- movl %esi, %eax
- orl $-128, %eax
- andl $127, %esi
- testb %dil, %dil
- cmovns %esi, %eax
- movsbl %al,%eax
+foo:
+ movl 4(%esp), %eax
+ testb $4, (%eax)
+ sete %al
+ movzbl %al, %eax
+ ret
-Best (recognize this as 'b = (b & ~0x80) | (a & 0x80)'):
+bar:
+ movl 4(%esp), %eax
+ movl (%eax), %eax
+ shrl $2, %eax
+ andl $1, %eax
+ xorl $1, %eax
+ ret
+
+The second function generates more code even though the two functions are
+are functionally identical.
+
+//===---------------------------------------------------------------------===//
- andb $-128, %dil
- andb $127, %sil
- orb %dil, %sil
- movsbl %sil, %eax
+Take the following C code:
+int x(int y) { return (y & 63) << 14; }
+
+Code produced by gcc:
+ andl $63, %edi
+ sall $14, %edi
+ movl %edi, %eax
+ ret
+
+Code produced by clang:
+ shll $14, %edi
+ movl %edi, %eax
+ andl $1032192, %eax
+ ret
+
+The code produced by gcc is 3 bytes shorter. This sort of construct often
+shows up with bitfields.
+
+//===---------------------------------------------------------------------===//