//===---------------------------------------------------------------------===//
-With the recent changes to make the implicit def/use set explicit in
-machineinstrs, we should change the target descriptions for 'call' instructions
-so that the .td files don't list all the call-clobbered registers as implicit
-defs. Instead, these should be added by the code generator (e.g. on the dag).
-
-This has a number of uses:
-
-1. PPC32/64 and X86 32/64 can avoid having multiple copies of call instructions
- for their different impdef sets.
-2. Targets with multiple calling convs (e.g. x86) which have different clobber
- sets don't need copies of call instructions.
-3. 'Interprocedural register allocation' can be done to reduce the clobber sets
- of calls.
-
-//===---------------------------------------------------------------------===//
-
We should recognized various "overflow detection" idioms and translate them into
llvm.uadd.with.overflow and similar intrinsics. Here is a multiply idiom:
//===---------------------------------------------------------------------===//
-For vector types, TargetData.cpp::getTypeInfo() returns alignment that is equal
+For vector types, DataLayout.cpp::getTypeInfo() returns alignment that is equal
to the type size. It works but can be overly conservative as the alignment of
specific vector types are target dependent.
//===---------------------------------------------------------------------===//
+[LOOP DELETION]
+
+We don't delete this output free loop, because trip count analysis doesn't
+realize that it is finite (if it were infinite, it would be undefined). Not
+having this blocks Loop Idiom from matching strlen and friends.
+
+void foo(char *C) {
+ int x = 0;
+ while (*C)
+ ++x,++C;
+}
+
+//===---------------------------------------------------------------------===//
+
[LOOP RECOGNITION]
These idioms should be recognized as popcount (see PR1488):
c += v & 1;
return c;
}
-unsigned countbits_fast(unsigned v){
- unsigned c;
- for (c = 0; v; c++)
- v &= v - 1; // clear the least significant bit set
- return c;
-}
-BITBOARD = unsigned long long
-int PopCnt(register BITBOARD a) {
- register int c=0;
- while(a) {
- c++;
- a &= a - 1;
- }
- return c;
-}
unsigned int popcount(unsigned int input) {
unsigned int count = 0;
for (unsigned int i = 0; i < 4 * 8; i++)
return count;
}
+This should be recognized as CLZ: rdar://8459039
+
+unsigned clz_a(unsigned a) {
+ int i;
+ for (i=0;i<32;i++)
+ if (a & (1<<(31-i)))
+ return i;
+ return 32;
+}
+
This sort of thing should be added to the loop idiom pass.
//===---------------------------------------------------------------------===//
//===---------------------------------------------------------------------===//
-LSR should know what GPR types a target has from TargetData. This code:
-
-volatile short X, Y; // globals
-
-void foo(int N) {
- int i;
- for (i = 0; i < N; i++) { X = i; Y = i*4; }
-}
-
-produces two near identical IV's (after promotion) on PPC/ARM:
-
-LBB1_2:
- ldr r3, LCPI1_0
- ldr r3, [r3]
- strh r2, [r3]
- ldr r3, LCPI1_1
- ldr r3, [r3]
- strh r1, [r3]
- add r1, r1, #4
- add r2, r2, #1 <- [0,+,1]
- sub r0, r0, #1 <- [0,-,1]
- cmp r0, #0
- bne LBB1_2
-
-LSR should reuse the "+" IV for the exit test.
-
-//===---------------------------------------------------------------------===//
-
Tail call elim should be more aggressive, checking to see if the call is
followed by an uncond branch to an exit block.
//===---------------------------------------------------------------------===//
-We miss some instcombines for stuff like this:
-void bar (void);
-void foo (unsigned int a) {
- /* This one is equivalent to a >= (3 << 2). */
- if ((a >> 2) >= 3)
- bar ();
-}
-
-A few other related ones are in GCC PR14753.
-
-//===---------------------------------------------------------------------===//
-
Divisibility by constant can be simplified (according to GCC PR12849) from
being a mulhi to being a mul lo (cheaper). Testcase:
if ((a >> 2) > 5)
bar ();
}
+
All should simplify to a single comparison. All of these are
currently not optimized with "clang -emit-llvm-bc | opt
-std-compile-opts".
//===---------------------------------------------------------------------===//
+int g(int x) { return (x - 10) < 0; }
+Should combine to "x <= 9" (the sub has nsw). Currently not
+optimized with "clang -emit-llvm-bc | opt -std-compile-opts".
+
+//===---------------------------------------------------------------------===//
+
+int g(int x) { return (x + 10) < 0; }
+Should combine to "x < -10" (the add has nsw). Currently not
+optimized with "clang -emit-llvm-bc | opt -std-compile-opts".
+
+//===---------------------------------------------------------------------===//
+
+int f(int i, int j) { return i < j + 1; }
+int g(int i, int j) { return j > i - 1; }
+Should combine to "i <= j" (the add/sub has nsw). Currently not
+optimized with "clang -emit-llvm-bc | opt -std-compile-opts".
+
+//===---------------------------------------------------------------------===//
+
+unsigned f(unsigned x) { return ((x & 7) + 1) & 15; }
+The & 15 part should be optimized away, it doesn't change the result. Currently
+not optimized with "clang -emit-llvm-bc | opt -std-compile-opts".
+
+//===---------------------------------------------------------------------===//
+
This was noticed in the entryblock for grokdeclarator in 403.gcc:
%tmp = icmp eq i32 %decl_context, 4
//===---------------------------------------------------------------------===//
-A/B get pinned to the stack because we turn an if/then into a select instead
-of PRE'ing the load/store. This may be fixable in instcombine:
-http://gcc.gnu.org/bugzilla/show_bug.cgi?id=37892
-
-struct X { int i; };
-int foo (int x) {
- struct X a;
- struct X b;
- struct X *p;
- a.i = 1;
- b.i = 2;
- if (x)
- p = &a;
- else
- p = &b;
- return p->i;
-}
-
-//===---------------------------------------------------------------------===//
-
Interesting missed case because of control flow flattening (should be 2 loads):
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=26629
With: llvm-gcc t2.c -S -o - -O0 -emit-llvm | llvm-as |
//===---------------------------------------------------------------------===//
+simplifylibcalls should turn these snprintf idioms into memcpy (GCC PR47917)
+
+char buf1[6], buf2[6], buf3[4], buf4[4];
+int i;
+
+int foo (void) {
+ int ret = snprintf (buf1, sizeof buf1, "abcde");
+ ret += snprintf (buf2, sizeof buf2, "abcdef") * 16;
+ ret += snprintf (buf3, sizeof buf3, "%s", i++ < 6 ? "abc" : "def") * 256;
+ ret += snprintf (buf4, sizeof buf4, "%s", i++ > 10 ? "abcde" : "defgh")*4096;
+ return ret;
+}
+
+//===---------------------------------------------------------------------===//
+
"gas" uses this idiom:
else if (strchr ("+-/*%|&^:[]()~", *intel_parser.op_string))
..
//===---------------------------------------------------------------------===//
-Take the following testcase on x86-64 (similar testcases exist for all targets
-with addc/adde):
-
-define void @a(i64* nocapture %s, i64* nocapture %t, i64 %a, i64 %b,
-i64 %c) nounwind {
-entry:
- %0 = zext i64 %a to i128 ; <i128> [#uses=1]
- %1 = zext i64 %b to i128 ; <i128> [#uses=1]
- %2 = add i128 %1, %0 ; <i128> [#uses=2]
- %3 = zext i64 %c to i128 ; <i128> [#uses=1]
- %4 = shl i128 %3, 64 ; <i128> [#uses=1]
- %5 = add i128 %4, %2 ; <i128> [#uses=1]
- %6 = lshr i128 %5, 64 ; <i128> [#uses=1]
- %7 = trunc i128 %6 to i64 ; <i64> [#uses=1]
- store i64 %7, i64* %s, align 8
- %8 = trunc i128 %2 to i64 ; <i64> [#uses=1]
- store i64 %8, i64* %t, align 8
- ret void
-}
-
-Generated code:
- addq %rcx, %rdx
- movl $0, %eax
- adcq $0, %rax
- addq %r8, %rax
- movq %rax, (%rdi)
- movq %rdx, (%rsi)
- ret
-
-Expected code:
- addq %rcx, %rdx
- adcq $0, %r8
- movq %r8, (%rdi)
- movq %rdx, (%rsi)
- ret
-
-The generated SelectionDAG has an ADD of an ADDE, where both operands of the
-ADDE are zero. Replacing one of the operands of the ADDE with the other operand
-of the ADD, and replacing the ADD with the ADDE, should give the desired result.
-
-(That said, we are doing a lot better than gcc on this testcase. :) )
-
-//===---------------------------------------------------------------------===//
-
-Switch lowering generates less than ideal code for the following switch:
define void @a(i32 %x) nounwind {
entry:
switch i32 %x, label %if.end [
Generated code on x86-64 (other platforms give similar results):
a:
cmpl $5, %edi
- ja .LBB0_2
- movl %edi, %eax
- movl $47, %ecx
- btq %rax, %rcx
- jb .LBB0_3
+ ja LBB2_2
+ cmpl $4, %edi
+ jne LBB2_3
.LBB0_2:
ret
.LBB0_3:
jmp foo # TAILCALL
-The movl+movl+btq+jb could be simplified to a cmpl+jne.
-
-Or, if we wanted to be really clever, we could simplify the whole thing to
+If we wanted to be really clever, we could simplify the whole thing to
something like the following, which eliminates a branch:
xorl $1, %edi
cmpl $4, %edi
ret
.LBB0_2:
jmp foo # TAILCALL
-//===---------------------------------------------------------------------===//
-Given a branch where the two target blocks are identical ("ret i32 %b" in
-both), simplifycfg will simplify them away. But not so for a switch statement:
-
-define i32 @f(i32 %a, i32 %b) nounwind readnone {
-entry:
- switch i32 %a, label %bb3 [
- i32 4, label %bb
- i32 6, label %bb
- ]
-
-bb: ; preds = %entry, %entry
- ret i32 %b
-
-bb3: ; preds = %entry
- ret i32 %b
-}
-//===---------------------------------------------------------------------===//
-
-clang -O3 fails to devirtualize this virtual inheritance case: (GCC PR45875)
-Looks related to PR3100
-
-struct c1 {};
-struct c10 : c1{
- virtual void foo ();
-};
-struct c11 : c10, c1{
- virtual void f6 ();
-};
-struct c28 : virtual c11{
- void f6 ();
-};
-void check_c28 () {
- c28 obj;
- c11 *ptr = &obj;
- ptr->f6 ();
-}
//===---------------------------------------------------------------------===//
}
This shouldn't need the ((zext (%n - 1)) + 1) game, and it should ideally fold
-the two memset's together. The issue with %n seems to stem from poor handling
-of the original loop.
+the two memset's together.
-To simplify this, we need SCEV to know that "n != 0" because of the dominating
-conditional. That would turn the second memset into a simple memset of 'n'.
+The issue with the addition only occurs in 64-bit mode, and appears to be at
+least partially caused by Scalar Evolution not keeping its cache updated: it
+returns the "wrong" result immediately after indvars runs, but figures out the
+expected result if it is run from scratch on IR resulting from running indvars.
//===---------------------------------------------------------------------===//
//===---------------------------------------------------------------------===//
-We miss an optzn when lowering divide by some constants. For example:
- int test(int x) { return x/10; }
+We don't fold (icmp (add) (add)) unless the two adds only have a single use.
+There are a lot of cases that we're refusing to fold in (e.g.) 256.bzip2, for
+example:
+
+ %indvar.next90 = add i64 %indvar89, 1 ;; Has 2 uses
+ %tmp96 = add i64 %tmp95, 1 ;; Has 1 use
+ %exitcond97 = icmp eq i64 %indvar.next90, %tmp96
+
+We don't fold this because we don't want to introduce an overlapped live range
+of the ivar. However if we can make this more aggressive without causing
+performance issues in two ways:
+
+1. If *either* the LHS or RHS has a single use, we can definitely do the
+ transformation. In the overlapping liverange case we're trading one register
+ use for one fewer operation, which is a reasonable trade. Before doing this
+ we should verify that the llc output actually shrinks for some benchmarks.
+2. If both ops have multiple uses, we can still fold it if the operations are
+ both sinkable to *after* the icmp (e.g. in a subsequent block) which doesn't
+ increase register pressure.
+
+There are a ton of icmp's we aren't simplifying because of the reg pressure
+concern. Care is warranted here though because many of these are induction
+variables and other cases that matter a lot to performance, like the above.
+Here's a blob of code that you can drop into the bottom of visitICmp to see some
+missed cases:
+
+ { Value *A, *B, *C, *D;
+ if (match(Op0, m_Add(m_Value(A), m_Value(B))) &&
+ match(Op1, m_Add(m_Value(C), m_Value(D))) &&
+ (A == C || A == D || B == C || B == D)) {
+ errs() << "OP0 = " << *Op0 << " U=" << Op0->getNumUses() << "\n";
+ errs() << "OP1 = " << *Op1 << " U=" << Op1->getNumUses() << "\n";
+ errs() << "CMP = " << I << "\n\n";
+ }
+ }
-We produce:
+//===---------------------------------------------------------------------===//
-_test: ## @test
-## BB#0: ## %entry
- movslq %edi, %rax
- imulq $1717986919, %rax, %rax ## imm = 0x66666667
- movq %rax, %rcx
- shrq $63, %rcx
-** shrq $32, %rax
-** sarl $2, %eax
- addl %ecx, %eax
- ret
+define i1 @test1(i32 %x) nounwind {
+ %and = and i32 %x, 3
+ %cmp = icmp ult i32 %and, 2
+ ret i1 %cmp
+}
+
+Can be folded to (x & 2) == 0.
+
+define i1 @test2(i32 %x) nounwind {
+ %and = and i32 %x, 3
+ %cmp = icmp ugt i32 %and, 1
+ ret i1 %cmp
+}
+
+Can be folded to (x & 2) != 0.
+
+SimplifyDemandedBits shrinks the "and" constant to 2 but instcombine misses the
+icmp transform.
+
+//===---------------------------------------------------------------------===//
+
+This code:
+
+typedef struct {
+int f1:1;
+int f2:1;
+int f3:1;
+int f4:29;
+} t1;
+
+typedef struct {
+int f1:1;
+int f2:1;
+int f3:30;
+} t2;
+
+t1 s1;
+t2 s2;
+
+void func1(void)
+{
+s1.f1 = s2.f1;
+s1.f2 = s2.f2;
+}
+
+Compiles into this IR (on x86-64 at least):
+
+%struct.t1 = type { i8, [3 x i8] }
+@s2 = global %struct.t1 zeroinitializer, align 4
+@s1 = global %struct.t1 zeroinitializer, align 4
+define void @func1() nounwind ssp noredzone {
+entry:
+ %0 = load i32* bitcast (%struct.t1* @s2 to i32*), align 4
+ %bf.val.sext5 = and i32 %0, 1
+ %1 = load i32* bitcast (%struct.t1* @s1 to i32*), align 4
+ %2 = and i32 %1, -4
+ %3 = or i32 %2, %bf.val.sext5
+ %bf.val.sext26 = and i32 %0, 2
+ %4 = or i32 %3, %bf.val.sext26
+ store i32 %4, i32* bitcast (%struct.t1* @s1 to i32*), align 4
+ ret void
+}
+
+The two or/and's should be merged into one each.
+
+//===---------------------------------------------------------------------===//
+
+Machine level code hoisting can be useful in some cases. For example, PR9408
+is about:
+
+typedef union {
+ void (*f1)(int);
+ void (*f2)(long);
+} funcs;
+
+void foo(funcs f, int which) {
+ int a = 5;
+ if (which) {
+ f.f1(a);
+ } else {
+ f.f2(a);
+ }
+}
+
+which we compile to:
+
+foo: # @foo
+# BB#0: # %entry
+ pushq %rbp
+ movq %rsp, %rbp
+ testl %esi, %esi
+ movq %rdi, %rax
+ je .LBB0_2
+# BB#1: # %if.then
+ movl $5, %edi
+ callq *%rax
+ popq %rbp
+ ret
+.LBB0_2: # %if.else
+ movl $5, %edi
+ callq *%rax
+ popq %rbp
+ ret
+
+Note that bb1 and bb2 are the same. This doesn't happen at the IR level
+because one call is passing an i32 and the other is passing an i64.
+
+//===---------------------------------------------------------------------===//
+
+I see this sort of pattern in 176.gcc in a few places (e.g. the start of
+store_bit_field). The rem should be replaced with a multiply and subtract:
+
+ %3 = sdiv i32 %A, %B
+ %4 = srem i32 %A, %B
+
+Similarly for udiv/urem. Note that this shouldn't be done on X86 or ARM,
+which can do this in a single operation (instruction or libcall). It is
+probably best to do this in the code generator.
+
+//===---------------------------------------------------------------------===//
+
+unsigned foo(unsigned x, unsigned y) { return (x & y) == 0 || x == 0; }
+should fold to (x & y) == 0.
+
+//===---------------------------------------------------------------------===//
-The two starred instructions could be replaced with a "sarl $34, %rax". This
-occurs in 186.crafty very frequently.
+unsigned foo(unsigned x, unsigned y) { return x > y && x != 0; }
+should fold to x > y.
//===---------------------------------------------------------------------===//