//===---------------------------------------------------------------------===//
-With the recent changes to make the implicit def/use set explicit in
-machineinstrs, we should change the target descriptions for 'call' instructions
-so that the .td files don't list all the call-clobbered registers as implicit
-defs. Instead, these should be added by the code generator (e.g. on the dag).
-
-This has a number of uses:
-
-1. PPC32/64 and X86 32/64 can avoid having multiple copies of call instructions
- for their different impdef sets.
-2. Targets with multiple calling convs (e.g. x86) which have different clobber
- sets don't need copies of call instructions.
-3. 'Interprocedural register allocation' can be done to reduce the clobber sets
- of calls.
-
-//===---------------------------------------------------------------------===//
-
We should recognized various "overflow detection" idioms and translate them into
llvm.uadd.with.overflow and similar intrinsics. Here is a multiply idiom:
something that reassoc doesn't think about yet.
-//===---------------------------------------------------------------------===//
-
-This function: (derived from GCC PR19988)
-double foo(double x, double y) {
- return ((x + 0.1234 * y) * (x + -0.1234 * y));
-}
-
-compiles to:
-_foo:
- movapd %xmm1, %xmm2
- mulsd LCPI1_1(%rip), %xmm1
- mulsd LCPI1_0(%rip), %xmm2
- addsd %xmm0, %xmm1
- addsd %xmm0, %xmm2
- movapd %xmm1, %xmm0
- mulsd %xmm2, %xmm0
- ret
-
-Reassociate should be able to turn it into:
-
-double foo(double x, double y) {
- return ((x + 0.1234 * y) * (x - 0.1234 * y));
-}
-
-Which allows the multiply by constant to be CSE'd, producing:
-
-_foo:
- mulsd LCPI1_0(%rip), %xmm1
- movapd %xmm1, %xmm2
- addsd %xmm0, %xmm2
- subsd %xmm1, %xmm0
- mulsd %xmm2, %xmm0
- ret
-
-This doesn't need -ffast-math support at all. This is particularly bad because
-the llvm-gcc frontend is canonicalizing the later into the former, but clang
-doesn't have this problem.
-
//===---------------------------------------------------------------------===//
These two functions should generate the same code on big-endian systems:
//===---------------------------------------------------------------------===//
-For vector types, TargetData.cpp::getTypeInfo() returns alignment that is equal
+For vector types, DataLayout.cpp::getTypeInfo() returns alignment that is equal
to the type size. It works but can be overly conservative as the alignment of
specific vector types are target dependent.
c += v & 1;
return c;
}
-unsigned countbits_fast(unsigned v){
- unsigned c;
- for (c = 0; v; c++)
- v &= v - 1; // clear the least significant bit set
- return c;
-}
-BITBOARD = unsigned long long
-int PopCnt(register BITBOARD a) {
- register int c=0;
- while(a) {
- c++;
- a &= a - 1;
- }
- return c;
-}
unsigned int popcount(unsigned int input) {
unsigned int count = 0;
for (unsigned int i = 0; i < 4 * 8; i++)
return ((a & (c - 1)) != 0) | ((b & (c - 1)) != 0);
}
Both should combine to ((a|b) & (c-1)) != 0. Currently not optimized with
-"clang -emit-llvm-bc | opt -std-compile-opts".
+"clang -emit-llvm-bc | opt -O3".
//===---------------------------------------------------------------------===//
}
The expression should optimize to something like
"!((start|end)&~PMD_MASK). Currently not optimized with "clang
--emit-llvm-bc | opt -std-compile-opts".
+-emit-llvm-bc | opt -O3".
//===---------------------------------------------------------------------===//
return (abs(x)) >= 0;
}
This should optimize to x == INT_MIN. (With -fwrapv.) Currently not
-optimized with "clang -emit-llvm-bc | opt -std-compile-opts".
+optimized with "clang -emit-llvm-bc | opt -O3".
//===---------------------------------------------------------------------===//
bar ();
}
-void neg_eq_cst(unsigned int a) {
-if (-a == 123)
-bar();
-}
-
All should simplify to a single comparison. All of these are
currently not optimized with "clang -emit-llvm-bc | opt
--std-compile-opts".
+-O3".
//===---------------------------------------------------------------------===//
From GCC Bug 32605:
int c(int* x) {return (char*)x+2 == (char*)x;}
Should combine to 0. Currently not optimized with "clang
--emit-llvm-bc | opt -std-compile-opts" (although llc can optimize it).
+-emit-llvm-bc | opt -O3" (although llc can optimize it).
//===---------------------------------------------------------------------===//
int a(unsigned b) {return ((b << 31) | (b << 30)) >> 31;}
Should be combined to "((b >> 1) | b) & 1". Currently not optimized
-with "clang -emit-llvm-bc | opt -std-compile-opts".
+with "clang -emit-llvm-bc | opt -O3".
//===---------------------------------------------------------------------===//
unsigned a(unsigned x, unsigned y) { return x | (y & 1) | (y & 2);}
Should combine to "x | (y & 3)". Currently not optimized with "clang
--emit-llvm-bc | opt -std-compile-opts".
+-emit-llvm-bc | opt -O3".
//===---------------------------------------------------------------------===//
int a(int a, int b, int c) {return (~a & c) | ((c|a) & b);}
Should fold to "(~a & c) | (a & b)". Currently not optimized with
-"clang -emit-llvm-bc | opt -std-compile-opts".
+"clang -emit-llvm-bc | opt -O3".
//===---------------------------------------------------------------------===//
int a(int a,int b) {return (~(a|b))|a;}
Should fold to "a|~b". Currently not optimized with "clang
--emit-llvm-bc | opt -std-compile-opts".
+-emit-llvm-bc | opt -O3".
//===---------------------------------------------------------------------===//
int a(int a, int b) {return (a&&b) || (a&&!b);}
Should fold to "a". Currently not optimized with "clang -emit-llvm-bc
-| opt -std-compile-opts".
+| opt -O3".
//===---------------------------------------------------------------------===//
int a(int a, int b, int c) {return (a&&b) || (!a&&c);}
Should fold to "a ? b : c", or at least something sane. Currently not
-optimized with "clang -emit-llvm-bc | opt -std-compile-opts".
+optimized with "clang -emit-llvm-bc | opt -O3".
//===---------------------------------------------------------------------===//
int a(int a, int b, int c) {return (a&&b) || (a&&c) || (a&&b&&c);}
Should fold to a && (b || c). Currently not optimized with "clang
--emit-llvm-bc | opt -std-compile-opts".
+-emit-llvm-bc | opt -O3".
//===---------------------------------------------------------------------===//
int a(int x) {return x | ((x & 8) ^ 8);}
Should combine to x | 8. Currently not optimized with "clang
--emit-llvm-bc | opt -std-compile-opts".
+-emit-llvm-bc | opt -O3".
//===---------------------------------------------------------------------===//
int a(int x) {return x ^ ((x & 8) ^ 8);}
Should also combine to x | 8. Currently not optimized with "clang
--emit-llvm-bc | opt -std-compile-opts".
+-emit-llvm-bc | opt -O3".
//===---------------------------------------------------------------------===//
int a(int x) {return ((x | -9) ^ 8) & x;}
Should combine to x & -9. Currently not optimized with "clang
--emit-llvm-bc | opt -std-compile-opts".
+-emit-llvm-bc | opt -O3".
//===---------------------------------------------------------------------===//
unsigned a(unsigned a) {return a * 0x11111111 >> 28 & 1;}
Should combine to "a * 0x88888888 >> 31". Currently not optimized
-with "clang -emit-llvm-bc | opt -std-compile-opts".
+with "clang -emit-llvm-bc | opt -O3".
//===---------------------------------------------------------------------===//
unsigned a(char* x) {if ((*x & 32) == 0) return b();}
There's an unnecessary zext in the generated code with "clang
--emit-llvm-bc | opt -std-compile-opts".
+-emit-llvm-bc | opt -O3".
//===---------------------------------------------------------------------===//
unsigned a(unsigned long long x) {return 40 * (x >> 1);}
Should combine to "20 * (((unsigned)x) & -2)". Currently not
-optimized with "clang -emit-llvm-bc | opt -std-compile-opts".
+optimized with "clang -emit-llvm-bc | opt -O3".
+
+//===---------------------------------------------------------------------===//
+
+int g(int x) { return (x - 10) < 0; }
+Should combine to "x <= 9" (the sub has nsw). Currently not
+optimized with "clang -emit-llvm-bc | opt -O3".
+
+//===---------------------------------------------------------------------===//
+
+int g(int x) { return (x + 10) < 0; }
+Should combine to "x < -10" (the add has nsw). Currently not
+optimized with "clang -emit-llvm-bc | opt -O3".
+
+//===---------------------------------------------------------------------===//
+
+int f(int i, int j) { return i < j + 1; }
+int g(int i, int j) { return j > i - 1; }
+Should combine to "i <= j" (the add/sub has nsw). Currently not
+optimized with "clang -emit-llvm-bc | opt -O3".
+
+//===---------------------------------------------------------------------===//
+
+unsigned f(unsigned x) { return ((x & 7) + 1) & 15; }
+The & 15 part should be optimized away, it doesn't change the result. Currently
+not optimized with "clang -emit-llvm-bc | opt -O3".
//===---------------------------------------------------------------------===//
GCC testsuite, ones we don't get yet are (checked through loadpre25):
[CRIT EDGE BREAKING]
-loadpre3.c predcom-4.c
+predcom-4.c
[PRE OF READONLY CALL]
loadpre5.c
..
else if (strchr ("<>", *intel_parser.op_string)
-Those should be turned into a switch.
+Those should be turned into a switch. SimplifyLibCalls only gets the second
+case.
//===---------------------------------------------------------------------===//
//===---------------------------------------------------------------------===//
-Switch lowering generates less than ideal code for the following switch:
define void @a(i32 %x) nounwind {
entry:
switch i32 %x, label %if.end [
Generated code on x86-64 (other platforms give similar results):
a:
cmpl $5, %edi
- ja .LBB0_2
- movl %edi, %eax
- movl $47, %ecx
- btq %rax, %rcx
- jb .LBB0_3
+ ja LBB2_2
+ cmpl $4, %edi
+ jne LBB2_3
.LBB0_2:
ret
.LBB0_3:
jmp foo # TAILCALL
-The movl+movl+btq+jb could be simplified to a cmpl+jne.
-
-Or, if we wanted to be really clever, we could simplify the whole thing to
+If we wanted to be really clever, we could simplify the whole thing to
something like the following, which eliminates a branch:
xorl $1, %edi
cmpl $4, %edi
//===---------------------------------------------------------------------===//
-This code (from Benchmarks/Dhrystone/dry.c):
-
-define i32 @Func1(i32, i32) nounwind readnone optsize ssp {
-entry:
- %sext = shl i32 %0, 24
- %conv = ashr i32 %sext, 24
- %sext6 = shl i32 %1, 24
- %conv4 = ashr i32 %sext6, 24
- %cmp = icmp eq i32 %conv, %conv4
- %. = select i1 %cmp, i32 10000, i32 0
- ret i32 %.
-}
-
-Should be simplified into something like:
-
-define i32 @Func1(i32, i32) nounwind readnone optsize ssp {
-entry:
- %sext = shl i32 %0, 24
- %conv = and i32 %sext, 0xFF000000
- %sext6 = shl i32 %1, 24
- %conv4 = and i32 %sext6, 0xFF000000
- %cmp = icmp eq i32 %conv, %conv4
- %. = select i1 %cmp, i32 10000, i32 0
- ret i32 %.
-}
-
-and then to:
-
-define i32 @Func1(i32, i32) nounwind readnone optsize ssp {
-entry:
- %conv = and i32 %0, 0xFF
- %conv4 = and i32 %1, 0xFF
- %cmp = icmp eq i32 %conv, %conv4
- %. = select i1 %cmp, i32 10000, i32 0
- ret i32 %.
-}
-//===---------------------------------------------------------------------===//
-
clang -O3 currently compiles this code
int g(unsigned int a) {
}
This shouldn't need the ((zext (%n - 1)) + 1) game, and it should ideally fold
-the two memset's together. The issue with %n seems to stem from poor handling
-of the original loop.
+the two memset's together.
-To simplify this, we need SCEV to know that "n != 0" because of the dominating
-conditional. That would turn the second memset into a simple memset of 'n'.
+The issue with the addition only occurs in 64-bit mode, and appears to be at
+least partially caused by Scalar Evolution not keeping its cache updated: it
+returns the "wrong" result immediately after indvars runs, but figures out the
+expected result if it is run from scratch on IR resulting from running indvars.
//===---------------------------------------------------------------------===//
//===---------------------------------------------------------------------===//
+define i1 @test1(i32 %x) nounwind {
+ %and = and i32 %x, 3
+ %cmp = icmp ult i32 %and, 2
+ ret i1 %cmp
+}
+
+Can be folded to (x & 2) == 0.
+
+define i1 @test2(i32 %x) nounwind {
+ %and = and i32 %x, 3
+ %cmp = icmp ugt i32 %and, 1
+ ret i1 %cmp
+}
+
+Can be folded to (x & 2) != 0.
+
+SimplifyDemandedBits shrinks the "and" constant to 2 but instcombine misses the
+icmp transform.
+
+//===---------------------------------------------------------------------===//
+
+This code:
+
+typedef struct {
+int f1:1;
+int f2:1;
+int f3:1;
+int f4:29;
+} t1;
+
+typedef struct {
+int f1:1;
+int f2:1;
+int f3:30;
+} t2;
+
+t1 s1;
+t2 s2;
+
+void func1(void)
+{
+s1.f1 = s2.f1;
+s1.f2 = s2.f2;
+}
+
+Compiles into this IR (on x86-64 at least):
+
+%struct.t1 = type { i8, [3 x i8] }
+@s2 = global %struct.t1 zeroinitializer, align 4
+@s1 = global %struct.t1 zeroinitializer, align 4
+define void @func1() nounwind ssp noredzone {
+entry:
+ %0 = load i32* bitcast (%struct.t1* @s2 to i32*), align 4
+ %bf.val.sext5 = and i32 %0, 1
+ %1 = load i32* bitcast (%struct.t1* @s1 to i32*), align 4
+ %2 = and i32 %1, -4
+ %3 = or i32 %2, %bf.val.sext5
+ %bf.val.sext26 = and i32 %0, 2
+ %4 = or i32 %3, %bf.val.sext26
+ store i32 %4, i32* bitcast (%struct.t1* @s1 to i32*), align 4
+ ret void
+}
+
+The two or/and's should be merged into one each.
+
+//===---------------------------------------------------------------------===//
+
+Machine level code hoisting can be useful in some cases. For example, PR9408
+is about:
+
+typedef union {
+ void (*f1)(int);
+ void (*f2)(long);
+} funcs;
+
+void foo(funcs f, int which) {
+ int a = 5;
+ if (which) {
+ f.f1(a);
+ } else {
+ f.f2(a);
+ }
+}
+
+which we compile to:
+
+foo: # @foo
+# BB#0: # %entry
+ pushq %rbp
+ movq %rsp, %rbp
+ testl %esi, %esi
+ movq %rdi, %rax
+ je .LBB0_2
+# BB#1: # %if.then
+ movl $5, %edi
+ callq *%rax
+ popq %rbp
+ ret
+.LBB0_2: # %if.else
+ movl $5, %edi
+ callq *%rax
+ popq %rbp
+ ret
+
+Note that bb1 and bb2 are the same. This doesn't happen at the IR level
+because one call is passing an i32 and the other is passing an i64.
+//===---------------------------------------------------------------------===//
+
+I see this sort of pattern in 176.gcc in a few places (e.g. the start of
+store_bit_field). The rem should be replaced with a multiply and subtract:
+
+ %3 = sdiv i32 %A, %B
+ %4 = srem i32 %A, %B
+
+Similarly for udiv/urem. Note that this shouldn't be done on X86 or ARM,
+which can do this in a single operation (instruction or libcall). It is
+probably best to do this in the code generator.
+
+//===---------------------------------------------------------------------===//
+
+unsigned foo(unsigned x, unsigned y) { return (x & y) == 0 || x == 0; }
+should fold to (x & y) == 0.
+
+//===---------------------------------------------------------------------===//
+
+unsigned foo(unsigned x, unsigned y) { return x > y && x != 0; }
+should fold to x > y.
+
+//===---------------------------------------------------------------------===//