+//===---------------------------------------------------------------------===//
+
+We should use DSE + llvm.lifetime.end to delete dead vtable pointer updates.
+See GCC PR34949
+
+Another interesting case is that something related could be used for variables
+that go const after their ctor has finished. In these cases, globalopt (which
+can statically run the constructor) could mark the global const (so it gets put
+in the readonly section). A testcase would be:
+
+#include <complex>
+using namespace std;
+const complex<char> should_be_in_rodata (42,-42);
+complex<char> should_be_in_data (42,-42);
+complex<char> should_be_in_bss;
+
+Where we currently evaluate the ctors but the globals don't become const because
+the optimizer doesn't know they "become const" after the ctor is done. See
+GCC PR4131 for more examples.
+
+//===---------------------------------------------------------------------===//
+
+In this code:
+
+long foo(long x) {
+ return x > 1 ? x : 1;
+}
+
+LLVM emits a comparison with 1 instead of 0. 0 would be equivalent
+and cheaper on most targets.
+
+LLVM prefers comparisons with zero over non-zero in general, but in this
+case it choses instead to keep the max operation obvious.
+
+//===---------------------------------------------------------------------===//
+
+define void @a(i32 %x) nounwind {
+entry:
+ switch i32 %x, label %if.end [
+ i32 0, label %if.then
+ i32 1, label %if.then
+ i32 2, label %if.then
+ i32 3, label %if.then
+ i32 5, label %if.then
+ ]
+if.then:
+ tail call void @foo() nounwind
+ ret void
+if.end:
+ ret void
+}
+declare void @foo()
+
+Generated code on x86-64 (other platforms give similar results):
+a:
+ cmpl $5, %edi
+ ja LBB2_2
+ cmpl $4, %edi
+ jne LBB2_3
+.LBB0_2:
+ ret
+.LBB0_3:
+ jmp foo # TAILCALL
+
+If we wanted to be really clever, we could simplify the whole thing to
+something like the following, which eliminates a branch:
+ xorl $1, %edi
+ cmpl $4, %edi
+ ja .LBB0_2
+ ret
+.LBB0_2:
+ jmp foo # TAILCALL
+
+//===---------------------------------------------------------------------===//
+
+We compile this:
+
+int foo(int a) { return (a & (~15)) / 16; }
+
+Into:
+
+define i32 @foo(i32 %a) nounwind readnone ssp {
+entry:
+ %and = and i32 %a, -16
+ %div = sdiv i32 %and, 16
+ ret i32 %div
+}
+
+but this code (X & -A)/A is X >> log2(A) when A is a power of 2, so this case
+should be instcombined into just "a >> 4".
+
+We do get this at the codegen level, so something knows about it, but
+instcombine should catch it earlier:
+
+_foo: ## @foo
+## BB#0: ## %entry
+ movl %edi, %eax
+ sarl $4, %eax
+ ret
+
+//===---------------------------------------------------------------------===//
+
+This code (from GCC PR28685):
+
+int test(int a, int b) {
+ int lt = a < b;
+ int eq = a == b;
+ if (lt)
+ return 1;
+ return eq;
+}
+
+Is compiled to:
+
+define i32 @test(i32 %a, i32 %b) nounwind readnone ssp {
+entry:
+ %cmp = icmp slt i32 %a, %b
+ br i1 %cmp, label %return, label %if.end
+
+if.end: ; preds = %entry
+ %cmp5 = icmp eq i32 %a, %b
+ %conv6 = zext i1 %cmp5 to i32
+ ret i32 %conv6
+
+return: ; preds = %entry
+ ret i32 1
+}
+
+it could be:
+
+define i32 @test__(i32 %a, i32 %b) nounwind readnone ssp {
+entry:
+ %0 = icmp sle i32 %a, %b
+ %retval = zext i1 %0 to i32
+ ret i32 %retval
+}
+
+//===---------------------------------------------------------------------===//
+
+This code can be seen in viterbi:
+
+ %64 = call noalias i8* @malloc(i64 %62) nounwind
+...
+ %67 = call i64 @llvm.objectsize.i64(i8* %64, i1 false) nounwind
+ %68 = call i8* @__memset_chk(i8* %64, i32 0, i64 %62, i64 %67) nounwind
+
+llvm.objectsize.i64 should be taught about malloc/calloc, allowing it to
+fold to %62. This is a security win (overflows of malloc will get caught)
+and also a performance win by exposing more memsets to the optimizer.
+
+This occurs several times in viterbi.
+
+Note that this would change the semantics of @llvm.objectsize which by its
+current definition always folds to a constant. We also should make sure that
+we remove checking in code like
+
+ char *p = malloc(strlen(s)+1);
+ __strcpy_chk(p, s, __builtin_objectsize(p, 0));
+
+//===---------------------------------------------------------------------===//
+
+clang -O3 currently compiles this code
+
+int g(unsigned int a) {
+ unsigned int c[100];
+ c[10] = a;
+ c[11] = a;
+ unsigned int b = c[10] + c[11];
+ if(b > a*2) a = 4;
+ else a = 8;
+ return a + 7;
+}
+
+into
+
+define i32 @g(i32 a) nounwind readnone {
+ %add = shl i32 %a, 1
+ %mul = shl i32 %a, 1
+ %cmp = icmp ugt i32 %add, %mul
+ %a.addr.0 = select i1 %cmp, i32 11, i32 15
+ ret i32 %a.addr.0
+}
+
+The icmp should fold to false. This CSE opportunity is only available
+after GVN and InstCombine have run.
+
+//===---------------------------------------------------------------------===//
+
+memcpyopt should turn this:
+
+define i8* @test10(i32 %x) {
+ %alloc = call noalias i8* @malloc(i32 %x) nounwind
+ call void @llvm.memset.p0i8.i32(i8* %alloc, i8 0, i32 %x, i32 1, i1 false)
+ ret i8* %alloc
+}
+
+into a call to calloc. We should make sure that we analyze calloc as
+aggressively as malloc though.
+
+//===---------------------------------------------------------------------===//
+
+clang -O3 doesn't optimize this:
+
+void f1(int* begin, int* end) {
+ std::fill(begin, end, 0);
+}
+
+into a memset. This is PR8942.
+
+//===---------------------------------------------------------------------===//
+
+clang -O3 -fno-exceptions currently compiles this code:
+
+void f(int N) {
+ std::vector<int> v(N);
+
+ extern void sink(void*); sink(&v);
+}
+
+into
+
+define void @_Z1fi(i32 %N) nounwind {
+entry:
+ %v2 = alloca [3 x i32*], align 8
+ %v2.sub = getelementptr inbounds [3 x i32*]* %v2, i64 0, i64 0
+ %tmpcast = bitcast [3 x i32*]* %v2 to %"class.std::vector"*
+ %conv = sext i32 %N to i64
+ store i32* null, i32** %v2.sub, align 8, !tbaa !0
+ %tmp3.i.i.i.i.i = getelementptr inbounds [3 x i32*]* %v2, i64 0, i64 1
+ store i32* null, i32** %tmp3.i.i.i.i.i, align 8, !tbaa !0
+ %tmp4.i.i.i.i.i = getelementptr inbounds [3 x i32*]* %v2, i64 0, i64 2
+ store i32* null, i32** %tmp4.i.i.i.i.i, align 8, !tbaa !0
+ %cmp.i.i.i.i = icmp eq i32 %N, 0
+ br i1 %cmp.i.i.i.i, label %_ZNSt12_Vector_baseIiSaIiEEC2EmRKS0_.exit.thread.i.i, label %cond.true.i.i.i.i
+
+_ZNSt12_Vector_baseIiSaIiEEC2EmRKS0_.exit.thread.i.i: ; preds = %entry
+ store i32* null, i32** %v2.sub, align 8, !tbaa !0
+ store i32* null, i32** %tmp3.i.i.i.i.i, align 8, !tbaa !0
+ %add.ptr.i5.i.i = getelementptr inbounds i32* null, i64 %conv
+ store i32* %add.ptr.i5.i.i, i32** %tmp4.i.i.i.i.i, align 8, !tbaa !0
+ br label %_ZNSt6vectorIiSaIiEEC1EmRKiRKS0_.exit
+
+cond.true.i.i.i.i: ; preds = %entry
+ %cmp.i.i.i.i.i = icmp slt i32 %N, 0
+ br i1 %cmp.i.i.i.i.i, label %if.then.i.i.i.i.i, label %_ZNSt12_Vector_baseIiSaIiEEC2EmRKS0_.exit.i.i
+
+if.then.i.i.i.i.i: ; preds = %cond.true.i.i.i.i
+ call void @_ZSt17__throw_bad_allocv() noreturn nounwind
+ unreachable
+
+_ZNSt12_Vector_baseIiSaIiEEC2EmRKS0_.exit.i.i: ; preds = %cond.true.i.i.i.i
+ %mul.i.i.i.i.i = shl i64 %conv, 2
+ %call3.i.i.i.i.i = call noalias i8* @_Znwm(i64 %mul.i.i.i.i.i) nounwind
+ %0 = bitcast i8* %call3.i.i.i.i.i to i32*
+ store i32* %0, i32** %v2.sub, align 8, !tbaa !0
+ store i32* %0, i32** %tmp3.i.i.i.i.i, align 8, !tbaa !0
+ %add.ptr.i.i.i = getelementptr inbounds i32* %0, i64 %conv
+ store i32* %add.ptr.i.i.i, i32** %tmp4.i.i.i.i.i, align 8, !tbaa !0
+ call void @llvm.memset.p0i8.i64(i8* %call3.i.i.i.i.i, i8 0, i64 %mul.i.i.i.i.i, i32 4, i1 false)
+ br label %_ZNSt6vectorIiSaIiEEC1EmRKiRKS0_.exit
+
+This is just the handling the construction of the vector. Most surprising here
+is the fact that all three null stores in %entry are dead (because we do no
+cross-block DSE).
+
+Also surprising is that %conv isn't simplified to 0 in %....exit.thread.i.i.
+This is a because the client of LazyValueInfo doesn't simplify all instruction
+operands, just selected ones.
+
+//===---------------------------------------------------------------------===//
+
+clang -O3 -fno-exceptions currently compiles this code:
+
+void f(char* a, int n) {
+ __builtin_memset(a, 0, n);
+ for (int i = 0; i < n; ++i)
+ a[i] = 0;
+}
+
+into:
+
+define void @_Z1fPci(i8* nocapture %a, i32 %n) nounwind {
+entry:
+ %conv = sext i32 %n to i64
+ tail call void @llvm.memset.p0i8.i64(i8* %a, i8 0, i64 %conv, i32 1, i1 false)
+ %cmp8 = icmp sgt i32 %n, 0
+ br i1 %cmp8, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph: ; preds = %entry
+ %tmp10 = add i32 %n, -1
+ %tmp11 = zext i32 %tmp10 to i64
+ %tmp12 = add i64 %tmp11, 1
+ call void @llvm.memset.p0i8.i64(i8* %a, i8 0, i64 %tmp12, i32 1, i1 false)
+ ret void
+
+for.end: ; preds = %entry
+ ret void
+}
+
+This shouldn't need the ((zext (%n - 1)) + 1) game, and it should ideally fold
+the two memset's together.
+
+The issue with the addition only occurs in 64-bit mode, and appears to be at
+least partially caused by Scalar Evolution not keeping its cache updated: it
+returns the "wrong" result immediately after indvars runs, but figures out the
+expected result if it is run from scratch on IR resulting from running indvars.
+
+//===---------------------------------------------------------------------===//
+
+clang -O3 -fno-exceptions currently compiles this code:
+
+struct S {
+ unsigned short m1, m2;
+ unsigned char m3, m4;
+};
+
+void f(int N) {
+ std::vector<S> v(N);
+ extern void sink(void*); sink(&v);
+}
+
+into poor code for zero-initializing 'v' when N is >0. The problem is that
+S is only 6 bytes, but each element is 8 byte-aligned. We generate a loop and
+4 stores on each iteration. If the struct were 8 bytes, this gets turned into
+a memset.
+
+In order to handle this we have to:
+ A) Teach clang to generate metadata for memsets of structs that have holes in
+ them.
+ B) Teach clang to use such a memset for zero init of this struct (since it has
+ a hole), instead of doing elementwise zeroing.
+
+//===---------------------------------------------------------------------===//
+
+clang -O3 currently compiles this code:
+
+extern const int magic;
+double f() { return 0.0 * magic; }
+
+into
+
+@magic = external constant i32
+
+define double @_Z1fv() nounwind readnone {
+entry:
+ %tmp = load i32* @magic, align 4, !tbaa !0
+ %conv = sitofp i32 %tmp to double
+ %mul = fmul double %conv, 0.000000e+00
+ ret double %mul
+}
+
+We should be able to fold away this fmul to 0.0. More generally, fmul(x,0.0)
+can be folded to 0.0 if we can prove that the LHS is not -0.0, not a NaN, and
+not an INF. The CannotBeNegativeZero predicate in value tracking should be
+extended to support general "fpclassify" operations that can return
+yes/no/unknown for each of these predicates.
+
+In this predicate, we know that uitofp is trivially never NaN or -0.0, and
+we know that it isn't +/-Inf if the floating point type has enough exponent bits
+to represent the largest integer value as < inf.
+
+//===---------------------------------------------------------------------===//
+
+When optimizing a transformation that can change the sign of 0.0 (such as the
+0.0*val -> 0.0 transformation above), it might be provable that the sign of the
+expression doesn't matter. For example, by the above rules, we can't transform
+fmul(sitofp(x), 0.0) into 0.0, because x might be -1 and the result of the
+expression is defined to be -0.0.
+
+If we look at the uses of the fmul for example, we might be able to prove that
+all uses don't care about the sign of zero. For example, if we have:
+
+ fadd(fmul(sitofp(x), 0.0), 2.0)
+
+Since we know that x+2.0 doesn't care about the sign of any zeros in X, we can
+transform the fmul to 0.0, and then the fadd to 2.0.
+
+//===---------------------------------------------------------------------===//
+
+We should enhance memcpy/memcpy/memset to allow a metadata node on them
+indicating that some bytes of the transfer are undefined. This is useful for
+frontends like clang when lowering struct copies, when some elements of the
+struct are undefined. Consider something like this:
+
+struct x {
+ char a;
+ int b[4];
+};
+void foo(struct x*P);
+struct x testfunc() {
+ struct x V1, V2;
+ foo(&V1);
+ V2 = V1;
+
+ return V2;
+}
+
+We currently compile this to:
+$ clang t.c -S -o - -O0 -emit-llvm | opt -scalarrepl -S
+
+
+%struct.x = type { i8, [4 x i32] }
+
+define void @testfunc(%struct.x* sret %agg.result) nounwind ssp {
+entry:
+ %V1 = alloca %struct.x, align 4
+ call void @foo(%struct.x* %V1)
+ %tmp1 = bitcast %struct.x* %V1 to i8*
+ %0 = bitcast %struct.x* %V1 to i160*
+ %srcval1 = load i160* %0, align 4
+ %tmp2 = bitcast %struct.x* %agg.result to i8*
+ %1 = bitcast %struct.x* %agg.result to i160*
+ store i160 %srcval1, i160* %1, align 4
+ ret void
+}
+
+This happens because SRoA sees that the temp alloca has is being memcpy'd into
+and out of and it has holes and it has to be conservative. If we knew about the
+holes, then this could be much much better.
+
+Having information about these holes would also improve memcpy (etc) lowering at
+llc time when it gets inlined, because we can use smaller transfers. This also
+avoids partial register stalls in some important cases.
+
+//===---------------------------------------------------------------------===//
+
+We don't fold (icmp (add) (add)) unless the two adds only have a single use.
+There are a lot of cases that we're refusing to fold in (e.g.) 256.bzip2, for
+example:
+
+ %indvar.next90 = add i64 %indvar89, 1 ;; Has 2 uses
+ %tmp96 = add i64 %tmp95, 1 ;; Has 1 use
+ %exitcond97 = icmp eq i64 %indvar.next90, %tmp96
+
+We don't fold this because we don't want to introduce an overlapped live range
+of the ivar. However if we can make this more aggressive without causing
+performance issues in two ways:
+
+1. If *either* the LHS or RHS has a single use, we can definitely do the
+ transformation. In the overlapping liverange case we're trading one register
+ use for one fewer operation, which is a reasonable trade. Before doing this
+ we should verify that the llc output actually shrinks for some benchmarks.
+2. If both ops have multiple uses, we can still fold it if the operations are
+ both sinkable to *after* the icmp (e.g. in a subsequent block) which doesn't
+ increase register pressure.
+
+There are a ton of icmp's we aren't simplifying because of the reg pressure
+concern. Care is warranted here though because many of these are induction
+variables and other cases that matter a lot to performance, like the above.
+Here's a blob of code that you can drop into the bottom of visitICmp to see some
+missed cases:
+
+ { Value *A, *B, *C, *D;
+ if (match(Op0, m_Add(m_Value(A), m_Value(B))) &&
+ match(Op1, m_Add(m_Value(C), m_Value(D))) &&
+ (A == C || A == D || B == C || B == D)) {
+ errs() << "OP0 = " << *Op0 << " U=" << Op0->getNumUses() << "\n";
+ errs() << "OP1 = " << *Op1 << " U=" << Op1->getNumUses() << "\n";
+ errs() << "CMP = " << I << "\n\n";
+ }
+ }
+
+//===---------------------------------------------------------------------===//
+
+define i1 @test1(i32 %x) nounwind {
+ %and = and i32 %x, 3
+ %cmp = icmp ult i32 %and, 2
+ ret i1 %cmp
+}
+
+Can be folded to (x & 2) == 0.
+
+define i1 @test2(i32 %x) nounwind {
+ %and = and i32 %x, 3
+ %cmp = icmp ugt i32 %and, 1
+ ret i1 %cmp
+}
+
+Can be folded to (x & 2) != 0.
+
+SimplifyDemandedBits shrinks the "and" constant to 2 but instcombine misses the
+icmp transform.
+
+//===---------------------------------------------------------------------===//
+
+This code:
+
+typedef struct {
+int f1:1;
+int f2:1;
+int f3:1;
+int f4:29;
+} t1;
+
+typedef struct {
+int f1:1;
+int f2:1;
+int f3:30;
+} t2;
+
+t1 s1;
+t2 s2;
+
+void func1(void)
+{
+s1.f1 = s2.f1;
+s1.f2 = s2.f2;
+}
+
+Compiles into this IR (on x86-64 at least):
+
+%struct.t1 = type { i8, [3 x i8] }
+@s2 = global %struct.t1 zeroinitializer, align 4
+@s1 = global %struct.t1 zeroinitializer, align 4
+define void @func1() nounwind ssp noredzone {
+entry:
+ %0 = load i32* bitcast (%struct.t1* @s2 to i32*), align 4
+ %bf.val.sext5 = and i32 %0, 1
+ %1 = load i32* bitcast (%struct.t1* @s1 to i32*), align 4
+ %2 = and i32 %1, -4
+ %3 = or i32 %2, %bf.val.sext5
+ %bf.val.sext26 = and i32 %0, 2
+ %4 = or i32 %3, %bf.val.sext26
+ store i32 %4, i32* bitcast (%struct.t1* @s1 to i32*), align 4
+ ret void
+}
+
+The two or/and's should be merged into one each.
+
+//===---------------------------------------------------------------------===//
+
+Machine level code hoisting can be useful in some cases. For example, PR9408
+is about:
+
+typedef union {
+ void (*f1)(int);
+ void (*f2)(long);
+} funcs;
+
+void foo(funcs f, int which) {
+ int a = 5;
+ if (which) {
+ f.f1(a);
+ } else {
+ f.f2(a);
+ }
+}
+
+which we compile to:
+
+foo: # @foo
+# BB#0: # %entry
+ pushq %rbp
+ movq %rsp, %rbp
+ testl %esi, %esi
+ movq %rdi, %rax
+ je .LBB0_2
+# BB#1: # %if.then
+ movl $5, %edi
+ callq *%rax
+ popq %rbp
+ ret
+.LBB0_2: # %if.else
+ movl $5, %edi
+ callq *%rax
+ popq %rbp
+ ret
+
+Note that bb1 and bb2 are the same. This doesn't happen at the IR level
+because one call is passing an i32 and the other is passing an i64.
+
+//===---------------------------------------------------------------------===//
+
+I see this sort of pattern in 176.gcc in a few places (e.g. the start of
+store_bit_field). The rem should be replaced with a multiply and subtract:
+
+ %3 = sdiv i32 %A, %B
+ %4 = srem i32 %A, %B
+
+Similarly for udiv/urem. Note that this shouldn't be done on X86 or ARM,
+which can do this in a single operation (instruction or libcall). It is
+probably best to do this in the code generator.
+
+//===---------------------------------------------------------------------===//
+
+unsigned foo(unsigned x, unsigned y) { return (x & y) == 0 || x == 0; }
+should fold to (x & y) == 0.
+
+//===---------------------------------------------------------------------===//
+
+unsigned foo(unsigned x, unsigned y) { return x > y && x != 0; }
+should fold to x > y.