//===---------------------------------------------------------------------===//
-We should add 'unaligned load/store' nodes, and produce them from code like
-this:
+We should produce an unaligned load from code like this:
v4sf example(float *P) {
return (v4sf){P[0], P[1], P[2], P[3] };
//===---------------------------------------------------------------------===//
-Scalar Repl cannot currently promote this testcase to 'ret long cst':
-
- %struct.X = type { i32, i32 }
- %struct.Y = type { %struct.X }
-
-define i64 @bar() {
- %retval = alloca %struct.Y, align 8
- %tmp12 = getelementptr %struct.Y* %retval, i32 0, i32 0, i32 0
- store i32 0, i32* %tmp12
- %tmp15 = getelementptr %struct.Y* %retval, i32 0, i32 0, i32 1
- store i32 1, i32* %tmp15
- %retval.upgrd.1 = bitcast %struct.Y* %retval to i64*
- %retval.upgrd.2 = load i64* %retval.upgrd.1
- ret i64 %retval.upgrd.2
-}
-
-it should be extended to do so.
-
-//===---------------------------------------------------------------------===//
-
--scalarrepl should promote this to be a vector scalar.
-
- %struct..0anon = type { <4 x float> }
-
-define void @test1(<4 x float> %V, float* %P) {
- %u = alloca %struct..0anon, align 16
- %tmp = getelementptr %struct..0anon* %u, i32 0, i32 0
- store <4 x float> %V, <4 x float>* %tmp
- %tmp1 = bitcast %struct..0anon* %u to [4 x float]*
- %tmp.upgrd.1 = getelementptr [4 x float]* %tmp1, i32 0, i32 1
- %tmp.upgrd.2 = load float* %tmp.upgrd.1
- %tmp3 = mul float %tmp.upgrd.2, 2.000000e+00
- store float %tmp3, float* %P
- ret void
-}
-
-//===---------------------------------------------------------------------===//
-
Turn this into a single byte store with no load (the other 3 bytes are
unmodified):
-void %test(uint* %P) {
- %tmp = load uint* %P
- %tmp14 = or uint %tmp, 3305111552
- %tmp15 = and uint %tmp14, 3321888767
- store uint %tmp15, uint* %P
+define void @test(i32* %P) {
+ %tmp = load i32* %P
+ %tmp14 = or i32 %tmp, 3305111552
+ %tmp15 = and i32 %tmp14, 3321888767
+ store i32 %tmp15, i32* %P
ret void
}
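+
+At the C level the or/and pair just forces the top byte to 0xC5, so this is
+(a sketch, assuming a little-endian target where the top byte is at offset 3):
+
+  void test(unsigned *P) {
+    ((unsigned char *)P)[3] = 0xC5;  /* other 3 bytes untouched */
+  }
+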
//===---------------------------------------------------------------------===//
-Legalize should lower ctlz like this:
- ctlz(x) = popcnt((x-1) & ~x)
-
-on targets that have popcnt but not ctlz. itanium, what else?
-
-//===---------------------------------------------------------------------===//
-
quantum_sigma_x in 462.libquantum contains the following loop:
for(i=0; i<reg->size; i++)
//===---------------------------------------------------------------------===//
-Promote for i32 bswap can use i64 bswap + shr. Useful on targets with 64-bit
-regs and bswap, like itanium.
-
-//===---------------------------------------------------------------------===//
-
LSR should know what GPR types a target has. This code:
volatile short X, Y; // globals
//===---------------------------------------------------------------------===//
-We should extend parameter attributes to capture more information about
-pointer parameters for alias analysis. Some ideas:
-
-1. Add a "nocapture" attribute, which indicates that the callee does not store
- the address of the parameter into a global or any other memory location
- visible to the callee. This can be used to make basicaa and other analyses
- more powerful. It is true for things like memcpy, strcat, and many other
- things, including structs passed by value, most C++ references, etc.
-2. Generalize readonly to be set on parameters. This is important mod/ref
- info for the function, which is important for basicaa and others. It can
- also be used by the inliner to avoid inserting a memcpy for byval
- arguments when the function is inlined.
-
-These functions can be inferred by various analysis passes such as the
-globalsmodrefaa pass. Note that getting #2 right is actually really tricky.
-Consider this code:
-
-struct S; S G;
-void caller(S byvalarg) { G.field = 1; ... }
-void callee() { caller(G); }
-
-The fact that the caller does not modify byval arg is not enough, we need
-to know that it doesn't modify G either. This is very tricky.
-
-//===---------------------------------------------------------------------===//
-
We should add an FRINT node to the DAG to model targets that have legal
implementations of ceil/floor/rint.
//===---------------------------------------------------------------------===//
-We should be able to evaluate this loop:
-
-int test(int x_offs) {
- while (x_offs > 4)
- x_offs -= 4;
- return x_offs;
-}
-
-//===---------------------------------------------------------------------===//
-
Reassociate should turn things like:
int factorial(int X) {
//===---------------------------------------------------------------------===//
-SROA is not promoting the union on the stack in this example, we should end
-up with no allocas.
-
-union vec2d {
- double e[2];
- double v __attribute__((vector_size(16)));
-};
-typedef union vec2d vec2d;
-
-static vec2d a={{1,2}}, b={{3,4}};
-
-vec2d foo () {
- return (vec2d){ .v = a.v + b.v * (vec2d){{5,5}}.v };
-}
-
-//===---------------------------------------------------------------------===//
-
Better mod/ref analysis for scanf would allow us to eliminate the vtable and a
bunch of other stuff from this example (see PR1604):
456.hmmer apparently uses strcspn and strspn a lot. 471.omnetpp uses strspn.
//===---------------------------------------------------------------------===//
+
+"gas" uses this idiom:
+ else if (strchr ("+-/*%|&^:[]()~", *intel_parser.op_string))
+..
+  else if (strchr ("<>", *intel_parser.op_string))
+
+Those should be turned into a switch.
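+
+A minimal sketch of the target form (is_op_char is a hypothetical helper, not
+a gas function):
+
+  static int is_op_char(int c) {
+    switch (c) {    /* one branch/table lookup instead of a strchr scan */
+    case '+': case '-': case '/': case '*': case '%': case '|': case '&':
+    case '^': case ':': case '[': case ']': case '(': case ')': case '~':
+      return 1;
+    default:
+      return 0;
+    }
+  }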
+
+//===---------------------------------------------------------------------===//
+
+252.eon contains this interesting code:
+
+ %3072 = getelementptr [100 x i8]* %tempString, i32 0, i32 0
+ %3073 = call i8* @strcpy(i8* %3072, i8* %3071) nounwind
+ %strlen = call i32 @strlen(i8* %3072) ; uses = 1
+ %endptr = getelementptr [100 x i8]* %tempString, i32 0, i32 %strlen
+ call void @llvm.memcpy.i32(i8* %endptr,
+ i8* getelementptr ([5 x i8]* @"\01LC42", i32 0, i32 0), i32 5, i32 1)
+ %3074 = call i32 @strlen(i8* %endptr) nounwind readonly
+
+This is interesting for a couple of reasons. First, in this:
+
+ %3073 = call i8* @strcpy(i8* %3072, i8* %3071) nounwind
+ %strlen = call i32 @strlen(i8* %3072)
+
+If the strcpy were first rewritten as stpcpy, which returns a pointer to the
+end of the string (strcpy itself returns its first argument), the strlen could
+be replaced with: %strlen = sub %3073, %3072. Based on that, the endptr GEP
+just becomes equal to %3073, which eliminates a strlen call and a GEP.
+
+Second, the strlen that follows the memcpy can be replaced with:
+
+  %3074 = call i32 @strlen(i8* getelementptr ([5 x i8]* @"\01LC42",
+            i32 0, i32 0)) nounwind readonly
+
+because the constant string was just copied into the buffer at %endptr. This,
+in turn, can be constant folded to "4".
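+
+At the C level the whole sequence amounts to something like this (a sketch;
+SUFFIX stands for the unknown 5-byte constant @"\01LC42", and stpcpy for the
+end-pointer-returning copy):
+
+  char *end = stpcpy(tempString, src);   /* end points at the nul */
+  memcpy(end, SUFFIX, 5);                /* append the constant + nul */
+  size_t n = strlen(end);                /* constant folds to 4 */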
+
+In other code, it contains:
+
+ %endptr6978 = bitcast i8* %endptr69 to i32*
+ store i32 7107374, i32* %endptr6978, align 1
+ %3167 = call i32 @strlen(i8* %endptr69) nounwind readonly
+
+Which could also be constant folded. Whatever is producing this should probably
+be fixed to leave this as a memcpy from a string.
+
+Further, eon also has an interesting partially redundant strlen call:
+
+bb8: ; preds = %_ZN18eonImageCalculatorC1Ev.exit
+ %682 = getelementptr i8** %argv, i32 6 ; <i8**> [#uses=2]
+ %683 = load i8** %682, align 4 ; <i8*> [#uses=4]
+ %684 = load i8* %683, align 1 ; <i8> [#uses=1]
+ %685 = icmp eq i8 %684, 0 ; <i1> [#uses=1]
+ br i1 %685, label %bb10, label %bb9
+
+bb9: ; preds = %bb8
+ %686 = call i32 @strlen(i8* %683) nounwind readonly
+ %687 = icmp ugt i32 %686, 254 ; <i1> [#uses=1]
+ br i1 %687, label %bb10, label %bb11
+
+bb10: ; preds = %bb9, %bb8
+ %688 = call i32 @strlen(i8* %683) nounwind readonly
+
+This could be eliminated by doing the strlen once in bb8, saving code size and
+improving perf on the bb8->9->10 path.
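+
+A sketch of the hoisted form (s stands for the pointer loaded from %682):
+
+  size_t len = strlen(s);              /* done once, in bb8 */
+  if (s[0] == 0 || len > 254) {
+    /* bb10: reuse len instead of calling strlen again */
+  }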
+
+//===---------------------------------------------------------------------===//
+
+I see an interesting partially redundant call to strlen left in
+186.crafty:InputMove which looks like:
+ %movetext11 = getelementptr [128 x i8]* %movetext, i32 0, i32 0
+
+
+bb62: ; preds = %bb55, %bb53
+ %promote.0 = phi i32 [ %169, %bb55 ], [ 0, %bb53 ]
+ %171 = call i32 @strlen(i8* %movetext11) nounwind readonly align 1
+ %172 = add i32 %171, -1 ; <i32> [#uses=1]
+ %173 = getelementptr [128 x i8]* %movetext, i32 0, i32 %172
+
+... no stores ...
+ br i1 %or.cond, label %bb65, label %bb72
+
+bb65: ; preds = %bb62
+ store i8 0, i8* %173, align 1
+ br label %bb72
+
+bb72: ; preds = %bb65, %bb62
+ %trank.1 = phi i32 [ %176, %bb65 ], [ -1, %bb62 ]
+ %177 = call i32 @strlen(i8* %movetext11) nounwind readonly align 1
+
+Note that the %177 strlen call is partially redundant: on the bb62->bb72 path
+it is fully redundant with the %171 call. At worst, we could shove the %177
+strlen call up into the bb65 block, moving it out of the bb62->bb72 path.
+However, note that bb65 stores to the string, zeroing out the last byte. This
+means that on that path the value of %177 is actually just %171-1. A sub is
+cheaper than a strlen!
+
+This pattern repeats several times, basically doing:
+
+  A = strlen(P);
+  P[A-1] = 0;
+  B = strlen(P);
+
+where it is "obvious" that B = A-1.
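+
+The second call should therefore be rewritten as a subtraction:
+
+  A = strlen(P);
+  P[A-1] = 0;
+  B = A - 1;     /* no second strlen call */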
+
+//===---------------------------------------------------------------------===//
+
+186.crafty contains this interesting pattern:
+
+%77 = call i8* @strstr(i8* getelementptr ([6 x i8]* @"\01LC5", i32 0, i32 0),
+ i8* %30)
+%phitmp648 = icmp eq i8* %77, getelementptr ([6 x i8]* @"\01LC5", i32 0, i32 0)
+br i1 %phitmp648, label %bb70, label %bb76
+
+bb70: ; preds = %OptionMatch.exit91, %bb69
+ %78 = call i32 @strlen(i8* %30) nounwind readonly align 1 ; <i32> [#uses=1]
+
+This is basically:
+ cststr = "abcdef";
+ if (strstr(cststr, P) == cststr) {
+ x = strlen(P);
+ ...
+
+The strstr call would be significantly cheaper written as:
+
+  cststr = "abcdef";
+  if (memcmp(P, cststr, strlen(P)) == 0)
+    x = strlen(P);
+
+This is memcmp+strlen instead of strstr. This also makes the strlen fully
+redundant.
+
+//===---------------------------------------------------------------------===//
+
+186.crafty also contains this code:
+
+%1906 = call i32 @strlen(i8* getelementptr ([32 x i8]* @pgn_event, i32 0,i32 0))
+%1907 = getelementptr [32 x i8]* @pgn_event, i32 0, i32 %1906
+%1908 = call i8* @strcpy(i8* %1907, i8* %1905) nounwind align 1
+%1909 = call i32 @strlen(i8* getelementptr ([32 x i8]* @pgn_event, i32 0,i32 0))
+%1910 = getelementptr [32 x i8]* @pgn_event, i32 0, i32 %1909
+
+If the strcpy were rewritten as stpcpy (returning a pointer to the end of the
+string), the last strlen would be computable as %1908-@pgn_event, which means
+%1910 = %1908.
+
+//===---------------------------------------------------------------------===//
+
+186.crafty has this interesting pattern with the "out.4543" variable:
+
+call void @llvm.memcpy.i32(
+ i8* getelementptr ([10 x i8]* @out.4543, i32 0, i32 0),
+ i8* getelementptr ([7 x i8]* @"\01LC28700", i32 0, i32 0), i32 7, i32 1)
+%101 = call i32 @printf(i8* ... @out.4543, i32 0, i32 0)) nounwind
+
+It is basically doing:
+
+ memcpy(globalarray, "string");
+ printf(..., globalarray);
+
+By knowing that printf just reads the memory, we can forward substitute the
+string directly into the printf call, eliminating the reads from globalarray.
+Since this pattern occurs frequently in crafty (due to "DisplayTime" and
+other similar functions), there are many stores to "out". Once all the printfs
+stop using "out", all that is left is the memcpys into it. This should allow
+globalopt to remove the "stored only" global.
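+
+i.e., after forward substitution the printf call is simply:
+
+  printf(..., "string");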
+
+//===---------------------------------------------------------------------===//
+
+This code:
+
+define inreg i32 @foo(i8* inreg %p) nounwind {
+ %tmp0 = load i8* %p
+ %tmp1 = ashr i8 %tmp0, 5
+ %tmp2 = sext i8 %tmp1 to i32
+ ret i32 %tmp2
+}
+
+could be dagcombine'd to a sign-extending load with a shift.
+For example, on x86 this currently compiles to:
+
+ movb (%eax), %al
+ sarb $5, %al
+ movsbl %al, %eax
+
+while it could compile to:
+
+ movsbl (%eax), %eax
+ sarl $5, %eax
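+
+(The combine is valid because sign extension commutes with arithmetic right
+shift: sext(ashr(x, 5)) == ashr(sext(x), 5).)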
+
+//===---------------------------------------------------------------------===//
+
+GCC PR31029:
+
+int test(int x) { return 1-x == x; } // --> return false
+int test2(int x) { return 2-x == x; } // --> return x == 1 ?
+
+Always foldable for odd constants: C-x == x means 2*x == C, which has no
+solution when C is odd (2*x is always even). For even C, wrapping arithmetic
+gives two solutions, x = C/2 and x = C/2 + 2^31, so what is the right fold?
+
+//===---------------------------------------------------------------------===//
+
+PR 3381: a GEP to a field of size 0 inside a struct could be turned into a GEP
+to the next field in the struct (which is at the same address).
+
+For example, a store of float into { {{}}, float } could be turned into a
+store to the float directly.
+
+//===---------------------------------------------------------------------===//
+
+#include <math.h>
+double foo(double a) { return sin(a); }
+
+On x86-64 Linux this compiles to:
+foo:
+ subq $8, %rsp
+ call sin
+ addq $8, %rsp
+ ret
+vs:
+
+foo:
+ jmp sin
+
+//===---------------------------------------------------------------------===//
+
+The arg promotion pass should make use of nocapture to make its alias
+analysis much more precise.
+
+//===---------------------------------------------------------------------===//
+
+The following functions should be optimized to use a select instead of a
+branch (from gcc PR40072):
+
+char char_int(int m) {if(m>7) return 0; return m;}
+int int_char(char m) {if(m>7) return 0; return m;}
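+
+i.e., each should compile as if written with a conditional expression:
+
+  char char_int(int m) { return m > 7 ? 0 : m; }
+  int int_char(char m) { return m > 7 ? 0 : m; }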
+
+//===---------------------------------------------------------------------===//
+
+Instcombine should replace the load with a constant in:
+
+ static const char x[4] = {'a', 'b', 'c', 'd'};
+
+ unsigned int y(void) {
+ return *(unsigned int *)x;
+ }
+
+It currently only does this transformation when the size of the constant
+is the same as the size of the integer (to see it fail, declare x[5] instead)
+and the last byte is a null (making it a C string). There's no need for these
+restrictions.
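+
+With the restrictions removed, y() would fold to a constant; on a
+little-endian target the four bytes {'a','b','c','d'} read as:
+
+  unsigned int y(void) { return 0x64636261; }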
+
+//===---------------------------------------------------------------------===//
+
+InstCombine's "turn load from constant into constant" optimization should be
+more aggressive in the presence of bitcasts. For example, because of unions,
+this code:
+
+union vec2d {
+ double e[2];
+ double v __attribute__((vector_size(16)));
+};
+typedef union vec2d vec2d;
+
+static vec2d a={{1,2}}, b={{3,4}};
+
+vec2d foo () {
+ return (vec2d){ .v = a.v + b.v * (vec2d){{5,5}}.v };
+}
+
+Compiles into:
+
+@a = internal constant %0 { [2 x double]
+ [double 1.000000e+00, double 2.000000e+00] }, align 16
+@b = internal constant %0 { [2 x double]
+ [double 3.000000e+00, double 4.000000e+00] }, align 16
+...
+define void @foo(%struct.vec2d* noalias nocapture sret %agg.result) nounwind {
+entry:
+ %0 = load <2 x double>* getelementptr (%struct.vec2d*
+ bitcast (%0* @a to %struct.vec2d*), i32 0, i32 0), align 16
+ %1 = load <2 x double>* getelementptr (%struct.vec2d*
+ bitcast (%0* @b to %struct.vec2d*), i32 0, i32 0), align 16
+
+
+Instcombine should be able to optimize away the loads (and thus the globals).
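+
+With the loads folded, the whole body constant folds: a.v + b.v * (5,5) is
+<2 x double> <double 1.6e+01, double 2.2e+01>, so foo just stores that
+constant into the sret buffer.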
+
+
+//===---------------------------------------------------------------------===//
+
+I saw this constant expression in real code after llvm-g++ -O2:
+
+declare extern_weak i32 @0(i64)
+
+define void @foo() {
+ br i1 icmp eq (i32 zext (i1 icmp ne (i32 (i64)* @0, i32 (i64)* null) to i32),
+i32 0), label %cond_true, label %cond_false
+cond_true:
+ ret void
+cond_false:
+ ret void
+}
+
+That branch expression should be reduced to:
+
+ i1 icmp eq (i32 (i64)* @0, i32 (i64)* null)
+
+It's probably not a perf issue; I just happened to see it while examining
+something else and didn't want to forget about it.
+
+//===---------------------------------------------------------------------===//