//===---------------------------------------------------------------------===//

This isn't recognized as bswap by instcombine (yes, it really is bswap):

unsigned long reverse(unsigned v) {
    unsigned t;
    t = v ^ ((v << 16) | (v >> 16));
    t &= ~0xff0000;
    v = (v << 24) | (v >> 8);
    return v ^ (t >> 8);
}

//===---------------------------------------------------------------------===//

These idioms should be recognized as popcount (see PR1488):

unsigned countbits_slow(unsigned v) {
  unsigned c;
  for (c = 0; v; v >>= 1)
    c += v & 1;
  return c;
}

unsigned countbits_fast(unsigned v) {
  unsigned c;
  for (c = 0; v; c++)
    v &= v - 1; // clear the least significant bit set
  return c;
}

typedef unsigned long long BITBOARD;
int PopCnt(register BITBOARD a) {
  register int c = 0;
  while (a) {
    c++;
    a &= a - 1;
  }
  return c;
}

unsigned int popcount(unsigned int input) {
  unsigned int count = 0;
  for (unsigned int i = 0; i < 4 * 8; i++)
    count += (input >> i) & 1;
  return count;
}

//===---------------------------------------------------------------------===//

These should turn into single 16-bit (unaligned?) loads on little/big endian
processors:

unsigned short read_16_le(const unsigned char *adr) {
  return adr[0] | (adr[1] << 8);
}

unsigned short read_16_be(const unsigned char *adr) {
  return (adr[0] << 8) | adr[1];
}

//===---------------------------------------------------------------------===//

viterbi speeds up *significantly* if the various "history" related copy loops
are turned into memcpy calls at the source level.  We need a "loops to memcpy"
pass.

//===---------------------------------------------------------------------===//

Tail recursion elimination should handle:

int pow2m1(int n) {
  if (n == 0)
    return 0;
  return 2 * pow2m1(n - 1) + 1;
}

Also, multiplies can be turned into SHL's, so they should be handled as if
they were associative.  "return foo() << 1" can be tail recursion eliminated.

//===---------------------------------------------------------------------===//

Argument promotion should promote arguments for recursive functions.

//===---------------------------------------------------------------------===//

We do a poor job with the family of 64-bit shift functions f through f6, e.g.
unsigned long long f6(unsigned long long x, unsigned long long y, int z).
On X86-64, we only handle f2/f3/f4 right.  On x86-32, a few of these
generate truly horrible code, instead of using shld and friends.  On
ARM, we end up with calls to L___lshrdi3/L___ashldi3 in f, which is
badness.  PPC64 misses f, f5 and f6.  CellSPU aborts in isel.
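A minimal stand-in for the flavor of code involved (a hypothetical sketch,
since the f..f6 definitions are not reproduced here): merging shifted pieces
of two 64-bit values, which should lower to shld/shrd on x86 instead of a
libcall or a long shift-and-mask sequence.

/* Hypothetical stand-in for the f..f6 testcases: a funnel-shift shape. */
unsigned long long shift_merge(unsigned long long x, unsigned long long y) {
  return (x << 8) | (y >> 56);  /* top byte of y appended below x << 8 */
}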
//===---------------------------------------------------------------------===//

We do a number of simplifications in simplify-libcalls to strength reduce
standard library functions, but we don't currently merge them together.  For
example, it is useful to merge memcpy(a,b,strlen(b)) -> strcpy.  This can only
be done safely if "b" isn't modified between the strlen and the memcpy, of
course.

//===---------------------------------------------------------------------===//

We should be able to evaluate this loop and replace it with a closed-form
computation:

int test(int x_offs) {
  while (x_offs > 4)
    x_offs -= 4;
  return x_offs;
}

//===---------------------------------------------------------------------===//

Reassociate should turn things like:

int factorial(int X) {
  return X*X*X*X*X*X*X*X;
}

into llvm.powi calls, allowing the code generator to produce balanced
multiplication trees.

//===---------------------------------------------------------------------===//

We generate a horrible libcall for llvm.powi.  For example, we compile:

#include <cmath>
double f(double a) { return std::pow(a, 4); }

into:

__Z1fd:
	subl	$12, %esp
	movsd	16(%esp), %xmm0
	movsd	%xmm0, (%esp)
	movl	$4, 8(%esp)
	call	L___powidf2$stub
	addl	$12, %esp
	ret

GCC produces:

__Z1fd:
	subl	$12, %esp
	movsd	16(%esp), %xmm0
	mulsd	%xmm0, %xmm0
	mulsd	%xmm0, %xmm0
	movsd	%xmm0, (%esp)
	fldl	(%esp)
	addl	$12, %esp
	ret

//===---------------------------------------------------------------------===//

We compile this program (from GCC PR11680):
http://gcc.gnu.org/bugzilla/attachment.cgi?id=4487

into code that runs the same speed in fast/slow modes, but both modes run 2x
slower than when compiled with GCC (either 4.0 or 4.2):

$ llvm-g++ perf.cpp -O3 -fno-exceptions
$ time ./a.out fast
1.821u 0.003s 0:01.82 100.0% 0+0k 0+0io 0pf+0w

$ g++ perf.cpp -O3 -fno-exceptions
$ time ./a.out fast
0.821u 0.001s 0:00.82 100.0% 0+0k 0+0io 0pf+0w

It looks like we are making the same inlining decisions, so this may be raw
codegen badness or something else (we haven't investigated yet).

//===---------------------------------------------------------------------===//

We miss some instcombines for stuff like this:

void bar (void);
void foo (unsigned int a) {
  /* This one is equivalent to a >= (3 << 2). */
  if ((a >> 2) >= 3)
    bar ();
}

A few other related cases are in GCC PR14753.

//===---------------------------------------------------------------------===//

Divisibility by a constant can be simplified (according to GCC PR12849) from
being a mulhi to being a mullo (cheaper).  Testcase:

void bar(unsigned n) {
  if (n % 3 == 0)
    true();
}

I think this basically amounts to a dag combine that simplifies comparisons
against a multiply-hi into a comparison against the mullo.
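The mullo form being asked for here is the standard modular-inverse trick; a
minimal sketch (not the exact dag combine, and divisible_by_3 is an invented
name):

#include <stdint.h>

/* 0xAAAAAAAB is the multiplicative inverse of 3 mod 2^32, so
   n % 3 == 0 exactly when n * 0xAAAAAAAB (mod 2^32) <= 0x55555555,
   where 0x55555555 == (2^32 - 1) / 3.  No mulhi or divide needed. */
int divisible_by_3(uint32_t n) {
  return n * 0xAAAAAAABu <= 0x55555555u;
}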
//===---------------------------------------------------------------------===//

SROA is not promoting the union on the stack in this example; we should end
up with no allocas:

union vec2d {
    double e[2];
    double v __attribute__((vector_size(16)));
};
typedef union vec2d vec2d;

static vec2d a={{1,2}}, b={{3,4}};

vec2d foo () {
    return (vec2d){ .v = a.v + b.v * (vec2d){{5,5}}.v };
}

//===---------------------------------------------------------------------===//

This C++ file:

void g();
struct A {
  int n; int m;
  A& operator++(void) { ++n; if (n == m) g(); return *this; }
  A() : n(0), m(0) { }
  friend bool operator!=(A const& a1, A const& a2) { return a1.n != a2.n; }
};
void testfunction(A& iter) { A const end; while (iter != end) ++iter; }

compiles down to:

bb:		; preds = %bb3.backedge, %bb.nph
	%.rle = phi i32 [ %1, %bb.nph ], [ %7, %bb3.backedge ]	; <i32> [#uses=1]
	%4 = add i32 %.rle, 1		; <i32> [#uses=2]
	store i32 %4, i32* %0, align 4
	%5 = load i32* %3, align 4	; <i32> [#uses=1]
	%6 = icmp eq i32 %4, %5		; <i1> [#uses=1]
	br i1 %6, label %bb1, label %bb3.backedge

bb1:		; preds = %bb
	tail call void @_Z1gv()
	br label %bb3.backedge

bb3.backedge:		; preds = %bb, %bb1
	%7 = load i32* %0, align 4	; <i32> [#uses=2]

The %7 load is partially redundant with the store of %4 to %0; GVN's PRE
should remove it, but PRE doesn't currently apply to memory objects.

//===---------------------------------------------------------------------===//

Better mod/ref analysis for scanf would allow us to eliminate the vtable and a
bunch of other stuff from this example (see PR1604):

#include <cstdio>
struct test {
  int val;
  virtual ~test() {}
};

int main() {
  test t;
  std::scanf("%d", &t.val);
  std::printf("%d\n", t.val);
}

//===---------------------------------------------------------------------===//

Instcombine will merge comparisons like (x >= 10) && (x < 20) by producing
(x - 10) u< 10, but only when the comparisons have matching sign.

This could be converted with a similar technique (PR1941):

define i1 @test(i8 %x) {
  %A = icmp uge i8 %x, 5
  %B = icmp slt i8 %x, 20
  %C = and i1 %A, %B
  ret i1 %C
}

//===---------------------------------------------------------------------===//

These functions perform the same computation, but produce different assembly:

define i8 @select(i8 %x) readnone nounwind {
  %A = icmp ult i8 %x, 250
  %B = select i1 %A, i8 0, i8 1
  ret i8 %B
}

define i8 @addshr(i8 %x) readnone nounwind {
  %A = zext i8 %x to i9
  %B = add i9 %A, 6	;; 256 - 250 == 6
  %C = lshr i9 %B, 8
  %D = trunc i9 %C to i8
  ret i8 %D
}

//===---------------------------------------------------------------------===//

From GCC Bug 24696:

int
f (unsigned long a, unsigned long b, unsigned long c)
{
  return ((a & (c - 1)) != 0) || ((b & (c - 1)) != 0);
}
int
f (unsigned long a, unsigned long b, unsigned long c)
{
  return ((a & (c - 1)) != 0) | ((b & (c - 1)) != 0);
}

Both should combine to ((a|b) & (c-1)) != 0.  Currently not optimized with
"clang -emit-llvm-bc | opt -std-compile-opts".

//===---------------------------------------------------------------------===//

From GCC Bug 20192:

#define PMD_MASK (~((1UL << 23) - 1))
void clear_pmd_range(unsigned long start, unsigned long end)
{
  if (!(start & ~PMD_MASK) && !(end & ~PMD_MASK))
    f();
}

The expression should optimize to something like
"!((start|end) & ~PMD_MASK)".  Currently not optimized with "clang
-emit-llvm-bc | opt -std-compile-opts".
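As a sanity check on that equivalence, a minimal exhaustive harness over
8-bit values (a hypothetical test, with MASK standing in for ~PMD_MASK):

#include <assert.h>

int main(void) {
  unsigned MASK = 0xe0;  /* stands in for ~PMD_MASK */
  for (unsigned s = 0; s < 256; s++) {
    for (unsigned e = 0; e < 256; e++) {
      int separate = !(s & MASK) && !(e & MASK);
      int combined = !((s | e) & MASK);
      assert(separate == combined);  /* the two tests agree everywhere */
    }
  }
  return 0;
}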
//===---------------------------------------------------------------------===//

From GCC Bug 15241:

unsigned int
foo (unsigned int a, unsigned int b)
{
  if (a <= 7 && b <= 7)
    baz ();
}

Should combine to "(a|b) <= 7".  Currently not optimized with "clang
-emit-llvm-bc | opt -std-compile-opts".

//===---------------------------------------------------------------------===//

From GCC Bug 3756:

int
pn (int n)
{
  return (n >= 0 ? 1 : -1);
}

Should combine to (n >> 31) | 1.  Currently not optimized with "clang
-emit-llvm-bc | opt -std-compile-opts | llc".

//===---------------------------------------------------------------------===//

From GCC Bug 28685:

int test(int a, int b)
{
  int lt = a < b;
  int eq = a == b;

  return (lt || eq);
}

Should combine to "a <= b".  Currently not optimized with "clang
-emit-llvm-bc | opt -std-compile-opts | llc".

//===---------------------------------------------------------------------===//

void a(int variable)
{
  if (variable == 4 || variable == 6)
    bar();
}

This should optimize to "if ((variable | 2) == 6)".  Currently not
optimized with "clang -emit-llvm-bc | opt -std-compile-opts | llc".

//===---------------------------------------------------------------------===//

unsigned int f(unsigned int i, unsigned int n) {++i; if (i == n) ++i; return i;}
unsigned int f2(unsigned int i, unsigned int n) {++i; i += i == n; return i;}

These should combine to the same thing.  Currently, the first function
produces better code on X86.

//===---------------------------------------------------------------------===//

From GCC Bug 15784:

#define abs(x) x>0?x:-x
int f(int x, int y)
{
  return (abs(x)) >= 0;
}

This should optimize to x != INT_MIN: with -fwrapv, -x wraps back to INT_MIN
for x == INT_MIN, which is the only case where abs(x) is negative.  Currently
not optimized with "clang -emit-llvm-bc | opt -std-compile-opts".

//===---------------------------------------------------------------------===//

From GCC Bug 14753:

void
rotate_cst (unsigned int a)
{
  a = (a << 10) | (a >> 22);
  if (a == 123)
    bar ();
}
void
minus_cst (unsigned int a)
{
  unsigned int tem;

  tem = 20 - a;
  if (tem == 5)
    bar ();
}
void
mask_gt (unsigned int a)
{
  /* This is equivalent to a > 15. */
  if ((a & ~7) > 8)
    bar ();
}
void
rshift_gt (unsigned int a)
{
  /* This is equivalent to a > 23. */
  if ((a >> 2) > 5)
    bar ();
}

All should simplify to a single comparison.  None of these are currently
optimized with "clang -emit-llvm-bc | opt -std-compile-opts".

//===---------------------------------------------------------------------===//

From GCC Bug 32605:

int c(int* x) {return (char*)x+2 == (char*)x;}

Should combine to 0.  Currently not optimized with "clang
-emit-llvm-bc | opt -std-compile-opts" (although llc can optimize it).

//===---------------------------------------------------------------------===//

int a(unsigned char* b) {return *b > 99;}

There's an unnecessary zext in the generated code with "clang
-emit-llvm-bc | opt -std-compile-opts".

//===---------------------------------------------------------------------===//

int a(unsigned b) {return ((b << 31) | (b << 30)) >> 31;}

Should be combined to "((b >> 1) | b) & 1".  Currently not optimized
with "clang -emit-llvm-bc | opt -std-compile-opts".

//===---------------------------------------------------------------------===//

unsigned a(unsigned x, unsigned y) { return x | (y & 1) | (y & 2);}

Should combine to "x | (y & 3)".  Currently not optimized with "clang
-emit-llvm-bc | opt -std-compile-opts".
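For the fold just above: (y & 1) and (y & 2) test disjoint bits, so the two
ORs collapse into a single mask.  A hand-written rendering of the desired
result (a_folded is an invented name):

/* x | (y & 1) | (y & 2) == x | (y & 3), because OR-ing two disjoint
   single-bit masks of y is the same as masking y with their union. */
unsigned a_folded(unsigned x, unsigned y) {
  return x | (y & 3);
}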
//===---------------------------------------------------------------------===//

unsigned a(unsigned a) {return ((a | 1) & 3) | (a & -4);}

Should combine to "a | 1".  Currently not optimized with "clang
-emit-llvm-bc | opt -std-compile-opts".

//===---------------------------------------------------------------------===//

int a(int a, int b, int c) {return (~a & c) | ((c|a) & b);}

Should fold to "(~a & c) | (a & b)".  Currently not optimized with
"clang -emit-llvm-bc | opt -std-compile-opts".

//===---------------------------------------------------------------------===//

int a(int a, int b) {return (~(a|b))|a;}

Should fold to "a|~b".  Currently not optimized with "clang
-emit-llvm-bc | opt -std-compile-opts".

//===---------------------------------------------------------------------===//

int a(int a, int b) {return (a&&b) || (a&&!b);}

Should fold to "a".  Currently not optimized with "clang -emit-llvm-bc
| opt -std-compile-opts".

//===---------------------------------------------------------------------===//

int a(int a, int b, int c) {return (a&&b) || (!a&&c);}

Should fold to "a ? b : c", or at least something sane.  Currently not
optimized with "clang -emit-llvm-bc | opt -std-compile-opts".

//===---------------------------------------------------------------------===//

int a(int a, int b, int c) {return (a&&b) || (a&&c) || (a&&b&&c);}

Should fold to a && (b || c).  Currently not optimized with "clang
-emit-llvm-bc | opt -std-compile-opts".

//===---------------------------------------------------------------------===//

int a(int x) {return x | ((x & 8) ^ 8);}

Should combine to x | 8.  Currently not optimized with "clang
-emit-llvm-bc | opt -std-compile-opts".

//===---------------------------------------------------------------------===//

int a(int x) {return x ^ ((x & 8) ^ 8);}

Should also combine to x | 8.  Currently not optimized with "clang
-emit-llvm-bc | opt -std-compile-opts".

//===---------------------------------------------------------------------===//

int a(int x) {return (x & 8) == 0 ? -1 : -9;}

Should combine to (x | -9) ^ 8.  Currently not optimized with "clang
-emit-llvm-bc | opt -std-compile-opts".

//===---------------------------------------------------------------------===//

int a(int x) {return (x & 8) == 0 ? -9 : -1;}

Should combine to x | -9.  Currently not optimized with "clang
-emit-llvm-bc | opt -std-compile-opts".

//===---------------------------------------------------------------------===//

int a(int x) {return ((x | -9) ^ 8) & x;}

Should combine to x & -9.  Currently not optimized with "clang
-emit-llvm-bc | opt -std-compile-opts".

//===---------------------------------------------------------------------===//

unsigned a(unsigned a) {return a * 0x11111111 >> 28 & 1;}

Should combine to "a * 0x88888888 >> 31".  Currently not optimized
with "clang -emit-llvm-bc | opt -std-compile-opts".

//===---------------------------------------------------------------------===//

unsigned a(char* x) {if ((*x & 32) == 0) return b();}

There's an unnecessary zext in the generated code with "clang
-emit-llvm-bc | opt -std-compile-opts".

//===---------------------------------------------------------------------===//

unsigned a(unsigned long long x) {return 40 * (x >> 1);}

Should combine to "20 * (((unsigned)x) & -2)".  Currently not
optimized with "clang -emit-llvm-bc | opt -std-compile-opts".
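The arithmetic behind that last fold, as a sketch (fold40 is an invented
name): 40 * (x >> 1) == 20 * (2 * (x >> 1)) == 20 * (x & ~1ULL), and since
the result is truncated to 32 bits, only the low 32 bits of x can matter.

/* Hypothetical folded form: one 32-bit multiply instead of a 64-bit one. */
unsigned fold40(unsigned long long x) {
  return 20u * (((unsigned)x) & -2u);
}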
//===---------------------------------------------------------------------===//

We would like to do the following transform in the instcombiner:

  -X/C -> X/-C

However, this isn't valid if (-X) overflows.  We can implement this when we
have the concept of a "C signed subtraction" operator that is undefined on
overflow.

//===---------------------------------------------------------------------===//

This was noticed in the entryblock for grokdeclarator in 403.gcc:

        %tmp = icmp eq i32 %decl_context, 4
        %decl_context_addr.0 = select i1 %tmp, i32 3, i32 %decl_context
        %tmp1 = icmp eq i32 %decl_context_addr.0, 1
        %decl_context_addr.1 = select i1 %tmp1, i32 0, i32 %decl_context_addr.0

tmp1 should be simplified to something like:

  (!tmp && decl_context == 1)

which in turn is just (decl_context == 1), since decl_context == 1 already
implies that tmp is false.

This enables recursive simplifications; tmp1 is used all over the place in
the function, e.g. later by:

        %tmp23 = icmp eq i32 %decl_context_addr.1, 0	; <i1> [#uses=1]
        %tmp24 = xor i1 %tmp1, true		; <i1> [#uses=1]
        %or.cond8 = and i1 %tmp23, %tmp24	; <i1> [#uses=1]
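In source terms (a hypothetical C rendering of the IR above, not code taken
from 403.gcc):

/* tmp  = (decl_context == 4)
   dc0  = tmp ? 3 : decl_context     -- %decl_context_addr.0
   tmp1 = (dc0 == 1)                 -- %tmp1
   When tmp is true, dc0 is 3 and can never equal 1, so: */
int tmp1_folded(int decl_context) {
  return decl_context == 1;
}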