diff --git a/lib/Target/README.txt b/lib/Target/README.txt
index 16be2e9004f..7e9888cc13e 100644
--- a/lib/Target/README.txt
+++ b/lib/Target/README.txt
@@ -2,45 +2,6 @@ Target Independent Opportunities:
 
 //===---------------------------------------------------------------------===//
 
-Dead argument elimination should be enhanced to handle cases when an argument is
-dead to an externally visible function.  Though the argument can't be removed
-from the externally visible function, the caller doesn't need to pass it in.
-For example in this testcase:
-
-  void foo(int X) __attribute__((noinline));
-  void foo(int X) { sideeffect(); }
-  void bar(int A) { foo(A+1); }
-
-We compile bar to:
-
-define void @bar(i32 %A) nounwind ssp {
-  %0 = add nsw i32 %A, 1                          ; <i32> [#uses=1]
-  tail call void @foo(i32 %0) nounwind noinline ssp
-  ret void
-}
-
-The add is dead, we could pass in 'i32 undef' instead.  This occurs for C++
-templates etc, which usually have linkonce_odr/weak_odr linkage, not internal
-linkage.
-
-//===---------------------------------------------------------------------===//
-
-With the recent changes to make the implicit def/use set explicit in
-machineinstrs, we should change the target descriptions for 'call' instructions
-so that the .td files don't list all the call-clobbered registers as implicit
-defs.  Instead, these should be added by the code generator (e.g. on the dag).
-
-This has a number of uses:
-
-1. PPC32/64 and X86 32/64 can avoid having multiple copies of call instructions
-   for their different impdef sets.
-2. Targets with multiple calling convs (e.g. x86) which have different clobber
-   sets don't need copies of call instructions.
-3. 'Interprocedural register allocation' can be done to reduce the clobber sets
-   of calls.
-
-//===---------------------------------------------------------------------===//
-
 We should recognize various "overflow detection" idioms and translate them into
 llvm.uadd.with.overflow and similar intrinsics.  Here is a multiply idiom:
 
@@ -132,44 +93,6 @@ This requires reassociating to forms of expressions that are already available,
 something that reassoc doesn't think about yet.
 
-//===---------------------------------------------------------------------===//
-
-This function: (derived from GCC PR19988)
-double foo(double x, double y) {
-  return ((x + 0.1234 * y) * (x + -0.1234 * y));
-}
-
-compiles to:
-_foo:
-	movapd	%xmm1, %xmm2
-	mulsd	LCPI1_1(%rip), %xmm1
-	mulsd	LCPI1_0(%rip), %xmm2
-	addsd	%xmm0, %xmm1
-	addsd	%xmm0, %xmm2
-	movapd	%xmm1, %xmm0
-	mulsd	%xmm2, %xmm0
-	ret
-
-Reassociate should be able to turn it into:
-
-double foo(double x, double y) {
-  return ((x + 0.1234 * y) * (x - 0.1234 * y));
-}
-
-Which allows the multiply by constant to be CSE'd, producing:
-
-_foo:
-	mulsd	LCPI1_0(%rip), %xmm1
-	movapd	%xmm1, %xmm2
-	addsd	%xmm0, %xmm2
-	subsd	%xmm1, %xmm0
-	mulsd	%xmm2, %xmm0
-	ret
-
-This doesn't need -ffast-math support at all.  This is particularly bad because
-the llvm-gcc frontend is canonicalizing the latter into the former, but clang
-doesn't have this problem.
-
 //===---------------------------------------------------------------------===//
 
 These two functions should generate the same code on big-endian systems:
@@ -183,7 +106,7 @@ for 1,2,4,8 bytes.
 
 //===---------------------------------------------------------------------===//
 
 It would be nice to revert this patch:
-http://lists.cs.uiuc.edu/pipermail/llvm-commits/Week-of-Mon-20060213/031986.html
+http://lists.llvm.org/pipermail/llvm-commits/Week-of-Mon-20060213/031986.html
 
 And teach the dag combiner enough to simplify the code expanded before
 legalize.  It seems plausible that this knowledge would let it simplify other
@@ -191,7 +114,7 @@ stuff too.
 
 //===---------------------------------------------------------------------===//
 
-For vector types, TargetData.cpp::getTypeInfo() returns alignment that is equal
+For vector types, DataLayout.cpp::getTypeInfo() returns alignment that is equal
 to the type size.  It works but can be overly conservative as the alignment of
 specific vector types is target dependent.
 
@@ -277,6 +200,20 @@ unsigned long reverse(unsigned v) {
 
 //===---------------------------------------------------------------------===//
 
+[LOOP DELETION]
+
+We don't delete this output-free loop, because trip count analysis doesn't
+realize that it is finite (if it were infinite, it would be undefined).  Not
+having this blocks Loop Idiom from matching strlen and friends.
+
+void foo(char *C) {
+  int x = 0;
+  while (*C)
+    ++x,++C;
+}
+
+//===---------------------------------------------------------------------===//
+
 [LOOP RECOGNITION]
 
 These idioms should be recognized as popcount (see PR1488):
 
@@ -287,22 +224,7 @@ unsigned countbits_slow(unsigned v) {
     c += v & 1;
   return c;
 }
-unsigned countbits_fast(unsigned v){
-  unsigned c;
-  for (c = 0; v; c++)
-    v &= v - 1; // clear the least significant bit set
-  return c;
-}
 
-BITBOARD = unsigned long long
-int PopCnt(register BITBOARD a) {
-  register int c=0;
-  while(a) {
-    c++;
-    a &= a - 1;
-  }
-  return c;
-}
 unsigned int popcount(unsigned int input) {
   unsigned int count = 0;
   for (unsigned int i =  0; i < 4 * 8; i++)
@@ -310,6 +232,16 @@ unsigned int popcount(unsigned int input) {
   return count;
 }
 
+This should be recognized as CLZ:  rdar://8459039
+
+unsigned clz_a(unsigned a) {
+  int i;
+  for (i=0;i<32;i++)
+    if (a & (1<<(31-i)))
+      return i;
+  return 32;
+}
+
 This sort of thing should be added to the loop idiom pass.
 
 //===---------------------------------------------------------------------===//
 
@@ -391,34 +323,6 @@ PHI Slicing could be extended to do this.
 
 //===---------------------------------------------------------------------===//
 
-LSR should know what GPR types a target has from TargetData.  This code:
-
-volatile short X, Y; // globals
-
-void foo(int N) {
-  int i;
-  for (i = 0; i < N; i++) { X = i; Y = i*4; }
-}
-
-produces two near identical IV's (after promotion) on PPC/ARM:
-
-LBB1_2:
-	ldr r3, LCPI1_0
-	ldr r3, [r3]
-	strh r2, [r3]
-	ldr r3, LCPI1_1
-	ldr r3, [r3]
-	strh r1, [r3]
-	add r1, r1, #4
-	add r2, r2, #1 <- [0,+,1]
-	sub r0, r0, #1 <- [0,-,1]
-	cmp r0, #0
-	bne LBB1_2
-
-LSR should reuse the "+" IV for the exit test.
-
-//===---------------------------------------------------------------------===//
-
 Tail call elim should be more aggressive, checking to see if the call is
 followed by an uncond branch to an exit block.
 
@@ -756,18 +660,6 @@ codegen badness or something else (haven't investigated).
 
 //===---------------------------------------------------------------------===//
 
-We miss some instcombines for stuff like this:
-void bar (void);
-void foo (unsigned int a) {
-  /* This one is equivalent to a >= (3 << 2).  */
-  if ((a >> 2) >= 3)
-    bar ();
-}
-
-A few other related ones are in GCC PR14753.
-
-//===---------------------------------------------------------------------===//
-
 Divisibility by constant can be simplified (according to GCC PR12849) from
 being a mulhi to being a mul lo (cheaper).  Testcase:
 
@@ -841,7 +733,7 @@ f (unsigned long a, unsigned long b, unsigned long c)
   return ((a & (c - 1)) != 0) | ((b & (c - 1)) != 0);
 }
 Both should combine to ((a|b) & (c-1)) != 0.  Currently not optimized with
-"clang -emit-llvm-bc | opt -std-compile-opts".
+"clang -emit-llvm-bc | opt -O3".
 
 //===---------------------------------------------------------------------===//
 
@@ -854,7 +746,7 @@ void clear_pmd_range(unsigned long start, unsigned long end)
 }
 The expression should optimize to something like
 "!((start|end)&~PMD_MASK)". Currently not optimized with "clang
--emit-llvm-bc | opt -std-compile-opts".
+-emit-llvm-bc | opt -O3".
 
 //===---------------------------------------------------------------------===//
 
@@ -873,7 +765,7 @@ int f(int x, int y)
 return (abs(x)) >= 0;
 }
 This should optimize to x != INT_MIN. (With -fwrapv.)  Currently not
-optimized with "clang -emit-llvm-bc | opt -std-compile-opts".
+optimized with "clang -emit-llvm-bc | opt -O3".
 
 //===---------------------------------------------------------------------===//
 
@@ -908,94 +800,120 @@ rshift_gt (unsigned int a)
   if ((a >> 2) > 5)
     bar ();
 }
+
 All should simplify to a single comparison.  All of these are currently
 not optimized with "clang -emit-llvm-bc | opt
--std-compile-opts".
+-O3".
 
 //===---------------------------------------------------------------------===//
 
 From GCC Bug 32605:
 int c(int* x) {return (char*)x+2 == (char*)x;}
 Should combine to 0.  Currently not optimized with "clang
--emit-llvm-bc | opt -std-compile-opts" (although llc can optimize it).
+-emit-llvm-bc | opt -O3" (although llc can optimize it).
 
 //===---------------------------------------------------------------------===//
 
 int a(unsigned b) {return ((b << 31) | (b << 30)) >> 31;}
 Should be combined to "((b >> 1) | b) & 1".  Currently not optimized
-with "clang -emit-llvm-bc | opt -std-compile-opts".
+with "clang -emit-llvm-bc | opt -O3".
 
 //===---------------------------------------------------------------------===//
 
 unsigned a(unsigned x, unsigned y) { return x | (y & 1) | (y & 2);}
 Should combine to "x | (y & 3)".  Currently not optimized with "clang
--emit-llvm-bc | opt -std-compile-opts".
+-emit-llvm-bc | opt -O3".
 
 //===---------------------------------------------------------------------===//
 
 int a(int a, int b, int c) {return (~a & c) | ((c|a) & b);}
 Should fold to "(~a & c) | (a & b)".  Currently not optimized with
-"clang -emit-llvm-bc | opt -std-compile-opts".
+"clang -emit-llvm-bc | opt -O3".
 
 //===---------------------------------------------------------------------===//
 
 int a(int a,int b) {return (~(a|b))|a;}
 Should fold to "a|~b".  Currently not optimized with "clang
--emit-llvm-bc | opt -std-compile-opts".
+-emit-llvm-bc | opt -O3".
 
 //===---------------------------------------------------------------------===//
 
 int a(int a, int b) {return (a&&b) || (a&&!b);}
 Should fold to "a != 0".  Currently not optimized with "clang -emit-llvm-bc
-| opt -std-compile-opts".
+| opt -O3".
 
 //===---------------------------------------------------------------------===//
 
 int a(int a, int b, int c) {return (a&&b) || (!a&&c);}
 Should fold to "a ? b : c", or at least something sane.  Currently not
-optimized with "clang -emit-llvm-bc | opt -std-compile-opts".
+optimized with "clang -emit-llvm-bc | opt -O3".
 
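+A quick way to sanity-check that last fold (an editorial sketch, not part of
+the original note; the names orig/fold are made up): "a ? b : c" is only a
+sound replacement if both select arms are normalized to truth values, since
+the original expression always yields 0 or 1.
+
+#include <assert.h>
+
+static int orig(int a, int b, int c) { return (a && b) || (!a && c); }
+/* The folded form, with both select arms normalized to 0/1. */
+static int fold(int a, int b, int c) { return a ? (b != 0) : (c != 0); }
+
+int main(void) {
+  /* Exhaustive over truth values; any nonzero input behaves like 1 here
+     because &&, ! and ?: only test against zero. */
+  for (int a = 0; a <= 1; ++a)
+    for (int b = 0; b <= 1; ++b)
+      for (int c = 0; c <= 1; ++c)
+        assert(orig(a, b, c) == fold(a, b, c));
+  return 0;
+}
+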
 //===---------------------------------------------------------------------===//
 
 int a(int a, int b, int c) {return (a&&b) || (a&&c) || (a&&b&&c);}
 Should fold to a && (b || c).  Currently not optimized with "clang
--emit-llvm-bc | opt -std-compile-opts".
+-emit-llvm-bc | opt -O3".
 
 //===---------------------------------------------------------------------===//
 
 int a(int x) {return x | ((x & 8) ^ 8);}
 Should combine to x | 8.  Currently not optimized with "clang
--emit-llvm-bc | opt -std-compile-opts".
+-emit-llvm-bc | opt -O3".
 
 //===---------------------------------------------------------------------===//
 
 int a(int x) {return x ^ ((x & 8) ^ 8);}
 Should also combine to x | 8.  Currently not optimized with "clang
--emit-llvm-bc | opt -std-compile-opts".
+-emit-llvm-bc | opt -O3".
 
 //===---------------------------------------------------------------------===//
 
 int a(int x) {return ((x | -9) ^ 8) & x;}
 Should combine to x & -9.  Currently not optimized with "clang
--emit-llvm-bc | opt -std-compile-opts".
+-emit-llvm-bc | opt -O3".
 
 //===---------------------------------------------------------------------===//
 
 unsigned a(unsigned a) {return a * 0x11111111 >> 28 & 1;}
 Should combine to "a * 0x88888888 >> 31".  Currently not optimized
-with "clang -emit-llvm-bc | opt -std-compile-opts".
+with "clang -emit-llvm-bc | opt -O3".
 
 //===---------------------------------------------------------------------===//
 
 unsigned a(char* x) {if ((*x & 32) == 0) return b();}
 There's an unnecessary zext in the generated code with "clang
--emit-llvm-bc | opt -std-compile-opts".
+-emit-llvm-bc | opt -O3".
 
 //===---------------------------------------------------------------------===//
 
 unsigned a(unsigned long long x) {return 40 * (x >> 1);}
 Should combine to "20 * (((unsigned)x) & -2)".  Currently not
-optimized with "clang -emit-llvm-bc | opt -std-compile-opts".
+optimized with "clang -emit-llvm-bc | opt -O3".
+
+//===---------------------------------------------------------------------===//
+
+int g(int x) { return (x - 10) < 0; }
+Should combine to "x <= 9" (the sub has nsw).  Currently not
+optimized with "clang -emit-llvm-bc | opt -O3".
+
+//===---------------------------------------------------------------------===//
+
+int g(int x) { return (x + 10) < 0; }
+Should combine to "x < -10" (the add has nsw).  Currently not
+optimized with "clang -emit-llvm-bc | opt -O3".
+
+//===---------------------------------------------------------------------===//
+
+int f(int i, int j) { return i < j + 1; }
+int g(int i, int j) { return j > i - 1; }
+Should combine to "i <= j" (the add/sub has nsw).  Currently not
+optimized with "clang -emit-llvm-bc | opt -O3".
+
+//===---------------------------------------------------------------------===//
+
+unsigned f(unsigned x) { return ((x & 7) + 1) & 15; }
+The & 15 part should be optimized away; it doesn't change the result.  Currently
+not optimized with "clang -emit-llvm-bc | opt -O3".
 
 //===---------------------------------------------------------------------===//
 
@@ -1207,7 +1125,7 @@ There are many load PRE testcases in testsuite/gcc.dg/tree-ssa/loadpre* in the
 GCC testsuite; ones we don't get yet are (checked through loadpre25):
 
 [CRIT EDGE BREAKING]
-loadpre3.c predcom-4.c
+predcom-4.c
 
 [PRE OF READONLY CALL]
 loadpre5.c
 
@@ -1285,26 +1203,6 @@ SingleSource/Benchmarks/Misc/dt.c
 
 //===---------------------------------------------------------------------===//
 
-A/B get pinned to the stack because we turn an if/then into a select instead
-of PRE'ing the load/store.  This may be fixable in instcombine:
-http://gcc.gnu.org/bugzilla/show_bug.cgi?id=37892
-
-struct X { int i; };
-int foo (int x) {
-  struct X a;
-  struct X b;
-  struct X *p;
-  a.i = 1;
-  b.i = 2;
-  if (x)
-    p = &a;
-  else
-    p = &b;
-  return p->i;
-}
-
-//===---------------------------------------------------------------------===//
-
 Interesting missed case because of control flow flattening (should be 2 loads):
 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=26629
 With: llvm-gcc t2.c -S -o - -O0 -emit-llvm | llvm-as |
@@ -1350,12 +1248,28 @@ codegen.
 
 //===---------------------------------------------------------------------===//
 
+SimplifyLibCalls should turn these snprintf idioms into memcpy (GCC PR47917)
+
+char buf1[6], buf2[6], buf3[4], buf4[4];
+int i;
+
+int foo (void) {
+  int ret = snprintf (buf1, sizeof buf1, "abcde");
+  ret += snprintf (buf2, sizeof buf2, "abcdef") * 16;
+  ret += snprintf (buf3, sizeof buf3, "%s", i++ < 6 ? "abc" : "def") * 256;
+  ret += snprintf (buf4, sizeof buf4, "%s", i++ > 10 ? "abcde" : "defgh")*4096;
+  return ret;
+}
+
+//===---------------------------------------------------------------------===//
+
 "gas" uses this idiom:
   else if (strchr ("+-/*%|&^:[]()~", *intel_parser.op_string))
 ..
   else if (strchr ("<>", *intel_parser.op_string)
 
-Those should be turned into a switch.
+Those should be turned into a switch.  SimplifyLibCalls only gets the second
+case.
 
 //===---------------------------------------------------------------------===//
 
@@ -1805,51 +1719,6 @@ case it chooses instead to keep the max operation obvious.
 
 //===---------------------------------------------------------------------===//
 
-Take the following testcase on x86-64 (similar testcases exist for all targets
-with addc/adde):
-
-define void @a(i64* nocapture %s, i64* nocapture %t, i64 %a, i64 %b,
-i64 %c) nounwind {
-entry:
- %0 = zext i64 %a to i128              ; <i128> [#uses=1]
- %1 = zext i64 %b to i128              ; <i128> [#uses=1]
- %2 = add i128 %1, %0                  ; <i128> [#uses=2]
- %3 = zext i64 %c to i128              ; <i128> [#uses=1]
- %4 = shl i128 %3, 64                  ; <i128> [#uses=1]
- %5 = add i128 %4, %2                  ; <i128> [#uses=1]
- %6 = lshr i128 %5, 64                 ; <i128> [#uses=1]
- %7 = trunc i128 %6 to i64             ; <i64> [#uses=1]
- store i64 %7, i64* %s, align 8
- %8 = trunc i128 %2 to i64             ; <i64> [#uses=1]
- store i64 %8, i64* %t, align 8
- ret void
-}
-
-Generated code:
-        addq	%rcx, %rdx
-        movl	$0, %eax
-        adcq	$0, %rax
-        addq	%r8, %rax
-        movq	%rax, (%rdi)
-        movq	%rdx, (%rsi)
-        ret
-
-Expected code:
-        addq	%rcx, %rdx
-        adcq	$0, %r8
-        movq	%r8, (%rdi)
-        movq	%rdx, (%rsi)
-        ret
-
-The generated SelectionDAG has an ADD of an ADDE, where both operands of the
-ADDE are zero. Replacing one of the operands of the ADDE with the other operand
-of the ADD, and replacing the ADD with the ADDE, should give the desired result.
-
-(That said, we are doing a lot better than gcc on this testcase. :) )
-
-//===---------------------------------------------------------------------===//
-
-Switch lowering generates less than ideal code for the following switch:
 define void @a(i32 %x) nounwind {
 entry:
   switch i32 %x, label %if.end [
@@ -1870,19 +1739,15 @@ declare void @foo()
 
 Generated code on x86-64 (other platforms give similar results):
 a:
 	cmpl	$5, %edi
-	ja	.LBB0_2
-	movl	%edi, %eax
-	movl	$47, %ecx
-	btq	%rax, %rcx
-	jb	.LBB0_3
+	ja	LBB2_2
+	cmpl	$4, %edi
+	jne	LBB2_3
 .LBB0_2:
 	ret
 .LBB0_3:
 	jmp	foo  # TAILCALL
 
-The movl+movl+btq+jb could be simplified to a cmpl+jne.
-
-Or, if we wanted to be really clever, we could simplify the whole thing to
+If we wanted to be really clever, we could simplify the whole thing to
 something like the following, which eliminates a branch:
 	xorl	$1, %edi
 	cmpl	$4, %edi
 	setne	%al
 	ret
 .LBB0_2:
 	jmp	foo  # TAILCALL
-//===---------------------------------------------------------------------===//
-Given a branch where the two target blocks are identical ("ret i32 %b" in
-both), simplifycfg will simplify them away. But not so for a switch statement:
-
-define i32 @f(i32 %a, i32 %b) nounwind readnone {
-entry:
- switch i32 %a, label %bb3 [
-   i32 4, label %bb
-   i32 6, label %bb
- ]
-
-bb:		; preds = %entry, %entry
- ret i32 %b
-
-bb3:		; preds = %entry
- ret i32 %b
-}
-//===---------------------------------------------------------------------===//
-
-clang -O3 fails to devirtualize this virtual inheritance case: (GCC PR45875)
-Looks related to PR3100
-
-struct c1 {};
-struct c10 : c1{
-  virtual void foo ();
-};
-struct c11 : c10, c1{
-  virtual void f6 ();
-};
-struct c28 : virtual c11{
-  void f6 ();
-};
-void check_c28 () {
-  c28 obj;
-  c11 *ptr = &obj;
-  ptr->f6 ();
-}
 
 //===---------------------------------------------------------------------===//
 
@@ -2016,44 +1844,6 @@ we remove checking in code like
 
 //===---------------------------------------------------------------------===//
 
-This code (from Benchmarks/Dhrystone/dry.c):
-
-define i32 @Func1(i32, i32) nounwind readnone optsize ssp {
-entry:
-  %sext = shl i32 %0, 24
-  %conv = ashr i32 %sext, 24
-  %sext6 = shl i32 %1, 24
-  %conv4 = ashr i32 %sext6, 24
-  %cmp = icmp eq i32 %conv, %conv4
-  %. = select i1 %cmp, i32 10000, i32 0
-  ret i32 %.
-}
-
-Should be simplified into something like:
-
-define i32 @Func1(i32, i32) nounwind readnone optsize ssp {
-entry:
-  %sext = shl i32 %0, 24
-  %conv = and i32 %sext, 0xFF000000
-  %sext6 = shl i32 %1, 24
-  %conv4 = and i32 %sext6, 0xFF000000
-  %cmp = icmp eq i32 %conv, %conv4
-  %. = select i1 %cmp, i32 10000, i32 0
-  ret i32 %.
-}
-
-and then to:
-
-define i32 @Func1(i32, i32) nounwind readnone optsize ssp {
-entry:
-  %conv = and i32 %0, 0xFF
-  %conv4 = and i32 %1, 0xFF
-  %cmp = icmp eq i32 %conv, %conv4
-  %. = select i1 %cmp, i32 10000, i32 0
-  ret i32 %.
-}
-//===---------------------------------------------------------------------===//
-
 clang -O3 currently compiles this code
 
 int g(unsigned int a) {
@@ -2155,44 +1945,50 @@ _ZNSt12_Vector_baseIiSaIiEEC2EmRKS0_.exit.i.i: ; preds = %cond.true.i.i.i.i
 
   br label %_ZNSt6vectorIiSaIiEEC1EmRKiRKS0_.exit
 
 This is just the handling of the construction of the vector. Most surprising here
-is the fact that all three null stores in %entry are dead, but not eliminated.
+is the fact that all three null stores in %entry are dead (because we do no
+cross-block DSE).
+
 Also surprising is that %conv isn't simplified to 0 in %....exit.thread.i.i.
+This is because the client of LazyValueInfo doesn't simplify all instruction
+operands, just selected ones.
 
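+A minimal illustration of the cross-block DSE point (an editorial sketch with
+made-up names, not from the original testcase): the first store below is dead
+on every path, but its killers live in successor blocks, so a block-local DSE
+cannot prove it removable.
+
+void f(int *p, int x) {
+  *p = 0;        /* dead: unconditionally overwritten on both paths */
+  if (x)
+    *p = 1;      /* kills the store of 0 on this path */
+  else
+    *p = 2;      /* ... and on this one */
+}
+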
 //===---------------------------------------------------------------------===//
 
 clang -O3 -fno-exceptions currently compiles this code:
 
-void f(int N) {
-  std::vector<int> v(N);
-  for (int k = 0; k < N; ++k)
-    v[k] = 0;
-
-  extern void sink(void*); sink(&v);
+void f(char* a, int n) {
+  __builtin_memset(a, 0, n);
+  for (int i = 0; i < n; ++i)
+    a[i] = 0;
 }
 
-into almost the same as the previous note, but replace its final BB with:
+into:
 
-for.body.lr.ph:                                   ; preds = %cond.true.i.i.i.i
-  %mul.i.i.i.i.i = shl i64 %conv, 2
-  %call3.i.i.i.i.i = call noalias i8* @_Znwm(i64 %mul.i.i.i.i.i) nounwind
-  %0 = bitcast i8* %call3.i.i.i.i.i to i32*
-  store i32* %0, i32** %v8.sub, align 8, !tbaa !0
-  %add.ptr.i.i.i = getelementptr inbounds i32* %0, i64 %conv
-  store i32* %add.ptr.i.i.i, i32** %tmp4.i.i.i.i.i, align 8, !tbaa !0
-  call void @llvm.memset.p0i8.i64(i8* %call3.i.i.i.i.i, i8 0, i64 %mul.i.i.i.i.i, i32 4, i1 false)
-  store i32* %add.ptr.i.i.i, i32** %tmp3.i.i.i.i.i, align 8, !tbaa !0
-  %tmp18 = add i32 %N, -1
-  %tmp19 = zext i32 %tmp18 to i64
-  %tmp20 = shl i64 %tmp19, 2
-  %tmp21 = add i64 %tmp20, 4
-  call void @llvm.memset.p0i8.i64(i8* %call3.i.i.i.i.i, i8 0, i64 %tmp21, i32 4, i1 false)
-  br label %for.end
+define void @_Z1fPci(i8* nocapture %a, i32 %n) nounwind {
+entry:
+  %conv = sext i32 %n to i64
+  tail call void @llvm.memset.p0i8.i64(i8* %a, i8 0, i64 %conv, i32 1, i1 false)
+  %cmp8 = icmp sgt i32 %n, 0
+  br i1 %cmp8, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:                                   ; preds = %entry
+  %tmp10 = add i32 %n, -1
+  %tmp11 = zext i32 %tmp10 to i64
+  %tmp12 = add i64 %tmp11, 1
+  call void @llvm.memset.p0i8.i64(i8* %a, i8 0, i64 %tmp12, i32 1, i1 false)
+  ret void
+
+for.end:                                          ; preds = %entry
+  ret void
+}
 
-First off, why (((zext %N - 1) << 2) + 4) instead of the ((sext %N) << 2) done
-previously? (or better yet, re-use that one?)
+This shouldn't need the ((zext (%n - 1)) + 1) game, and it should ideally fold
+the two memsets together.
 
-Then, the really painful one is the second memset, of the same memory, to the
-same value.
+The issue with the addition only occurs in 64-bit mode, and appears to be at
+least partially caused by Scalar Evolution not keeping its cache updated: it
+returns the "wrong" result immediately after indvars runs, but figures out the
+expected result if it is run from scratch on IR resulting from running indvars.
 
 //===---------------------------------------------------------------------===//
 
@@ -2213,6 +2009,12 @@ S is only 6 bytes, but each element is 8 byte-aligned. We generate a loop and
 4 stores on each iteration.  If the struct were 8 bytes, this gets turned into
 a memset.
 
+In order to handle this we have to:
+  A) Teach clang to generate metadata for memsets of structs that have holes in
+     them.
+  B) Teach clang to use such a memset for zero init of this struct (since it has
+     a hole), instead of doing elementwise zeroing.
+
 //===---------------------------------------------------------------------===//
 
 clang -O3 currently compiles this code:
 
@@ -2262,7 +2064,7 @@ transform the fmul to 0.0, and then the fadd to 2.0.
 
 We should enhance memcpy/memmove/memset to allow a metadata node on them
 indicating that some bytes of the transfer are undefined.  This is useful for
-frontends like clang when lowering struct lowering, when some elements of the
+frontends like clang when lowering struct copies, when some elements of the
 struct are undefined.  Consider something like this:
 
 struct x {
@@ -2307,3 +2109,171 @@ avoids partial register stalls in some important cases.
 
 //===---------------------------------------------------------------------===//
 
+We don't fold (icmp (add) (add)) unless the two adds only have a single use.
+There are a lot of cases that we're refusing to fold in (e.g.) 256.bzip2, for
+example:
+
+  %indvar.next90 = add i64 %indvar89, 1     ;; Has 2 uses
+  %tmp96 = add i64 %tmp95, 1                ;; Has 1 use
+  %exitcond97 = icmp eq i64 %indvar.next90, %tmp96
+
+We don't fold this because we don't want to introduce an overlapped live range
+of the induction variable.  However, we can make this more aggressive without
+causing performance issues in two ways:
+
+1. If *either* the LHS or RHS has a single use, we can definitely do the
+   transformation.  In the overlapping liverange case we're trading one register
+   use for one fewer operation, which is a reasonable trade.  Before doing this
+   we should verify that the llc output actually shrinks for some benchmarks.
+2. If both ops have multiple uses, we can still fold it if the operations are
+   both sinkable to *after* the icmp (e.g. in a subsequent block) which doesn't
+   increase register pressure.
+
+There are a ton of icmp's we aren't simplifying because of the reg pressure
+concern.  Care is warranted here though because many of these are induction
+variables and other cases that matter a lot to performance, like the above.
+Here's a blob of code that you can drop into the bottom of visitICmp to see some
+missed cases:
+
+  { Value *A, *B, *C, *D;
+    if (match(Op0, m_Add(m_Value(A), m_Value(B))) &&
+        match(Op1, m_Add(m_Value(C), m_Value(D))) &&
+        (A == C || A == D || B == C || B == D)) {
+      errs() << "OP0 = " << *Op0 << "  U=" << Op0->getNumUses() << "\n";
+      errs() << "OP1 = " << *Op1 << "  U=" << Op1->getNumUses() << "\n";
+      errs() << "CMP = " << I << "\n\n";
+    }
+  }
+
+//===---------------------------------------------------------------------===//
+
+define i1 @test1(i32 %x) nounwind {
+  %and = and i32 %x, 3
+  %cmp = icmp ult i32 %and, 2
+  ret i1 %cmp
+}
+
+Can be folded to (x & 2) == 0.
+
+define i1 @test2(i32 %x) nounwind {
+  %and = and i32 %x, 3
+  %cmp = icmp ugt i32 %and, 1
+  ret i1 %cmp
+}
+
+Can be folded to (x & 2) != 0.
+
+SimplifyDemandedBits shrinks the "and" constant to 2 but instcombine misses the
+icmp transform.
+
+//===---------------------------------------------------------------------===//
+
+This code:
+
+typedef struct {
+int f1:1;
+int f2:1;
+int f3:1;
+int f4:29;
+} t1;
+
+typedef struct {
+int f1:1;
+int f2:1;
+int f3:30;
+} t2;
+
+t1 s1;
+t2 s2;
+
+void func1(void)
+{
+s1.f1 = s2.f1;
+s1.f2 = s2.f2;
+}
+
+Compiles into this IR (on x86-64 at least):
+
+%struct.t1 = type { i8, [3 x i8] }
+@s2 = global %struct.t1 zeroinitializer, align 4
+@s1 = global %struct.t1 zeroinitializer, align 4
+define void @func1() nounwind ssp noredzone {
+entry:
+  %0 = load i32* bitcast (%struct.t1* @s2 to i32*), align 4
+  %bf.val.sext5 = and i32 %0, 1
+  %1 = load i32* bitcast (%struct.t1* @s1 to i32*), align 4
+  %2 = and i32 %1, -4
+  %3 = or i32 %2, %bf.val.sext5
+  %bf.val.sext26 = and i32 %0, 2
+  %4 = or i32 %3, %bf.val.sext26
+  store i32 %4, i32* bitcast (%struct.t1* @s1 to i32*), align 4
+  ret void
+}
+
+The two or/and's should be merged into one each.
+
+//===---------------------------------------------------------------------===//
+
+Machine level code hoisting can be useful in some cases.
For example, PR9408 +is about: + +typedef union { + void (*f1)(int); + void (*f2)(long); +} funcs; + +void foo(funcs f, int which) { + int a = 5; + if (which) { + f.f1(a); + } else { + f.f2(a); + } +} + +which we compile to: + +foo: # @foo +# BB#0: # %entry + pushq %rbp + movq %rsp, %rbp + testl %esi, %esi + movq %rdi, %rax + je .LBB0_2 +# BB#1: # %if.then + movl $5, %edi + callq *%rax + popq %rbp + ret +.LBB0_2: # %if.else + movl $5, %edi + callq *%rax + popq %rbp + ret + +Note that bb1 and bb2 are the same. This doesn't happen at the IR level +because one call is passing an i32 and the other is passing an i64. + +//===---------------------------------------------------------------------===// + +I see this sort of pattern in 176.gcc in a few places (e.g. the start of +store_bit_field). The rem should be replaced with a multiply and subtract: + + %3 = sdiv i32 %A, %B + %4 = srem i32 %A, %B + +Similarly for udiv/urem. Note that this shouldn't be done on X86 or ARM, +which can do this in a single operation (instruction or libcall). It is +probably best to do this in the code generator. + +//===---------------------------------------------------------------------===// + +unsigned foo(unsigned x, unsigned y) { return (x & y) == 0 || x == 0; } +should fold to (x & y) == 0. + +//===---------------------------------------------------------------------===// + +unsigned foo(unsigned x, unsigned y) { return x > y && x != 0; } +should fold to x > y. + +//===---------------------------------------------------------------------===//
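+
+Following up on the sdiv/srem note above: an editorial sketch (hypothetical
+helper name, C rather than IR) of the suggested rewrite for targets without a
+combined divrem operation.  The quotient is computed once, and the remainder
+is derived from it with a multiply and subtract:
+
+int divrem(int A, int B, int *rem) {
+  int q = A / B;       /* %3 = sdiv i32 %A, %B */
+  *rem = A - q * B;    /* replaces %4 = srem i32 %A, %B; exact by the
+                          definition of truncating division */
+  return q;
+}
+
+//===---------------------------------------------------------------------===//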