X-Git-Url: http://plrg.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTarget%2FX86%2FREADME-SSE.txt;h=19eb05e7657bfd64d1017a37933efce35c09ea1d;hb=f451cb870efcf9e0302d25ed05f4cac6bb494e42;hp=bc51b53482430043fffb8202642b99d360cbf272;hpb=1632782fe9a845e545f0ebd05fb4dc304adb5bd2;p=oota-llvm.git

diff --git a/lib/Target/X86/README-SSE.txt b/lib/Target/X86/README-SSE.txt
index bc51b534824..19eb05e7657 100644
--- a/lib/Target/X86/README-SSE.txt
+++ b/lib/Target/X86/README-SSE.txt
@@ -376,7 +376,7 @@ ret
 ... saving two instructions.
 
 The basic idea is that a reload from a spill slot, can, if only one 4-byte
-chunk is used, bring in 3 zeros the the one element instead of 4 elements.
+chunk is used, bring in 3 zeros the one element instead of 4 elements.
 This can be used to simplify a variety of shuffle operations, where the
 elements are fixed zeros.
 
@@ -840,42 +840,6 @@ _t:
         shufps $132, %xmm2, %xmm0
         movaps %xmm0, 0
 
-//===---------------------------------------------------------------------===//
-rdar://6037315
-
-llvm-gcc-4.2 does the following for uint32_t -> float conversions on i386:
-
-   uint32_t x;
-   float y = (float)x;
-
-becomes:
-
-movl %eax, -8(%ebp)            // write x to the stack
-movl $0x3ff00000, -4(%ebp)     // 2^52 + x as a double at -4(%ebp)
-movsd -8(%ebp), %xmm0
-subsd [2^52 double], %xmm0     // subtract 2^52 -- this is exact
-cvtsd2ss %xmm0, %xmm0          // convert to single -- rounding happens here
-
-On merom/yonah, this takes a substantial stall. The following is a much
-better option:
-
-movd %eax, %xmm0               // load x into low word of xmm0
-movsd [2^52 double], %xmm1     // load 2^52 into xmm1
-orpd %xmm1, %xmm0              // 2^52 + x in double precision
-subsd %xmm1, %xmm0             // x in double precision
-cvtsd2ss %xmm0, %xmm0          // x rounded to single precision
-
-IF we don't already need PIC, then the following is even faster still, at a
-small cost to code size:
-
-movl $0x3ff00000, %ecx         // conjure high word of 2^52
-movd %ecx, %xmm1
-movss %eax, %xmm0              // load x into low word of xmm0
-psllq $32, %xmm1               // 2^52
-orpd %xmm1, %xmm0              // 2^52 + x in double precision
-subsd %xmm1, %xmm0             // x in double precision
-cvtsd2ss %xmm0, %xmm0          // x in single precision
-
 //===---------------------------------------------------------------------===//
 rdar://5907648
 
@@ -912,3 +876,114 @@ since we know the stack slot is already zext'd.
 Consider using movlps instead of movsd to implement (scalar_to_vector
 (loadf64)) when code size is critical. movlps is slower than movsd on core2 but
 it's one byte shorter.
+
+//===---------------------------------------------------------------------===//
+
+We should use a dynamic programming based approach to tell when using FPStack
+operations is cheaper than SSE. SciMark montecarlo contains code like this
+for example:
+
+double MonteCarlo_num_flops(int Num_samples) {
+    return ((double) Num_samples) * 4.0;
+}
+
+In fpstack mode, this compiles into:
+
+LCPI1_0:
+        .long   1082130432              ## float 4.000000e+00
+_MonteCarlo_num_flops:
+        subl    $4, %esp
+        movl    8(%esp), %eax
+        movl    %eax, (%esp)
+        fildl   (%esp)
+        fmuls   LCPI1_0
+        addl    $4, %esp
+        ret
+
+In SSE mode, it compiles into significantly slower code:
+
+_MonteCarlo_num_flops:
+        subl    $12, %esp
+        cvtsi2sd 16(%esp), %xmm0
+        mulsd   LCPI1_0, %xmm0
+        movsd   %xmm0, (%esp)
+        fldl    (%esp)
+        addl    $12, %esp
+        ret
+
+There are also other cases in SciMark where using the fp stack is better; for
+example, it is cheaper to do fld1 than to load 1.0 from a constant pool, so
+"load, add 1.0, store" is better done on the fp stack, etc.
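To make the fld1 point above concrete, here is a rough sketch of how
double inc(double x) { return x + 1.0; } might come out on i386 with the fp
stack versus with SSE. These are hand-written, hypothetical sequences, not
actual compiler output; the inc name and the LCPI9_0 label are invented:

        ## fp stack: 1.0 comes from fld1, no constant-pool load needed
_inc:
        fld1                            ## push 1.0
        faddl   4(%esp)                 ## st(0) = 1.0 + x (argument on the stack)
        ret                             ## double result is returned in st(0)

        ## SSE: 1.0 must come from memory, and because the i386 ABI returns
        ## doubles in st(0), the result is stored and reloaded, just like the
        ## MonteCarlo_num_flops example above
LCPI9_0:
        .quad   4607182418800017408     ## double 1.000000e+00
_inc:
        subl    $12, %esp
        movsd   16(%esp), %xmm0         ## load x
        addsd   LCPI9_0, %xmm0          ## x + 1.0
        movsd   %xmm0, (%esp)           ## spill so the value can be...
        fldl    (%esp)                  ## ...returned in st(0)
        addl    $12, %esp
        ret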
+
+//===---------------------------------------------------------------------===//
+
+The X86 backend should be able to if-convert SSE comparisons like "ucomisd" to
+"cmpsd". For example, this code:
+
+double d1(double x) { return x == x ? x : x + x; }
+
+Compiles into:
+
+_d1:
+        ucomisd %xmm0, %xmm0
+        jnp     LBB1_2
+        addsd   %xmm0, %xmm0
+        ret
+LBB1_2:
+        ret
+
+Also, the 'ret's should be shared. This is PR6032.
+
+//===---------------------------------------------------------------------===//
+
+These should compile into the same code (PR6214). Perhaps instcombine should
+canonicalize the former into the latter?
+
+define float @foo(float %x) nounwind {
+  %t = bitcast float %x to i32
+  %s = and i32 %t, 2147483647
+  %d = bitcast i32 %s to float
+  ret float %d
+}
+
+declare float @fabsf(float %n)
+define float @bar(float %x) nounwind {
+  %d = call float @fabsf(float %x)
+  ret float %d
+}
+
+//===---------------------------------------------------------------------===//
+
+This IR (from PR6194):
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-apple-darwin10.0.0"
+
+%0 = type { double, double }
+%struct.float3 = type { float, float, float }
+
+define void @test(%0, %struct.float3* nocapture %res) nounwind noinline ssp {
+entry:
+  %tmp18 = extractvalue %0 %0, 0                  ; [#uses=1]
+  %tmp19 = bitcast double %tmp18 to i64           ; [#uses=1]
+  %tmp20 = zext i64 %tmp19 to i128                ; [#uses=1]
+  %tmp10 = lshr i128 %tmp20, 32                   ; [#uses=1]
+  %tmp11 = trunc i128 %tmp10 to i32               ; [#uses=1]
+  %tmp12 = bitcast i32 %tmp11 to float            ; [#uses=1]
+  %tmp5 = getelementptr inbounds %struct.float3* %res, i64 0, i32 1 ; [#uses=1]
+  store float %tmp12, float* %tmp5
+  ret void
+}
+
+Compiles to:
+
+_test:                                  ## @test
+        movd    %xmm0, %rax
+        shrq    $32, %rax
+        movl    %eax, 4(%rdi)
+        ret
+
+This would be better kept in the SSE unit by treating XMM0 as a 4xfloat and
+doing a shuffle from v[1] to v[0], then a float store.
+
+//===---------------------------------------------------------------------===//
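
For the PR6214 note above, both @foo and @bar ought to boil down to a single
AND that clears the sign bit. A rough sketch of the common output on x86-64
follows; it is hypothetical, not actual compiler output, and the LCPI_absmask
label is invented:

        .p2align 4                      ## andps needs a 16-byte-aligned operand
LCPI_absmask:
        .long   2147483647              ## 0x7fffffff, repeated across the vector
        .long   2147483647
        .long   2147483647
        .long   2147483647
_foo:                                   ## ideally @bar compiles to this as well
        andps   LCPI_absmask(%rip), %xmm0   ## clear the sign bit of the float
        ret

//===---------------------------------------------------------------------===//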
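
For the PR6194 example above, the sequence the note is asking for might look
roughly like this. This is a hand-written sketch, not compiler output; shufps
is one way to do the lane move, and movshdup or pshufd would work as well:

_test:                                  ## @test, staying in the SSE unit
        shufps  $1, %xmm0, %xmm0        ## move vector element 1 down to element 0
        movss   %xmm0, 4(%rdi)          ## store just that float to the field at offset 4
        ret

//===---------------------------------------------------------------------===//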