X-Git-Url: http://plrg.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTarget%2FX86%2FREADME-SSE.txt;h=19eb05e7657bfd64d1017a37933efce35c09ea1d;hb=f451cb870efcf9e0302d25ed05f4cac6bb494e42;hp=bc51b53482430043fffb8202642b99d360cbf272;hpb=1632782fe9a845e545f0ebd05fb4dc304adb5bd2;p=oota-llvm.git

diff --git a/lib/Target/X86/README-SSE.txt b/lib/Target/X86/README-SSE.txt
index bc51b534824..19eb05e7657 100644
--- a/lib/Target/X86/README-SSE.txt
+++ b/lib/Target/X86/README-SSE.txt
@@ -376,7 +376,7 @@ ret
 ... saving two instructions.
 
 The basic idea is that a reload from a spill slot, can, if only one 4-byte
-chunk is used, bring in 3 zeros the the one element instead of 4 elements.
+chunk is used, bring in 3 zeros the one element instead of 4 elements.
 This can be used to simplify a variety of shuffle operations, where the
 elements are fixed zeros.
 
@@ -840,42 +840,6 @@ _t:
         shufps $132, %xmm2, %xmm0
         movaps %xmm0, 0
 
-//===---------------------------------------------------------------------===//
-rdar://6037315
-
-llvm-gcc-4.2 does the following for uint32_t -> float conversions on i386:
-
-   uint32_t x;
-   float y = (float)x;
-
-becomes:
-
-movl %eax, -8(%ebp)            // write x to the stack
-movl $0x3ff00000, -4(%ebp)     // 2^52 + x as a double at -4(%ebp)
-movsd -8(%ebp), %xmm0
-subsd [2^52 double], %xmm0     // subtract 2^52 -- this is exact
-cvtsd2ss %xmm0, %xmm0          // convert to single -- rounding happens here
-
-On merom/yonah, this takes a substantial stall. The following is a much
-better option:
-
-movd %eax, %xmm0               // load x into low word of xmm0
-movsd [2^52 double], %xmm1     // load 2^52 into xmm1
-orpd %xmm1, %xmm0              // 2^52 + x in double precision
-subsd %xmm1, %xmm0             // x in double precision
-cvtsd2ss %xmm0, %xmm0          // x rounded to single precision
-
-IF we don't already need PIC, then the following is even faster still, at a
-small cost to code size:
-
-movl $0x3ff00000, %ecx         // conjure high word of 2^52
-movd %ecx, %xmm1
-movss %eax, %xmm0              // load x into low word of xmm0
-psllq $32, %xmm1               // 2^52
-orpd %xmm1, %xmm0              // 2^52 + x in double precision
-subsd %xmm1, %xmm0             // x in double precision
-cvtsd2ss %xmm0, %xmm0          // x in single precision
-
 //===---------------------------------------------------------------------===//
 rdar://5907648
 
@@ -912,3 +876,114 @@ since we know the stack slot is already zext'd.
 Consider using movlps instead of movsd to implement (scalar_to_vector
 (loadf64)) when code size is critical. movlps is slower than movsd on core2 but
 it's one byte shorter.
+
+//===---------------------------------------------------------------------===//
+
+We should use a dynamic programming based approach to tell when using FPStack
+operations is cheaper than SSE. SciMark montecarlo contains code like this
+for example:
+
+double MonteCarlo_num_flops(int Num_samples) {
+    return ((double) Num_samples) * 4.0;
+}
+
+In fpstack mode, this compiles into:
+
+LCPI1_0:
+        .long   1082130432              ## float 4.000000e+00
+_MonteCarlo_num_flops:
+        subl    $4, %esp
+        movl    8(%esp), %eax
+        movl    %eax, (%esp)
+        fildl   (%esp)
+        fmuls   LCPI1_0
+        addl    $4, %esp
+        ret
+
+In SSE mode, it compiles into significantly slower code:
+
+_MonteCarlo_num_flops:
+        subl    $12, %esp
+        cvtsi2sd 16(%esp), %xmm0
+        mulsd   LCPI1_0, %xmm0
+        movsd   %xmm0, (%esp)
+        fldl    (%esp)
+        addl    $12, %esp
+        ret
+
+There are also other cases in SciMark where using the fp stack is better; for
+example, it is cheaper to do fld1 than to load 1.0 from a constant pool, so
+"load, add 1.0, store" is better done on the fp stack, etc.
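To make the fld1 point above concrete, here is a rough sketch of how
double inc(double x) { return x + 1.0; } might come out on i386 with the fp
stack versus with SSE. These are hand-written, hypothetical sequences, not
actual compiler output; the inc name and the LCPI9_0 label are invented:

        ## fp stack: 1.0 comes from fld1, no constant-pool load needed
_inc:
        fld1                            ## push 1.0
        faddl   4(%esp)                 ## st(0) = 1.0 + x (argument on the stack)
        ret                             ## double result is returned in st(0)

        ## SSE: 1.0 must come from memory, and because the i386 ABI returns
        ## doubles in st(0), the result is stored and reloaded, just like the
        ## MonteCarlo_num_flops example above
LCPI9_0:
        .quad   4607182418800017408     ## double 1.000000e+00
_inc:
        subl    $12, %esp
        movsd   16(%esp), %xmm0         ## load x
        addsd   LCPI9_0, %xmm0          ## x + 1.0
        movsd   %xmm0, (%esp)           ## spill so the value can be...
        fldl    (%esp)                  ## ...returned in st(0)
        addl    $12, %esp
        ret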
+
+//===---------------------------------------------------------------------===//
+
+The X86 backend should be able to if-convert SSE comparisons like "ucomisd" to
+"cmpsd". For example, this code:
+
+double d1(double x) { return x == x ? x : x + x; }
+
+Compiles into:
+
+_d1:
+        ucomisd %xmm0, %xmm0
+        jnp     LBB1_2
+        addsd   %xmm0, %xmm0
+        ret
+LBB1_2:
+        ret
+
+Also, the 'ret's should be shared. This is PR6032.
+
+//===---------------------------------------------------------------------===//
+
+These should compile into the same code (PR6214). Perhaps instcombine should
+canonicalize the former into the latter?
+
+define float @foo(float %x) nounwind {
+  %t = bitcast float %x to i32
+  %s = and i32 %t, 2147483647
+  %d = bitcast i32 %s to float
+  ret float %d
+}
+
+declare float @fabsf(float %n)
+define float @bar(float %x) nounwind {
+  %d = call float @fabsf(float %x)
+  ret float %d
+}
+
+//===---------------------------------------------------------------------===//
+
+This IR (from PR6194):
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-apple-darwin10.0.0"
+
+%0 = type { double, double }
+%struct.float3 = type { float, float, float }
+
+define void @test(%0, %struct.float3* nocapture %res) nounwind noinline ssp {
+entry:
+  %tmp18 = extractvalue %0 %0, 0                  ; [#uses=1]
+  %tmp19 = bitcast double %tmp18 to i64           ; [#uses=1]
+  %tmp20 = zext i64 %tmp19 to i128                ; [#uses=1]
+  %tmp10 = lshr i128 %tmp20, 32                   ; [#uses=1]
+  %tmp11 = trunc i128 %tmp10 to i32               ; [#uses=1]
+  %tmp12 = bitcast i32 %tmp11 to float            ; [#uses=1]
+  %tmp5 = getelementptr inbounds %struct.float3* %res, i64 0, i32 1 ; [#uses=1]
+  store float %tmp12, float* %tmp5
+  ret void
+}
+
+Compiles to:
+
+_test:                                  ## @test
+        movd    %xmm0, %rax
+        shrq    $32, %rax
+        movl    %eax, 4(%rdi)
+        ret
+
+This would be better kept in the SSE unit by treating XMM0 as a 4xfloat and
+doing a shuffle from v[1] to v[0], then a float store.
+
+//===---------------------------------------------------------------------===//
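
For the PR6214 note above, both @foo and @bar ought to boil down to a single
AND that clears the sign bit. A rough sketch of the common output on x86-64
follows; it is hypothetical, not actual compiler output, and the LCPI_absmask
label is invented:

        .p2align 4                      ## andps needs a 16-byte-aligned operand
LCPI_absmask:
        .long   2147483647              ## 0x7fffffff, repeated across the vector
        .long   2147483647
        .long   2147483647
        .long   2147483647
_foo:                                   ## ideally @bar compiles to this as well
        andps   LCPI_absmask(%rip), %xmm0   ## clear the sign bit of the float
        ret

//===---------------------------------------------------------------------===//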
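
For the PR6194 example above, the sequence the note is asking for might look
roughly like this. This is a hand-written sketch, not compiler output; shufps
is one way to do the lane move, and movshdup or pshufd would work as well:

_test:                                  ## @test, staying in the SSE unit
        shufps  $1, %xmm0, %xmm0        ## move vector element 1 down to element 0
        movss   %xmm0, 4(%rdi)          ## store just that float to the field at offset 4
        ret

//===---------------------------------------------------------------------===//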