... saving two instructions.
The basic idea is that a reload from a spill slot, can, if only one 4-byte
-chunk is used, bring in 3 zeros the the one element instead of 4 elements.
+chunk is used, bring in 3 zeros and the one element instead of 4 elements.
This can be used to simplify a variety of shuffle operations, where the
elements are fixed zeros.
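+
+A hedged sketch of the idea (the spill-slot offset and registers below are
+made up, not taken from an actual compile): a movss load zeros the upper three
+elements, so the reload itself can supply the fixed zeros,
+
+	movss	16(%esp), %xmm0		// one 4-byte chunk reloaded, elements 1-3 are zero
+
+and a following shuffle whose only job is to merge in zeros can be dropped.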
shufps $132, %xmm2, %xmm0
movaps %xmm0, 0
-//===---------------------------------------------------------------------===//
-rdar://6037315
-
-llvm-gcc-4.2 does the following for uint32_t -> float conversions on i386:
-
- uint32_t x;
- float y = (float)x;
-
-becomes:
-
-movl %eax, -8(%ebp) // write x to the stack
-movl $0x3ff00000, -4(%ebp) // 2^52 + x as a double at -4(%ebp)
-movsd -8(%ebp), %xmm0
-subsd [2^52 double], %xmm0 // subtract 2^52 -- this is exact
-cvtsd2ss %xmm0, %xmm0 // convert to single -- rounding happens here
-
-On merom/yonah, this takes a substantial stall. The following is a much
-better option:
-
-movd %eax, %xmm0 // load x into low word of xmm0
-movsd [2^52 double], %xmm1 // load 2^52 into xmm1
-orpd %xmm1, %xmm0 // 2^52 + x in double precision
-subsd %xmm1, %xmm0 // x in double precision
-cvtsd2ss %xmm0, %xmm0 // x rounded to single precision
-
-IF we don't already need PIC, then the following is even faster still, at a
-small cost to code size:
-
-movl $0x3ff00000, %ecx // conjure high word of 2^52
-movd %ecx, %xmm1
-movss %eax, %xmm0 // load x into low word of xmm0
-psllq $32, %xmm1 // 2^52
-orpd %xmm1, %xmm0 // 2^52 + x in double precision
-subsd %xmm1, %xmm0 // x in double precision
-cvtsd2ss %xmm0, %xmm0 // x in single precision
-
//===---------------------------------------------------------------------===//
rdar://5907648
Consider using movlps instead of movsd to implement (scalar_to_vector (loadf64))
when code size is critical. movlps is slower than movsd on core2 but it's one
byte shorter.
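+
+As a sketch of the size difference (the operands are illustrative), the movsd
+form carries an extra F2 prefix:
+
+	movsd	(%eax), %xmm0		// F2 0F 10 00 -- 4 bytes
+	movlps	(%eax), %xmm0		//    0F 12 00 -- 3 bytes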
+
+//===---------------------------------------------------------------------===//
+
+We should use a dynamic-programming-based approach to decide when using FPStack
+operations is cheaper than SSE. The SciMark montecarlo kernel contains code
+like this, for example:
+
+double MonteCarlo_num_flops(int Num_samples) {
+ return ((double) Num_samples)* 4.0;
+}
+
+In fpstack mode, this compiles into:
+
+LCPI1_0:
+ .long 1082130432 ## float 4.000000e+00
+_MonteCarlo_num_flops:
+ subl $4, %esp
+ movl 8(%esp), %eax
+ movl %eax, (%esp)
+ fildl (%esp)
+ fmuls LCPI1_0
+ addl $4, %esp
+ ret
+
+In SSE mode, it compiles into significantly slower code:
+
+_MonteCarlo_num_flops:
+ subl $12, %esp
+ cvtsi2sd 16(%esp), %xmm0
+ mulsd LCPI1_0, %xmm0
+ movsd %xmm0, (%esp)
+ fldl (%esp)
+ addl $12, %esp
+ ret
+
+There are also other cases in SciMark where using the FP stack is better: it is
+cheaper to do fld1 than to load 1.0 from a constant pool, for example, so
+"load, add 1.0, store" is better done on the FP stack, etc.
+
+//===---------------------------------------------------------------------===//
+
+The X86 backend should be able to if-convert SSE comparisons like "ucomisd" to
+"cmpsd". For example, this code:
+
+double d1(double x) { return x == x ? x : x + x; }
+
+Compiles into:
+
+_d1:
+ ucomisd %xmm0, %xmm0
+ jnp LBB1_2
+ addsd %xmm0, %xmm0
+ ret
+LBB1_2:
+ ret
+
+Also, the 'ret's should be shared. This is PR6032.
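+
+A hedged sketch of what the if-converted, branchless form might look like (the
+exact select sequence below is illustrative):
+
+	movapd	%xmm0, %xmm1
+	addsd	%xmm0, %xmm1		## x + x
+	movapd	%xmm0, %xmm2
+	cmpeqsd	%xmm0, %xmm2		## all-ones if x == x, zero if x is NaN
+	andpd	%xmm2, %xmm0		## keep x where the compare was true
+	andnpd	%xmm1, %xmm2		## keep x + x where it was false
+	orpd	%xmm2, %xmm0
+	ret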
+
+//===---------------------------------------------------------------------===//
+
+These should compile into the same code (PR6214). Perhaps instcombine should
+canonicalize the former into the latter?
+
+define float @foo(float %x) nounwind {
+ %t = bitcast float %x to i32
+ %s = and i32 %t, 2147483647
+ %d = bitcast i32 %s to float
+ ret float %d
+}
+
+declare float @fabsf(float %n)
+define float @bar(float %x) nounwind {
+ %d = call float @fabsf(float %x)
+ ret float %d
+}
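+
+A hedged sketch of the code both might ideally become (the constant-pool label
+is made up; it would hold the 0x7fffffff sign-clearing mask):
+
+_foo:
+	andps	LCPI0_0, %xmm0		## clear the sign bit
+	ret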
+
+//===---------------------------------------------------------------------===//
+
+This IR (from PR6194):
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-apple-darwin10.0.0"
+
+%0 = type { double, double }
+%struct.float3 = type { float, float, float }
+
+define void @test(%0, %struct.float3* nocapture %res) nounwind noinline ssp {
+entry:
+ %tmp18 = extractvalue %0 %0, 0 ; <double> [#uses=1]
+ %tmp19 = bitcast double %tmp18 to i64 ; <i64> [#uses=1]
+ %tmp20 = zext i64 %tmp19 to i128 ; <i128> [#uses=1]
+ %tmp10 = lshr i128 %tmp20, 32 ; <i128> [#uses=1]
+ %tmp11 = trunc i128 %tmp10 to i32 ; <i32> [#uses=1]
+ %tmp12 = bitcast i32 %tmp11 to float ; <float> [#uses=1]
+ %tmp5 = getelementptr inbounds %struct.float3* %res, i64 0, i32 1 ; <float*> [#uses=1]
+ store float %tmp12, float* %tmp5
+ ret void
+}
+
+Compiles to:
+
+_test: ## @test
+ movd %xmm0, %rax
+ shrq $32, %rax
+ movl %eax, 4(%rdi)
+ ret
+
+This would be better kept in the SSE unit by treating XMM0 as a 4xfloat and
+doing a shuffle from v[1] to v[0], then a float store.
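+
+That is, something like the following sketch (the exact shuffle choice is
+illustrative):
+
+_test:
+	shufps	$1, %xmm0, %xmm0	## move element 1 down to element 0
+	movss	%xmm0, 4(%rdi)
+	ret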
+
+//===---------------------------------------------------------------------===//