// Random ideas for the X86 backend: SSE-specific stuff.
//===---------------------------------------------------------------------===//
-- Consider eliminating the unaligned SSE load intrinsics, replacing them with
- unaligned LLVM load instructions.
+//===---------------------------------------------------------------------===//
+
+SSE variable shifts can be custom lowered to something like this, which uses a
+small table + unaligned load + shuffle instead of going through memory.
+
+__m128i_shift_right:
+ .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ .byte -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
+
+...
+__m128i shift_right(__m128i value, unsigned long offset) {
+ return _mm_shuffle_epi8(value,
+      _mm_loadu_si128((__m128i *) (__m128i_shift_right + offset)));
+}
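+
+A self-contained C sketch of the same trick, with the table written as C data
+instead of an assembly label (the helper names and the SSSE3 requirement are
+assumptions for illustration):
+
+#include <tmmintrin.h>
+
+static const signed char shift_right_table[32] = {
+   0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
+  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
+};
+
+static __m128i shift_right_bytes(__m128i value, unsigned long offset) {
+  /* For offset k the control vector is {k..15, then k copies of -1}; pshufb
+     copies byte k+i of 'value' into lane i and zeroes every lane whose
+     control byte has its high bit set, so e.g. offset 3 yields
+     {value[3..15], 0, 0, 0}. */
+  return _mm_shuffle_epi8(value,
+      _mm_loadu_si128((const __m128i *)(shift_right_table + offset)));
+}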
+
+//===---------------------------------------------------------------------===//
+
+SSE has instructions for doing operations on complex numbers; we should
+pattern match them. For example, this should turn into a horizontal add:
+
+typedef float __attribute__((vector_size(16))) v4f32;
+float f32(v4f32 A) {
+ return A[0]+A[1]+A[2]+A[3];
+}
+
+Instead we get this:
+
+_f32: ## @f32
+ pshufd $1, %xmm0, %xmm1 ## xmm1 = xmm0[1,0,0,0]
+ addss %xmm0, %xmm1
+ pshufd $3, %xmm0, %xmm2 ## xmm2 = xmm0[3,0,0,0]
+ movhlps %xmm0, %xmm0 ## xmm0 = xmm0[1,1]
+ movaps %xmm0, %xmm3
+ addss %xmm1, %xmm3
+ movdqa %xmm2, %xmm0
+ addss %xmm3, %xmm0
+ ret
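+
+For reference, a sketch of the desired lowering at the intrinsics level, using
+the v4f32 typedef above (assumes SSE3/-msse3; the helper name and the vector
+cast are illustrative only):
+
+#include <pmmintrin.h>
+
+float f32_hadd(v4f32 A) {
+  __m128 t = _mm_hadd_ps((__m128)A, (__m128)A);  /* {a0+a1, a2+a3, ...} */
+  t = _mm_hadd_ps(t, t);                         /* {a0+a1+a2+a3, ...}  */
+  return _mm_cvtss_f32(t);                       /* extract element 0   */
+}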
+
+Also, there are cases where some simple local SLP would improve codegen a bit.
+For example, compiling this:
+
+_Complex float f32(_Complex float A, _Complex float B) {
+ return A+B;
+}
+
+into:
+
+_f32: ## @f32
+ movdqa %xmm0, %xmm2
+ addss %xmm1, %xmm2
+ pshufd $1, %xmm1, %xmm1 ## xmm1 = xmm1[1,0,0,0]
+ pshufd $1, %xmm0, %xmm3 ## xmm3 = xmm0[1,0,0,0]
+ addss %xmm1, %xmm3
+ movaps %xmm2, %xmm0
+ unpcklps %xmm3, %xmm0 ## xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+ ret
+
+seems silly when it could just be one addps.
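+
+A sketch of what "one addps" means here (hypothetical intrinsics-level
+equivalent, not actual codegen): both complex operands arrive packed in the
+low 64 bits of a vector register, so a single vector add handles the real and
+imaginary parts together.
+
+#include <xmmintrin.h>
+
+__m128 cadd(__m128 a, __m128 b) {  /* a = {A.re, A.im, x, x}, likewise b */
+  return _mm_add_ps(a, b);         /* one addps: {A.re+B.re, A.im+B.im, ..} */
+}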
+
//===---------------------------------------------------------------------===//
//===---------------------------------------------------------------------===//
-SSE should implement 'select_cc' using 'emulated conditional moves' that use
-pcmp/pand/pandn/por to do a selection instead of a conditional branch:
-
-double %X(double %Y, double %Z, double %A, double %B) {
- %C = setlt double %A, %B
- %z = fadd double %Z, 0.0 ;; select operand is not a load
- %D = select bool %C, double %Y, double %z
- ret double %D
-}
-
-We currently emit:
-
-_X:
- subl $12, %esp
- xorpd %xmm0, %xmm0
- addsd 24(%esp), %xmm0
- movsd 32(%esp), %xmm1
- movsd 16(%esp), %xmm2
- ucomisd 40(%esp), %xmm1
- jb LBB_X_2
-LBB_X_1:
- movsd %xmm0, %xmm2
-LBB_X_2:
- movsd %xmm2, (%esp)
- fldl (%esp)
- addl $12, %esp
- ret
-
-//===---------------------------------------------------------------------===//
-
-It's not clear whether we should use pxor or xorps / xorpd to clear XMM
-registers. The choice may depend on subtarget information. We should do some
-more experiments on different x86 machines.
-
-//===---------------------------------------------------------------------===//
-
Lower memcpy / memset to a series of SSE 128-bit move instructions when it's
feasible.
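
A sketch of the kind of expansion meant here, for a small 16-byte-aligned copy
(the helper name, the size/alignment preconditions, and the use of intrinsics
rather than backend lowering are all illustrative assumptions):

#include <emmintrin.h>

/* Copy n bytes (n a multiple of 16) between 16-byte-aligned buffers with
   128-bit SSE moves instead of a libc memcpy call. */
static void copy16(char *dst, const char *src, unsigned long n) {
  for (unsigned long i = 0; i != n; i += 16)
    _mm_store_si128((__m128i *)(dst + i),
                    _mm_load_si128((const __m128i *)(src + i)));
}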
//===---------------------------------------------------------------------===//
-How to decide when to use the "floating point version" of logical ops? Here are
-some code fragments:
-
- movaps LCPI5_5, %xmm2
- divps %xmm1, %xmm2
- mulps %xmm2, %xmm3
- mulps 8656(%ecx), %xmm3
- addps 8672(%ecx), %xmm3
- andps LCPI5_6, %xmm2
- andps LCPI5_1, %xmm3
- por %xmm2, %xmm3
- movdqa %xmm3, (%edi)
-
- movaps LCPI5_5, %xmm1
- divps %xmm0, %xmm1
- mulps %xmm1, %xmm3
- mulps 8656(%ecx), %xmm3
- addps 8672(%ecx), %xmm3
- andps LCPI5_6, %xmm1
- andps LCPI5_1, %xmm3
- orps %xmm1, %xmm3
- movaps %xmm3, 112(%esp)
- movaps %xmm3, (%ebx)
-
-Due to some minor source change, the later case ended up using orps and movaps
-instead of por and movdqa. Does it matter?
-
-//===---------------------------------------------------------------------===//
-
-X86RegisterInfo::copyRegToReg() returns X86::MOVAPSrr for VR128. Is it possible
-to choose between movaps, movapd, and movdqa based on types of source and
-destination?
-
-How about andps, andpd, and pand? Do we really care about the type of the packed
-elements? If not, why not always use the "ps" variants which are likely to be
-shorter.
-
-//===---------------------------------------------------------------------===//
-
External test Nurbs exposed some problems. Look for
__ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. This is what icc
emits:
//===---------------------------------------------------------------------===//
-http://gcc.gnu.org/bugzilla/show_bug.cgi?id=25500
-
-LLVM is producing bad code.
-
-LBB_main_4: # cond_true44
- addps %xmm1, %xmm2
- subps %xmm3, %xmm2
- movaps (%ecx), %xmm4
- movaps %xmm2, %xmm1
- addps %xmm4, %xmm1
- addl $16, %ecx
- incl %edx
- cmpl $262144, %edx
- movaps %xmm3, %xmm2
- movaps %xmm4, %xmm3
- jne LBB_main_4 # cond_true44
-
-There are two problems. 1) No need to two loop induction variables. We can
-compare against 262144 * 16. 2) Known register coalescer issue. We should
-be able eliminate one of the movaps:
-
- addps %xmm2, %xmm1 <=== Commute!
- subps %xmm3, %xmm1
- movaps (%ecx), %xmm4
- movaps %xmm1, %xmm1 <=== Eliminate!
- addps %xmm4, %xmm1
- addl $16, %ecx
- incl %edx
- cmpl $262144, %edx
- movaps %xmm3, %xmm2
- movaps %xmm4, %xmm3
- jne LBB_main_4 # cond_true44
-
-//===---------------------------------------------------------------------===//
-
Consider:
__m128 test(float a) {
//===---------------------------------------------------------------------===//
-__m128d test1( __m128d A, __m128d B) {
- return _mm_shuffle_pd(A, B, 0x3);
-}
-
-compiles to
-
-shufpd $3, %xmm1, %xmm0
-
-Perhaps it's better to use unpckhpd instead?
-
-unpckhpd %xmm1, %xmm0
-
-Don't know if unpckhpd is faster. But it is shorter.
-
-//===---------------------------------------------------------------------===//
-
This code generates ugly code, probably due to costs being off or something:
define void @test(float* %P, <4 x float>* %P2 ) {
%tmp20 = tail call i64 @ccoshf( float %tmp6, float %z.0 ) nounwind readonly
ret i64 %tmp20
}
+declare i64 @ccoshf(float %z.0, float %z.1) nounwind readonly
This currently compiles to:
//===---------------------------------------------------------------------===//
-SSE4 extract-to-mem ops aren't being pattern matched because of the AssertZext
-sitting between the truncate and the extract.
-
-//===---------------------------------------------------------------------===//
-
INSERTPS can match any insert (extract, imm1), imm2 for 4 x float, and insert
any number of 0.0 simultaneously. Currently we only use it for simple
insertions.
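
For example (hypothetical snippet; assumes SSE4.1), both the element insert and
the zeroing below fit in a single insertps, but per the note above only the
simple insertion is matched today:

typedef float __attribute__((vector_size(16))) v4f32;

v4f32 ins(v4f32 a, v4f32 b) {
  a[2] = b[1];   /* insert (extract b, 1), 2 */
  a[3] = 0.0f;   /* insertps could zero this lane in the same instruction */
  return a;
}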
//===---------------------------------------------------------------------===//
-'select' on vectors and scalars could be a whole lot better. We currently
-lower them to conditional branches. On x86-64 for example, we compile this:
-
-double test(double a, double b, double c, double d) { return a<b ? c : d; }
-
-to:
-
-_test:
- ucomisd %xmm0, %xmm1
- ja LBB1_2 # entry
-LBB1_1: # entry
- movapd %xmm3, %xmm2
-LBB1_2: # entry
- movapd %xmm2, %xmm0
- ret
-
-instead of:
-
-_test:
- cmpltsd %xmm1, %xmm0
- andpd %xmm0, %xmm2
- andnpd %xmm3, %xmm0
- orpd %xmm2, %xmm0
- ret
-
-For unpredictable branches, the later is much more efficient. This should
-just be a matter of having scalar sse map to SELECT_CC and custom expanding
-or iseling it.
-
-//===---------------------------------------------------------------------===//
-
LLVM currently generates stack realignment code when it is not actually
needed. The problem is that we need to know about stack alignment too early,
before RA runs.
//===---------------------------------------------------------------------===//
-The X86 backend should be able to if-convert SSE comparisons like "ucomisd" to
-"cmpsd". For example, this code:
-
-double d1(double x) { return x == x ? x : x + x; }
-
-Compiles into:
-
-_d1:
- ucomisd %xmm0, %xmm0
- jnp LBB1_2
- addsd %xmm0, %xmm0
- ret
-LBB1_2:
- ret
-
-Also, the 'ret's should be shared. This is PR6032.
-
-//===---------------------------------------------------------------------===//
-
These should compile into the same code (PR6214); perhaps instcombine should
canonicalize the former into the latter?
This IR (from PR6194):
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-darwin10.0.0"
%0 = type { double, double }
doing a shuffle from v[1] to v[0] then a float store.
//===---------------------------------------------------------------------===//
+
+[UNSAFE FP]
+
+void foo(double, double, double);
+void norm(double x, double y, double z) {
+ double scale = __builtin_sqrt(x*x + y*y + z*z);
+ foo(x/scale, y/scale, z/scale);
+}
+
+We currently generate an sqrtsd and 3 divsd instructions. This is bad; fp div is
+slow and not pipelined. In -ffast-math mode we could compute "1.0/scale" first
+and emit 3 mulsd in place of the divs. This can be done as a target-independent
+transform.
+
+If we're dealing with floats instead of doubles, we could even replace the sqrtss
+and inversion with an rsqrtss instruction, which computes 1/sqrt faster at the
+cost of reduced accuracy.
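+
+A sketch of the transformed code being asked for, written at the source level
+(the real change would be an IR/DAG transform guarded by fast-math flags;
+norm_fast is a hypothetical name and reuses the foo declaration above):
+
+void norm_fast(double x, double y, double z) {
+  double scale = __builtin_sqrt(x*x + y*y + z*z);
+  double inv = 1.0 / scale;     /* one divsd ...       */
+  foo(x*inv, y*inv, z*inv);     /* ... and three mulsd */
+}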
+
+//===---------------------------------------------------------------------===//
+
+This function should be matched to haddpd when SSE3 is enabled:
+
+#include <x86intrin.h>
+double f (__m128d p) {
+ return p[0] + p[1];
+}
+
+Similarly, v[0]-v[1] should match to hsubpd, and {v[0]-v[1], w[0]-w[1]} should
+also turn into hsubpd.
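+
+For reference, the subtraction cases mentioned above, written as hypothetical
+source-level examples (same header as above):
+
+double g (__m128d v) {
+  return v[0] - v[1];                        /* should become one hsubpd */
+}
+
+__m128d h (__m128d v, __m128d w) {
+  return (__m128d){ v[0]-v[1], w[0]-w[1] };  /* should also be one hsubpd */
+}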
+
+//===---------------------------------------------------------------------===//
+
+define <2 x i32> @foo(<2 x double> %in) {
+ %x = fptosi <2 x double> %in to <2 x i32>
+ ret <2 x i32> %x
+}
+
+Should compile into cvttpd2dq instead of being scalarized into 2 cvttsd2si.
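+
+For comparison, the intrinsics form of the same conversion, which should map
+to a single cvttpd2dq (illustrative C equivalent of the IR above):
+
+#include <emmintrin.h>
+
+__m128i trunc2(__m128d v) {
+  return _mm_cvttpd_epi32(v);   /* cvttpd2dq: truncate both doubles to i32 */
+}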
+
+//===---------------------------------------------------------------------===//