// Random ideas for the X86 backend: SSE-specific stuff.
//===---------------------------------------------------------------------===//
+- Consider eliminating the unaligned SSE load intrinsics, replacing them with
+ unaligned LLVM load instructions.
+
+//===---------------------------------------------------------------------===//
+
+Expand libm rounding functions inline: Significant speedups possible.
+http://gcc.gnu.org/ml/gcc-patches/2006-10/msg00909.html
+
//===---------------------------------------------------------------------===//
When compiled with unsafemath enabled, "main" should enable SSE DAZ mode and
//===---------------------------------------------------------------------===//
-Should generate min/max for stuff like:
-
-void minf(float a, float b, float *X) {
- *X = a <= b ? a : b;
-}
-
-Make use of floating point min / max instructions. Perhaps introduce ISD::FMIN
-and ISD::FMAX node types?
-
-//===---------------------------------------------------------------------===//
-
Lower memcpy / memset to a series of SSE 128 bit move instructions when it's
feasible.
//===---------------------------------------------------------------------===//
-Better codegen for:
-
-void f(float a, float b, vector float * out) { *out = (vector float){ a, 0.0, 0.0, b}; }
-void f(float a, float b, vector float * out) { *out = (vector float){ a, b, 0.0, 0}; }
-
-For the later we generate:
-
-_f:
- pxor %xmm0, %xmm0
- movss 8(%esp), %xmm1
- movaps %xmm0, %xmm2
- unpcklps %xmm1, %xmm2
- movss 4(%esp), %xmm1
- unpcklps %xmm0, %xmm1
- unpcklps %xmm2, %xmm1
- movl 12(%esp), %eax
- movaps %xmm1, (%eax)
- ret
-
-This seems like it should use shufps, one for each of a & b.
-
-//===---------------------------------------------------------------------===//
-
How to decide when to use the "floating point version" of logical ops? Here are
some code fragments:
So icc is smart enough to know that B is in memory so it doesn't load it and
store it back to stack.
+This should be fixed by eliminating the llvm.x86.sse2.loadl.pd intrinsic,
+lowering it to a load+insertelement instead. We already match the load+shuffle
+as movlpd, so this should be easy. We already get optimal code for:
+
+define void @test2(<2 x double>* %r, <2 x double>* %A, double %B) {
+entry:
+ %tmp2 = load <2 x double>* %A, align 16
+ %tmp8 = insertelement <2 x double> %tmp2, double %B, i32 0
+ store <2 x double> %tmp8, <2 x double>* %r, align 16
+ ret void
+}
+
//===---------------------------------------------------------------------===//
__m128d test1( __m128d A, __m128d B) {
This code generates ugly code, probably due to costs being off or something:
-void %test(float* %P, <4 x float>* %P2 ) {
+define void @test(float* %P, <4 x float>* %P2 ) {
%xFloat0.688 = load float* %P
- %loadVector37.712 = load <4 x float>* %P2
- %inFloat3.713 = insertelement <4 x float> %loadVector37.712, float 0.000000e+00, uint 3
+ %tmp = load <4 x float>* %P2
+ %inFloat3.713 = insertelement <4 x float> %tmp, float 0.0, i32 3
store <4 x float> %inFloat3.713, <4 x float>* %P2
ret void
}
Generates:
_test:
- pxor %xmm0, %xmm0
- movd %xmm0, %eax ;; EAX = 0!
- movl 8(%esp), %ecx
- movaps (%ecx), %xmm0
- pinsrw $6, %eax, %xmm0
- shrl $16, %eax ;; EAX = 0 again!
- pinsrw $7, %eax, %xmm0
- movaps %xmm0, (%ecx)
- ret
+ movl 8(%esp), %eax
+ movaps (%eax), %xmm0
+ pxor %xmm1, %xmm1
+ movaps %xmm0, %xmm2
+ shufps $50, %xmm1, %xmm2
+ shufps $132, %xmm2, %xmm0
+ movaps %xmm0, (%eax)
+ ret
-It would be better to generate:
+Would it be better to generate:
_test:
movl 8(%esp), %ecx
movaps %xmm0, (%ecx)
ret
-or use pxor (to make a zero vector) and shuffle (to insert it).
+?
//===---------------------------------------------------------------------===//
//===---------------------------------------------------------------------===//
-Implement some missing insert/extract element operations without going through
-the stack. Testcase here:
-CodeGen/X86/vec_ins_extract.ll
-corresponds to this C code:
+Apply the same transformation that merged four floats into a single 128-bit load
+to loads from constant pool.
+
+//===---------------------------------------------------------------------===//
+
+Floating point max / min are commutable when -enable-unsafe-fp-math is
+specified. We should turn int_x86_sse_max_ss and X86ISD::FMIN etc. into other
+nodes which are selected to max / min instructions that are marked commutable.
+
+//===---------------------------------------------------------------------===//
+
+We should compile this:
+#include <xmmintrin.h>
+typedef union {
+ int i[4];
+ float f[4];
+ __m128 v;
+} vector4_t;
+void swizzle (const void *a, vector4_t * b, vector4_t * c) {
+ b->v = _mm_loadl_pi (b->v, (__m64 *) a);
+ c->v = _mm_loadl_pi (c->v, ((__m64 *) a) + 1);
+}
+
+to:
+
+_swizzle:
+ movl 4(%esp), %eax
+ movl 8(%esp), %edx
+ movl 12(%esp), %ecx
+ movlps (%eax), %xmm0
+ movlps %xmm0, (%edx)
+ movlps 8(%eax), %xmm0
+ movlps %xmm0, (%ecx)
+ ret
+
+not:
+
+swizzle:
+ movl 8(%esp), %eax
+ movaps (%eax), %xmm0
+ movl 4(%esp), %ecx
+ movlps (%ecx), %xmm0
+ movaps %xmm0, (%eax)
+ movl 12(%esp), %eax
+ movaps (%eax), %xmm0
+ movlps 8(%ecx), %xmm0
+ movaps %xmm0, (%eax)
+ ret
+
+//===---------------------------------------------------------------------===//
+
+These functions should produce the same code:
+
+#include <emmintrin.h>
+
+typedef long long __m128i __attribute__ ((__vector_size__ (16)));
-typedef float vectorfloat __attribute__((vector_size(16)));
-void test(vectorfloat *F, float f) {
- vectorfloat G = *F + *F;
- *((float*)&G) = f;
- *F = G + G;
+int foo(__m128i* val) {
+ return __builtin_ia32_vec_ext_v4si(*val, 1);
}
-void test2(vectorfloat *F, float f) {
- vectorfloat G = *F + *F;
- ((float*)&G)[2] = f;
- *F = G + G;
+int bar(__m128i* val) {
+ union vs {
+ __m128i *_v;
+ int* _s;
+ } v = {val};
+ return v._s[1];
}
-void test3(vectorfloat *F, float *f) {
- vectorfloat G = *F + *F;
- *f = ((float*)&G)[2];
+
+We currently produce (with -m64):
+
+_foo:
+ pshufd $1, (%rdi), %xmm0
+ movd %xmm0, %eax
+ ret
+_bar:
+ movl 4(%rdi), %eax
+ ret
+
+//===---------------------------------------------------------------------===//
+
+We should materialize vector constants like "all ones" and "signbit" with
+code like:
+
+ cmpeqps xmm1, xmm1 ; xmm1 = all-ones
+
+and:
+ cmpeqps xmm1, xmm1 ; xmm1 = all-ones
+ pslld xmm1, 31 ; xmm1 = all 100000000000...
+
+instead of using a load from the constant pool. The latter is important for
+ABS/NEG/copysign etc.
+
+//===---------------------------------------------------------------------===//
+
+"converting 64-bit constant pool entry to 32-bit not necessarily beneficial"
+http://llvm.org/PR1264
+
+For this test case:
+
+define double @foo(double %x) {
+ %y = mul double %x, 5.000000e-01
+ ret double %y
+}
+
+llc -march=x86-64 currently produces a 32-bit constant pool entry and this code:
+
+ cvtss2sd .LCPI1_0(%rip), %xmm1
+ mulsd %xmm1, %xmm0
+
+instead of just using a 64-bit constant pool entry with this:
+
+ mulsd .LCPI1_0(%rip), %xmm0
+
+This is due to the code in ExpandConstantFP in LegalizeDAG.cpp. It notices that
+x86-64 indeed has an instruction to load a 32-bit float from memory and convert
+it into a 64-bit float in a register, however it doesn't notice that this isn't
+beneficial because it prevents the load from being folded into the multiply.
+
+//===---------------------------------------------------------------------===//
+
+These functions:
+
+#include <xmmintrin.h>
+__m128i a;
+void x(unsigned short n) {
+ a = _mm_slli_epi32 (a, n);
}
-void test4(vectorfloat *F, float *f) {
- vectorfloat G = *F + *F;
- *f = *((float*)&G);
+void y(unsigned n) {
+ a = _mm_slli_epi32 (a, n);
}
+compile to ( -O3 -static -fomit-frame-pointer):
+_x:
+ movzwl 4(%esp), %eax
+ movd %eax, %xmm0
+ movaps _a, %xmm1
+ pslld %xmm0, %xmm1
+ movaps %xmm1, _a
+ ret
+_y:
+ movd 4(%esp), %xmm0
+ movaps _a, %xmm1
+ pslld %xmm0, %xmm1
+ movaps %xmm1, _a
+ ret
+
+"y" looks good, but "x" does silly movzwl stuff around into a GPR. It seems
+like movd would be sufficient in both cases as the value is already zero
+extended in the 32-bit stack slot IIRC. For signed short, it should also be
+safe, as a really-signed value would be undefined for pslld.
+
+
//===---------------------------------------------------------------------===//
-Apply the same transformation that merged four float into a single 128-bit load
-to loads from constant pool.
+#include <math.h>
+int t1(double d) { return signbit(d); }
+
+This currently compiles to:
+ subl $12, %esp
+ movsd 16(%esp), %xmm0
+ movsd %xmm0, (%esp)
+ movl 4(%esp), %eax
+ shrl $31, %eax
+ addl $12, %esp
+ ret
+
+We should use movmskp{s|d} instead.
+
+//===---------------------------------------------------------------------===//
+
+CodeGen/X86/vec_align.ll tests whether we can turn 4 scalar loads into a single
+(aligned) vector load. This functionality has a couple of problems.
+
+1. The code to infer alignment from loads of globals is in the X86 backend,
+ not the dag combiner. This is because dagcombine2 needs to be able to see
+ through the X86ISD::Wrapper node, which DAGCombine can't really do.
+2. The code for turning 4 x load into a single vector load is target
+ independent and should be moved to the dag combiner.
+3. The code for turning 4 x load into a vector load can only handle a direct
+ load from a global or a direct load from the stack. It should be generalized
+ to handle any load from P, P+4, P+8, P+12, where P can be anything.
+4. The alignment inference code cannot handle loads from globals in non-static
+ mode because it doesn't look through the extra dyld stub load. If you try
+ vec_align.ll without -relocation-model=static, you'll see what I mean.
+
+//===---------------------------------------------------------------------===//
+
+We should lower store(fneg(load p), q) into an integer load+xor+store, which
+eliminates a constant pool load. For example, consider:
+
+define i64 @ccosf(float %z.0, float %z.1) nounwind readonly {
+entry:
+ %tmp6 = sub float -0.000000e+00, %z.1 ; <float> [#uses=1]
+ %tmp20 = tail call i64 @ccoshf( float %tmp6, float %z.0 ) nounwind readonly ; <i64> [#uses=1]
+ ret i64 %tmp20
+}
+
+This currently compiles to:
+
+LCPI1_0: # <4 x float>
+ .long 2147483648 # float -0
+ .long 2147483648 # float -0
+ .long 2147483648 # float -0
+ .long 2147483648 # float -0
+_ccosf:
+ subl $12, %esp
+ movss 16(%esp), %xmm0
+ movss %xmm0, 4(%esp)
+ movss 20(%esp), %xmm0
+ xorps LCPI1_0, %xmm0
+ movss %xmm0, (%esp)
+ call L_ccoshf$stub
+ addl $12, %esp
+ ret
+
+Note the load into xmm0, then xor (to negate), then store. In PIC mode,
+this code computes the pic base and does two loads to do the constant pool
+load, so the improvement is much bigger.
+
+The tricky part about this xform is that the argument load/store isn't exposed
+until post-legalize, and at that point, the fneg has been custom expanded into
+an X86 fxor. This means that we need to handle this case in the x86 backend
+instead of in target independent code.
+
+//===---------------------------------------------------------------------===//