diff --git a/lib/Target/X86/README-SSE.txt b/lib/Target/X86/README-SSE.txt
index 20e6a53267e..d3f91bfabc3 100644
--- a/lib/Target/X86/README-SSE.txt
+++ b/lib/Target/X86/README-SSE.txt
@@ -2,6 +2,9 @@
 // Random ideas for the X86 backend: SSE-specific stuff.
 //===---------------------------------------------------------------------===//
 
+- Consider eliminating the unaligned SSE load intrinsics, replacing them with
+  unaligned LLVM load instructions.
+
 //===---------------------------------------------------------------------===//
 
 Expand libm rounding functions inline: Significant speedups possible.
@@ -453,6 +456,18 @@ icc generates:
 So icc is smart enough to know that B is in memory so it doesn't load it and
 store it back to stack.
 
+This should be fixed by eliminating the llvm.x86.sse2.loadl.pd intrinsic,
+lowering it to a load+insertelement instead. We already match the load+shuffle
+as movlpd, so this should be easy. We already get optimal code for:
+
+define void @test2(<2 x double>* %r, <2 x double>* %A, double %B) {
+entry:
+  %tmp2 = load <2 x double>* %A, align 16
+  %tmp8 = insertelement <2 x double> %tmp2, double %B, i32 0
+  store <2 x double> %tmp8, <2 x double>* %r, align 16
+  ret void
+}
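+
+For reference, a sketch (the function and value names here are invented, not
+taken from an existing testcase) of the load+insertelement form that a
+loadl.pd of a pointer argument would lower to, i.e. the pattern we want to
+match as movlpd:
+
+define <2 x double> @loadl_pd_lowered(<2 x double> %A, double* %B) {
+entry:
+  %tmp = load double* %B
+  %tmp3 = insertelement <2 x double> %A, double %tmp, i32 0
+  ret <2 x double> %tmp3
+}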
+
 //===---------------------------------------------------------------------===//
 
 __m128d test1( __m128d A, __m128d B) {
@@ -473,10 +488,10 @@ Don't know if unpckhpd is faster. But it is shorter.
 
 This code generates ugly code, probably due to costs being off or something:
 
-void %test(float* %P, <4 x float>* %P2 ) {
+define void @test(float* %P, <4 x float>* %P2 ) {
   %xFloat0.688 = load float* %P
-  %loadVector37.712 = load <4 x float>* %P2
-  %inFloat3.713 = insertelement <4 x float> %loadVector37.712, float 0.000000e+00, uint 3
+  %tmp = load <4 x float>* %P2
+  %inFloat3.713 = insertelement <4 x float> %tmp, float 0.0, i32 3
   store <4 x float> %inFloat3.713, <4 x float>* %P2
   ret void
 }
@@ -484,17 +499,16 @@ void %test(float* %P, <4 x float>* %P2 ) {
 
 Generates:
 
 _test:
-	pxor %xmm0, %xmm0
-	movd %xmm0, %eax        ;; EAX = 0!
-	movl 8(%esp), %ecx
-	movaps (%ecx), %xmm0
-	pinsrw $6, %eax, %xmm0
-	shrl $16, %eax          ;; EAX = 0 again!
-	pinsrw $7, %eax, %xmm0
-	movaps %xmm0, (%ecx)
-	ret
+	movl 8(%esp), %eax
+	movaps (%eax), %xmm0
+	pxor %xmm1, %xmm1
+	movaps %xmm0, %xmm2
+	shufps $50, %xmm1, %xmm2
+	shufps $132, %xmm2, %xmm0
+	movaps %xmm0, (%eax)
+	ret
 
-It would be better to generate:
+Would it be better to generate:
 
 _test:
 	movl 8(%esp), %ecx
@@ -505,7 +519,7 @@ _test:
 	movaps %xmm0, (%ecx)
 	ret
 
-or use pxor (to make a zero vector) and shuffle (to insert it).
+?
 
 //===---------------------------------------------------------------------===//
 
@@ -573,28 +587,177 @@ swizzle:
 
 //===---------------------------------------------------------------------===//
 
-This code:
+These functions should produce the same code:
 
 #include <emmintrin.h>
 
-__m128i test(long long i) { return _mm_cvtsi64x_si128(i); }
-
-Should turn into a single 'movq %rdi, %xmm0' instruction. Instead, we
-get this (on x86-64):
+typedef long long __m128i __attribute__ ((__vector_size__ (16)));
 
-_test:
-	movd %rdi, %xmm1
-	xorps %xmm0, %xmm0
-	movsd %xmm1, %xmm0
+int foo(__m128i* val) {
+  return __builtin_ia32_vec_ext_v4si(*val, 1);
+}
+int bar(__m128i* val) {
+  union vs {
+    __m128i *_v;
+    int* _s;
+  } v = {val};
+  return v._s[1];
+}
+
+We currently produce (with -m64):
+
+_foo:
+	pshufd $1, (%rdi), %xmm0
+	movd %xmm0, %eax
+	ret
+_bar:
+	movl 4(%rdi), %eax
+	ret
+
+//===---------------------------------------------------------------------===//
+
+We should materialize vector constants like "all ones" and "signbit" with
+code like:
+
+     cmpeqps xmm1, xmm1   ; xmm1 = all-ones
+
+and:
+
+     cmpeqps xmm1, xmm1   ; xmm1 = all-ones
+     pslld   xmm1, 31     ; xmm1 = all 100000000000...
+
+instead of using a load from the constant pool. The latter is important for
+ABS/NEG/copysign etc.
+
+//===---------------------------------------------------------------------===//
+
+"converting 64-bit constant pool entry to 32-bit not necessarily beneficial"
+http://llvm.org/PR1264
+
+For this test case:
+
+define double @foo(double %x) {
+  %y = mul double %x, 5.000000e-01
+  ret double %y
+}
+
+llc -march=x86-64 currently produces a 32-bit constant pool entry and this code:
+
+	cvtss2sd .LCPI1_0(%rip), %xmm1
+	mulsd %xmm1, %xmm0
+
+instead of just using a 64-bit constant pool entry with this:
+
+	mulsd .LCPI1_0(%rip), %xmm0
+
+This is due to the code in ExpandConstantFP in LegalizeDAG.cpp. It notices that
+x86-64 indeed has an instruction to load a 32-bit float from memory and convert
+it into a 64-bit float in a register, but it doesn't notice that this isn't
+beneficial here because it prevents the load from being folded into the multiply.
+
+//===---------------------------------------------------------------------===//
+
+These functions:
+
+#include <emmintrin.h>
+__m128i a;
+void x(unsigned short n) {
+  a = _mm_slli_epi32 (a, n);
+}
+void y(unsigned n) {
+  a = _mm_slli_epi32 (a, n);
+}
+
+compile to (-O3 -static -fomit-frame-pointer):
+
+_x:
+	movzwl 4(%esp), %eax
+	movd %eax, %xmm0
+	movaps _a, %xmm1
+	pslld %xmm0, %xmm1
+	movaps %xmm1, _a
+	ret
+_y:
+	movd 4(%esp), %xmm0
+	movaps _a, %xmm1
+	pslld %xmm0, %xmm1
+	movaps %xmm1, _a
+	ret
+
+"y" looks good, but "x" takes a silly detour through a GPR with movzwl. It
+seems like movd alone would be sufficient in both cases, as the value is
+already zero extended in its 32-bit stack slot IIRC. For signed short it
+should also be safe, since a genuinely negative value would be an undefined
+shift amount for pslld anyway.
+
+//===---------------------------------------------------------------------===//
+
+#include <math.h>
+int t1(double d) { return signbit(d); }
+
+This currently compiles to:
+	subl $12, %esp
+	movsd 16(%esp), %xmm0
+	movsd %xmm0, (%esp)
+	movl 4(%esp), %eax
+	shrl $31, %eax
+	addl $12, %esp
 	ret
 
-The LLVM IR is:
+We should use movmskp{s|d} instead.
+
+//===---------------------------------------------------------------------===//
+
+CodeGen/X86/vec_align.ll tests whether we can turn 4 scalar loads into a single
+(aligned) vector load. This functionality has several problems.
+
+1. The code to infer alignment from loads of globals is in the X86 backend,
+   not the dag combiner. This is because dagcombine2 needs to be able to see
+   through the X86ISD::Wrapper node, which DAGCombine can't really do.
+2. The code for turning 4 x load into a single vector load is target
+   independent and should be moved to the dag combiner.
+3. The code for turning 4 x load into a vector load can only handle a direct
+   load from a global or a direct load from the stack. It should be generalized
+   to handle any load from P, P+4, P+8, P+12, where P can be anything (see the
+   sketch after this list).
+4. The alignment inference code cannot handle loads from globals in non-static
+   mode because it doesn't look through the extra dyld stub load. If you try
+   vec_align.ll without -relocation-model=static, you'll see what I mean.
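+
+A hypothetical example of the generalization asked for in #3 (the function and
+value names are invented): a chain like the following should also become a
+single vector load whenever %P can be proven sufficiently aligned.
+
+define <4 x float> @load4(float* %P) {
+entry:
+  %p1 = getelementptr float* %P, i32 1
+  %p2 = getelementptr float* %P, i32 2
+  %p3 = getelementptr float* %P, i32 3
+  %a = load float* %P
+  %b = load float* %p1
+  %c = load float* %p2
+  %d = load float* %p3
+  %v0 = insertelement <4 x float> undef, float %a, i32 0
+  %v1 = insertelement <4 x float> %v0, float %b, i32 1
+  %v2 = insertelement <4 x float> %v1, float %c, i32 2
+  %v3 = insertelement <4 x float> %v2, float %d, i32 3
+  ret <4 x float> %v3
+}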
 
-target triple = "x86_64-apple-darwin8"
-define <2 x i64> @test(i64 %i) {
+//===---------------------------------------------------------------------===//
+
+We should lower store(fneg(load p), q) into an integer load+xor+store, which
+eliminates a constant pool load. For example, consider:
+
+define i64 @ccosf(float %z.0, float %z.1) nounwind readonly {
 entry:
-  %tmp10 = insertelement <2 x i64> undef, i64 %i, i32 0
-  %tmp11 = insertelement <2 x i64> %tmp10, i64 0, i32 1
-  ret <2 x i64> %tmp11
+  %tmp6 = sub float -0.000000e+00, %z.1		; <float> [#uses=1]
+  %tmp20 = tail call i64 @ccoshf( float %tmp6, float %z.0 ) nounwind readonly	; <i64> [#uses=1]
+  ret i64 %tmp20
 }
+
+This currently compiles to:
+
+LCPI1_0:				# <4 x float>
+	.long 2147483648	# float -0
+	.long 2147483648	# float -0
+	.long 2147483648	# float -0
+	.long 2147483648	# float -0
+_ccosf:
+	subl $12, %esp
+	movss 16(%esp), %xmm0
+	movss %xmm0, 4(%esp)
+	movss 20(%esp), %xmm0
+	xorps LCPI1_0, %xmm0
+	movss %xmm0, (%esp)
+	call L_ccoshf$stub
+	addl $12, %esp
+	ret
+
+Note the load into xmm0, then the xor (to negate), then the store. In PIC mode,
+this code also computes the PIC base and needs two loads to reach the constant
+pool entry, so the improvement is even bigger there.
+
+The tricky part about this transformation is that the argument load/store isn't
+exposed until post-legalize, and at that point the fneg has already been custom
+expanded into an X86 fxor. This means that we need to handle this case in the
+X86 backend instead of in target-independent code.
+
 //===---------------------------------------------------------------------===//
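+
+To make the desired end result of the note above concrete, here is a minimal
+IR sketch (the function name and types are hypothetical, not taken from the
+ccosf testcase) of the integer load+xor+store form that the argument negation
+should turn into; it flips only the IEEE sign bit and needs no constant pool
+at all:
+
+define void @negate_store(float* %p, float* %q) {
+entry:
+  %ip = bitcast float* %p to i32*
+  %iq = bitcast float* %q to i32*
+  %val = load i32* %ip
+  %flip = xor i32 %val, -2147483648	; flip the sign bit (0x80000000)
+  store i32 %flip, i32* %iq
+  ret void
+}
+
+//===---------------------------------------------------------------------===//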