[X86][AVX] Fold loads + splats into broadcast instructions

[oota-llvm.git] / lib / Target / X86 / README-SSE.txt
diff --git a/lib/Target/X86/README-SSE.txt b/lib/Target/X86/README-SSE.txt

index 67cad42a3548a7281fb0978c8698bce5412460cd..e6896e805568ad9515b2c0420386e6fa77806e34 100644 (file)
--- a/lib/Target/X86/README-SSE.txt
+++ b/lib/Target/X86/README-SSE.txt
@@ -2,129 +2,94 @@
  // Random ideas for the X86 backend: SSE-specific stuff.
  //===---------------------------------------------------------------------===//
  
-- Consider eliminating the unaligned SSE load intrinsics, replacing them with
-  unaligned LLVM load instructions.
-
  //===---------------------------------------------------------------------===//
  
-Expand libm rounding functions inline:  Significant speedups possible.
-http://gcc.gnu.org/ml/gcc-patches/2006-10/msg00909.html
-
-//===---------------------------------------------------------------------===//
+SSE Variable shift can be custom lowered to something like this, which uses a
+small table + unaligned load + shuffle instead of going through memory.
  
-When compiled with unsafemath enabled, "main" should enable SSE DAZ mode and
-other fast SSE modes.
-
-//===---------------------------------------------------------------------===//
+__m128i_shift_right:
+       .byte     0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
+       .byte    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
  
-Think about doing i64 math in SSE regs on x86-32.
+...
+__m128i shift_right(__m128i value, unsigned long offset) {
+  return _mm_shuffle_epi8(value,
+               _mm_loadu_si128((__m128i *) (__m128i_shift_right + offset)));
+}
  
  //===---------------------------------------------------------------------===//
  
-This testcase should have no SSE instructions in it, and only one load from
-a constant pool:
+SSE has instructions for doing operations on complex numbers; we should pattern
+match them.  For example, this should turn into a horizontal add:
  
-double %test3(bool %B) {
-        %C = select bool %B, double 123.412, double 523.01123123
-        ret double %C
+typedef float __attribute__((vector_size(16))) v4f32;
+float f32(v4f32 A) {
+  return A[0]+A[1]+A[2]+A[3];
  }
  
-Currently, the select is being lowered, which prevents the dag combiner from
-turning 'select (load CPI1), (load CPI2)' -> 'load (select CPI1, CPI2)'
-
-The pattern isel got this one right.
-
-//===---------------------------------------------------------------------===//
-
-SSE doesn't have [mem] op= reg instructions.  If we have an SSE instruction
-like this:
+Instead we get this:
+
+_f32:                                   ## @f32
+       pshufd  $1, %xmm0, %xmm1        ## xmm1 = xmm0[1,0,0,0]
+       addss   %xmm0, %xmm1
+       pshufd  $3, %xmm0, %xmm2        ## xmm2 = xmm0[3,0,0,0]
+       movhlps %xmm0, %xmm0            ## xmm0 = xmm0[1,1]
+       movaps  %xmm0, %xmm3
+       addss   %xmm1, %xmm3
+       movdqa  %xmm2, %xmm0
+       addss   %xmm3, %xmm0
+       ret
  
-  X += y
+Also, there are cases where some simple local SLP would improve codegen a bit.
+Compiling this:
  
-and the register allocator decides to spill X, it is cheaper to emit this as:
+_Complex float f32(_Complex float A, _Complex float B) {
+  return A+B;
+}
  
-Y += [xslot]
-store Y -> [xslot]
+into:
  
-than as:
+_f32:                                   ## @f32
+       movdqa  %xmm0, %xmm2
+       addss   %xmm1, %xmm2
+       pshufd  $1, %xmm1, %xmm1        ## xmm1 = xmm1[1,0,0,0]
+       pshufd  $1, %xmm0, %xmm3        ## xmm3 = xmm0[1,0,0,0]
+       addss   %xmm1, %xmm3
+       movaps  %xmm2, %xmm0
+       unpcklps        %xmm3, %xmm0    ## xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+       ret
  
-tmp = [xslot]
-tmp += y
-store tmp -> [xslot]
+seems silly when it could just be one addps.
  
-..and this uses one fewer register (so this should be done at load folding
-time, not at spiller time).  *Note* however that this can only be done
-if Y is dead.  Here's a testcase:
  
-@.str_3 = external global [15 x i8]
-declare void @printf(i32, ...)
-define void @main() {
-build_tree.exit:
-       br label %no_exit.i7
+//===---------------------------------------------------------------------===//
  
-no_exit.i7:            ; preds = %no_exit.i7, %build_tree.exit
-       %tmp.0.1.0.i9 = phi double [ 0.000000e+00, %build_tree.exit ],
-                                   [ %tmp.34.i18, %no_exit.i7 ]
-       %tmp.0.0.0.i10 = phi double [ 0.000000e+00, %build_tree.exit ],
-                                    [ %tmp.28.i16, %no_exit.i7 ]
-       %tmp.28.i16 = add double %tmp.0.0.0.i10, 0.000000e+00
-       %tmp.34.i18 = add double %tmp.0.1.0.i9, 0.000000e+00
-       br i1 false, label %Compute_Tree.exit23, label %no_exit.i7
+Expand libm rounding functions inline:  Significant speedups possible.
+http://gcc.gnu.org/ml/gcc-patches/2006-10/msg00909.html
  
-Compute_Tree.exit23:           ; preds = %no_exit.i7
-       tail call void (i32, ...)* @printf( i32 0 )
-       store double %tmp.34.i18, double* null
-       ret void
-}
+//===---------------------------------------------------------------------===//
  
-We currently emit:
+When compiled with unsafemath enabled, "main" should enable SSE DAZ mode and
+other fast SSE modes.
  
-.BBmain_1:
-        xorpd %XMM1, %XMM1
-        addsd %XMM0, %XMM1
-***     movsd %XMM2, QWORD PTR [%ESP + 8]
-***     addsd %XMM2, %XMM1
-***     movsd QWORD PTR [%ESP + 8], %XMM2
-        jmp .BBmain_1   # no_exit.i7
+//===---------------------------------------------------------------------===//
  
-This is a bugpoint reduced testcase, which is why the testcase doesn't make
-much sense (e.g. its an infinite loop). :)
+Think about doing i64 math in SSE regs on x86-32.
  
  //===---------------------------------------------------------------------===//
  
-SSE should implement 'select_cc' using 'emulated conditional moves' that use
-pcmp/pand/pandn/por to do a selection instead of a conditional branch:
+This testcase should have no SSE instructions in it, and only one load from
+a constant pool:
  
-double %X(double %Y, double %Z, double %A, double %B) {
-        %C = setlt double %A, %B
-        %z = add double %Z, 0.0    ;; select operand is not a load
-        %D = select bool %C, double %Y, double %z
-        ret double %D
+double %test3(bool %B) {
+        %C = select bool %B, double 123.412, double 523.01123123
+        ret double %C
  }
  
-We currently emit:
-
-_X:
-        subl $12, %esp
-        xorpd %xmm0, %xmm0
-        addsd 24(%esp), %xmm0
-        movsd 32(%esp), %xmm1
-        movsd 16(%esp), %xmm2
-        ucomisd 40(%esp), %xmm1
-        jb LBB_X_2
-LBB_X_1:
-        movsd %xmm0, %xmm2
-LBB_X_2:
-        movsd %xmm2, (%esp)
-        fldl (%esp)
-        addl $12, %esp
-        ret
-
-//===---------------------------------------------------------------------===//
+Currently, the select is being lowered, which prevents the dag combiner from
+turning 'select (load CPI1), (load CPI2)' -> 'load (select CPI1, CPI2)'
  
-It's not clear whether we should use pxor or xorps / xorpd to clear XMM
-registers. The choice may depend on subtarget information. We should do some
-more experiments on different x86 machines.
+The pattern isel got this one right.
  
  //===---------------------------------------------------------------------===//
  
@@ -151,45 +116,6 @@ Perhaps use pxor / xorp* to clear a XMM register first?
  
  //===---------------------------------------------------------------------===//
  
-How to decide when to use the "floating point version" of logical ops? Here are
-some code fragments:
-
-       movaps LCPI5_5, %xmm2
-       divps %xmm1, %xmm2
-       mulps %xmm2, %xmm3
-       mulps 8656(%ecx), %xmm3
-       addps 8672(%ecx), %xmm3
-       andps LCPI5_6, %xmm2
-       andps LCPI5_1, %xmm3
-       por %xmm2, %xmm3
-       movdqa %xmm3, (%edi)
-
-       movaps LCPI5_5, %xmm1
-       divps %xmm0, %xmm1
-       mulps %xmm1, %xmm3
-       mulps 8656(%ecx), %xmm3
-       addps 8672(%ecx), %xmm3
-       andps LCPI5_6, %xmm1
-       andps LCPI5_1, %xmm3
-       orps %xmm1, %xmm3
-       movaps %xmm3, 112(%esp)
-       movaps %xmm3, (%ebx)
-
-Due to some minor source change, the later case ended up using orps and movaps
-instead of por and movdqa. Does it matter?
-
-//===---------------------------------------------------------------------===//
-
-X86RegisterInfo::copyRegToReg() returns X86::MOVAPSrr for VR128. Is it possible
-to choose between movaps, movapd, and movdqa based on types of source and
-destination?
-
-How about andps, andpd, and pand? Do we really care about the type of the packed
-elements? If not, why not always use the "ps" variants which are likely to be
-shorter.
-
-//===---------------------------------------------------------------------===//
-
  External test Nurbs exposed some problems. Look for
  __ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. This is what icc
  emits:
@@ -278,41 +204,6 @@ It also exposes some other problems. See MOV32ri -3 and the spills.
  
  //===---------------------------------------------------------------------===//
  
-http://gcc.gnu.org/bugzilla/show_bug.cgi?id=25500
-
-LLVM is producing bad code.
-
-LBB_main_4:    # cond_true44
-       addps %xmm1, %xmm2
-       subps %xmm3, %xmm2
-       movaps (%ecx), %xmm4
-       movaps %xmm2, %xmm1
-       addps %xmm4, %xmm1
-       addl $16, %ecx
-       incl %edx
-       cmpl $262144, %edx
-       movaps %xmm3, %xmm2
-       movaps %xmm4, %xmm3
-       jne LBB_main_4  # cond_true44
-
-There are two problems. 1) No need to two loop induction variables. We can
-compare against 262144 * 16. 2) Known register coalescer issue. We should
-be able eliminate one of the movaps:
-
-       addps %xmm2, %xmm1    <=== Commute!
-       subps %xmm3, %xmm1
-       movaps (%ecx), %xmm4
-       movaps %xmm1, %xmm1   <=== Eliminate!
-       addps %xmm4, %xmm1
-       addl $16, %ecx
-       incl %edx
-       cmpl $262144, %edx
-       movaps %xmm3, %xmm2
-       movaps %xmm4, %xmm3
-       jne LBB_main_4  # cond_true44
-
-//===---------------------------------------------------------------------===//
-
  Consider:
  
  __m128 test(float a) {
@@ -376,28 +267,12 @@ ret
  ... saving two instructions.
  
  The basic idea is that a reload from a spill slot, can, if only one 4-byte 
-chunk is used, bring in 3 zeros the the one element instead of 4 elements.
+chunk is used, bring in 3 zeros and the one element instead of 4 elements.
  This can be used to simplify a variety of shuffle operations, where the
  elements are fixed zeros.
  
  //===---------------------------------------------------------------------===//
  
-__m128d test1( __m128d A, __m128d B) {
-  return _mm_shuffle_pd(A, B, 0x3);
-}
-
-compiles to
-
-shufpd $3, %xmm1, %xmm0
-
-Perhaps it's better to use unpckhpd instead?
-
-unpckhpd %xmm1, %xmm0
-
-Don't know if unpckhpd is faster. But it is shorter.
-
-//===---------------------------------------------------------------------===//
-
  This code generates ugly code, probably due to costs being off or something:
  
  define void @test(float* %P, <4 x float>* %P2 ) {
@@ -545,10 +420,11 @@ eliminates a constant pool load.  For example, consider:
  
  define i64 @ccosf(float %z.0, float %z.1) nounwind readonly  {
  entry:
- %tmp6 = sub float -0.000000e+00, %z.1         ; <float> [#uses=1]
+ %tmp6 = fsub float -0.000000e+00, %z.1                ; <float> [#uses=1]
   %tmp20 = tail call i64 @ccoshf( float %tmp6, float %z.0 ) nounwind readonly
   ret i64 %tmp20
  }
+declare i64 @ccoshf(float %z.0, float %z.1) nounwind readonly
  
  This currently compiles to:
  
@@ -588,11 +464,6 @@ is memory.
  
  //===---------------------------------------------------------------------===//
  
-SSE4 extract-to-mem ops aren't being pattern matched because of the AssertZext
-sitting between the truncate and the extract.
-
-//===---------------------------------------------------------------------===//
-
  INSERTPS can match any insert (extract, imm1), imm2 for 4 x float, and insert
  any number of 0.0 simultaneously.  Currently we only use it for simple
  insertions.
@@ -611,37 +482,6 @@ to <2 x i64> ops being so bad.
  
  //===---------------------------------------------------------------------===//
  
-'select' on vectors and scalars could be a whole lot better.  We currently 
-lower them to conditional branches.  On x86-64 for example, we compile this:
-
-double test(double a, double b, double c, double d) { return a<b ? c : d; }
-
-to:
-
-_test:
-       ucomisd %xmm0, %xmm1
-       ja      LBB1_2  # entry
-LBB1_1:        # entry
-       movapd  %xmm3, %xmm2
-LBB1_2:        # entry
-       movapd  %xmm2, %xmm0
-       ret
-
-instead of:
-
-_test:
-       cmpltsd %xmm1, %xmm0
-       andpd   %xmm0, %xmm2
-       andnpd  %xmm3, %xmm0
-       orpd    %xmm2, %xmm0
-       ret
-
-For unpredictable branches, the later is much more efficient.  This should
-just be a matter of having scalar sse map to SELECT_CC and custom expanding
-or iseling it.
-
-//===---------------------------------------------------------------------===//
-
  LLVM currently generates stack realignment code, when it is not necessarily
  needed. The problem is that we need to know about stack alignment too early,
  before RA runs.
@@ -840,42 +680,6 @@ _t:
         shufps  $132, %xmm2, %xmm0
         movaps  %xmm0, 0
  
-//===---------------------------------------------------------------------===//
-rdar://6037315
-
-llvm-gcc-4.2 does the following for uint32_t -> float conversions on i386:
-
-       uint32_t x;
-       float y = (float)x;
-
-becomes:
-
-movl   %eax,           -8(%ebp)        // write x to the stack
-movl   $0x3ff00000,    -4(%ebp)        // 2^52 + x as a double at -4(%ebp)
-movsd  -8(%ebp),               %xmm0
-subsd  [2^52 double],  %xmm0   // subtract 2^52 -- this is exact
-cvtsd2ss %xmm0,                %xmm0   // convert to single -- rounding happens here
-
-On merom/yonah, this takes a substantial stall.  The following is a much 
-better option:
-
-movd   %eax,           %xmm0   // load x into low word of xmm0
-movsd  [2^52 double],  %xmm1   // load 2^52 into xmm1
-orpd   %xmm1,          %xmm0   // 2^52 + x in double precision
-subsd  %xmm1,          %xmm0   // x in double precision
-cvtsd2ss %xmm0,                %xmm0   // x rounded to single precision
-
-IF we don't already need PIC, then the following is even faster still, at a 
-small cost to code size:
-
-movl           $0x3ff00000,    %ecx            // conjure high word of 2^52
-movd           %ecx,           %xmm1
-movss  %eax,           %xmm0   // load x into low word of xmm0
-psllq          $32,                    %xmm1   // 2^52
-orpd           %xmm1,          %xmm0   // 2^52 + x in double precision
-subsd          %xmm1,          %xmm0   // x in double precision
-cvtsd2ss       %xmm0,          %xmm0   // x in single precision
-
  //===---------------------------------------------------------------------===//
  rdar://5907648
  
@@ -952,3 +756,95 @@ cheaper to do fld1 than load from a constant pool for example, so
  "load, add 1.0, store" is better done in the fp stack, etc.
  
  //===---------------------------------------------------------------------===//
+
+These should compile into the same code (PR6214): Perhaps instcombine should
+canonicalize the former into the latter?
+
+define float @foo(float %x) nounwind {
+  %t = bitcast float %x to i32
+  %s = and i32 %t, 2147483647
+  %d = bitcast i32 %s to float
+  ret float %d
+}
+
+declare float @fabsf(float %n)
+define float @bar(float %x) nounwind {
+  %d = call float @fabsf(float %x)
+  ret float %d
+}
+
+//===---------------------------------------------------------------------===//
+
+This IR (from PR6194):
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-darwin10.0.0"
+
+%0 = type { double, double }
+%struct.float3 = type { float, float, float }
+
+define void @test(%0, %struct.float3* nocapture %res) nounwind noinline ssp {
+entry:
+  %tmp18 = extractvalue %0 %0, 0                  ; <double> [#uses=1]
+  %tmp19 = bitcast double %tmp18 to i64           ; <i64> [#uses=1]
+  %tmp20 = zext i64 %tmp19 to i128                ; <i128> [#uses=1]
+  %tmp10 = lshr i128 %tmp20, 32                   ; <i128> [#uses=1]
+  %tmp11 = trunc i128 %tmp10 to i32               ; <i32> [#uses=1]
+  %tmp12 = bitcast i32 %tmp11 to float            ; <float> [#uses=1]
+  %tmp5 = getelementptr inbounds %struct.float3* %res, i64 0, i32 1 ; <float*> [#uses=1]
+  store float %tmp12, float* %tmp5
+  ret void
+}
+
+Compiles to:
+
+_test:                                  ## @test
+       movd    %xmm0, %rax
+       shrq    $32, %rax
+       movl    %eax, 4(%rdi)
+       ret
+
+This would be better kept in the SSE unit by treating XMM0 as a 4xfloat and
+doing a shuffle from v[1] to v[0] then a float store.
+
+//===---------------------------------------------------------------------===//
+
+[UNSAFE FP]
+
+void foo(double, double, double);
+void norm(double x, double y, double z) {
+  double scale = __builtin_sqrt(x*x + y*y + z*z);
+  foo(x/scale, y/scale, z/scale);
+}
+
+We currently generate an sqrtsd and 3 divsd instructions. This is bad: fp div is
+slow and not pipelined. In -ffast-math mode we could compute "1.0/scale" first
+and emit 3 mulsd in place of the divs. This can be done as a target-independent
+transform.
+
+If we're dealing with floats instead of doubles we could even replace the sqrtss
+and inversion with an rsqrtss instruction, which computes 1/sqrt faster at the
+cost of reduced accuracy.
+
+//===---------------------------------------------------------------------===//
+
+This function should be matched to haddpd when the appropriate CPU is enabled:
+
+#include <x86intrin.h>
+double f (__m128d p) {
+  return p[0] + p[1];
+}
+
+Similarly, v[0]-v[1] should match to hsubpd, and {v[0]-v[1], w[0]-w[1]} should
+turn into hsubpd also.
+
+//===---------------------------------------------------------------------===//
+
+define <2 x i32> @foo(<2 x double> %in) {
+  %x = fptosi <2 x double> %in to <2 x i32>
+  ret <2 x i32> %x
+}
+
+Should compile into cvttpd2dq instead of being scalarized into 2 cvttsd2si.
+
+//===---------------------------------------------------------------------===//