add a note

[oota-llvm.git] / lib / Target / PowerPC / README_ALTIVEC.txt
diff --git a/lib/Target/PowerPC/README_ALTIVEC.txt b/lib/Target/PowerPC/README_ALTIVEC.txt

index 3c928ad6bbd01bc7af76aa1466ef054981746246..9611f9e318d4d6385b810199e0953274b7b79c51 100644 (file)
--- a/lib/Target/PowerPC/README_ALTIVEC.txt
+++ b/lib/Target/PowerPC/README_ALTIVEC.txt
@@ -5,8 +5,8 @@ registers, to generate better spill code.
  
  //===----------------------------------------------------------------------===//
  
-Altivec support.  The first should be a single lvx from the constant pool, the
-second should be a xor/stvx:
+The first should be a single lvx from the constant pool, the second should be 
+a xor/stvx:
  
  void foo(void) {
    int x[8] __attribute__((aligned(128))) = { 1, 1, 1, 17, 1, 1, 1, 1 };
@@ -39,33 +39,12 @@ a load/store/lve*x sequence.
  
  //===----------------------------------------------------------------------===//
  
-There are a wide range of vector constants we can generate with combinations of
-altivec instructions.
-
-Examples, these work with all widths:
-  Splat(+/- 16,18,20,22,24,28,30):  t = vspliti I/2,  r = t+t
-  Splat(+/- 17,19,21,23,25,29):     t = vsplti +/-15, t2 = vsplti I-15, r=t + t2
-  Splat(31):                        t = vsplti FB,  r = srl t,t
-  Splat(256):  t = vsplti 1, r = vsldoi t, t, 1
-
-Lots more are listed here:
-http://www.informatik.uni-bremen.de/~hobold/AltiVec.html
-
-This should be added to the ISD::BUILD_VECTOR case in 
-PPCTargetLowering::LowerOperation.
-
-//===----------------------------------------------------------------------===//
-
-FABS/FNEG can be codegen'd with the appropriate and/xor of -0.0.
-
-//===----------------------------------------------------------------------===//
-
  For functions that use altivec AND have calls, we are VRSAVE'ing all call
  clobbered regs.
  
  //===----------------------------------------------------------------------===//
  
-Implement passing/returning vectors by value.
+Implement passing vectors by value into calls and receiving them as arguments.
  
  //===----------------------------------------------------------------------===//
  
@@ -74,67 +53,119 @@ of C1/C2/C3, then a load and vperm of Variable.
  
  //===----------------------------------------------------------------------===//
  
+We need a way to teach tblgen that some operands of an intrinsic are required to
+be constants.  The verifier should enforce this constraint.
+
+//===----------------------------------------------------------------------===//
+
  We currently codegen SCALAR_TO_VECTOR as a store of the scalar to a 16-byte
-aligned stack slot, followed by a lve*x/vperm.  We should probably just store it
+aligned stack slot, followed by a load/vperm.  We should probably just store it
  to a scalar stack slot, then use lvsl/vperm to load it.  If the value is already
-in memory, this is a huge win.
+in memory, this is a big win.
  
  //===----------------------------------------------------------------------===//
  
-Do not generate the MFCR/RLWINM sequence for predicate compares when the
-predicate compare is used immediately by a branch.  Just branch on the right
-cond code on CR6.
+extract_vector_elt of an arbitrary constant vector can be done with the 
+following instructions:
  
-//===----------------------------------------------------------------------===//
+vTemp = vec_splat(v0,2);    // 2 is the element the src is in.
+vec_ste(&destloc,0,vTemp);
  
-We need a way to teach tblgen that some operands of an intrinsic are required to
-be constants.  The verifier should enforce this constraint.
+We can do an arbitrary non-constant value by using lvsr/perm/ste.
  
  //===----------------------------------------------------------------------===//
  
-Instead of writting a pattern for type-agnostic operations (e.g. gen-zero, load,
-store, and, ...) in every supported type, make legalize do the work.  We should
-have a canonical type that we want operations changed to (e.g. v4i32 for
-build_vector) and legalize should change non-identical types to thse.  This is
-similar to what it does for operations that are only supported in some types,
-e.g. x86 cmov (not supported on bytes).
-
-This would fix two problems:
-1. Writing patterns multiple times.
-2. Identical operations in different types are not getting CSE'd.
+If we want to tie instruction selection into the scheduler, we can do some
+constant formation with different instructions.  For example, we can generate
+"vsplti -1" with "vcmpequw R,R" and 1,1,1,1 with "vsubcuw R,R", and 0,0,0,0 with
+"vsplti 0" or "vxor", each of which uses a different execution unit and thus
+could help scheduling.
  
-We already do this for shuffle and build_vector.  We need load,undef,and,or,xor,
-etc.
+This is probably only reasonable for a post-pass scheduler.
  
  //===----------------------------------------------------------------------===//
  
-Implement multiply for vector integer types, to avoid the horrible scalarized
-code produced by legalize.
+For this function:
  
-void test(vector int *X, vector int *Y) {
-  *X = *X * *Y;
+void test(vector float *A, vector float *B) {
+  vector float C = (vector float)vec_cmpeq(*A, *B);
+  if (!vec_any_eq(*A, *B))
+    *B = (vector float){0,0,0,0};
+  *A = C;
  }
  
-//===----------------------------------------------------------------------===//
-
-There are a wide variety of vector_shuffle operations that we can do with a pair
-of instructions (e.g. a vsldoi + vpkuhum).  We should pattern match these, but
-there are a huge number of these.
+we get the following basic block:
  
-Specific examples:
+       ...
+        lvx v2, 0, r4
+        lvx v3, 0, r3
+        vcmpeqfp v4, v3, v2
+        vcmpeqfp. v2, v3, v2
+        bne cr6, LBB1_2 ; cond_next
  
-C = vector_shuffle A, B, <0, 1, 2, 4>
-->  t = vsldoi A, A, 12
-->  C = vsldoi A, B, 4
+The vcmpeqfp/vcmpeqfp. instructions currently cannot be merged when the
+vcmpeqfp. result is used by a branch.  This can be improved.
  
  //===----------------------------------------------------------------------===//
  
-extract_vector_elt of an arbitrary constant vector can be done with the 
-following instructions:
+The code generated for this is truly awful:
  
-vTemp = vec_splat(v0,2);    // 2 is the element the src is in.
-vec_ste(&destloc,0,vTemp);
+vector float test(float a, float b) {
+ return (vector float){ 0.0, a, 0.0, 0.0}; 
+}
  
-We can do an arbitrary non-constant value by using lvsr/perm/ste.
+LCPI1_0:                                        ;  float
+        .space  4
+        .text
+        .globl  _test
+        .align  4
+_test:
+        mfspr r2, 256
+        oris r3, r2, 4096
+        mtspr 256, r3
+        lis r3, ha16(LCPI1_0)
+        addi r4, r1, -32
+        stfs f1, -16(r1)
+        addi r5, r1, -16
+        lfs f0, lo16(LCPI1_0)(r3)
+        stfs f0, -32(r1)
+        lvx v2, 0, r4
+        lvx v3, 0, r5
+        vmrghw v3, v3, v2
+        vspltw v2, v2, 0
+        vmrghw v2, v2, v3
+        mtspr 256, r2
+        blr
+
+//===----------------------------------------------------------------------===//
+
+int foo(vector float *x, vector float *y) {
+        if (vec_all_eq(*x,*y)) return 3245; 
+        else return 12;
+}
+
+A predicate compare being used in a select_cc should have the same peephole
+applied to it as a predicate compare used by a br_cc.  There should be no
+mfcr here:
+
+_foo:
+        mfspr r2, 256
+        oris r5, r2, 12288
+        mtspr 256, r5
+        li r5, 12
+        li r6, 3245
+        lvx v2, 0, r4
+        lvx v3, 0, r3
+        vcmpeqfp. v2, v3, v2
+        mfcr r3, 2
+        rlwinm r3, r3, 25, 31, 31
+        cmpwi cr0, r3, 0
+        bne cr0, LBB1_2 ; entry
+LBB1_1: ; entry
+        mr r6, r5
+LBB1_2: ; entry
+        mr r3, r6
+        mtspr 256, r2
+        blr
  
  //===----------------------------------------------------------------------===//