Add LLVM support for PPC cryptography builtins

[oota-llvm.git] / lib / Target / PowerPC / README_ALTIVEC.txt
diff --git a/lib/Target/PowerPC/README_ALTIVEC.txt b/lib/Target/PowerPC/README_ALTIVEC.txt

index 9de4415c3e0c39dd2da3752872f04d0a659a5154..34765de1b2179fffb5ec7ede41a26e9969806fc9 100644 (file)
--- a/lib/Target/PowerPC/README_ALTIVEC.txt
+++ b/lib/Target/PowerPC/README_ALTIVEC.txt
@@ -39,16 +39,12 @@ a load/store/lve*x sequence.
  
  //===----------------------------------------------------------------------===//
  
  
  //===----------------------------------------------------------------------===//
  
-FABS/FNEG can be codegen'd with the appropriate and/xor of -0.0.
-
-//===----------------------------------------------------------------------===//
-
  For functions that use altivec AND have calls, we are VRSAVE'ing all call
  clobbered regs.
  
  //===----------------------------------------------------------------------===//
  
  For functions that use altivec AND have calls, we are VRSAVE'ing all call
  clobbered regs.
  
  //===----------------------------------------------------------------------===//
  
-Implement passing vectors by value.
+Implement passing vectors by value into calls and receiving them as arguments.
  
  //===----------------------------------------------------------------------===//
  
  
  //===----------------------------------------------------------------------===//
  
@@ -57,21 +53,15 @@ of C1/C2/C3, then a load and vperm of Variable.
  
  //===----------------------------------------------------------------------===//
  
  
  //===----------------------------------------------------------------------===//
  
-We currently codegen SCALAR_TO_VECTOR as a store of the scalar to a 16-byte
-aligned stack slot, followed by a load/vperm.  We should probably just store it
-to a scalar stack slot, then use lvsl/vperm to load it.  If the value is already
-in memory, this is a huge win.
-
-//===----------------------------------------------------------------------===//
-
-Do not generate the MFCR/RLWINM sequence for predicate compares when the
-predicate compare is used immediately by a branch.  Just branch on the right
-cond code on CR6.
+We need a way to teach tblgen that some operands of an intrinsic are required to
+be constants.  The verifier should enforce this constraint.
  
  //===----------------------------------------------------------------------===//
  
  
  //===----------------------------------------------------------------------===//
  
-We need a way to teach tblgen that some operands of an intrinsic are required to
-be constants.  The verifier should enforce this constraint.
+We currently codegen SCALAR_TO_VECTOR as a store of the scalar to a 16-byte
+aligned stack slot, followed by a load/vperm.  We should probably just store it
+to a scalar stack slot, then use lvsl/vperm to load it.  If the value is already
+in memory this is a big win.
  
  //===----------------------------------------------------------------------===//
  
  
  //===----------------------------------------------------------------------===//
  
@@ -87,10 +77,206 @@ We can do an arbitrary non-constant value by using lvsr/perm/ste.
  
  If we want to tie instruction selection into the scheduler, we can do some
  constant formation with different instructions.  For example, we can generate
  
  If we want to tie instruction selection into the scheduler, we can do some
  constant formation with different instructions.  For example, we can generate
-"vsplti -1" with "vcmpequw R,R" and 1,1,1,1 with "vsubcuw R,R", both of which
-use different execution units, thus could help scheduling.
+"vsplti -1" with "vcmpequw R,R" and 1,1,1,1 with "vsubcuw R,R", and 0,0,0,0 with
+"vsplti 0" or "vxor", each of which use different execution units, thus could
+help scheduling.
  
  This is probably only reasonable for a post-pass scheduler.
  
  //===----------------------------------------------------------------------===//
  
  
  This is probably only reasonable for a post-pass scheduler.
  
  //===----------------------------------------------------------------------===//
  
+For this function:
+
+void test(vector float *A, vector float *B) {
+  vector float C = (vector float)vec_cmpeq(*A, *B);
+  if (!vec_any_eq(*A, *B))
+    *B = (vector float){0,0,0,0};
+  *A = C;
+}
+
+we get the following basic block:
+
+       ...
+        lvx v2, 0, r4
+        lvx v3, 0, r3
+        vcmpeqfp v4, v3, v2
+        vcmpeqfp. v2, v3, v2
+        bne cr6, LBB1_2 ; cond_next
+
+The vcmpeqfp/vcmpeqfp. instructions currently cannot be merged when the
+vcmpeqfp. result is used by a branch.  This can be improved.
+
+//===----------------------------------------------------------------------===//
+
+The code generated for this is truly aweful:
+
+vector float test(float a, float b) {
+ return (vector float){ 0.0, a, 0.0, 0.0}; 
+}
+
+LCPI1_0:                                        ;  float
+        .space  4
+        .text
+        .globl  _test
+        .align  4
+_test:
+        mfspr r2, 256
+        oris r3, r2, 4096
+        mtspr 256, r3
+        lis r3, ha16(LCPI1_0)
+        addi r4, r1, -32
+        stfs f1, -16(r1)
+        addi r5, r1, -16
+        lfs f0, lo16(LCPI1_0)(r3)
+        stfs f0, -32(r1)
+        lvx v2, 0, r4
+        lvx v3, 0, r5
+        vmrghw v3, v3, v2
+        vspltw v2, v2, 0
+        vmrghw v2, v2, v3
+        mtspr 256, r2
+        blr
+
+//===----------------------------------------------------------------------===//
+
+int foo(vector float *x, vector float *y) {
+        if (vec_all_eq(*x,*y)) return 3245; 
+        else return 12;
+}
+
+A predicate compare being used in a select_cc should have the same peephole
+applied to it as a predicate compare used by a br_cc.  There should be no
+mfcr here:
+
+_foo:
+        mfspr r2, 256
+        oris r5, r2, 12288
+        mtspr 256, r5
+        li r5, 12
+        li r6, 3245
+        lvx v2, 0, r4
+        lvx v3, 0, r3
+        vcmpeqfp. v2, v3, v2
+        mfcr r3, 2
+        rlwinm r3, r3, 25, 31, 31
+        cmpwi cr0, r3, 0
+        bne cr0, LBB1_2 ; entry
+LBB1_1: ; entry
+        mr r6, r5
+LBB1_2: ; entry
+        mr r3, r6
+        mtspr 256, r2
+        blr
+
+//===----------------------------------------------------------------------===//
+
+CodeGen/PowerPC/vec_constants.ll has an and operation that should be
+codegen'd to andc.  The issue is that the 'all ones' build vector is
+SelectNodeTo'd a VSPLTISB instruction node before the and/xor is selected
+which prevents the vnot pattern from matching.
+
+
+//===----------------------------------------------------------------------===//
+
+An alternative to the store/store/load approach for illegal insert element 
+lowering would be:
+
+1. store element to any ol' slot
+2. lvx the slot
+3. lvsl 0; splat index; vcmpeq to generate a select mask
+4. lvsl slot + x; vperm to rotate result into correct slot
+5. vsel result together.
+
+//===----------------------------------------------------------------------===//
+
+Should codegen branches on vec_any/vec_all to avoid mfcr.  Two examples:
+
+#include <altivec.h>
+ int f(vector float a, vector float b)
+ {
+  int aa = 0;
+  if (vec_all_ge(a, b))
+    aa |= 0x1;
+  if (vec_any_ge(a,b))
+    aa |= 0x2;
+  return aa;
+}
+
+vector float f(vector float a, vector float b) { 
+  if (vec_any_eq(a, b)) 
+    return a; 
+  else 
+    return b; 
+}
+
+//===----------------------------------------------------------------------===//
+
+We should do a little better with eliminating dead stores.
+The stores to the stack are dead since %a and %b are not needed
+
+; Function Attrs: nounwind
+define <16 x i8> @test_vpmsumb() #0 {
+  entry:
+  %a = alloca <16 x i8>, align 16
+  %b = alloca <16 x i8>, align 16
+  store <16 x i8> <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16>, <16 x i8>* %a, align 16
+  store <16 x i8> <i8 113, i8 114, i8 115, i8 116, i8 117, i8 118, i8 119, i8 120, i8 121, i8 122, i8 123, i8 124, i8 125, i8 126, i8 127, i8 112>, <16 x i8>* %b, align 16
+  %0 = load <16 x i8>* %a, align 16
+  %1 = load <16 x i8>* %b, align 16
+  %2 = call <16 x i8> @llvm.ppc.altivec.crypto.vpmsumb(<16 x i8> %0, <16 x i8> %1)
+  ret <16 x i8> %2
+}
+
+
+; Function Attrs: nounwind readnone
+declare <16 x i8> @llvm.ppc.altivec.crypto.vpmsumb(<16 x i8>, <16 x i8>) #1
+
+
+Produces the following code with -mtriple=powerpc64-unknown-linux-gnu:
+# BB#0:                                 # %entry
+    addis 3, 2, .LCPI0_0@toc@ha
+    addis 4, 2, .LCPI0_1@toc@ha
+    addi 3, 3, .LCPI0_0@toc@l
+    addi 4, 4, .LCPI0_1@toc@l
+    lxvw4x 0, 0, 3
+    addi 3, 1, -16
+    lxvw4x 35, 0, 4
+    stxvw4x 0, 0, 3
+    ori 2, 2, 0
+    lxvw4x 34, 0, 3
+    addi 3, 1, -32
+    stxvw4x 35, 0, 3
+    vpmsumb 2, 2, 3
+    blr
+    .long   0
+    .quad   0
+
+The two stxvw4x instructions are not needed.
+With -mtriple=powerpc64le-unknown-linux-gnu, the associated permutes
+are present too.
+//===----------------------------------------------------------------------===//
+
+The following example is found in test/CodeGen/PowerPC/vec_add_sub_doubleword.ll:
+
+define <2 x i64> @increment_by_val(<2 x i64> %x, i64 %val) nounwind {
+       %tmpvec = insertelement <2 x i64> <i64 0, i64 0>, i64 %val, i32 0
+       %tmpvec2 = insertelement <2 x i64> %tmpvec, i64 %val, i32 1
+       %result = add <2 x i64> %x, %tmpvec2
+       ret <2 x i64> %result
+
+This will generate the following instruction sequence:
+        std 5, -8(1)
+        std 5, -16(1)
+        addi 3, 1, -16
+        ori 2, 2, 0
+        lxvd2x 35, 0, 3
+        vaddudm 2, 2, 3
+        blr
+
+This will almost certainly cause a load-hit-store hazard.  
+Since val is a value parameter, it should not need to be saved onto
+the stack, unless it's being done set up the vector register. Instead,
+it would be better to splat teh value into a vector register, and then
+remove the (dead) stores to the stack.
+
+