md/raid6: delta syndrome for ARM NEON
authorArd Biesheuvel <ard.biesheuvel@linaro.org>
Wed, 1 Jul 2015 02:19:56 +0000 (12:19 +1000)
committerNeilBrown <neilb@suse.com>
Mon, 31 Aug 2015 17:29:05 +0000 (19:29 +0200)
This implements XOR syndrome calculation using NEON intrinsics.
As before, the module can be built for ARM and arm64 from the
same source.

Relative performance on a Cortex-A57 based system:

  raid6: int64x1  gen()   905 MB/s
  raid6: int64x1  xor()   881 MB/s
  raid6: int64x2  gen()  1343 MB/s
  raid6: int64x2  xor()  1286 MB/s
  raid6: int64x4  gen()  1896 MB/s
  raid6: int64x4  xor()  1321 MB/s
  raid6: int64x8  gen()  1773 MB/s
  raid6: int64x8  xor()  1165 MB/s
  raid6: neonx1   gen()  1834 MB/s
  raid6: neonx1   xor()  1278 MB/s
  raid6: neonx2   gen()  2528 MB/s
  raid6: neonx2   xor()  1942 MB/s
  raid6: neonx4   gen()  2888 MB/s
  raid6: neonx4   xor()  2334 MB/s
  raid6: neonx8   gen()  2957 MB/s
  raid6: neonx8   xor()  2232 MB/s
  raid6: using algorithm neonx8 gen() 2957 MB/s
  raid6: .... xor() 2232 MB/s, rmw enabled

Cc: Markus Stockhausen <stockhausen@collogia.de>
Cc: Neil Brown <neilb@suse.de>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: NeilBrown <neilb@suse.com>
lib/raid6/neon.c
lib/raid6/neon.uc

index d9ad6ee284f4b0ae33961fe42528de2df69b591b..7076ef1ba3dd6dca99bcc99f6f5de079cd95167a 100644 (file)
                                        (unsigned long)bytes, ptrs);    \
                kernel_neon_end();                                      \
        }                                                               \
+       static void raid6_neon ## _n ## _xor_syndrome(int disks,        \
+                                       int start, int stop,            \
+                                       size_t bytes, void **ptrs)      \
+       {                                                               \
+               void raid6_neon ## _n  ## _xor_syndrome_real(int,       \
+                               int, int, unsigned long, void**);       \
+               kernel_neon_begin();                                    \
+               raid6_neon ## _n ## _xor_syndrome_real(disks,           \
+                       start, stop, (unsigned long)bytes, ptrs);       \
+               kernel_neon_end();                                      \
+       }                                                               \
        struct raid6_calls const raid6_neonx ## _n = {                  \
                raid6_neon ## _n ## _gen_syndrome,                      \
-               NULL,           /* XOR not yet implemented */           \
+               raid6_neon ## _n ## _xor_syndrome,                      \
                raid6_have_neon,                                        \
                "neonx" #_n,                                            \
                0                                                       \
index 1b9ed793342d757e09d19e2e5cd168e65655c0e7..4fa51b761dd0cb4e0ef6c82fbacff87dfe20383f 100644 (file)
@@ -3,6 +3,7 @@
  *   neon.uc - RAID-6 syndrome calculation using ARM NEON instructions
  *
  *   Copyright (C) 2012 Rob Herring
+ *   Copyright (C) 2015 Linaro Ltd. <ard.biesheuvel@linaro.org>
  *
  *   Based on altivec.uc:
  *     Copyright 2002-2004 H. Peter Anvin - All Rights Reserved
@@ -78,3 +79,48 @@ void raid6_neon$#_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs)
                vst1q_u8(&q[d+NSIZE*$$], wq$$);
        }
 }
+
+void raid6_neon$#_xor_syndrome_real(int disks, int start, int stop,
+                                   unsigned long bytes, void **ptrs)
+{
+       uint8_t **dptr = (uint8_t **)ptrs;
+       uint8_t *p, *q;
+       int d, z, z0;
+
+       register unative_t wd$$, wq$$, wp$$, w1$$, w2$$;
+       const unative_t x1d = NBYTES(0x1d);
+
+       z0 = stop;              /* P/Q right side optimization */
+       p = dptr[disks-2];      /* XOR parity */
+       q = dptr[disks-1];      /* RS syndrome */
+
+       for ( d = 0 ; d < bytes ; d += NSIZE*$# ) {
+               wq$$ = vld1q_u8(&dptr[z0][d+$$*NSIZE]);
+               wp$$ = veorq_u8(vld1q_u8(&p[d+$$*NSIZE]), wq$$);
+
+               /* P/Q data pages */
+               for ( z = z0-1 ; z >= start ; z-- ) {
+                       wd$$ = vld1q_u8(&dptr[z][d+$$*NSIZE]);
+                       wp$$ = veorq_u8(wp$$, wd$$);
+                       w2$$ = MASK(wq$$);
+                       w1$$ = SHLBYTE(wq$$);
+
+                       w2$$ = vandq_u8(w2$$, x1d);
+                       w1$$ = veorq_u8(w1$$, w2$$);
+                       wq$$ = veorq_u8(w1$$, wd$$);
+               }
+               /* P/Q left side optimization */
+               for ( z = start-1 ; z >= 0 ; z-- ) {
+                       w2$$ = MASK(wq$$);
+                       w1$$ = SHLBYTE(wq$$);
+
+                       w2$$ = vandq_u8(w2$$, x1d);
+                       wq$$ = veorq_u8(w1$$, w2$$);
+               }
+               w1$$ = vld1q_u8(&q[d+NSIZE*$$]);
+               wq$$ = veorq_u8(wq$$, w1$$);
+
+               vst1q_u8(&p[d+NSIZE*$$], wp$$);
+               vst1q_u8(&q[d+NSIZE*$$], wq$$);
+       }
+}