arm: crypto: Add optimized SHA-256/224
Author:     Sami Tolvanen <samitolvanen@google.com>
AuthorDate: Mon, 23 Mar 2015 14:42:55 +0000 (14:42 +0000)
Commit:     JP Abgrall <jpa@google.com>
CommitDate: Thu, 9 Apr 2015 17:39:04 +0000 (17:39 +0000)
Add Andy Polyakov's optimized assembly and NEON implementations for
SHA-256/224.

The sha256-armv4.pl script for generating the assembly code is from
OpenSSL commit 51f8d095562f36cdaa6893597b5c609e943b0565.

Compared to sha256-generic, these implementations show the following
tcrypt speed improvements on a Motorola Nexus 6 (Snapdragon 805):

  bs    b/u      sha256-neon  sha256-asm
  16    16       x1.32        x1.19
  64    16       x1.27        x1.15
  64    64       x1.36        x1.20
  256   16       x1.22        x1.11
  256   64       x1.36        x1.19
  256   256      x1.59        x1.23
  1024  16       x1.21        x1.10
  1024  256      x1.65        x1.23
  1024  1024     x1.76        x1.25
  2048  16       x1.21        x1.10
  2048  256      x1.66        x1.23
  2048  1024     x1.78        x1.25
  2048  2048     x1.79        x1.25
  4096  16       x1.20        x1.09
  4096  256      x1.66        x1.23
  4096  1024     x1.79        x1.26
  4096  4096     x1.82        x1.26
  8192  16       x1.20        x1.09
  8192  256      x1.67        x1.23
  8192  1024     x1.80        x1.26
  8192  4096     x1.85        x1.28
  8192  8192     x1.85        x1.27

Where bs refers to block size and b/u to bytes per update.
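
Nothing in this series is called directly by kernel users: the glue code
registers "sha256"/"sha224" drivers with the crypto core, and callers keep
requesting the algorithm by name while the core selects the registered
implementation with the highest cra_priority. As a rough illustration only
(not part of the patch), assuming a kernel where SHASH_DESC_ON_STACK is
available and struct shash_desc still carries a flags field, a synchronous
caller would look roughly like the hypothetical helper below:

  #include <crypto/hash.h>
  #include <crypto/sha.h>
  #include <linux/err.h>
  #include <linux/types.h>

  /*
   * Digest a flat buffer with whichever "sha256" driver the crypto core
   * rates highest by cra_priority; with this patch applied that can be
   * the new assembly or NEON glue instead of sha256-generic.  The helper
   * name sha256_digest_buf() is illustrative, not part of this patch.
   */
  static int sha256_digest_buf(const u8 *data, unsigned int len,
                               u8 out[SHA256_DIGEST_SIZE])
  {
          struct crypto_shash *tfm;
          int err;

          tfm = crypto_alloc_shash("sha256", 0, 0);
          if (IS_ERR(tfm))
                  return PTR_ERR(tfm);

          {
                  /* descriptor sized for the driver that was selected */
                  SHASH_DESC_ON_STACK(desc, tfm);

                  desc->tfm = tfm;
                  desc->flags = 0;  /* shash_desc still has .flags here */
                  err = crypto_shash_digest(desc, data, len, out);
          }

          crypto_free_shash(tfm);
          return err;
  }

The tcrypt hash speed tests allocate "sha256" by name through the same
API, so the table above reflects whichever driver wins that priority
selection on the test device.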

Change-Id: I83938010007660f7f3f77f2946c8d22557e3a327
Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
arch/arm/crypto/Makefile
arch/arm/crypto/sha256-armv4.pl [new file with mode: 0644]
arch/arm/crypto/sha256-core.S_shipped [new file with mode: 0644]
arch/arm/crypto/sha256_glue.c [new file with mode: 0644]
arch/arm/crypto/sha256_glue.h [new file with mode: 0644]
arch/arm/crypto/sha256_neon_glue.c [new file with mode: 0644]
crypto/Kconfig

diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
index b48fa341648d1766a49bf4553e3149836d264c66..2cee53b272384d68cdb060da9e754b2ee6165ff9 100644
--- a/arch/arm/crypto/Makefile
+++ b/arch/arm/crypto/Makefile
@@ -6,12 +6,15 @@ obj-$(CONFIG_CRYPTO_AES_ARM) += aes-arm.o
 obj-$(CONFIG_CRYPTO_AES_ARM_BS) += aes-arm-bs.o
 obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o
 obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o
+obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
 obj-$(CONFIG_CRYPTO_SHA512_ARM_NEON) += sha512-arm-neon.o
 
 aes-arm-y      := aes-armv4.o aes_glue.o
 aes-arm-bs-y   := aesbs-core.o aesbs-glue.o
 sha1-arm-y     := sha1-armv4-large.o sha1_glue.o
 sha1-arm-neon-y        := sha1-armv7-neon.o sha1_neon_glue.o
+sha256-arm-neon-$(CONFIG_KERNEL_MODE_NEON) := sha256_neon_glue.o
+sha256-arm-y   := sha256-core.o sha256_glue.o $(sha256-arm-neon-y)
 sha512-arm-neon-y := sha512-armv7-neon.o sha512_neon_glue.o
 
 quiet_cmd_perl = PERL    $@
@@ -20,4 +23,7 @@ quiet_cmd_perl = PERL    $@
 $(src)/aesbs-core.S_shipped: $(src)/bsaes-armv7.pl
        $(call cmd,perl)
 
-.PRECIOUS: $(obj)/aesbs-core.S
+$(src)/sha256-core.S_shipped: $(src)/sha256-armv4.pl
+       $(call cmd,perl)
+
+.PRECIOUS: $(obj)/aesbs-core.S $(obj)/sha256-core.S
diff --git a/arch/arm/crypto/sha256-armv4.pl b/arch/arm/crypto/sha256-armv4.pl
new file mode 100644
index 0000000..fac0533
--- /dev/null
+++ b/arch/arm/crypto/sha256-armv4.pl
@@ -0,0 +1,716 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+#
+# Permission to use under GPL terms is granted.
+# ====================================================================
+
+# SHA256 block procedure for ARMv4. May 2007.
+
+# Performance is ~2x better than gcc 3.4 generated code and in "abso-
+# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
+# byte [on single-issue Xscale PXA250 core].
+
+# July 2010.
+#
+# Rescheduling for dual-issue pipeline resulted in 22% improvement on
+# Cortex A8 core and ~20 cycles per processed byte.
+
+# February 2011.
+#
+# Profiler-assisted and platform-specific optimization resulted in 16%
+# improvement on Cortex A8 core and ~15.4 cycles per processed byte.
+
+# September 2013.
+#
+# Add NEON implementation. On Cortex A8 it was measured to process one
+# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
+# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
+# code (meaning that latter performs sub-optimally, nothing was done
+# about it).
+
+# May 2014.
+#
+# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+$ctx="r0";     $t0="r0";
+$inp="r1";     $t4="r1";
+$len="r2";     $t1="r2";
+$T1="r3";      $t3="r3";
+$A="r4";
+$B="r5";
+$C="r6";
+$D="r7";
+$E="r8";
+$F="r9";
+$G="r10";
+$H="r11";
+@V=($A,$B,$C,$D,$E,$F,$G,$H);
+$t2="r12";
+$Ktbl="r14";
+
+@Sigma0=( 2,13,22);
+@Sigma1=( 6,11,25);
+@sigma0=( 7,18, 3);
+@sigma1=(17,19,10);
+
+sub BODY_00_15 {
+my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
+
+$code.=<<___ if ($i<16);
+#if __ARM_ARCH__>=7
+       @ ldr   $t1,[$inp],#4                   @ $i
+# if $i==15
+       str     $inp,[sp,#17*4]                 @ make room for $t4
+# endif
+       eor     $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
+       add     $a,$a,$t2                       @ h+=Maj(a,b,c) from the past
+       eor     $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`  @ Sigma1(e)
+# ifndef __ARMEB__
+       rev     $t1,$t1
+# endif
+#else
+       @ ldrb  $t1,[$inp,#3]                   @ $i
+       add     $a,$a,$t2                       @ h+=Maj(a,b,c) from the past
+       ldrb    $t2,[$inp,#2]
+       ldrb    $t0,[$inp,#1]
+       orr     $t1,$t1,$t2,lsl#8
+       ldrb    $t2,[$inp],#4
+       orr     $t1,$t1,$t0,lsl#16
+# if $i==15
+       str     $inp,[sp,#17*4]                 @ make room for $t4
+# endif
+       eor     $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
+       orr     $t1,$t1,$t2,lsl#24
+       eor     $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`  @ Sigma1(e)
+#endif
+___
+$code.=<<___;
+       ldr     $t2,[$Ktbl],#4                  @ *K256++
+       add     $h,$h,$t1                       @ h+=X[i]
+       str     $t1,[sp,#`$i%16`*4]
+       eor     $t1,$f,$g
+       add     $h,$h,$t0,ror#$Sigma1[0]        @ h+=Sigma1(e)
+       and     $t1,$t1,$e
+       add     $h,$h,$t2                       @ h+=K256[i]
+       eor     $t1,$t1,$g                      @ Ch(e,f,g)
+       eor     $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
+       add     $h,$h,$t1                       @ h+=Ch(e,f,g)
+#if $i==31
+       and     $t2,$t2,#0xff
+       cmp     $t2,#0xf2                       @ done?
+#endif
+#if $i<15
+# if __ARM_ARCH__>=7
+       ldr     $t1,[$inp],#4                   @ prefetch
+# else
+       ldrb    $t1,[$inp,#3]
+# endif
+       eor     $t2,$a,$b                       @ a^b, b^c in next round
+#else
+       ldr     $t1,[sp,#`($i+2)%16`*4]         @ from future BODY_16_xx
+       eor     $t2,$a,$b                       @ a^b, b^c in next round
+       ldr     $t4,[sp,#`($i+15)%16`*4]        @ from future BODY_16_xx
+#endif
+       eor     $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`  @ Sigma0(a)
+       and     $t3,$t3,$t2                     @ (b^c)&=(a^b)
+       add     $d,$d,$h                        @ d+=h
+       eor     $t3,$t3,$b                      @ Maj(a,b,c)
+       add     $h,$h,$t0,ror#$Sigma0[0]        @ h+=Sigma0(a)
+       @ add   $h,$h,$t3                       @ h+=Maj(a,b,c)
+___
+       ($t2,$t3)=($t3,$t2);
+}
+
+sub BODY_16_XX {
+my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
+
+$code.=<<___;
+       @ ldr   $t1,[sp,#`($i+1)%16`*4]         @ $i
+       @ ldr   $t4,[sp,#`($i+14)%16`*4]
+       mov     $t0,$t1,ror#$sigma0[0]
+       add     $a,$a,$t2                       @ h+=Maj(a,b,c) from the past
+       mov     $t2,$t4,ror#$sigma1[0]
+       eor     $t0,$t0,$t1,ror#$sigma0[1]
+       eor     $t2,$t2,$t4,ror#$sigma1[1]
+       eor     $t0,$t0,$t1,lsr#$sigma0[2]      @ sigma0(X[i+1])
+       ldr     $t1,[sp,#`($i+0)%16`*4]
+       eor     $t2,$t2,$t4,lsr#$sigma1[2]      @ sigma1(X[i+14])
+       ldr     $t4,[sp,#`($i+9)%16`*4]
+
+       add     $t2,$t2,$t0
+       eor     $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`   @ from BODY_00_15
+       add     $t1,$t1,$t2
+       eor     $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`  @ Sigma1(e)
+       add     $t1,$t1,$t4                     @ X[i]
+___
+       &BODY_00_15(@_);
+}
+
+$code=<<___;
+#ifndef __KERNEL__
+# include "arm_arch.h"
+#else
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+# define __ARM_MAX_ARCH__ 7
+#endif
+
+.text
+#if __ARM_ARCH__<7
+.code  32
+#else
+.syntax unified
+# ifdef __thumb2__
+#  define adrl adr
+.thumb
+# else
+.code   32
+# endif
+#endif
+
+.type  K256,%object
+.align 5
+K256:
+.word  0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.word  0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.word  0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.word  0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.word  0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.word  0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.word  0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.word  0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.word  0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.word  0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.word  0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.word  0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.word  0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.word  0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.word  0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.word  0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+.size  K256,.-K256
+.word  0                               @ terminator
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.LOPENSSL_armcap:
+.word  OPENSSL_armcap_P-sha256_block_data_order
+#endif
+.align 5
+
+.global        sha256_block_data_order
+.type  sha256_block_data_order,%function
+sha256_block_data_order:
+#if __ARM_ARCH__<7
+       sub     r3,pc,#8                @ sha256_block_data_order
+#else
+       adr     r3,sha256_block_data_order
+#endif
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+       ldr     r12,.LOPENSSL_armcap
+       ldr     r12,[r3,r12]            @ OPENSSL_armcap_P
+       tst     r12,#ARMV8_SHA256
+       bne     .LARMv8
+       tst     r12,#ARMV7_NEON
+       bne     .LNEON
+#endif
+       add     $len,$inp,$len,lsl#6    @ len to point at the end of inp
+       stmdb   sp!,{$ctx,$inp,$len,r4-r11,lr}
+       ldmia   $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
+       sub     $Ktbl,r3,#256+32        @ K256
+       sub     sp,sp,#16*4             @ alloca(X[16])
+.Loop:
+# if __ARM_ARCH__>=7
+       ldr     $t1,[$inp],#4
+# else
+       ldrb    $t1,[$inp,#3]
+# endif
+       eor     $t3,$B,$C               @ magic
+       eor     $t2,$t2,$t2
+___
+for($i=0;$i<16;$i++)   { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
+$code.=".Lrounds_16_xx:\n";
+for (;$i<32;$i++)      { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+#if __ARM_ARCH__>=7
+       ite     eq                      @ Thumb2 thing, sanity check in ARM
+#endif
+       ldreq   $t3,[sp,#16*4]          @ pull ctx
+       bne     .Lrounds_16_xx
+
+       add     $A,$A,$t2               @ h+=Maj(a,b,c) from the past
+       ldr     $t0,[$t3,#0]
+       ldr     $t1,[$t3,#4]
+       ldr     $t2,[$t3,#8]
+       add     $A,$A,$t0
+       ldr     $t0,[$t3,#12]
+       add     $B,$B,$t1
+       ldr     $t1,[$t3,#16]
+       add     $C,$C,$t2
+       ldr     $t2,[$t3,#20]
+       add     $D,$D,$t0
+       ldr     $t0,[$t3,#24]
+       add     $E,$E,$t1
+       ldr     $t1,[$t3,#28]
+       add     $F,$F,$t2
+       ldr     $inp,[sp,#17*4]         @ pull inp
+       ldr     $t2,[sp,#18*4]          @ pull inp+len
+       add     $G,$G,$t0
+       add     $H,$H,$t1
+       stmia   $t3,{$A,$B,$C,$D,$E,$F,$G,$H}
+       cmp     $inp,$t2
+       sub     $Ktbl,$Ktbl,#256        @ rewind Ktbl
+       bne     .Loop
+
+       add     sp,sp,#`16+3`*4 @ destroy frame
+#if __ARM_ARCH__>=5
+       ldmia   sp!,{r4-r11,pc}
+#else
+       ldmia   sp!,{r4-r11,lr}
+       tst     lr,#1
+       moveq   pc,lr                   @ be binary compatible with V4, yet
+       bx      lr                      @ interoperable with Thumb ISA:-)
+#endif
+.size  sha256_block_data_order,.-sha256_block_data_order
+___
+######################################################################
+# NEON stuff
+#
+{{{
+my @X=map("q$_",(0..3));
+my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
+my $Xfer=$t4;
+my $j=0;
+
+sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
+sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
+
+sub AUTOLOAD()          # thunk [simplified] x86-style perlasm
+{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
+  my $arg = pop;
+    $arg = "#$arg" if ($arg*1 eq $arg);
+    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
+}
+
+sub Xupdate()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);
+  my ($a,$b,$c,$d,$e,$f,$g,$h);
+
+       &vext_8         ($T0,@X[0],@X[1],4);    # X[1..4]
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vext_8         ($T1,@X[2],@X[3],4);    # X[9..12]
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vshr_u32       ($T2,$T0,$sigma0[0]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vadd_i32       (@X[0],@X[0],$T1);      # X[0..3] += X[9..12]
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vshr_u32       ($T1,$T0,$sigma0[2]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vsli_32        ($T2,$T0,32-$sigma0[0]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vshr_u32       ($T3,$T0,$sigma0[1]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &veor           ($T1,$T1,$T2);
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vsli_32        ($T3,$T0,32-$sigma0[1]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &vshr_u32     ($T4,&Dhi(@X[3]),$sigma1[0]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &veor           ($T1,$T1,$T3);          # sigma0(X[1..4])
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &vsli_32      ($T4,&Dhi(@X[3]),32-$sigma1[0]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &vshr_u32     ($T5,&Dhi(@X[3]),$sigma1[2]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vadd_i32       (@X[0],@X[0],$T1);      # X[0..3] += sigma0(X[1..4])
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &veor         ($T5,$T5,$T4);
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &vshr_u32     ($T4,&Dhi(@X[3]),$sigma1[1]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &vsli_32      ($T4,&Dhi(@X[3]),32-$sigma1[1]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &veor         ($T5,$T5,$T4);          # sigma1(X[14..15])
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vadd_i32       (&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &vshr_u32     ($T4,&Dlo(@X[0]),$sigma1[0]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &vsli_32      ($T4,&Dlo(@X[0]),32-$sigma1[0]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &vshr_u32     ($T5,&Dlo(@X[0]),$sigma1[2]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &veor         ($T5,$T5,$T4);
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &vshr_u32     ($T4,&Dlo(@X[0]),$sigma1[1]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vld1_32        ("{$T0}","[$Ktbl,:128]!");
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &vsli_32      ($T4,&Dlo(@X[0]),32-$sigma1[1]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &veor         ($T5,$T5,$T4);          # sigma1(X[16..17])
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vadd_i32       (&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vadd_i32       ($T0,$T0,@X[0]);
+        while($#insns>=2) { eval(shift(@insns)); }
+       &vst1_32        ("{$T0}","[$Xfer,:128]!");
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+       push(@X,shift(@X));             # "rotate" X[]
+}
+
+sub Xpreload()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);
+  my ($a,$b,$c,$d,$e,$f,$g,$h);
+
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vld1_32        ("{$T0}","[$Ktbl,:128]!");
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vrev32_8       (@X[0],@X[0]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vadd_i32       ($T0,$T0,@X[0]);
+        foreach (@insns) { eval; }     # remaining instructions
+       &vst1_32        ("{$T0}","[$Xfer,:128]!");
+
+       push(@X,shift(@X));             # "rotate" X[]
+}
+
+sub body_00_15 () {
+       (
+       '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
+       '&add   ($h,$h,$t1)',                   # h+=X[i]+K[i]
+       '&eor   ($t1,$f,$g)',
+       '&eor   ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
+       '&add   ($a,$a,$t2)',                   # h+=Maj(a,b,c) from the past
+       '&and   ($t1,$t1,$e)',
+       '&eor   ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',   # Sigma1(e)
+       '&eor   ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
+       '&eor   ($t1,$t1,$g)',                  # Ch(e,f,g)
+       '&add   ($h,$h,$t2,"ror#$Sigma1[0]")',  # h+=Sigma1(e)
+       '&eor   ($t2,$a,$b)',                   # a^b, b^c in next round
+       '&eor   ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',   # Sigma0(a)
+       '&add   ($h,$h,$t1)',                   # h+=Ch(e,f,g)
+       '&ldr   ($t1,sprintf "[sp,#%d]",4*(($j+1)&15))  if (($j&15)!=15);'.
+       '&ldr   ($t1,"[$Ktbl]")                         if ($j==15);'.
+       '&ldr   ($t1,"[sp,#64]")                        if ($j==31)',
+       '&and   ($t3,$t3,$t2)',                 # (b^c)&=(a^b)
+       '&add   ($d,$d,$h)',                    # d+=h
+       '&add   ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
+       '&eor   ($t3,$t3,$b)',                  # Maj(a,b,c)
+       '$j++;  unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
+       )
+}
+
+$code.=<<___;
+#if __ARM_MAX_ARCH__>=7
+.arch  armv7-a
+.fpu   neon
+
+.global        sha256_block_data_order_neon
+.type  sha256_block_data_order_neon,%function
+.align 4
+sha256_block_data_order_neon:
+.LNEON:
+       stmdb   sp!,{r4-r12,lr}
+
+       sub     $H,sp,#16*4+16
+       adrl    $Ktbl,K256
+       bic     $H,$H,#15               @ align for 128-bit stores
+       mov     $t2,sp
+       mov     sp,$H                   @ alloca
+       add     $len,$inp,$len,lsl#6    @ len to point at the end of inp
+
+       vld1.8          {@X[0]},[$inp]!
+       vld1.8          {@X[1]},[$inp]!
+       vld1.8          {@X[2]},[$inp]!
+       vld1.8          {@X[3]},[$inp]!
+       vld1.32         {$T0},[$Ktbl,:128]!
+       vld1.32         {$T1},[$Ktbl,:128]!
+       vld1.32         {$T2},[$Ktbl,:128]!
+       vld1.32         {$T3},[$Ktbl,:128]!
+       vrev32.8        @X[0],@X[0]             @ yes, even on
+       str             $ctx,[sp,#64]
+       vrev32.8        @X[1],@X[1]             @ big-endian
+       str             $inp,[sp,#68]
+       mov             $Xfer,sp
+       vrev32.8        @X[2],@X[2]
+       str             $len,[sp,#72]
+       vrev32.8        @X[3],@X[3]
+       str             $t2,[sp,#76]            @ save original sp
+       vadd.i32        $T0,$T0,@X[0]
+       vadd.i32        $T1,$T1,@X[1]
+       vst1.32         {$T0},[$Xfer,:128]!
+       vadd.i32        $T2,$T2,@X[2]
+       vst1.32         {$T1},[$Xfer,:128]!
+       vadd.i32        $T3,$T3,@X[3]
+       vst1.32         {$T2},[$Xfer,:128]!
+       vst1.32         {$T3},[$Xfer,:128]!
+
+       ldmia           $ctx,{$A-$H}
+       sub             $Xfer,$Xfer,#64
+       ldr             $t1,[sp,#0]
+       eor             $t2,$t2,$t2
+       eor             $t3,$B,$C
+       b               .L_00_48
+
+.align 4
+.L_00_48:
+___
+       &Xupdate(\&body_00_15);
+       &Xupdate(\&body_00_15);
+       &Xupdate(\&body_00_15);
+       &Xupdate(\&body_00_15);
+$code.=<<___;
+       teq     $t1,#0                          @ check for K256 terminator
+       ldr     $t1,[sp,#0]
+       sub     $Xfer,$Xfer,#64
+       bne     .L_00_48
+
+       ldr             $inp,[sp,#68]
+       ldr             $t0,[sp,#72]
+       sub             $Ktbl,$Ktbl,#256        @ rewind $Ktbl
+       teq             $inp,$t0
+       it              eq
+       subeq           $inp,$inp,#64           @ avoid SEGV
+       vld1.8          {@X[0]},[$inp]!         @ load next input block
+       vld1.8          {@X[1]},[$inp]!
+       vld1.8          {@X[2]},[$inp]!
+       vld1.8          {@X[3]},[$inp]!
+       it              ne
+       strne           $inp,[sp,#68]
+       mov             $Xfer,sp
+___
+       &Xpreload(\&body_00_15);
+       &Xpreload(\&body_00_15);
+       &Xpreload(\&body_00_15);
+       &Xpreload(\&body_00_15);
+$code.=<<___;
+       ldr     $t0,[$t1,#0]
+       add     $A,$A,$t2                       @ h+=Maj(a,b,c) from the past
+       ldr     $t2,[$t1,#4]
+       ldr     $t3,[$t1,#8]
+       ldr     $t4,[$t1,#12]
+       add     $A,$A,$t0                       @ accumulate
+       ldr     $t0,[$t1,#16]
+       add     $B,$B,$t2
+       ldr     $t2,[$t1,#20]
+       add     $C,$C,$t3
+       ldr     $t3,[$t1,#24]
+       add     $D,$D,$t4
+       ldr     $t4,[$t1,#28]
+       add     $E,$E,$t0
+       str     $A,[$t1],#4
+       add     $F,$F,$t2
+       str     $B,[$t1],#4
+       add     $G,$G,$t3
+       str     $C,[$t1],#4
+       add     $H,$H,$t4
+       str     $D,[$t1],#4
+       stmia   $t1,{$E-$H}
+
+       ittte   ne
+       movne   $Xfer,sp
+       ldrne   $t1,[sp,#0]
+       eorne   $t2,$t2,$t2
+       ldreq   sp,[sp,#76]                     @ restore original sp
+       itt     ne
+       eorne   $t3,$B,$C
+       bne     .L_00_48
+
+       ldmia   sp!,{r4-r12,pc}
+.size  sha256_block_data_order_neon,.-sha256_block_data_order_neon
+#endif
+___
+}}}
+######################################################################
+# ARMv8 stuff
+#
+{{{
+my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
+my @MSG=map("q$_",(8..11));
+my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
+my $Ktbl="r3";
+
+$code.=<<___;
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+
+# ifdef __thumb2__
+#  define INST(a,b,c,d)        .byte   c,d|0xc,a,b
+# else
+#  define INST(a,b,c,d)        .byte   a,b,c,d
+# endif
+
+.type  sha256_block_data_order_armv8,%function
+.align 5
+sha256_block_data_order_armv8:
+.LARMv8:
+       vld1.32 {$ABCD,$EFGH},[$ctx]
+# ifdef __thumb2__
+       adr     $Ktbl,.LARMv8
+       sub     $Ktbl,$Ktbl,#.LARMv8-K256
+# else
+       adrl    $Ktbl,K256
+# endif
+       add     $len,$inp,$len,lsl#6    @ len to point at the end of inp
+
+.Loop_v8:
+       vld1.8          {@MSG[0]-@MSG[1]},[$inp]!
+       vld1.8          {@MSG[2]-@MSG[3]},[$inp]!
+       vld1.32         {$W0},[$Ktbl]!
+       vrev32.8        @MSG[0],@MSG[0]
+       vrev32.8        @MSG[1],@MSG[1]
+       vrev32.8        @MSG[2],@MSG[2]
+       vrev32.8        @MSG[3],@MSG[3]
+       vmov            $ABCD_SAVE,$ABCD        @ offload
+       vmov            $EFGH_SAVE,$EFGH
+       teq             $inp,$len
+___
+for($i=0;$i<12;$i++) {
+$code.=<<___;
+       vld1.32         {$W1},[$Ktbl]!
+       vadd.i32        $W0,$W0,@MSG[0]
+       sha256su0       @MSG[0],@MSG[1]
+       vmov            $abcd,$ABCD
+       sha256h         $ABCD,$EFGH,$W0
+       sha256h2        $EFGH,$abcd,$W0
+       sha256su1       @MSG[0],@MSG[2],@MSG[3]
+___
+       ($W0,$W1)=($W1,$W0);    push(@MSG,shift(@MSG));
+}
+$code.=<<___;
+       vld1.32         {$W1},[$Ktbl]!
+       vadd.i32        $W0,$W0,@MSG[0]
+       vmov            $abcd,$ABCD
+       sha256h         $ABCD,$EFGH,$W0
+       sha256h2        $EFGH,$abcd,$W0
+
+       vld1.32         {$W0},[$Ktbl]!
+       vadd.i32        $W1,$W1,@MSG[1]
+       vmov            $abcd,$ABCD
+       sha256h         $ABCD,$EFGH,$W1
+       sha256h2        $EFGH,$abcd,$W1
+
+       vld1.32         {$W1},[$Ktbl]
+       vadd.i32        $W0,$W0,@MSG[2]
+       sub             $Ktbl,$Ktbl,#256-16     @ rewind
+       vmov            $abcd,$ABCD
+       sha256h         $ABCD,$EFGH,$W0
+       sha256h2        $EFGH,$abcd,$W0
+
+       vadd.i32        $W1,$W1,@MSG[3]
+       vmov            $abcd,$ABCD
+       sha256h         $ABCD,$EFGH,$W1
+       sha256h2        $EFGH,$abcd,$W1
+
+       vadd.i32        $ABCD,$ABCD,$ABCD_SAVE
+       vadd.i32        $EFGH,$EFGH,$EFGH_SAVE
+       it              ne
+       bne             .Loop_v8
+
+       vst1.32         {$ABCD,$EFGH},[$ctx]
+
+       ret             @ bx lr
+.size  sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
+#endif
+___
+}}}
+$code.=<<___;
+.asciz  "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
+.align 2
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.comm   OPENSSL_armcap_P,4,4
+#endif
+___
+
+open SELF,$0;
+while(<SELF>) {
+       next if (/^#!/);
+       last if (!s/^#/@/ and !/^$/);
+       print;
+}
+close SELF;
+
+{   my  %opcode = (
+       "sha256h"       => 0xf3000c40,  "sha256h2"      => 0xf3100c40,
+       "sha256su0"     => 0xf3ba03c0,  "sha256su1"     => 0xf3200c40   );
+
+    sub unsha256 {
+       my ($mnemonic,$arg)=@_;
+
+       if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
+           my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
+                                        |(($2&7)<<17)|(($2&8)<<4)
+                                        |(($3&7)<<1) |(($3&8)<<2);
+           # since ARMv7 instructions are always encoded little-endian.
+           # correct solution is to use .inst directive, but older
+           # assemblers don't implement it:-(
+           sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
+                       $word&0xff,($word>>8)&0xff,
+                       ($word>>16)&0xff,($word>>24)&0xff,
+                       $mnemonic,$arg;
+       }
+    }
+}
+
+foreach (split($/,$code)) {
+
+       s/\`([^\`]*)\`/eval $1/geo;
+
+       s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;
+
+       s/\bret\b/bx    lr/go           or
+       s/\bbx\s+lr\b/.word\t0xe12fff1e/go;     # make it possible to compile with -march=armv4
+
+       print $_,"\n";
+}
+
+close STDOUT; # enforce flush
diff --git a/arch/arm/crypto/sha256-core.S_shipped b/arch/arm/crypto/sha256-core.S_shipped
new file mode 100644
index 0000000..555a1a8
--- /dev/null
+++ b/arch/arm/crypto/sha256-core.S_shipped
@@ -0,0 +1,2808 @@
+
+@ ====================================================================
+@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+@ project. The module is, however, dual licensed under OpenSSL and
+@ CRYPTOGAMS licenses depending on where you obtain it. For further
+@ details see http://www.openssl.org/~appro/cryptogams/.
+@
+@ Permission to use under GPL terms is granted.
+@ ====================================================================
+
+@ SHA256 block procedure for ARMv4. May 2007.
+
+@ Performance is ~2x better than gcc 3.4 generated code and in "abso-
+@ lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
+@ byte [on single-issue Xscale PXA250 core].
+
+@ July 2010.
+@
+@ Rescheduling for dual-issue pipeline resulted in 22% improvement on
+@ Cortex A8 core and ~20 cycles per processed byte.
+
+@ February 2011.
+@
+@ Profiler-assisted and platform-specific optimization resulted in 16%
+@ improvement on Cortex A8 core and ~15.4 cycles per processed byte.
+
+@ September 2013.
+@
+@ Add NEON implementation. On Cortex A8 it was measured to process one
+@ byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
+@ S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
+@ code (meaning that latter performs sub-optimally, nothing was done
+@ about it).
+
+@ May 2014.
+@
+@ Add ARMv8 code path performing at 2.0 cpb on Apple A7.
+
+#ifndef __KERNEL__
+# include "arm_arch.h"
+#else
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+# define __ARM_MAX_ARCH__ 7
+#endif
+
+.text
+#if __ARM_ARCH__<7
+.code  32
+#else
+.syntax unified
+# ifdef __thumb2__
+#  define adrl adr
+.thumb
+# else
+.code   32
+# endif
+#endif
+
+.type  K256,%object
+.align 5
+K256:
+.word  0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.word  0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.word  0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.word  0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.word  0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.word  0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.word  0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.word  0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.word  0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.word  0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.word  0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.word  0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.word  0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.word  0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.word  0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.word  0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+.size  K256,.-K256
+.word  0                               @ terminator
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.LOPENSSL_armcap:
+.word  OPENSSL_armcap_P-sha256_block_data_order
+#endif
+.align 5
+
+.global        sha256_block_data_order
+.type  sha256_block_data_order,%function
+sha256_block_data_order:
+#if __ARM_ARCH__<7
+       sub     r3,pc,#8                @ sha256_block_data_order
+#else
+       adr     r3,sha256_block_data_order
+#endif
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+       ldr     r12,.LOPENSSL_armcap
+       ldr     r12,[r3,r12]            @ OPENSSL_armcap_P
+       tst     r12,#ARMV8_SHA256
+       bne     .LARMv8
+       tst     r12,#ARMV7_NEON
+       bne     .LNEON
+#endif
+       add     r2,r1,r2,lsl#6  @ len to point at the end of inp
+       stmdb   sp!,{r0,r1,r2,r4-r11,lr}
+       ldmia   r0,{r4,r5,r6,r7,r8,r9,r10,r11}
+       sub     r14,r3,#256+32  @ K256
+       sub     sp,sp,#16*4             @ alloca(X[16])
+.Loop:
+# if __ARM_ARCH__>=7
+       ldr     r2,[r1],#4
+# else
+       ldrb    r2,[r1,#3]
+# endif
+       eor     r3,r5,r6                @ magic
+       eor     r12,r12,r12
+#if __ARM_ARCH__>=7
+       @ ldr   r2,[r1],#4                      @ 0
+# if 0==15
+       str     r1,[sp,#17*4]                   @ make room for r1
+# endif
+       eor     r0,r8,r8,ror#5
+       add     r4,r4,r12                       @ h+=Maj(a,b,c) from the past
+       eor     r0,r0,r8,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+       rev     r2,r2
+# endif
+#else
+       @ ldrb  r2,[r1,#3]                      @ 0
+       add     r4,r4,r12                       @ h+=Maj(a,b,c) from the past
+       ldrb    r12,[r1,#2]
+       ldrb    r0,[r1,#1]
+       orr     r2,r2,r12,lsl#8
+       ldrb    r12,[r1],#4
+       orr     r2,r2,r0,lsl#16
+# if 0==15
+       str     r1,[sp,#17*4]                   @ make room for r1
+# endif
+       eor     r0,r8,r8,ror#5
+       orr     r2,r2,r12,lsl#24
+       eor     r0,r0,r8,ror#19 @ Sigma1(e)
+#endif
+       ldr     r12,[r14],#4                    @ *K256++
+       add     r11,r11,r2                      @ h+=X[i]
+       str     r2,[sp,#0*4]
+       eor     r2,r9,r10
+       add     r11,r11,r0,ror#6        @ h+=Sigma1(e)
+       and     r2,r2,r8
+       add     r11,r11,r12                     @ h+=K256[i]
+       eor     r2,r2,r10                       @ Ch(e,f,g)
+       eor     r0,r4,r4,ror#11
+       add     r11,r11,r2                      @ h+=Ch(e,f,g)
+#if 0==31
+       and     r12,r12,#0xff
+       cmp     r12,#0xf2                       @ done?
+#endif
+#if 0<15
+# if __ARM_ARCH__>=7
+       ldr     r2,[r1],#4                      @ prefetch
+# else
+       ldrb    r2,[r1,#3]
+# endif
+       eor     r12,r4,r5                       @ a^b, b^c in next round
+#else
+       ldr     r2,[sp,#2*4]            @ from future BODY_16_xx
+       eor     r12,r4,r5                       @ a^b, b^c in next round
+       ldr     r1,[sp,#15*4]   @ from future BODY_16_xx
+#endif
+       eor     r0,r0,r4,ror#20 @ Sigma0(a)
+       and     r3,r3,r12                       @ (b^c)&=(a^b)
+       add     r7,r7,r11                       @ d+=h
+       eor     r3,r3,r5                        @ Maj(a,b,c)
+       add     r11,r11,r0,ror#2        @ h+=Sigma0(a)
+       @ add   r11,r11,r3                      @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+       @ ldr   r2,[r1],#4                      @ 1
+# if 1==15
+       str     r1,[sp,#17*4]                   @ make room for r1
+# endif
+       eor     r0,r7,r7,ror#5
+       add     r11,r11,r3                      @ h+=Maj(a,b,c) from the past
+       eor     r0,r0,r7,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+       rev     r2,r2
+# endif
+#else
+       @ ldrb  r2,[r1,#3]                      @ 1
+       add     r11,r11,r3                      @ h+=Maj(a,b,c) from the past
+       ldrb    r3,[r1,#2]
+       ldrb    r0,[r1,#1]
+       orr     r2,r2,r3,lsl#8
+       ldrb    r3,[r1],#4
+       orr     r2,r2,r0,lsl#16
+# if 1==15
+       str     r1,[sp,#17*4]                   @ make room for r1
+# endif
+       eor     r0,r7,r7,ror#5
+       orr     r2,r2,r3,lsl#24
+       eor     r0,r0,r7,ror#19 @ Sigma1(e)
+#endif
+       ldr     r3,[r14],#4                     @ *K256++
+       add     r10,r10,r2                      @ h+=X[i]
+       str     r2,[sp,#1*4]
+       eor     r2,r8,r9
+       add     r10,r10,r0,ror#6        @ h+=Sigma1(e)
+       and     r2,r2,r7
+       add     r10,r10,r3                      @ h+=K256[i]
+       eor     r2,r2,r9                        @ Ch(e,f,g)
+       eor     r0,r11,r11,ror#11
+       add     r10,r10,r2                      @ h+=Ch(e,f,g)
+#if 1==31
+       and     r3,r3,#0xff
+       cmp     r3,#0xf2                        @ done?
+#endif
+#if 1<15
+# if __ARM_ARCH__>=7
+       ldr     r2,[r1],#4                      @ prefetch
+# else
+       ldrb    r2,[r1,#3]
+# endif
+       eor     r3,r11,r4                       @ a^b, b^c in next round
+#else
+       ldr     r2,[sp,#3*4]            @ from future BODY_16_xx
+       eor     r3,r11,r4                       @ a^b, b^c in next round
+       ldr     r1,[sp,#0*4]    @ from future BODY_16_xx
+#endif
+       eor     r0,r0,r11,ror#20        @ Sigma0(a)
+       and     r12,r12,r3                      @ (b^c)&=(a^b)
+       add     r6,r6,r10                       @ d+=h
+       eor     r12,r12,r4                      @ Maj(a,b,c)
+       add     r10,r10,r0,ror#2        @ h+=Sigma0(a)
+       @ add   r10,r10,r12                     @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+       @ ldr   r2,[r1],#4                      @ 2
+# if 2==15
+       str     r1,[sp,#17*4]                   @ make room for r1
+# endif
+       eor     r0,r6,r6,ror#5
+       add     r10,r10,r12                     @ h+=Maj(a,b,c) from the past
+       eor     r0,r0,r6,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+       rev     r2,r2
+# endif
+#else
+       @ ldrb  r2,[r1,#3]                      @ 2
+       add     r10,r10,r12                     @ h+=Maj(a,b,c) from the past
+       ldrb    r12,[r1,#2]
+       ldrb    r0,[r1,#1]
+       orr     r2,r2,r12,lsl#8
+       ldrb    r12,[r1],#4
+       orr     r2,r2,r0,lsl#16
+# if 2==15
+       str     r1,[sp,#17*4]                   @ make room for r1
+# endif
+       eor     r0,r6,r6,ror#5
+       orr     r2,r2,r12,lsl#24
+       eor     r0,r0,r6,ror#19 @ Sigma1(e)
+#endif
+       ldr     r12,[r14],#4                    @ *K256++
+       add     r9,r9,r2                        @ h+=X[i]
+       str     r2,[sp,#2*4]
+       eor     r2,r7,r8
+       add     r9,r9,r0,ror#6  @ h+=Sigma1(e)
+       and     r2,r2,r6
+       add     r9,r9,r12                       @ h+=K256[i]
+       eor     r2,r2,r8                        @ Ch(e,f,g)
+       eor     r0,r10,r10,ror#11
+       add     r9,r9,r2                        @ h+=Ch(e,f,g)
+#if 2==31
+       and     r12,r12,#0xff
+       cmp     r12,#0xf2                       @ done?
+#endif
+#if 2<15
+# if __ARM_ARCH__>=7
+       ldr     r2,[r1],#4                      @ prefetch
+# else
+       ldrb    r2,[r1,#3]
+# endif
+       eor     r12,r10,r11                     @ a^b, b^c in next round
+#else
+       ldr     r2,[sp,#4*4]            @ from future BODY_16_xx
+       eor     r12,r10,r11                     @ a^b, b^c in next round
+       ldr     r1,[sp,#1*4]    @ from future BODY_16_xx
+#endif
+       eor     r0,r0,r10,ror#20        @ Sigma0(a)
+       and     r3,r3,r12                       @ (b^c)&=(a^b)
+       add     r5,r5,r9                        @ d+=h
+       eor     r3,r3,r11                       @ Maj(a,b,c)
+       add     r9,r9,r0,ror#2  @ h+=Sigma0(a)
+       @ add   r9,r9,r3                        @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+       @ ldr   r2,[r1],#4                      @ 3
+# if 3==15
+       str     r1,[sp,#17*4]                   @ make room for r1
+# endif
+       eor     r0,r5,r5,ror#5
+       add     r9,r9,r3                        @ h+=Maj(a,b,c) from the past
+       eor     r0,r0,r5,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+       rev     r2,r2
+# endif
+#else
+       @ ldrb  r2,[r1,#3]                      @ 3
+       add     r9,r9,r3                        @ h+=Maj(a,b,c) from the past
+       ldrb    r3,[r1,#2]
+       ldrb    r0,[r1,#1]
+       orr     r2,r2,r3,lsl#8
+       ldrb    r3,[r1],#4
+       orr     r2,r2,r0,lsl#16
+# if 3==15
+       str     r1,[sp,#17*4]                   @ make room for r1
+# endif
+       eor     r0,r5,r5,ror#5
+       orr     r2,r2,r3,lsl#24
+       eor     r0,r0,r5,ror#19 @ Sigma1(e)
+#endif
+       ldr     r3,[r14],#4                     @ *K256++
+       add     r8,r8,r2                        @ h+=X[i]
+       str     r2,[sp,#3*4]
+       eor     r2,r6,r7
+       add     r8,r8,r0,ror#6  @ h+=Sigma1(e)
+       and     r2,r2,r5
+       add     r8,r8,r3                        @ h+=K256[i]
+       eor     r2,r2,r7                        @ Ch(e,f,g)
+       eor     r0,r9,r9,ror#11
+       add     r8,r8,r2                        @ h+=Ch(e,f,g)
+#if 3==31
+       and     r3,r3,#0xff
+       cmp     r3,#0xf2                        @ done?
+#endif
+#if 3<15
+# if __ARM_ARCH__>=7
+       ldr     r2,[r1],#4                      @ prefetch
+# else
+       ldrb    r2,[r1,#3]
+# endif
+       eor     r3,r9,r10                       @ a^b, b^c in next round
+#else
+       ldr     r2,[sp,#5*4]            @ from future BODY_16_xx
+       eor     r3,r9,r10                       @ a^b, b^c in next round
+       ldr     r1,[sp,#2*4]    @ from future BODY_16_xx
+#endif
+       eor     r0,r0,r9,ror#20 @ Sigma0(a)
+       and     r12,r12,r3                      @ (b^c)&=(a^b)
+       add     r4,r4,r8                        @ d+=h
+       eor     r12,r12,r10                     @ Maj(a,b,c)
+       add     r8,r8,r0,ror#2  @ h+=Sigma0(a)
+       @ add   r8,r8,r12                       @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+       @ ldr   r2,[r1],#4                      @ 4
+# if 4==15
+       str     r1,[sp,#17*4]                   @ make room for r1
+# endif
+       eor     r0,r4,r4,ror#5
+       add     r8,r8,r12                       @ h+=Maj(a,b,c) from the past
+       eor     r0,r0,r4,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+       rev     r2,r2
+# endif
+#else
+       @ ldrb  r2,[r1,#3]                      @ 4
+       add     r8,r8,r12                       @ h+=Maj(a,b,c) from the past
+       ldrb    r12,[r1,#2]
+       ldrb    r0,[r1,#1]
+       orr     r2,r2,r12,lsl#8
+       ldrb    r12,[r1],#4
+       orr     r2,r2,r0,lsl#16
+# if 4==15
+       str     r1,[sp,#17*4]                   @ make room for r1
+# endif
+       eor     r0,r4,r4,ror#5
+       orr     r2,r2,r12,lsl#24
+       eor     r0,r0,r4,ror#19 @ Sigma1(e)
+#endif
+       ldr     r12,[r14],#4                    @ *K256++
+       add     r7,r7,r2                        @ h+=X[i]
+       str     r2,[sp,#4*4]
+       eor     r2,r5,r6
+       add     r7,r7,r0,ror#6  @ h+=Sigma1(e)
+       and     r2,r2,r4
+       add     r7,r7,r12                       @ h+=K256[i]
+       eor     r2,r2,r6                        @ Ch(e,f,g)
+       eor     r0,r8,r8,ror#11
+       add     r7,r7,r2                        @ h+=Ch(e,f,g)
+#if 4==31
+       and     r12,r12,#0xff
+       cmp     r12,#0xf2                       @ done?
+#endif
+#if 4<15
+# if __ARM_ARCH__>=7
+       ldr     r2,[r1],#4                      @ prefetch
+# else
+       ldrb    r2,[r1,#3]
+# endif
+       eor     r12,r8,r9                       @ a^b, b^c in next round
+#else
+       ldr     r2,[sp,#6*4]            @ from future BODY_16_xx
+       eor     r12,r8,r9                       @ a^b, b^c in next round
+       ldr     r1,[sp,#3*4]    @ from future BODY_16_xx
+#endif
+       eor     r0,r0,r8,ror#20 @ Sigma0(a)
+       and     r3,r3,r12                       @ (b^c)&=(a^b)
+       add     r11,r11,r7                      @ d+=h
+       eor     r3,r3,r9                        @ Maj(a,b,c)
+       add     r7,r7,r0,ror#2  @ h+=Sigma0(a)
+       @ add   r7,r7,r3                        @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+       @ ldr   r2,[r1],#4                      @ 5
+# if 5==15
+       str     r1,[sp,#17*4]                   @ make room for r1
+# endif
+       eor     r0,r11,r11,ror#5
+       add     r7,r7,r3                        @ h+=Maj(a,b,c) from the past
+       eor     r0,r0,r11,ror#19        @ Sigma1(e)
+# ifndef __ARMEB__
+       rev     r2,r2
+# endif
+#else
+       @ ldrb  r2,[r1,#3]                      @ 5
+       add     r7,r7,r3                        @ h+=Maj(a,b,c) from the past
+       ldrb    r3,[r1,#2]
+       ldrb    r0,[r1,#1]
+       orr     r2,r2,r3,lsl#8
+       ldrb    r3,[r1],#4
+       orr     r2,r2,r0,lsl#16
+# if 5==15
+       str     r1,[sp,#17*4]                   @ make room for r1
+# endif
+       eor     r0,r11,r11,ror#5
+       orr     r2,r2,r3,lsl#24
+       eor     r0,r0,r11,ror#19        @ Sigma1(e)
+#endif
+       ldr     r3,[r14],#4                     @ *K256++
+       add     r6,r6,r2                        @ h+=X[i]
+       str     r2,[sp,#5*4]
+       eor     r2,r4,r5
+       add     r6,r6,r0,ror#6  @ h+=Sigma1(e)
+       and     r2,r2,r11
+       add     r6,r6,r3                        @ h+=K256[i]
+       eor     r2,r2,r5                        @ Ch(e,f,g)
+       eor     r0,r7,r7,ror#11
+       add     r6,r6,r2                        @ h+=Ch(e,f,g)
+#if 5==31
+       and     r3,r3,#0xff
+       cmp     r3,#0xf2                        @ done?
+#endif
+#if 5<15
+# if __ARM_ARCH__>=7
+       ldr     r2,[r1],#4                      @ prefetch
+# else
+       ldrb    r2,[r1,#3]
+# endif
+       eor     r3,r7,r8                        @ a^b, b^c in next round
+#else
+       ldr     r2,[sp,#7*4]            @ from future BODY_16_xx
+       eor     r3,r7,r8                        @ a^b, b^c in next round
+       ldr     r1,[sp,#4*4]    @ from future BODY_16_xx
+#endif
+       eor     r0,r0,r7,ror#20 @ Sigma0(a)
+       and     r12,r12,r3                      @ (b^c)&=(a^b)
+       add     r10,r10,r6                      @ d+=h
+       eor     r12,r12,r8                      @ Maj(a,b,c)
+       add     r6,r6,r0,ror#2  @ h+=Sigma0(a)
+       @ add   r6,r6,r12                       @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+       @ ldr   r2,[r1],#4                      @ 6
+# if 6==15
+       str     r1,[sp,#17*4]                   @ make room for r1
+# endif
+       eor     r0,r10,r10,ror#5
+       add     r6,r6,r12                       @ h+=Maj(a,b,c) from the past
+       eor     r0,r0,r10,ror#19        @ Sigma1(e)
+# ifndef __ARMEB__
+       rev     r2,r2
+# endif
+#else
+       @ ldrb  r2,[r1,#3]                      @ 6
+       add     r6,r6,r12                       @ h+=Maj(a,b,c) from the past
+       ldrb    r12,[r1,#2]
+       ldrb    r0,[r1,#1]
+       orr     r2,r2,r12,lsl#8
+       ldrb    r12,[r1],#4
+       orr     r2,r2,r0,lsl#16
+# if 6==15
+       str     r1,[sp,#17*4]                   @ make room for r1
+# endif
+       eor     r0,r10,r10,ror#5
+       orr     r2,r2,r12,lsl#24
+       eor     r0,r0,r10,ror#19        @ Sigma1(e)
+#endif
+       ldr     r12,[r14],#4                    @ *K256++
+       add     r5,r5,r2                        @ h+=X[i]
+       str     r2,[sp,#6*4]
+       eor     r2,r11,r4
+       add     r5,r5,r0,ror#6  @ h+=Sigma1(e)
+       and     r2,r2,r10
+       add     r5,r5,r12                       @ h+=K256[i]
+       eor     r2,r2,r4                        @ Ch(e,f,g)
+       eor     r0,r6,r6,ror#11
+       add     r5,r5,r2                        @ h+=Ch(e,f,g)
+#if 6==31
+       and     r12,r12,#0xff
+       cmp     r12,#0xf2                       @ done?
+#endif
+#if 6<15
+# if __ARM_ARCH__>=7
+       ldr     r2,[r1],#4                      @ prefetch
+# else
+       ldrb    r2,[r1,#3]
+# endif
+       eor     r12,r6,r7                       @ a^b, b^c in next round
+#else
+       ldr     r2,[sp,#8*4]            @ from future BODY_16_xx
+       eor     r12,r6,r7                       @ a^b, b^c in next round
+       ldr     r1,[sp,#5*4]    @ from future BODY_16_xx
+#endif
+       eor     r0,r0,r6,ror#20 @ Sigma0(a)
+       and     r3,r3,r12                       @ (b^c)&=(a^b)
+       add     r9,r9,r5                        @ d+=h
+       eor     r3,r3,r7                        @ Maj(a,b,c)
+       add     r5,r5,r0,ror#2  @ h+=Sigma0(a)
+       @ add   r5,r5,r3                        @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+       @ ldr   r2,[r1],#4                      @ 7
+# if 7==15
+       str     r1,[sp,#17*4]                   @ make room for r1
+# endif
+       eor     r0,r9,r9,ror#5
+       add     r5,r5,r3                        @ h+=Maj(a,b,c) from the past
+       eor     r0,r0,r9,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+       rev     r2,r2
+# endif
+#else
+       @ ldrb  r2,[r1,#3]                      @ 7
+       add     r5,r5,r3                        @ h+=Maj(a,b,c) from the past
+       ldrb    r3,[r1,#2]
+       ldrb    r0,[r1,#1]
+       orr     r2,r2,r3,lsl#8
+       ldrb    r3,[r1],#4
+       orr     r2,r2,r0,lsl#16
+# if 7==15
+       str     r1,[sp,#17*4]                   @ make room for r1
+# endif
+       eor     r0,r9,r9,ror#5
+       orr     r2,r2,r3,lsl#24
+       eor     r0,r0,r9,ror#19 @ Sigma1(e)
+#endif
+       ldr     r3,[r14],#4                     @ *K256++
+       add     r4,r4,r2                        @ h+=X[i]
+       str     r2,[sp,#7*4]
+       eor     r2,r10,r11
+       add     r4,r4,r0,ror#6  @ h+=Sigma1(e)
+       and     r2,r2,r9
+       add     r4,r4,r3                        @ h+=K256[i]
+       eor     r2,r2,r11                       @ Ch(e,f,g)
+       eor     r0,r5,r5,ror#11
+       add     r4,r4,r2                        @ h+=Ch(e,f,g)
+#if 7==31
+       and     r3,r3,#0xff
+       cmp     r3,#0xf2                        @ done?
+#endif
+#if 7<15
+# if __ARM_ARCH__>=7
+       ldr     r2,[r1],#4                      @ prefetch
+# else
+       ldrb    r2,[r1,#3]
+# endif
+       eor     r3,r5,r6                        @ a^b, b^c in next round
+#else
+       ldr     r2,[sp,#9*4]            @ from future BODY_16_xx
+       eor     r3,r5,r6                        @ a^b, b^c in next round
+       ldr     r1,[sp,#6*4]    @ from future BODY_16_xx
+#endif
+       eor     r0,r0,r5,ror#20 @ Sigma0(a)
+       and     r12,r12,r3                      @ (b^c)&=(a^b)
+       add     r8,r8,r4                        @ d+=h
+       eor     r12,r12,r6                      @ Maj(a,b,c)
+       add     r4,r4,r0,ror#2  @ h+=Sigma0(a)
+       @ add   r4,r4,r12                       @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+       @ ldr   r2,[r1],#4                      @ 8
+# if 8==15
+       str     r1,[sp,#17*4]                   @ make room for r1
+# endif
+       eor     r0,r8,r8,ror#5
+       add     r4,r4,r12                       @ h+=Maj(a,b,c) from the past
+       eor     r0,r0,r8,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+       rev     r2,r2
+# endif
+#else
+       @ ldrb  r2,[r1,#3]                      @ 8
+       add     r4,r4,r12                       @ h+=Maj(a,b,c) from the past
+       ldrb    r12,[r1,#2]
+       ldrb    r0,[r1,#1]
+       orr     r2,r2,r12,lsl#8
+       ldrb    r12,[r1],#4
+       orr     r2,r2,r0,lsl#16
+# if 8==15
+       str     r1,[sp,#17*4]                   @ make room for r1
+# endif
+       eor     r0,r8,r8,ror#5
+       orr     r2,r2,r12,lsl#24
+       eor     r0,r0,r8,ror#19 @ Sigma1(e)
+#endif
+       ldr     r12,[r14],#4                    @ *K256++
+       add     r11,r11,r2                      @ h+=X[i]
+       str     r2,[sp,#8*4]
+       eor     r2,r9,r10
+       add     r11,r11,r0,ror#6        @ h+=Sigma1(e)
+       and     r2,r2,r8
+       add     r11,r11,r12                     @ h+=K256[i]
+       eor     r2,r2,r10                       @ Ch(e,f,g)
+       eor     r0,r4,r4,ror#11
+       add     r11,r11,r2                      @ h+=Ch(e,f,g)
+#if 8==31
+       and     r12,r12,#0xff
+       cmp     r12,#0xf2                       @ done?
+#endif
+#if 8<15
+# if __ARM_ARCH__>=7
+       ldr     r2,[r1],#4                      @ prefetch
+# else
+       ldrb    r2,[r1,#3]
+# endif
+       eor     r12,r4,r5                       @ a^b, b^c in next round
+#else
+       ldr     r2,[sp,#10*4]           @ from future BODY_16_xx
+       eor     r12,r4,r5                       @ a^b, b^c in next round
+       ldr     r1,[sp,#7*4]    @ from future BODY_16_xx
+#endif
+       eor     r0,r0,r4,ror#20 @ Sigma0(a)
+       and     r3,r3,r12                       @ (b^c)&=(a^b)
+       add     r7,r7,r11                       @ d+=h
+       eor     r3,r3,r5                        @ Maj(a,b,c)
+       add     r11,r11,r0,ror#2        @ h+=Sigma0(a)
+       @ add   r11,r11,r3                      @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+       @ ldr   r2,[r1],#4                      @ 9
+# if 9==15
+       str     r1,[sp,#17*4]                   @ make room for r1
+# endif
+       eor     r0,r7,r7,ror#5
+       add     r11,r11,r3                      @ h+=Maj(a,b,c) from the past
+       eor     r0,r0,r7,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+       rev     r2,r2
+# endif
+#else
+       @ ldrb  r2,[r1,#3]                      @ 9
+       add     r11,r11,r3                      @ h+=Maj(a,b,c) from the past
+       ldrb    r3,[r1,#2]
+       ldrb    r0,[r1,#1]
+       orr     r2,r2,r3,lsl#8
+       ldrb    r3,[r1],#4
+       orr     r2,r2,r0,lsl#16
+# if 9==15
+       str     r1,[sp,#17*4]                   @ make room for r1
+# endif
+       eor     r0,r7,r7,ror#5
+       orr     r2,r2,r3,lsl#24
+       eor     r0,r0,r7,ror#19 @ Sigma1(e)
+#endif
+       ldr     r3,[r14],#4                     @ *K256++
+       add     r10,r10,r2                      @ h+=X[i]
+       str     r2,[sp,#9*4]
+       eor     r2,r8,r9
+       add     r10,r10,r0,ror#6        @ h+=Sigma1(e)
+       and     r2,r2,r7
+       add     r10,r10,r3                      @ h+=K256[i]
+       eor     r2,r2,r9                        @ Ch(e,f,g)
+       eor     r0,r11,r11,ror#11
+       add     r10,r10,r2                      @ h+=Ch(e,f,g)
+#if 9==31
+       and     r3,r3,#0xff
+       cmp     r3,#0xf2                        @ done?
+#endif
+#if 9<15
+# if __ARM_ARCH__>=7
+       ldr     r2,[r1],#4                      @ prefetch
+# else
+       ldrb    r2,[r1,#3]
+# endif
+       eor     r3,r11,r4                       @ a^b, b^c in next round
+#else
+       ldr     r2,[sp,#11*4]           @ from future BODY_16_xx
+       eor     r3,r11,r4                       @ a^b, b^c in next round
+       ldr     r1,[sp,#8*4]    @ from future BODY_16_xx
+#endif
+       eor     r0,r0,r11,ror#20        @ Sigma0(a)
+       and     r12,r12,r3                      @ (b^c)&=(a^b)
+       add     r6,r6,r10                       @ d+=h
+       eor     r12,r12,r4                      @ Maj(a,b,c)
+       add     r10,r10,r0,ror#2        @ h+=Sigma0(a)
+       @ add   r10,r10,r12                     @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+       @ ldr   r2,[r1],#4                      @ 10
+# if 10==15
+       str     r1,[sp,#17*4]                   @ make room for r1
+# endif
+       eor     r0,r6,r6,ror#5
+       add     r10,r10,r12                     @ h+=Maj(a,b,c) from the past
+       eor     r0,r0,r6,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+       rev     r2,r2
+# endif
+#else
+       @ ldrb  r2,[r1,#3]                      @ 10
+       add     r10,r10,r12                     @ h+=Maj(a,b,c) from the past
+       ldrb    r12,[r1,#2]
+       ldrb    r0,[r1,#1]
+       orr     r2,r2,r12,lsl#8
+       ldrb    r12,[r1],#4
+       orr     r2,r2,r0,lsl#16
+# if 10==15
+       str     r1,[sp,#17*4]                   @ make room for r1
+# endif
+       eor     r0,r6,r6,ror#5
+       orr     r2,r2,r12,lsl#24
+       eor     r0,r0,r6,ror#19 @ Sigma1(e)
+#endif
+       ldr     r12,[r14],#4                    @ *K256++
+       add     r9,r9,r2                        @ h+=X[i]
+       str     r2,[sp,#10*4]
+       eor     r2,r7,r8
+       add     r9,r9,r0,ror#6  @ h+=Sigma1(e)
+       and     r2,r2,r6
+       add     r9,r9,r12                       @ h+=K256[i]
+       eor     r2,r2,r8                        @ Ch(e,f,g)
+       eor     r0,r10,r10,ror#11
+       add     r9,r9,r2                        @ h+=Ch(e,f,g)
+#if 10==31
+       and     r12,r12,#0xff
+       cmp     r12,#0xf2                       @ done?
+#endif
+#if 10<15
+# if __ARM_ARCH__>=7
+       ldr     r2,[r1],#4                      @ prefetch
+# else
+       ldrb    r2,[r1,#3]
+# endif
+       eor     r12,r10,r11                     @ a^b, b^c in next round
+#else
+       ldr     r2,[sp,#12*4]           @ from future BODY_16_xx
+       eor     r12,r10,r11                     @ a^b, b^c in next round
+       ldr     r1,[sp,#9*4]    @ from future BODY_16_xx
+#endif
+       eor     r0,r0,r10,ror#20        @ Sigma0(a)
+       and     r3,r3,r12                       @ (b^c)&=(a^b)
+       add     r5,r5,r9                        @ d+=h
+       eor     r3,r3,r11                       @ Maj(a,b,c)
+       add     r9,r9,r0,ror#2  @ h+=Sigma0(a)
+       @ add   r9,r9,r3                        @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+       @ ldr   r2,[r1],#4                      @ 11
+# if 11==15
+       str     r1,[sp,#17*4]                   @ make room for r1
+# endif
+       eor     r0,r5,r5,ror#5
+       add     r9,r9,r3                        @ h+=Maj(a,b,c) from the past
+       eor     r0,r0,r5,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+       rev     r2,r2
+# endif
+#else
+       @ ldrb  r2,[r1,#3]                      @ 11
+       add     r9,r9,r3                        @ h+=Maj(a,b,c) from the past
+       ldrb    r3,[r1,#2]
+       ldrb    r0,[r1,#1]
+       orr     r2,r2,r3,lsl#8
+       ldrb    r3,[r1],#4
+       orr     r2,r2,r0,lsl#16
+# if 11==15
+       str     r1,[sp,#17*4]                   @ make room for r1
+# endif
+       eor     r0,r5,r5,ror#5
+       orr     r2,r2,r3,lsl#24
+       eor     r0,r0,r5,ror#19 @ Sigma1(e)
+#endif
+       ldr     r3,[r14],#4                     @ *K256++
+       add     r8,r8,r2                        @ h+=X[i]
+       str     r2,[sp,#11*4]
+       eor     r2,r6,r7
+       add     r8,r8,r0,ror#6  @ h+=Sigma1(e)
+       and     r2,r2,r5
+       add     r8,r8,r3                        @ h+=K256[i]
+       eor     r2,r2,r7                        @ Ch(e,f,g)
+       eor     r0,r9,r9,ror#11
+       add     r8,r8,r2                        @ h+=Ch(e,f,g)
+#if 11==31
+       and     r3,r3,#0xff
+       cmp     r3,#0xf2                        @ done?
+#endif
+#if 11<15
+# if __ARM_ARCH__>=7
+       ldr     r2,[r1],#4                      @ prefetch
+# else
+       ldrb    r2,[r1,#3]
+# endif
+       eor     r3,r9,r10                       @ a^b, b^c in next round
+#else
+       ldr     r2,[sp,#13*4]           @ from future BODY_16_xx
+       eor     r3,r9,r10                       @ a^b, b^c in next round
+       ldr     r1,[sp,#10*4]   @ from future BODY_16_xx
+#endif
+       eor     r0,r0,r9,ror#20 @ Sigma0(a)
+       and     r12,r12,r3                      @ (b^c)&=(a^b)
+       add     r4,r4,r8                        @ d+=h
+       eor     r12,r12,r10                     @ Maj(a,b,c)
+       add     r8,r8,r0,ror#2  @ h+=Sigma0(a)
+       @ add   r8,r8,r12                       @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+       @ ldr   r2,[r1],#4                      @ 12
+# if 12==15
+       str     r1,[sp,#17*4]                   @ make room for r1
+# endif
+       eor     r0,r4,r4,ror#5
+       add     r8,r8,r12                       @ h+=Maj(a,b,c) from the past
+       eor     r0,r0,r4,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+       rev     r2,r2
+# endif
+#else
+       @ ldrb  r2,[r1,#3]                      @ 12
+       add     r8,r8,r12                       @ h+=Maj(a,b,c) from the past
+       ldrb    r12,[r1,#2]
+       ldrb    r0,[r1,#1]
+       orr     r2,r2,r12,lsl#8
+       ldrb    r12,[r1],#4
+       orr     r2,r2,r0,lsl#16
+# if 12==15
+       str     r1,[sp,#17*4]                   @ make room for r1
+# endif
+       eor     r0,r4,r4,ror#5
+       orr     r2,r2,r12,lsl#24
+       eor     r0,r0,r4,ror#19 @ Sigma1(e)
+#endif
+       ldr     r12,[r14],#4                    @ *K256++
+       add     r7,r7,r2                        @ h+=X[i]
+       str     r2,[sp,#12*4]
+       eor     r2,r5,r6
+       add     r7,r7,r0,ror#6  @ h+=Sigma1(e)
+       and     r2,r2,r4
+       add     r7,r7,r12                       @ h+=K256[i]
+       eor     r2,r2,r6                        @ Ch(e,f,g)
+       eor     r0,r8,r8,ror#11
+       add     r7,r7,r2                        @ h+=Ch(e,f,g)
+#if 12==31
+       and     r12,r12,#0xff
+       cmp     r12,#0xf2                       @ done?
+#endif
+#if 12<15
+# if __ARM_ARCH__>=7
+       ldr     r2,[r1],#4                      @ prefetch
+# else
+       ldrb    r2,[r1,#3]
+# endif
+       eor     r12,r8,r9                       @ a^b, b^c in next round
+#else
+       ldr     r2,[sp,#14*4]           @ from future BODY_16_xx
+       eor     r12,r8,r9                       @ a^b, b^c in next round
+       ldr     r1,[sp,#11*4]   @ from future BODY_16_xx
+#endif
+       eor     r0,r0,r8,ror#20 @ Sigma0(a)
+       and     r3,r3,r12                       @ (b^c)&=(a^b)
+       add     r11,r11,r7                      @ d+=h
+       eor     r3,r3,r9                        @ Maj(a,b,c)
+       add     r7,r7,r0,ror#2  @ h+=Sigma0(a)
+       @ add   r7,r7,r3                        @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+       @ ldr   r2,[r1],#4                      @ 13
+# if 13==15
+       str     r1,[sp,#17*4]                   @ make room for r1
+# endif
+       eor     r0,r11,r11,ror#5
+       add     r7,r7,r3                        @ h+=Maj(a,b,c) from the past
+       eor     r0,r0,r11,ror#19        @ Sigma1(e)
+# ifndef __ARMEB__
+       rev     r2,r2
+# endif
+#else
+       @ ldrb  r2,[r1,#3]                      @ 13
+       add     r7,r7,r3                        @ h+=Maj(a,b,c) from the past
+       ldrb    r3,[r1,#2]
+       ldrb    r0,[r1,#1]
+       orr     r2,r2,r3,lsl#8
+       ldrb    r3,[r1],#4
+       orr     r2,r2,r0,lsl#16
+# if 13==15
+       str     r1,[sp,#17*4]                   @ make room for r1
+# endif
+       eor     r0,r11,r11,ror#5
+       orr     r2,r2,r3,lsl#24
+       eor     r0,r0,r11,ror#19        @ Sigma1(e)
+#endif
+       ldr     r3,[r14],#4                     @ *K256++
+       add     r6,r6,r2                        @ h+=X[i]
+       str     r2,[sp,#13*4]
+       eor     r2,r4,r5
+       add     r6,r6,r0,ror#6  @ h+=Sigma1(e)
+       and     r2,r2,r11
+       add     r6,r6,r3                        @ h+=K256[i]
+       eor     r2,r2,r5                        @ Ch(e,f,g)
+       eor     r0,r7,r7,ror#11
+       add     r6,r6,r2                        @ h+=Ch(e,f,g)
+#if 13==31
+       and     r3,r3,#0xff
+       cmp     r3,#0xf2                        @ done?
+#endif
+#if 13<15
+# if __ARM_ARCH__>=7
+       ldr     r2,[r1],#4                      @ prefetch
+# else
+       ldrb    r2,[r1,#3]
+# endif
+       eor     r3,r7,r8                        @ a^b, b^c in next round
+#else
+       ldr     r2,[sp,#15*4]           @ from future BODY_16_xx
+       eor     r3,r7,r8                        @ a^b, b^c in next round
+       ldr     r1,[sp,#12*4]   @ from future BODY_16_xx
+#endif
+       eor     r0,r0,r7,ror#20 @ Sigma0(a)
+       and     r12,r12,r3                      @ (b^c)&=(a^b)
+       add     r10,r10,r6                      @ d+=h
+       eor     r12,r12,r8                      @ Maj(a,b,c)
+       add     r6,r6,r0,ror#2  @ h+=Sigma0(a)
+       @ add   r6,r6,r12                       @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+       @ ldr   r2,[r1],#4                      @ 14
+# if 14==15
+       str     r1,[sp,#17*4]                   @ make room for r1
+# endif
+       eor     r0,r10,r10,ror#5
+       add     r6,r6,r12                       @ h+=Maj(a,b,c) from the past
+       eor     r0,r0,r10,ror#19        @ Sigma1(e)
+# ifndef __ARMEB__
+       rev     r2,r2
+# endif
+#else
+       @ ldrb  r2,[r1,#3]                      @ 14
+       add     r6,r6,r12                       @ h+=Maj(a,b,c) from the past
+       ldrb    r12,[r1,#2]
+       ldrb    r0,[r1,#1]
+       orr     r2,r2,r12,lsl#8
+       ldrb    r12,[r1],#4
+       orr     r2,r2,r0,lsl#16
+# if 14==15
+       str     r1,[sp,#17*4]                   @ make room for r1
+# endif
+       eor     r0,r10,r10,ror#5
+       orr     r2,r2,r12,lsl#24
+       eor     r0,r0,r10,ror#19        @ Sigma1(e)
+#endif
+       ldr     r12,[r14],#4                    @ *K256++
+       add     r5,r5,r2                        @ h+=X[i]
+       str     r2,[sp,#14*4]
+       eor     r2,r11,r4
+       add     r5,r5,r0,ror#6  @ h+=Sigma1(e)
+       and     r2,r2,r10
+       add     r5,r5,r12                       @ h+=K256[i]
+       eor     r2,r2,r4                        @ Ch(e,f,g)
+       eor     r0,r6,r6,ror#11
+       add     r5,r5,r2                        @ h+=Ch(e,f,g)
+#if 14==31
+       and     r12,r12,#0xff
+       cmp     r12,#0xf2                       @ done?
+#endif
+#if 14<15
+# if __ARM_ARCH__>=7
+       ldr     r2,[r1],#4                      @ prefetch
+# else
+       ldrb    r2,[r1,#3]
+# endif
+       eor     r12,r6,r7                       @ a^b, b^c in next round
+#else
+       ldr     r2,[sp,#0*4]            @ from future BODY_16_xx
+       eor     r12,r6,r7                       @ a^b, b^c in next round
+       ldr     r1,[sp,#13*4]   @ from future BODY_16_xx
+#endif
+       eor     r0,r0,r6,ror#20 @ Sigma0(a)
+       and     r3,r3,r12                       @ (b^c)&=(a^b)
+       add     r9,r9,r5                        @ d+=h
+       eor     r3,r3,r7                        @ Maj(a,b,c)
+       add     r5,r5,r0,ror#2  @ h+=Sigma0(a)
+       @ add   r5,r5,r3                        @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+       @ ldr   r2,[r1],#4                      @ 15
+# if 15==15
+       str     r1,[sp,#17*4]                   @ make room for r1
+# endif
+       eor     r0,r9,r9,ror#5
+       add     r5,r5,r3                        @ h+=Maj(a,b,c) from the past
+       eor     r0,r0,r9,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+       rev     r2,r2
+# endif
+#else
+       @ ldrb  r2,[r1,#3]                      @ 15
+       add     r5,r5,r3                        @ h+=Maj(a,b,c) from the past
+       ldrb    r3,[r1,#2]
+       ldrb    r0,[r1,#1]
+       orr     r2,r2,r3,lsl#8
+       ldrb    r3,[r1],#4
+       orr     r2,r2,r0,lsl#16
+# if 15==15
+       str     r1,[sp,#17*4]                   @ make room for r1
+# endif
+       eor     r0,r9,r9,ror#5
+       orr     r2,r2,r3,lsl#24
+       eor     r0,r0,r9,ror#19 @ Sigma1(e)
+#endif
+       ldr     r3,[r14],#4                     @ *K256++
+       add     r4,r4,r2                        @ h+=X[i]
+       str     r2,[sp,#15*4]
+       eor     r2,r10,r11
+       add     r4,r4,r0,ror#6  @ h+=Sigma1(e)
+       and     r2,r2,r9
+       add     r4,r4,r3                        @ h+=K256[i]
+       eor     r2,r2,r11                       @ Ch(e,f,g)
+       eor     r0,r5,r5,ror#11
+       add     r4,r4,r2                        @ h+=Ch(e,f,g)
+#if 15==31
+       and     r3,r3,#0xff
+       cmp     r3,#0xf2                        @ done?
+#endif
+#if 15<15
+# if __ARM_ARCH__>=7
+       ldr     r2,[r1],#4                      @ prefetch
+# else
+       ldrb    r2,[r1,#3]
+# endif
+       eor     r3,r5,r6                        @ a^b, b^c in next round
+#else
+       ldr     r2,[sp,#1*4]            @ from future BODY_16_xx
+       eor     r3,r5,r6                        @ a^b, b^c in next round
+       ldr     r1,[sp,#14*4]   @ from future BODY_16_xx
+#endif
+       eor     r0,r0,r5,ror#20 @ Sigma0(a)
+       and     r12,r12,r3                      @ (b^c)&=(a^b)
+       add     r8,r8,r4                        @ d+=h
+       eor     r12,r12,r6                      @ Maj(a,b,c)
+       add     r4,r4,r0,ror#2  @ h+=Sigma0(a)
+       @ add   r4,r4,r12                       @ h+=Maj(a,b,c)
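+
+       @ Rounds 16..63: the round function above is repeated, but X[i] is
+       @ now derived in place from the 16-word window on the stack:
+       @ X[i] += sigma0(X[i+1]) + sigma1(X[i+14]) + X[i+9].  The loop
+       @ exits once the low byte of the K256 word just fetched is 0xf2,
+       @ i.e. the final constant 0xc67178f2 has been consumed.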
+.Lrounds_16_xx:
+       @ ldr   r2,[sp,#1*4]            @ 16
+       @ ldr   r1,[sp,#14*4]
+       mov     r0,r2,ror#7
+       add     r4,r4,r12                       @ h+=Maj(a,b,c) from the past
+       mov     r12,r1,ror#17
+       eor     r0,r0,r2,ror#18
+       eor     r12,r12,r1,ror#19
+       eor     r0,r0,r2,lsr#3  @ sigma0(X[i+1])
+       ldr     r2,[sp,#0*4]
+       eor     r12,r12,r1,lsr#10       @ sigma1(X[i+14])
+       ldr     r1,[sp,#9*4]
+
+       add     r12,r12,r0
+       eor     r0,r8,r8,ror#5  @ from BODY_00_15
+       add     r2,r2,r12
+       eor     r0,r0,r8,ror#19 @ Sigma1(e)
+       add     r2,r2,r1                        @ X[i]
+       ldr     r12,[r14],#4                    @ *K256++
+       add     r11,r11,r2                      @ h+=X[i]
+       str     r2,[sp,#0*4]
+       eor     r2,r9,r10
+       add     r11,r11,r0,ror#6        @ h+=Sigma1(e)
+       and     r2,r2,r8
+       add     r11,r11,r12                     @ h+=K256[i]
+       eor     r2,r2,r10                       @ Ch(e,f,g)
+       eor     r0,r4,r4,ror#11
+       add     r11,r11,r2                      @ h+=Ch(e,f,g)
+#if 16==31
+       and     r12,r12,#0xff
+       cmp     r12,#0xf2                       @ done?
+#endif
+#if 16<15
+# if __ARM_ARCH__>=7
+       ldr     r2,[r1],#4                      @ prefetch
+# else
+       ldrb    r2,[r1,#3]
+# endif
+       eor     r12,r4,r5                       @ a^b, b^c in next round
+#else
+       ldr     r2,[sp,#2*4]            @ from future BODY_16_xx
+       eor     r12,r4,r5                       @ a^b, b^c in next round
+       ldr     r1,[sp,#15*4]   @ from future BODY_16_xx
+#endif
+       eor     r0,r0,r4,ror#20 @ Sigma0(a)
+       and     r3,r3,r12                       @ (b^c)&=(a^b)
+       add     r7,r7,r11                       @ d+=h
+       eor     r3,r3,r5                        @ Maj(a,b,c)
+       add     r11,r11,r0,ror#2        @ h+=Sigma0(a)
+       @ add   r11,r11,r3                      @ h+=Maj(a,b,c)
+       @ ldr   r2,[sp,#2*4]            @ 17
+       @ ldr   r1,[sp,#15*4]
+       mov     r0,r2,ror#7
+       add     r11,r11,r3                      @ h+=Maj(a,b,c) from the past
+       mov     r3,r1,ror#17
+       eor     r0,r0,r2,ror#18
+       eor     r3,r3,r1,ror#19
+       eor     r0,r0,r2,lsr#3  @ sigma0(X[i+1])
+       ldr     r2,[sp,#1*4]
+       eor     r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+       ldr     r1,[sp,#10*4]
+
+       add     r3,r3,r0
+       eor     r0,r7,r7,ror#5  @ from BODY_00_15
+       add     r2,r2,r3
+       eor     r0,r0,r7,ror#19 @ Sigma1(e)
+       add     r2,r2,r1                        @ X[i]
+       ldr     r3,[r14],#4                     @ *K256++
+       add     r10,r10,r2                      @ h+=X[i]
+       str     r2,[sp,#1*4]
+       eor     r2,r8,r9
+       add     r10,r10,r0,ror#6        @ h+=Sigma1(e)
+       and     r2,r2,r7
+       add     r10,r10,r3                      @ h+=K256[i]
+       eor     r2,r2,r9                        @ Ch(e,f,g)
+       eor     r0,r11,r11,ror#11
+       add     r10,r10,r2                      @ h+=Ch(e,f,g)
+#if 17==31
+       and     r3,r3,#0xff
+       cmp     r3,#0xf2                        @ done?
+#endif
+#if 17<15
+# if __ARM_ARCH__>=7
+       ldr     r2,[r1],#4                      @ prefetch
+# else
+       ldrb    r2,[r1,#3]
+# endif
+       eor     r3,r11,r4                       @ a^b, b^c in next round
+#else
+       ldr     r2,[sp,#3*4]            @ from future BODY_16_xx
+       eor     r3,r11,r4                       @ a^b, b^c in next round
+       ldr     r1,[sp,#0*4]    @ from future BODY_16_xx
+#endif
+       eor     r0,r0,r11,ror#20        @ Sigma0(a)
+       and     r12,r12,r3                      @ (b^c)&=(a^b)
+       add     r6,r6,r10                       @ d+=h
+       eor     r12,r12,r4                      @ Maj(a,b,c)
+       add     r10,r10,r0,ror#2        @ h+=Sigma0(a)
+       @ add   r10,r10,r12                     @ h+=Maj(a,b,c)
+       @ ldr   r2,[sp,#3*4]            @ 18
+       @ ldr   r1,[sp,#0*4]
+       mov     r0,r2,ror#7
+       add     r10,r10,r12                     @ h+=Maj(a,b,c) from the past
+       mov     r12,r1,ror#17
+       eor     r0,r0,r2,ror#18
+       eor     r12,r12,r1,ror#19
+       eor     r0,r0,r2,lsr#3  @ sigma0(X[i+1])
+       ldr     r2,[sp,#2*4]
+       eor     r12,r12,r1,lsr#10       @ sigma1(X[i+14])
+       ldr     r1,[sp,#11*4]
+
+       add     r12,r12,r0
+       eor     r0,r6,r6,ror#5  @ from BODY_00_15
+       add     r2,r2,r12
+       eor     r0,r0,r6,ror#19 @ Sigma1(e)
+       add     r2,r2,r1                        @ X[i]
+       ldr     r12,[r14],#4                    @ *K256++
+       add     r9,r9,r2                        @ h+=X[i]
+       str     r2,[sp,#2*4]
+       eor     r2,r7,r8
+       add     r9,r9,r0,ror#6  @ h+=Sigma1(e)
+       and     r2,r2,r6
+       add     r9,r9,r12                       @ h+=K256[i]
+       eor     r2,r2,r8                        @ Ch(e,f,g)
+       eor     r0,r10,r10,ror#11
+       add     r9,r9,r2                        @ h+=Ch(e,f,g)
+#if 18==31
+       and     r12,r12,#0xff
+       cmp     r12,#0xf2                       @ done?
+#endif
+#if 18<15
+# if __ARM_ARCH__>=7
+       ldr     r2,[r1],#4                      @ prefetch
+# else
+       ldrb    r2,[r1,#3]
+# endif
+       eor     r12,r10,r11                     @ a^b, b^c in next round
+#else
+       ldr     r2,[sp,#4*4]            @ from future BODY_16_xx
+       eor     r12,r10,r11                     @ a^b, b^c in next round
+       ldr     r1,[sp,#1*4]    @ from future BODY_16_xx
+#endif
+       eor     r0,r0,r10,ror#20        @ Sigma0(a)
+       and     r3,r3,r12                       @ (b^c)&=(a^b)
+       add     r5,r5,r9                        @ d+=h
+       eor     r3,r3,r11                       @ Maj(a,b,c)
+       add     r9,r9,r0,ror#2  @ h+=Sigma0(a)
+       @ add   r9,r9,r3                        @ h+=Maj(a,b,c)
+       @ ldr   r2,[sp,#4*4]            @ 19
+       @ ldr   r1,[sp,#1*4]
+       mov     r0,r2,ror#7
+       add     r9,r9,r3                        @ h+=Maj(a,b,c) from the past
+       mov     r3,r1,ror#17
+       eor     r0,r0,r2,ror#18
+       eor     r3,r3,r1,ror#19
+       eor     r0,r0,r2,lsr#3  @ sigma0(X[i+1])
+       ldr     r2,[sp,#3*4]
+       eor     r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+       ldr     r1,[sp,#12*4]
+
+       add     r3,r3,r0
+       eor     r0,r5,r5,ror#5  @ from BODY_00_15
+       add     r2,r2,r3
+       eor     r0,r0,r5,ror#19 @ Sigma1(e)
+       add     r2,r2,r1                        @ X[i]
+       ldr     r3,[r14],#4                     @ *K256++
+       add     r8,r8,r2                        @ h+=X[i]
+       str     r2,[sp,#3*4]
+       eor     r2,r6,r7
+       add     r8,r8,r0,ror#6  @ h+=Sigma1(e)
+       and     r2,r2,r5
+       add     r8,r8,r3                        @ h+=K256[i]
+       eor     r2,r2,r7                        @ Ch(e,f,g)
+       eor     r0,r9,r9,ror#11
+       add     r8,r8,r2                        @ h+=Ch(e,f,g)
+#if 19==31
+       and     r3,r3,#0xff
+       cmp     r3,#0xf2                        @ done?
+#endif
+#if 19<15
+# if __ARM_ARCH__>=7
+       ldr     r2,[r1],#4                      @ prefetch
+# else
+       ldrb    r2,[r1,#3]
+# endif
+       eor     r3,r9,r10                       @ a^b, b^c in next round
+#else
+       ldr     r2,[sp,#5*4]            @ from future BODY_16_xx
+       eor     r3,r9,r10                       @ a^b, b^c in next round
+       ldr     r1,[sp,#2*4]    @ from future BODY_16_xx
+#endif
+       eor     r0,r0,r9,ror#20 @ Sigma0(a)
+       and     r12,r12,r3                      @ (b^c)&=(a^b)
+       add     r4,r4,r8                        @ d+=h
+       eor     r12,r12,r10                     @ Maj(a,b,c)
+       add     r8,r8,r0,ror#2  @ h+=Sigma0(a)
+       @ add   r8,r8,r12                       @ h+=Maj(a,b,c)
+       @ ldr   r2,[sp,#5*4]            @ 20
+       @ ldr   r1,[sp,#2*4]
+       mov     r0,r2,ror#7
+       add     r8,r8,r12                       @ h+=Maj(a,b,c) from the past
+       mov     r12,r1,ror#17
+       eor     r0,r0,r2,ror#18
+       eor     r12,r12,r1,ror#19
+       eor     r0,r0,r2,lsr#3  @ sigma0(X[i+1])
+       ldr     r2,[sp,#4*4]
+       eor     r12,r12,r1,lsr#10       @ sigma1(X[i+14])
+       ldr     r1,[sp,#13*4]
+
+       add     r12,r12,r0
+       eor     r0,r4,r4,ror#5  @ from BODY_00_15
+       add     r2,r2,r12
+       eor     r0,r0,r4,ror#19 @ Sigma1(e)
+       add     r2,r2,r1                        @ X[i]
+       ldr     r12,[r14],#4                    @ *K256++
+       add     r7,r7,r2                        @ h+=X[i]
+       str     r2,[sp,#4*4]
+       eor     r2,r5,r6
+       add     r7,r7,r0,ror#6  @ h+=Sigma1(e)
+       and     r2,r2,r4
+       add     r7,r7,r12                       @ h+=K256[i]
+       eor     r2,r2,r6                        @ Ch(e,f,g)
+       eor     r0,r8,r8,ror#11
+       add     r7,r7,r2                        @ h+=Ch(e,f,g)
+#if 20==31
+       and     r12,r12,#0xff
+       cmp     r12,#0xf2                       @ done?
+#endif
+#if 20<15
+# if __ARM_ARCH__>=7
+       ldr     r2,[r1],#4                      @ prefetch
+# else
+       ldrb    r2,[r1,#3]
+# endif
+       eor     r12,r8,r9                       @ a^b, b^c in next round
+#else
+       ldr     r2,[sp,#6*4]            @ from future BODY_16_xx
+       eor     r12,r8,r9                       @ a^b, b^c in next round
+       ldr     r1,[sp,#3*4]    @ from future BODY_16_xx
+#endif
+       eor     r0,r0,r8,ror#20 @ Sigma0(a)
+       and     r3,r3,r12                       @ (b^c)&=(a^b)
+       add     r11,r11,r7                      @ d+=h
+       eor     r3,r3,r9                        @ Maj(a,b,c)
+       add     r7,r7,r0,ror#2  @ h+=Sigma0(a)
+       @ add   r7,r7,r3                        @ h+=Maj(a,b,c)
+       @ ldr   r2,[sp,#6*4]            @ 21
+       @ ldr   r1,[sp,#3*4]
+       mov     r0,r2,ror#7
+       add     r7,r7,r3                        @ h+=Maj(a,b,c) from the past
+       mov     r3,r1,ror#17
+       eor     r0,r0,r2,ror#18
+       eor     r3,r3,r1,ror#19
+       eor     r0,r0,r2,lsr#3  @ sigma0(X[i+1])
+       ldr     r2,[sp,#5*4]
+       eor     r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+       ldr     r1,[sp,#14*4]
+
+       add     r3,r3,r0
+       eor     r0,r11,r11,ror#5        @ from BODY_00_15
+       add     r2,r2,r3
+       eor     r0,r0,r11,ror#19        @ Sigma1(e)
+       add     r2,r2,r1                        @ X[i]
+       ldr     r3,[r14],#4                     @ *K256++
+       add     r6,r6,r2                        @ h+=X[i]
+       str     r2,[sp,#5*4]
+       eor     r2,r4,r5
+       add     r6,r6,r0,ror#6  @ h+=Sigma1(e)
+       and     r2,r2,r11
+       add     r6,r6,r3                        @ h+=K256[i]
+       eor     r2,r2,r5                        @ Ch(e,f,g)
+       eor     r0,r7,r7,ror#11
+       add     r6,r6,r2                        @ h+=Ch(e,f,g)
+#if 21==31
+       and     r3,r3,#0xff
+       cmp     r3,#0xf2                        @ done?
+#endif
+#if 21<15
+# if __ARM_ARCH__>=7
+       ldr     r2,[r1],#4                      @ prefetch
+# else
+       ldrb    r2,[r1,#3]
+# endif
+       eor     r3,r7,r8                        @ a^b, b^c in next round
+#else
+       ldr     r2,[sp,#7*4]            @ from future BODY_16_xx
+       eor     r3,r7,r8                        @ a^b, b^c in next round
+       ldr     r1,[sp,#4*4]    @ from future BODY_16_xx
+#endif
+       eor     r0,r0,r7,ror#20 @ Sigma0(a)
+       and     r12,r12,r3                      @ (b^c)&=(a^b)
+       add     r10,r10,r6                      @ d+=h
+       eor     r12,r12,r8                      @ Maj(a,b,c)
+       add     r6,r6,r0,ror#2  @ h+=Sigma0(a)
+       @ add   r6,r6,r12                       @ h+=Maj(a,b,c)
+       @ ldr   r2,[sp,#7*4]            @ 22
+       @ ldr   r1,[sp,#4*4]
+       mov     r0,r2,ror#7
+       add     r6,r6,r12                       @ h+=Maj(a,b,c) from the past
+       mov     r12,r1,ror#17
+       eor     r0,r0,r2,ror#18
+       eor     r12,r12,r1,ror#19
+       eor     r0,r0,r2,lsr#3  @ sigma0(X[i+1])
+       ldr     r2,[sp,#6*4]
+       eor     r12,r12,r1,lsr#10       @ sigma1(X[i+14])
+       ldr     r1,[sp,#15*4]
+
+       add     r12,r12,r0
+       eor     r0,r10,r10,ror#5        @ from BODY_00_15
+       add     r2,r2,r12
+       eor     r0,r0,r10,ror#19        @ Sigma1(e)
+       add     r2,r2,r1                        @ X[i]
+       ldr     r12,[r14],#4                    @ *K256++
+       add     r5,r5,r2                        @ h+=X[i]
+       str     r2,[sp,#6*4]
+       eor     r2,r11,r4
+       add     r5,r5,r0,ror#6  @ h+=Sigma1(e)
+       and     r2,r2,r10
+       add     r5,r5,r12                       @ h+=K256[i]
+       eor     r2,r2,r4                        @ Ch(e,f,g)
+       eor     r0,r6,r6,ror#11
+       add     r5,r5,r2                        @ h+=Ch(e,f,g)
+#if 22==31
+       and     r12,r12,#0xff
+       cmp     r12,#0xf2                       @ done?
+#endif
+#if 22<15
+# if __ARM_ARCH__>=7
+       ldr     r2,[r1],#4                      @ prefetch
+# else
+       ldrb    r2,[r1,#3]
+# endif
+       eor     r12,r6,r7                       @ a^b, b^c in next round
+#else
+       ldr     r2,[sp,#8*4]            @ from future BODY_16_xx
+       eor     r12,r6,r7                       @ a^b, b^c in next round
+       ldr     r1,[sp,#5*4]    @ from future BODY_16_xx
+#endif
+       eor     r0,r0,r6,ror#20 @ Sigma0(a)
+       and     r3,r3,r12                       @ (b^c)&=(a^b)
+       add     r9,r9,r5                        @ d+=h
+       eor     r3,r3,r7                        @ Maj(a,b,c)
+       add     r5,r5,r0,ror#2  @ h+=Sigma0(a)
+       @ add   r5,r5,r3                        @ h+=Maj(a,b,c)
+       @ ldr   r2,[sp,#8*4]            @ 23
+       @ ldr   r1,[sp,#5*4]
+       mov     r0,r2,ror#7
+       add     r5,r5,r3                        @ h+=Maj(a,b,c) from the past
+       mov     r3,r1,ror#17
+       eor     r0,r0,r2,ror#18
+       eor     r3,r3,r1,ror#19
+       eor     r0,r0,r2,lsr#3  @ sigma0(X[i+1])
+       ldr     r2,[sp,#7*4]
+       eor     r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+       ldr     r1,[sp,#0*4]
+
+       add     r3,r3,r0
+       eor     r0,r9,r9,ror#5  @ from BODY_00_15
+       add     r2,r2,r3
+       eor     r0,r0,r9,ror#19 @ Sigma1(e)
+       add     r2,r2,r1                        @ X[i]
+       ldr     r3,[r14],#4                     @ *K256++
+       add     r4,r4,r2                        @ h+=X[i]
+       str     r2,[sp,#7*4]
+       eor     r2,r10,r11
+       add     r4,r4,r0,ror#6  @ h+=Sigma1(e)
+       and     r2,r2,r9
+       add     r4,r4,r3                        @ h+=K256[i]
+       eor     r2,r2,r11                       @ Ch(e,f,g)
+       eor     r0,r5,r5,ror#11
+       add     r4,r4,r2                        @ h+=Ch(e,f,g)
+#if 23==31
+       and     r3,r3,#0xff
+       cmp     r3,#0xf2                        @ done?
+#endif
+#if 23<15
+# if __ARM_ARCH__>=7
+       ldr     r2,[r1],#4                      @ prefetch
+# else
+       ldrb    r2,[r1,#3]
+# endif
+       eor     r3,r5,r6                        @ a^b, b^c in next round
+#else
+       ldr     r2,[sp,#9*4]            @ from future BODY_16_xx
+       eor     r3,r5,r6                        @ a^b, b^c in next round
+       ldr     r1,[sp,#6*4]    @ from future BODY_16_xx
+#endif
+       eor     r0,r0,r5,ror#20 @ Sigma0(a)
+       and     r12,r12,r3                      @ (b^c)&=(a^b)
+       add     r8,r8,r4                        @ d+=h
+       eor     r12,r12,r6                      @ Maj(a,b,c)
+       add     r4,r4,r0,ror#2  @ h+=Sigma0(a)
+       @ add   r4,r4,r12                       @ h+=Maj(a,b,c)
+       @ ldr   r2,[sp,#9*4]            @ 24
+       @ ldr   r1,[sp,#6*4]
+       mov     r0,r2,ror#7
+       add     r4,r4,r12                       @ h+=Maj(a,b,c) from the past
+       mov     r12,r1,ror#17
+       eor     r0,r0,r2,ror#18
+       eor     r12,r12,r1,ror#19
+       eor     r0,r0,r2,lsr#3  @ sigma0(X[i+1])
+       ldr     r2,[sp,#8*4]
+       eor     r12,r12,r1,lsr#10       @ sigma1(X[i+14])
+       ldr     r1,[sp,#1*4]
+
+       add     r12,r12,r0
+       eor     r0,r8,r8,ror#5  @ from BODY_00_15
+       add     r2,r2,r12
+       eor     r0,r0,r8,ror#19 @ Sigma1(e)
+       add     r2,r2,r1                        @ X[i]
+       ldr     r12,[r14],#4                    @ *K256++
+       add     r11,r11,r2                      @ h+=X[i]
+       str     r2,[sp,#8*4]
+       eor     r2,r9,r10
+       add     r11,r11,r0,ror#6        @ h+=Sigma1(e)
+       and     r2,r2,r8
+       add     r11,r11,r12                     @ h+=K256[i]
+       eor     r2,r2,r10                       @ Ch(e,f,g)
+       eor     r0,r4,r4,ror#11
+       add     r11,r11,r2                      @ h+=Ch(e,f,g)
+#if 24==31
+       and     r12,r12,#0xff
+       cmp     r12,#0xf2                       @ done?
+#endif
+#if 24<15
+# if __ARM_ARCH__>=7
+       ldr     r2,[r1],#4                      @ prefetch
+# else
+       ldrb    r2,[r1,#3]
+# endif
+       eor     r12,r4,r5                       @ a^b, b^c in next round
+#else
+       ldr     r2,[sp,#10*4]           @ from future BODY_16_xx
+       eor     r12,r4,r5                       @ a^b, b^c in next round
+       ldr     r1,[sp,#7*4]    @ from future BODY_16_xx
+#endif
+       eor     r0,r0,r4,ror#20 @ Sigma0(a)
+       and     r3,r3,r12                       @ (b^c)&=(a^b)
+       add     r7,r7,r11                       @ d+=h
+       eor     r3,r3,r5                        @ Maj(a,b,c)
+       add     r11,r11,r0,ror#2        @ h+=Sigma0(a)
+       @ add   r11,r11,r3                      @ h+=Maj(a,b,c)
+       @ ldr   r2,[sp,#10*4]           @ 25
+       @ ldr   r1,[sp,#7*4]
+       mov     r0,r2,ror#7
+       add     r11,r11,r3                      @ h+=Maj(a,b,c) from the past
+       mov     r3,r1,ror#17
+       eor     r0,r0,r2,ror#18
+       eor     r3,r3,r1,ror#19
+       eor     r0,r0,r2,lsr#3  @ sigma0(X[i+1])
+       ldr     r2,[sp,#9*4]
+       eor     r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+       ldr     r1,[sp,#2*4]
+
+       add     r3,r3,r0
+       eor     r0,r7,r7,ror#5  @ from BODY_00_15
+       add     r2,r2,r3
+       eor     r0,r0,r7,ror#19 @ Sigma1(e)
+       add     r2,r2,r1                        @ X[i]
+       ldr     r3,[r14],#4                     @ *K256++
+       add     r10,r10,r2                      @ h+=X[i]
+       str     r2,[sp,#9*4]
+       eor     r2,r8,r9
+       add     r10,r10,r0,ror#6        @ h+=Sigma1(e)
+       and     r2,r2,r7
+       add     r10,r10,r3                      @ h+=K256[i]
+       eor     r2,r2,r9                        @ Ch(e,f,g)
+       eor     r0,r11,r11,ror#11
+       add     r10,r10,r2                      @ h+=Ch(e,f,g)
+#if 25==31
+       and     r3,r3,#0xff
+       cmp     r3,#0xf2                        @ done?
+#endif
+#if 25<15
+# if __ARM_ARCH__>=7
+       ldr     r2,[r1],#4                      @ prefetch
+# else
+       ldrb    r2,[r1,#3]
+# endif
+       eor     r3,r11,r4                       @ a^b, b^c in next round
+#else
+       ldr     r2,[sp,#11*4]           @ from future BODY_16_xx
+       eor     r3,r11,r4                       @ a^b, b^c in next round
+       ldr     r1,[sp,#8*4]    @ from future BODY_16_xx
+#endif
+       eor     r0,r0,r11,ror#20        @ Sigma0(a)
+       and     r12,r12,r3                      @ (b^c)&=(a^b)
+       add     r6,r6,r10                       @ d+=h
+       eor     r12,r12,r4                      @ Maj(a,b,c)
+       add     r10,r10,r0,ror#2        @ h+=Sigma0(a)
+       @ add   r10,r10,r12                     @ h+=Maj(a,b,c)
+       @ ldr   r2,[sp,#11*4]           @ 26
+       @ ldr   r1,[sp,#8*4]
+       mov     r0,r2,ror#7
+       add     r10,r10,r12                     @ h+=Maj(a,b,c) from the past
+       mov     r12,r1,ror#17
+       eor     r0,r0,r2,ror#18
+       eor     r12,r12,r1,ror#19
+       eor     r0,r0,r2,lsr#3  @ sigma0(X[i+1])
+       ldr     r2,[sp,#10*4]
+       eor     r12,r12,r1,lsr#10       @ sigma1(X[i+14])
+       ldr     r1,[sp,#3*4]
+
+       add     r12,r12,r0
+       eor     r0,r6,r6,ror#5  @ from BODY_00_15
+       add     r2,r2,r12
+       eor     r0,r0,r6,ror#19 @ Sigma1(e)
+       add     r2,r2,r1                        @ X[i]
+       ldr     r12,[r14],#4                    @ *K256++
+       add     r9,r9,r2                        @ h+=X[i]
+       str     r2,[sp,#10*4]
+       eor     r2,r7,r8
+       add     r9,r9,r0,ror#6  @ h+=Sigma1(e)
+       and     r2,r2,r6
+       add     r9,r9,r12                       @ h+=K256[i]
+       eor     r2,r2,r8                        @ Ch(e,f,g)
+       eor     r0,r10,r10,ror#11
+       add     r9,r9,r2                        @ h+=Ch(e,f,g)
+#if 26==31
+       and     r12,r12,#0xff
+       cmp     r12,#0xf2                       @ done?
+#endif
+#if 26<15
+# if __ARM_ARCH__>=7
+       ldr     r2,[r1],#4                      @ prefetch
+# else
+       ldrb    r2,[r1,#3]
+# endif
+       eor     r12,r10,r11                     @ a^b, b^c in next round
+#else
+       ldr     r2,[sp,#12*4]           @ from future BODY_16_xx
+       eor     r12,r10,r11                     @ a^b, b^c in next round
+       ldr     r1,[sp,#9*4]    @ from future BODY_16_xx
+#endif
+       eor     r0,r0,r10,ror#20        @ Sigma0(a)
+       and     r3,r3,r12                       @ (b^c)&=(a^b)
+       add     r5,r5,r9                        @ d+=h
+       eor     r3,r3,r11                       @ Maj(a,b,c)
+       add     r9,r9,r0,ror#2  @ h+=Sigma0(a)
+       @ add   r9,r9,r3                        @ h+=Maj(a,b,c)
+       @ ldr   r2,[sp,#12*4]           @ 27
+       @ ldr   r1,[sp,#9*4]
+       mov     r0,r2,ror#7
+       add     r9,r9,r3                        @ h+=Maj(a,b,c) from the past
+       mov     r3,r1,ror#17
+       eor     r0,r0,r2,ror#18
+       eor     r3,r3,r1,ror#19
+       eor     r0,r0,r2,lsr#3  @ sigma0(X[i+1])
+       ldr     r2,[sp,#11*4]
+       eor     r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+       ldr     r1,[sp,#4*4]
+
+       add     r3,r3,r0
+       eor     r0,r5,r5,ror#5  @ from BODY_00_15
+       add     r2,r2,r3
+       eor     r0,r0,r5,ror#19 @ Sigma1(e)
+       add     r2,r2,r1                        @ X[i]
+       ldr     r3,[r14],#4                     @ *K256++
+       add     r8,r8,r2                        @ h+=X[i]
+       str     r2,[sp,#11*4]
+       eor     r2,r6,r7
+       add     r8,r8,r0,ror#6  @ h+=Sigma1(e)
+       and     r2,r2,r5
+       add     r8,r8,r3                        @ h+=K256[i]
+       eor     r2,r2,r7                        @ Ch(e,f,g)
+       eor     r0,r9,r9,ror#11
+       add     r8,r8,r2                        @ h+=Ch(e,f,g)
+#if 27==31
+       and     r3,r3,#0xff
+       cmp     r3,#0xf2                        @ done?
+#endif
+#if 27<15
+# if __ARM_ARCH__>=7
+       ldr     r2,[r1],#4                      @ prefetch
+# else
+       ldrb    r2,[r1,#3]
+# endif
+       eor     r3,r9,r10                       @ a^b, b^c in next round
+#else
+       ldr     r2,[sp,#13*4]           @ from future BODY_16_xx
+       eor     r3,r9,r10                       @ a^b, b^c in next round
+       ldr     r1,[sp,#10*4]   @ from future BODY_16_xx
+#endif
+       eor     r0,r0,r9,ror#20 @ Sigma0(a)
+       and     r12,r12,r3                      @ (b^c)&=(a^b)
+       add     r4,r4,r8                        @ d+=h
+       eor     r12,r12,r10                     @ Maj(a,b,c)
+       add     r8,r8,r0,ror#2  @ h+=Sigma0(a)
+       @ add   r8,r8,r12                       @ h+=Maj(a,b,c)
+       @ ldr   r2,[sp,#13*4]           @ 28
+       @ ldr   r1,[sp,#10*4]
+       mov     r0,r2,ror#7
+       add     r8,r8,r12                       @ h+=Maj(a,b,c) from the past
+       mov     r12,r1,ror#17
+       eor     r0,r0,r2,ror#18
+       eor     r12,r12,r1,ror#19
+       eor     r0,r0,r2,lsr#3  @ sigma0(X[i+1])
+       ldr     r2,[sp,#12*4]
+       eor     r12,r12,r1,lsr#10       @ sigma1(X[i+14])
+       ldr     r1,[sp,#5*4]
+
+       add     r12,r12,r0
+       eor     r0,r4,r4,ror#5  @ from BODY_00_15
+       add     r2,r2,r12
+       eor     r0,r0,r4,ror#19 @ Sigma1(e)
+       add     r2,r2,r1                        @ X[i]
+       ldr     r12,[r14],#4                    @ *K256++
+       add     r7,r7,r2                        @ h+=X[i]
+       str     r2,[sp,#12*4]
+       eor     r2,r5,r6
+       add     r7,r7,r0,ror#6  @ h+=Sigma1(e)
+       and     r2,r2,r4
+       add     r7,r7,r12                       @ h+=K256[i]
+       eor     r2,r2,r6                        @ Ch(e,f,g)
+       eor     r0,r8,r8,ror#11
+       add     r7,r7,r2                        @ h+=Ch(e,f,g)
+#if 28==31
+       and     r12,r12,#0xff
+       cmp     r12,#0xf2                       @ done?
+#endif
+#if 28<15
+# if __ARM_ARCH__>=7
+       ldr     r2,[r1],#4                      @ prefetch
+# else
+       ldrb    r2,[r1,#3]
+# endif
+       eor     r12,r8,r9                       @ a^b, b^c in next round
+#else
+       ldr     r2,[sp,#14*4]           @ from future BODY_16_xx
+       eor     r12,r8,r9                       @ a^b, b^c in next round
+       ldr     r1,[sp,#11*4]   @ from future BODY_16_xx
+#endif
+       eor     r0,r0,r8,ror#20 @ Sigma0(a)
+       and     r3,r3,r12                       @ (b^c)&=(a^b)
+       add     r11,r11,r7                      @ d+=h
+       eor     r3,r3,r9                        @ Maj(a,b,c)
+       add     r7,r7,r0,ror#2  @ h+=Sigma0(a)
+       @ add   r7,r7,r3                        @ h+=Maj(a,b,c)
+       @ ldr   r2,[sp,#14*4]           @ 29
+       @ ldr   r1,[sp,#11*4]
+       mov     r0,r2,ror#7
+       add     r7,r7,r3                        @ h+=Maj(a,b,c) from the past
+       mov     r3,r1,ror#17
+       eor     r0,r0,r2,ror#18
+       eor     r3,r3,r1,ror#19
+       eor     r0,r0,r2,lsr#3  @ sigma0(X[i+1])
+       ldr     r2,[sp,#13*4]
+       eor     r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+       ldr     r1,[sp,#6*4]
+
+       add     r3,r3,r0
+       eor     r0,r11,r11,ror#5        @ from BODY_00_15
+       add     r2,r2,r3
+       eor     r0,r0,r11,ror#19        @ Sigma1(e)
+       add     r2,r2,r1                        @ X[i]
+       ldr     r3,[r14],#4                     @ *K256++
+       add     r6,r6,r2                        @ h+=X[i]
+       str     r2,[sp,#13*4]
+       eor     r2,r4,r5
+       add     r6,r6,r0,ror#6  @ h+=Sigma1(e)
+       and     r2,r2,r11
+       add     r6,r6,r3                        @ h+=K256[i]
+       eor     r2,r2,r5                        @ Ch(e,f,g)
+       eor     r0,r7,r7,ror#11
+       add     r6,r6,r2                        @ h+=Ch(e,f,g)
+#if 29==31
+       and     r3,r3,#0xff
+       cmp     r3,#0xf2                        @ done?
+#endif
+#if 29<15
+# if __ARM_ARCH__>=7
+       ldr     r2,[r1],#4                      @ prefetch
+# else
+       ldrb    r2,[r1,#3]
+# endif
+       eor     r3,r7,r8                        @ a^b, b^c in next round
+#else
+       ldr     r2,[sp,#15*4]           @ from future BODY_16_xx
+       eor     r3,r7,r8                        @ a^b, b^c in next round
+       ldr     r1,[sp,#12*4]   @ from future BODY_16_xx
+#endif
+       eor     r0,r0,r7,ror#20 @ Sigma0(a)
+       and     r12,r12,r3                      @ (b^c)&=(a^b)
+       add     r10,r10,r6                      @ d+=h
+       eor     r12,r12,r8                      @ Maj(a,b,c)
+       add     r6,r6,r0,ror#2  @ h+=Sigma0(a)
+       @ add   r6,r6,r12                       @ h+=Maj(a,b,c)
+       @ ldr   r2,[sp,#15*4]           @ 30
+       @ ldr   r1,[sp,#12*4]
+       mov     r0,r2,ror#7
+       add     r6,r6,r12                       @ h+=Maj(a,b,c) from the past
+       mov     r12,r1,ror#17
+       eor     r0,r0,r2,ror#18
+       eor     r12,r12,r1,ror#19
+       eor     r0,r0,r2,lsr#3  @ sigma0(X[i+1])
+       ldr     r2,[sp,#14*4]
+       eor     r12,r12,r1,lsr#10       @ sigma1(X[i+14])
+       ldr     r1,[sp,#7*4]
+
+       add     r12,r12,r0
+       eor     r0,r10,r10,ror#5        @ from BODY_00_15
+       add     r2,r2,r12
+       eor     r0,r0,r10,ror#19        @ Sigma1(e)
+       add     r2,r2,r1                        @ X[i]
+       ldr     r12,[r14],#4                    @ *K256++
+       add     r5,r5,r2                        @ h+=X[i]
+       str     r2,[sp,#14*4]
+       eor     r2,r11,r4
+       add     r5,r5,r0,ror#6  @ h+=Sigma1(e)
+       and     r2,r2,r10
+       add     r5,r5,r12                       @ h+=K256[i]
+       eor     r2,r2,r4                        @ Ch(e,f,g)
+       eor     r0,r6,r6,ror#11
+       add     r5,r5,r2                        @ h+=Ch(e,f,g)
+#if 30==31
+       and     r12,r12,#0xff
+       cmp     r12,#0xf2                       @ done?
+#endif
+#if 30<15
+# if __ARM_ARCH__>=7
+       ldr     r2,[r1],#4                      @ prefetch
+# else
+       ldrb    r2,[r1,#3]
+# endif
+       eor     r12,r6,r7                       @ a^b, b^c in next round
+#else
+       ldr     r2,[sp,#0*4]            @ from future BODY_16_xx
+       eor     r12,r6,r7                       @ a^b, b^c in next round
+       ldr     r1,[sp,#13*4]   @ from future BODY_16_xx
+#endif
+       eor     r0,r0,r6,ror#20 @ Sigma0(a)
+       and     r3,r3,r12                       @ (b^c)&=(a^b)
+       add     r9,r9,r5                        @ d+=h
+       eor     r3,r3,r7                        @ Maj(a,b,c)
+       add     r5,r5,r0,ror#2  @ h+=Sigma0(a)
+       @ add   r5,r5,r3                        @ h+=Maj(a,b,c)
+       @ ldr   r2,[sp,#0*4]            @ 31
+       @ ldr   r1,[sp,#13*4]
+       mov     r0,r2,ror#7
+       add     r5,r5,r3                        @ h+=Maj(a,b,c) from the past
+       mov     r3,r1,ror#17
+       eor     r0,r0,r2,ror#18
+       eor     r3,r3,r1,ror#19
+       eor     r0,r0,r2,lsr#3  @ sigma0(X[i+1])
+       ldr     r2,[sp,#15*4]
+       eor     r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+       ldr     r1,[sp,#8*4]
+
+       add     r3,r3,r0
+       eor     r0,r9,r9,ror#5  @ from BODY_00_15
+       add     r2,r2,r3
+       eor     r0,r0,r9,ror#19 @ Sigma1(e)
+       add     r2,r2,r1                        @ X[i]
+       ldr     r3,[r14],#4                     @ *K256++
+       add     r4,r4,r2                        @ h+=X[i]
+       str     r2,[sp,#15*4]
+       eor     r2,r10,r11
+       add     r4,r4,r0,ror#6  @ h+=Sigma1(e)
+       and     r2,r2,r9
+       add     r4,r4,r3                        @ h+=K256[i]
+       eor     r2,r2,r11                       @ Ch(e,f,g)
+       eor     r0,r5,r5,ror#11
+       add     r4,r4,r2                        @ h+=Ch(e,f,g)
+#if 31==31
+       and     r3,r3,#0xff
+       cmp     r3,#0xf2                        @ done?
+#endif
+#if 31<15
+# if __ARM_ARCH__>=7
+       ldr     r2,[r1],#4                      @ prefetch
+# else
+       ldrb    r2,[r1,#3]
+# endif
+       eor     r3,r5,r6                        @ a^b, b^c in next round
+#else
+       ldr     r2,[sp,#1*4]            @ from future BODY_16_xx
+       eor     r3,r5,r6                        @ a^b, b^c in next round
+       ldr     r1,[sp,#14*4]   @ from future BODY_16_xx
+#endif
+       eor     r0,r0,r5,ror#20 @ Sigma0(a)
+       and     r12,r12,r3                      @ (b^c)&=(a^b)
+       add     r8,r8,r4                        @ d+=h
+       eor     r12,r12,r6                      @ Maj(a,b,c)
+       add     r4,r4,r0,ror#2  @ h+=Sigma0(a)
+       @ add   r4,r4,r12                       @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+       ite     eq                      @ Thumb2 thing, sanity check in ARM
+#endif
+       ldreq   r3,[sp,#16*4]           @ pull ctx
+       bne     .Lrounds_16_xx
+
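+       @ End of block: fold the last deferred Maj into h, then add the
+       @ working registers a..h back into the state pointed to by r3
+       @ (pulled from [sp,#16*4] above), compare inp against inp+len,
+       @ rewind the K256 pointer and loop if more input remains.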
+       add     r4,r4,r12               @ h+=Maj(a,b,c) from the past
+       ldr     r0,[r3,#0]
+       ldr     r2,[r3,#4]
+       ldr     r12,[r3,#8]
+       add     r4,r4,r0
+       ldr     r0,[r3,#12]
+       add     r5,r5,r2
+       ldr     r2,[r3,#16]
+       add     r6,r6,r12
+       ldr     r12,[r3,#20]
+       add     r7,r7,r0
+       ldr     r0,[r3,#24]
+       add     r8,r8,r2
+       ldr     r2,[r3,#28]
+       add     r9,r9,r12
+       ldr     r1,[sp,#17*4]           @ pull inp
+       ldr     r12,[sp,#18*4]          @ pull inp+len
+       add     r10,r10,r0
+       add     r11,r11,r2
+       stmia   r3,{r4,r5,r6,r7,r8,r9,r10,r11}
+       cmp     r1,r12
+       sub     r14,r14,#256    @ rewind Ktbl
+       bne     .Loop
+
+       add     sp,sp,#19*4     @ destroy frame
+#if __ARM_ARCH__>=5
+       ldmia   sp!,{r4-r11,pc}
+#else
+       ldmia   sp!,{r4-r11,lr}
+       tst     lr,#1
+       moveq   pc,lr                   @ be binary compatible with V4, yet
+       .word   0xe12fff1e                      @ interoperable with Thumb ISA:-)
+#endif
+.size  sha256_block_data_order,.-sha256_block_data_order
+#if __ARM_MAX_ARCH__>=7
+.arch  armv7-a
+.fpu   neon
+
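+       @ NEON variant: the 16 message words are kept in q0-q3, the
+       @ schedule update and K256 additions run on the vector unit, and
+       @ the resulting W[i]+K256[i] values are staged on the stack for
+       @ the interleaved scalar round code to consume.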
+.global        sha256_block_data_order_neon
+.type  sha256_block_data_order_neon,%function
+.align 4
+sha256_block_data_order_neon:
+.LNEON:
+       stmdb   sp!,{r4-r12,lr}
+
+       sub     r11,sp,#16*4+16
+       adrl    r14,K256
+       bic     r11,r11,#15             @ align for 128-bit stores
+       mov     r12,sp
+       mov     sp,r11                  @ alloca
+       add     r2,r1,r2,lsl#6  @ len to point at the end of inp
+
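+       @ Load the first 64-byte block, byte-swap it into big-endian word
+       @ order, add the first four K256 quadwords and stash W+K for
+       @ rounds 0..15 on the aligned stack frame; ctx, inp, inp+len and
+       @ the caller's sp are saved at [sp,#64], #68, #72 and #76.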
+       vld1.8          {q0},[r1]!
+       vld1.8          {q1},[r1]!
+       vld1.8          {q2},[r1]!
+       vld1.8          {q3},[r1]!
+       vld1.32         {q8},[r14,:128]!
+       vld1.32         {q9},[r14,:128]!
+       vld1.32         {q10},[r14,:128]!
+       vld1.32         {q11},[r14,:128]!
+       vrev32.8        q0,q0           @ yes, even on
+       str             r0,[sp,#64]
+       vrev32.8        q1,q1           @ big-endian
+       str             r1,[sp,#68]
+       mov             r1,sp
+       vrev32.8        q2,q2
+       str             r2,[sp,#72]
+       vrev32.8        q3,q3
+       str             r12,[sp,#76]            @ save original sp
+       vadd.i32        q8,q8,q0
+       vadd.i32        q9,q9,q1
+       vst1.32         {q8},[r1,:128]!
+       vadd.i32        q10,q10,q2
+       vst1.32         {q9},[r1,:128]!
+       vadd.i32        q11,q11,q3
+       vst1.32         {q10},[r1,:128]!
+       vst1.32         {q11},[r1,:128]!
+
+       ldmia           r0,{r4-r11}
+       sub             r1,r1,#64
+       ldr             r2,[sp,#0]
+       eor             r12,r12,r12
+       eor             r3,r5,r6
+       b               .L_00_48
+
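+       @ Each pass through .L_00_48 retires 16 scalar rounds while the
+       @ NEON instructions woven between them compute the next 16
+       @ schedule words (sigma0/sigma1 via vshr/vsli/veor), add the
+       @ matching K256 quadwords and write W+K back to the stack.  The
+       @ zero word terminating K256 ends the loop after rounds 0..47.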
+.align 4
+.L_00_48:
+       vext.8  q8,q0,q1,#4
+       add     r11,r11,r2
+       eor     r2,r9,r10
+       eor     r0,r8,r8,ror#5
+       vext.8  q9,q2,q3,#4
+       add     r4,r4,r12
+       and     r2,r2,r8
+       eor     r12,r0,r8,ror#19
+       vshr.u32        q10,q8,#7
+       eor     r0,r4,r4,ror#11
+       eor     r2,r2,r10
+       vadd.i32        q0,q0,q9
+       add     r11,r11,r12,ror#6
+       eor     r12,r4,r5
+       vshr.u32        q9,q8,#3
+       eor     r0,r0,r4,ror#20
+       add     r11,r11,r2
+       vsli.32 q10,q8,#25
+       ldr     r2,[sp,#4]
+       and     r3,r3,r12
+       vshr.u32        q11,q8,#18
+       add     r7,r7,r11
+       add     r11,r11,r0,ror#2
+       eor     r3,r3,r5
+       veor    q9,q9,q10
+       add     r10,r10,r2
+       vsli.32 q11,q8,#14
+       eor     r2,r8,r9
+       eor     r0,r7,r7,ror#5
+       vshr.u32        d24,d7,#17
+       add     r11,r11,r3
+       and     r2,r2,r7
+       veor    q9,q9,q11
+       eor     r3,r0,r7,ror#19
+       eor     r0,r11,r11,ror#11
+       vsli.32 d24,d7,#15
+       eor     r2,r2,r9
+       add     r10,r10,r3,ror#6
+       vshr.u32        d25,d7,#10
+       eor     r3,r11,r4
+       eor     r0,r0,r11,ror#20
+       vadd.i32        q0,q0,q9
+       add     r10,r10,r2
+       ldr     r2,[sp,#8]
+       veor    d25,d25,d24
+       and     r12,r12,r3
+       add     r6,r6,r10
+       vshr.u32        d24,d7,#19
+       add     r10,r10,r0,ror#2
+       eor     r12,r12,r4
+       vsli.32 d24,d7,#13
+       add     r9,r9,r2
+       eor     r2,r7,r8
+       veor    d25,d25,d24
+       eor     r0,r6,r6,ror#5
+       add     r10,r10,r12
+       vadd.i32        d0,d0,d25
+       and     r2,r2,r6
+       eor     r12,r0,r6,ror#19
+       vshr.u32        d24,d0,#17
+       eor     r0,r10,r10,ror#11
+       eor     r2,r2,r8
+       vsli.32 d24,d0,#15
+       add     r9,r9,r12,ror#6
+       eor     r12,r10,r11
+       vshr.u32        d25,d0,#10
+       eor     r0,r0,r10,ror#20
+       add     r9,r9,r2
+       veor    d25,d25,d24
+       ldr     r2,[sp,#12]
+       and     r3,r3,r12
+       vshr.u32        d24,d0,#19
+       add     r5,r5,r9
+       add     r9,r9,r0,ror#2
+       eor     r3,r3,r11
+       vld1.32 {q8},[r14,:128]!
+       add     r8,r8,r2
+       vsli.32 d24,d0,#13
+       eor     r2,r6,r7
+       eor     r0,r5,r5,ror#5
+       veor    d25,d25,d24
+       add     r9,r9,r3
+       and     r2,r2,r5
+       vadd.i32        d1,d1,d25
+       eor     r3,r0,r5,ror#19
+       eor     r0,r9,r9,ror#11
+       vadd.i32        q8,q8,q0
+       eor     r2,r2,r7
+       add     r8,r8,r3,ror#6
+       eor     r3,r9,r10
+       eor     r0,r0,r9,ror#20
+       add     r8,r8,r2
+       ldr     r2,[sp,#16]
+       and     r12,r12,r3
+       add     r4,r4,r8
+       vst1.32 {q8},[r1,:128]!
+       add     r8,r8,r0,ror#2
+       eor     r12,r12,r10
+       vext.8  q8,q1,q2,#4
+       add     r7,r7,r2
+       eor     r2,r5,r6
+       eor     r0,r4,r4,ror#5
+       vext.8  q9,q3,q0,#4
+       add     r8,r8,r12
+       and     r2,r2,r4
+       eor     r12,r0,r4,ror#19
+       vshr.u32        q10,q8,#7
+       eor     r0,r8,r8,ror#11
+       eor     r2,r2,r6
+       vadd.i32        q1,q1,q9
+       add     r7,r7,r12,ror#6
+       eor     r12,r8,r9
+       vshr.u32        q9,q8,#3
+       eor     r0,r0,r8,ror#20
+       add     r7,r7,r2
+       vsli.32 q10,q8,#25
+       ldr     r2,[sp,#20]
+       and     r3,r3,r12
+       vshr.u32        q11,q8,#18
+       add     r11,r11,r7
+       add     r7,r7,r0,ror#2
+       eor     r3,r3,r9
+       veor    q9,q9,q10
+       add     r6,r6,r2
+       vsli.32 q11,q8,#14
+       eor     r2,r4,r5
+       eor     r0,r11,r11,ror#5
+       vshr.u32        d24,d1,#17
+       add     r7,r7,r3
+       and     r2,r2,r11
+       veor    q9,q9,q11
+       eor     r3,r0,r11,ror#19
+       eor     r0,r7,r7,ror#11
+       vsli.32 d24,d1,#15
+       eor     r2,r2,r5
+       add     r6,r6,r3,ror#6
+       vshr.u32        d25,d1,#10
+       eor     r3,r7,r8
+       eor     r0,r0,r7,ror#20
+       vadd.i32        q1,q1,q9
+       add     r6,r6,r2
+       ldr     r2,[sp,#24]
+       veor    d25,d25,d24
+       and     r12,r12,r3
+       add     r10,r10,r6
+       vshr.u32        d24,d1,#19
+       add     r6,r6,r0,ror#2
+       eor     r12,r12,r8
+       vsli.32 d24,d1,#13
+       add     r5,r5,r2
+       eor     r2,r11,r4
+       veor    d25,d25,d24
+       eor     r0,r10,r10,ror#5
+       add     r6,r6,r12
+       vadd.i32        d2,d2,d25
+       and     r2,r2,r10
+       eor     r12,r0,r10,ror#19
+       vshr.u32        d24,d2,#17
+       eor     r0,r6,r6,ror#11
+       eor     r2,r2,r4
+       vsli.32 d24,d2,#15
+       add     r5,r5,r12,ror#6
+       eor     r12,r6,r7
+       vshr.u32        d25,d2,#10
+       eor     r0,r0,r6,ror#20
+       add     r5,r5,r2
+       veor    d25,d25,d24
+       ldr     r2,[sp,#28]
+       and     r3,r3,r12
+       vshr.u32        d24,d2,#19
+       add     r9,r9,r5
+       add     r5,r5,r0,ror#2
+       eor     r3,r3,r7
+       vld1.32 {q8},[r14,:128]!
+       add     r4,r4,r2
+       vsli.32 d24,d2,#13
+       eor     r2,r10,r11
+       eor     r0,r9,r9,ror#5
+       veor    d25,d25,d24
+       add     r5,r5,r3
+       and     r2,r2,r9
+       vadd.i32        d3,d3,d25
+       eor     r3,r0,r9,ror#19
+       eor     r0,r5,r5,ror#11
+       vadd.i32        q8,q8,q1
+       eor     r2,r2,r11
+       add     r4,r4,r3,ror#6
+       eor     r3,r5,r6
+       eor     r0,r0,r5,ror#20
+       add     r4,r4,r2
+       ldr     r2,[sp,#32]
+       and     r12,r12,r3
+       add     r8,r8,r4
+       vst1.32 {q8},[r1,:128]!
+       add     r4,r4,r0,ror#2
+       eor     r12,r12,r6
+       vext.8  q8,q2,q3,#4
+       add     r11,r11,r2
+       eor     r2,r9,r10
+       eor     r0,r8,r8,ror#5
+       vext.8  q9,q0,q1,#4
+       add     r4,r4,r12
+       and     r2,r2,r8
+       eor     r12,r0,r8,ror#19
+       vshr.u32        q10,q8,#7
+       eor     r0,r4,r4,ror#11
+       eor     r2,r2,r10
+       vadd.i32        q2,q2,q9
+       add     r11,r11,r12,ror#6
+       eor     r12,r4,r5
+       vshr.u32        q9,q8,#3
+       eor     r0,r0,r4,ror#20
+       add     r11,r11,r2
+       vsli.32 q10,q8,#25
+       ldr     r2,[sp,#36]
+       and     r3,r3,r12
+       vshr.u32        q11,q8,#18
+       add     r7,r7,r11
+       add     r11,r11,r0,ror#2
+       eor     r3,r3,r5
+       veor    q9,q9,q10
+       add     r10,r10,r2
+       vsli.32 q11,q8,#14
+       eor     r2,r8,r9
+       eor     r0,r7,r7,ror#5
+       vshr.u32        d24,d3,#17
+       add     r11,r11,r3
+       and     r2,r2,r7
+       veor    q9,q9,q11
+       eor     r3,r0,r7,ror#19
+       eor     r0,r11,r11,ror#11
+       vsli.32 d24,d3,#15
+       eor     r2,r2,r9
+       add     r10,r10,r3,ror#6
+       vshr.u32        d25,d3,#10
+       eor     r3,r11,r4
+       eor     r0,r0,r11,ror#20
+       vadd.i32        q2,q2,q9
+       add     r10,r10,r2
+       ldr     r2,[sp,#40]
+       veor    d25,d25,d24
+       and     r12,r12,r3
+       add     r6,r6,r10
+       vshr.u32        d24,d3,#19
+       add     r10,r10,r0,ror#2
+       eor     r12,r12,r4
+       vsli.32 d24,d3,#13
+       add     r9,r9,r2
+       eor     r2,r7,r8
+       veor    d25,d25,d24
+       eor     r0,r6,r6,ror#5
+       add     r10,r10,r12
+       vadd.i32        d4,d4,d25
+       and     r2,r2,r6
+       eor     r12,r0,r6,ror#19
+       vshr.u32        d24,d4,#17
+       eor     r0,r10,r10,ror#11
+       eor     r2,r2,r8
+       vsli.32 d24,d4,#15
+       add     r9,r9,r12,ror#6
+       eor     r12,r10,r11
+       vshr.u32        d25,d4,#10
+       eor     r0,r0,r10,ror#20
+       add     r9,r9,r2
+       veor    d25,d25,d24
+       ldr     r2,[sp,#44]
+       and     r3,r3,r12
+       vshr.u32        d24,d4,#19
+       add     r5,r5,r9
+       add     r9,r9,r0,ror#2
+       eor     r3,r3,r11
+       vld1.32 {q8},[r14,:128]!
+       add     r8,r8,r2
+       vsli.32 d24,d4,#13
+       eor     r2,r6,r7
+       eor     r0,r5,r5,ror#5
+       veor    d25,d25,d24
+       add     r9,r9,r3
+       and     r2,r2,r5
+       vadd.i32        d5,d5,d25
+       eor     r3,r0,r5,ror#19
+       eor     r0,r9,r9,ror#11
+       vadd.i32        q8,q8,q2
+       eor     r2,r2,r7
+       add     r8,r8,r3,ror#6
+       eor     r3,r9,r10
+       eor     r0,r0,r9,ror#20
+       add     r8,r8,r2
+       ldr     r2,[sp,#48]
+       and     r12,r12,r3
+       add     r4,r4,r8
+       vst1.32 {q8},[r1,:128]!
+       add     r8,r8,r0,ror#2
+       eor     r12,r12,r10
+       vext.8  q8,q3,q0,#4
+       add     r7,r7,r2
+       eor     r2,r5,r6
+       eor     r0,r4,r4,ror#5
+       vext.8  q9,q1,q2,#4
+       add     r8,r8,r12
+       and     r2,r2,r4
+       eor     r12,r0,r4,ror#19
+       vshr.u32        q10,q8,#7
+       eor     r0,r8,r8,ror#11
+       eor     r2,r2,r6
+       vadd.i32        q3,q3,q9
+       add     r7,r7,r12,ror#6
+       eor     r12,r8,r9
+       vshr.u32        q9,q8,#3
+       eor     r0,r0,r8,ror#20
+       add     r7,r7,r2
+       vsli.32 q10,q8,#25
+       ldr     r2,[sp,#52]
+       and     r3,r3,r12
+       vshr.u32        q11,q8,#18
+       add     r11,r11,r7
+       add     r7,r7,r0,ror#2
+       eor     r3,r3,r9
+       veor    q9,q9,q10
+       add     r6,r6,r2
+       vsli.32 q11,q8,#14
+       eor     r2,r4,r5
+       eor     r0,r11,r11,ror#5
+       vshr.u32        d24,d5,#17
+       add     r7,r7,r3
+       and     r2,r2,r11
+       veor    q9,q9,q11
+       eor     r3,r0,r11,ror#19
+       eor     r0,r7,r7,ror#11
+       vsli.32 d24,d5,#15
+       eor     r2,r2,r5
+       add     r6,r6,r3,ror#6
+       vshr.u32        d25,d5,#10
+       eor     r3,r7,r8
+       eor     r0,r0,r7,ror#20
+       vadd.i32        q3,q3,q9
+       add     r6,r6,r2
+       ldr     r2,[sp,#56]
+       veor    d25,d25,d24
+       and     r12,r12,r3
+       add     r10,r10,r6
+       vshr.u32        d24,d5,#19
+       add     r6,r6,r0,ror#2
+       eor     r12,r12,r8
+       vsli.32 d24,d5,#13
+       add     r5,r5,r2
+       eor     r2,r11,r4
+       veor    d25,d25,d24
+       eor     r0,r10,r10,ror#5
+       add     r6,r6,r12
+       vadd.i32        d6,d6,d25
+       and     r2,r2,r10
+       eor     r12,r0,r10,ror#19
+       vshr.u32        d24,d6,#17
+       eor     r0,r6,r6,ror#11
+       eor     r2,r2,r4
+       vsli.32 d24,d6,#15
+       add     r5,r5,r12,ror#6
+       eor     r12,r6,r7
+       vshr.u32        d25,d6,#10
+       eor     r0,r0,r6,ror#20
+       add     r5,r5,r2
+       veor    d25,d25,d24
+       ldr     r2,[sp,#60]
+       and     r3,r3,r12
+       vshr.u32        d24,d6,#19
+       add     r9,r9,r5
+       add     r5,r5,r0,ror#2
+       eor     r3,r3,r7
+       vld1.32 {q8},[r14,:128]!
+       add     r4,r4,r2
+       vsli.32 d24,d6,#13
+       eor     r2,r10,r11
+       eor     r0,r9,r9,ror#5
+       veor    d25,d25,d24
+       add     r5,r5,r3
+       and     r2,r2,r9
+       vadd.i32        d7,d7,d25
+       eor     r3,r0,r9,ror#19
+       eor     r0,r5,r5,ror#11
+       vadd.i32        q8,q8,q3
+       eor     r2,r2,r11
+       add     r4,r4,r3,ror#6
+       eor     r3,r5,r6
+       eor     r0,r0,r5,ror#20
+       add     r4,r4,r2
+       ldr     r2,[r14]
+       and     r12,r12,r3
+       add     r8,r8,r4
+       vst1.32 {q8},[r1,:128]!
+       add     r4,r4,r0,ror#2
+       eor     r12,r12,r6
+       teq     r2,#0                           @ check for K256 terminator
+       ldr     r2,[sp,#0]
+       sub     r1,r1,#64
+       bne     .L_00_48
+
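+       @ Rounds 48..63: restore inp and inp+len, rewind the K256 pointer
+       @ and load the next block (re-reading the current one at end of
+       @ input to avoid faulting past the buffer), then finish the
+       @ remaining rounds interleaved with the byte-swap and K256
+       @ pre-add for that next block.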
+       ldr             r1,[sp,#68]
+       ldr             r0,[sp,#72]
+       sub             r14,r14,#256    @ rewind r14
+       teq             r1,r0
+       it              eq
+       subeq           r1,r1,#64               @ avoid SEGV
+       vld1.8          {q0},[r1]!              @ load next input block
+       vld1.8          {q1},[r1]!
+       vld1.8          {q2},[r1]!
+       vld1.8          {q3},[r1]!
+       it              ne
+       strne           r1,[sp,#68]
+       mov             r1,sp
+       add     r11,r11,r2
+       eor     r2,r9,r10
+       eor     r0,r8,r8,ror#5
+       add     r4,r4,r12
+       vld1.32 {q8},[r14,:128]!
+       and     r2,r2,r8
+       eor     r12,r0,r8,ror#19
+       eor     r0,r4,r4,ror#11
+       eor     r2,r2,r10
+       vrev32.8        q0,q0
+       add     r11,r11,r12,ror#6
+       eor     r12,r4,r5
+       eor     r0,r0,r4,ror#20
+       add     r11,r11,r2
+       vadd.i32        q8,q8,q0
+       ldr     r2,[sp,#4]
+       and     r3,r3,r12
+       add     r7,r7,r11
+       add     r11,r11,r0,ror#2
+       eor     r3,r3,r5
+       add     r10,r10,r2
+       eor     r2,r8,r9
+       eor     r0,r7,r7,ror#5
+       add     r11,r11,r3
+       and     r2,r2,r7
+       eor     r3,r0,r7,ror#19
+       eor     r0,r11,r11,ror#11
+       eor     r2,r2,r9
+       add     r10,r10,r3,ror#6
+       eor     r3,r11,r4
+       eor     r0,r0,r11,ror#20
+       add     r10,r10,r2
+       ldr     r2,[sp,#8]
+       and     r12,r12,r3
+       add     r6,r6,r10
+       add     r10,r10,r0,ror#2
+       eor     r12,r12,r4
+       add     r9,r9,r2
+       eor     r2,r7,r8
+       eor     r0,r6,r6,ror#5
+       add     r10,r10,r12
+       and     r2,r2,r6
+       eor     r12,r0,r6,ror#19
+       eor     r0,r10,r10,ror#11
+       eor     r2,r2,r8
+       add     r9,r9,r12,ror#6
+       eor     r12,r10,r11
+       eor     r0,r0,r10,ror#20
+       add     r9,r9,r2
+       ldr     r2,[sp,#12]
+       and     r3,r3,r12
+       add     r5,r5,r9
+       add     r9,r9,r0,ror#2
+       eor     r3,r3,r11
+       add     r8,r8,r2
+       eor     r2,r6,r7
+       eor     r0,r5,r5,ror#5
+       add     r9,r9,r3
+       and     r2,r2,r5
+       eor     r3,r0,r5,ror#19
+       eor     r0,r9,r9,ror#11
+       eor     r2,r2,r7
+       add     r8,r8,r3,ror#6
+       eor     r3,r9,r10
+       eor     r0,r0,r9,ror#20
+       add     r8,r8,r2
+       ldr     r2,[sp,#16]
+       and     r12,r12,r3
+       add     r4,r4,r8
+       add     r8,r8,r0,ror#2
+       eor     r12,r12,r10
+       vst1.32 {q8},[r1,:128]!
+       add     r7,r7,r2
+       eor     r2,r5,r6
+       eor     r0,r4,r4,ror#5
+       add     r8,r8,r12
+       vld1.32 {q8},[r14,:128]!
+       and     r2,r2,r4
+       eor     r12,r0,r4,ror#19
+       eor     r0,r8,r8,ror#11
+       eor     r2,r2,r6
+       vrev32.8        q1,q1
+       add     r7,r7,r12,ror#6
+       eor     r12,r8,r9
+       eor     r0,r0,r8,ror#20
+       add     r7,r7,r2
+       vadd.i32        q8,q8,q1
+       ldr     r2,[sp,#20]
+       and     r3,r3,r12
+       add     r11,r11,r7
+       add     r7,r7,r0,ror#2
+       eor     r3,r3,r9
+       add     r6,r6,r2
+       eor     r2,r4,r5
+       eor     r0,r11,r11,ror#5
+       add     r7,r7,r3
+       and     r2,r2,r11
+       eor     r3,r0,r11,ror#19
+       eor     r0,r7,r7,ror#11
+       eor     r2,r2,r5
+       add     r6,r6,r3,ror#6
+       eor     r3,r7,r8
+       eor     r0,r0,r7,ror#20
+       add     r6,r6,r2
+       ldr     r2,[sp,#24]
+       and     r12,r12,r3
+       add     r10,r10,r6
+       add     r6,r6,r0,ror#2
+       eor     r12,r12,r8
+       add     r5,r5,r2
+       eor     r2,r11,r4
+       eor     r0,r10,r10,ror#5
+       add     r6,r6,r12
+       and     r2,r2,r10
+       eor     r12,r0,r10,ror#19
+       eor     r0,r6,r6,ror#11
+       eor     r2,r2,r4
+       add     r5,r5,r12,ror#6
+       eor     r12,r6,r7
+       eor     r0,r0,r6,ror#20
+       add     r5,r5,r2
+       ldr     r2,[sp,#28]
+       and     r3,r3,r12
+       add     r9,r9,r5
+       add     r5,r5,r0,ror#2
+       eor     r3,r3,r7
+       add     r4,r4,r2
+       eor     r2,r10,r11
+       eor     r0,r9,r9,ror#5
+       add     r5,r5,r3
+       and     r2,r2,r9
+       eor     r3,r0,r9,ror#19
+       eor     r0,r5,r5,ror#11
+       eor     r2,r2,r11
+       add     r4,r4,r3,ror#6
+       eor     r3,r5,r6
+       eor     r0,r0,r5,ror#20
+       add     r4,r4,r2
+       ldr     r2,[sp,#32]
+       and     r12,r12,r3
+       add     r8,r8,r4
+       add     r4,r4,r0,ror#2
+       eor     r12,r12,r6
+       vst1.32 {q8},[r1,:128]!
+       add     r11,r11,r2
+       eor     r2,r9,r10
+       eor     r0,r8,r8,ror#5
+       add     r4,r4,r12
+       vld1.32 {q8},[r14,:128]!
+       and     r2,r2,r8
+       eor     r12,r0,r8,ror#19
+       eor     r0,r4,r4,ror#11
+       eor     r2,r2,r10
+       vrev32.8        q2,q2
+       add     r11,r11,r12,ror#6
+       eor     r12,r4,r5
+       eor     r0,r0,r4,ror#20
+       add     r11,r11,r2
+       vadd.i32        q8,q8,q2
+       ldr     r2,[sp,#36]
+       and     r3,r3,r12
+       add     r7,r7,r11
+       add     r11,r11,r0,ror#2
+       eor     r3,r3,r5
+       add     r10,r10,r2
+       eor     r2,r8,r9
+       eor     r0,r7,r7,ror#5
+       add     r11,r11,r3
+       and     r2,r2,r7
+       eor     r3,r0,r7,ror#19
+       eor     r0,r11,r11,ror#11
+       eor     r2,r2,r9
+       add     r10,r10,r3,ror#6
+       eor     r3,r11,r4
+       eor     r0,r0,r11,ror#20
+       add     r10,r10,r2
+       ldr     r2,[sp,#40]
+       and     r12,r12,r3
+       add     r6,r6,r10
+       add     r10,r10,r0,ror#2
+       eor     r12,r12,r4
+       add     r9,r9,r2
+       eor     r2,r7,r8
+       eor     r0,r6,r6,ror#5
+       add     r10,r10,r12
+       and     r2,r2,r6
+       eor     r12,r0,r6,ror#19
+       eor     r0,r10,r10,ror#11
+       eor     r2,r2,r8
+       add     r9,r9,r12,ror#6
+       eor     r12,r10,r11
+       eor     r0,r0,r10,ror#20
+       add     r9,r9,r2
+       ldr     r2,[sp,#44]
+       and     r3,r3,r12
+       add     r5,r5,r9
+       add     r9,r9,r0,ror#2
+       eor     r3,r3,r11
+       add     r8,r8,r2
+       eor     r2,r6,r7
+       eor     r0,r5,r5,ror#5
+       add     r9,r9,r3
+       and     r2,r2,r5
+       eor     r3,r0,r5,ror#19
+       eor     r0,r9,r9,ror#11
+       eor     r2,r2,r7
+       add     r8,r8,r3,ror#6
+       eor     r3,r9,r10
+       eor     r0,r0,r9,ror#20
+       add     r8,r8,r2
+       ldr     r2,[sp,#48]
+       and     r12,r12,r3
+       add     r4,r4,r8
+       add     r8,r8,r0,ror#2
+       eor     r12,r12,r10
+       vst1.32 {q8},[r1,:128]!
+       add     r7,r7,r2
+       eor     r2,r5,r6
+       eor     r0,r4,r4,ror#5
+       add     r8,r8,r12
+       vld1.32 {q8},[r14,:128]!
+       and     r2,r2,r4
+       eor     r12,r0,r4,ror#19
+       eor     r0,r8,r8,ror#11
+       eor     r2,r2,r6
+       vrev32.8        q3,q3
+       add     r7,r7,r12,ror#6
+       eor     r12,r8,r9
+       eor     r0,r0,r8,ror#20
+       add     r7,r7,r2
+       vadd.i32        q8,q8,q3
+       ldr     r2,[sp,#52]
+       and     r3,r3,r12
+       add     r11,r11,r7
+       add     r7,r7,r0,ror#2
+       eor     r3,r3,r9
+       add     r6,r6,r2
+       eor     r2,r4,r5
+       eor     r0,r11,r11,ror#5
+       add     r7,r7,r3
+       and     r2,r2,r11
+       eor     r3,r0,r11,ror#19
+       eor     r0,r7,r7,ror#11
+       eor     r2,r2,r5
+       add     r6,r6,r3,ror#6
+       eor     r3,r7,r8
+       eor     r0,r0,r7,ror#20
+       add     r6,r6,r2
+       ldr     r2,[sp,#56]
+       and     r12,r12,r3
+       add     r10,r10,r6
+       add     r6,r6,r0,ror#2
+       eor     r12,r12,r8
+       add     r5,r5,r2
+       eor     r2,r11,r4
+       eor     r0,r10,r10,ror#5
+       add     r6,r6,r12
+       and     r2,r2,r10
+       eor     r12,r0,r10,ror#19
+       eor     r0,r6,r6,ror#11
+       eor     r2,r2,r4
+       add     r5,r5,r12,ror#6
+       eor     r12,r6,r7
+       eor     r0,r0,r6,ror#20
+       add     r5,r5,r2
+       ldr     r2,[sp,#60]
+       and     r3,r3,r12
+       add     r9,r9,r5
+       add     r5,r5,r0,ror#2
+       eor     r3,r3,r7
+       add     r4,r4,r2
+       eor     r2,r10,r11
+       eor     r0,r9,r9,ror#5
+       add     r5,r5,r3
+       and     r2,r2,r9
+       eor     r3,r0,r9,ror#19
+       eor     r0,r5,r5,ror#11
+       eor     r2,r2,r11
+       add     r4,r4,r3,ror#6
+       eor     r3,r5,r6
+       eor     r0,r0,r5,ror#20
+       add     r4,r4,r2
+       ldr     r2,[sp,#64]
+       and     r12,r12,r3
+       add     r8,r8,r4
+       add     r4,r4,r0,ror#2
+       eor     r12,r12,r6
+       vst1.32 {q8},[r1,:128]!
+       ldr     r0,[r2,#0]
+       add     r4,r4,r12                       @ h+=Maj(a,b,c) from the past
+       ldr     r12,[r2,#4]
+       ldr     r3,[r2,#8]
+       ldr     r1,[r2,#12]
+       add     r4,r4,r0                        @ accumulate
+       ldr     r0,[r2,#16]
+       add     r5,r5,r12
+       ldr     r12,[r2,#20]
+       add     r6,r6,r3
+       ldr     r3,[r2,#24]
+       add     r7,r7,r1
+       ldr     r1,[r2,#28]
+       add     r8,r8,r0
+       str     r4,[r2],#4
+       add     r9,r9,r12
+       str     r5,[r2],#4
+       add     r10,r10,r3
+       str     r6,[r2],#4
+       add     r11,r11,r1
+       str     r7,[r2],#4
+       stmia   r2,{r8-r11}
+
+       ittte   ne
+       movne   r1,sp
+       ldrne   r2,[sp,#0]
+       eorne   r12,r12,r12
+       ldreq   sp,[sp,#76]                     @ restore original sp
+       itt     ne
+       eorne   r3,r5,r6
+       bne     .L_00_48
+
+       ldmia   sp!,{r4-r12,pc}
+.size  sha256_block_data_order_neon,.-sha256_block_data_order_neon
+#endif
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+
+# ifdef __thumb2__
+#  define INST(a,b,c,d)        .byte   c,d|0xc,a,b
+# else
+#  define INST(a,b,c,d)        .byte   a,b,c,d
+# endif
+
+.type  sha256_block_data_order_armv8,%function
+.align 5
+sha256_block_data_order_armv8:
+.LARMv8:
+       vld1.32 {q0,q1},[r0]
+# ifdef __thumb2__
+       adr     r3,.LARMv8
+       sub     r3,r3,#.LARMv8-K256
+# else
+       adrl    r3,K256
+# endif
+       add     r2,r1,r2,lsl#6  @ len to point at the end of inp
+
+.Loop_v8:
+       vld1.8          {q8-q9},[r1]!
+       vld1.8          {q10-q11},[r1]!
+       vld1.32         {q12},[r3]!
+       vrev32.8        q8,q8
+       vrev32.8        q9,q9
+       vrev32.8        q10,q10
+       vrev32.8        q11,q11
+       vmov            q14,q0  @ offload
+       vmov            q15,q1
+       teq             r1,r2
+       vld1.32         {q13},[r3]!
+       vadd.i32        q12,q12,q8
+       INST(0xe2,0x03,0xfa,0xf3)       @ sha256su0 q8,q9
+       vmov            q2,q0
+       INST(0x68,0x0c,0x02,0xf3)       @ sha256h q0,q1,q12
+       INST(0x68,0x2c,0x14,0xf3)       @ sha256h2 q1,q2,q12
+       INST(0xe6,0x0c,0x64,0xf3)       @ sha256su1 q8,q10,q11
+       vld1.32         {q12},[r3]!
+       vadd.i32        q13,q13,q9
+       INST(0xe4,0x23,0xfa,0xf3)       @ sha256su0 q9,q10
+       vmov            q2,q0
+       INST(0x6a,0x0c,0x02,0xf3)       @ sha256h q0,q1,q13
+       INST(0x6a,0x2c,0x14,0xf3)       @ sha256h2 q1,q2,q13
+       INST(0xe0,0x2c,0x66,0xf3)       @ sha256su1 q9,q11,q8
+       vld1.32         {q13},[r3]!
+       vadd.i32        q12,q12,q10
+       INST(0xe6,0x43,0xfa,0xf3)       @ sha256su0 q10,q11
+       vmov            q2,q0
+       INST(0x68,0x0c,0x02,0xf3)       @ sha256h q0,q1,q12
+       INST(0x68,0x2c,0x14,0xf3)       @ sha256h2 q1,q2,q12
+       INST(0xe2,0x4c,0x60,0xf3)       @ sha256su1 q10,q8,q9
+       vld1.32         {q12},[r3]!
+       vadd.i32        q13,q13,q11
+       INST(0xe0,0x63,0xfa,0xf3)       @ sha256su0 q11,q8
+       vmov            q2,q0
+       INST(0x6a,0x0c,0x02,0xf3)       @ sha256h q0,q1,q13
+       INST(0x6a,0x2c,0x14,0xf3)       @ sha256h2 q1,q2,q13
+       INST(0xe4,0x6c,0x62,0xf3)       @ sha256su1 q11,q9,q10
+       vld1.32         {q13},[r3]!
+       vadd.i32        q12,q12,q8
+       INST(0xe2,0x03,0xfa,0xf3)       @ sha256su0 q8,q9
+       vmov            q2,q0
+       INST(0x68,0x0c,0x02,0xf3)       @ sha256h q0,q1,q12
+       INST(0x68,0x2c,0x14,0xf3)       @ sha256h2 q1,q2,q12
+       INST(0xe6,0x0c,0x64,0xf3)       @ sha256su1 q8,q10,q11
+       vld1.32         {q12},[r3]!
+       vadd.i32        q13,q13,q9
+       INST(0xe4,0x23,0xfa,0xf3)       @ sha256su0 q9,q10
+       vmov            q2,q0
+       INST(0x6a,0x0c,0x02,0xf3)       @ sha256h q0,q1,q13
+       INST(0x6a,0x2c,0x14,0xf3)       @ sha256h2 q1,q2,q13
+       INST(0xe0,0x2c,0x66,0xf3)       @ sha256su1 q9,q11,q8
+       vld1.32         {q13},[r3]!
+       vadd.i32        q12,q12,q10
+       INST(0xe6,0x43,0xfa,0xf3)       @ sha256su0 q10,q11
+       vmov            q2,q0
+       INST(0x68,0x0c,0x02,0xf3)       @ sha256h q0,q1,q12
+       INST(0x68,0x2c,0x14,0xf3)       @ sha256h2 q1,q2,q12
+       INST(0xe2,0x4c,0x60,0xf3)       @ sha256su1 q10,q8,q9
+       vld1.32         {q12},[r3]!
+       vadd.i32        q13,q13,q11
+       INST(0xe0,0x63,0xfa,0xf3)       @ sha256su0 q11,q8
+       vmov            q2,q0
+       INST(0x6a,0x0c,0x02,0xf3)       @ sha256h q0,q1,q13
+       INST(0x6a,0x2c,0x14,0xf3)       @ sha256h2 q1,q2,q13
+       INST(0xe4,0x6c,0x62,0xf3)       @ sha256su1 q11,q9,q10
+       vld1.32         {q13},[r3]!
+       vadd.i32        q12,q12,q8
+       INST(0xe2,0x03,0xfa,0xf3)       @ sha256su0 q8,q9
+       vmov            q2,q0
+       INST(0x68,0x0c,0x02,0xf3)       @ sha256h q0,q1,q12
+       INST(0x68,0x2c,0x14,0xf3)       @ sha256h2 q1,q2,q12
+       INST(0xe6,0x0c,0x64,0xf3)       @ sha256su1 q8,q10,q11
+       vld1.32         {q12},[r3]!
+       vadd.i32        q13,q13,q9
+       INST(0xe4,0x23,0xfa,0xf3)       @ sha256su0 q9,q10
+       vmov            q2,q0
+       INST(0x6a,0x0c,0x02,0xf3)       @ sha256h q0,q1,q13
+       INST(0x6a,0x2c,0x14,0xf3)       @ sha256h2 q1,q2,q13
+       INST(0xe0,0x2c,0x66,0xf3)       @ sha256su1 q9,q11,q8
+       vld1.32         {q13},[r3]!
+       vadd.i32        q12,q12,q10
+       INST(0xe6,0x43,0xfa,0xf3)       @ sha256su0 q10,q11
+       vmov            q2,q0
+       INST(0x68,0x0c,0x02,0xf3)       @ sha256h q0,q1,q12
+       INST(0x68,0x2c,0x14,0xf3)       @ sha256h2 q1,q2,q12
+       INST(0xe2,0x4c,0x60,0xf3)       @ sha256su1 q10,q8,q9
+       vld1.32         {q12},[r3]!
+       vadd.i32        q13,q13,q11
+       INST(0xe0,0x63,0xfa,0xf3)       @ sha256su0 q11,q8
+       vmov            q2,q0
+       INST(0x6a,0x0c,0x02,0xf3)       @ sha256h q0,q1,q13
+       INST(0x6a,0x2c,0x14,0xf3)       @ sha256h2 q1,q2,q13
+       INST(0xe4,0x6c,0x62,0xf3)       @ sha256su1 q11,q9,q10
+       vld1.32         {q13},[r3]!
+       vadd.i32        q12,q12,q8
+       vmov            q2,q0
+       INST(0x68,0x0c,0x02,0xf3)       @ sha256h q0,q1,q12
+       INST(0x68,0x2c,0x14,0xf3)       @ sha256h2 q1,q2,q12
+
+       vld1.32         {q12},[r3]!
+       vadd.i32        q13,q13,q9
+       vmov            q2,q0
+       INST(0x6a,0x0c,0x02,0xf3)       @ sha256h q0,q1,q13
+       INST(0x6a,0x2c,0x14,0xf3)       @ sha256h2 q1,q2,q13
+
+       vld1.32         {q13},[r3]
+       vadd.i32        q12,q12,q10
+       sub             r3,r3,#256-16   @ rewind
+       vmov            q2,q0
+       INST(0x68,0x0c,0x02,0xf3)       @ sha256h q0,q1,q12
+       INST(0x68,0x2c,0x14,0xf3)       @ sha256h2 q1,q2,q12
+
+       vadd.i32        q13,q13,q11
+       vmov            q2,q0
+       INST(0x6a,0x0c,0x02,0xf3)       @ sha256h q0,q1,q13
+       INST(0x6a,0x2c,0x14,0xf3)       @ sha256h2 q1,q2,q13
+
+       vadd.i32        q0,q0,q14
+       vadd.i32        q1,q1,q15
+       it              ne
+       bne             .Loop_v8
+
+       vst1.32         {q0,q1},[r0]
+
+       bx      lr              @ bx lr
+.size  sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
+#endif
+.asciz  "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro@openssl.org>"
+.align 2
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.comm   OPENSSL_armcap_P,4,4
+#endif
diff --git a/arch/arm/crypto/sha256_glue.c b/arch/arm/crypto/sha256_glue.c
new file mode 100644 (file)
index 0000000..bb03482
--- /dev/null
@@ -0,0 +1,246 @@
+/*
+ * Glue code for the SHA256 Secure Hash Algorithm assembly implementation
+ * using optimized ARM assembler and NEON instructions.
+ *
+ * Copyright © 2015 Google Inc.
+ *
+ * This file is based on sha256_ssse3_glue.c:
+ *   Copyright (C) 2013 Intel Corporation
+ *   Author: Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#include <crypto/internal/hash.h>
+#include <linux/crypto.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/cryptohash.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <crypto/sha.h>
+#include <asm/byteorder.h>
+#include <asm/simd.h>
+#include <asm/neon.h>
+#include "sha256_glue.h"
+
+asmlinkage void sha256_block_data_order(u32 *digest, const void *data,
+                                     unsigned int num_blks);
+
+
+int sha256_init(struct shash_desc *desc)
+{
+       struct sha256_state *sctx = shash_desc_ctx(desc);
+
+       sctx->state[0] = SHA256_H0;
+       sctx->state[1] = SHA256_H1;
+       sctx->state[2] = SHA256_H2;
+       sctx->state[3] = SHA256_H3;
+       sctx->state[4] = SHA256_H4;
+       sctx->state[5] = SHA256_H5;
+       sctx->state[6] = SHA256_H6;
+       sctx->state[7] = SHA256_H7;
+       sctx->count = 0;
+
+       return 0;
+}
+
+int sha224_init(struct shash_desc *desc)
+{
+       struct sha256_state *sctx = shash_desc_ctx(desc);
+
+       sctx->state[0] = SHA224_H0;
+       sctx->state[1] = SHA224_H1;
+       sctx->state[2] = SHA224_H2;
+       sctx->state[3] = SHA224_H3;
+       sctx->state[4] = SHA224_H4;
+       sctx->state[5] = SHA224_H5;
+       sctx->state[6] = SHA224_H6;
+       sctx->state[7] = SHA224_H7;
+       sctx->count = 0;
+
+       return 0;
+}
+
+int __sha256_update(struct shash_desc *desc, const u8 *data, unsigned int len,
+                   unsigned int partial)
+{
+       struct sha256_state *sctx = shash_desc_ctx(desc);
+       unsigned int done = 0;
+
+       sctx->count += len;
+
+       if (partial) {
+               done = SHA256_BLOCK_SIZE - partial;
+               memcpy(sctx->buf + partial, data, done);
+               sha256_block_data_order(sctx->state, sctx->buf, 1);
+       }
+
+       if (len - done >= SHA256_BLOCK_SIZE) {
+               const unsigned int rounds = (len - done) / SHA256_BLOCK_SIZE;
+
+               sha256_block_data_order(sctx->state, data + done, rounds);
+               done += rounds * SHA256_BLOCK_SIZE;
+       }
+
+       memcpy(sctx->buf, data + done, len - done);
+
+       return 0;
+}
+
+int sha256_update(struct shash_desc *desc, const u8 *data, unsigned int len)
+{
+       struct sha256_state *sctx = shash_desc_ctx(desc);
+       unsigned int partial = sctx->count % SHA256_BLOCK_SIZE;
+
+       /* Handle the fast case right here */
+       if (partial + len < SHA256_BLOCK_SIZE) {
+               sctx->count += len;
+               memcpy(sctx->buf + partial, data, len);
+
+               return 0;
+       }
+
+       return __sha256_update(desc, data, len, partial);
+}
+
+/* Add padding and return the message digest. */
+static int sha256_final(struct shash_desc *desc, u8 *out)
+{
+       struct sha256_state *sctx = shash_desc_ctx(desc);
+       unsigned int i, index, padlen;
+       __be32 *dst = (__be32 *)out;
+       __be64 bits;
+       static const u8 padding[SHA256_BLOCK_SIZE] = { 0x80, };
+
+       /* save number of bits */
+       bits = cpu_to_be64(sctx->count << 3);
+
+       /* Pad out to 56 mod 64 and append length */
+       index = sctx->count % SHA256_BLOCK_SIZE;
+       padlen = (index < 56) ? (56 - index) : ((SHA256_BLOCK_SIZE+56)-index);
+
+       /* We need to fill a whole block for __sha256_update */
+       if (padlen <= 56) {
+               sctx->count += padlen;
+               memcpy(sctx->buf + index, padding, padlen);
+       } else {
+               __sha256_update(desc, padding, padlen, index);
+       }
+       __sha256_update(desc, (const u8 *)&bits, sizeof(bits), 56);
+
+       /* Store state in digest */
+       for (i = 0; i < 8; i++)
+               dst[i] = cpu_to_be32(sctx->state[i]);
+
+       /* Wipe context */
+       memset(sctx, 0, sizeof(*sctx));
+
+       return 0;
+}
+
+static int sha224_final(struct shash_desc *desc, u8 *out)
+{
+       u8 D[SHA256_DIGEST_SIZE];
+
+       sha256_final(desc, D);
+
+       memcpy(out, D, SHA224_DIGEST_SIZE);
+       memset(D, 0, SHA256_DIGEST_SIZE);
+
+       return 0;
+}
+
+int sha256_export(struct shash_desc *desc, void *out)
+{
+       struct sha256_state *sctx = shash_desc_ctx(desc);
+
+       memcpy(out, sctx, sizeof(*sctx));
+
+       return 0;
+}
+
+int sha256_import(struct shash_desc *desc, const void *in)
+{
+       struct sha256_state *sctx = shash_desc_ctx(desc);
+
+       memcpy(sctx, in, sizeof(*sctx));
+
+       return 0;
+}
+
+static struct shash_alg algs[] = { {
+       .digestsize     =       SHA256_DIGEST_SIZE,
+       .init           =       sha256_init,
+       .update         =       sha256_update,
+       .final          =       sha256_final,
+       .export         =       sha256_export,
+       .import         =       sha256_import,
+       .descsize       =       sizeof(struct sha256_state),
+       .statesize      =       sizeof(struct sha256_state),
+       .base           =       {
+               .cra_name       =       "sha256",
+               .cra_driver_name =      "sha256-asm",
+               .cra_priority   =       150,
+               .cra_flags      =       CRYPTO_ALG_TYPE_SHASH,
+               .cra_blocksize  =       SHA256_BLOCK_SIZE,
+               .cra_module     =       THIS_MODULE,
+       }
+}, {
+       .digestsize     =       SHA224_DIGEST_SIZE,
+       .init           =       sha224_init,
+       .update         =       sha256_update,
+       .final          =       sha224_final,
+       .export         =       sha256_export,
+       .import         =       sha256_import,
+       .descsize       =       sizeof(struct sha256_state),
+       .statesize      =       sizeof(struct sha256_state),
+       .base           =       {
+               .cra_name       =       "sha224",
+               .cra_driver_name =      "sha224-asm",
+               .cra_priority   =       150,
+               .cra_flags      =       CRYPTO_ALG_TYPE_SHASH,
+               .cra_blocksize  =       SHA224_BLOCK_SIZE,
+               .cra_module     =       THIS_MODULE,
+       }
+} };
+
+static int __init sha256_mod_init(void)
+{
+       int res = crypto_register_shashes(algs, ARRAY_SIZE(algs));
+
+       if (res < 0)
+               return res;
+
+       if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && cpu_has_neon()) {
+               res = crypto_register_shashes(sha256_neon_algs,
+                                             ARRAY_SIZE(sha256_neon_algs));
+
+               if (res < 0)
+                       crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
+       }
+
+       return res;
+}
+
+static void __exit sha256_mod_fini(void)
+{
+       crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
+
+       if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && cpu_has_neon())
+               crypto_unregister_shashes(sha256_neon_algs,
+                                         ARRAY_SIZE(sha256_neon_algs));
+}
+
+module_init(sha256_mod_init);
+module_exit(sha256_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SHA256 Secure Hash Algorithm (ARM), including NEON");
+
+MODULE_ALIAS_CRYPTO("sha256");
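
The algorithms registered above are reached through the kernel's shash API by the names in cra_name, with cra_priority deciding which driver backs "sha256" (sha256-neon at 250 outranks sha256-asm at 150, which in turn outranks the generic C implementation). A minimal caller is sketched below; it is illustrative only, not part of this patch, and assumes a kernel that provides SHASH_DESC_ON_STACK. The helper name example_sha256() is hypothetical.

#include <crypto/hash.h>
#include <crypto/sha.h>
#include <linux/err.h>
#include <linux/types.h>

/* Hypothetical helper, not part of this patch: hash a buffer with
 * whichever "sha256" driver currently has the highest priority. */
static int example_sha256(const u8 *data, unsigned int len,
			  u8 digest[SHA256_DIGEST_SIZE])
{
	struct crypto_shash *tfm;
	int ret;

	tfm = crypto_alloc_shash("sha256", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	{
		SHASH_DESC_ON_STACK(desc, tfm);

		desc->tfm = tfm;
		desc->flags = 0;	/* flags field exists in kernels of this era */
		ret = crypto_shash_digest(desc, data, len, digest);
	}

	crypto_free_shash(tfm);
	return ret;
}
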
diff --git a/arch/arm/crypto/sha256_glue.h b/arch/arm/crypto/sha256_glue.h
new file mode 100644 (file)
index 0000000..0312f4f
--- /dev/null
@@ -0,0 +1,23 @@
+#ifndef _CRYPTO_SHA256_GLUE_H
+#define _CRYPTO_SHA256_GLUE_H
+
+#include <linux/crypto.h>
+#include <crypto/sha.h>
+
+extern struct shash_alg sha256_neon_algs[2];
+
+extern int sha256_init(struct shash_desc *desc);
+
+extern int sha224_init(struct shash_desc *desc);
+
+extern int __sha256_update(struct shash_desc *desc, const u8 *data,
+                          unsigned int len, unsigned int partial);
+
+extern int sha256_update(struct shash_desc *desc, const u8 *data,
+                        unsigned int len);
+
+extern int sha256_export(struct shash_desc *desc, void *out);
+
+extern int sha256_import(struct shash_desc *desc, const void *in);
+
+#endif /* _CRYPTO_SHA256_GLUE_H */
diff --git a/arch/arm/crypto/sha256_neon_glue.c b/arch/arm/crypto/sha256_neon_glue.c
new file mode 100644 (file)
index 0000000..3ff0a7f
--- /dev/null
@@ -0,0 +1,172 @@
+/*
+ * Glue code for the SHA256 Secure Hash Algorithm assembly implementation
+ * using NEON instructions.
+ *
+ * Copyright © 2015 Google Inc.
+ *
+ * This file is based on sha512_neon_glue.c:
+ *   Copyright © 2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#include <crypto/internal/hash.h>
+#include <linux/cryptohash.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <crypto/sha.h>
+#include <asm/byteorder.h>
+#include <asm/simd.h>
+#include <asm/neon.h>
+#include "sha256_glue.h"
+
+asmlinkage void sha256_block_data_order_neon(u32 *digest, const void *data,
+                                     unsigned int num_blks);
+
+
+static int __sha256_neon_update(struct shash_desc *desc, const u8 *data,
+                               unsigned int len, unsigned int partial)
+{
+       struct sha256_state *sctx = shash_desc_ctx(desc);
+       unsigned int done = 0;
+
+       sctx->count += len;
+
+       if (partial) {
+               done = SHA256_BLOCK_SIZE - partial;
+               memcpy(sctx->buf + partial, data, done);
+               sha256_block_data_order_neon(sctx->state, sctx->buf, 1);
+       }
+
+       if (len - done >= SHA256_BLOCK_SIZE) {
+               const unsigned int rounds = (len - done) / SHA256_BLOCK_SIZE;
+
+               sha256_block_data_order_neon(sctx->state, data + done, rounds);
+               done += rounds * SHA256_BLOCK_SIZE;
+       }
+
+       memcpy(sctx->buf, data + done, len - done);
+
+       return 0;
+}
+
+static int sha256_neon_update(struct shash_desc *desc, const u8 *data,
+                             unsigned int len)
+{
+       struct sha256_state *sctx = shash_desc_ctx(desc);
+       unsigned int partial = sctx->count % SHA256_BLOCK_SIZE;
+       int res;
+
+       /* Handle the fast case right here */
+       if (partial + len < SHA256_BLOCK_SIZE) {
+               sctx->count += len;
+               memcpy(sctx->buf + partial, data, len);
+
+               return 0;
+       }
+
+       if (!may_use_simd()) {
+               res = __sha256_update(desc, data, len, partial);
+       } else {
+               kernel_neon_begin();
+               res = __sha256_neon_update(desc, data, len, partial);
+               kernel_neon_end();
+       }
+
+       return res;
+}
+
+/* Add padding and return the message digest. */
+static int sha256_neon_final(struct shash_desc *desc, u8 *out)
+{
+       struct sha256_state *sctx = shash_desc_ctx(desc);
+       unsigned int i, index, padlen;
+       __be32 *dst = (__be32 *)out;
+       __be64 bits;
+       static const u8 padding[SHA256_BLOCK_SIZE] = { 0x80, };
+
+       /* save number of bits */
+       bits = cpu_to_be64(sctx->count << 3);
+
+       /* Pad out to 56 mod 64 and append length */
+       index = sctx->count % SHA256_BLOCK_SIZE;
+       padlen = (index < 56) ? (56 - index) : ((SHA256_BLOCK_SIZE+56)-index);
+
+       if (!may_use_simd()) {
+               sha256_update(desc, padding, padlen);
+               sha256_update(desc, (const u8 *)&bits, sizeof(bits));
+       } else {
+               kernel_neon_begin();
+               /* We need to fill a whole block for __sha256_neon_update() */
+               if (padlen <= 56) {
+                       sctx->count += padlen;
+                       memcpy(sctx->buf + index, padding, padlen);
+               } else {
+                       __sha256_neon_update(desc, padding, padlen, index);
+               }
+               __sha256_neon_update(desc, (const u8 *)&bits,
+                                       sizeof(bits), 56);
+               kernel_neon_end();
+       }
+
+       /* Store state in digest */
+       for (i = 0; i < 8; i++)
+               dst[i] = cpu_to_be32(sctx->state[i]);
+
+       /* Wipe context */
+       memset(sctx, 0, sizeof(*sctx));
+
+       return 0;
+}
+
+static int sha224_neon_final(struct shash_desc *desc, u8 *out)
+{
+       u8 D[SHA256_DIGEST_SIZE];
+
+       sha256_neon_final(desc, D);
+
+       memcpy(out, D, SHA224_DIGEST_SIZE);
+       memset(D, 0, SHA256_DIGEST_SIZE);
+
+       return 0;
+}
+
+struct shash_alg sha256_neon_algs[] = { {
+       .digestsize     =       SHA256_DIGEST_SIZE,
+       .init           =       sha256_init,
+       .update         =       sha256_neon_update,
+       .final          =       sha256_neon_final,
+       .export         =       sha256_export,
+       .import         =       sha256_import,
+       .descsize       =       sizeof(struct sha256_state),
+       .statesize      =       sizeof(struct sha256_state),
+       .base           =       {
+               .cra_name       =       "sha256",
+               .cra_driver_name =      "sha256-neon",
+               .cra_priority   =       250,
+               .cra_flags      =       CRYPTO_ALG_TYPE_SHASH,
+               .cra_blocksize  =       SHA256_BLOCK_SIZE,
+               .cra_module     =       THIS_MODULE,
+       }
+}, {
+       .digestsize     =       SHA224_DIGEST_SIZE,
+       .init           =       sha224_init,
+       .update         =       sha256_neon_update,
+       .final          =       sha224_neon_final,
+       .export         =       sha256_export,
+       .import         =       sha256_import,
+       .descsize       =       sizeof(struct sha256_state),
+       .statesize      =       sizeof(struct sha256_state),
+       .base           =       {
+               .cra_name       =       "sha224",
+               .cra_driver_name =      "sha224-neon",
+               .cra_priority   =       250,
+               .cra_flags      =       CRYPTO_ALG_TYPE_SHASH,
+               .cra_blocksize  =       SHA224_BLOCK_SIZE,
+               .cra_module     =       THIS_MODULE,
+       }
+} };
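
Both sha256_final() and sha256_neon_final() share the same padding arithmetic: index is the byte offset within the current block, and padlen is chosen so that the padding plus the 8-byte big-endian bit count ends exactly on a 64-byte block boundary. A stand-alone check of that formula, ordinary userspace C and purely illustrative, is:

#include <assert.h>
#include <stdio.h>

#define SHA256_BLOCK_SIZE 64

/* Stand-alone illustration (not kernel code): verify the padlen
 * expression used in the two final() routines above. */
int main(void)
{
	unsigned int index, padlen;

	for (index = 0; index < SHA256_BLOCK_SIZE; index++) {
		padlen = (index < 56) ? (56 - index)
				      : ((SHA256_BLOCK_SIZE + 56) - index);
		/* index + padlen + 8-byte length must be block-aligned */
		assert((index + padlen + 8) % SHA256_BLOCK_SIZE == 0);
	}
	printf("padding length formula holds for all 64 offsets\n");
	return 0;
}
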
index b7cc3cb3a61304e0c72a41625c202a561c2269ba..fa7e2a8cc60824111f3fbded37cac22a7f46ab4a 100644 (file)
@@ -556,6 +556,13 @@ config CRYPTO_SHA256
          This code also includes SHA-224, a 224 bit hash with 112 bits
          of security against collision attacks.
 
+config CRYPTO_SHA256_ARM
+       tristate "SHA-224/256 digest algorithm (ARM-asm and NEON)"
+       select CRYPTO_HASH
+       help
+         SHA-256 secure hash standard (DFIPS 180-2) implemented
+         using optimized ARM assembler and NEON, when available.
+
 config CRYPTO_SHA256_SPARC64
        tristate "SHA224 and SHA256 digest algorithm (SPARC64)"
        depends on SPARC64
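
With the Kconfig entry above, a configuration fragment selecting the new drivers might look like the following; this is an illustration, with the NEON dependency taken from the Makefile rule in this patch:

# sha256-asm/sha224-asm always build with CRYPTO_SHA256_ARM;
# sha256-neon/sha224-neon additionally require kernel-mode NEON.
CONFIG_KERNEL_MODE_NEON=y
CONFIG_CRYPTO_SHA256_ARM=y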