#define __ARM_ARCH__ __LINUX_ARM_ARCH__
@ ====================================================================
@ Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
@ project. The module is, however, dual licensed under OpenSSL and
@ CRYPTOGAMS licenses depending on where you obtain it. For further
@ details see http://www.openssl.org/~appro/cryptogams/.
@ ====================================================================

@ sha1_block procedure for ARMv4.
@
@ January 2007.

@ Size/performance trade-off
@ ====================================================================
@ impl          size in bytes   comp cycles[*]  measured performance
@ ====================================================================
@ thumb         304             3212            4420
@ armv4-small   392/+29%        1958/+64%       2250/+96%
@ armv4-compact 740/+89%        1552/+26%       1840/+22%
@ armv4-large   1420/+92%       1307/+19%       1370/+34%[***]
@ full unroll   ~5100/+260%     ~1260/+4%       ~1300/+5%
@ ====================================================================
@ thumb         = same as 'small' but in Thumb instructions[**] and
@                 with recurring code in two private functions;
@ small         = detached Xload/update, loops are folded;
@ compact       = detached Xload/update, 5x unroll;
@ large         = interleaved Xload/update, 5x unroll;
@ full unroll   = interleaved Xload/update, full unroll, estimated[!];
@
@ [*]   Manually counted instructions in "grand" loop body. Measured
@       performance is affected by prologue and epilogue overhead,
@       i-cache availability, branch penalties, etc.
@ [**]  While each Thumb instruction is half the size, the instructions
@       are not as diverse as ARM ones: e.g., there are only two
@       arithmetic instructions with 3 operands, no [fixed] rotate, and
@       the addressing modes are limited. As a result it takes more
@       instructions to do the same job in Thumb, so the code is never
@       half the size and is always slower.
@ [***] which is also ~35% better than compiler-generated code. A dual-
@       issue Cortex A8 core was measured to process an input block in
@       ~990 cycles.

@ August 2010.
@
@ Rescheduling for dual-issue pipeline resulted in 13% improvement on
@ Cortex A8 core and in absolute terms ~870 cycles per input block
@ [or 13.6 cycles per byte].

@ February 2011.
@
@ Profiler-assisted and platform-specific optimization resulted in 10%
@ improvement on Cortex A8 core and 12.2 cycles per byte.

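@ For reference, each of the 80 rounds implemented below computes, in
@ C-like pseudocode (A-E are the working variables, X[] the message
@ schedule, K the round constant, F the phase-dependent boolean
@ function):
@
@       E += ROL(A,5) + F(B,C,D) + K + X[i];
@       B  = ROL(B,30);         /* then A-E rotate roles by one */
@
@ Rather than spend an instruction on ROL(B,30), i.e. ROR(B,2), the
@ code keeps the older working variables in a rotated representation
@ and folds the correction into operand shifts (the recurring "ror#2"
@ below); .Lloop rotates C,D,E on entry and .L_done compensates when
@ the values are added back into the context.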
.text

.global sha1_block_data_order
.type   sha1_block_data_order,%function

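@ Arguments (AAPCS), as used by the C glue code that calls this routine:
@       r0      pointer to the five-word SHA-1 state (updated in place)
@       r1      pointer to the input data
@       r2      number of 64-byte blocks (turned into an end pointer below)
@ Within the loop r3-r7 hold the working variables A-E, r8 the current
@ round constant K, r9-r12 are scratch, and r14 tracks the expanded
@ message schedule X[], which grows down the stack (80 words in total,
@ released at .L_done).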
.align  2
sha1_block_data_order:
        stmdb   sp!,{r4-r12,lr}
        add     r2,r1,r2,lsl#6  @ r2 to point at the end of r1
        ldmia   r0,{r3,r4,r5,r6,r7}
.Lloop:
        ldr     r8,.LK_00_19
        mov     r14,sp
        sub     sp,sp,#15*4
        mov     r5,r5,ror#30
        mov     r6,r6,ror#30
        mov     r7,r7,ror#30            @ [6]
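@ .L_00_15 handles five rounds per iteration and runs three times
@ (rounds 0-14); round 15 is peeled off after the loop.  On pre-v7
@ cores each big-endian word X[i] is assembled from four byte loads,
@ since word loads from a possibly unaligned r1 are not safe there;
@ v7 can issue a single ldr, which handles unalignment, plus a byte
@ swap on little-endian.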
.L_00_15:
#if __ARM_ARCH__<7
        ldrb    r10,[r1,#2]
        ldrb    r9,[r1,#3]
        ldrb    r11,[r1,#1]
        add     r7,r8,r7,ror#2                  @ E+=K_00_19
        ldrb    r12,[r1],#4
        orr     r9,r9,r10,lsl#8
        eor     r10,r5,r6                       @ F_xx_xx
        orr     r9,r9,r11,lsl#16
        add     r7,r7,r3,ror#27                 @ E+=ROR(A,27)
        orr     r9,r9,r12,lsl#24
#else
        ldr     r9,[r1],#4                      @ handles unaligned
        add     r7,r8,r7,ror#2                  @ E+=K_00_19
        eor     r10,r5,r6                       @ F_xx_xx
        add     r7,r7,r3,ror#27                 @ E+=ROR(A,27)
#ifdef __ARMEL__
        rev     r9,r9                           @ byte swap
#endif
#endif
        and     r10,r4,r10,ror#2
        add     r7,r7,r9                        @ E+=X[i]
        eor     r10,r10,r6,ror#2                @ F_00_19(B,C,D)
        str     r9,[r14,#-4]!
        add     r7,r7,r10                       @ E+=F_00_19(B,C,D)
#if __ARM_ARCH__<7
        ldrb    r10,[r1,#2]
        ldrb    r9,[r1,#3]
        ldrb    r11,[r1,#1]
        add     r6,r8,r6,ror#2                  @ E+=K_00_19
        ldrb    r12,[r1],#4
        orr     r9,r9,r10,lsl#8
        eor     r10,r4,r5                       @ F_xx_xx
        orr     r9,r9,r11,lsl#16
        add     r6,r6,r7,ror#27                 @ E+=ROR(A,27)
        orr     r9,r9,r12,lsl#24
#else
        ldr     r9,[r1],#4                      @ handles unaligned
        add     r6,r8,r6,ror#2                  @ E+=K_00_19
        eor     r10,r4,r5                       @ F_xx_xx
        add     r6,r6,r7,ror#27                 @ E+=ROR(A,27)
#ifdef __ARMEL__
        rev     r9,r9                           @ byte swap
#endif
#endif
        and     r10,r3,r10,ror#2
        add     r6,r6,r9                        @ E+=X[i]
        eor     r10,r10,r5,ror#2                @ F_00_19(B,C,D)
        str     r9,[r14,#-4]!
        add     r6,r6,r10                       @ E+=F_00_19(B,C,D)
#if __ARM_ARCH__<7
        ldrb    r10,[r1,#2]
        ldrb    r9,[r1,#3]
        ldrb    r11,[r1,#1]
        add     r5,r8,r5,ror#2                  @ E+=K_00_19
        ldrb    r12,[r1],#4
        orr     r9,r9,r10,lsl#8
        eor     r10,r3,r4                       @ F_xx_xx
        orr     r9,r9,r11,lsl#16
        add     r5,r5,r6,ror#27                 @ E+=ROR(A,27)
        orr     r9,r9,r12,lsl#24
#else
        ldr     r9,[r1],#4                      @ handles unaligned
        add     r5,r8,r5,ror#2                  @ E+=K_00_19
        eor     r10,r3,r4                       @ F_xx_xx
        add     r5,r5,r6,ror#27                 @ E+=ROR(A,27)
#ifdef __ARMEL__
        rev     r9,r9                           @ byte swap
#endif
#endif
        and     r10,r7,r10,ror#2
        add     r5,r5,r9                        @ E+=X[i]
        eor     r10,r10,r4,ror#2                @ F_00_19(B,C,D)
        str     r9,[r14,#-4]!
        add     r5,r5,r10                       @ E+=F_00_19(B,C,D)
#if __ARM_ARCH__<7
        ldrb    r10,[r1,#2]
        ldrb    r9,[r1,#3]
        ldrb    r11,[r1,#1]
        add     r4,r8,r4,ror#2                  @ E+=K_00_19
        ldrb    r12,[r1],#4
        orr     r9,r9,r10,lsl#8
        eor     r10,r7,r3                       @ F_xx_xx
        orr     r9,r9,r11,lsl#16
        add     r4,r4,r5,ror#27                 @ E+=ROR(A,27)
        orr     r9,r9,r12,lsl#24
#else
        ldr     r9,[r1],#4                      @ handles unaligned
        add     r4,r8,r4,ror#2                  @ E+=K_00_19
        eor     r10,r7,r3                       @ F_xx_xx
        add     r4,r4,r5,ror#27                 @ E+=ROR(A,27)
#ifdef __ARMEL__
        rev     r9,r9                           @ byte swap
#endif
#endif
        and     r10,r6,r10,ror#2
        add     r4,r4,r9                        @ E+=X[i]
        eor     r10,r10,r3,ror#2                @ F_00_19(B,C,D)
        str     r9,[r14,#-4]!
        add     r4,r4,r10                       @ E+=F_00_19(B,C,D)
#if __ARM_ARCH__<7
        ldrb    r10,[r1,#2]
        ldrb    r9,[r1,#3]
        ldrb    r11,[r1,#1]
        add     r3,r8,r3,ror#2                  @ E+=K_00_19
        ldrb    r12,[r1],#4
        orr     r9,r9,r10,lsl#8
        eor     r10,r6,r7                       @ F_xx_xx
        orr     r9,r9,r11,lsl#16
        add     r3,r3,r4,ror#27                 @ E+=ROR(A,27)
        orr     r9,r9,r12,lsl#24
#else
        ldr     r9,[r1],#4                      @ handles unaligned
        add     r3,r8,r3,ror#2                  @ E+=K_00_19
        eor     r10,r6,r7                       @ F_xx_xx
        add     r3,r3,r4,ror#27                 @ E+=ROR(A,27)
#ifdef __ARMEL__
        rev     r9,r9                           @ byte swap
#endif
#endif
        and     r10,r5,r10,ror#2
        add     r3,r3,r9                        @ E+=X[i]
        eor     r10,r10,r7,ror#2                @ F_00_19(B,C,D)
        str     r9,[r14,#-4]!
        add     r3,r3,r10                       @ E+=F_00_19(B,C,D)
        teq     r14,sp
        bne     .L_00_15                @ [((11+4)*5+2)*3]
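@ Round 15 follows, peeled off the loop above; rounds 16-19 after it
@ are the first to expand the message schedule.  With r14 pointing
@ just past the most recently stored word, [r14,#15*4], [r14,#13*4],
@ [r14,#7*4] and [r14,#2*4] address X[i-16], X[i-14], X[i-8] and
@ X[i-3], so the expansion below implements
@
@       X[i] = ROL(X[i-3] ^ X[i-8] ^ X[i-14] ^ X[i-16], 1)
@
@ with the left-rotate by 1 expressed as ror#31.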
#if __ARM_ARCH__<7
        ldrb    r10,[r1,#2]
        ldrb    r9,[r1,#3]
        ldrb    r11,[r1,#1]
        add     r7,r8,r7,ror#2                  @ E+=K_00_19
        ldrb    r12,[r1],#4
        orr     r9,r9,r10,lsl#8
        eor     r10,r5,r6                       @ F_xx_xx
        orr     r9,r9,r11,lsl#16
        add     r7,r7,r3,ror#27                 @ E+=ROR(A,27)
        orr     r9,r9,r12,lsl#24
#else
        ldr     r9,[r1],#4                      @ handles unaligned
        add     r7,r8,r7,ror#2                  @ E+=K_00_19
        eor     r10,r5,r6                       @ F_xx_xx
        add     r7,r7,r3,ror#27                 @ E+=ROR(A,27)
#ifdef __ARMEL__
        rev     r9,r9                           @ byte swap
#endif
#endif
        and     r10,r4,r10,ror#2
        add     r7,r7,r9                        @ E+=X[i]
        eor     r10,r10,r6,ror#2                @ F_00_19(B,C,D)
        str     r9,[r14,#-4]!
        add     r7,r7,r10                       @ E+=F_00_19(B,C,D)
        ldr     r9,[r14,#15*4]
        ldr     r10,[r14,#13*4]
        ldr     r11,[r14,#7*4]
        add     r6,r8,r6,ror#2                  @ E+=K_xx_xx
        ldr     r12,[r14,#2*4]
        eor     r9,r9,r10
        eor     r11,r11,r12                     @ 1 cycle stall
        eor     r10,r4,r5                       @ F_xx_xx
        mov     r9,r9,ror#31
        add     r6,r6,r7,ror#27                 @ E+=ROR(A,27)
        eor     r9,r9,r11,ror#31
        str     r9,[r14,#-4]!
        and     r10,r3,r10,ror#2                @ F_xx_xx
        add     r6,r6,r9                        @ E+=X[i]
        eor     r10,r10,r5,ror#2                @ F_00_19(B,C,D)
        add     r6,r6,r10                       @ E+=F_00_19(B,C,D)
        ldr     r9,[r14,#15*4]
        ldr     r10,[r14,#13*4]
        ldr     r11,[r14,#7*4]
        add     r5,r8,r5,ror#2                  @ E+=K_xx_xx
        ldr     r12,[r14,#2*4]
        eor     r9,r9,r10
        eor     r11,r11,r12                     @ 1 cycle stall
        eor     r10,r3,r4                       @ F_xx_xx
        mov     r9,r9,ror#31
        add     r5,r5,r6,ror#27                 @ E+=ROR(A,27)
        eor     r9,r9,r11,ror#31
        str     r9,[r14,#-4]!
        and     r10,r7,r10,ror#2                @ F_xx_xx
        add     r5,r5,r9                        @ E+=X[i]
        eor     r10,r10,r4,ror#2                @ F_00_19(B,C,D)
        add     r5,r5,r10                       @ E+=F_00_19(B,C,D)
        ldr     r9,[r14,#15*4]
        ldr     r10,[r14,#13*4]
        ldr     r11,[r14,#7*4]
        add     r4,r8,r4,ror#2                  @ E+=K_xx_xx
        ldr     r12,[r14,#2*4]
        eor     r9,r9,r10
        eor     r11,r11,r12                     @ 1 cycle stall
        eor     r10,r7,r3                       @ F_xx_xx
        mov     r9,r9,ror#31
        add     r4,r4,r5,ror#27                 @ E+=ROR(A,27)
        eor     r9,r9,r11,ror#31
        str     r9,[r14,#-4]!
        and     r10,r6,r10,ror#2                @ F_xx_xx
        add     r4,r4,r9                        @ E+=X[i]
        eor     r10,r10,r3,ror#2                @ F_00_19(B,C,D)
        add     r4,r4,r10                       @ E+=F_00_19(B,C,D)
        ldr     r9,[r14,#15*4]
        ldr     r10,[r14,#13*4]
        ldr     r11,[r14,#7*4]
        add     r3,r8,r3,ror#2                  @ E+=K_xx_xx
        ldr     r12,[r14,#2*4]
        eor     r9,r9,r10
        eor     r11,r11,r12                     @ 1 cycle stall
        eor     r10,r6,r7                       @ F_xx_xx
        mov     r9,r9,ror#31
        add     r3,r3,r4,ror#27                 @ E+=ROR(A,27)
        eor     r9,r9,r11,ror#31
        str     r9,[r14,#-4]!
        and     r10,r5,r10,ror#2                @ F_xx_xx
        add     r3,r3,r9                        @ E+=X[i]
        eor     r10,r10,r7,ror#2                @ F_00_19(B,C,D)
        add     r3,r3,r10                       @ E+=F_00_19(B,C,D)

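@ Rounds 20-39 and 60-79 share one loop body because both use the
@ parity function F(B,C,D) = B^C^D; only the constant differs.  The
@ carry flag tells the two phases apart: "cmn sp,#0" clears it for
@ 20_39, "cmp sp,#0" sets it for 60_79, "teq" leaves it untouched,
@ and "bcs" exits after the final phase, sparing ~300 bytes of code.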
        ldr     r8,.LK_20_39            @ [+15+16*4]
        sub     sp,sp,#25*4
        cmn     sp,#0                   @ [+3], clear carry to denote 20_39
.L_20_39_or_60_79:
        ldr     r9,[r14,#15*4]
        ldr     r10,[r14,#13*4]
        ldr     r11,[r14,#7*4]
        add     r7,r8,r7,ror#2                  @ E+=K_xx_xx
        ldr     r12,[r14,#2*4]
        eor     r9,r9,r10
        eor     r11,r11,r12                     @ 1 cycle stall
        eor     r10,r5,r6                       @ F_xx_xx
        mov     r9,r9,ror#31
        add     r7,r7,r3,ror#27                 @ E+=ROR(A,27)
        eor     r9,r9,r11,ror#31
        str     r9,[r14,#-4]!
        eor     r10,r4,r10,ror#2                @ F_xx_xx
        add     r7,r7,r9                        @ E+=X[i]
        add     r7,r7,r10                       @ E+=F_20_39(B,C,D)
        ldr     r9,[r14,#15*4]
        ldr     r10,[r14,#13*4]
        ldr     r11,[r14,#7*4]
        add     r6,r8,r6,ror#2                  @ E+=K_xx_xx
        ldr     r12,[r14,#2*4]
        eor     r9,r9,r10
        eor     r11,r11,r12                     @ 1 cycle stall
        eor     r10,r4,r5                       @ F_xx_xx
        mov     r9,r9,ror#31
        add     r6,r6,r7,ror#27                 @ E+=ROR(A,27)
        eor     r9,r9,r11,ror#31
        str     r9,[r14,#-4]!
        eor     r10,r3,r10,ror#2                @ F_xx_xx
        add     r6,r6,r9                        @ E+=X[i]
        add     r6,r6,r10                       @ E+=F_20_39(B,C,D)
        ldr     r9,[r14,#15*4]
        ldr     r10,[r14,#13*4]
        ldr     r11,[r14,#7*4]
        add     r5,r8,r5,ror#2                  @ E+=K_xx_xx
        ldr     r12,[r14,#2*4]
        eor     r9,r9,r10
        eor     r11,r11,r12                     @ 1 cycle stall
        eor     r10,r3,r4                       @ F_xx_xx
        mov     r9,r9,ror#31
        add     r5,r5,r6,ror#27                 @ E+=ROR(A,27)
        eor     r9,r9,r11,ror#31
        str     r9,[r14,#-4]!
        eor     r10,r7,r10,ror#2                @ F_xx_xx
        add     r5,r5,r9                        @ E+=X[i]
        add     r5,r5,r10                       @ E+=F_20_39(B,C,D)
        ldr     r9,[r14,#15*4]
        ldr     r10,[r14,#13*4]
        ldr     r11,[r14,#7*4]
        add     r4,r8,r4,ror#2                  @ E+=K_xx_xx
        ldr     r12,[r14,#2*4]
        eor     r9,r9,r10
        eor     r11,r11,r12                     @ 1 cycle stall
        eor     r10,r7,r3                       @ F_xx_xx
        mov     r9,r9,ror#31
        add     r4,r4,r5,ror#27                 @ E+=ROR(A,27)
        eor     r9,r9,r11,ror#31
        str     r9,[r14,#-4]!
        eor     r10,r6,r10,ror#2                @ F_xx_xx
        add     r4,r4,r9                        @ E+=X[i]
        add     r4,r4,r10                       @ E+=F_20_39(B,C,D)
        ldr     r9,[r14,#15*4]
        ldr     r10,[r14,#13*4]
        ldr     r11,[r14,#7*4]
        add     r3,r8,r3,ror#2                  @ E+=K_xx_xx
        ldr     r12,[r14,#2*4]
        eor     r9,r9,r10
        eor     r11,r11,r12                     @ 1 cycle stall
        eor     r10,r6,r7                       @ F_xx_xx
        mov     r9,r9,ror#31
        add     r3,r3,r4,ror#27                 @ E+=ROR(A,27)
        eor     r9,r9,r11,ror#31
        str     r9,[r14,#-4]!
        eor     r10,r5,r10,ror#2                @ F_xx_xx
        add     r3,r3,r9                        @ E+=X[i]
        add     r3,r3,r10                       @ E+=F_20_39(B,C,D)
        teq     r14,sp                  @ preserve carry
        bne     .L_20_39_or_60_79       @ [+((12+3)*5+2)*4]
        bcs     .L_done                 @ [+((12+3)*5+2)*4], spare 300 bytes

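@ Rounds 40-59 use the majority function, computed below as
@
@       F_40_59(B,C,D) = (B & (C ^ D)) + (C & D)
@
@ which equals (B&C)|(B&D)|(C&D): the two terms are bitwise disjoint,
@ so adding them cannot carry and is equivalent to OR-ing them.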
        ldr     r8,.LK_40_59
        sub     sp,sp,#20*4             @ [+2]
.L_40_59:
        ldr     r9,[r14,#15*4]
        ldr     r10,[r14,#13*4]
        ldr     r11,[r14,#7*4]
        add     r7,r8,r7,ror#2                  @ E+=K_xx_xx
        ldr     r12,[r14,#2*4]
        eor     r9,r9,r10
        eor     r11,r11,r12                     @ 1 cycle stall
        eor     r10,r5,r6                       @ F_xx_xx
        mov     r9,r9,ror#31
        add     r7,r7,r3,ror#27                 @ E+=ROR(A,27)
        eor     r9,r9,r11,ror#31
        str     r9,[r14,#-4]!
        and     r10,r4,r10,ror#2                @ F_xx_xx
        and     r11,r5,r6                       @ F_xx_xx
        add     r7,r7,r9                        @ E+=X[i]
        add     r7,r7,r10                       @ E+=F_40_59(B,C,D)
        add     r7,r7,r11,ror#2
        ldr     r9,[r14,#15*4]
        ldr     r10,[r14,#13*4]
        ldr     r11,[r14,#7*4]
        add     r6,r8,r6,ror#2                  @ E+=K_xx_xx
        ldr     r12,[r14,#2*4]
        eor     r9,r9,r10
        eor     r11,r11,r12                     @ 1 cycle stall
        eor     r10,r4,r5                       @ F_xx_xx
        mov     r9,r9,ror#31
        add     r6,r6,r7,ror#27                 @ E+=ROR(A,27)
        eor     r9,r9,r11,ror#31
        str     r9,[r14,#-4]!
        and     r10,r3,r10,ror#2                @ F_xx_xx
        and     r11,r4,r5                       @ F_xx_xx
        add     r6,r6,r9                        @ E+=X[i]
        add     r6,r6,r10                       @ E+=F_40_59(B,C,D)
        add     r6,r6,r11,ror#2
        ldr     r9,[r14,#15*4]
        ldr     r10,[r14,#13*4]
        ldr     r11,[r14,#7*4]
        add     r5,r8,r5,ror#2                  @ E+=K_xx_xx
        ldr     r12,[r14,#2*4]
        eor     r9,r9,r10
        eor     r11,r11,r12                     @ 1 cycle stall
        eor     r10,r3,r4                       @ F_xx_xx
        mov     r9,r9,ror#31
        add     r5,r5,r6,ror#27                 @ E+=ROR(A,27)
        eor     r9,r9,r11,ror#31
        str     r9,[r14,#-4]!
        and     r10,r7,r10,ror#2                @ F_xx_xx
        and     r11,r3,r4                       @ F_xx_xx
        add     r5,r5,r9                        @ E+=X[i]
        add     r5,r5,r10                       @ E+=F_40_59(B,C,D)
        add     r5,r5,r11,ror#2
        ldr     r9,[r14,#15*4]
        ldr     r10,[r14,#13*4]
        ldr     r11,[r14,#7*4]
        add     r4,r8,r4,ror#2                  @ E+=K_xx_xx
        ldr     r12,[r14,#2*4]
        eor     r9,r9,r10
        eor     r11,r11,r12                     @ 1 cycle stall
        eor     r10,r7,r3                       @ F_xx_xx
        mov     r9,r9,ror#31
        add     r4,r4,r5,ror#27                 @ E+=ROR(A,27)
        eor     r9,r9,r11,ror#31
        str     r9,[r14,#-4]!
        and     r10,r6,r10,ror#2                @ F_xx_xx
        and     r11,r7,r3                       @ F_xx_xx
        add     r4,r4,r9                        @ E+=X[i]
        add     r4,r4,r10                       @ E+=F_40_59(B,C,D)
        add     r4,r4,r11,ror#2
        ldr     r9,[r14,#15*4]
        ldr     r10,[r14,#13*4]
        ldr     r11,[r14,#7*4]
        add     r3,r8,r3,ror#2                  @ E+=K_xx_xx
        ldr     r12,[r14,#2*4]
        eor     r9,r9,r10
        eor     r11,r11,r12                     @ 1 cycle stall
        eor     r10,r6,r7                       @ F_xx_xx
        mov     r9,r9,ror#31
        add     r3,r3,r4,ror#27                 @ E+=ROR(A,27)
        eor     r9,r9,r11,ror#31
        str     r9,[r14,#-4]!
        and     r10,r5,r10,ror#2                @ F_xx_xx
        and     r11,r6,r7                       @ F_xx_xx
        add     r3,r3,r9                        @ E+=X[i]
        add     r3,r3,r10                       @ E+=F_40_59(B,C,D)
        add     r3,r3,r11,ror#2
        teq     r14,sp
        bne     .L_40_59                @ [+((12+5)*5+2)*4]

        ldr     r8,.LK_60_79
        sub     sp,sp,#20*4
        cmp     sp,#0                   @ set carry to denote 60_79
        b       .L_20_39_or_60_79       @ [+4], spare 300 bytes
.L_done:
        add     sp,sp,#80*4             @ "deallocate" stack frame
        ldmia   r0,{r8,r9,r10,r11,r12}
        add     r3,r8,r3
        add     r4,r9,r4
        add     r5,r10,r5,ror#2
        add     r6,r11,r6,ror#2
        add     r7,r12,r7,ror#2
        stmia   r0,{r3,r4,r5,r6,r7}
        teq     r1,r2
        bne     .Lloop                  @ [+18], total 1307

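@ On ARMv5 and later, popping pc straight from the stack returns and
@ interworks with Thumb callers.  The ARMv4 path returns via mov for
@ plain ARM callers (lr bit 0 clear) and otherwise falls through to
@ 0xe12fff1e, the encoding of "bx lr", emitted as .word so that
@ assemblers targeting pre-v5 cores still accept it.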
#if __ARM_ARCH__>=5
        ldmia   sp!,{r4-r12,pc}
#else
        ldmia   sp!,{r4-r12,lr}
        tst     lr,#1
        moveq   pc,lr                   @ be binary compatible with V4, yet
        .word   0xe12fff1e              @ interoperable with Thumb ISA:-)
#endif
.align  2
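@ SHA-1 round constants per FIPS 180: floor(2^30 * sqrt(n)) for
@ n = 2, 3, 5 and 10 respectively.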
.LK_00_19:      .word   0x5a827999
.LK_20_39:      .word   0x6ed9eba1
.LK_40_59:      .word   0x8f1bbcdc
.LK_60_79:      .word   0xca62c1d6
.size   sha1_block_data_order,.-sha1_block_data_order
.asciz  "SHA1 block transform for ARMv4, CRYPTOGAMS by <appro@openssl.org>"
.align  2