crypto: arm64/aes-xts-ce: fix for big endian
[firefly-linux-kernel-4.4.55.git] / arch / arm64 / crypto / aes-modes.S
1 /*
2  * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES
3  *
4  * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 as
8  * published by the Free Software Foundation.
9  */
10
11 /* included by aes-ce.S and aes-neon.S */
12
13         .text
14         .align          4
15
16 /*
17  * There are several ways to instantiate this code:
18  * - no interleave, all inline
19  * - 2-way interleave, 2x calls out of line (-DINTERLEAVE=2)
20  * - 2-way interleave, all inline (-DINTERLEAVE=2 -DINTERLEAVE_INLINE)
21  * - 4-way interleave, 4x calls out of line (-DINTERLEAVE=4)
22  * - 4-way interleave, all inline (-DINTERLEAVE=4 -DINTERLEAVE_INLINE)
23  *
24  * Macros imported by this code:
25  * - enc_prepare        - setup NEON registers for encryption
26  * - dec_prepare        - setup NEON registers for decryption
27  * - enc_switch_key     - change to new key after having prepared for encryption
28  * - encrypt_block      - encrypt a single block
29  * - decrypt_block      - decrypt a single block
30  * - encrypt_block2x    - encrypt 2 blocks in parallel (if INTERLEAVE == 2)
31  * - decrypt_block2x    - decrypt 2 blocks in parallel (if INTERLEAVE == 2)
32  * - encrypt_block4x    - encrypt 4 blocks in parallel (if INTERLEAVE == 4)
33  * - decrypt_block4x    - decrypt 4 blocks in parallel (if INTERLEAVE == 4)
34  */
35
36 #if defined(INTERLEAVE) && !defined(INTERLEAVE_INLINE)
/*
 * Out-of-line interleave: the 2x/4x multi-block primitives are shared
 * subroutines reached via 'bl', so every mode entry point must set up a
 * frame record and preserve the link register (x29/x30).
 * Note: ';' is the GAS statement separator on AArch64, not a comment.
 */
37 #define FRAME_PUSH      stp x29, x30, [sp,#-16]! ; mov x29, sp
38 #define FRAME_POP       ldp x29, x30, [sp],#16
39
40 #if INTERLEAVE == 2
41
/*
 * Encrypt/decrypt v0 and v1 in parallel.
 * In: w3 = # rounds, x2 = round key array (per the mode prototypes below);
 * x6/w7 are scratch passed through to the imported *_block2x macros
 * (defined by the including file — see header comment).
 */
42 aes_encrypt_block2x:
43         encrypt_block2x v0, v1, w3, x2, x6, w7
44         ret
45 ENDPROC(aes_encrypt_block2x)
46
47 aes_decrypt_block2x:
48         decrypt_block2x v0, v1, w3, x2, x6, w7
49         ret
50 ENDPROC(aes_decrypt_block2x)
51
52 #elif INTERLEAVE == 4
53
/*
 * Encrypt/decrypt v0..v3 in parallel; same register contract as the
 * 2x versions above.
 */
54 aes_encrypt_block4x:
55         encrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7
56         ret
57 ENDPROC(aes_encrypt_block4x)
58
59 aes_decrypt_block4x:
60         decrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7
61         ret
62 ENDPROC(aes_decrypt_block4x)
63
64 #else
65 #error INTERLEAVE should equal 2 or 4
66 #endif
67
/*
 * do_*_block{2,4}x: indirection so the mode code below is identical for
 * the out-of-line (bl) and inline instantiations.
 */
68         .macro          do_encrypt_block2x
69         bl              aes_encrypt_block2x
70         .endm
71
72         .macro          do_decrypt_block2x
73         bl              aes_decrypt_block2x
74         .endm
75
76         .macro          do_encrypt_block4x
77         bl              aes_encrypt_block4x
78         .endm
79
80         .macro          do_decrypt_block4x
81         bl              aes_decrypt_block4x
82         .endm
83
84 #else
/* Inline instantiation: no calls are made, so no frame is needed */
85 #define FRAME_PUSH
86 #define FRAME_POP
87
88         .macro          do_encrypt_block2x
89         encrypt_block2x v0, v1, w3, x2, x6, w7
90         .endm
91
92         .macro          do_decrypt_block2x
93         decrypt_block2x v0, v1, w3, x2, x6, w7
94         .endm
95
96         .macro          do_encrypt_block4x
97         encrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7
98         .endm
99
100         .macro          do_decrypt_block4x
101         decrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7
102         .endm
103
104 #endif
105
106         /*
107          * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
108          *                 int blocks, int first)
109          * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
110          *                 int blocks, int first)
111          */
112
/*
 * ECB encryption.
 * In (AAPCS64, per prototype above): x0 = out, x1 = in, x2 = rk,
 * w3 = rounds, w4 = blocks, w5 = first.
 */
113 AES_ENTRY(aes_ecb_encrypt)
114         FRAME_PUSH
        /* w5 == 'first': key schedule was already loaded on a prior call */
115         cbz             w5, .LecbencloopNx
116
117         enc_prepare     w3, x2, x5
118
119 .LecbencloopNx:
120 #if INTERLEAVE >= 2
        /* process INTERLEAVE blocks per iteration while enough remain */
121         subs            w4, w4, #INTERLEAVE
122         bmi             .Lecbenc1x
123 #if INTERLEAVE == 2
124         ld1             {v0.16b-v1.16b}, [x1], #32      /* get 2 pt blocks */
125         do_encrypt_block2x
126         st1             {v0.16b-v1.16b}, [x0], #32
127 #else
128         ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 pt blocks */
129         do_encrypt_block4x
130         st1             {v0.16b-v3.16b}, [x0], #64
131 #endif
132         b               .LecbencloopNx
133 .Lecbenc1x:
        /* undo the subtraction; w4 = leftover blocks (< INTERLEAVE) */
134         adds            w4, w4, #INTERLEAVE
135         beq             .Lecbencout
136 #endif
/* one-block-at-a-time tail loop */
137 .Lecbencloop:
138         ld1             {v0.16b}, [x1], #16             /* get next pt block */
139         encrypt_block   v0, w3, x2, x5, w6
140         st1             {v0.16b}, [x0], #16
141         subs            w4, w4, #1
142         bne             .Lecbencloop
143 .Lecbencout:
144         FRAME_POP
145         ret
146 AES_ENDPROC(aes_ecb_encrypt)
147
148
/*
 * ECB decryption — mirror image of aes_ecb_encrypt above.
 * In: x0 = out, x1 = in, x2 = rk, w3 = rounds, w4 = blocks, w5 = first.
 */
149 AES_ENTRY(aes_ecb_decrypt)
150         FRAME_PUSH
        /* skip key-schedule setup unless this is the first call */
151         cbz             w5, .LecbdecloopNx
152
153         dec_prepare     w3, x2, x5
154
155 .LecbdecloopNx:
156 #if INTERLEAVE >= 2
        /* process INTERLEAVE blocks per iteration while enough remain */
157         subs            w4, w4, #INTERLEAVE
158         bmi             .Lecbdec1x
159 #if INTERLEAVE == 2
160         ld1             {v0.16b-v1.16b}, [x1], #32      /* get 2 ct blocks */
161         do_decrypt_block2x
162         st1             {v0.16b-v1.16b}, [x0], #32
163 #else
164         ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 ct blocks */
165         do_decrypt_block4x
166         st1             {v0.16b-v3.16b}, [x0], #64
167 #endif
168         b               .LecbdecloopNx
169 .Lecbdec1x:
        /* undo the subtraction; w4 = leftover blocks (< INTERLEAVE) */
170         adds            w4, w4, #INTERLEAVE
171         beq             .Lecbdecout
172 #endif
/* one-block-at-a-time tail loop */
173 .Lecbdecloop:
174         ld1             {v0.16b}, [x1], #16             /* get next ct block */
175         decrypt_block   v0, w3, x2, x5, w6
176         st1             {v0.16b}, [x0], #16
177         subs            w4, w4, #1
178         bne             .Lecbdecloop
179 .Lecbdecout:
180         FRAME_POP
181         ret
182 AES_ENDPROC(aes_ecb_decrypt)
183
184
185         /*
186          * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
187          *                 int blocks, u8 iv[], int first)
188          * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
189          *                 int blocks, u8 iv[], int first)
190          */
191
/*
 * CBC encryption. Inherently serial (each block chains into the next),
 * so there is no interleaved path and no frame is needed (leaf).
 * In: x0 = out, x1 = in, x2 = rk, w3 = rounds, w4 = blocks,
 * x5 = iv, w6 = first. v0 carries the running IV/ciphertext chain.
 */
192 AES_ENTRY(aes_cbc_encrypt)
193         cbz             w6, .Lcbcencloop
194
195         ld1             {v0.16b}, [x5]                  /* get iv */
196         enc_prepare     w3, x2, x5
197
198 .Lcbcencloop:
199         ld1             {v1.16b}, [x1], #16             /* get next pt block */
200         eor             v0.16b, v0.16b, v1.16b          /* ..and xor with iv */
201         encrypt_block   v0, w3, x2, x5, w6
202         st1             {v0.16b}, [x0], #16
203         subs            w4, w4, #1
204         bne             .Lcbcencloop
        /* v0 retains the last ciphertext block = chaining value */
205         ret
206 AES_ENDPROC(aes_cbc_encrypt)
207
208
/*
 * CBC decryption. Unlike encryption this parallelizes: each output is
 * D(ct[i]) ^ ct[i-1], so INTERLEAVE blocks can be decrypted at once.
 * In: x0 = out, x1 = in, x2 = rk, w3 = rounds, w4 = blocks,
 * x5 = iv, w6 = first. v7 holds the previous ciphertext block (the IV).
 */
209 AES_ENTRY(aes_cbc_decrypt)
210         FRAME_PUSH
211         cbz             w6, .LcbcdecloopNx
212
213         ld1             {v7.16b}, [x5]                  /* get iv */
214         dec_prepare     w3, x2, x5
215
216 .LcbcdecloopNx:
217 #if INTERLEAVE >= 2
218         subs            w4, w4, #INTERLEAVE
219         bmi             .Lcbcdec1x
220 #if INTERLEAVE == 2
        /* save ct copies in v2/v3: they are the chain inputs after decrypt */
221         ld1             {v0.16b-v1.16b}, [x1], #32      /* get 2 ct blocks */
222         mov             v2.16b, v0.16b
223         mov             v3.16b, v1.16b
224         do_decrypt_block2x
225         eor             v0.16b, v0.16b, v7.16b
226         eor             v1.16b, v1.16b, v2.16b
227         mov             v7.16b, v3.16b
228         st1             {v0.16b-v1.16b}, [x0], #32
229 #else
        /*
         * Only v4-v6 are free to hold ct copies for 3 of the 4 blocks;
         * the 4th ct block (next chain value) is re-read from memory
         * below via x1 rewind instead of using another register.
         */
230         ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 ct blocks */
231         mov             v4.16b, v0.16b
232         mov             v5.16b, v1.16b
233         mov             v6.16b, v2.16b
234         do_decrypt_block4x
235         sub             x1, x1, #16
236         eor             v0.16b, v0.16b, v7.16b
237         eor             v1.16b, v1.16b, v4.16b
238         ld1             {v7.16b}, [x1], #16             /* reload 1 ct block */
239         eor             v2.16b, v2.16b, v5.16b
240         eor             v3.16b, v3.16b, v6.16b
241         st1             {v0.16b-v3.16b}, [x0], #64
242 #endif
243         b               .LcbcdecloopNx
244 .Lcbcdec1x:
        /* undo the subtraction; w4 = leftover blocks (< INTERLEAVE) */
245         adds            w4, w4, #INTERLEAVE
246         beq             .Lcbcdecout
247 #endif
248 .Lcbcdecloop:
249         ld1             {v1.16b}, [x1], #16             /* get next ct block */
250         mov             v0.16b, v1.16b                  /* ...and copy to v0 */
251         decrypt_block   v0, w3, x2, x5, w6
252         eor             v0.16b, v0.16b, v7.16b          /* xor with iv => pt */
253         mov             v7.16b, v1.16b                  /* ct is next iv */
254         st1             {v0.16b}, [x0], #16
255         subs            w4, w4, #1
256         bne             .Lcbcdecloop
257 .Lcbcdecout:
258         FRAME_POP
259         ret
260 AES_ENDPROC(aes_cbc_decrypt)
261
262
263         /*
264          * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
265          *                 int blocks, u8 ctr[], int first)
266          */
267
/*
 * CTR mode. In: x0 = out, x1 = in, x2 = rk, w3 = rounds, w4 = blocks,
 * x5 = ctr (first call only), w6 = first.
 * Register roles: v4 = current big-endian counter block; x5 is reused
 * after setup to hold the CPU-native ("swabbed") copy of the low 64
 * counter bits so it can be incremented with plain adds; x7 = scratch.
 * On re-entry (w6 == 0) v4 is assumed to still hold the counter from
 * the previous call.
 */
268 AES_ENTRY(aes_ctr_encrypt)
269         FRAME_PUSH
270         cbnz            w6, .Lctrfirst          /* 1st time around? */
271         umov            x5, v4.d[1]             /* keep swabbed ctr in reg */
272         rev             x5, x5
273 #if INTERLEAVE >= 2
        /* if the low 32 bits could wrap while bumping by 'blocks', fall
         * back to the scalar loop, which handles the 64-bit carry */
274         cmn             w5, w4                  /* 32 bit overflow? */
275         bcs             .Lctrinc
276         add             x5, x5, #1              /* increment BE ctr */
277         b               .LctrincNx
278 #else
279         b               .Lctrinc
280 #endif
281 .Lctrfirst:
282         enc_prepare     w3, x2, x6
283         ld1             {v4.16b}, [x5]
284         umov            x5, v4.d[1]             /* keep swabbed ctr in reg */
285         rev             x5, x5
286 #if INTERLEAVE >= 2
287         cmn             w5, w4                  /* 32 bit overflow? */
288         bcs             .Lctrloop
289 .LctrloopNx:
290         subs            w4, w4, #INTERLEAVE
291         bmi             .Lctr1x
292 #if INTERLEAVE == 2
        /* build two consecutive counter blocks: copy the high half of v4
         * and insert byte-reversed x5, x5+1 as the low halves */
293         mov             v0.8b, v4.8b
294         mov             v1.8b, v4.8b
295         rev             x7, x5
296         add             x5, x5, #1
297         ins             v0.d[1], x7
298         rev             x7, x5
299         add             x5, x5, #1
300         ins             v1.d[1], x7
301         ld1             {v2.16b-v3.16b}, [x1], #32      /* get 2 input blocks */
302         do_encrypt_block2x
303         eor             v0.16b, v0.16b, v2.16b
304         eor             v1.16b, v1.16b, v3.16b
305         st1             {v0.16b-v1.16b}, [x0], #32
306 #else
        /* build 4 consecutive counters at once: broadcast the low 32 bits,
         * add the literal-pool addends {1,2,3,[0]}, then byte-swap back */
307         ldr             q8, =0x30000000200000001        /* addends 1,2,3[,0] */
308         dup             v7.4s, w5
309         mov             v0.16b, v4.16b
310         add             v7.4s, v7.4s, v8.4s
311         mov             v1.16b, v4.16b
312         rev32           v8.16b, v7.16b
313         mov             v2.16b, v4.16b
314         mov             v3.16b, v4.16b
315         mov             v1.s[3], v8.s[0]
316         mov             v2.s[3], v8.s[1]
317         mov             v3.s[3], v8.s[2]
        /* v7/v8 were consumed above, so only 3 input blocks fit; the 4th
         * is loaded into v5 after the cipher runs */
318         ld1             {v5.16b-v7.16b}, [x1], #48      /* get 3 input blocks */
319         do_encrypt_block4x
320         eor             v0.16b, v5.16b, v0.16b
321         ld1             {v5.16b}, [x1], #16             /* get 1 input block  */
322         eor             v1.16b, v6.16b, v1.16b
323         eor             v2.16b, v7.16b, v2.16b
324         eor             v3.16b, v5.16b, v3.16b
325         st1             {v0.16b-v3.16b}, [x0], #64
326         add             x5, x5, #INTERLEAVE
327 #endif
328         cbz             w4, .LctroutNx
329 .LctrincNx:
        /* write the incremented (re-byte-swapped) counter back into v4 */
330         rev             x7, x5
331         ins             v4.d[1], x7
332         b               .LctrloopNx
333 .LctroutNx:
        /* done: store ctr of the *next* block, i.e. x5 - 1 pre-increment */
334         sub             x5, x5, #1
335         rev             x7, x5
336         ins             v4.d[1], x7
337         b               .Lctrout
338 .Lctr1x:
        /* undo the subtraction; w4 = leftover blocks (< INTERLEAVE) */
339         adds            w4, w4, #INTERLEAVE
340         beq             .Lctrout
341 #endif
/* scalar loop: one keystream block per iteration, full carry handling */
342 .Lctrloop:
343         mov             v0.16b, v4.16b
344         encrypt_block   v0, w3, x2, x6, w7
345         subs            w4, w4, #1
346         bmi             .Lctrhalfblock          /* blocks < 0 means 1/2 block */
347         ld1             {v3.16b}, [x1], #16
348         eor             v3.16b, v0.16b, v3.16b
349         st1             {v3.16b}, [x0], #16
350         beq             .Lctrout
351 .Lctrinc:
352         adds            x5, x5, #1              /* increment BE ctr */
353         rev             x7, x5
354         ins             v4.d[1], x7
355         bcc             .Lctrloop               /* no overflow? */
        /* low 64 bits wrapped: propagate the carry into the upper word */
356         umov            x7, v4.d[0]             /* load upper word of ctr  */
357         rev             x7, x7                  /* ... to handle the carry */
358         add             x7, x7, #1
359         rev             x7, x7
360         ins             v4.d[0], x7
361         b               .Lctrloop
362 .Lctrhalfblock:
        /* final partial block: xor/store only the first 8 bytes */
363         ld1             {v3.8b}, [x1]
364         eor             v3.8b, v0.8b, v3.8b
365         st1             {v3.8b}, [x0]
366 .Lctrout:
367         FRAME_POP
368         ret
369 AES_ENDPROC(aes_ctr_encrypt)
        /* emit the literal pool for the 'ldr q8, =...' constant above */
370         .ltorg
371
372
373         /*
374          * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
375          *                 int blocks, u8 const rk2[], u8 iv[], int first)
376          * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
377          *                 int blocks, u8 const rk2[], u8 iv[], int first)
378          */
379
        /*
         * next_tweak: advance the XTS tweak, i.e. multiply \in by x in
         * GF(2^128): double the 128-bit value and, if the top bit was
         * set, xor the reduction polynomial 0x87 into the low byte.
         * \const must hold .Lxts_mul_x, \tmp is scratch.
         */
380         .macro          next_tweak, out, in, const, tmp
381         sshr            \tmp\().2d,  \in\().2d,   #63
382         and             \tmp\().16b, \tmp\().16b, \const\().16b
383         add             \out\().2d,  \in\().2d,   \in\().2d
384         ext             \tmp\().16b, \tmp\().16b, \tmp\().16b, #8
385         eor             \out\().16b, \out\().16b, \tmp\().16b
386         .endm
387
/*
 * {0x87, 1} mask pair for next_tweak; the quads are swapped between the
 * CPU_LE and CPU_BE variants so the in-register lane layout is the same
 * regardless of endianness (this is the big-endian fix in this patch).
 */
388 .Lxts_mul_x:
389 CPU_LE( .quad           1, 0x87         )
390 CPU_BE( .quad           0x87, 1         )
391
/*
 * XTS encryption. In: x0 = out, x1 = in, x2 = rk1 (data key),
 * w3 = rounds, w4 = blocks, x5 = rk2 (tweak key), x6 = iv, w7 = first.
 * v4 holds the current tweak; v7 holds the .Lxts_mul_x constant
 * (reloaded each Nx iteration since the 4x path clobbers v7).
 */
392 AES_ENTRY(aes_xts_encrypt)
393         FRAME_PUSH
394         cbz             w7, .LxtsencloopNx
395
        /* first call: derive the initial tweak = E_rk2(iv), then switch
         * the loaded key schedule over to the data key rk1 */
396         ld1             {v4.16b}, [x6]
397         enc_prepare     w3, x5, x6
398         encrypt_block   v4, w3, x5, x6, w7              /* first tweak */
399         enc_switch_key  w3, x2, x6
400         ldr             q7, .Lxts_mul_x
401         b               .LxtsencNx
402
403 .LxtsencloopNx:
404         ldr             q7, .Lxts_mul_x
405         next_tweak      v4, v4, v7, v8
406 .LxtsencNx:
407 #if INTERLEAVE >= 2
408         subs            w4, w4, #INTERLEAVE
409         bmi             .Lxtsenc1x
410 #if INTERLEAVE == 2
        /* pt ^ tweak -> cipher -> ^ tweak again (v4/v5 = tweaks i, i+1) */
411         ld1             {v0.16b-v1.16b}, [x1], #32      /* get 2 pt blocks */
412         next_tweak      v5, v4, v7, v8
413         eor             v0.16b, v0.16b, v4.16b
414         eor             v1.16b, v1.16b, v5.16b
415         do_encrypt_block2x
416         eor             v0.16b, v0.16b, v4.16b
417         eor             v1.16b, v1.16b, v5.16b
418         st1             {v0.16b-v1.16b}, [x0], #32
419         cbz             w4, .LxtsencoutNx
420         next_tweak      v4, v5, v7, v8
421         b               .LxtsencNx
422 .LxtsencoutNx:
423         mov             v4.16b, v5.16b
424         b               .Lxtsencout
425 #else
        /* tweaks i..i+3 in v4..v7; note the last next_tweak overwrites
         * the constant in v7, hence the reload at .LxtsencloopNx */
426         ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 pt blocks */
427         next_tweak      v5, v4, v7, v8
428         eor             v0.16b, v0.16b, v4.16b
429         next_tweak      v6, v5, v7, v8
430         eor             v1.16b, v1.16b, v5.16b
431         eor             v2.16b, v2.16b, v6.16b
432         next_tweak      v7, v6, v7, v8
433         eor             v3.16b, v3.16b, v7.16b
434         do_encrypt_block4x
435         eor             v3.16b, v3.16b, v7.16b
436         eor             v0.16b, v0.16b, v4.16b
437         eor             v1.16b, v1.16b, v5.16b
438         eor             v2.16b, v2.16b, v6.16b
439         st1             {v0.16b-v3.16b}, [x0], #64
440         mov             v4.16b, v7.16b
441         cbz             w4, .Lxtsencout
442         b               .LxtsencloopNx
443 #endif
444 .Lxtsenc1x:
        /* undo the subtraction; w4 = leftover blocks (< INTERLEAVE) */
445         adds            w4, w4, #INTERLEAVE
446         beq             .Lxtsencout
447 #endif
448 .Lxtsencloop:
449         ld1             {v1.16b}, [x1], #16
450         eor             v0.16b, v1.16b, v4.16b
451         encrypt_block   v0, w3, x2, x6, w7
452         eor             v0.16b, v0.16b, v4.16b
453         st1             {v0.16b}, [x0], #16
454         subs            w4, w4, #1
455         beq             .Lxtsencout
456         next_tweak      v4, v4, v7, v8
457         b               .Lxtsencloop
458 .Lxtsencout:
459         FRAME_POP
460         ret
461 AES_ENDPROC(aes_xts_encrypt)
462
463
/*
 * XTS decryption — mirror of aes_xts_encrypt above; same register
 * contract (x2 = rk1, x5 = rk2, x6 = iv, w7 = first; tweak in v4).
 * The first tweak is still computed by *encrypting* the IV with rk2,
 * per the XTS spec; only the data blocks use the decryption schedule.
 */
464 AES_ENTRY(aes_xts_decrypt)
465         FRAME_PUSH
466         cbz             w7, .LxtsdecloopNx
467
468         ld1             {v4.16b}, [x6]
469         enc_prepare     w3, x5, x6
470         encrypt_block   v4, w3, x5, x6, w7              /* first tweak */
471         dec_prepare     w3, x2, x6
472         ldr             q7, .Lxts_mul_x
473         b               .LxtsdecNx
474
475 .LxtsdecloopNx:
476         ldr             q7, .Lxts_mul_x
477         next_tweak      v4, v4, v7, v8
478 .LxtsdecNx:
479 #if INTERLEAVE >= 2
480         subs            w4, w4, #INTERLEAVE
481         bmi             .Lxtsdec1x
482 #if INTERLEAVE == 2
        /* ct ^ tweak -> decipher -> ^ tweak again (v4/v5 = tweaks i, i+1) */
483         ld1             {v0.16b-v1.16b}, [x1], #32      /* get 2 ct blocks */
484         next_tweak      v5, v4, v7, v8
485         eor             v0.16b, v0.16b, v4.16b
486         eor             v1.16b, v1.16b, v5.16b
487         do_decrypt_block2x
488         eor             v0.16b, v0.16b, v4.16b
489         eor             v1.16b, v1.16b, v5.16b
490         st1             {v0.16b-v1.16b}, [x0], #32
491         cbz             w4, .LxtsdecoutNx
492         next_tweak      v4, v5, v7, v8
493         b               .LxtsdecNx
494 .LxtsdecoutNx:
495         mov             v4.16b, v5.16b
496         b               .Lxtsdecout
497 #else
        /* tweaks i..i+3 in v4..v7; the last next_tweak clobbers the v7
         * constant, hence the reload at .LxtsdecloopNx */
498         ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 ct blocks */
499         next_tweak      v5, v4, v7, v8
500         eor             v0.16b, v0.16b, v4.16b
501         next_tweak      v6, v5, v7, v8
502         eor             v1.16b, v1.16b, v5.16b
503         eor             v2.16b, v2.16b, v6.16b
504         next_tweak      v7, v6, v7, v8
505         eor             v3.16b, v3.16b, v7.16b
506         do_decrypt_block4x
507         eor             v3.16b, v3.16b, v7.16b
508         eor             v0.16b, v0.16b, v4.16b
509         eor             v1.16b, v1.16b, v5.16b
510         eor             v2.16b, v2.16b, v6.16b
511         st1             {v0.16b-v3.16b}, [x0], #64
512         mov             v4.16b, v7.16b
513         cbz             w4, .Lxtsdecout
514         b               .LxtsdecloopNx
515 #endif
516 .Lxtsdec1x:
        /* undo the subtraction; w4 = leftover blocks (< INTERLEAVE) */
517         adds            w4, w4, #INTERLEAVE
518         beq             .Lxtsdecout
519 #endif
520 .Lxtsdecloop:
521         ld1             {v1.16b}, [x1], #16
522         eor             v0.16b, v1.16b, v4.16b
523         decrypt_block   v0, w3, x2, x6, w7
524         eor             v0.16b, v0.16b, v4.16b
525         st1             {v0.16b}, [x0], #16
526         subs            w4, w4, #1
527         beq             .Lxtsdecout
528         next_tweak      v4, v4, v7, v8
529         b               .Lxtsdecloop
530 .Lxtsdecout:
531         FRAME_POP
532         ret
533 AES_ENDPROC(aes_xts_decrypt)