1#include "arm_asm.h"
2.text
3
// iotas: table of 24 64-bit constants, read one per loop iteration by
// KeccakF1600_int (ldr ...,[x27],#8) and KeccakF1600_ce (ld1r ...,[x10],#8).
//
// Layout is load-bearing: the .align 8 below is intended to give 256-byte
// alignment, and the 8 zero quads (64 bytes) of padding plus the 24 table
// quads (192 bytes) add up to exactly 256 bytes. The address just past the
// last entry therefore has its low 8 bits clear, which is what
// KeccakF1600_int tests (tst x27,#255) to decide that all constants have
// been consumed. Do not add, remove or reorder entries or padding without
// revisiting that test.
4.align    8         // strategic alignment and padding that allows to use
5                    // address value as loop termination condition...
6.quad     0,0,0,0,0,0,0,0
7.type     iotas,%object
8iotas:
9.quad     0x0000000000000001
10.quad     0x0000000000008082
11.quad     0x800000000000808a
12.quad     0x8000000080008000
13.quad     0x000000000000808b
14.quad     0x0000000080000001
15.quad     0x8000000080008081
16.quad     0x8000000000008009
17.quad     0x000000000000008a
18.quad     0x0000000000000088
19.quad     0x0000000080008009
20.quad     0x000000008000000a
21.quad     0x000000008000808b
22.quad     0x800000000000008b
23.quad     0x8000000000008089
24.quad     0x8000000000008003
25.quad     0x8000000000008002
26.quad     0x8000000000000080
27.quad     0x000000000000800a
28.quad     0x800000008000000a
29.quad     0x8000000080008081
30.quad     0x8000000000008080
31.quad     0x0000000080000001
32.quad     0x8000000080008008
33.size     iotas,.-iotas
// KeccakF1600_int: internal permutation loop operating on a state that is
// already held in general-purpose registers.
//
// In/out: the 25 state lanes live in x0-x17, x25, x19-x24 (in that order,
//         matching the load/store sequences of the callers in this file);
//         x18 is never touched.
// Scratch: x26, x27, x28, x30 and the condition flags.
// Stack:  does NOT allocate a frame. It uses 32 bytes of the CALLER's frame:
//           [sp,#0]  / [sp,#8]   temporary home for x4 and x9 during Theta
//           [sp,#16]             running pointer into the iotas table
//           [sp,#24]             saved (signed) link register
//         Callers must reserve these 32 bytes at the bottom of their frame.
// Loop:   one pass through .Loop is one round; the loop ends when the iotas
//         pointer, after post-increment, has its low 8 bits clear (see the
//         layout of the iotas table: 24 entries ending on a 256-byte
//         boundary).
34.type     KeccakF1600_int,%function
35.align    5
36KeccakF1600_int:
37          adr       x28,iotas
38.inst     0xd503233f                              // paciasp
// x28 (table pointer) goes to [sp,#16], x30 (return address) to [sp,#24];
// both registers are reused as scratch inside the loop.
39          stp       x28,x30,[sp,#16]              // 32 bytes on top are mine
40          b         .Loop
41.align    4
42.Loop:
43          ////////////////////////////////////////// Theta
// Column parities are accumulated in x26, x27, x28, x30 and x4. x4 and x9
// are state lanes that get overwritten as temporaries below, so they are
// parked on the stack first and reloaded later.
44          eor       x26,x0,x5
45          stp       x4,x9,[sp,#0]       // offload pair...
46          eor       x27,x1,x6
47          eor       x28,x2,x7
48          eor       x30,x3,x8
49          eor       x4,x4,x9
50          eor       x26,x26,x10
51          eor       x27,x27,x11
52          eor       x28,x28,x12
53          eor       x30,x30,x13
54          eor       x4,x4,x14
55          eor       x26,x26,x15
56          eor       x27,x27,x16
57          eor       x28,x28,x17
58          eor       x30,x30,x25
59          eor       x4,x4,x19
60          eor       x26,x26,x20
61          eor       x28,x28,x22
62          eor       x27,x27,x21
63          eor       x30,x30,x23
64          eor       x4,x4,x24
65
// ror#63 on the second operand is a rotate-left by 1. x9 is used as the
// temporary for the value XORed into a whole column.
66          eor       x9,x26,x28,ror#63
67
68          eor       x1,x1,x9
69          eor       x6,x6,x9
70          eor       x11,x11,x9
71          eor       x16,x16,x9
72          eor       x21,x21,x9
73
74          eor       x9,x27,x30,ror#63
75          eor       x28,x28,x4,ror#63
76          eor       x30,x30,x26,ror#63
77          eor       x4,x4,x27,ror#63
78
// The three "eor xN, xM, ..." lines annotated with "mov" below write their
// result into a scratch register instead of back into xM: the XOR and the
// register move needed by Rho+Pi are done in one instruction.
79          eor       x27,   x2,x9                  // mov    x27,x2
80          eor       x7,x7,x9
81          eor       x12,x12,x9
82          eor       x17,x17,x9
83          eor       x22,x22,x9
84
85          eor       x0,x0,x4
86          eor       x5,x5,x4
87          eor       x10,x10,x4
88          eor       x15,x15,x4
89          eor       x20,x20,x4
90          ldp       x4,x9,[sp,#0]       // re-load offloaded data
91          eor       x26,   x3,x28                 // mov    x26,x3
92          eor       x8,x8,x28
93          eor       x13,x13,x28
94          eor       x25,x25,x28
95          eor       x23,x23,x28
96
97          eor       x28,   x4,x30                 // mov    x28,x4
98          eor       x9,x9,x30
99          eor       x14,x14,x30
100          eor       x19,x19,x30
101          eor       x24,x24,x30
102
103          ////////////////////////////////////////// Rho+Pi
// Each ror by #64-n is a rotate-left by n. The sequence is ordered so that
// every source register is read before it is overwritten; x30, x27, x26 and
// x28 hold the lanes that were moved aside (x1 here, x2/x3/x4 in Theta).
104          mov       x30,x1
105          ror       x1,x6,#64-44
106          //mov     x27,x2
107          ror       x2,x12,#64-43
108          //mov     x26,x3
109          ror       x3,x25,#64-21
110          //mov     x28,x4
111          ror       x4,x24,#64-14
112
113          ror       x6,x9,#64-20
114          ror       x12,x13,#64-25
115          ror       x25,x17,#64-15
116          ror       x24,x21,#64-2
117
118          ror       x9,x22,#64-61
119          ror       x13,x19,#64-8
120          ror       x17,x11,#64-10
121          ror       x21,x8,#64-55
122
123          ror       x22,x14,#64-39
124          ror       x19,x23,#64-56
125          ror       x11,x7,#64-6
126          ror       x8,x16,#64-45
127
128          ror       x14,x20,#64-18
129          ror       x23,x15,#64-41
130          ror       x7,x10,#64-3
131          ror       x16,x5,#64-36
132
133          ror       x5,x26,#64-28
134          ror       x10,x30,#64-1
135          ror       x15,x28,#64-27
136          ror       x20,x27,#64-62
137
138          ////////////////////////////////////////// Chi+Iota
// Each group of five lanes is updated with bic (and-not) followed by eor.
// The first group is interleaved with fetching the next iota constant.
139          bic       x26,x2,x1
140          bic       x27,x3,x2
141          bic       x28,x0,x4
142          bic       x30,x1,x0
143          eor       x0,x0,x26
144          bic       x26,x4,x3
145          eor       x1,x1,x27
146          ldr       x27,[sp,#16]
147          eor       x3,x3,x28
148          eor       x4,x4,x30
149          eor       x2,x2,x26
150          ldr       x30,[x27],#8                  // Iota[i++]
151
152          bic       x26,x7,x6
// The flags set by this tst are consumed by the bne at the bottom of the
// loop; none of the bic/eor/str instructions in between modify the flags.
153          tst       x27,#255                      // are we done?
154          str       x27,[sp,#16]
155          bic       x27,x8,x7
156          bic       x28,x5,x9
157          eor       x0,x0,x30           // A[0][0] ^= Iota
158          bic       x30,x6,x5
159          eor       x5,x5,x26
160          bic       x26,x9,x8
161          eor       x6,x6,x27
162          eor       x8,x8,x28
163          eor       x9,x9,x30
164          eor       x7,x7,x26
165
166          bic       x26,x12,x11
167          bic       x27,x13,x12
168          bic       x28,x10,x14
169          bic       x30,x11,x10
170          eor       x10,x10,x26
171          bic       x26,x14,x13
172          eor       x11,x11,x27
173          eor       x13,x13,x28
174          eor       x14,x14,x30
175          eor       x12,x12,x26
176
177          bic       x26,x17,x16
178          bic       x27,x25,x17
179          bic       x28,x15,x19
180          bic       x30,x16,x15
181          eor       x15,x15,x26
182          bic       x26,x19,x25
183          eor       x16,x16,x27
184          eor       x25,x25,x28
185          eor       x19,x19,x30
186          eor       x17,x17,x26
187
188          bic       x26,x22,x21
189          bic       x27,x23,x22
190          bic       x28,x20,x24
191          bic       x30,x21,x20
192          eor       x20,x20,x26
193          bic       x26,x24,x23
194          eor       x21,x21,x27
195          eor       x23,x23,x28
196          eor       x24,x24,x30
197          eor       x22,x22,x26
198
199          bne       .Loop
200
// Recover the return address that was stored at entry (x30 was used as
// scratch throughout the loop) before authenticating and returning.
201          ldr       x30,[sp,#24]
202.inst     0xd50323bf                              // autiasp
203          ret
204.size     KeccakF1600_int,.-KeccakF1600_int
205
// KeccakF1600: apply the permutation to a state held in memory.
//
// In:     x0 = pointer to the state, 25 consecutive 64-bit lanes
//              (offsets 0 .. 16*12+8 are read and written back).
// Out:    state updated in place; no meaningful return value is set here.
// Saves:  x19-x28, x29, x30 in a 128-byte frame (x29 is the frame pointer).
// Stack:  a further 48 bytes below the frame:
//           [sp,#0..#31]  scratch area required by KeccakF1600_int
//           [sp,#32]      the state pointer, kept across the call
// Lane-to-register mapping: x0-x17, x25, x19-x24 (x18 is not used).
206.type     KeccakF1600,%function
207.align    5
208KeccakF1600:
209.inst     0xd503233f                              // paciasp
210          stp       x29,x30,[sp,#-128]!
211          add       x29,sp,#0
212          stp       x19,x20,[sp,#16]
213          stp       x21,x22,[sp,#32]
214          stp       x23,x24,[sp,#48]
215          stp       x25,x26,[sp,#64]
216          stp       x27,x28,[sp,#80]
217          sub       sp,sp,#48
218
219          str       x0,[sp,#32]                             // offload argument
// x0 is about to become lane 0, so the base pointer is copied to x26 first.
220          mov       x26,x0
221          ldp       x0,x1,[x0,#16*0]
222          ldp       x2,x3,[x26,#16*1]
223          ldp       x4,x5,[x26,#16*2]
224          ldp       x6,x7,[x26,#16*3]
225          ldp       x8,x9,[x26,#16*4]
226          ldp       x10,x11,[x26,#16*5]
227          ldp       x12,x13,[x26,#16*6]
228          ldp       x14,x15,[x26,#16*7]
229          ldp       x16,x17,[x26,#16*8]
230          ldp       x25,x19,[x26,#16*9]
231          ldp       x20,x21,[x26,#16*10]
232          ldp       x22,x23,[x26,#16*11]
233          ldr       x24,[x26,#16*12]
234
235          bl        KeccakF1600_int
236
// x26 is scratch inside KeccakF1600_int, so the state pointer is reloaded
// from the stack slot rather than assumed to be still in a register.
237          ldr       x26,[sp,#32]
238          stp       x0,x1,[x26,#16*0]
239          stp       x2,x3,[x26,#16*1]
240          stp       x4,x5,[x26,#16*2]
241          stp       x6,x7,[x26,#16*3]
242          stp       x8,x9,[x26,#16*4]
243          stp       x10,x11,[x26,#16*5]
244          stp       x12,x13,[x26,#16*6]
245          stp       x14,x15,[x26,#16*7]
246          stp       x16,x17,[x26,#16*8]
247          stp       x25,x19,[x26,#16*9]
248          stp       x20,x21,[x26,#16*10]
249          stp       x22,x23,[x26,#16*11]
250          str       x24,[x26,#16*12]
251
// Callee-saved registers are restored relative to x29 (the frame pointer),
// so the sp adjustment can be interleaved with the reloads.
252          ldp       x19,x20,[x29,#16]
253          add       sp,sp,#48
254          ldp       x21,x22,[x29,#32]
255          ldp       x23,x24,[x29,#48]
256          ldp       x25,x26,[x29,#64]
257          ldp       x27,x28,[x29,#80]
258          ldp       x29,x30,[sp],#128
259.inst     0xd50323bf                              // autiasp
260          ret
261.size     KeccakF1600,.-KeccakF1600
262
// SHA3_absorb: XOR whole input blocks into the state and permute after each.
//
// In:     x0 = uint64_t A[5][5]   state, 25 64-bit lanes
//         x1 = const void *inp    input data
//         x2 = size_t len         number of input bytes available
//         x3 = size_t bsz         block size in bytes
// Out:    x0 = number of input bytes left unprocessed (the value of len at
//              the point where len < bsz); the state is written back to A.
// Saves:  x19-x28, x29, x30 in a 128-byte frame (x29 is the frame pointer).
// Stack:  a further 64 bytes below the frame:
//           [sp,#0..#31]  scratch area required by KeccakF1600_int
//           [sp,#32]      A
//           [sp,#40]      inp (updated after each block)
//           [sp,#48]      len (updated to len - bsz for each block)
//           [sp,#56]      bsz
// Input is consumed 8 bytes at a time, so bsz is presumably a multiple of 8
// no larger than 200 - TODO confirm against callers.
// NOTE(review): a bsz of 0 would never make "len - bsz" borrow, so the loop
// below would not terminate; callers are assumed to pass a non-zero bsz.
263.globl    SHA3_absorb
264.type     SHA3_absorb,%function
265.align    5
266SHA3_absorb:
267.inst     0xd503233f                              // paciasp
268          stp       x29,x30,[sp,#-128]!
269          add       x29,sp,#0
270          stp       x19,x20,[sp,#16]
271          stp       x21,x22,[sp,#32]
272          stp       x23,x24,[sp,#48]
273          stp       x25,x26,[sp,#64]
274          stp       x27,x28,[sp,#80]
275          sub       sp,sp,#64
276
277          stp       x0,x1,[sp,#32]                          // offload arguments
278          stp       x2,x3,[sp,#48]
279
// The argument registers x0-x3 become state lanes, so the arguments are
// moved to x26-x28 and x30 before the state is loaded.
280          mov       x26,x0                        // uint64_t A[5][5]
281          mov       x27,x1                        // const void *inp
282          mov       x28,x2                        // size_t len
283          mov       x30,x3                        // size_t bsz
284          ldp       x0,x1,[x26,#16*0]
285          ldp       x2,x3,[x26,#16*1]
286          ldp       x4,x5,[x26,#16*2]
287          ldp       x6,x7,[x26,#16*3]
288          ldp       x8,x9,[x26,#16*4]
289          ldp       x10,x11,[x26,#16*5]
290          ldp       x12,x13,[x26,#16*6]
291          ldp       x14,x15,[x26,#16*7]
292          ldp       x16,x17,[x26,#16*8]
293          ldp       x25,x19,[x26,#16*9]
294          ldp       x20,x21,[x26,#16*10]
295          ldp       x22,x23,[x26,#16*11]
296          ldr       x24,[x26,#16*12]
297          b         .Loop_absorb
298
299.align    4
300.Loop_absorb:
// Leave the loop as soon as fewer than bsz bytes remain (unsigned compare).
301          subs      x26,x28,x30                   // len - bsz
302          blo       .Labsorbed
303
304          str       x26,[sp,#48]                            // save len - bsz
// Unrolled absorb ladder. Each step loads 8 input bytes (byte-reversed on
// big-endian builds) and XORs them into the next lane. One cmp against
// 8*(k+2) serves two lanes: "blo" stops after the lane just absorbed when
// bsz is smaller than that bound, and the following "beq" stops one lane
// later when bsz equals it. The flags survive in between because ldr, rev
// and eor do not modify them.
305          ldr       x26,[x27],#8                  // *inp++
306#ifdef    __AARCH64EB__
307          rev       x26,x26
308#endif
309          eor       x0,x0,x26
310          cmp       x30,#8*(0+2)
311          blo       .Lprocess_block
312          ldr       x26,[x27],#8                  // *inp++
313#ifdef    __AARCH64EB__
314          rev       x26,x26
315#endif
316          eor       x1,x1,x26
317          beq       .Lprocess_block
318          ldr       x26,[x27],#8                  // *inp++
319#ifdef    __AARCH64EB__
320          rev       x26,x26
321#endif
322          eor       x2,x2,x26
323          cmp       x30,#8*(2+2)
324          blo       .Lprocess_block
325          ldr       x26,[x27],#8                  // *inp++
326#ifdef    __AARCH64EB__
327          rev       x26,x26
328#endif
329          eor       x3,x3,x26
330          beq       .Lprocess_block
331          ldr       x26,[x27],#8                  // *inp++
332#ifdef    __AARCH64EB__
333          rev       x26,x26
334#endif
335          eor       x4,x4,x26
336          cmp       x30,#8*(4+2)
337          blo       .Lprocess_block
338          ldr       x26,[x27],#8                  // *inp++
339#ifdef    __AARCH64EB__
340          rev       x26,x26
341#endif
342          eor       x5,x5,x26
343          beq       .Lprocess_block
344          ldr       x26,[x27],#8                  // *inp++
345#ifdef    __AARCH64EB__
346          rev       x26,x26
347#endif
348          eor       x6,x6,x26
349          cmp       x30,#8*(6+2)
350          blo       .Lprocess_block
351          ldr       x26,[x27],#8                  // *inp++
352#ifdef    __AARCH64EB__
353          rev       x26,x26
354#endif
355          eor       x7,x7,x26
356          beq       .Lprocess_block
357          ldr       x26,[x27],#8                  // *inp++
358#ifdef    __AARCH64EB__
359          rev       x26,x26
360#endif
361          eor       x8,x8,x26
362          cmp       x30,#8*(8+2)
363          blo       .Lprocess_block
364          ldr       x26,[x27],#8                  // *inp++
365#ifdef    __AARCH64EB__
366          rev       x26,x26
367#endif
368          eor       x9,x9,x26
369          beq       .Lprocess_block
370          ldr       x26,[x27],#8                  // *inp++
371#ifdef    __AARCH64EB__
372          rev       x26,x26
373#endif
374          eor       x10,x10,x26
375          cmp       x30,#8*(10+2)
376          blo       .Lprocess_block
377          ldr       x26,[x27],#8                  // *inp++
378#ifdef    __AARCH64EB__
379          rev       x26,x26
380#endif
381          eor       x11,x11,x26
382          beq       .Lprocess_block
383          ldr       x26,[x27],#8                  // *inp++
384#ifdef    __AARCH64EB__
385          rev       x26,x26
386#endif
387          eor       x12,x12,x26
388          cmp       x30,#8*(12+2)
389          blo       .Lprocess_block
390          ldr       x26,[x27],#8                  // *inp++
391#ifdef    __AARCH64EB__
392          rev       x26,x26
393#endif
394          eor       x13,x13,x26
395          beq       .Lprocess_block
396          ldr       x26,[x27],#8                  // *inp++
397#ifdef    __AARCH64EB__
398          rev       x26,x26
399#endif
400          eor       x14,x14,x26
401          cmp       x30,#8*(14+2)
402          blo       .Lprocess_block
403          ldr       x26,[x27],#8                  // *inp++
404#ifdef    __AARCH64EB__
405          rev       x26,x26
406#endif
407          eor       x15,x15,x26
408          beq       .Lprocess_block
409          ldr       x26,[x27],#8                  // *inp++
410#ifdef    __AARCH64EB__
411          rev       x26,x26
412#endif
413          eor       x16,x16,x26
414          cmp       x30,#8*(16+2)
415          blo       .Lprocess_block
416          ldr       x26,[x27],#8                  // *inp++
417#ifdef    __AARCH64EB__
418          rev       x26,x26
419#endif
420          eor       x17,x17,x26
421          beq       .Lprocess_block
422          ldr       x26,[x27],#8                  // *inp++
423#ifdef    __AARCH64EB__
424          rev       x26,x26
425#endif
426          eor       x25,x25,x26
427          cmp       x30,#8*(18+2)
428          blo       .Lprocess_block
429          ldr       x26,[x27],#8                  // *inp++
430#ifdef    __AARCH64EB__
431          rev       x26,x26
432#endif
433          eor       x19,x19,x26
434          beq       .Lprocess_block
435          ldr       x26,[x27],#8                  // *inp++
436#ifdef    __AARCH64EB__
437          rev       x26,x26
438#endif
439          eor       x20,x20,x26
440          cmp       x30,#8*(20+2)
441          blo       .Lprocess_block
442          ldr       x26,[x27],#8                  // *inp++
443#ifdef    __AARCH64EB__
444          rev       x26,x26
445#endif
446          eor       x21,x21,x26
447          beq       .Lprocess_block
448          ldr       x26,[x27],#8                  // *inp++
449#ifdef    __AARCH64EB__
450          rev       x26,x26
451#endif
452          eor       x22,x22,x26
453          cmp       x30,#8*(22+2)
454          blo       .Lprocess_block
455          ldr       x26,[x27],#8                  // *inp++
456#ifdef    __AARCH64EB__
457          rev       x26,x26
458#endif
459          eor       x23,x23,x26
460          beq       .Lprocess_block
461          ldr       x26,[x27],#8                  // *inp++
462#ifdef    __AARCH64EB__
463          rev       x26,x26
464#endif
465          eor       x24,x24,x26
466
467.Lprocess_block:
// x27, x28 and x30 are scratch inside KeccakF1600_int, so inp is spilled
// here and inp/len/bsz are reloaded from the stack after the call.
468          str       x27,[sp,#40]                            // save inp
469
470          bl        KeccakF1600_int
471
472          ldr       x27,[sp,#40]                            // restore arguments
473          ldp       x28,x30,[sp,#48]
474          b         .Loop_absorb
475
476.align    4
477.Labsorbed:
// Write the lanes back to A (pointer reloaded from its stack slot) and
// return the leftover length, which is still in x28.
478          ldr       x27,[sp,#32]
479          stp       x0,x1,[x27,#16*0]
480          stp       x2,x3,[x27,#16*1]
481          stp       x4,x5,[x27,#16*2]
482          stp       x6,x7,[x27,#16*3]
483          stp       x8,x9,[x27,#16*4]
484          stp       x10,x11,[x27,#16*5]
485          stp       x12,x13,[x27,#16*6]
486          stp       x14,x15,[x27,#16*7]
487          stp       x16,x17,[x27,#16*8]
488          stp       x25,x19,[x27,#16*9]
489          stp       x20,x21,[x27,#16*10]
490          stp       x22,x23,[x27,#16*11]
491          str       x24,[x27,#16*12]
492
493          mov       x0,x28                        // return value
494          ldp       x19,x20,[x29,#16]
495          add       sp,sp,#64
496          ldp       x21,x22,[x29,#32]
497          ldp       x23,x24,[x29,#48]
498          ldp       x25,x26,[x29,#64]
499          ldp       x27,x28,[x29,#80]
500          ldp       x29,x30,[sp],#128
501.inst     0xd50323bf                              // autiasp
502          ret
503.size     SHA3_absorb,.-SHA3_absorb
// SHA3_squeeze: copy bytes out of the state, permuting whenever a full
// block has been emitted and more output is still required.
//
// In:     x0 = pointer to the state (also passed to KeccakF1600)
//         x1 = output buffer
//         x2 = number of output bytes to produce
//         x3 = number of state bytes emitted between permutations
//              (presumably the same "bsz" as in SHA3_absorb - TODO confirm)
// Out:    output buffer filled; no meaningful return value is set here.
// Saves:  x19-x22, x29, x30 in a 48-byte frame.
// Register roles inside the loop:
//         x19 = state base, x20 = output cursor, x21 = bytes still to emit,
//         x22 = block size, x0 = read cursor in the state,
//         x3  = bytes left in the current block, x4 = current lane.
// A zero output length returns immediately without touching the output
// buffer (previously the byte tail below would have stored 7 bytes, because
// its countdown only stops on reaching exactly zero).
504.globl    SHA3_squeeze
505.type     SHA3_squeeze,%function
506.align    5
507SHA3_squeeze:
508.inst     0xd503233f                    // paciasp
509          stp       x29,x30,[sp,#-48]!
510          add       x29,sp,#0
511          stp       x19,x20,[sp,#16]
512          stp       x21,x22,[sp,#32]
513
514          mov       x19,x0                        // put aside arguments
515          mov       x20,x1
516          mov       x21,x2
517          mov       x22,x3
518
          cbz       x21,.Lsqueeze_done            // nothing requested: store nothing

519.Loop_squeeze:
// Invariant: x21 > 0 here. Full 8-byte lanes are stored directly; fewer
// than 8 remaining bytes are handled by the byte-wise tail.
520          ldr       x4,[x0],#8
521          cmp       x21,#8
522          blo       .Lsqueeze_tail
523#ifdef    __AARCH64EB__
524          rev       x4,x4
525#endif
526          str       x4,[x20],#8
527          subs      x21,x21,#8
528          beq       .Lsqueeze_done
529
// More output is needed: keep reading the current block while bytes remain
// in it, otherwise permute and restart from the beginning of the state.
530          subs      x3,x3,#8
531          bhi       .Loop_squeeze
532
533          mov       x0,x19
534          bl        KeccakF1600
535          mov       x0,x19
536          mov       x3,x22
537          b         .Loop_squeeze
538
539.align    4
540.Lsqueeze_tail:
// 1..7 bytes left: emit the lane in x4 one byte at a time, least
// significant byte first, stopping when the count reaches zero. The seventh
// store needs no countdown because at most 7 bytes can remain here.
541          strb      w4,[x20],#1
542          lsr       x4,x4,#8
543          subs      x21,x21,#1
544          beq       .Lsqueeze_done
545          strb      w4,[x20],#1
546          lsr       x4,x4,#8
547          subs      x21,x21,#1
548          beq       .Lsqueeze_done
549          strb      w4,[x20],#1
550          lsr       x4,x4,#8
551          subs      x21,x21,#1
552          beq       .Lsqueeze_done
553          strb      w4,[x20],#1
554          lsr       x4,x4,#8
555          subs      x21,x21,#1
556          beq       .Lsqueeze_done
557          strb      w4,[x20],#1
558          lsr       x4,x4,#8
559          subs      x21,x21,#1
560          beq       .Lsqueeze_done
561          strb      w4,[x20],#1
562          lsr       x4,x4,#8
563          subs      x21,x21,#1
564          beq       .Lsqueeze_done
565          strb      w4,[x20],#1
566
567.Lsqueeze_done:
568          ldp       x19,x20,[sp,#16]
569          ldp       x21,x22,[sp,#32]
570          ldp       x29,x30,[sp],#48
571.inst     0xd50323bf                              // autiasp
572          ret
573.size     SHA3_squeeze,.-SHA3_squeeze
// KeccakF1600_ce: permutation loop on a state held in vector registers,
// using instructions that are emitted as raw .inst words (the intended
// mnemonic - eor3, rax1, xar, bcax - is given in the comment on each line).
//
// In/out: the 25 state lanes live in v0-v24 (callers in this file load and
//         store them through the d0-d24 views).
// Scratch: v25-v31, x9 (round counter, starts at 24), x10 (pointer into the
//         iotas table) and the condition flags.
// Stack:  none; this is a leaf routine and does not touch x30.
// Note:   v8-v15 are overwritten here; preserving them is left to the
//         caller (see the d8-d15 saves in KeccakF1600_cext).
// The comments of the form "A[i][j]=A[k][l]" and "C[n]=..."/"D[n]=..." are
// the original annotations describing which value a register holds after
// the instruction.
574.type     KeccakF1600_ce,%function
575.align    5
576KeccakF1600_ce:
577          mov       x9,#24
578          adr       x10,iotas
579          b         .Loop_ce
580.align    4
581.Loop_ce:
582          ////////////////////////////////////////////////// Theta
// Three-way XORs fold each column of five lanes into v25-v29.
583.inst     0xce0f2a99          //eor3 v25.16b,v20.16b,v15.16b,v10.16b
584.inst     0xce102eba          //eor3 v26.16b,v21.16b,v16.16b,v11.16b
585.inst     0xce1132db          //eor3 v27.16b,v22.16b,v17.16b,v12.16b
586.inst     0xce1236fc          //eor3 v28.16b,v23.16b,v18.16b,v13.16b
587.inst     0xce133b1d          //eor3 v29.16b,v24.16b,v19.16b,v14.16b
588.inst     0xce050339          //eor3 v25.16b,v25.16b,   v5.16b,v0.16b
589.inst     0xce06075a          //eor3 v26.16b,v26.16b,   v6.16b,v1.16b
590.inst     0xce070b7b          //eor3 v27.16b,v27.16b,   v7.16b,v2.16b
591.inst     0xce080f9c          //eor3 v28.16b,v28.16b,   v8.16b,v3.16b
592.inst     0xce0913bd          //eor3 v29.16b,v29.16b,   v9.16b,v4.16b
593
594.inst     0xce7b8f3e          //rax1 v30.16b,v25.16b,v27.16b                              // D[1]
595.inst     0xce7c8f5f          //rax1 v31.16b,v26.16b,v28.16b                              // D[2]
596.inst     0xce7d8f7b          //rax1 v27.16b,v27.16b,v29.16b                              // D[3]
597.inst     0xce798f9c          //rax1 v28.16b,v28.16b,v25.16b                              // D[4]
598.inst     0xce7a8fbd          //rax1 v29.16b,v29.16b,v26.16b                              // D[0]
599
600          ////////////////////////////////////////////////// Theta+Rho+Pi
// xar applies the D value and the rotation in one step. Lanes are
// processed in chains so each destination has already been read as a source.
601.inst     0xce9efc39          //xar v25.16b,   v1.16b,v30.16b,#64-1 // C[0]=A[2][0]
602
603.inst     0xce9e50c1          //xar v1.16b,v6.16b,v30.16b,#64-44
604.inst     0xce9cb126          //xar v6.16b,v9.16b,v28.16b,#64-20
605.inst     0xce9f0ec9          //xar v9.16b,v22.16b,v31.16b,#64-61
606.inst     0xce9c65d6          //xar v22.16b,v14.16b,v28.16b,#64-39
607.inst     0xce9dba8e          //xar v14.16b,v20.16b,v29.16b,#64-18
608
609.inst     0xce9f085a          //xar v26.16b,   v2.16b,v31.16b,#64-62 // C[1]=A[4][0]
610
611.inst     0xce9f5582          //xar v2.16b,v12.16b,v31.16b,#64-43
612.inst     0xce9b9dac          //xar v12.16b,v13.16b,v27.16b,#64-25
613.inst     0xce9ce26d          //xar v13.16b,v19.16b,v28.16b,#64-8
614.inst     0xce9b22f3          //xar v19.16b,v23.16b,v27.16b,#64-56
615.inst     0xce9d5df7          //xar v23.16b,v15.16b,v29.16b,#64-41
616
617.inst     0xce9c948f          //xar v15.16b,v4.16b,v28.16b,#64-27
618
619.inst     0xce9ccb1c          //xar v28.16b,   v24.16b,v28.16b,#64-14 // D[4]=A[0][4]
620.inst     0xce9efab8          //xar v24.16b,v21.16b,v30.16b,#64-2
621.inst     0xce9b2508          //xar v8.16b,v8.16b,v27.16b,#64-55 // A[1][3]=A[4][1]
622.inst     0xce9e4e04          //xar v4.16b,v16.16b,v30.16b,#64-45 // A[0][4]=A[1][3]
623.inst     0xce9d70b0          //xar v16.16b,v5.16b,v29.16b,#64-36
624
625.inst     0xce9b9065          //xar v5.16b,v3.16b,v27.16b,#64-28
626
// Lane 0 is not rotated, so a plain eor with v29 is used instead of xar.
627          eor       v0.16b,v0.16b,v29.16b
628
629.inst     0xce9bae5b          //xar v27.16b,   v18.16b,v27.16b,#64-21 // D[3]=A[0][3]
630.inst     0xce9fc623          //xar v3.16b,v17.16b,v31.16b,#64-15 // A[0][3]=A[3][3]
631.inst     0xce9ed97e          //xar v30.16b,   v11.16b,v30.16b,#64-10 // D[1]=A[3][2]
632.inst     0xce9fe8ff          //xar v31.16b,   v7.16b,v31.16b,#64-6 // D[2]=A[2][1]
633.inst     0xce9df55d          //xar v29.16b,   v10.16b,v29.16b,#64-3 // D[0]=A[1][2]
634
635          ////////////////////////////////////////////////// Chi+Iota
636.inst     0xce362354          //bcax v20.16b,v26.16b,   v22.16b,v8.16b          // A[1][3]=A[4][1]
637.inst     0xce375915          //bcax v21.16b,v8.16b,v23.16b,v22.16b   // A[1][3]=A[4][1]
638.inst     0xce385ed6          //bcax v22.16b,v22.16b,v24.16b,v23.16b
639.inst     0xce3a62f7          //bcax v23.16b,v23.16b,v26.16b,   v24.16b
640.inst     0xce286b18          //bcax v24.16b,v24.16b,v8.16b,v26.16b   // A[1][3]=A[4][1]
641
// v26 is free from here on, so it is reused to hold the next iota constant
// (replicated into both 64-bit elements) until the final eor into v0.
642          ld1r      {v26.2d},[x10],#8
643
644.inst     0xce330fd1          //bcax v17.16b,v30.16b,   v19.16b,v3.16b          // A[0][3]=A[3][3]
645.inst     0xce2f4c72          //bcax v18.16b,v3.16b,v15.16b,v19.16b   // A[0][3]=A[3][3]
646.inst     0xce303e73          //bcax v19.16b,v19.16b,v16.16b,v15.16b
647.inst     0xce3e41ef          //bcax v15.16b,v15.16b,v30.16b,   v16.16b
648.inst     0xce237a10          //bcax v16.16b,v16.16b,v3.16b,v30.16b   // A[0][3]=A[3][3]
649
650.inst     0xce2c7f2a          //bcax v10.16b,v25.16b,   v12.16b,v31.16b
651.inst     0xce2d33eb          //bcax v11.16b,v31.16b,   v13.16b,v12.16b
652.inst     0xce2e358c          //bcax v12.16b,v12.16b,v14.16b,v13.16b
653.inst     0xce3939ad          //bcax v13.16b,v13.16b,v25.16b,   v14.16b
654.inst     0xce3f65ce          //bcax v14.16b,v14.16b,v31.16b,   v25.16b
655
656.inst     0xce2913a7          //bcax v7.16b,v29.16b,   v9.16b,v4.16b  // A[0][4]=A[1][3]
657.inst     0xce252488          //bcax v8.16b,v4.16b,v5.16b,v9.16b      // A[0][4]=A[1][3]
658.inst     0xce261529          //bcax v9.16b,v9.16b,v6.16b,v5.16b
659.inst     0xce3d18a5          //bcax v5.16b,v5.16b,v29.16b,   v6.16b
660.inst     0xce2474c6          //bcax v6.16b,v6.16b,v4.16b,v29.16b     // A[0][4]=A[1][3]
661
662.inst     0xce207363          //bcax v3.16b,v27.16b,   v0.16b,v28.16b
663.inst     0xce210384          //bcax v4.16b,v28.16b,   v1.16b,v0.16b
664.inst     0xce220400          //bcax v0.16b,v0.16b,v2.16b,v1.16b
665.inst     0xce3b0821          //bcax v1.16b,v1.16b,v27.16b,   v2.16b
666.inst     0xce3c6c42          //bcax v2.16b,v2.16b,v28.16b,   v27.16b
667
// Iota: XOR the constant fetched above into lane 0.
668          eor       v0.16b,v0.16b,v26.16b
669
670          subs      x9,x9,#1
671          bne       .Loop_ce
672
673          ret
674.size     KeccakF1600_ce,.-KeccakF1600_ce
675
// KeccakF1600_cext: apply the permutation to a state held in memory, using
// the vector-register implementation KeccakF1600_ce.
//
// In:     x0 = pointer to the state, 25 consecutive 64-bit lanes.
// Out:    state updated in place; x0 still holds the state pointer on
//         return because KeccakF1600_ce only uses x9 and x10.
// Saves:  x29, x30 and d8-d15 in an 80-byte frame. d8-d15 must be saved
//         because the state is loaded into d0-d24, which overlaps them.
676.type     KeccakF1600_cext,%function
677.align    5
678KeccakF1600_cext:
679.inst     0xd503233f                    // paciasp
680          stp       x29,x30,[sp,#-80]!
681          add       x29,sp,#0
682          stp       d8,d9,[sp,#16]                // per ABI requirement
683          stp       d10,d11,[sp,#32]
684          stp       d12,d13,[sp,#48]
685          stp       d14,d15,[sp,#64]
686          ldp       d0,d1,[x0,#8*0]
687          ldp       d2,d3,[x0,#8*2]
688          ldp       d4,d5,[x0,#8*4]
689          ldp       d6,d7,[x0,#8*6]
690          ldp       d8,d9,[x0,#8*8]
691          ldp       d10,d11,[x0,#8*10]
692          ldp       d12,d13,[x0,#8*12]
693          ldp       d14,d15,[x0,#8*14]
694          ldp       d16,d17,[x0,#8*16]
695          ldp       d18,d19,[x0,#8*18]
696          ldp       d20,d21,[x0,#8*20]
697          ldp       d22,d23,[x0,#8*22]
698          ldr       d24,[x0,#8*24]
699          bl        KeccakF1600_ce
// bl overwrote x30; reload the saved return address from the frame now so
// that the epilogue only has to pop x29.
700          ldr       x30,[sp,#8]
701          stp       d0,d1,[x0,#8*0]
702          stp       d2,d3,[x0,#8*2]
703          stp       d4,d5,[x0,#8*4]
704          stp       d6,d7,[x0,#8*6]
705          stp       d8,d9,[x0,#8*8]
706          stp       d10,d11,[x0,#8*10]
707          stp       d12,d13,[x0,#8*12]
708          stp       d14,d15,[x0,#8*14]
709          stp       d16,d17,[x0,#8*16]
710          stp       d18,d19,[x0,#8*18]
711          stp       d20,d21,[x0,#8*20]
712          stp       d22,d23,[x0,#8*22]
713          str       d24,[x0,#8*24]
714
715          ldp       d8,d9,[sp,#16]
716          ldp       d10,d11,[sp,#32]
717          ldp       d12,d13,[sp,#48]
718          ldp       d14,d15,[sp,#64]
// Restore x29 and release the 80-byte frame in one post-indexed load.
719          ldr       x29,[sp],#80
720.inst     0xd50323bf                    // autiasp
721          ret
722.size     KeccakF1600_cext,.-KeccakF1600_cext
723.globl    SHA3_absorb_cext
724.type     SHA3_absorb_cext,%function
725.align    5
726SHA3_absorb_cext:
727.inst     0xd503233f                    // paciasp
728          stp       x29,x30,[sp,#-80]!
729          add       x29,sp,#0
730          stp       d8,d9,[sp,#16]                // per ABI requirement
731          stp       d10,d11,[sp,#32]
732          stp       d12,d13,[sp,#48]
733          stp       d14,d15,[sp,#64]
734          ldp       d0,d1,[x0,#8*0]
735          ldp       d2,d3,[x0,#8*2]
736          ldp       d4,d5,[x0,#8*4]
737          ldp       d6,d7,[x0,#8*6]
738          ldp       d8,d9,[x0,#8*8]
739          ldp       d10,d11,[x0,#8*10]
740          ldp       d12,d13,[x0,#8*12]
741          ldp       d14,d15,[x0,#8*14]
742          ldp       d16,d17,[x0,#8*16]
743          ldp       d18,d19,[x0,#8*18]
744          ldp       d20,d21,[x0,#8*20]
745          ldp       d22,d23,[x0,#8*22]
746          ldr       d24,[x0,#8*24]
747          b         .Loop_absorb_ce
748
749.align    4
750.Loop_absorb_ce:
751          subs      x2,x2,x3            // len - bsz
752          blo       .Labsorbed_ce
753          ldr       d31,[x1],#8                   // *inp++
754#ifdef    __AARCH64EB__
755          rev64     v31.16b,v31.16b
756#endif
757          eor       v0.16b,v0.16b,v31.16b
758          cmp       x3,#8*(0+2)
759          blo       .Lprocess_block_ce
760          ldr       d31,[x1],#8                   // *inp++
761#ifdef    __AARCH64EB__
762          rev64     v31.16b,v31.16b
763#endif
764          eor       v1.16b,v1.16b,v31.16b
765          beq       .Lprocess_block_ce
766          ldr       d31,[x1],#8                   // *inp++
767#ifdef    __AARCH64EB__
768          rev64     v31.16b,v31.16b
769#endif
770          eor       v2.16b,v2.16b,v31.16b
771          cmp       x3,#8*(2+2)
772          blo       .Lprocess_block_ce
773          ldr       d31,[x1],#8                   // *inp++
774#ifdef    __AARCH64EB__
775          rev64     v31.16b,v31.16b
776#endif
777          eor       v3.16b,v3.16b,v31.16b
778          beq       .Lprocess_block_ce
779          ldr       d31,[x1],#8                   // *inp++
780#ifdef    __AARCH64EB__
781          rev64     v31.16b,v31.16b
782#endif
783          eor       v4.16b,v4.16b,v31.16b
784          cmp       x3,#8*(4+2)
785          blo       .Lprocess_block_ce
786          ldr       d31,[x1],#8                   // *inp++
787#ifdef    __AARCH64EB__
788          rev64     v31.16b,v31.16b
789#endif
790          eor       v5.16b,v5.16b,v31.16b
791          beq       .Lprocess_block_ce
792          ldr       d31,[x1],#8                   // *inp++
793#ifdef    __AARCH64EB__
794          rev64     v31.16b,v31.16b
795#endif
796          eor       v6.16b,v6.16b,v31.16b
797          cmp       x3,#8*(6+2)
798          blo       .Lprocess_block_ce
799          ldr       d31,[x1],#8                   // *inp++
800#ifdef    __AARCH64EB__
801          rev64     v31.16b,v31.16b
802#endif
803          eor       v7.16b,v7.16b,v31.16b
804          beq       .Lprocess_block_ce
805          ldr       d31,[x1],#8                   // *inp++
806#ifdef    __AARCH64EB__
807          rev64     v31.16b,v31.16b
808#endif
809          eor       v8.16b,v8.16b,v31.16b
810          cmp       x3,#8*(8+2)
811          blo       .Lprocess_block_ce
812          ldr       d31,[x1],#8                   // *inp++
813#ifdef    __AARCH64EB__
814          rev64     v31.16b,v31.16b
815#endif
816          eor       v9.16b,v9.16b,v31.16b
817          beq       .Lprocess_block_ce
818          ldr       d31,[x1],#8                   // *inp++
819#ifdef    __AARCH64EB__
820          rev64     v31.16b,v31.16b
821#endif
822          eor       v10.16b,v10.16b,v31.16b
823          cmp       x3,#8*(10+2)
824          blo       .Lprocess_block_ce
825          ldr       d31,[x1],#8                   // *inp++
826#ifdef    __AARCH64EB__
827          rev64     v31.16b,v31.16b
828#endif
829          eor       v11.16b,v11.16b,v31.16b
830          beq       .Lprocess_block_ce
831          ldr       d31,[x1],#8                   // *inp++
832#ifdef    __AARCH64EB__
833          rev64     v31.16b,v31.16b
834#endif
835          eor       v12.16b,v12.16b,v31.16b
836          cmp       x3,#8*(12+2)
837          blo       .Lprocess_block_ce
838          ldr       d31,[x1],#8                   // *inp++
839#ifdef    __AARCH64EB__
840          rev64     v31.16b,v31.16b
841#endif
842          eor       v13.16b,v13.16b,v31.16b
843          beq       .Lprocess_block_ce
844          ldr       d31,[x1],#8                   // *inp++
845#ifdef    __AARCH64EB__
846          rev64     v31.16b,v31.16b
847#endif
848          eor       v14.16b,v14.16b,v31.16b
849          cmp       x3,#8*(14+2)
850          blo       .Lprocess_block_ce
851          ldr       d31,[x1],#8                   // *inp++
852#ifdef    __AARCH64EB__
853          rev64     v31.16b,v31.16b
854#endif
855          eor       v15.16b,v15.16b,v31.16b
856          beq       .Lprocess_block_ce
857          ldr       d31,[x1],#8                   // *inp++
858#ifdef    __AARCH64EB__
859          rev64     v31.16b,v31.16b
860#endif
861          eor       v16.16b,v16.16b,v31.16b
862          cmp       x3,#8*(16+2)
863          blo       .Lprocess_block_ce
864          ldr       d31,[x1],#8                   // *inp++
865#ifdef    __AARCH64EB__
866          rev64     v31.16b,v31.16b
867#endif
868          eor       v17.16b,v17.16b,v31.16b
869          beq       .Lprocess_block_ce
870          ldr       d31,[x1],#8                   // *inp++
871#ifdef    __AARCH64EB__
872          rev64     v31.16b,v31.16b
873#endif
874          eor       v18.16b,v18.16b,v31.16b
875          cmp       x3,#8*(18+2)
876          blo       .Lprocess_block_ce
877          ldr       d31,[x1],#8                   // *inp++
878#ifdef    __AARCH64EB__
879          rev64     v31.16b,v31.16b
880#endif
881          eor       v19.16b,v19.16b,v31.16b
882          beq       .Lprocess_block_ce
883          ldr       d31,[x1],#8                   // *inp++
884#ifdef    __AARCH64EB__
885          rev64     v31.16b,v31.16b
886#endif
887          eor       v20.16b,v20.16b,v31.16b
888          cmp       x3,#8*(20+2)
889          blo       .Lprocess_block_ce
890          ldr       d31,[x1],#8                   // *inp++
891#ifdef    __AARCH64EB__
892          rev64     v31.16b,v31.16b
893#endif
894          eor       v21.16b,v21.16b,v31.16b
895          beq       .Lprocess_block_ce
896          ldr       d31,[x1],#8                   // *inp++
897#ifdef    __AARCH64EB__
898          rev64     v31.16b,v31.16b
899#endif
900          eor       v22.16b,v22.16b,v31.16b
901          cmp       x3,#8*(22+2)
902          blo       .Lprocess_block_ce
903          ldr       d31,[x1],#8                   // *inp++
904#ifdef    __AARCH64EB__
905          rev64     v31.16b,v31.16b
906#endif
907          eor       v23.16b,v23.16b,v31.16b
908          beq       .Lprocess_block_ce
909          ldr       d31,[x1],#8                   // *inp++
910#ifdef    __AARCH64EB__
911          rev64     v31.16b,v31.16b
912#endif
913          eor       v24.16b,v24.16b,v31.16b
914
915.Lprocess_block_ce:
916
917          bl        KeccakF1600_ce
918
919          b         .Loop_absorb_ce
920
921.align    4
922.Labsorbed_ce:
923          stp       d0,d1,[x0,#8*0]
924          stp       d2,d3,[x0,#8*2]
925          stp       d4,d5,[x0,#8*4]
926          stp       d6,d7,[x0,#8*6]
927          stp       d8,d9,[x0,#8*8]
928          stp       d10,d11,[x0,#8*10]
929          stp       d12,d13,[x0,#8*12]
930          stp       d14,d15,[x0,#8*14]
931          stp       d16,d17,[x0,#8*16]
932          stp       d18,d19,[x0,#8*18]
933          stp       d20,d21,[x0,#8*20]
934          stp       d22,d23,[x0,#8*22]
935          str       d24,[x0,#8*24]
936          add       x0,x2,x3            // return value
937
938          ldp       d8,d9,[sp,#16]
939          ldp       d10,d11,[sp,#32]
940          ldp       d12,d13,[sp,#48]
941          ldp       d14,d15,[sp,#64]
942          ldp       x29,x30,[sp],#80
943.inst     0xd50323bf                    // autiasp
944          ret
945.size     SHA3_absorb_cext,.-SHA3_absorb_cext
//----------------------------------------------------------------------
// SHA3_squeeze_cext
//
// Copies bytes out of the 64-bit-word state into an output buffer and
// calls KeccakF1600_cext every time one block worth of state has been
// consumed.
//
// In:    x0 = pointer to the state (read as consecutive 64-bit words,
//             also handed unchanged to KeccakF1600_cext)
//        x1 = output pointer, advanced as bytes are stored
//        x2 = number of output bytes still to produce
//        x3 = number of state bytes to emit between permutations
// Uses:  x4 (current word), x9 (state cursor), x10 (bytes left in block)
// Note:  x0-x3 are used again after the call, so this routine relies on
//        KeccakF1600_cext leaving them intact.
//----------------------------------------------------------------------
.globl    SHA3_squeeze_cext
.type     SHA3_squeeze_cext,%function
.align    5
SHA3_squeeze_cext:
.inst     0xd503233f                    // paciasp
          stp       x29,x30,[sp,#-16]!
          add       x29,sp,#0
          // Zero-length request: leave before any store. Without this
          // guard the tail below stores a byte first and only then
          // tests the counter, so x2 == 0 would wrap and 7 bytes would
          // be written past what the caller asked for.
          cbz       x2,.Lsqueeze_done_ce
          mov       x9,x0               // x9  = read cursor into the state
          mov       x10,x3              // x10 = bytes left in this block

.Loop_squeeze_ce:
          ldr       x4,[x9],#8          // fetch next state word
          cmp       x2,#8
          blo       .Lsqueeze_tail_ce   // fewer than 8 bytes wanted
#ifdef    __AARCH64EB__
          rev       x4,x4               // emit word least significant byte first
#endif
          str       x4,[x1],#8
          beq       .Lsqueeze_done_ce   // flags from cmp: exactly 8 were wanted

          sub       x2,x2,#8
          subs      x10,x10,#8
          bhi       .Loop_squeeze_ce    // block not exhausted yet

          bl        KeccakF1600_cext    // permute the state for the next block
          ldr       x30,[sp,#8]         // bl overwrote the link register
          mov       x9,x0               // rewind cursor to start of state
          mov       x10,x3
          b         .Loop_squeeze_ce

.align    4
.Lsqueeze_tail_ce:
          // 1..7 bytes remain: peel them off x4 starting with the least
          // significant byte, one store per remaining byte.
          strb      w4,[x1],#1
          lsr       x4,x4,#8
          subs      x2,x2,#1
          beq       .Lsqueeze_done_ce
          strb      w4,[x1],#1
          lsr       x4,x4,#8
          subs      x2,x2,#1
          beq       .Lsqueeze_done_ce
          strb      w4,[x1],#1
          lsr       x4,x4,#8
          subs      x2,x2,#1
          beq       .Lsqueeze_done_ce
          strb      w4,[x1],#1
          lsr       x4,x4,#8
          subs      x2,x2,#1
          beq       .Lsqueeze_done_ce
          strb      w4,[x1],#1
          lsr       x4,x4,#8
          subs      x2,x2,#1
          beq       .Lsqueeze_done_ce
          strb      w4,[x1],#1
          lsr       x4,x4,#8
          subs      x2,x2,#1
          beq       .Lsqueeze_done_ce
          strb      w4,[x1],#1

.Lsqueeze_done_ce:
          // x30 is either untouched or was reloaded right after the
          // call, so only x29 has to come back from the frame.
          ldr       x29,[sp],#16
.inst     0xd50323bf                    // autiasp
          ret
.size     SHA3_squeeze_cext,.-SHA3_squeeze_cext
// Identification banner, NUL-terminated (same bytes as the former
// decimal .byte list).
.asciz    "Keccak-1600 absorb and squeeze for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
.align    2
1011