1#include "arm_asm.h"
2#include "arm_arch.h"
3
4#if defined(__thumb2__)
5.syntax   unified
6.thumb
7#else
8.code     32
9#endif
10
11.text
12
13.globl    poly1305_emit
14.globl    poly1305_blocks
15.globl    poly1305_init
16.type     poly1305_init,%function
17.align    5
          @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
          @ poly1305_init: reset the state block and, if a key is given,
          @ store the masked key in it.
          @
          @ In:   r0 = state block. This routine writes the words at
          @            offsets 0..16 (hash value), 36 (is_base2_26 flag)
          @            and, when a key is given, 20..32 (masked key).
          @       r1 = 16-byte key, read one byte at a time; may be 0,
          @            in which case only the zeroing above is done.
          @       r2 = two-word table that receives the addresses of the
          @            blocks and emit routines. It is written only in
          @            the ARMv7-and-later build variant.
          @ Out:  r0 = 1 if the table at r2 was filled, otherwise 0.
          @
          @ r4-r11 are saved and restored; r3 and r12 are scratch.
18poly1305_init:
19.Lpoly1305_init:
20          stmdb     sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
21
22          eor       r3,r3,r3
23          cmp       r1,#0                         @ key pointer == 0? the stores and add below leave the flags alone
24          str       r3,[r0,#0]                    @ zero hash value
25          str       r3,[r0,#4]
26          str       r3,[r0,#8]
27          str       r3,[r0,#12]
28          str       r3,[r0,#16]
29          str       r3,[r0,#36]                   @ is_base2_26
30          add       r0,r0,#20                     @ r0 -> key slot at state+20
31
32#ifdef    __thumb2__
33          it        eq
34#endif
35          moveq     r0,#0                         @ no key: return value is 0
36          beq       .Lno_key
37
38#if       __ARM_MAX_ARCH__>=7
39          adr       r11,.Lpoly1305_init           @ r11 = address of this routine, base for the offsets used below
40          ldr       r12,.LOPENSSL_armcap          @ literal word; its definition is outside the lines shown here
41#endif
          @ The 16 key bytes are assembled into four little-endian words
          @ r4..r7. Word 0 is masked with 0x0fffffff, words 1..3 with
          @ 0x0ffffffc. The capability check is interleaved with the loads.
42          ldrb      r4,[r1,#0]
43          mov       r10,#0x0fffffff               @ mask for key word 0
44          ldrb      r5,[r1,#1]
45          and       r3,r10,#-4                    @ 0x0ffffffc
46          ldrb      r6,[r1,#2]
47          ldrb      r7,[r1,#3]
48          orr       r4,r4,r5,lsl#8
49          ldrb      r5,[r1,#4]
50          orr       r4,r4,r6,lsl#16
51          ldrb      r6,[r1,#5]
52          orr       r4,r4,r7,lsl#24
53          ldrb      r7,[r1,#6]
54          and       r4,r4,r10                     @ key word 0 &= 0x0fffffff
55
56#if       __ARM_MAX_ARCH__>=7
57# if !defined(_WIN32)
58          ldr       r12,[r11,r12]                 @ OPENSSL_armcap_P
59# endif
60# if defined(__APPLE__) || defined(_WIN32)
61          ldr       r12,[r12]                     @ here r12 holds an address; load the word it points to
62# endif
63#endif
64          ldrb      r8,[r1,#7]
65          orr       r5,r5,r6,lsl#8
66          ldrb      r6,[r1,#8]
67          orr       r5,r5,r7,lsl#16
68          ldrb      r7,[r1,#9]
69          orr       r5,r5,r8,lsl#24
70          ldrb      r8,[r1,#10]
71          and       r5,r5,r3                      @ key word 1 &= 0x0ffffffc
72
73#if       __ARM_MAX_ARCH__>=7
          @ Pick the routine pair for the function table:
          @   NEON bit clear (eq) -> .Lpoly1305_blocks / .Lpoly1305_emit
          @   NEON bit set   (ne) -> the _neon variants
          @ Result: r11 = blocks routine, r12 = emit routine.
74          tst       r12,#ARMV7_NEON               @ check for NEON
75# ifdef   __thumb2__
76          adr       r9,.Lpoly1305_blocks_neon
77          adr       r11,.Lpoly1305_blocks
78          adr       r12,.Lpoly1305_emit
79          adr       r10,.Lpoly1305_emit_neon
80          itt       ne
81          movne     r11,r9
82          movne     r12,r10
83          orr       r11,r11,#1          @ thumb-ify address
84          orr       r12,r12,#1
85# else
86          addeq     r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init)
87          addne     r12,r11,#(.Lpoly1305_emit_neon-.Lpoly1305_init)
88          addeq     r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init)
89          addne     r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init)
90# endif
91#endif
92          ldrb      r9,[r1,#11]
93          orr       r6,r6,r7,lsl#8
94          ldrb      r7,[r1,#12]
95          orr       r6,r6,r8,lsl#16
96          ldrb      r8,[r1,#13]
97          orr       r6,r6,r9,lsl#24
98          ldrb      r9,[r1,#14]
99          and       r6,r6,r3                      @ key word 2 &= 0x0ffffffc
100
101          ldrb      r10,[r1,#15]
102          orr       r7,r7,r8,lsl#8
103          str       r4,[r0,#0]                    @ r0 was advanced by 20, so these land at state+20..32
104          orr       r7,r7,r9,lsl#16
105          str       r5,[r0,#4]
106          orr       r7,r7,r10,lsl#24
107          str       r6,[r0,#8]
108          and       r7,r7,r3                      @ key word 3 &= 0x0ffffffc
109          str       r7,[r0,#12]
110#if       __ARM_MAX_ARCH__>=7
111          stmia     r2,{r11,r12}                  @ fill functions table
112          mov       r0,#1                         @ return 1: table was filled
113#else
114          mov       r0,#0                         @ return 0: table left untouched
115#endif
116.Lno_key:
117          ldmia     sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
118#if       __ARM_ARCH__>=5
119          RET                                     @ bx      lr
120#else
121          tst       lr,#1
122          moveq     pc,lr                         @ be binary compatible with V4, yet
123.word     0xe12fff1e                              @ interoperable with Thumb ISA:-)
124#endif
125.size     poly1305_init,.-poly1305_init
126.type     poly1305_blocks,%function
127.align    5
          @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
          @ poly1305_blocks: absorb whole 16-byte blocks into the hash
          @ value using integer instructions only.
          @
          @ In:   r0 = state block; nine words are loaded from it:
          @            offsets 0..16 -> r4-r8  (hash value h0..h4)
          @            offsets 20..32 -> r9-r12 (key words 0..3 as
          @            stored by poly1305_init)
          @       r1 = input
          @       r2 = length in bytes; the low four bits are ignored
          @            and nothing is done if the rest is zero
          @       r3 = padbit; non-zero adds 1 to h4 (that is 2^128)
          @            before the multiplication
          @ Out:  hash words at offsets 0..16 of the state are updated.
          @
          @ NOTE(review): r3 is tested once, before the loop. On every
          @ later iteration the flags are those of the loop-closing
          @ cmp/bhi, so the hi condition is always true there and the
          @ bit is added regardless of r3. Presumably callers pass
          @ r3 == 0 only together with a single block - confirm.
          @
          @ Stack frame, 32 bytes below the saved registers:
          @   [sp,#0]  low word of d0 (future r4)
          @   [sp,#4]  low word of d1 (future r5)
          @   [sp,#8]  input pointer
          @   [sp,#12] state pointer
          @   [sp,#16] end-of-input pointer
          @   [sp,#20] key word 1   [sp,#24] key word 2   [sp,#28] key word 3
128poly1305_blocks:
129.Lpoly1305_blocks:
130          stmdb     sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,lr}  @ r3 is pushed too, presumably for stack alignment - confirm
131
132          ands      r2,r2,#-16                    @ len &= ~15; zero means nothing to do
133          beq       .Lno_data
134
135          cmp       r3,#0                         @ padbit set? consumed by addhi in .Loop; nothing in between sets flags
136          add       r2,r2,r1            @ end pointer
137          sub       sp,sp,#32
138
139          ldmia     r0,{r4,r5,r6,r7,r8,r9,r10,r11,r12}                @ load context
140
141          str       r0,[sp,#12]                   @ offload stuff
142          mov       lr,r1                         @ lr = running input pointer
143          str       r2,[sp,#16]
144          str       r10,[sp,#20]
145          str       r11,[sp,#24]
146          str       r12,[sp,#28]
147          b         .Loop
148
149.Loop:
150#if __ARM_ARCH__<7
          @ Byte-wise load: builds each little-endian word from four
          @ ldrb and adds it to the hash as soon as it is complete.
151          ldrb      r0,[lr],#16                   @ load input
152# ifdef   __thumb2__
153          it        hi
154# endif
155          addhi     r8,r8,#1            @ 1<<128
156          ldrb      r1,[lr,#-15]
157          ldrb      r2,[lr,#-14]
158          ldrb      r3,[lr,#-13]
159          orr       r1,r0,r1,lsl#8
160          ldrb      r0,[lr,#-12]
161          orr       r2,r1,r2,lsl#16
162          ldrb      r1,[lr,#-11]
163          orr       r3,r2,r3,lsl#24
164          ldrb      r2,[lr,#-10]
165          adds      r4,r4,r3            @ accumulate input
166
167          ldrb      r3,[lr,#-9]
168          orr       r1,r0,r1,lsl#8
169          ldrb      r0,[lr,#-8]
170          orr       r2,r1,r2,lsl#16
171          ldrb      r1,[lr,#-7]
172          orr       r3,r2,r3,lsl#24
173          ldrb      r2,[lr,#-6]
174          adcs      r5,r5,r3
175
176          ldrb      r3,[lr,#-5]
177          orr       r1,r0,r1,lsl#8
178          ldrb      r0,[lr,#-4]
179          orr       r2,r1,r2,lsl#16
180          ldrb      r1,[lr,#-3]
181          orr       r3,r2,r3,lsl#24
182          ldrb      r2,[lr,#-2]
183          adcs      r6,r6,r3
184
185          ldrb      r3,[lr,#-1]
186          orr       r1,r0,r1,lsl#8
187          str       lr,[sp,#8]                    @ offload input pointer
188          orr       r2,r1,r2,lsl#16
189          add       r10,r10,r10,lsr#2             @ r10 = key1 + (key1>>2)
190          orr       r3,r2,r3,lsl#24               @ fourth input word, added by the shared adcs below
191#else
          @ Word-wise load, byte-reversed on big-endian builds.
192          ldr       r0,[lr],#16                   @ load input
193# ifdef   __thumb2__
194          it        hi
195# endif
196          addhi     r8,r8,#1            @ padbit
197          ldr       r1,[lr,#-12]
198          ldr       r2,[lr,#-8]
199          ldr       r3,[lr,#-4]
200# ifdef   __ARMEB__
201          rev       r0,r0
202          rev       r1,r1
203          rev       r2,r2
204          rev       r3,r3
205# endif
206          adds      r4,r4,r0            @ accumulate input
207          str       lr,[sp,#8]                    @ offload input pointer
208          adcs      r5,r5,r1
209          add       r10,r10,r10,lsr#2             @ r10 = key1 + (key1>>2)
210          adcs      r6,r6,r2
211#endif
          @ poly1305_init clears the low two bits of key words 1..3, so
          @ key + (key>>2) is exactly 5*key/4. r10-r12 hold that value
          @ until each is reloaded from the stack further down.
212          add       r11,r11,r11,lsr#2
213          adcs      r7,r7,r3
214          add       r12,r12,r12,lsr#2
215
          @ Multiply h (r4-r8) by the key. r0:r1 accumulates d0 and
          @ later d2; r2:r3 accumulates d1 and later d3.
216          umull     r2,r3,r5,r9
217          adc       r8,r8,#0                      @ carry out of the input addition goes into h4
218          umull     r0,r1,r4,r9
219          umlal     r2,r3,r8,r10
220          umlal     r0,r1,r7,r10
221          ldr       r10,[sp,#20]                  @ reload r10
222          umlal     r2,r3,r6,r12
223          umlal     r0,r1,r5,r12
224          umlal     r2,r3,r7,r11
225          umlal     r0,r1,r6,r11
226          umlal     r2,r3,r4,r10
227          str       r0,[sp,#0]                    @ future r4
228          mul       r0,r11,r8                     @ start d2 with h4 * (5*key2/4), low word only
229          ldr       r11,[sp,#24]                  @ reload r11
230          adds      r2,r2,r1            @ d1+=d0>>32
231          eor       r1,r1,r1
232          adc       lr,r3,#0            @ future r6
233          str       r2,[sp,#4]                    @ future r5
234
235          mul       r2,r12,r8                     @ start d3 with h4 * (5*key3/4), low word only
236          eor       r3,r3,r3
237          umlal     r0,r1,r7,r12
238          ldr       r12,[sp,#28]                  @ reload r12
239          umlal     r2,r3,r7,r9
240          umlal     r0,r1,r6,r9
241          umlal     r2,r3,r6,r10
242          umlal     r0,r1,r5,r10
243          umlal     r2,r3,r5,r11
244          umlal     r0,r1,r4,r11
245          umlal     r2,r3,r4,r12
246          ldr       r4,[sp,#0]
247          mul       r8,r9,r8                      @ h4 = h4 * key0
248          ldr       r5,[sp,#4]
249
250          adds      r6,lr,r0            @ d2+=d1>>32
251          ldr       lr,[sp,#8]                    @ reload input pointer
252          adc       r1,r1,#0
253          adds      r7,r2,r1            @ d3+=d2>>32
254          ldr       r0,[sp,#16]                   @ reload end pointer
255          adc       r3,r3,#0
256          add       r8,r8,r3            @ h4+=d3>>32
257
          @ Partial reduction: keep the low two bits of h4 and add
          @ 5*(h4>>2) to h0, propagating the carry up to h4.
258          and       r1,r8,#-4
259          and       r8,r8,#3
260          add       r1,r1,r1,lsr#2                @ *=5
261          adds      r4,r4,r1
262          adcs      r5,r5,#0
263          adcs      r6,r6,#0
264          adcs      r7,r7,#0
265          adc       r8,r8,#0
266
267          cmp       r0,lr                         @ done yet?
268          bhi       .Loop
269
270          ldr       r0,[sp,#12]                   @ r0 = state pointer again
271          add       sp,sp,#32
272          stmia     r0,{r4,r5,r6,r7,r8}           @ store the result
273
274.Lno_data:
275#if       __ARM_ARCH__>=5
276          ldmia     sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,pc}
277#else
278          ldmia     sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,lr}
279          tst       lr,#1
280          moveq     pc,lr                         @ be binary compatible with V4, yet
281.word     0xe12fff1e                              @ interoperable with Thumb ISA:-)
282#endif
283.size     poly1305_blocks,.-poly1305_blocks
284.type     poly1305_emit,%function
285.align    5
          @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
          @ poly1305_emit: final reduction of the hash value, addition of
          @ four caller-supplied words, and output of 16 bytes.
          @
          @ In:   r0 = state block; hash words at offsets 0..16 are read
          @       r1 = 16-byte output buffer
          @       r2 = four 32-bit words that are added to the result
          @ Out:  16 bytes at r1, least significant byte first.
          @
          @ r4-r11 are saved and restored; r3 is scratch.
286poly1305_emit:
287.Lpoly1305_emit:
288          stmdb     sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
          @ Second entry point that skips the register save. It is not
          @ referenced in the lines visible here; presumably used by the
          @ NEON emit routine - confirm.
289.Lpoly1305_emit_enter:
290
291          ldmia     r0,{r3,r4,r5,r6,r7}           @ r3-r7 = h0..h4
          @ r8-r11 = low four words of h+5. If bit 2 of h4 plus the
          @ final carry is set, h+5 reached 2^130 and those words are
          @ used in place of h.
292          adds      r8,r3,#5            @ compare to modulus
293          adcs      r9,r4,#0
294          adcs      r10,r5,#0
295          adcs      r11,r6,#0
296          adc       r7,r7,#0
297          tst       r7,#4                         @ did it carry/borrow?
298
          @ Each conditional move is paired with a load that reuses the
          @ register just consumed. mov and ldr leave the flags alone,
          @ so the tst result stays valid for all four moves.
299#ifdef    __thumb2__
300          it        ne
301#endif
302          movne     r3,r8
303          ldr       r8,[r2,#0]
304#ifdef    __thumb2__
305          it        ne
306#endif
307          movne     r4,r9
308          ldr       r9,[r2,#4]
309#ifdef    __thumb2__
310          it        ne
311#endif
312          movne     r5,r10
313          ldr       r10,[r2,#8]
314#ifdef    __thumb2__
315          it        ne
316#endif
317          movne     r6,r11
318          ldr       r11,[r2,#12]
319
320          adds      r3,r3,r8                      @ add the four words from r2; carry out of the top word is dropped
321          adcs      r4,r4,r9
322          adcs      r5,r5,r10
323          adc       r6,r6,r11
324
325#if __ARM_ARCH__>=7
          @ Word stores, byte-reversed first on big-endian builds.
326# ifdef __ARMEB__
327          rev       r3,r3
328          rev       r4,r4
329          rev       r5,r5
330          rev       r6,r6
331# endif
332          str       r3,[r1,#0]
333          str       r4,[r1,#4]
334          str       r5,[r1,#8]
335          str       r6,[r1,#12]
336#else
          @ Byte stores: write the low byte of each word, shift right
          @ by 8, and repeat for the next byte position.
337          strb      r3,[r1,#0]
338          mov       r3,r3,lsr#8
339          strb      r4,[r1,#4]
340          mov       r4,r4,lsr#8
341          strb      r5,[r1,#8]
342          mov       r5,r5,lsr#8
343          strb      r6,[r1,#12]
344          mov       r6,r6,lsr#8
345
346          strb      r3,[r1,#1]
347          mov       r3,r3,lsr#8
348          strb      r4,[r1,#5]
349          mov       r4,r4,lsr#8
350          strb      r5,[r1,#9]
351          mov       r5,r5,lsr#8
352          strb      r6,[r1,#13]
353          mov       r6,r6,lsr#8
354
355          strb      r3,[r1,#2]
356          mov       r3,r3,lsr#8
357          strb      r4,[r1,#6]
358          mov       r4,r4,lsr#8
359          strb      r5,[r1,#10]
360          mov       r5,r5,lsr#8
361          strb      r6,[r1,#14]
362          mov       r6,r6,lsr#8
363
364          strb      r3,[r1,#3]
365          strb      r4,[r1,#7]
366          strb      r5,[r1,#11]
367          strb      r6,[r1,#15]
368#endif
369          ldmia     sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
370#if       __ARM_ARCH__>=5
371          RET                                     @ bx      lr
372#else
373          tst       lr,#1
374          moveq     pc,lr                         @ be binary compatible with V4, yet
375.word     0xe12fff1e                              @ interoperable with Thumb ISA:-)
376#endif
377.size     poly1305_emit,.-poly1305_emit
378#if       __ARM_MAX_ARCH__>=7
379.fpu      neon
380
381.type     poly1305_init_neon,%function
382.align    5
          @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
          @ poly1305_init_neon: compute the key powers 1..4 in base 2^26
          @ and store them in the state block for the NEON code.
          @
          @ In:   r0 = state block; key words at offsets 20..32 are read
          @            (base 2^32, as stored by poly1305_init)
          @ Out:  four 9-word tables written at
          @            r0+48      key^1        r0+48+36   key^2
          @            r0+48+72   key^3        r0+48+108  key^4
          @       Each table holds, with limb0..limb4 the 26-bit limbs:
          @            limb0, limb1, 5*limb1, limb2, 5*limb2,
          @            limb3, 5*limb3, limb4, 5*limb4
          @
          @ Nothing is saved here: r2-r7, q0-q9, q15 and the flags are
          @ overwritten. The call site in poly1305_blocks_neon pushes
          @ r4-r7 and d8-d15 beforehand.
383poly1305_init_neon:
384          ldr       r4,[r0,#20]                   @ load key base 2^32
385          ldr       r5,[r0,#24]
386          ldr       r6,[r0,#28]
387          ldr       r7,[r0,#32]
388
389          and       r2,r4,#0x03ffffff   @ base 2^32 -> base 2^26
390          mov       r3,r4,lsr#26
391          mov       r4,r5,lsr#20
392          orr       r3,r3,r5,lsl#6
393          mov       r5,r6,lsr#14
394          orr       r4,r4,r6,lsl#12
395          mov       r6,r7,lsr#8                   @ limb4 is left unmasked; fits if key word 3 is at most 28 bits, as poly1305_init stores it
396          orr       r5,r5,r7,lsl#18
397          and       r3,r3,#0x03ffffff
398          and       r4,r4,#0x03ffffff
399          and       r5,r5,#0x03ffffff
400
          @ Register map from here on (both 32-bit lanes of each):
          @   d0 = limb0  d1 = limb1  d3 = limb2  d5 = limb3  d7 = limb4
          @   d2 = 5*limb1  d4 = 5*limb2  d6 = 5*limb3  d8 = 5*limb4
401          vdup.32   d0,r2                         @ r^1 in both lanes
402          add       r2,r3,r3,lsl#2                @ *5
403          vdup.32   d1,r3
404          add       r3,r4,r4,lsl#2
405          vdup.32   d2,r2
406          vdup.32   d3,r4
407          add       r4,r5,r5,lsl#2
408          vdup.32   d4,r3
409          vdup.32   d5,r5
410          add       r5,r6,r6,lsl#2
411          vdup.32   d6,r4
412          vdup.32   d7,r6
413          vdup.32   d8,r5
414
415          mov       r5,#2               @ counter
416
          @ Two passes. Each pass multiplies both lanes of d0..d8 by the
          @ value held in lane 1.
          @   pass 1: lanes hold key^1:key^1, product is key^2 in both
          @   pass 2: lanes hold key^2:key^1, product is key^4:key^3
417.Lsquare_neon:
418          @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
419          @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
420          @ d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
421          @ d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
422          @ d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
423          @ d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
          @
          @ The sums d0..d4 above are accumulated in q5..q9.
424
425          vmull.u32 q5,d0,d0[1]
426          vmull.u32 q6,d1,d0[1]
427          vmull.u32 q7,d3,d0[1]
428          vmull.u32 q8,d5,d0[1]
429          vmull.u32 q9,d7,d0[1]
430
431          vmlal.u32 q5,d7,d2[1]
432          vmlal.u32 q6,d0,d1[1]
433          vmlal.u32 q7,d1,d1[1]
434          vmlal.u32 q8,d3,d1[1]
435          vmlal.u32 q9,d5,d1[1]
436
437          vmlal.u32 q5,d5,d4[1]
438          vmlal.u32 q6,d7,d4[1]
439          vmlal.u32 q8,d1,d3[1]
440          vmlal.u32 q7,d0,d3[1]
441          vmlal.u32 q9,d3,d3[1]
442
443          vmlal.u32 q5,d3,d6[1]
444          vmlal.u32 q8,d0,d5[1]
445          vmlal.u32 q6,d5,d6[1]
446          vmlal.u32 q7,d7,d6[1]
447          vmlal.u32 q9,d1,d5[1]
448
449          vmlal.u32 q8,d7,d8[1]
450          vmlal.u32 q5,d1,d8[1]
451          vmlal.u32 q6,d3,d8[1]
452          vmlal.u32 q7,d5,d8[1]
453          vmlal.u32 q9,d0,d7[1]
454
455          @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
456          @ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
457          @ and P. Schwabe
458          @
459          @ H0>>+H1>>+H2>>+H3>>+H4
460          @ H3>>+H4>>*5+H0>>+H1
461          @
462          @ Trivia.
463          @
464          @ Result of multiplication of n-bit number by m-bit number is
465          @ n+m bits wide. However! Even though 2^n is a n+1-bit number,
466          @ m-bit number multiplied by 2^n is still n+m bits wide.
467          @
468          @ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2,
469          @ and so is sum of four. Sum of 2^m n-m-bit numbers and n-bit
470          @ one is n+1 bits wide.
471          @
472          @ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
473          @ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
474          @ can be 27. However! In cases when their width exceeds 26 bits
475          @ they are limited by 2^26+2^6. This in turn means that *sum*
476          @ of the products with these values can still be viewed as sum
477          @ of 52-bit numbers as long as the amount of addends is not a
478          @ power of 2. For example,
479          @
480          @ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4,
481          @
482          @ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
483          @ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than
484          @ 8 * (2^52) or 2^55. However, the value is then multiplied
485          @ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12),
486          @ which is less than 32 * (2^52) or 2^57. And when processing
487          @ data we are looking at triple as many addends...
488          @
489          @ In key setup procedure pre-reduced H0 is limited by 5*4+1 and
490          @ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the
491          @ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while
492          @ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? vmlal.u32
493          @ instruction accepts 2x32-bit input and writes 2x64-bit result.
494          @ This means that result of reduction have to be compressed upon
495          @ loop wrap-around. This can be done in the process of reduction
496          @ to minimize amount of instructions [as well as amount of
497          @ 128-bit instructions, which benefits low-end processors], but
498          @ one has to watch for H2 (which is narrower than H0) and 5*H4
499          @ not being wider than 58 bits, so that result of right shift
500          @ by 26 bits fits in 32 bits. This is also useful on x86,
501          @ because it allows to use paddd in place for paddq, which
502          @ benefits Atom, where paddq is ridiculously slow.
503
          @ Carry chain below. The 64-bit sums in q5..q9 are narrowed to
          @ 32-bit lanes in d10, d12, d14, d16, d18 along the way.
504          vshr.u64  q15,q8,#26
505          vmovn.i64 d16,q8
506          vshr.u64  q4,q5,#26
507          vmovn.i64 d10,q5
508          vadd.i64  q9,q9,q15           @ h3 -> h4
509          vbic.i32  d16,#0xfc000000     @ &=0x03ffffff
510          vadd.i64  q6,q6,q4            @ h0 -> h1
511          vbic.i32  d10,#0xfc000000
512
513          vshrn.u64 d30,q9,#26
514          vmovn.i64 d18,q9
515          vshr.u64  q4,q6,#26
516          vmovn.i64 d12,q6
517          vadd.i64  q7,q7,q4            @ h1 -> h2
518          vbic.i32  d18,#0xfc000000
519          vbic.i32  d12,#0xfc000000
520
521          vadd.i32  d10,d10,d30                   @ carry*1 now, carry*4 two lines down: carry*5 in total
522          vshl.u32  d30,d30,#2
523          vshrn.u64 d8,q7,#26
524          vmovn.i64 d14,q7
525          vadd.i32  d10,d10,d30         @ h4 -> h0
526          vadd.i32  d16,d16,d8          @ h2 -> h3
527          vbic.i32  d14,#0xfc000000
528
529          vshr.u32  d30,d10,#26
530          vbic.i32  d10,#0xfc000000
531          vshr.u32  d8,d16,#26
532          vbic.i32  d16,#0xfc000000
533          vadd.i32  d12,d12,d30         @ h0 -> h1
534          vadd.i32  d18,d18,d8          @ h3 -> h4
535
536          subs      r5,r5,#1                      @ second pass finished?
537          beq       .Lsquare_break_neon
538
          @ End of pass 1: store key^1 and key^2, then set up pass 2.
539          add       r6,r0,#(48+0*9*4)             @ r6 -> key^1 table
540          add       r7,r0,#(48+1*9*4)             @ r7 -> key^2 table
541
          @ vtrn.32 swaps lane 1 of its first operand with lane 0 of
          @ its second, so lane 0 keeps key^1 and lane 1 gets key^2.
542          vtrn.32   d0,d10              @ r^2:r^1
543          vtrn.32   d3,d14
544          vtrn.32   d5,d16
545          vtrn.32   d1,d12
546          vtrn.32   d7,d18
547
548          vshl.u32  d4,d3,#2            @ *5
549          vshl.u32  d6,d5,#2
550          vshl.u32  d2,d1,#2
551          vshl.u32  d8,d7,#2
552          vadd.i32  d4,d4,d3
553          vadd.i32  d2,d2,d1
554          vadd.i32  d6,d6,d5
555          vadd.i32  d8,d8,d7
556
          @ Lane 0 goes to the table at r6, lane 1 to the table at r7:
          @ 4 + 4 + 1 words each.
557          vst4.32   {d0[0],d1[0],d2[0],d3[0]},[r6]!
558          vst4.32   {d0[1],d1[1],d2[1],d3[1]},[r7]!
559          vst4.32   {d4[0],d5[0],d6[0],d7[0]},[r6]!
560          vst4.32   {d4[1],d5[1],d6[1],d7[1]},[r7]!
561          vst1.32   {d8[0]},[r6,:32]
562          vst1.32   {d8[1]},[r7,:32]
563
564          b         .Lsquare_neon
565
566.align    4
567.Lsquare_break_neon:
          @ End of pass 2: lane 0 of the results is key^3, lane 1 is
          @ key^4. Copy them to d0..d8 in table order and store them.
568          add       r6,r0,#(48+2*4*9)             @ r6 -> key^3 table
569          add       r7,r0,#(48+3*4*9)             @ r7 -> key^4 table
570
571          vmov      d0,d10              @ r^4:r^3
572          vshl.u32  d2,d12,#2           @ *5
573          vmov      d1,d12
574          vshl.u32  d4,d14,#2
575          vmov      d3,d14
576          vshl.u32  d6,d16,#2
577          vmov      d5,d16
578          vshl.u32  d8,d18,#2
579          vmov      d7,d18
580          vadd.i32  d2,d2,d12
581          vadd.i32  d4,d4,d14
582          vadd.i32  d6,d6,d16
583          vadd.i32  d8,d8,d18
584
585          vst4.32   {d0[0],d1[0],d2[0],d3[0]},[r6]!
586          vst4.32   {d0[1],d1[1],d2[1],d3[1]},[r7]!
587          vst4.32   {d4[0],d5[0],d6[0],d7[0]},[r6]!
588          vst4.32   {d4[1],d5[1],d6[1],d7[1]},[r7]!
589          vst1.32   {d8[0]},[r6]
590          vst1.32   {d8[1]},[r7]
591
592          RET                                     @ bx      lr
593.size     poly1305_init_neon,.-poly1305_init_neon
594
595.type     poly1305_blocks_neon,%function
596.align    5
597poly1305_blocks_neon:
598.Lpoly1305_blocks_neon:
599          ldr       ip,[r0,#36]                   @ is_base2_26
600          ands      r2,r2,#-16
601          beq       .Lno_data_neon
602
603          cmp       r2,#64
604          bhs       .Lenter_neon
605          tst       ip,ip                         @ is_base2_26?
606          beq       .Lpoly1305_blocks
607
608.Lenter_neon:
609          stmdb     sp!,{r4,r5,r6,r7}
610          vstmdb    sp!,{d8,d9,d10,d11,d12,d13,d14,d15}               @ ABI specification says so
611
612          tst       ip,ip                         @ is_base2_26?
613          bne       .Lbase2_26_neon
614
615          stmdb     sp!,{r1,r2,r3,lr}
616          bl        poly1305_init_neon
617
618          ldr       r4,[r0,#0]                    @ load hash value base 2^32
619          ldr       r5,[r0,#4]
620          ldr       r6,[r0,#8]
621          ldr       r7,[r0,#12]
622          ldr       ip,[r0,#16]
623
624          and       r2,r4,#0x03ffffff   @ base 2^32 -> base 2^26
625          mov       r3,r4,lsr#26
626          veor      d10,d10,d10
627          mov       r4,r5,lsr#20
628          orr       r3,r3,r5,lsl#6
629          veor      d12,d12,d12
630          mov       r5,r6,lsr#14
631          orr       r4,r4,r6,lsl#12
632          veor      d14,d14,d14
633          mov       r6,r7,lsr#8
634          orr       r5,r5,r7,lsl#18
635          veor      d16,d16,d16
636          and       r3,r3,#0x03ffffff
637          orr       r6,r6,ip,lsl#24
638          veor      d18,d18,d18
639          and       r4,r4,#0x03ffffff
640          mov       r1,#1
641          and       r5,r5,#0x03ffffff
642          str       r1,[r0,#36]                   @ is_base2_26
643
644          vmov.32   d10[0],r2
645          vmov.32   d12[0],r3
646          vmov.32   d14[0],r4
647          vmov.32   d16[0],r5
648          vmov.32   d18[0],r6
649          adr       r5,.Lzeros
650
651          ldmia     sp!,{r1,r2,r3,lr}
652          b         .Lbase2_32_neon
653
654.align    4
655.Lbase2_26_neon:
656          @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
657          @ load hash value
658
659          veor      d10,d10,d10
660          veor      d12,d12,d12
661          veor      d14,d14,d14
662          veor      d16,d16,d16
663          veor      d18,d18,d18
664          vld4.32   {d10[0],d12[0],d14[0],d16[0]},[r0]!
665          adr       r5,.Lzeros
666          vld1.32   {d18[0]},[r0]
667          sub       r0,r0,#16           @ rewind
668
669.Lbase2_32_neon:
670          add       r4,r1,#32
671          mov       r3,r3,lsl#24
672          tst       r2,#31
673          beq       .Leven
674
675          vld4.32   {d20[0],d22[0],d24[0],d26[0]},[r1]!
676          vmov.32   d28[0],r3
677          sub       r2,r2,#16
678          add       r4,r1,#32
679
680# ifdef   __ARMEB__
681          vrev32.8  q10,q10
682          vrev32.8  q13,q13
683          vrev32.8  q11,q11
684          vrev32.8  q12,q12
685# endif
686          vsri.u32  d28,d26,#8          @ base 2^32 -> base 2^26
687          vshl.u32  d26,d26,#18
688
689          vsri.u32  d26,d24,#14
690          vshl.u32  d24,d24,#12
691          vadd.i32  d29,d28,d18         @ add hash value and move to #hi
692
693          vbic.i32  d26,#0xfc000000
694          vsri.u32  d24,d22,#20
695          vshl.u32  d22,d22,#6
696
697          vbic.i32  d24,#0xfc000000
698          vsri.u32  d22,d20,#26
699          vadd.i32  d27,d26,d16
700
701          vbic.i32  d20,#0xfc000000
702          vbic.i32  d22,#0xfc000000
703          vadd.i32  d25,d24,d14
704
705          vadd.i32  d21,d20,d10
706          vadd.i32  d23,d22,d12
707
708          mov       r7,r5
709          add       r6,r0,#48
710
711          cmp       r2,r2
712          b         .Long_tail
713
714.align    4
715.Leven:
716          subs      r2,r2,#64
717          it        lo
718          movlo     r4,r5
719
720          vmov.i32  q14,#1<<24                    @ padbit, yes, always
721          vld4.32   {d20,d22,d24,d26},[r1]        @ inp[0:1]
722          add       r1,r1,#64
723          vld4.32   {d21,d23,d25,d27},[r4]        @ inp[2:3] (or 0)
724          add       r4,r4,#64
725          itt       hi
726          addhi     r7,r0,#(48+1*9*4)
727          addhi     r6,r0,#(48+3*9*4)
728
729# ifdef   __ARMEB__
730          vrev32.8  q10,q10
731          vrev32.8  q13,q13
732          vrev32.8  q11,q11
733          vrev32.8  q12,q12
734# endif
735          vsri.u32  q14,q13,#8                    @ base 2^32 -> base 2^26
736          vshl.u32  q13,q13,#18
737
738          vsri.u32  q13,q12,#14
739          vshl.u32  q12,q12,#12
740
741          vbic.i32  q13,#0xfc000000
742          vsri.u32  q12,q11,#20
743          vshl.u32  q11,q11,#6
744
745          vbic.i32  q12,#0xfc000000
746          vsri.u32  q11,q10,#26
747
748          vbic.i32  q10,#0xfc000000
749          vbic.i32  q11,#0xfc000000
750
751          bls       .Lskip_loop
752
753          vld4.32   {d0[1],d1[1],d2[1],d3[1]},[r7]!         @ load r^2
754          vld4.32   {d0[0],d1[0],d2[0],d3[0]},[r6]!         @ load r^4
755          vld4.32   {d4[1],d5[1],d6[1],d7[1]},[r7]!
756          vld4.32   {d4[0],d5[0],d6[0],d7[0]},[r6]!
757          b         .Loop_neon
758
759.align    5
760.Loop_neon:
761          @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
762          @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
763          @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
764          @   ___________________/
765          @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
766          @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
767          @   ___________________/ ____________________/
768          @
769          @ Note that we start with inp[2:3]*r^2. This is because it
770          @ doesn't depend on reduction in previous iteration.
771          @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
772          @ d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
773          @ d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
774          @ d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
775          @ d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
776          @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
777
778          @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
779          @ inp[2:3]*r^2
780
781          vadd.i32  d24,d24,d14         @ accumulate inp[0:1]
782          vmull.u32 q7,d25,d0[1]
783          vadd.i32  d20,d20,d10
784          vmull.u32 q5,d21,d0[1]
785          vadd.i32  d26,d26,d16
786          vmull.u32 q8,d27,d0[1]
787          vmlal.u32 q7,d23,d1[1]
788          vadd.i32  d22,d22,d12
789          vmull.u32 q6,d23,d0[1]
790
791          vadd.i32  d28,d28,d18
792          vmull.u32 q9,d29,d0[1]
793          subs      r2,r2,#64
794          vmlal.u32 q5,d29,d2[1]
795          it        lo
796          movlo     r4,r5
797          vmlal.u32 q8,d25,d1[1]
798          vld1.32   d8[1],[r7,:32]
799          vmlal.u32 q6,d21,d1[1]
800          vmlal.u32 q9,d27,d1[1]
801
802          vmlal.u32 q5,d27,d4[1]
803          vmlal.u32 q8,d23,d3[1]
804          vmlal.u32 q9,d25,d3[1]
805          vmlal.u32 q6,d29,d4[1]
806          vmlal.u32 q7,d21,d3[1]
807
808          vmlal.u32 q8,d21,d5[1]
809          vmlal.u32 q5,d25,d6[1]
810          vmlal.u32 q9,d23,d5[1]
811          vmlal.u32 q6,d27,d6[1]
812          vmlal.u32 q7,d29,d6[1]
813
814          vmlal.u32 q8,d29,d8[1]
815          vmlal.u32 q5,d23,d8[1]
816          vmlal.u32 q9,d21,d7[1]
817          vmlal.u32 q6,d25,d8[1]
818          vmlal.u32 q7,d27,d8[1]
819
820          vld4.32   {d21,d23,d25,d27},[r4]        @ inp[2:3] (or 0)
821          add       r4,r4,#64
822
823          @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
824          @ (hash+inp[0:1])*r^4 and accumulate
825
826          vmlal.u32 q8,d26,d0[0]
827          vmlal.u32 q5,d20,d0[0]
828          vmlal.u32 q9,d28,d0[0]
829          vmlal.u32 q6,d22,d0[0]
830          vmlal.u32 q7,d24,d0[0]
831          vld1.32   d8[0],[r6,:32]
832
833          vmlal.u32 q8,d24,d1[0]
834          vmlal.u32 q5,d28,d2[0]
835          vmlal.u32 q9,d26,d1[0]
836          vmlal.u32 q6,d20,d1[0]
837          vmlal.u32 q7,d22,d1[0]
838
839          vmlal.u32 q8,d22,d3[0]
840          vmlal.u32 q5,d26,d4[0]
841          vmlal.u32 q9,d24,d3[0]
842          vmlal.u32 q6,d28,d4[0]
843          vmlal.u32 q7,d20,d3[0]
844
845          vmlal.u32 q8,d20,d5[0]
846          vmlal.u32 q5,d24,d6[0]
847          vmlal.u32 q9,d22,d5[0]
848          vmlal.u32 q6,d26,d6[0]
849          vmlal.u32 q8,d28,d8[0]
850
851          vmlal.u32 q7,d28,d6[0]
852          vmlal.u32 q5,d22,d8[0]
853          vmlal.u32 q9,d20,d7[0]
854          vmov.i32  q14,#1<<24                    @ padbit, yes, always
855          vmlal.u32 q6,d24,d8[0]
856          vmlal.u32 q7,d26,d8[0]
857
858          vld4.32   {d20,d22,d24,d26},[r1]        @ inp[0:1]
859          add       r1,r1,#64
860# ifdef   __ARMEB__
861          vrev32.8  q10,q10
862          vrev32.8  q11,q11
863          vrev32.8  q12,q12
864          vrev32.8  q13,q13
865# endif
866
867          @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
868          @ lazy reduction interleaved with base 2^32 -> base 2^26 of
869          @ inp[0:3] previously loaded to q10-q13 and smashed to q10-q14.
870
871          vshr.u64  q15,q8,#26
872          vmovn.i64 d16,q8
873          vshr.u64  q4,q5,#26
874          vmovn.i64 d10,q5
875          vadd.i64  q9,q9,q15           @ h3 -> h4
876          vbic.i32  d16,#0xfc000000
877          vsri.u32  q14,q13,#8                    @ base 2^32 -> base 2^26
878          vadd.i64  q6,q6,q4            @ h0 -> h1
879          vshl.u32  q13,q13,#18
880          vbic.i32  d10,#0xfc000000
881
882          vshrn.u64 d30,q9,#26
883          vmovn.i64 d18,q9
884          vshr.u64  q4,q6,#26
885          vmovn.i64 d12,q6
886          vadd.i64  q7,q7,q4            @ h1 -> h2
887          vsri.u32  q13,q12,#14
888          vbic.i32  d18,#0xfc000000
889          vshl.u32  q12,q12,#12
890          vbic.i32  d12,#0xfc000000
891
892          vadd.i32  d10,d10,d30
893          vshl.u32  d30,d30,#2
894          vbic.i32  q13,#0xfc000000
895          vshrn.u64 d8,q7,#26
896          vmovn.i64 d14,q7
897          vaddl.u32 q5,d10,d30          @ h4 -> h0 [widen for a sec]
898          vsri.u32  q12,q11,#20
899          vadd.i32  d16,d16,d8          @ h2 -> h3
900          vshl.u32  q11,q11,#6
901          vbic.i32  d14,#0xfc000000
902          vbic.i32  q12,#0xfc000000
903
904          vshrn.u64 d30,q5,#26                    @ re-narrow
905          vmovn.i64 d10,q5
906          vsri.u32  q11,q10,#26
907          vbic.i32  q10,#0xfc000000
908          vshr.u32  d8,d16,#26
909          vbic.i32  d16,#0xfc000000
910          vbic.i32  d10,#0xfc000000
911          vadd.i32  d12,d12,d30         @ h0 -> h1
912          vadd.i32  d18,d18,d8          @ h3 -> h4
913          vbic.i32  q11,#0xfc000000
914
915          bhi       .Loop_neon
916
917.Lskip_loop:
918          @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
919          @ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
920
921          add       r7,r0,#(48+0*9*4)
922          add       r6,r0,#(48+1*9*4)
923          adds      r2,r2,#32
924          it        ne
925          movne     r2,#0
926          bne       .Long_tail
927
928          vadd.i32  d25,d24,d14         @ add hash value and move to #hi
929          vadd.i32  d21,d20,d10
930          vadd.i32  d27,d26,d16
931          vadd.i32  d23,d22,d12
932          vadd.i32  d29,d28,d18
933
934.Long_tail:
935          vld4.32   {d0[1],d1[1],d2[1],d3[1]},[r7]!         @ load r^1
936          vld4.32   {d0[0],d1[0],d2[0],d3[0]},[r6]!         @ load r^2
937
938          vadd.i32  d24,d24,d14         @ can be redundant
939          vmull.u32 q7,d25,d0
940          vadd.i32  d20,d20,d10
941          vmull.u32 q5,d21,d0
942          vadd.i32  d26,d26,d16
943          vmull.u32 q8,d27,d0
944          vadd.i32  d22,d22,d12
945          vmull.u32 q6,d23,d0
946          vadd.i32  d28,d28,d18
947          vmull.u32 q9,d29,d0
948
949          vmlal.u32 q5,d29,d2
950          vld4.32   {d4[1],d5[1],d6[1],d7[1]},[r7]!
951          vmlal.u32 q8,d25,d1
952          vld4.32   {d4[0],d5[0],d6[0],d7[0]},[r6]!
953          vmlal.u32 q6,d21,d1
954          vmlal.u32 q9,d27,d1
955          vmlal.u32 q7,d23,d1
956
957          vmlal.u32 q8,d23,d3
958          vld1.32   d8[1],[r7,:32]
959          vmlal.u32 q5,d27,d4
960          vld1.32   d8[0],[r6,:32]
961          vmlal.u32 q9,d25,d3
962          vmlal.u32 q6,d29,d4
963          vmlal.u32 q7,d21,d3
964
965          vmlal.u32 q8,d21,d5
966          it        ne
967          addne     r7,r0,#(48+2*9*4)
968          vmlal.u32 q5,d25,d6
969          it        ne
970          addne     r6,r0,#(48+3*9*4)
971          vmlal.u32 q9,d23,d5
972          vmlal.u32 q6,d27,d6
973          vmlal.u32 q7,d29,d6
974
975          vmlal.u32 q8,d29,d8
976          vorn      q0,q0,q0  @ all-ones, can be redundant
977          vmlal.u32 q5,d23,d8
978          vshr.u64  q0,q0,#38
979          vmlal.u32 q9,d21,d7
980          vmlal.u32 q6,d25,d8
981          vmlal.u32 q7,d27,d8
982
983          beq       .Lshort_tail
984
985          @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
986          @ (hash+inp[0:1])*r^4:r^3 and accumulate
987
988          vld4.32   {d0[1],d1[1],d2[1],d3[1]},[r7]!         @ load r^3
989          vld4.32   {d0[0],d1[0],d2[0],d3[0]},[r6]!         @ load r^4
990
991          vmlal.u32 q7,d24,d0
992          vmlal.u32 q5,d20,d0
993          vmlal.u32 q8,d26,d0
994          vmlal.u32 q6,d22,d0
995          vmlal.u32 q9,d28,d0
996
997          vmlal.u32 q5,d28,d2
998          vld4.32   {d4[1],d5[1],d6[1],d7[1]},[r7]!
999          vmlal.u32 q8,d24,d1
1000          vld4.32   {d4[0],d5[0],d6[0],d7[0]},[r6]!
1001          vmlal.u32 q6,d20,d1
1002          vmlal.u32 q9,d26,d1
1003          vmlal.u32 q7,d22,d1
1004
1005          vmlal.u32 q8,d22,d3
1006          vld1.32   d8[1],[r7,:32]
1007          vmlal.u32 q5,d26,d4
1008          vld1.32   d8[0],[r6,:32]
1009          vmlal.u32 q9,d24,d3
1010          vmlal.u32 q6,d28,d4
1011          vmlal.u32 q7,d20,d3
1012
1013          vmlal.u32 q8,d20,d5
1014          vmlal.u32 q5,d24,d6
1015          vmlal.u32 q9,d22,d5
1016          vmlal.u32 q6,d26,d6
1017          vmlal.u32 q7,d28,d6
1018
1019          vmlal.u32 q8,d28,d8
1020          vorn      q0,q0,q0  @ all-ones
1021          vmlal.u32 q5,d22,d8
1022          vshr.u64  q0,q0,#38
1023          vmlal.u32 q9,d20,d7
1024          vmlal.u32 q6,d24,d8
1025          vmlal.u32 q7,d26,d8
1026
1027.Lshort_tail:
1028          @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
1029          @ horizontal addition
1030
1031          vadd.i64  d16,d16,d17
1032          vadd.i64  d10,d10,d11
1033          vadd.i64  d18,d18,d19
1034          vadd.i64  d12,d12,d13
1035          vadd.i64  d14,d14,d15
1036
1037          @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
1038          @ lazy reduction, but without narrowing
1039
1040          vshr.u64  q15,q8,#26
1041          vand.i64  q8,q8,q0
1042          vshr.u64  q4,q5,#26
1043          vand.i64  q5,q5,q0
1044          vadd.i64  q9,q9,q15           @ h3 -> h4
1045          vadd.i64  q6,q6,q4            @ h0 -> h1
1046
1047          vshr.u64  q15,q9,#26
1048          vand.i64  q9,q9,q0
1049          vshr.u64  q4,q6,#26
1050          vand.i64  q6,q6,q0
1051          vadd.i64  q7,q7,q4            @ h1 -> h2
1052
1053          vadd.i64  q5,q5,q15
1054          vshl.u64  q15,q15,#2
1055          vshr.u64  q4,q7,#26
1056          vand.i64  q7,q7,q0
1057          vadd.i64  q5,q5,q15           @ h4 -> h0
1058          vadd.i64  q8,q8,q4            @ h2 -> h3
1059
1060          vshr.u64  q15,q5,#26
1061          vand.i64  q5,q5,q0
1062          vshr.u64  q4,q8,#26
1063          vand.i64  q8,q8,q0
1064          vadd.i64  q6,q6,q15           @ h0 -> h1
1065          vadd.i64  q9,q9,q4            @ h3 -> h4
1066
1067          cmp       r2,#0
1068          bne       .Leven
1069
1070          @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
1071          @ store hash value
1072
1073          vst4.32   {d10[0],d12[0],d14[0],d16[0]},[r0]!
1074          vst1.32   {d18[0]},[r0]
1075
1076          vldmia    sp!,{d8,d9,d10,d11,d12,d13,d14,d15}                         @ epilogue
1077          ldmia     sp!,{r4,r5,r6,r7}
1078.Lno_data_neon:
1079          RET                                               @ bx      lr
1080.size     poly1305_blocks_neon,.-poly1305_blocks_neon
1081
@ poly1305_emit_neon
@
@ Finish a hash that may be stored in base 2^26 and write the 16-byte result.
@
@ In:    r0 = state: five hash words at offsets 0-16, is_base2_26 flag at 36
@        r1 = output buffer, four 32-bit words are stored
@        r2 = nonce, four 32-bit words are loaded
@ Out:   [r1] = hash reduced modulo 2^130-5, plus nonce, low 128 bits
@ Saves: r4-r11 are pushed on entry and popped before returning
@ Uses:  r3, ip and the condition flags as scratch
@
@ When the is_base2_26 flag is zero, control transfers to
@ .Lpoly1305_emit_enter with r4-r11 already pushed.
.type     poly1305_emit_neon,%function
.align    5
poly1305_emit_neon:
.Lpoly1305_emit_neon:
          ldr       ip,[r0,#36]                   @ is_base2_26

          stmdb     sp!,{r4,r5,r6,r7,r8,r9,r10,r11}

          tst       ip,ip
          beq       .Lpoly1305_emit_enter         @ flag clear: hash is not in base 2^26

          ldmia     r0,{r3,r4,r5,r6,r7}           @ five base 2^26 limbs
          eor       r8,r8,r8                      @ r8 = 0, operand for the final adc

          adds      r3,r3,r4,lsl#26     @ base 2^26 -> base 2^32
          mov       r4,r4,lsr#6
          adcs      r4,r4,r5,lsl#20
          mov       r5,r5,lsr#12
          adcs      r5,r5,r6,lsl#14
          mov       r6,r6,lsr#18
          adcs      r6,r6,r7,lsl#8
          adc       r7,r8,r7,lsr#24     @ can be partially reduced ...

          @ r3-r6 now hold bits 0-127 and r7 holds bits 128 and up.
          and       r8,r7,#-4           @ ... so reduce
          and       r7,r7,#3            @ top word keeps only bits 128-129
          add       r8,r8,r8,lsr#2      @ *= 5
          adds      r3,r3,r8
          adcs      r4,r4,#0
          adcs      r5,r5,#0
          adcs      r6,r6,#0
          adc       r7,r7,#0

          adds      r8,r3,#5            @ compare to modulus
          adcs      r9,r4,#0
          adcs      r10,r5,#0
          adcs      r11,r6,#0
          adc       r7,r7,#0
          tst       r7,#4                         @ did it carry/borrow?

          @ If hash+5 reached bit 130, keep hash+5 in r8-r11, otherwise keep
          @ the hash. The nonce loads are interleaved and reuse r8-r11 as each
          @ word is consumed. ldr leaves the flags from tst untouched.
          it        ne
          movne     r3,r8
          ldr       r8,[r2,#0]
          it        ne
          movne     r4,r9
          ldr       r9,[r2,#4]
          it        ne
          movne     r5,r10
          ldr       r10,[r2,#8]
          it        ne
          movne     r6,r11
          ldr       r11,[r2,#12]

          adds      r3,r3,r8            @ accumulate nonce
          adcs      r4,r4,r9
          adcs      r5,r5,r10
          adc       r6,r6,r11           @ carry out of bit 127 is dropped

# ifdef __ARMEB__
          rev       r3,r3                         @ byte-swap each word before storing
          rev       r4,r4
          rev       r5,r5
          rev       r6,r6
# endif
          str       r3,[r1,#0]                    @ store the result
          str       r4,[r1,#4]
          str       r5,[r1,#8]
          str       r6,[r1,#12]

          ldmia     sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
          RET                                     @ bx      lr
.size     poly1305_emit_neon,.-poly1305_emit_neon
1153
@ Sixteen zero words (64 bytes) placed on a 2^5 = 32-byte boundary.
@ NOTE(review): presumably read as all-zero padding by the NEON code;
@ confirm what readers expect before resizing.
.align    5
.Lzeros:
.word     0,0,0,0
.word     0,0,0,0
.word     0,0,0,0
.word     0,0,0,0

@ Literal loaded by poly1305_init to locate OPENSSL_armcap_P.
@ Default form: offset of the symbol from .Lpoly1305_init, which the init
@ code adds to the address it obtains with adr.
@ Alternate form: the absolute address of the symbol, which the init code
@ dereferences directly.
.LOPENSSL_armcap:
# if !defined(_WIN32)
.word     OPENSSL_armcap_P-.Lpoly1305_init
# else
.word     OPENSSL_armcap_P
# endif
1163#endif
@ NUL-terminated identification string embedded in the object.
.asciz    "Poly1305 for ARMv4/NEON, CRYPTOGAMS by <appro@openssl.org>"
.align    2
#if       __ARM_MAX_ARCH__>=7
@ Declare OPENSSL_armcap_P as a 4-byte common symbol with 4-byte alignment.
.comm     OPENSSL_armcap_P,4,4
#endif
1170