@ GF(2^m) helper routines for 32-bit ARM: a 32x32 carry-less multiply built
@ from integer ALU instructions and a 64x64 carry-less multiply that uses it
@ (or NEON when the capability word says NEON is available).
@
@ Syntax: GNU as, ARM flavour, run through the C preprocessor first.
@ Comment character is the at-sign; operands are written dst, src.
#include "arm_asm.h"
#include "arm_arch.h"

@ Assemble as Thumb-2 (unified syntax) when the compiler targets Thumb-2,
@ otherwise as classic 32-bit ARM code.
#if defined(__thumb2__)
.syntax   unified
.thumb
#else
.code     32
#endif

.text
@-----------------------------------------------------------------------
@ mul_1x1_ialu: 32x32 -> 64-bit carry-less (polynomial over GF(2))
@ multiplication using integer ALU instructions and an 8-entry table.
@
@ Private register convention (file-local symbol, reached with bl from
@ bn_GF2m_mul_2x2 in this file):
@   In:    r0  = b, consumed three bits at a time as a table index
@          r1  = a, the operand the table is built from
@          r12 = 7<<2, mask turning a shifted copy of b into a byte offset
@          sp  = base of a 32-byte scratch area used as tab[0..7]
@   Out:   r5  = low  32 bits of a*b
@          r4  = high 32 bits of a*b
@   Clobb: r4-r9, condition flags, the 8 words at [sp]
@   Keeps: r0-r3, r10, r12 and lr are never written here
@
@ tab[u] = u * (a & 0x3fffffff) for u = 0..7.  The two top bits of a are
@ masked off so that the largest entry (shifted left by two) still fits
@ in 32 bits; their contribution is added back near the end by
@ conditionally folding in b<<30 and b<<31.
@
@ Stores and loads are interleaved with the arithmetic on purpose
@ (scheduling); do not reorder without re-measuring.
@-----------------------------------------------------------------------
.type     mul_1x1_ialu,%function
.align    5
mul_1x1_ialu:
          @ ---- build tab[0..7] on the stack ---------------------------
          mov       r4,#0
          bic       r5,r1,#3<<30                  @ a1 = a & 0x3fffffff
          str       r4,[sp,#0]                    @ tab[0] = 0
          add       r6,r5,r5                      @ a2 = a1<<1
          str       r5,[sp,#4]                    @ tab[1] = a1
          eor       r7,r5,r6                      @ a1^a2
          str       r6,[sp,#8]                    @ tab[2] = a2
          mov       r8,r5,lsl#2                   @ a4 = a1<<2
          str       r7,[sp,#12]                   @ tab[3] = a1^a2
          eor       r9,r5,r8                      @ a1^a4
          str       r8,[sp,#16]                   @ tab[4] = a4
          eor       r4,r6,r8                      @ a2^a4
          str       r9,[sp,#20]                   @ tab[5] = a1^a4
          eor       r7,r7,r8                      @ a1^a2^a4
          str       r4,[sp,#24]                   @ tab[6] = a2^a4
          and       r8,r12,r0,lsl#2               @ offset of tab[b & 7]
          str       r7,[sp,#28]                   @ tab[7] = a1^a2^a4

          @ ---- walk b in 3-bit digits; r5 = low word, r4 = high word --
          @ Each digit at bit position k contributes tab[digit]<<k: the
          @ lsl#k part goes to r5 and the lsr#(32-k) part goes to r4.
          and       r9,r12,r0,lsr#1
          ldr       r5,[sp,r8]                    @ tab[b       & 0x7]
          and       r8,r12,r0,lsr#4
          ldr       r7,[sp,r9]                    @ tab[b >>  3 & 0x7]
          and       r9,r12,r0,lsr#7
          ldr       r6,[sp,r8]                    @ tab[b >>  6 & 0x7]
          eor       r5,r5,r7,lsl#3                @ uses r7 right after its load (stall)
          mov       r4,r7,lsr#29                  @ first write of the high word
          ldr       r7,[sp,r9]                    @ tab[b >>  9 & 0x7]

          and       r8,r12,r0,lsr#10
          eor       r5,r5,r6,lsl#6
          eor       r4,r4,r6,lsr#26
          ldr       r6,[sp,r8]                    @ tab[b >> 12 & 0x7]

          and       r9,r12,r0,lsr#13
          eor       r5,r5,r7,lsl#9
          eor       r4,r4,r7,lsr#23
          ldr       r7,[sp,r9]                    @ tab[b >> 15 & 0x7]

          and       r8,r12,r0,lsr#16
          eor       r5,r5,r6,lsl#12
          eor       r4,r4,r6,lsr#20
          ldr       r6,[sp,r8]                    @ tab[b >> 18 & 0x7]

          and       r9,r12,r0,lsr#19
          eor       r5,r5,r7,lsl#15
          eor       r4,r4,r7,lsr#17
          ldr       r7,[sp,r9]                    @ tab[b >> 21 & 0x7]

          and       r8,r12,r0,lsr#22
          eor       r5,r5,r6,lsl#18
          eor       r4,r4,r6,lsr#14
          ldr       r6,[sp,r8]                    @ tab[b >> 24 & 0x7]

          and       r9,r12,r0,lsr#25
          eor       r5,r5,r7,lsl#21
          eor       r4,r4,r7,lsr#11
          ldr       r7,[sp,r9]                    @ tab[b >> 27 & 0x7]

          tst       r1,#1<<30                     @ was bit 30 of a set?
          and       r8,r12,r0,lsr#28              @ only two bits of b remain
          eor       r5,r5,r6,lsl#24
          eor       r4,r4,r6,lsr#8
          ldr       r6,[sp,r8]                    @ tab[b >> 30      ]

          @ ---- add back the two bits of a that were masked off --------
#ifdef    __thumb2__
          itt       ne
#endif
          eorne     r5,r5,r0,lsl#30               @ bit 30 of a: result ^= b<<30
          eorne     r4,r4,r0,lsr#2
          tst       r1,#1<<31                     @ was bit 31 of a set?
          eor       r5,r5,r7,lsl#27               @ plain eor leaves the flags alone
          eor       r4,r4,r7,lsr#5
#ifdef    __thumb2__
          itt       ne
#endif
          eorne     r5,r5,r0,lsl#31               @ bit 31 of a: result ^= b<<31
          eorne     r4,r4,r0,lsr#1
          eor       r5,r5,r6,lsl#30               @ last (2-bit) digit of b
          eor       r4,r4,r6,lsr#2

          mov       pc,lr                         @ return to the caller in this file
.size     mul_1x1_ialu,.-mul_1x1_ialu
@-----------------------------------------------------------------------
@ bn_GF2m_mul_2x2: 64x64 -> 128-bit carry-less multiplication.
@
@ NOTE(review): the C prototype is not visible in this file; the register
@ use below implies something like
@   void bn_GF2m_mul_2x2(word r[4], word a1, word a0, word b1, word b0)
@ with 32-bit words.  Confirm against the C declaration.
@
@   In:    r0   = pointer to four result words (least significant first)
@          r1   = a1, r2 = a0, r3 = b1
@          [sp] = b0 (fifth argument, on the stack)
@   Out:   words 0..3 at r0 = (a1:a0) * (b1:b0) over GF(2)
@
@ Integer path: three calls to mul_1x1_ialu (Karatsuba) with an aligned
@ 32-byte table carved out below sp; r4-r10 are saved and restored.
@ NEON path (only assembled when __ARM_MAX_ARCH__>=7, taken when the
@ capability word has the NEON bit set): uses q0-q3, q8 and d26-d30 and
@ writes the result with a single 128-bit store.
@-----------------------------------------------------------------------
.globl    bn_GF2m_mul_2x2
.type     bn_GF2m_mul_2x2,%function
.align    5
bn_GF2m_mul_2x2:
#if __ARM_MAX_ARCH__>=7
          stmdb     sp!,{r10,lr}                  @ r10 is needed as scratch below
          ldr       r12,.LOPENSSL_armcap          @ literal word placed after this function
# if !defined(_WIN32)
          adr       r10,.LOPENSSL_armcap          @ literal is an offset: add its own address
          ldr       r12,[r12,r10]
# endif
# if defined(__APPLE__) || defined(_WIN32)
          ldr       r12,[r12]                     @ one more level of indirection here
# endif
          tst       r12,#ARMV7_NEON
          itt       ne
          ldrne     r10,[sp],#8                   @ NEON: restore r10, drop both saved words
          bne       .LNEON                        @ (lr is still live in its register)
          stmdb     sp!,{r4,r5,r6,r7,r8,r9}       @ integer path: 32 bytes saved in total
#else
          stmdb     sp!,{r4,r5,r6,r7,r8,r9,r10,lr}
#endif
          mov       r10,r0                        @ r10 = result pointer (r0 becomes b)
          mov       r0,r3                         @ r0 = b1
          sub       r7,sp,#36                     @ room for tab[8] plus one word
          mov       r8,sp                         @ remember the current sp
          and       r7,r7,#-32                    @ round down to a 32-byte boundary
          ldr       r3,[sp,#32]                   @ b0 sits just above the 32 saved bytes
          mov       r12,#7<<2                     @ index mask expected by mul_1x1_ialu
          mov       sp,r7                         @ allocate tab[8]
          str       r8,[r7,#32]                   @ old sp is kept right above the table

          bl        mul_1x1_ialu                  @ a1*b1
          str       r5,[r10,#8]                   @ low word  -> result word 2
          str       r4,[r10,#12]                  @ high word -> result word 3

          @ Swap r0<->r3 and r1<->r2 with the triple-XOR trick (two
          @ independent chains interleaved).
          eor       r0,r0,r3                      @ flip b0 and b1
          eor       r1,r1,r2                      @ flip a0 and a1
          eor       r3,r3,r0
          eor       r2,r2,r1
          eor       r0,r0,r3
          eor       r1,r1,r2
          bl        mul_1x1_ialu                  @ a0*b0
          str       r5,[r10]                      @ low word  -> result word 0
          str       r4,[r10,#4]                   @ high word -> result word 1

          eor       r1,r1,r2                      @ r1 = a0^a1
          eor       r0,r0,r3                      @ r0 = b0^b1
          bl        mul_1x1_ialu                  @ (a1+a0)*(b1+b0)

          @ Karatsuba recombination.  With r6..r9 = result words 0..3 and
          @ r5:r4 = low:high of the middle product:
          @   word 2 = r4^r7^r8^r9
          @   word 1 = r5^r6^r7^r8
          ldmia     r10,{r6,r7,r8,r9}
          eor       r5,r5,r4
          ldr       sp,[sp,#32]                   @ destroy tab[8]
          eor       r4,r4,r7
          eor       r5,r5,r6
          eor       r4,r4,r8
          eor       r5,r5,r9
          eor       r4,r4,r9
          str       r4,[r10,#8]
          eor       r5,r5,r4                      @ cancels the extra r4 and r9 terms
          str       r5,[r10,#4]

#if __ARM_ARCH__>=5
          ldmia     sp!,{r4,r5,r6,r7,r8,r9,r10,pc}
#else
          ldmia     sp!,{r4,r5,r6,r7,r8,r9,r10,lr}
          tst       lr,#1
          moveq     pc,lr                         @ ARM-state caller: plain return works on V4
.word     0xe12fff1e                              @ otherwise bx lr, emitted as data
#endif
#if __ARM_MAX_ARCH__>=7
.arch     armv7-a
.fpu      neon

.align    5
.LNEON:
          @ A = a1:a0 in d26 and B = b1:b0 in d27.  The 64x64 product is
          @ assembled from 8x8-bit polynomial multiplies (vmull.p8) of
          @ byte-rotated copies of A and B; d28-d30 are masks applied to
          @ the partial sums before they are rotated into position.
          ldr       r12, [sp]                     @ 5th argument (b0)
          vmov      d26, r2, r1
          vmov      d27, r12, r3
          vmov.i64  d28, #0x0000ffffffffffff
          vmov.i64  d29, #0x00000000ffffffff
          vmov.i64  d30, #0x000000000000ffff

          vext.8    d2, d26, d26, #1              @ A1
          vmull.p8  q1, d2, d27                   @ F = A1*B
          vext.8    d0, d27, d27, #1              @ B1
          vmull.p8  q0, d26, d0                   @ E = A*B1
          vext.8    d4, d26, d26, #2              @ A2
          vmull.p8  q2, d4, d27                   @ H = A2*B
          vext.8    d16, d27, d27, #2             @ B2
          vmull.p8  q8, d26, d16                  @ G = A*B2
          vext.8    d6, d26, d26, #3              @ A3
          veor      q1, q1, q0                    @ L = E + F
          vmull.p8  q3, d6, d27                   @ J = A3*B
          vext.8    d0, d27, d27, #3              @ B3
          veor      q2, q2, q8                    @ M = G + H
          vmull.p8  q0, d26, d0                   @ I = A*B3
          veor      d2, d2, d3                    @ t0 = (L) (P0 + P1) << 8
          vand      d3, d3, d28
          vext.8    d16, d27, d27, #4             @ B4
          veor      d4, d4, d5                    @ t1 = (M) (P2 + P3) << 16
          vand      d5, d5, d29
          vmull.p8  q8, d26, d16                  @ K = A*B4
          veor      q3, q3, q0                    @ N = I + J
          veor      d2, d2, d3
          veor      d4, d4, d5
          veor      d6, d6, d7                    @ t2 = (N) (P4 + P5) << 24
          vand      d7, d7, d30
          vext.8    q1, q1, q1, #15
          veor      d16, d16, d17                 @ t3 = (K) (P6 + P7) << 32
          vmov.i64  d17, #0
          vext.8    q2, q2, q2, #14
          veor      d6, d6, d7
          vmull.p8  q0, d26, d27                  @ D = A*B
          vext.8    q8, q8, q8, #12
          vext.8    q3, q3, q3, #13
          veor      q1, q1, q2
          veor      q3, q3, q8
          veor      q0, q0, q1
          veor      q0, q0, q3

          vst1.32   {q0}, [r0]                    @ store all four result words
          RET                                     @ return macro from the headers (bx lr)
#endif
.size     bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
@ Literal used by bn_GF2m_mul_2x2 to locate the capability word.  It must
@ stay within pc-relative reach of the ldr/adr pair in that function.
#if __ARM_MAX_ARCH__>=7
.align    5
.LOPENSSL_armcap:
# ifdef   _WIN32
.word     OPENSSL_armcap_P                        @ absolute address
# else
.word     OPENSSL_armcap_P-.                      @ offset from this word (position independent)
# endif
#endif
@ NUL-terminated ASCII identification string:
@ GF(2^m) Multiplication for ARMv4/NEON, CRYPTOGAMS by <appro@openssl.org>
.byte     71,70,40,50,94,109,41,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align    2
.align    5

@ Common-symbol declaration of the capability word: 4 bytes, alignment
@ operand 4.
#if __ARM_MAX_ARCH__>=7
.comm     OPENSSL_armcap_P,4,4
#endif
237