1dnl  ARM v6 mpn_divexact_1
2
3dnl  Contributed to the GNU project by Torbjörn Granlund.
4
5dnl  Copyright 2012, 2013 Free Software Foundation, Inc.
6
7dnl  This file is part of the GNU MP Library.
8dnl
9dnl  The GNU MP Library is free software; you can redistribute it and/or modify
10dnl  it under the terms of either:
11dnl
12dnl    * the GNU Lesser General Public License as published by the Free
13dnl      Software Foundation; either version 3 of the License, or (at your
14dnl      option) any later version.
15dnl
16dnl  or
17dnl
18dnl    * the GNU General Public License as published by the Free Software
19dnl      Foundation; either version 2 of the License, or (at your option) any
20dnl      later version.
21dnl
22dnl  or both in parallel, as here.
23dnl
24dnl  The GNU MP Library is distributed in the hope that it will be useful, but
25dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
27dnl  for more details.
28dnl
29dnl  You should have received copies of the GNU General Public License and the
30dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
31dnl  see https://www.gnu.org/licenses/.
32
33include(`../config.m4')
34
35C               cycles/limb       cycles/limb
36C               norm    unorm    modexact_1c_odd
37C StrongARM          -         -
38C XScale   -         -
39C Cortex-A7          ?         ?
40C Cortex-A8          ?         ?
41C Cortex-A9          9        10                   9
42C Cortex-A15         7         7                   7
43
44C Architecture requirements:
45C v5      -
46C v5t     clz
47C v5te    -
48C v6      umaal
49C v6t2    -
50C v7a     -
51
C Argument registers of mpn_divexact_1, in the order rp, up, n, d.
52define(`rp', `r0')          C base of the quotient stores, post-incremented by 4
53define(`up', `r1')          C base of the source loads, post-incremented by 4
54define(`n',  `r2')          C limb count, counted down towards zero
55define(`d',  `r3')          C divisor, replaced by its odd part during setup
56
C Working registers.
57define(`cy',  `r7')         C carry limb handed from one quotient limb to the next
58define(`cnt', `r6')         C index of the lowest set bit of d, used as right shift amount
59define(`tnc', `r10')        C 32 - cnt, the matching left shift amount
60
61ASM_START()
C mpn_divexact_1 -- divide the n-limb operand at up by the single limb d and
C store n quotient limbs at rp, least significant limb first.  Each quotient
C limb is obtained by multiplying with an inverse of the odd part of d, so
C no division instruction is used and no remainder is produced.
C
C NOTE(review): presumably the caller guarantees that d divides the operand
C exactly, as the routine name suggests -- confirm against the documented
C contract.  The code always reads up[0] and stores at least one limb, and
C does not treat d = 0 specially, so n >= 1 and d != 0 look like
C preconditions as well.
C
C In:    r0 = rp, r1 = up, r2 = n, r3 = d
C Out:   quotient limbs stored through rp
C Saved: r4-r9 on every path, r10-r11 in addition on the even divisor path
C Temp:  r12, flags
C
C Register use once setup is complete:
C   d   = odd part of the divisor (original d shifted right by cnt)
C   cnt = index of the lowest set bit of the original d
C   r4  = inverse of the odd part of d, r8 = its negation
C   cy  = carry limb passed from one quotient limb to the next
C   r5  = quotient limb waiting to be stored
62PROLOGUE(mpn_divexact_1)
63          push      {r4,r5,r6,r7,r8,r9}          C preserve r4-r9, all overwritten below
64
65          tst       d, #1                         C Z set iff d is even, consumed by the beq to L(unnorm)
66
C None of the instructions from here to that beq uses a flag-setting form,
C so Z stays as set by the tst above.  NOTE(review): this assumes the LEA
C macro expansion leaves the flags alone -- confirm in its definition.
67          rsb       r4, d, #0                     C r4 = -d
68          and       r4, r4, d                     C -d & d keeps only the lowest set bit of d
69          clz       r4, r4
70          rsb       cnt, r4, #31                  C count_trailing_zeros: cnt = 31 - clz
71          mov       d, d, lsr cnt                 C strip the trailing zero bits, d is now odd
72
73C binvert limb
C r4 = inverse of the odd d.  A start value x is fetched as a byte from
C binvert_limb_table at index (d & 254) >> 1 and refined twice with
C x = 2*x - x*x*d.  NOTE(review): presumably the table byte is the inverse
C of d modulo 2^8, so that each refinement doubles the number of valid low
C bits (8 -> 16 -> 32).  The table is defined outside this file, confirm there.
74          LEA(      r4, binvert_limb_table)
75          and       r12, d, #254                  C bits 1..7 of d
76          ldrb      r4, [r4, r12, lsr #1]         C x = table[(d & 254) >> 1]
77          mul       r12, r4, r4                   C x*x
78          mul       r12, d, r12                   C x*x*d
79          rsb       r12, r12, r4, lsl #1          C first refinement: 2*x - x*x*d
80          mul       r4, r12, r12
81          mul       r4, d, r4
82          rsb       r4, r4, r12, lsl #1 C r4 = inverse (second refinement)
83
84          ldr       r5, [up], #4                  C r5 = up[0], up steps to the next limb
85          mov       cy, #0                        C nothing carried into the first limb
86          rsb       r8, r4, #0                    C r8 = -inverse
87          beq       L(unnorm)                     C d was even (Z from the tst at entry)
88
C Odd divisor: cnt is 0 here, so the source limbs are used as they are.
89L(norm):
90          subs      n, n, #1                      C limbs remaining after the first
91          mul       r5, r5, r4                    C first quotient limb = up[0] * inverse
92          beq       L(end)                        C n was 1, flags still from the subs
93
C Each pass stores the pending quotient limb q and forms the next one as
C (u - cy) * inverse, where u is the next source limb and cy is the high
C limb of q*d + previous cy.
94          ALIGN(16)
95L(top):   ldr       r9, [up], #4                  C u = next source limb
96          mov       r12, #0
97          str       r5, [rp], #4                  C store pending quotient limb
98          umaal     r12, cy, r5, d                C cy:r12 = r5*d + r12 + cy, only cy is used afterwards
99          mul       r5, r9, r4                    C u * inverse
100          mla       r5, cy, r8, r5              C r5 = u*inverse + cy*(-inverse)
101          subs      n, n, #1
102          bne       L(top)
103
104L(end):   str       r5, [rp]                    C most significant quotient limb
105          pop       {r4,r5,r6,r7,r8,r9}
106          bx        r14                          C return
107
C Even divisor: the operand is shifted right by cnt bits on the fly.  Each
C shifted limb is the high bits of one source limb (kept in r11) merged with
C the low bits of the following source limb, and the shifted limbs are then
C processed with the odd part of d exactly as in the odd divisor loop.
108L(unnorm):
109          push      {r10,r11}                   C two further temporaries for this path
110          rsb       tnc, cnt, #32               C tnc = 32 - cnt
111          mov       r11, r5, lsr cnt            C r11 = up[0] >> cnt
112          subs      n, n, #1
113          beq       L(edx)                      C n was 1
114
115          ldr       r12, [up], #4               C r12 = up[1]
116          orr       r9, r11, r12, lsl tnc       C first shifted limb
117          mov       r11, r12, lsr cnt           C high bits kept for the next shifted limb
118          mul       r5, r9, r4                  C first quotient limb, cy is still 0
119          subs      n, n, #1
120          beq       L(edu)                      C n was 2
121
122          ALIGN(16)
123L(tpu):   ldr       r12, [up], #4               C next source limb
124          orr       r9, r11, r12, lsl tnc       C r9 = next shifted limb
125          mov       r11, r12, lsr cnt           C high bits kept for the following pass
126          mov       r12, #0
127          str       r5, [rp], #4                C store pending quotient limb
128          umaal     r12, cy, r5, d              C cy:r12 = r5*d + r12 + cy, only cy is used afterwards
129          mul       r5, r9, r4
130          mla       r5, cy, r8, r5              C r5 = (r9 - cy) * inverse
131          subs      n, n, #1
132          bne       L(tpu)
133
C Wind down: no source limb is left to merge, so the top shifted limb is
C just the leftover high bits in r11.
134L(edu):   str       r5, [rp], #4
135          mov       r12, #0
136          umaal     r12, cy, r5, d
137          mul       r5, r11, r4
138          mla       r5, cy, r8, r5              C r5 = (r11 - cy) * inverse
139          str       r5, [rp]                    C most significant quotient limb
140          pop       {r10,r11}                   C restore in reverse order of the pushes
141          pop       {r4,r5,r6,r7,r8,r9}
142          bx        r14
143
C Single limb operand with even divisor: quotient = (up[0] >> cnt) * inverse.
144L(edx):   mul       r5, r11, r4
145          str       r5, [rp]
146          pop       {r10,r11}
147          pop       {r4,r5,r6,r7,r8,r9}
148          bx        r14
149EPILOGUE()
150