1dnl  ARM mpn_mod_34lsub1 -- remainder modulo 2^24-1 (congruent value, not fully reduced).
2
3dnl  Copyright 2012, 2013 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33C              cycles/limb
34C StrongARM          ?
35C XScale   ?
36C Cortex-A5          2.67
37C Cortex-A7          2.35
38C Cortex-A8          2.0
39C Cortex-A9          1.33
40C Cortex-A15         1.33
41C Cortex-A17         3.34
42C Cortex-A53         2.0
43
44define(`ap',        r0)
45define(`n',         r1)
46
47C mp_limb_t mpn_mod_34lsub1 (mp_srcptr ap, mp_size_t n)
48
49C TODO
50C  * Write cleverer summation code.
51C  * Consider loading 6 64-bit aligned registers at a time, to approach 1 c/l.
52
53ASM_START()
54          TEXT
55          ALIGN(32)
C mpn_mod_34lsub1: fold the n 32-bit limbs at ap into a single value that is
C congruent to the whole operand modulo 2^24-1.  No final reduction is done,
C so the returned value may be larger than 2^24-1.
C
C In:    ap (r0) = pointer to the limbs, n (r1) = number of limbs.
C        NOTE(review): n = 0 is not treated specially and would read one limb
C        via L(1); presumably callers guarantee n >= 1 -- confirm.
C Out:   r0 = folded value.
C Regs:  r2, r3, r12 = running sums of the limbs at index 0, 1, 2 (mod 3)
C        r4, r5, r6  = limbs just loaded
C        r7          = carries out of the 96-bit sum r12:r3:r2
C        r4-r7 are saved on entry and restored before each return.
C
C Method: 2^24 == 1 (mod 2^24-1), hence 2^96 == 1 as well.  The limbs are
C summed in groups of three into a 96-bit accumulator whose carry out wraps
C around to the bottom, and the accumulator is finally cut into 24-bit
C aligned pieces which are added together.
56PROLOGUE(mpn_mod_34lsub1)
57          push      { r4, r5, r6, r7 }
58
C n -= 3 with flags; the mov below leaves those flags alone, so the blt still
C tests the subtraction.  r7 is zeroed here because every path into L(sum2)
C adds it to the result.
59          subs      n, n, #3
60          mov       r7, #0
61          blt       L(le2)                        C n <= 2
62
C Seed the accumulators with the first three limbs (ap advances by 12 bytes).
63          ldmia     ap!, { r2, r3, r12 }
64          subs      n, n, #3
65          blt       L(sum)                        C n <= 5
C Six or more limbs.  The loop chains the carry flag from one pass to the
C next, so it must start out clear.  n is biased by a further 3 so that the
C sign test at L(mid) answers "do at least three limbs remain beyond the
C group just loaded?".
66          cmn       r0, #0                        C clear carry
67          sub       n, n, #3
68          b         L(mid)
69
C Add the previously loaded group with carry.  The carry out of r12 has
C weight 2^96 == 1, so feeding it into r2 on the next pass is exact.
70L(top):   adcs      r2, r2, r4
71          adcs      r3, r3, r5
72          adcs      r12, r12, r6
C Load the next group.  ldmia, tst (register operand, no shift) and the
C flagless sub all leave the carry flag untouched for the next adcs.
73L(mid):   ldmia     ap!, { r4, r5, r6 }
74          tst       n, n
75          sub       n, n, #3
76          bpl       L(top)
77
C Undo the loop bias: n is now (limbs left) - 3, i.e. -3, -2 or -1.
78          add       n, n, #3
79
C Add the last full group; the final carry out becomes the start of r7.
80          adcs      r2, r2, r4
81          adcs      r3, r3, r5
82          adcs      r12, r12, r6
83          movcs     r7, #1                        C r7 <= 1
84
C Zero, one or two limbs are left, and n = (limbs left) - 3 on both entry
C paths.  cmn forms n + 2:
C   n = -3: no carry              -> r4 = 0,    r5 = 0
C   n = -2: carry, result zero    -> r4 loaded, r5 = 0
C   n = -1: carry, result nonzero -> r4 loaded, r5 loaded
85L(sum):   cmn       n, #2
86          movlo     r4, #0
87          ldrhs     r4, [ap], #4
88          movls     r5, #0
89          ldrhi     r5, [ap], #4
90
C Fresh carry chain (adds, not adcs) for the tail limbs; its carry out is
C accumulated into r7.
91          adds      r2, r2, r4
92          adcs      r3, r3, r5
93          adcs      r12, r12, #0
94          adc       r7, r7, #0                    C r7 <= 2
95
C Fold r12:r3:r2 plus the carry count r7 into r0.  Bit k of the 96-bit value
C has weight 2^(k mod 24) modulo 2^24-1.
96L(sum2):
C r2 (bits 0-31): low 24 bits as they are, top 8 bits moved down to weight 1,
C plus the wrapped carries.
97          bic       r0, r2, #0xff000000
98          add       r0, r0, r2, lsr #24
99          add       r0, r0, r7
100
C r3 (bits 32-63): its low 16 bits carry weight 2^8, its high 16 bits weight 1.
101          mov       r7, r3, lsl #8
102          bic       r1, r7, #0xff000000
103          add       r0, r0, r1
104          add       r0, r0, r3, lsr #16
105
C r12 (bits 64-95): its low 8 bits carry weight 2^16, its high 24 bits weight 1.
106          mov       r7, r12, lsl #16
107          bic       r1, r7, #0xff000000
108          add       r0, r0, r1
109          add       r0, r0, r12, lsr #8
110
111          pop       { r4, r5, r6, r7 }
112          return    lr
113
C Short operands.  n holds (original n) - 3 here, so n + 1 is zero exactly
C when the original n was 2.
114L(le2):   cmn       n, #1
115          bne       L(1)
C Two limbs: load them, clear the third accumulator and reuse the common
C folding code (r7 is still 0 from the entry sequence).
116          ldmia     ap!, { r2, r3 }
117          mov       r12, #0
118          b         L(sum2)
C One limb: only the first folding step of L(sum2) is needed.
119L(1):     ldr       r2, [ap]
120          bic       r0, r2, #0xff000000
121          add       r0, r0, r2, lsr #24
122          pop       { r4, r5, r6, r7 }
123          return    lr
124EPILOGUE()
125