1dnl  Alpha mpn_divrem_2 -- Divide an mpn number by a normalized 2-limb number.
2
3dnl  Copyright 2007, 2008, 2013 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33C                   norm      frac
34C ev4
35C ev5               70        70
36C ev6               29        29
37
C TODO
C  * Perhaps inline mpn_invert_limb, that would allow us to not save/restore
C    any registers (thus save ~10 cycles per call).  It would also remove the
C    spill and reload of the loop counter around the call.
C  * Use negated d1 and/or d0 to speed carry propagation.  Might save a cycle
C    or two.
C  * Check cluster delays (for ev6).  We very likely could save some cycles.
C  * Use branch-free code for computing di.

C INPUT PARAMETERS
define(`qp',                  `r16')
define(`fn',                  `r17')
define(`up_param',  `r18')
define(`un_param',  `r19')
define(`dp',                  `r20')

C mpn_divrem_2 -- divide the number at up_param (un_param limbs, extended by
C fn zero limbs at the low end) by the 2-limb divisor at dp.  Quotient limbs
C are stored through qp, the 2-limb remainder is stored back into the up
C area, and the most significant quotient limb (0 or 1) is returned in r0.
C
C REGISTER ROLES (after the entry sequence)
C   r9   n1, middle limb of the 3-limb partial remainder
C   r10  d0, low divisor limb             (loaded from 0(dp))
C   r11  n2, high limb of the partial remainder
C   r12  d1, high divisor limb            (loaded from 8(dp))
C   r13  read cursor into the up area, also the remainder store base
C   r15  copy of fn
C   r16  quotient store pointer inside the loop
C   r19  loop counter i, runs from fn + un - 3 down to 0
C   r0   di, the adjusted inverse used to estimate each quotient limb
C
C FRAME LAYOUT (80 bytes)
C    0  r26 (return address)     40  r13
C    8  r9                       48  spill slot for r19 across the call
C   16  r10                      56  r15
C   24  r11                      64  qp
C   32  r12                      72  most_significant_q_limb

ASM_START()
PROLOGUE(mpn_divrem_2,gp)
          lda       r30, -80(r30)
          stq       r26, 0(r30)
          stq       r9, 8(r30)
          stq       r10, 16(r30)
          stq       r11, 24(r30)
          stq       r12, 32(r30)
          stq       r13, 40(r30)
          stq       r15, 56(r30)
          .prologue 1
          stq       r16, 64(r30)                  C keep qp, r16 is reused as call argument
          bis       r31, r17, r15                 C r15 = fn
          s8addq    r19, r18, r13                 C r13 = up + 8*un
          lda       r13, -24(r13)                 C r13 = address of up[un-3]
          ldq       r12, 8(r20)                   C d1
          ldq       r10, 0(r20)                   C d0
          ldq       r11, 16(r13)                  C n2 = up[un-1]
          ldq       r9, 8(r13)                    C n1 = up[un-2]

C If the top two dividend limbs are >= the divisor, subtract the divisor once
C and record a most significant quotient limb of 1.
          bis       r31, r31, r3                  C most_significant_q_limb = 0
          cmpult    r11, r12, r1                  C n2 < d1 ?
          bne       r1, L(L8)
          cmpule    r11, r12, r1                  C n2 <= d1, here that means n2 == d1
          cmpult    r9, r10, r2                   C r2 = borrow of n1 - d0
          and       r1, r2, r1
          bne       r1, L(L8)                     C n2 == d1 and n1 < d0, nothing to subtract
          subq      r11, r12, r11                 C n2 -= d1
          subq      r11, r2, r11                  C n2 -= borrow
          subq      r9, r10, r9                   C n1 -= d0
          lda       r3, 1(r31)                    C most_significant_q_limb = 1
L(L8):    stq       r3, 72(r30)

          addq      r15, r19, r19
          lda       r19, -3(r19)                  C i = fn + un - 3
          blt       r19, L(L10)                   C no quotient limbs to develop
C r19 is an argument register, so the callee is free to overwrite it.  Keep
C the loop counter in the frame across the call instead of depending on how
C mpn_invert_limb happens to be implemented.
          stq       r19, 48(r30)                  C spill i
          bis       r31, r12, r16                 C argument: d1
          jsr       r26, mpn_invert_limb
          LDGP(     r29, 0(r26))
          ldq       r19, 48(r30)                  C reload i
C Adjust the 1-limb inverse of d1 (returned in r0) so that it also accounts
C for d0.  r7:r4 holds the running two-word quantity t1:t0.
          mulq      r0, r12, r4                   C t0 = LO(di * d1)
          umulh     r0, r10, r2                   C s1 = HI(di * d0)
          addq      r4, r10, r4                   C t0 += d0
          cmpule    r10, r4, r7                   C r7 = (d0 <= t0), set when t0 += d0 gave no carry
          addq      r4, r2, r4                    C t0 += s1
          cmpult    r4, r2, r1                    C cy for: t0 += s1
          subq      r1, r7, r7                    C t1 (-1, 0, or 1)
          blt       r7, L(L42)
L(L22):
          lda       r0, -1(r0)                    C di--
          cmpult    r4, r12, r1                   C cy for: t0 -= d1 (below)
          subq      r7, r1, r7                    C t1 -= cy
          subq      r4, r12, r4                   C t0 -= d1
          bge       r7, L(L22)
L(L42):
          ldq       r16, 64(r30)                  C qp
          s8addq    r19, r16, r16                 C r16 = address of qp[i]
          ALIGN(16)
C Main loop: one quotient limb per iteration, stored at qp[i] for i counting
C down to 0.
L(loop):
          mulq      r11, r0, r5                   C q0 (early)
          umulh     r11, r0, r6                   C q  (early)
          addq      r5, r9, r8                    C q0 += n1
          addq      r6, r11, r6                   C q  += n2
          cmpult    r8, r5, r1                    C cy for: q0 += n1
          addq      r6, r1, r6                    C q  += cy
          unop
          mulq      r12, r6, r1                   C LO(d1 * q)
          umulh     r10, r6, r7                   C t1 = HI(d0 * q)
          subq      r9, r1, r9                    C n1 -= LO(d1 * q)
          mulq      r10, r6, r4                   C t0 = LO(d0 * q)
          unop
C While i >= fn the next low limb n0 comes from the up area.  Otherwise the
C branch is taken with r5 still holding the zero compare result, so n0 = 0.
          cmple     r15, r19, r5                  C r5 = (fn <= i)
          beq       r5, L(L31)
          ldq       r5, 0(r13)                    C n0 = next dividend limb
          lda       r13, -8(r13)
L(L31):   subq      r9, r12, r9                   C n1 -= d1
          cmpult    r5, r10, r1                   C cy for: n0 -= d0
          subq      r9, r1, r9                    C n1 -= cy
          subq      r5, r10, r5                   C n0 -= d0
          subq      r9, r7, r9                    C n1 -= t1
          cmpult    r5, r4, r1                    C cy for: n0 -= t0
          subq      r9, r1, r2                    C n1 -= cy (result in r2)
          subq      r5, r4, r5                    C n0 -= t0
          cmpult    r2, r8, r1                    C (n1 < q0)
          addq      r6, r1, r6                    C q += cond
          lda       r1, -1(r1)                    C mask = -(n1 >= q0)
          and       r1, r10, r4                   C mask & d0
          addq      r5, r4, r9                    C n0 += mask & d0
          and       r1, r12, r1                   C mask & d1
          cmpult    r9, r5, r11                   C cy for: n0 += mask & d0
          addq      r2, r1, r1                    C n1 += mask & d1
          addq      r1, r11, r11                  C n1 += cy
          cmpult    r11, r12, r1                  C n1 < d1 ?
          beq       r1, L(fix)                    C n1 >= d1, may need one more subtraction
L(bck):   stq       r6, 0(r16)                    C store quotient limb at qp[i]
          lda       r16, -8(r16)
          lda       r19, -1(r19)                  C i--
          bge       r19, L(loop)

C Store the 2-limb remainder back into the up area and return the most
C significant quotient limb.
L(L10):   stq       r9, 8(r13)
          stq       r11, 16(r13)
          ldq       r0, 72(r30)                   C return most_significant_q_limb
          ldq       r26, 0(r30)
          ldq       r9, 8(r30)
          ldq       r10, 16(r30)
          ldq       r11, 24(r30)
          ldq       r12, 32(r30)
          ldq       r13, 40(r30)
          ldq       r15, 56(r30)
          lda       r30, 80(r30)
          ret       r31, (r26), 1

C Out-of-line adjustment, entered with n1 >= d1.  Unless n1 == d1 and
C n0 < d0, subtract the divisor once more and increment q.
L(fix):   cmpule    r11, r12, r1                  C n1 <= d1, here that means n1 == d1
          cmpult    r9, r10, r2                   C r2 = borrow of n0 - d0
          and       r1, r2, r1
          bne       r1, L(bck)
          subq      r11, r12, r11                 C n1 -= d1
          subq      r11, r2, r11                  C n1 -= borrow
          subq      r9, r10, r9                   C n0 -= d0
          lda       r6, 1(r6)                     C q++
          br        L(bck)
EPILOGUE()
177ASM_END()
178