1dnl  PPC-64 mpn_divrem_2 -- Divide an mpn number by a normalized 2-limb number.
2
3dnl  Copyright 2007, 2008 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33C                       cycles/limb
34C                       norm    frac
35C POWER3/PPC630
36C POWER4/PPC970         ?       ?
37C POWER5                37      ?
38C POWER6                62      ?
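39C POWER6                30.5    ?
C NOTE(review): POWER6 is listed twice in this table with different timings;
C one of the two rows presumably belongs to a different CPU -- confirm.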
40
41C INPUT PARAMETERS
42C qp  = r3
43C fn  = r4
44C up  = r5
45C un  = r6
46C dp  = r7
47
48
C When DARWIN is not defined, redefine the m4 token r2 as r31, so every later
C occurrence of r2 in this file expands to r31.  No literal r2 is written in
C the code of this file; presumably the redefinition is aimed at text produced
C by macro expansions -- unresolved, see the FIXME.
49ifdef(`DARWIN',,`
50define(`r2',`r31')')                    C FIXME!
51
52ASM_START()
53
C mpn_invert_limb is defined elsewhere; mpn_divrem_2 calls it once to obtain
C the reciprocal it uses in its main loop.
54EXTERN_FUNC(mpn_invert_limb)
55
C mpn_divrem_2 -- divide the un-limb number at up, extended by fn zero
C fraction limbs, by the 2-limb divisor at dp.
C
C Arguments: r3 = qp, r4 = fn, r5 = up, r6 = un, r7 = dp.
C Result:    r3 = 1 if the top two dividend limbs were >= the divisor on entry
C            (one initial subtraction was done), otherwise 0.
C Effects:   un+fn-2 quotient limbs are stored, from qp[un+fn-3] downwards.
C            The two remainder limbs are stored through r26; once all un-2
C            lower dividend limbs have been consumed this is up[0] and up[1].
C
C Register roles after setup:
C   r23     = return value
C   r24     = quotient store pointer (steps down one limb per iteration)
C   r25     = fn
C   r26     = pointer used to fetch the next lower dividend limb
C   r27     = index of the quotient limb being produced, counts down to 0
C   r28,r30 = low and high divisor limbs, dp[0] and dp[1]
C   r29,r31 = high and low limbs of the running remainder
C   r3      = reciprocal used by the loop (mpn_invert_limb result, adjusted)
56PROLOGUE(mpn_divrem_2,toc)
C Save lr and r23-r31 below the incoming stack pointer, then allocate a
C 192-byte frame.
57          mflr      r0
58          std       r23, -72(r1)
59          std       r24, -64(r1)
60          std       r25, -56(r1)
61          std       r26, -48(r1)
62          std       r27, -40(r1)
63          std       r28, -32(r1)
64          std       r29, -24(r1)
65          std       r30, -16(r1)
66          std       r31, -8(r1)
67          std       r0, 16(r1)
68          stdu      r1, -192(r1)
C Keep qp and fn in registers this function has saved, since r3 and r4 are
C reused below (r3 for the mpn_invert_limb call, r4 as a loop temporary).
69          mr        r24, r3
70          mr        r25, r4
C r26 = up + 8*un - 24, i.e. the address of up[un-3].
71          sldi      r0, r6, 3
72          add       r26, r5, r0
73          addi      r26, r26, -24
C Load the divisor (r30 = dp[1], r28 = dp[0]) and the two most significant
C dividend limbs (r29 = up[un-1], r31 = up[un-2]).
74          ld        r30, 8(r7)
75          ld        r28, 0(r7)
76          ld        r29, 16(r26)
77          ld        r31, 8(r26)
78
C Initial step: if r29:r31 >= r30:r28, subtract the divisor once and set
C r23 = 1; otherwise leave r23 = 0.
C ifelse(0,1,A,B) always selects B, so the first variant below is disabled
C and the second variant, which uses fewer branches, is the one assembled.
79ifelse(0,1,`
80          li        r23, 0
81          cmpld     cr7, r29, r30
82          blt       cr7, L(8)
83          bgt       cr7, L(9)
84          cmpld     cr0, r31, r28
85          blt       cr0, L(8)
86L(9):     subfc     r31, r28, r31
87          subfe     r29, r30, r29
88          li        r23, 1
89',`
90          li        r23, 0
C High limb below high divisor limb: nothing to subtract.
91          cmpld     cr7, r29, r30
92          blt       cr7, L(8)
C r0 = GT bit of cr7, which is 1 when r29 > r30.  The subfc sets the carry
C when r31 >= r28 and addze. adds that carry into r0.  Hence r0 is zero only
C when r29 == r30 and r31 < r28, meaning the top two limbs are below the
C divisor and the subtraction is skipped.
93          mfcr      r0
94          rlwinm    r0, r0, 30, 31, 31
95          subfc     r9, r28, r31
96          addze.    r0, r0
97          nop
98          beq       cr0, L(8)
C Top two limbs >= divisor: subtract the divisor once and record r23 = 1.
99          subfc     r31, r28, r31
100          subfe     r29, r30, r29
101          li        r23, 1
102')
103
104L(8):
C r27 = fn + un - 3, the index of the top quotient limb to produce.  If it is
C negative there are no quotient limbs, so go straight to the exit path.
105          add       r27, r25, r6
106          addic.    r27, r27, -3
107          blt       cr0, L(18)
C r3 = mpn_invert_limb(r30).  NOTE(review): presumably the reciprocal of the
C high divisor limb; its exact definition lives outside this file.
108          mr        r3, r30
109          CALL(     mpn_invert_limb)
C Adjust r3 so that it also takes the low divisor limb r28 into account.
C r10 = low(r3*r30) + r28 + high(r3*r28), and r11 = (number of carries out of
C those two additions) - 1.  A negative r11 means neither addition carried
C and r3 is used unchanged.
110          mulld     r10, r3, r30
111          mulhdu    r0, r3, r28
112          addc      r8, r10, r28
113          subfe     r11, r1, r1
114          addc      r10, r8, r0
115          addze.    r11, r11
116          blt       cr0, L(91)
C Otherwise subtract r30 from the two-word value r11:r10 and decrement r3,
C repeating until r11 becomes negative.
117L(40):
118          subfc     r10, r30, r10
119          addme.    r11, r11
120          addi      r3, r3, -1
121          bge       cr0, L(40)
122L(91):
C ctr = r27 + 1 loop iterations; r24 = address of qp[r27].
123          addi      r5, r27,  1
124          mtctr     r5
125          sldi      r0, r27, 3
126          add       r24, r24, r0
127          ALIGN(16)
C Main loop: each iteration produces one quotient limb in r8 and the new
C two-limb remainder in r29:r31.
128L(loop):
C Quotient estimate r8:r6 (high:low) = r29 * r3 + r29:r31.
129          mulhdu    r8, r29, r3
130          mulld     r6, r29, r3
131          addc      r6, r6, r31
132          adde      r8, r8, r29
C cr7 = signed compare of the limb index r27 with fn; tested at the blt below.
133          cmpd      cr7, r27, r25
C Begin forming remainder - estimate*divisor: r0 = low(r30*r8),
C r11:r10 = r28*r8, and r31 -= r0.
134          mulld     r0, r30, r8
135          mulhdu    r11, r28, r8
136          mulld     r10, r28, r8
137          subf      r31, r0, r31
C Next low limb r7: zero while producing the fn fraction limbs (r27 < fn),
C otherwise the next dividend limb fetched through r26, which then steps
C down one limb.
138          li        r7, 0
139          blt       cr7, L(60)
140          ld        r7, 0(r26)
141          addi      r26, r26, -8
142          nop
C Subtract the divisor once (as if the estimate were one larger) and then the
C low product r11:r10, giving the candidate remainder r4:r7.
143L(60):    subfc     r7, r28, r7
144          subfe     r31, r30, r31
145          subfc     r7, r10, r7
146          subfe     r4, r11, r31
C r9 = 0 if r4 >= r6, else all ones: subfc sets the carry when r4 >= r6 and
C subfe of r1 from itself yields carry - 1.  Only the carry from the first
C subfc is used; its r9 result is overwritten.
147          subfc     r9, r6, r4
148          subfe     r9, r1, r1
C If r9 is zero, add the divisor back to the candidate remainder and keep r8.
C If r9 is all ones, the masked addends are zero and r8 - r9 gives r8 + 1.
149          andc      r6, r28, r9
150          andc      r0, r30, r9
151          addc      r31, r7, r6
152          adde      r29, r4, r0
153          subf      r8, r9, r8
C If the remainder high limb is still >= r30, take the out-of-line path
C L(fix); the - suffix hints that this branch is unlikely.
154          cmpld     cr7, r29, r30
155          bge-      cr7, L(fix)
C Store the quotient limb, step the store pointer and limb index down, and
C loop while ctr is non-zero.
156L(bck):   std       r8, 0(r24)
157          addi      r24, r24, -8
158          addi      r27, r27, -1
159          bdnz      L(loop)
160L(18):
C Store the remainder limbs (low r31, high r29) in the two limbs just above
C r26, and return r23 in r3.
161          std       r31, 8(r26)
162          std       r29, 16(r26)
163          mr        r3, r23
C Pop the frame and restore lr and r23-r31 from below the stack pointer.
164          addi      r1, r1, 192
165          ld        r0, 16(r1)
166          mtlr      r0
167          ld        r23, -72(r1)
168          ld        r24, -64(r1)
169          ld        r25, -56(r1)
170          ld        r26, -48(r1)
171          ld        r27, -40(r1)
172          ld        r28, -32(r1)
173          ld        r29, -24(r1)
174          ld        r30, -16(r1)
175          ld        r31, -8(r1)
176          blr
C Out-of-line fixup, entered with cr7 holding the compare of r29 with r30 and
C r29 >= r30.  Same test as the initial step: r0 ends up zero only when
C r29 == r30 and r31 < r28, in which case nothing more is needed.  Otherwise
C subtract the divisor once more and increment the quotient limb.
177L(fix):
178          mfcr      r0
179          rlwinm    r0, r0, 30, 31, 31
180          subfc     r9, r28, r31
181          addze.    r0, r0
182          beq       cr0, L(bck)
183          subfc     r31, r28, r31
184          subfe     r29, r30, r29
185          addi      r8, r8, 1
186          b         L(bck)
187EPILOGUE()
188