1 /* mpn_mu_bdiv_qr(qp,rp,np,nn,dp,dn,tp) -- Compute {np,nn} / {dp,dn} mod B^qn,
2    where qn = nn-dn, storing the result in {qp,qn}.  Overlap allowed between Q
3    and N; all other overlap disallowed.
4 
5    Contributed to the GNU project by Torbjorn Granlund.
6 
7    THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
8    SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
9    GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
10 
11 Copyright 2005-2007, 2009, 2010, 2012, 2017 Free Software Foundation, Inc.
12 
13 This file is part of the GNU MP Library.
14 
15 The GNU MP Library is free software; you can redistribute it and/or modify
16 it under the terms of either:
17 
18   * the GNU Lesser General Public License as published by the Free
19     Software Foundation; either version 3 of the License, or (at your
20     option) any later version.
21 
22 or
23 
24   * the GNU General Public License as published by the Free Software
25     Foundation; either version 2 of the License, or (at your option) any
26     later version.
27 
28 or both in parallel, as here.
29 
30 The GNU MP Library is distributed in the hope that it will be useful, but
31 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
32 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
33 for more details.
34 
35 You should have received copies of the GNU General Public License and the
36 GNU Lesser General Public License along with the GNU MP Library.  If not,
37 see https://www.gnu.org/licenses/.  */
38 
39 
40 /*
41    The idea of the algorithm used herein is to compute a smaller inverted value
42    than used in the standard Barrett algorithm, and thus save time in the
43    Newton iterations, and pay just a small price when using the inverted value
44    for developing quotient bits.  This algorithm was presented at ICMS 2006.
45 */
46 
47 #include "gmp-impl.h"
48 
49 
50 /* N = {np,nn}
51    D = {dp,dn}
52 
53    Requirements: N >= D
54                      D >= 1
55                      D odd
56                      dn >= 2
57                      nn >= 2
58                      scratch space as determined by mpn_mu_bdiv_qr_itch(nn,dn).
59 
60    Write quotient to Q = {qp,nn-dn}.
61 
62    FIXME: When iterating, perhaps do the small step before loop, not after.
63    FIXME: Try to avoid the scalar divisions when computing inverse size.
64    FIXME: Trim allocation for (qn > dn) case, 3*dn might be possible.  In
65             particular, when dn==in, tp and rp could use the same space.
66 */
67 static mp_limb_t
mpn_mu_bdiv_qr_old(mp_ptr qp,mp_ptr rp,mp_srcptr np,mp_size_t nn,mp_srcptr dp,mp_size_t dn,mp_ptr scratch)68 mpn_mu_bdiv_qr_old (mp_ptr qp,
69                         mp_ptr rp,
70                         mp_srcptr np, mp_size_t nn,
71                         mp_srcptr dp, mp_size_t dn,
72                         mp_ptr scratch)
73 {
74   mp_size_t qn;
75   mp_size_t in;
76   mp_limb_t cy, c0;
77   mp_size_t tn, wn;
78 
79   qn = nn - dn;
80 
81   ASSERT (dn >= 2);
82   ASSERT (qn >= 2);
83 
84   if (qn > dn)
85     {
86       mp_size_t b;
87 
88       /* |_______________________|   dividend
89                               |________|   divisor  */
90 
91 #define ip           scratch            /* in */
92 #define tp           (scratch + in)     /* dn+in or next_size(dn) or rest >= binvert_itch(in) */
93 #define scratch_out  (scratch + in + tn)/* mulmod_bnm1_itch(next_size(dn)) */
94 
95       /* Compute an inverse size that is a nice partition of the quotient.  */
96       b = (qn - 1) / dn + 1;  /* ceil(qn/dn), number of blocks */
97       in = (qn - 1) / b + 1;  /* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */
98 
99       /* Some notes on allocation:
100 
101            When in = dn, R dies when mpn_mullo returns, if in < dn the low in
102            limbs of R dies at that point.  We could save memory by letting T live
103            just under R, and let the upper part of T expand into R. These changes
104            should reduce itch to perhaps 3dn.
105        */
106 
107       mpn_binvert (ip, dp, in, tp);
108 
109       MPN_COPY (rp, np, dn);
110       np += dn;
111       cy = 0;
112 
113       while (qn > in)
114           {
115             mpn_mullo_n (qp, rp, ip, in);
116 
117             if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
118               mpn_mul (tp, dp, dn, qp, in);       /* mulhi, need tp[dn+in-1...in] */
119             else
120               {
121                 tn = mpn_mulmod_bnm1_next_size (dn);
122                 mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out);
123                 wn = dn + in - tn;                /* number of wrapped limbs */
124                 if (wn > 0)
125                     {
126                       c0 = mpn_sub_n (tp + tn, tp, rp, wn);
127                       mpn_decr_u (tp + wn, c0);
128                     }
129               }
130 
131             qp += in;
132             qn -= in;
133 
134             if (dn != in)
135               {
136                 /* Subtract tp[dn-1...in] from partial remainder.  */
137                 cy += mpn_sub_n (rp, rp + in, tp + in, dn - in);
138                 if (cy == 2)
139                     {
140                       mpn_incr_u (tp + dn, 1);
141                       cy = 1;
142                     }
143               }
144             /* Subtract tp[dn+in-1...dn] from dividend.  */
145             cy = mpn_sub_nc (rp + dn - in, np, tp + dn, in, cy);
146             np += in;
147           }
148 
149       /* Generate last qn limbs.  */
150       mpn_mullo_n (qp, rp, ip, qn);
151 
152       if (BELOW_THRESHOLD (qn, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
153           mpn_mul (tp, dp, dn, qp, qn);           /* mulhi, need tp[qn+in-1...in] */
154       else
155           {
156             tn = mpn_mulmod_bnm1_next_size (dn);
157             mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, qn, scratch_out);
158             wn = dn + qn - tn;                              /* number of wrapped limbs */
159             if (wn > 0)
160               {
161                 c0 = mpn_sub_n (tp + tn, tp, rp, wn);
162                 mpn_decr_u (tp + wn, c0);
163               }
164           }
165 
166       if (dn != qn)
167           {
168             cy += mpn_sub_n (rp, rp + qn, tp + qn, dn - qn);
169             if (cy == 2)
170               {
171                 mpn_incr_u (tp + dn, 1);
172                 cy = 1;
173               }
174           }
175       return mpn_sub_nc (rp + dn - qn, np, tp + dn, qn, cy);
176 
177 #undef ip
178 #undef tp
179 #undef scratch_out
180     }
181   else
182     {
183       /* |_______________________|   dividend
184                     |________________|   divisor  */
185 
186 #define ip           scratch            /* in */
187 #define tp           (scratch + in)     /* dn+in or next_size(dn) or rest >= binvert_itch(in) */
188 #define scratch_out  (scratch + in + tn)/* mulmod_bnm1_itch(next_size(dn)) */
189 
190       /* Compute half-sized inverse.  */
191       in = qn - (qn >> 1);
192 
193       mpn_binvert (ip, dp, in, tp);
194 
195       mpn_mullo_n (qp, np, ip, in);               /* low `in' quotient limbs */
196 
197       if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
198           mpn_mul (tp, dp, dn, qp, in);           /* mulhigh */
199       else
200           {
201             tn = mpn_mulmod_bnm1_next_size (dn);
202             mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out);
203             wn = dn + in - tn;                              /* number of wrapped limbs */
204             if (wn > 0)
205               {
206                 c0 = mpn_sub_n (tp + tn, tp, np, wn);
207                 mpn_decr_u (tp + wn, c0);
208               }
209           }
210 
211       qp += in;
212       qn -= in;
213 
214       cy = mpn_sub_n (rp, np + in, tp + in, dn);
215       mpn_mullo_n (qp, rp, ip, qn);               /* high qn quotient limbs */
216 
217       if (BELOW_THRESHOLD (qn, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
218           mpn_mul (tp, dp, dn, qp, qn);           /* mulhigh */
219       else
220           {
221             tn = mpn_mulmod_bnm1_next_size (dn);
222             mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, qn, scratch_out);
223             wn = dn + qn - tn;                              /* number of wrapped limbs */
224             if (wn > 0)
225               {
226                 c0 = mpn_sub_n (tp + tn, tp, rp, wn);
227                 mpn_decr_u (tp + wn, c0);
228               }
229           }
230 
231       cy += mpn_sub_n (rp, rp + qn, tp + qn, dn - qn);
232       if (cy == 2)
233           {
234             mpn_incr_u (tp + dn, 1);
235             cy = 1;
236           }
237       return mpn_sub_nc (rp + dn - qn, np + dn + in, tp + dn, qn, cy);
238 
239 #undef ip
240 #undef tp
241 #undef scratch_out
242     }
243 }
244 
245 mp_limb_t
mpn_mu_bdiv_qr(mp_ptr qp,mp_ptr rp,mp_srcptr np,mp_size_t nn,mp_srcptr dp,mp_size_t dn,mp_ptr scratch)246 mpn_mu_bdiv_qr (mp_ptr qp,
247                     mp_ptr rp,
248                     mp_srcptr np, mp_size_t nn,
249                     mp_srcptr dp, mp_size_t dn,
250                     mp_ptr scratch)
251 {
252   mp_limb_t cy = mpn_mu_bdiv_qr_old (qp, rp, np, nn, dp, dn, scratch);
253 
254   /* R' B^{qn} = U - Q' D
255    *
256    * Q = B^{qn} - Q' (assuming Q' != 0)
257    *
258    * R B^{qn} = U + Q D = U + B^{qn} D - Q' D
259    *          = B^{qn} D + R'
260    */
261 
262   if (UNLIKELY (!mpn_neg (qp, qp, nn - dn)))
263     {
264       /* Zero quotient. */
265       ASSERT (cy == 0);
266       return 0;
267     }
268   else
269     {
270       mp_limb_t cy2 = mpn_add_n (rp, rp, dp, dn);
271       ASSERT (cy2 >= cy);
272 
273       return cy2 - cy;
274     }
275 }
276 
277 
278 mp_size_t
mpn_mu_bdiv_qr_itch(mp_size_t nn,mp_size_t dn)279 mpn_mu_bdiv_qr_itch (mp_size_t nn, mp_size_t dn)
280 {
281   mp_size_t qn, in, tn, itch_binvert, itch_out, itches;
282   mp_size_t b;
283 
284   ASSERT_ALWAYS (DC_BDIV_Q_THRESHOLD < MU_BDIV_Q_THRESHOLD);
285 
286   qn = nn - dn;
287 
288   if (qn > dn)
289     {
290       b = (qn - 1) / dn + 1;  /* ceil(qn/dn), number of blocks */
291       in = (qn - 1) / b + 1;  /* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */
292     }
293   else
294     {
295       in = qn - (qn >> 1);
296     }
297 
298   if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
299     {
300       tn = dn + in;
301       itch_out = 0;
302     }
303   else
304     {
305       tn = mpn_mulmod_bnm1_next_size (dn);
306       itch_out = mpn_mulmod_bnm1_itch (tn, dn, in);
307     }
308 
309   itch_binvert = mpn_binvert_itch (in);
310   itches = tn + itch_out;
311   return in + MAX (itches, itch_binvert);
312 }
313