1dnl  IA-64 mpn_bdiv_dbm1c.
2
3dnl  Contributed to the GNU project by Torbjorn Granlund.
4
5dnl  Copyright 2008, 2009 Free Software Foundation, Inc.
6
7dnl  This file is part of the GNU MP Library.
8dnl
9dnl  The GNU MP Library is free software; you can redistribute it and/or modify
10dnl  it under the terms of either:
11dnl
12dnl    * the GNU Lesser General Public License as published by the Free
13dnl      Software Foundation; either version 3 of the License, or (at your
14dnl      option) any later version.
15dnl
16dnl  or
17dnl
18dnl    * the GNU General Public License as published by the Free Software
19dnl      Foundation; either version 2 of the License, or (at your option) any
20dnl      later version.
21dnl
22dnl  or both in parallel, as here.
23dnl
24dnl  The GNU MP Library is distributed in the hope that it will be useful, but
25dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
27dnl  for more details.
28dnl
29dnl  You should have received copies of the GNU General Public License and the
30dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
31dnl  see https://www.gnu.org/licenses/.
32
33include(`../config.m4')
34
35C         cycles/limb
36C Itanium:    4
37C Itanium 2:  2
38
39C TODO
40C  * Optimize feed-in and wind-down code, both for speed and code size.
41
42C INPUT PARAMETERS
C  rp  (r32)  address the result limbs are stored to, 8 bytes per limb
C  up  (r33)  address the source limbs are loaded from, 8 bytes per limb
C  n   (r34)  limb count; (n-1) >> 2 becomes the loop count and n & 3 picks
C             the feed-in path
C  bd  (r35)  multiplier, moved into f6 and applied to every source limb
C  The code also reads r36, which has no symbolic name here, as the initial
C  value of the running value kept in r15.
43define(`rp', `r32')
44define(`up', `r33')
45define(`n', `r34')
46define(`bd', `r35')
47
48ASM_START()
C mpn_bdiv_dbm1c: entry, setup and dispatch on n & 3.
C
C Taken as a whole, the routine computes, with h starting as the value
C passed in r36:
C
C   for each of the n source limbs u at up, lowest address first:
C       lo, hi = low and high 64 bits of the unsigned product u * bd
C       borrow = h < lo       (unsigned compare, cmp.ltu)
C       h      = h - lo       (this value is stored as the next limb at rp)
C       h      = h - hi - borrow
C   the final h is returned in r8
C
C Register use:
C   r15       running value h
C   f6        bd, moved to the FP side for xma
C   f9-f13    source limbs
C   f32-f39   products, as (low, high) pairs f32/f33, f34/f35, f36/f37, f38/f39
C   r20-r27   the same eight product words moved back with getf.sig
C   r16-r19   result limbs on their way to memory
C   p6, p7    borrow / no borrow out of the most recent h - lo
C   r14, r31  n & 3 and (n-1) >> 2
C   r2        caller value of ar.lc, restored before returning
C
C n = 0 is not handled: n-1 wraps and the shift below is unsigned.
C NOTE(review): callers presumably guarantee n >= 1 -- confirm.
49PROLOGUE(mpn_bdiv_dbm1c)
50          .prologue
51          .save               ar.lc, r2
52          .body
53
C With HAVE_ABI_32 defined, both pointers go through addp4 and n is
C zero-extended from 32 bits before anything else uses them.
54ifdef(`HAVE_ABI_32',
55`         addp4               rp = 0, rp                    C M I
56          addp4               up = 0, up                    C M I
57          zxt4                n = n                         C I
58          ;;
59')
C Seed h from r36 and start loading the first source limb into f9.
60{.mmb
61          mov                 r15 = r36           C M I
62          ldf8                f9 = [up], 8                  C M
63          nop.b               0                             C B
64}
C .Lcommon is not branched to from anywhere in the code shown here.
65.Lcommon:
C r16 = n-1 and r14 = n & 3; the incoming ar.lc is parked in r2.
66{.mii
67          adds                r16 = -1, n                   C M I
68          mov                 r2 = ar.lc                    C I0
69          and                 r14 = 3, n                    C M I
70          ;;
71}
C f6 = bd; r31 = (n-1) >> 2 is the loop count; p10 is set when n & 3 == 0.
72{.mii
73          setf.sig  f6 = bd                       C M2 M3
74          shr.u               r31 = r16, 2                  C I0
75          cmp.eq              p10, p0 = 0, r14    C M I
76}
C p11 is set when n & 3 == 2, p12 when n & 3 == 3.
77{.mii
78          nop.m               0                             C M
79          cmp.eq              p11, p0 = 2, r14    C M I
80          cmp.eq              p12, p0 = 3, r14    C M I
81          ;;
82}
C r0 != r0 is always false, so p6 and p8 start clear and p7 and p9 start set.
C p8 and p9 are not referenced again in the code shown here.
83{.mii
84          cmp.ne              p6, p7 = r0, r0               C M I
85          mov.i               ar.lc = r31                   C I0
86          cmp.ne              p8, p9 = r0, r0               C M I
87}
C Four-way dispatch; n & 3 == 1 matches no predicate and falls into .Lb01.
88{.bbb
89  (p10)   br.dptk             .Lb00                         C B
90  (p11)   br.dptk             .Lb10                         C B
91  (p12)   br.dptk             .Lb11                         C B
92          ;;
93}
94
C Feed-in for n & 3 == 1.  ar.lc holds (n-1) >> 2 on arrival, and each
C br.cloop branches and decrements it only while it is nonzero:
C   n == 1   one product, then straight to the last wind-down step .Lcj1
C   n == 5   .grt1 falls through and joins the wind-down at .Lcj5
C   n >= 9   .grt1 branches to .grt5, which joins the main loop at .LL01
95.Lb01:    br.cloop.dptk       .grt1
96          ;;
C n == 1: multiply the only limb and hand lo/hi to .Lcj1 in r26/r27.
97          xma.l               f38 = f9, f6, f0
98          xma.hu              f39 = f9, f6, f0
99          ;;
100          getf.sig  r26 = f38
101          getf.sig  r27 = f39
102          br                  .Lcj1
103
C n >= 5: load the next four limbs into f10-f13 while the products of the
C first two limbs (f9 into f38/f39, f10 into f32/f33) are started.
104.grt1:    ldf8                f10 = [r33], 8
105          ;;
106          ldf8                f11 = [r33], 8
107          ;;
108          ldf8                f12 = [r33], 8
109          ;;
110          xma.l               f38 = f9, f6, f0
111          xma.hu              f39 = f9, f6, f0
112          ;;
113          ldf8                f13 = [r33], 8
114          ;;
115          xma.l               f32 = f10, f6, f0
116          xma.hu              f33 = f10, f6, f0
117          br.cloop.dptk       .grt5
118
C n == 5: no more loads; issue the remaining products, move the finished
C ones to r26, r27, r20, r21, r22 and join the wind-down at .Lcj5.
119          ;;
120          getf.sig  r26 = f38
121          xma.l               f34 = f11, f6, f0
122          xma.hu              f35 = f11, f6, f0
123          ;;
124          getf.sig  r27 = f39
125          ;;
126          getf.sig  r20 = f32
127          xma.l               f36 = f12, f6, f0
128          xma.hu              f37 = f12, f6, f0
129          ;;
130          getf.sig  r21 = f33
131          ;;
132          getf.sig  r22 = f34
133          xma.l               f38 = f13, f6, f0
134          xma.hu              f39 = f13, f6, f0
135          br                  .Lcj5
136
C n >= 9: same product schedule as above, interleaved with reloads of
C f10, f11 and f12, then into the main loop at .LL01.
137.grt5:    ldf8                f10 = [r33], 8
138          ;;
139          getf.sig  r26 = f38
140          xma.l               f34 = f11, f6, f0
141          xma.hu              f35 = f11, f6, f0
142          ;;
143          getf.sig  r27 = f39
144          ldf8                f11 = [r33], 8
145          ;;
146          getf.sig  r20 = f32
147          xma.l               f36 = f12, f6, f0
148          xma.hu              f37 = f12, f6, f0
149          ;;
150          getf.sig  r21 = f33
151          ldf8                f12 = [r33], 8
152          ;;
153          getf.sig  r22 = f34
154          xma.l               f38 = f13, f6, f0
155          xma.hu              f39 = f13, f6, f0
156          br                  .LL01
157
C Feed-in for n & 3 == 2.  ar.lc holds (n-1) >> 2 on arrival, and each
C br.cloop branches and decrements it only while it is nonzero:
C   n == 2    two products, then the wind-down at .Lcj2
C   n == 6    .grt2 falls through and joins the wind-down at .Lcj6
C   n >= 10   .grt2 branches to .grt6, which joins the main loop at .LL10
158.Lb10:    ldf8                f13 = [r33], 8
159          br.cloop.dptk       .grt2
160          ;;
161
C n == 2: first limb (f9) gives r24/r25, second limb (f13) gives r26/r27.
162          xma.l               f36 = f9, f6, f0
163          xma.hu              f37 = f9, f6, f0
164          ;;
165          xma.l               f38 = f13, f6, f0
166          xma.hu              f39 = f13, f6, f0
167          ;;
168          getf.sig  r24 = f36
169          ;;
170          getf.sig  r25 = f37
171          ;;
172          getf.sig  r26 = f38
173          ;;
174          getf.sig  r27 = f39
175          br                  .Lcj2
176
C n >= 6: load four more limbs (f10, f11, f12, then f13 again once its
C previous contents have been consumed by xma) while the first three products
C are started.
177.grt2:    ldf8                f10 = [r33], 8
178          ;;
179          ldf8                f11 = [r33], 8
180          ;;
181          xma.l               f36 = f9, f6, f0
182          xma.hu              f37 = f9, f6, f0
183          ;;
184          ldf8                f12 = [r33], 8
185          ;;
186          xma.l               f38 = f13, f6, f0
187          xma.hu              f39 = f13, f6, f0
188          ;;
189          ldf8                f13 = [r33], 8
190          ;;
191          getf.sig  r24 = f36
192          xma.l               f32 = f10, f6, f0
193          xma.hu              f33 = f10, f6, f0
194          br.cloop.dptk       .grt6
195
C n == 6: no more loads; keep issuing products and join at .Lcj6.
196          getf.sig  r25 = f37
197          ;;
198          getf.sig  r26 = f38
199          xma.l               f34 = f11, f6, f0
200          xma.hu              f35 = f11, f6, f0
201          ;;
202          getf.sig  r27 = f39
203          ;;
204          getf.sig  r20 = f32
205          xma.l               f36 = f12, f6, f0
206          xma.hu              f37 = f12, f6, f0
207          br                  .Lcj6
208
C n >= 10: same schedule with reloads of f10 and f11, then into the main
C loop at .LL10.
209.grt6:    getf.sig  r25 = f37
210          ldf8                f10 = [r33], 8
211          ;;
212          getf.sig  r26 = f38
213          xma.l               f34 = f11, f6, f0
214          xma.hu              f35 = f11, f6, f0
215          ;;
216          getf.sig  r27 = f39
217          ldf8                f11 = [r33], 8
218          ;;
219          getf.sig  r20 = f32
220          xma.l               f36 = f12, f6, f0
221          xma.hu              f37 = f12, f6, f0
222          br                  .LL10
223
224
C Feed-in for n & 3 == 3.  ar.lc holds (n-1) >> 2 on arrival, and each
C br.cloop branches and decrements it only while it is nonzero:
C   n == 3    three products, then the wind-down at .Lcj3
C   n == 7    .grt3 falls through and joins the wind-down at .Lcj7
C   n >= 11   .grt3 branches to .grt7, which joins the main loop at .LL11
225.Lb11:    ldf8                f12 = [r33], 8
226          ;;
227          ldf8                f13 = [r33], 8
228          br.cloop.dptk       .grt3
229          ;;
230
C n == 3: limbs f9, f12, f13 give r22/r23, r24/r25 and r26; the high word
C of the last product is fetched into r27 by the first group at .Lcj3.
231          xma.l               f34 = f9, f6, f0
232          xma.hu              f35 = f9, f6, f0
233          ;;
234          xma.l               f36 = f12, f6, f0
235          xma.hu              f37 = f12, f6, f0
236          ;;
237          getf.sig  r22 = f34
238          xma.l               f38 = f13, f6, f0
239          xma.hu              f39 = f13, f6, f0
240          ;;
241          getf.sig  r23 = f35
242          ;;
243          getf.sig  r24 = f36
244          ;;
245          getf.sig  r25 = f37
246          ;;
247          getf.sig  r26 = f38
248          br                  .Lcj3
249
C n >= 7: load four more limbs, reusing f12 and f13 once their previous
C contents have been consumed by xma, while the first four products start.
250.grt3:    ldf8                f10 = [r33], 8
251          ;;
252          xma.l               f34 = f9, f6, f0
253          xma.hu              f35 = f9, f6, f0
254          ;;
255          ldf8                f11 = [r33], 8
256          ;;
257          xma.l               f36 = f12, f6, f0
258          xma.hu              f37 = f12, f6, f0
259          ;;
260          ldf8                f12 = [r33], 8
261          ;;
262          getf.sig  r22 = f34
263          xma.l               f38 = f13, f6, f0
264          xma.hu              f39 = f13, f6, f0
265          ;;
266          getf.sig  r23 = f35
267          ldf8                f13 = [r33], 8
268          ;;
269          getf.sig  r24 = f36
270          xma.l               f32 = f10, f6, f0
271          xma.hu              f33 = f10, f6, f0
272          br.cloop.dptk       .grt7
273
C n == 7: no more loads; join the wind-down at .Lcj7.
274          getf.sig  r25 = f37
275          ;;
276          getf.sig  r26 = f38
277          xma.l               f34 = f11, f6, f0
278          xma.hu              f35 = f11, f6, f0
279          br                  .Lcj7
280
C n >= 11: same schedule with a reload of f10, then into the main loop at
C .LL11.
281.grt7:    getf.sig  r25 = f37
282          ldf8                f10 = [r33], 8
283          ;;
284          getf.sig  r26 = f38
285          xma.l               f34 = f11, f6, f0
286          xma.hu              f35 = f11, f6, f0
287          br                  .LL11
288
289
C Feed-in for n & 3 == 0.  ar.lc holds (n-1) >> 2 on arrival, and each
C br.cloop branches and decrements it only while it is nonzero:
C   n == 4    four products, then the wind-down at .Lcj4
C   n == 8    .grt4 falls through its br.cloop and joins the wind-down at .Lcj8
C   n >= 12   .grt4 branches into the main loop at .LL00
290.Lb00:    ldf8                f11 = [r33], 8
291          ;;
292          ldf8                f12 = [r33], 8
293          ;;
294          ldf8                f13 = [r33], 8
295          br.cloop.dptk       .grt4
296          ;;
297
C n == 4: limbs f9, f11, f12, f13 go into f32/f33, f34/f35, f36/f37 and
C f38/f39.  r20-r24 are fetched here; r25, r26 and r27 are fetched by the
C wind-down code from .Lcj4 on.
298          xma.l               f32 = f9, f6, f0
299          xma.hu              f33 = f9, f6, f0
300          ;;
301          xma.l               f34 = f11, f6, f0
302          xma.hu              f35 = f11, f6, f0
303          ;;
304          getf.sig  r20 = f32
305          xma.l               f36 = f12, f6, f0
306          xma.hu              f37 = f12, f6, f0
307          ;;
308          getf.sig  r21 = f33
309          ;;
310          getf.sig  r22 = f34
311          xma.l               f38 = f13, f6, f0
312          xma.hu              f39 = f13, f6, f0
313          ;;
314          getf.sig  r23 = f35
315          ;;
316          getf.sig  r24 = f36
317          br                  .Lcj4
318
C n >= 8: start the first four products while loading limbs five to eight
C into f10-f13, then start the fifth product (f10 into f32/f33) once r20 and
C r21 hold the first one.
319.grt4:    xma.l               f32 = f9, f6, f0
320          xma.hu              f33 = f9, f6, f0
321          ;;
322          ldf8                f10 = [r33], 8
323          ;;
324          xma.l               f34 = f11, f6, f0
325          xma.hu              f35 = f11, f6, f0
326          ;;
327          ldf8                f11 = [r33], 8
328          ;;
329          getf.sig  r20 = f32
330          xma.l               f36 = f12, f6, f0
331          xma.hu              f37 = f12, f6, f0
332          ;;
333          getf.sig  r21 = f33
334          ldf8                f12 = [r33], 8
335          ;;
336          getf.sig  r22 = f34
337          xma.l               f38 = f13, f6, f0
338          xma.hu              f39 = f13, f6, f0
339          ;;
340          getf.sig  r23 = f35
341          ldf8                f13 = [r33], 8
342          ;;
343          getf.sig  r24 = f36
344          xma.l               f32 = f10, f6, f0
345          xma.hu              f33 = f10, f6, f0
C Taken for n >= 12 (more loads needed); otherwise n == 8 and all eight
C limbs are already loaded.
346          br.cloop.dptk       .LL00
347          br                  .Lcj8
348
349C *** MAIN LOOP START ***
C Four limbs per iteration, two instruction groups per limb:
C   the group at a .LLxx label fetches a product word, sets p6/p7 from
C   h < lo, forms h - lo in one of r16-r19 and loads a new source limb;
C   the group after it stores that result limb, starts the two xma for a
C   later limb, and sets r15 = (h - lo) - hi, minus 1 more under p6.
C .Ltop itself is a group of the second kind, finishing the limb begun at
C .LL01 on the previous pass.
C The feed-in code enters at .LL00, .LL11, .LL10 or .LL01 (from .grt4,
C .grt7, .grt6 and .grt5 respectively); the br.cloop at the bottom repeats
C while ar.lc is nonzero.
350          ALIGN(32)
351.Ltop:
C Declares p6 and p7 mutually exclusive to the assembler; each limb has a
C pair of writes to r15, one under p6 and one under p7, in the same group.
352          .pred.rel "mutex",p6,p7
353C         .mfi
354          getf.sig  r24 = f36
355          xma.l               f32 = f10, f6, f0
356  (p6)    sub                 r15 = r19, r27, 1
357C         .mfi
358          st8                 [r32] = r19, 8
359          xma.hu              f33 = f10, f6, f0
360  (p7)    sub                 r15 = r19, r27
361          ;;
C Limb with product words r20 (low) and r21 (high); result limb in r16.
362.LL00:
363C         .mfi
364          getf.sig  r25 = f37
365          nop.f 0
366          cmp.ltu             p6, p7 = r15, r20
367C         .mib
368          ldf8                f10 = [r33], 8
369          sub                 r16 = r15, r20
370          nop.b 0
371          ;;
372
373C         .mfi
374          getf.sig  r26 = f38
375          xma.l               f34 = f11, f6, f0
376  (p6)    sub                 r15 = r16, r21, 1
377C         .mfi
378          st8                 [r32] = r16, 8
379          xma.hu              f35 = f11, f6, f0
380  (p7)    sub                 r15 = r16, r21
381          ;;
C Limb with product words r22 (low) and r23 (high); result limb in r17.
382.LL11:
383C         .mfi
384          getf.sig  r27 = f39
385          nop.f 0
386          cmp.ltu             p6, p7 = r15, r22
387C         .mib
388          ldf8                f11 = [r33], 8
389          sub                 r17 = r15, r22
390          nop.b 0
391          ;;
392
393C         .mfi
394          getf.sig  r20 = f32
395          xma.l               f36 = f12, f6, f0
396  (p6)    sub                 r15 = r17, r23, 1
397C         .mfi
398          st8                 [r32] = r17, 8
399          xma.hu              f37 = f12, f6, f0
400  (p7)    sub                 r15 = r17, r23
401          ;;
C Limb with product words r24 (low) and r25 (high); result limb in r18.
402.LL10:
403C         .mfi
404          getf.sig  r21 = f33
405          nop.f 0
406          cmp.ltu             p6, p7 = r15, r24
407C         .mib
408          ldf8                f12 = [r33], 8
409          sub                 r18 = r15, r24
410          nop.b 0
411          ;;
412
413C         .mfi
414          getf.sig  r22 = f34
415          xma.l               f38 = f13, f6, f0
416  (p6)    sub                 r15 = r18, r25, 1
417C         .mfi
418          st8                 [r32] = r18, 8
419          xma.hu              f39 = f13, f6, f0
420  (p7)    sub                 r15 = r18, r25
421          ;;
C Limb with product words r26 (low) and r27 (high); result limb in r19,
C which is stored at .Ltop or, after the last pass, just below the loop.
422.LL01:
423C         .mfi
424          getf.sig  r23 = f35
425          nop.f 0
426          cmp.ltu             p6, p7 = r15, r26
427C         .mib
428          ldf8                f13 = [r33], 8
429          sub                 r19 = r15, r26
430          br.cloop.sptk.few .Ltop
431C *** MAIN LOOP END ***
432          ;;
433
C Wind-down: the same two-groups-per-limb schedule as the main loop, but
C with no further loads.  Falling out of the loop first finishes the limb
C begun at .LL01; after that each .LcjK label is an entry point with K
C result limbs still to be stored.  Entries from the feed-in code:
C   .Lcj8 from .grt4    .Lcj7 from .grt3    .Lcj6 from .grt2    .Lcj5 from .grt1
C   .Lcj4 from .Lb00    .Lcj3 from .Lb11    .Lcj2 from .Lb10    .Lcj1 from .Lb01
434          getf.sig  r24 = f36
435          xma.l               f32 = f10, f6, f0
436  (p6)    sub                 r15 = r19, r27, 1
437          st8                 [r32] = r19, 8
438          xma.hu              f33 = f10, f6, f0
439  (p7)    sub                 r15 = r19, r27
440          ;;
441.Lcj8:    getf.sig  r25 = f37
442          cmp.ltu             p6, p7 = r15, r20
443          sub                 r16 = r15, r20
444          ;;
445          getf.sig  r26 = f38
446          xma.l               f34 = f11, f6, f0
447  (p6)    sub                 r15 = r16, r21, 1
448          st8                 [r32] = r16, 8
449          xma.hu              f35 = f11, f6, f0
450  (p7)    sub                 r15 = r16, r21
451          ;;
452.Lcj7:    getf.sig  r27 = f39
453          cmp.ltu             p6, p7 = r15, r22
454          sub                 r17 = r15, r22
455          ;;
456          getf.sig  r20 = f32
457          xma.l               f36 = f12, f6, f0
458  (p6)    sub                 r15 = r17, r23, 1
459          st8                 [r32] = r17, 8
460          xma.hu              f37 = f12, f6, f0
461  (p7)    sub                 r15 = r17, r23
462          ;;
463.Lcj6:    getf.sig  r21 = f33
464          cmp.ltu             p6, p7 = r15, r24
465          sub                 r18 = r15, r24
466          ;;
467          getf.sig  r22 = f34
468          xma.l               f38 = f13, f6, f0
469  (p6)    sub                 r15 = r18, r25, 1
470          st8                 [r32] = r18, 8
471          xma.hu              f39 = f13, f6, f0
472  (p7)    sub                 r15 = r18, r25
473          ;;
C From here on every product has been started; only getf.sig transfers and
C the integer subtract/store work remain.
474.Lcj5:    getf.sig  r23 = f35
475          cmp.ltu             p6, p7 = r15, r26
476          sub                 r19 = r15, r26
477          ;;
478          getf.sig  r24 = f36
479  (p6)    sub                 r15 = r19, r27, 1
480          st8                 [r32] = r19, 8
481  (p7)    sub                 r15 = r19, r27
482          ;;
483.Lcj4:    getf.sig  r25 = f37
484          cmp.ltu             p6, p7 = r15, r20
485          sub                 r16 = r15, r20
486          ;;
487          getf.sig  r26 = f38
488  (p6)    sub                 r15 = r16, r21, 1
489          st8                 [r32] = r16, 8
490  (p7)    sub                 r15 = r16, r21
491          ;;
492.Lcj3:    getf.sig  r27 = f39
493          cmp.ltu             p6, p7 = r15, r22
494          sub                 r17 = r15, r22
495          ;;
496  (p6)    sub                 r15 = r17, r23, 1
497          st8                 [r32] = r17, 8
498  (p7)    sub                 r15 = r17, r23
499          ;;
500.Lcj2:    cmp.ltu             p6, p7 = r15, r24
501          sub                 r18 = r15, r24
502          ;;
503  (p6)    sub                 r15 = r18, r25, 1
504          st8                 [r32] = r18, 8
505  (p7)    sub                 r15 = r18, r25
506          ;;
C Last limb: the store has no post-increment, and the final running value
C goes to r8 instead of r15.  ar.lc is restored from r2 before returning
C through b0.
507.Lcj1:    cmp.ltu             p6, p7 = r15, r26
508          sub                 r19 = r15, r26
509          ;;
510  (p6)    sub                 r8 = r19, r27, 1
511          st8                 [r32] = r19
512  (p7)    sub                 r8 = r19, r27
513          mov ar.lc = r2
514          br.ret.sptk.many b0
515EPILOGUE()
516ASM_END()
517