1dnl  Alpha mpn_divexact_by3c -- mpn division by 3, expecting no remainder.
2
3dnl  Copyright 2004, 2005, 2009 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33C      cycles/limb
34C EV4:    22
35C EV5:    11.5
36C EV6:     6.3                Note that mpn_bdiv_dbm1c is faster
37
38C TODO
39C  * Remove the unops, they benefit just ev6, which no longer uses this file.
40C  * Try prefetch for destination, using lds.
41C  * Improve feed-in code, by moving initial mulq earlier; make the initial
42C    load go straight to the proper uN register to save some copying.
43C  * Combine u0 and u2, u1 and u3.
44
45C INPUT PARAMETERS
C Register aliases for the four arguments of mpn_divexact_by3c.  In the code
C below result limbs are stored through rp, source limbs are loaded through
C up, n drives the feed-in dispatch and the loop count, and cy is the
C incoming carry (compared against 1 and 2 to seed l0).
46define(`rp',        `r16')
47define(`up',        `r17')
48define(`n',         `r18')
49define(`cy',        `r19')
50
C Constant pool for mpn_divexact_by3c, which loads the three quads from
C offsets 0, 8 and 16 of L(LC):
C   0xAAAAAAAAAAAAAAAB  inverse of 3 modulo 2^64 (3 * it = 2^65 + 1)
C   0x5555555555555555  (2^64-1)/3; largest x for which 3*x does not wrap
C   0xAAAAAAAAAAAAAAAA  2*(2^64-1)/3; largest x for which 3*x wraps only once
51ASM_START()
52
53DATASTART(L(LC),8)
54          .quad     0xAAAAAAAAAAAAAAAB
55          .quad     0x5555555555555555
56          .quad     0xAAAAAAAAAAAAAAAA
57DATAEND()
58
C Working-register aliases used by mpn_divexact_by3c.
C   xAAAAAAAAAAAAAAAB, x5555555555555555, xAAAAAAAAAAAAAAAA
C                   hold the three constants loaded from L(LC)
C   u0..u3          source limbs, rotated through the 4-way unrolled loop
C   q0, q1          alternating mulq results (limb * 0xAAAAAAAAAAAAAAAB)
C   l0              carry-derived addend for the next quotient limb
C   x               quotient limb about to be stored
C   p6, p7          0/1 results of the two threshold compares on x
C   t0, t1          negated p6/p7, used as all-ones/zero masks
C   cymask          negated borrow, used as an all-ones/zero mask
C Several registers are shared on purpose: r0 (u0) is also the L(LC) pointer
C in the prologue and receives the final carry; r4/r5 (q0/q1) and r8 (x) are
C also the feed-in dispatch temporaries; r28 (cymask) first holds the first
C source limb until it is copied into a uN register.
59define(`xAAAAAAAAAAAAAAAB',   `r20')
60define(`x5555555555555555',   `r21')
61define(`xAAAAAAAAAAAAAAAA',   `r22')
62define(`u0',        `r0')     define(`u1',        `r1')
63define(`u2',        `r2')     define(`u3',        `r3')
64define(`l0',        `r25')    define(`x',         `r8')
65define(`q0',        `r4')     define(`q1',        `r5')
66define(`p6',        `r6')     define(`p7',        `r7')
67define(`t0',        `r23')    define(`t1',        `r24')
68define(`cymask',`r28')
69
70
C mpn_divexact_by3c: for each source limb u read through up, store through rp
C the limb x = (u - cy) / 3 mod 2^64, computed without a divide as
C     x = u * 0xAAAAAAAAAAAAAAAB + l0,   l0 = cy * 0x5555555555555555 mod 2^64
C The carry into the next limb is
C     (u < cy) + (x > 0x5555555555555555) + (x > 0xAAAAAAAAAAAAAAAA)
C where the last two terms count how many times 3*x wraps past 2^64.
C The carry left after the last limb is placed in r0 before returning.
C
C The first limb is loaded unconditionally, so n >= 1 is presumably required
C (NOTE(review): confirm against callers).  The initial l0 is only non-zero
C for an incoming cy of 1 or 2.
C
C The loop is unrolled four ways and software pipelined: every step issues
C the mulq for the next limb while finishing the previous one, which is why
C q0/q1 alternate and u0..u3 rotate from step to step.
71PROLOGUE(mpn_divexact_by3c,gp)
72
73          ldq       r28, 0(up)                              C load first limb early
74
75C Put magic constants in registers
76          lda       r0, L(LC)
77          ldq       xAAAAAAAAAAAAAAAB, 0(r0)
78          ldq       x5555555555555555, 8(r0)
79          ldq       xAAAAAAAAAAAAAAAA, 16(r0)
80
81C Compute initial l0 value
C l0 = 0x5555555555555555 if cy == 1, 0xAAAAAAAAAAAAAAAA if cy == 2, else 0.
82          cmpeq     cy, 1, p6
83          cmpeq     cy, 2, p7
84          negq      p6, p6
85          and       p6, x5555555555555555, l0
86          cmovne    p7, xAAAAAAAAAAAAAAAA, l0
87
88C Feed-in depending on (n mod 4)
C n is biased by -3 so the loop can be closed with a plain bge.  Each feed-in
C path copies the first limb out of r28 into the uN register expected at its
C entry point, starts its mulq, and biases up/rp so that the fixed ldq/stq
C offsets at that entry point address the next limb to load and the first
C limb to store.
89          and       n, 3, r8
90          lda       n, -3(n)
91          cmpeq     r8, 1, r4
92          cmpeq     r8, 2, r5
93          bne       r4, $Lb01
94          bne       r5, $Lb10
95          beq       r8, $Lb00
96
C n mod 4 == 3: enter the loop at its fourth step.
97$Lb11:    ldq       u3, 8(up)
98          lda       up, -24(up)
99          lda       rp, -24(rp)
100          mulq      r28, xAAAAAAAAAAAAAAAB, q0
101          mov       r28, u2
102          br        r31, $L11
103
C n mod 4 == 0: enter the loop at its third step.
104$Lb00:    ldq       u2, 8(up)
105          lda       up, -16(up)
106          lda       rp, -16(rp)
107          mulq      r28, xAAAAAAAAAAAAAAAB, q1
108          mov       r28, u1
109          br        r31, $L00
110
C n mod 4 == 1: a negative biased n here means a single limb, which goes
C straight to the final-limb code without loading a second limb.
111$Lb01:    lda       rp, -8(rp)
112          mulq      r28, xAAAAAAAAAAAAAAAB, q0
113          mov       r28, u0
114          blt       n, $Lcj1
115          ldq       u1, 8(up)
116          lda       up, -8(up)
117          br        r31, $L01
118
C n mod 4 == 2: a negative biased n here means exactly two limbs, which skip
C the loop; otherwise fall through into the top of the loop.
119$Lb10:    ldq       u0, 8(up)
120          mulq      r28, xAAAAAAAAAAAAAAAB, q1
121          mov       r28, u3
122          blt       n, $Lend
123
C Main loop, four limb-steps per pass.  Each step: take the borrow
C (previous limb < cy), start the mulq of the next limb, load the limb after
C that, form x = previous product + l0, then rebuild cy and l0 from the
C borrow and the two threshold compares on x, and store x.
124          ALIGN(16)
125$Ltop:
126C 0
127          cmpult    u3, cy, cy                              C L0
128          mulq      u0, xAAAAAAAAAAAAAAAB, q0     C U1
129          ldq       u1, 16(up)                              C L1
130          addq      q1, l0, x                     C U0
131C 1
132          negq      cy, cymask                              C L0
133          unop                                              C U1
134          unop                                              C L1
135          cmpult    x5555555555555555, x, p6      C U0
136C 2
137          cmpult    xAAAAAAAAAAAAAAAA, x, p7      C U1
138          unop
139          unop
140          negq      p6, t0                                  C L0
141C 3
142          negq      p7, t1                                  C L0
143          and       cymask, x5555555555555555, l0 C U1
144          addq      p6, cy, cy
145          and       t0, x5555555555555555, t0
146C 4
147          and       t1, x5555555555555555, t1
148          addq      p7, cy, cy
149          unop
150          addq      t0, l0, l0
151C 5
152          addq      t1, l0, l0
153          unop
154          stq       x, 0(rp)                      C L1
155          unop
156$L01:
157C 0
158          cmpult    u0, cy, cy                              C L0
159          mulq      u1, xAAAAAAAAAAAAAAAB, q1     C U1
160          ldq       u2, 24(up)                              C L1
161          addq      q0, l0, x                     C U0
162C 1
163          negq      cy, cymask                              C L0
164          unop                                              C U1
165          unop                                              C L1
166          cmpult    x5555555555555555, x, p6      C U0
167C 2
168          cmpult    xAAAAAAAAAAAAAAAA, x, p7      C U1
169          unop
170          unop
171          negq      p6, t0                                  C L0
172C 3
173          negq      p7, t1                                  C L0
174          and       cymask, x5555555555555555, l0 C U1
175          addq      p6, cy, cy
176          and       t0, x5555555555555555, t0
177C 4
178          and       t1, x5555555555555555, t1
179          addq      p7, cy, cy
180          unop
181          addq      t0, l0, l0
182C 5
183          addq      t1, l0, l0
184          unop
185          stq       x, 8(rp)                      C L1
186          unop
187$L00:
188C 0
189          cmpult    u1, cy, cy                              C L0
190          mulq      u2, xAAAAAAAAAAAAAAAB, q0     C U1
191          ldq       u3, 32(up)                              C L1
192          addq      q1, l0, x                     C U0
193C 1
194          negq      cy, cymask                              C L0
195          unop                                              C U1
196          unop                                              C L1
197          cmpult    x5555555555555555, x, p6      C U0
198C 2
199          cmpult    xAAAAAAAAAAAAAAAA, x, p7      C U1
200          unop
201          unop
202          negq      p6, t0                                  C L0
203C 3
204          negq      p7, t1                                  C L0
205          and       cymask, x5555555555555555, l0 C U1
206          addq      p6, cy, cy
207          and       t0, x5555555555555555, t0
208C 4
209          and       t1, x5555555555555555, t1
210          addq      p7, cy, cy
211          unop
212          addq      t0, l0, l0
213C 5
214          addq      t1, l0, l0
215          unop
216          stq       x, 16(rp)                     C L1
217          unop
C Fourth step: same work as the others, with the loop bookkeeping (n -= 4,
C up/rp += 32, prefetch) placed in slots the other steps fill with unop.
218$L11:
219C 0
220          cmpult    u2, cy, cy                              C L0
221          mulq      u3, xAAAAAAAAAAAAAAAB, q1     C U1
222          ldq       u0, 40(up)                              C L1
223          addq      q0, l0, x                     C U0
224C 1
225          negq      cy, cymask                              C L0
226          unop                                              C U1
227          unop                                              C L1
228          cmpult    x5555555555555555, x, p6      C U0
229C 2
230          cmpult    xAAAAAAAAAAAAAAAA, x, p7      C U1
231          lda       n, -4(n)                      C L1 bookkeeping
232          unop
233          negq      p6, t0                                  C L0
234C 3
235          negq      p7, t1                                  C L0
236          and       cymask, x5555555555555555, l0 C U1
237          addq      p6, cy, cy
238          and       t0, x5555555555555555, t0
239C 4
240          and       t1, x5555555555555555, t1
241          addq      p7, cy, cy
242          unop
243          addq      t0, l0, l0
244C 5
245          addq      t1, l0, l0
246          unop
247          stq       x, 24(rp)                     C L1
248          lda       up, 32(up)
249C
250          ldl       r31, 256(up)                            C prefetch
251          unop
252          lda       rp, 32(rp)
253          bge       n, $Ltop                      C U1
254C *** MAIN LOOP END ***
C Wind-down: finish the limb held in u3/q1 and start the multiply for the
C last limb (already in u0); same step as at $Ltop but with no further load.
255$Lend:
256
257          cmpult    u3, cy, cy                              C L0
258          mulq      u0, xAAAAAAAAAAAAAAAB, q0     C U1
259          unop
260          addq      q1, l0, x                     C U0
261C 1
262          negq      cy, cymask                              C L0
263          unop                                              C U1
264          unop                                              C L1
265          cmpult    x5555555555555555, x, p6      C U0
266C 2
267          cmpult    xAAAAAAAAAAAAAAAA, x, p7      C U1
268          unop
269          unop
270          negq      p6, t0                                  C L0
271C 3
272          negq      p7, t1                                  C L0
273          and       cymask, x5555555555555555, l0 C U1
274          addq      p6, cy, cy
275          and       t0, x5555555555555555, t0
276C 4
277          and       t1, x5555555555555555, t1
278          addq      p7, cy, cy
279          unop
280          addq      t0, l0, l0
281C 5
282          addq      t1, l0, l0
283          unop
284          stq       x, 0(rp)                      C L1
285          unop
C Last limb: x = q0 + l0 is stored, l0 is not rebuilt since nothing follows,
C and the final carry (borrow + p6 + p7) is written to r0.
286$Lcj1:
287          cmpult    u0, cy, cy                              C L0
288          addq      q0, l0, x                     C U0
289          cmpult    x5555555555555555, x, p6      C U0
290          cmpult    xAAAAAAAAAAAAAAAA, x, p7      C U1
291          addq      p6, cy, cy
292          addq      p7, cy, r0
293          stq       x, 8(rp)                      C L1
294
295          ret       r31,(r26),1
296EPILOGUE()
297ASM_END()
298
299C This is useful for playing with various schedules.
300C Expand as: one(0)one(1)one(2)one(3)
C one(k), k = 0..3: emits the text of the k-th limb-step of the 4-way
C unrolled loop.  The argument selects, via eval, the register numbers and
C the load/store offsets:
C   borrow compare on register (k+3)%4, mulq of register k into register
C   4 + k%2, load into register (k+1)%4 from k*8+16(up), add of register
C   4 + (k+1)%2 to l0, and store of x to k*8(rp).
C Registers are emitted as numeric $N names; presumably $0..$3 and $4/$5 are
C the same registers as the u0..u3 (r0..r3) and q0/q1 (r4/r5) aliases --
C NOTE(review): confirm for the assembler in use.
C The macro does not emit the loop bookkeeping (n, up, rp updates, prefetch,
C branch), and it is defined after ASM_END() without being expanded in this
C file.
301define(`one',`
302C 0
303          cmpult    `$'eval(($1+3)%4), cy, cy               C L0
304          mulq      `$'$1, xAAAAAAAAAAAAAAAB, `$'eval(4+$1%2) C U1
305          ldq       `$'eval(($1+1)%4), eval($1*8+16)(up)    C L1
306          addq      `$'eval(4+($1+1)%2), l0, x              C U0
307C 1
308          negq      cy, cymask                                        C L0
309          unop                                                        C U1
310          unop                                                        C L1
311          cmpult    x5555555555555555, x, p6                C U0
312C 2
313          cmpult    xAAAAAAAAAAAAAAAA, x, p7                C U1
314          unop
315          unop
316          negq      p6, t0                                            C L0
317C 3
318          negq      p7, t1                                            C L0
319          and       cymask, x5555555555555555, l0           C U1
320          addq      p6, cy, cy
321          and       t0, x5555555555555555, t0
322C 4
323          and       t1, x5555555555555555, t1
324          addq      p7, cy, cy
325          unop
326          addq      t0, l0, l0
327C 5
328          addq      t1, l0, l0
329          unop
330          stq       x, eval($1*8)(rp)                       C L1
331          unop
332')
333