1dnl  Alpha ev6 nails mpn_addmul_1.
2
3dnl  Copyright 2002, 2005, 2006 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33C      cycles/limb
34C EV4:    42
35C EV5:    18
36C EV6:     4
37
38C TODO
39C  * Reroll loop for 3.75 c/l with current 4-way unrolling.
40C  * The loop is overscheduled wrt loads and wrt multiplies, in particular
41C    umulh.
42C  * Use FP loop count and multiple exit points, that would simplify feed-in lp0
43C    and would work since the loop structure is really regular.
44
45C  INPUT PARAMETERS
C  rp   address used for the ldq/stq of the limbs that are updated in place
C  up   address used for the ldq of the source limbs
C  n    limb count; the code dispatches on n mod 4 and counts n down by 4
C  vl0  multiplier; shifted left by NAIL_BITS at function entry
46define(`rp',`r16')
47define(`up',`r17')
48define(`n', `r18')
49define(`vl0',`r19')
50
C  Set at function entry to all ones shifted right by NAIL_BITS, and used
C  to mask each sum before it is stored.
51define(`numb_mask',`r6')
52
C  Product registers: mXa receives the mulq result and mXb the umulh result
C  for source limb ulX (X = 0..3).
53define(`m0a',`r0')
54define(`m0b',`r1')
55define(`m1a',`r2')
56define(`m1b',`r3')
57define(`m2a',`r20')
58define(`m2b',`r21')
59define(`m3a',`r22')
60define(`m3b',`r23')
61
C  Two accumulators used alternately for consecutive limb positions.
62define(`acc0',`r25')
63define(`acc1',`r27')
64
C  Source limbs loaded through up.  Only two physical registers back the
C  four names: ul0/ul2 share r4 and ul1/ul3 share r5, so a limb must have
C  been consumed by its mulq/umulh pair before the next load into the same
C  register.
65define(`ul0',`r4')
66define(`ul1',`r5')
67define(`ul2',`r4')
68define(`ul3',`r5')
69
C  Limbs loaded through rp.  All four names map to r24, so each loaded value
C  must be added into its accumulator before the next ldq through rp.
70define(`rl0',`r24')
71define(`rl1',`r24')
72define(`rl2',`r24')
73define(`rl3',`r24')
74
C  t0 receives mXa >> NAIL_BITS; t1 receives acc >> NUMB_BITS, which is then
C  added into the sum for the next limb position.
75define(`t0',`r7')
76define(`t1',`r8')
77
C  Short names for GMP_NAIL_BITS and GMP_NUMB_BITS, which are not defined in
C  this file (presumably they come from the included config.m4 -- confirm).
78define(`NAIL_BITS',`GMP_NAIL_BITS')
79define(`NUMB_BITS',`GMP_NUMB_BITS')
80
81dnl  This declaration is munged by configure
82NAILS_SUPPORT(2-63)
83
84ASM_START()
C  mpn_addmul_1, nails variant.
C
C  For each source limb loaded through up, the code forms the product with
C  vl0, adds the limb loaded from the same position through rp plus the carry
C  from the previous position, stores the masked sum back through rp, and
C  leaves the final carry in r0 before ret.
C
C  Structure:
C    - setup: pre-shift vl0 and build numb_mask
C    - dispatch on n mod 4 to the feed-in code L(3m4), L(0m4), L(2m4), L(1m4)
C    - 4-way unrolled loop L(top) with side entries L(el2), L(el1), L(el0),
C      L(el3)
C    - wind-down L(end) with side entries L(ta6) .. L(ta2)
C
C  n = 0 is not tested for anywhere (it would take the L(0m4) path and load
C  through up); callers presumably guarantee n >= 1 -- TODO confirm.
85PROLOGUE(mpn_addmul_1)
C  Shift vl0 left by NAIL_BITS once.  For every later product the low part
C  is then taken as mulq >> NAIL_BITS and the high part directly from umulh.
C  NOTE(review): this split matches NUMB_BITS only if
C  NAIL_BITS + NUMB_BITS = 64, which this file does not show -- confirm.
C  numb_mask = all ones >> NAIL_BITS.
86          sll       vl0, NAIL_BITS, vl0
87          lda       numb_mask, -1(r31)
88          srl       numb_mask, NAIL_BITS, numb_mask
89
C  Select the feed-in path from n mod 4.  r25 and r21 are plain temporaries
C  here; they take on their acc0 and m2b roles only afterwards.
90          and       n,        3,        r25
91          cmpeq     r25,      1,        r21
92          bne       r21,      L(1m4)
93          cmpeq     r25,      2,        r21
94          bne       r21,      L(2m4)
95          beq       r25,      L(0m4)
96
C  n mod 4 = 3.  rp is biased by -8 so that the first limb to update is
C  addressed as 8(rp).  After n -= 4 the bge is taken when more than three
C  limbs are to be processed.
97L(3m4):   ldq       ul3,      0(up)
98          lda       n,        -4(n)
99          ldq       ul0,      8(up)
100          mulq      vl0,      ul3,      m3a
101          umulh     vl0,      ul3,      m3b
102          ldq       ul1,      16(up)
103          lda       up,       24(up)
104          lda       rp,       -8(rp)
105          mulq      vl0,      ul0,      m0a
106          umulh     vl0,      ul0,      m0b
107          bge       n,        L(ge3)
108
C  Exactly three limbs: prepare acc1, acc0 and t1 and finish at L(ta3).
109          mulq      vl0,      ul1,      m1a
110          umulh     vl0,      ul1,      m1b
111          ldq       rl3,      8(rp)
112          srl       m3a,NAIL_BITS,      t0
113          addq      t0,       r31,      acc1
114          addq      rl3,      acc1,     acc1
115          ldq       rl0,      16(rp)
116          srl       m0a,NAIL_BITS,      t0
117          addq      t0,       m3b,      acc0
118          srl       acc1,NUMB_BITS,     t1
119          br        r31,      L(ta3)
120
C  Seven or more limbs: start the next products and enter the loop at
C  L(el3).  n is decremented here because that entry point lies below the
C  lda n inside the loop.
121L(ge3):   ldq       ul2,      0(up)
122          mulq      vl0,      ul1,      m1a
123          umulh     vl0,      ul1,      m1b
124          ldq       rl3,      8(rp)
125          srl       m3a,NAIL_BITS,      t0
126          ldq       ul3,      8(up)
127          lda       n,        -4(n)
128          mulq      vl0,      ul2,      m2a
129          addq      t0,       r31,      acc1
130          umulh     vl0,      ul2,      m2b
131          addq      rl3,      acc1,     acc1
132          ldq       rl0,      16(rp)
133          srl       m0a,NAIL_BITS,      t0
134          ldq       ul0,      16(up)
135          mulq      vl0,      ul3,      m3a
136          addq      t0,       m3b,      acc0
137          srl       acc1,NUMB_BITS,     t1
138          br        r31,      L(el3)
139
C  n mod 4 = 0.  rp is not biased; the first limb to update is at 0(rp).
C  Note that n is reduced by 8 here, not 4, and that four source limbs are
C  loaded before the branch.
140L(0m4):   lda       n,        -8(n)
141          ldq       ul2,      0(up)
142          ldq       ul3,      8(up)
143          mulq      vl0,      ul2,      m2a
144          umulh     vl0,      ul2,      m2b
145          ldq       ul0,      16(up)
146          mulq      vl0,      ul3,      m3a
147          umulh     vl0,      ul3,      m3b
148          ldq       ul1,      24(up)
149          lda       up,       32(up)
150          mulq      vl0,      ul0,      m0a
151          umulh     vl0,      ul0,      m0b
152          bge       n,        L(ge4)
153
C  Exactly four limbs: prepare acc0, acc1 and t1 and finish at L(ta4).
154          ldq       rl2,      0(rp)
155          srl       m2a,NAIL_BITS,      t0
156          mulq      vl0,      ul1,      m1a
157          addq      t0,       r31,      acc0
158          umulh     vl0,      ul1,      m1b
159          addq      rl2,      acc0,     acc0
160          ldq       rl3,      8(rp)
161          srl       m3a,NAIL_BITS,      t0
162          addq      t0,       m2b,      acc1
163          srl       acc0,NUMB_BITS,     t1
164          br        r31,      L(ta4)
165
C  Eight or more limbs: enter the loop at L(el0).  As in L(ge3), n is
C  decremented here because the entry point lies below the loop's lda n.
166L(ge4):   ldq       rl2,      0(rp)
167          srl       m2a,NAIL_BITS,      t0
168          ldq       ul2,      0(up)
169          mulq      vl0,      ul1,      m1a
170          addq      t0,       r31,      acc0
171          umulh     vl0,      ul1,      m1b
172          addq      rl2,      acc0,     acc0
173          ldq       rl3,      8(rp)
174          srl       m3a,NAIL_BITS,      t0
175          ldq       ul3,      8(up)
176          lda       n,        -4(n)
177          mulq      vl0,      ul2,      m2a
178          addq      t0,       m2b,      acc1
179          srl       acc0,NUMB_BITS,     t1
180          br        r31,      L(el0)
181
C  n mod 4 = 2.  rp is biased by -16 so that the first limb to update is
C  addressed as 16(rp).
182L(2m4):   lda       n,        -4(n)
183          ldq       ul0,      0(up)
184          ldq       ul1,      8(up)
185          lda       up,       16(up)
186          lda       rp,       -16(rp)
187          mulq      vl0,      ul0,      m0a
188          umulh     vl0,      ul0,      m0b
189          bge       n,        L(ge2)
190
C  Exactly two limbs: prepare acc0, acc1 and t1 and finish at L(ta2).
191          mulq      vl0,      ul1,      m1a
192          umulh     vl0,      ul1,      m1b
193          ldq       rl0,      16(rp)
194          srl       m0a,NAIL_BITS,      t0
195          addq      t0,       r31,      acc0
196          addq      rl0,      acc0,     acc0
197          ldq       rl1,      24(rp)
198          srl       m1a,NAIL_BITS,      t0
199          addq      t0,       m0b,      acc1
200          srl       acc0,NUMB_BITS,     t1
201          br        r31,      L(ta2)
202
C  Six or more limbs.  Four further source limbs are loaded and up and rp
C  are advanced by 32, leaving the state expected at L(el2).  If n is then
C  still >= 0 the loop is entered at L(el2); otherwise (n was 6) the
C  wind-down is entered at L(ta6).
203L(ge2):   ldq       ul2,      0(up)
204          mulq      vl0,      ul1,      m1a
205          umulh     vl0,      ul1,      m1b
206          ldq       ul3,      8(up)
207          lda       n,        -4(n)
208          mulq      vl0,      ul2,      m2a
209          umulh     vl0,      ul2,      m2b
210          ldq       rl0,      16(rp)
211          srl       m0a,NAIL_BITS,      t0
212          ldq       ul0,      16(up)
213          mulq      vl0,      ul3,      m3a
214          addq      t0,       r31,      acc0
215          umulh     vl0,      ul3,      m3b
216          addq      rl0,      acc0,     acc0
217          ldq       rl1,      24(rp)
218          srl       m1a,NAIL_BITS,      t0
219          ldq       ul1,      24(up)
220          lda       up,       32(up)
221          lda       rp,       32(rp)
222          mulq      vl0,      ul0,      m0a
223          addq      t0,       m0b,      acc1
224          srl       acc0,NUMB_BITS,     t1
225          bge       n,        L(el2)
226
227          br        r31,      L(ta6)
228
C  n mod 4 = 1.  rp is biased by -24 so that the first limb to update is
C  addressed as 24(rp).
229L(1m4):   lda       n,        -4(n)
230          ldq       ul1,      0(up)
231          lda       up,       8(up)
232          lda       rp,       -24(rp)
233          bge       n,        L(ge1)
234
C  Exactly one limb: a single multiply-add handled completely here.  The
C  masked sum is stored and r0 = carry out of the sum + high product word.
235          mulq      vl0,      ul1,      m1a
236          umulh     vl0,      ul1,      m1b
237          ldq       rl1,      24(rp)
238          srl       m1a,NAIL_BITS,      t0
239          addq      rl1,      t0,       acc1
240          and       acc1,numb_mask,     r28
241          srl       acc1,NUMB_BITS,     t1
242          stq       r28,      24(rp)
243          addq      t1,       m1b,      r0
244          ret       r31,      (r26),    1
245
C  Five or more limbs.  Four further source limbs are loaded and up and rp
C  are advanced by 32.  If n is then negative (n was 5) the wind-down is
C  entered at L(ta5); otherwise execution falls into L(ge5).
246L(ge1):   ldq       ul2,      0(up)
247          mulq      vl0,      ul1,      m1a
248          umulh     vl0,      ul1,      m1b
249          ldq       ul3,      8(up)
250          lda       n,        -4(n)
251          mulq      vl0,      ul2,      m2a
252          umulh     vl0,      ul2,      m2b
253          ldq       ul0,      16(up)
254          mulq      vl0,      ul3,      m3a
255          umulh     vl0,      ul3,      m3b
256          ldq       rl1,      24(rp)
257          srl       m1a,NAIL_BITS,      t0
258          ldq       ul1,      24(up)
259          lda       up,       32(up)
260          lda       rp,       32(rp)
261          mulq      vl0,      ul0,      m0a
262          addq      t0,       r31,      acc1
263          umulh     vl0,      ul0,      m0b
264          addq      rl1,      acc1,     acc1
265          ldq       rl2,      0(rp)
266          srl       m2a,NAIL_BITS,      t0
267          mulq      vl0,      ul1,      m1a
268          addq      t0,       m1b,      acc0
269          srl       acc1,NUMB_BITS,     t1
270          blt       n,        L(ta5)
271
C  Nine or more limbs: perform the ldq of ul2 that the loop does just above
C  L(el1), then enter the loop at L(el1).
272L(ge5):   ldq       ul2,      0(up)
273          br        r31,      L(el1)
274
C  Main loop, four limbs per pass.  Each limb position goes through the
C  same steps, spread over several issue groups:
C    mulq/umulh          product of vl0 and the source limb
C    srl mXa -> t0       low product part
C    addq t0, mXb        plus the high product part of the previous limb
C    addq rlX            plus the limb loaded through rp
C    addq t1             plus the carry of the previous sum
C    srl acc -> t1       carry for the next position
C    and numb_mask, stq  masked sum stored back through rp
C  r28 carries each masked sum from its and to the following stq.  Since
C  rp is advanced by 32 at the bottom, the stores at the top of a pass use
C  negative offsets for sums formed in the previous pass.
C  The trailing U0/U1/L0/L1 tags are presumably the intended EV6 integer
C  pipe for each instruction -- confirm against the EV6 documentation.
275          ALIGN(16)
276L(top):   mulq      vl0,      ul0,      m0a                 C U1
277          addq      t0,       m0b,      acc1                C L0
278          srl       acc0,NUMB_BITS,     t1                  C U0
279          stq       r28,      -24(rp)                       C L1
280C
281L(el2):   umulh     vl0,      ul0,      m0b                 C U1
282          and       acc0,numb_mask,     r28                 C L0
283          addq      rl1,      acc1,     acc1                C U0
284          ldq       rl2,      0(rp)                         C L1
285C
286          unop                                              C U1
287          addq      t1,       acc1,     acc1                C L0
288          srl       m2a,NAIL_BITS,      t0                  C U0
289          ldq       ul2,      0(up)                         C L1
290C
291          mulq      vl0,      ul1,      m1a                 C U1
292          addq      t0,       m1b,      acc0                C L0
293          srl       acc1,NUMB_BITS,     t1                  C U0
294          stq       r28,      -16(rp)                       C L1
295C
296L(el1):   umulh     vl0,      ul1,      m1b                 C U1
297          and       acc1,numb_mask,     r28                 C L0
298          addq      rl2,      acc0,     acc0                C U0
299          ldq       rl3,      8(rp)                         C L1
300C
C  This is the only update of n inside the loop.
C  NOTE(review): the lda and the ldq in this group are both tagged L1; one
C  of the two tags is presumably a typo -- confirm the intended slotting.
301          lda       n,        -4(n)                         C L1
302          addq      t1,       acc0,     acc0                C L0
303          srl       m3a,NAIL_BITS,      t0                  C U0
304          ldq       ul3,      8(up)                         C L1
305C
306          mulq      vl0,      ul2,      m2a                 C U1
307          addq      t0,       m2b,      acc1                C L0
308          srl       acc0,NUMB_BITS,     t1                  C U0
309          stq       r28,      -8(rp)                        C L1
310C
311L(el0):   umulh     vl0,      ul2,      m2b                 C U1
312          and       acc0,numb_mask,     r28                 C L0
313          addq      rl3,      acc1,     acc1                C U0
314          ldq       rl0,      16(rp)                        C L1
315C
316          unop                                              C U1
317          addq      t1,       acc1,     acc1                C L0
318          srl       m0a,NAIL_BITS,      t0                  C U0
319          ldq       ul0,      16(up)                        C L1
320C
321          mulq      vl0,      ul3,      m3a                 C U1
322          addq      t0,       m3b,      acc0                C L0
323          srl       acc1,NUMB_BITS,     t1                  C U0
324          stq       r28,      0(rp)                         C L1
325C
326L(el3):   umulh     vl0,      ul3,      m3b                 C U1
327          and       acc1,numb_mask,     r28                 C L0
328          addq      rl0,      acc0,     acc0                C U0
329          ldq       rl1,      24(rp)                        C L1
330C
331          unop                                              C U1
332          addq      t1,       acc0,     acc0                C L0
333          srl       m1a,NAIL_BITS,      t0                  C U0
334          ldq       ul1,      24(up)                        C L1
335C
336          lda       up,       32(up)                        C L0
337          unop                                              C U1
338          lda       rp,       32(rp)                        C L1
339          bge       n,        L(top)                        C U0
340
C  Wind-down.  Same arithmetic as the loop body, but with no further loads
C  through up and no updates of n, up or rp.  L(taK) is the entry point from
C  which K stores through rp remain; the feed-in paths branch to these
C  labels when the loop is not needed.
341L(end):   mulq      vl0,      ul0,      m0a
342          addq      t0,       m0b,      acc1
343          srl       acc0,NUMB_BITS,     t1
344          stq       r28,      -24(rp)
345L(ta6):   umulh     vl0,      ul0,      m0b
346          and       acc0,numb_mask,     r28
347          addq      rl1,      acc1,     acc1
348          ldq       rl2,      0(rp)
349          addq      t1,       acc1,     acc1
350          srl       m2a,NAIL_BITS,      t0
351          mulq      vl0,      ul1,      m1a
352          addq      t0,       m1b,      acc0
353          srl       acc1,NUMB_BITS,     t1
354          stq       r28,      -16(rp)
355L(ta5):   umulh     vl0,      ul1,      m1b
356          and       acc1,numb_mask,     r28
357          addq      rl2,      acc0,     acc0
358          ldq       rl3,      8(rp)
359          addq      t1,       acc0,     acc0
360          srl       m3a,NAIL_BITS,      t0
361          addq      t0,       m2b,      acc1
362          srl       acc0,NUMB_BITS,     t1
363          stq       r28,      -8(rp)
364          unop
365          ALIGN(16)
C  From here on all products have been issued; only sums, carries and
C  stores remain.
366L(ta4):   and       acc0,numb_mask,     r28
367          addq      rl3,      acc1,     acc1
368          ldq       rl0,      16(rp)
369          addq      t1,       acc1,     acc1
370          srl       m0a,NAIL_BITS,      t0
371          addq      t0,       m3b,      acc0
372          srl       acc1,NUMB_BITS,     t1
373          stq       r28,      0(rp)
374          unop
375          ALIGN(16)
376L(ta3):   and       acc1,numb_mask,     r28
377          addq      rl0,      acc0,     acc0
378          ldq       rl1,      24(rp)
379          addq      t1,       acc0,     acc0
380          srl       m1a,NAIL_BITS,      t0
381          addq      t0,       m0b,      acc1
382          srl       acc0,NUMB_BITS,     t1
383          stq       r28,      8(rp)
384          unop
385          ALIGN(16)
C  Last two stores.  r0 = carry out of the final sum (t1) + high product
C  word of the last source limb (m1b); this is the value left in r0 at ret.
386L(ta2):   and       acc0,numb_mask,     r28
387          addq      rl1,      acc1,     acc1
388          addq      t1,       acc1,     acc1
389          srl       acc1,NUMB_BITS,     t1
390          stq       r28,      16(rp)
391          and       acc1,numb_mask,     r28
392          addq      t1,       m1b,      r0
393          stq       r28,      24(rp)
394          ret       r31,      (r26),    1
395EPILOGUE()
396ASM_END()
397