1dnl  IA-64 mpn_addlshC_n, mpn_sublshC_n, mpn_rsblshC_n.
2
3dnl  Contributed to the GNU project by Torbjörn Granlund.
4
5dnl  Copyright 2003-2005, 2010, 2013 Free Software Foundation, Inc.
6
7dnl  This file is part of the GNU MP Library.
8dnl
9dnl  The GNU MP Library is free software; you can redistribute it and/or modify
10dnl  it under the terms of either:
11dnl
12dnl    * the GNU Lesser General Public License as published by the Free
13dnl      Software Foundation; either version 3 of the License, or (at your
14dnl      option) any later version.
15dnl
16dnl  or
17dnl
18dnl    * the GNU General Public License as published by the Free Software
19dnl      Foundation; either version 2 of the License, or (at your option) any
20dnl      later version.
21dnl
22dnl  or both in parallel, as here.
23dnl
24dnl  The GNU MP Library is distributed in the hope that it will be useful, but
25dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
27dnl  for more details.
28dnl
29dnl  You should have received copies of the GNU General Public License and the
30dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
31dnl  see https://www.gnu.org/licenses/.
32
33C           cycles/limb
34C Itanium:      ?
35C Itanium 2:    1.5
36
37C TODO
38C  * Use shladd in feed-in code (for mpn_addlshC_n).
39C  * Rewrite loop to schedule loads closer to use, since we do prefetch.
40
41C INPUT PARAMETERS
C  rp  address the result limbs are stored to (st8 with post-increment 8)
C  up  address of the operand whose limbs are used unshifted (ld8)
C  vp  address of the operand whose limbs are shifted left by LSH bits (ld8)
C  n   limb count; the entry code dispatches on n mod 4
42define(`rp', `r32')
43define(`up', `r33')
44define(`vp', `r34')
45define(`n',  `r35')
46
C Operation selection.  One of DO_add, DO_sub, DO_rsb is expected to be
C defined before this file is processed, together with LSH (assumption: this
C file does not check either).  Each variant defines:
C   ADDSUB(dst, u, x)   dst = u + x for add, u - x for sub, x - u for rsb
C   CMP(p, w, u, x)     set predicate p when the raw result w produced a carry
C                       or borrow: w < u for add, w > u for sub, w > x for rsb
C                       (unsigned compares; the unused operand is ignored)
C   INCR                value added to a result limb to apply an incoming
C                       carry (1) or borrow (-1)
C   LIM                 raw result value for which applying INCR wraps around
C                       and so generates a further carry or borrow
C   func                name of the entry point, built from LSH
47ifdef(`DO_add', `
48  define(`ADDSUB',     `add   $1 = $2, $3')
49  define(`CMP',        `cmp.ltu         $1,p0 = $2, $3')
50  define(`INCR',       1)
51  define(`LIM',        -1)
52  define(`func',        mpn_addlsh`'LSH`'_n)')
53ifdef(`DO_sub', `
54  define(`ADDSUB',     `sub   $1 = $2, $3')
55  define(`CMP',        `cmp.gtu         $1,p0 = $2, $3')
56  define(`INCR',       -1)
57  define(`LIM',        0)
58  define(`func',        mpn_sublsh`'LSH`'_n)')
59ifdef(`DO_rsb', `
60  define(`ADDSUB',     `sub   $1 = $3, $2')
61  define(`CMP',        `cmp.gtu         $1,p0 = $2, $4')
62  define(`INCR',       -1)
63  define(`LIM',        0)
64  define(`func',        mpn_rsblsh`'LSH`'_n)')
65
C PFDIST is the byte offset added to up and vp to form the initial lfetch
C addresses kept in r10 and r11.
66define(PFDIST, 500)
67
C Working register names, four of each kind, used in turn by the 4-way
C unrolled code:
C   u0-u3  limbs loaded from up
C   v0-v3  limbs loaded from vp
C   w0-w3  result limbs waiting to be stored at rp
C   s0-s3  defined here but not referenced by the code in this file
C   x0-x3  vp limbs shifted left by LSH, with the bits shifted out of the
C          previous vp limb brought in at the bottom
C Note that u0 is r14, which the entry code also uses briefly for n mod 4.
68define(`u0',`r14') define(`u1',`r15') define(`u2',`r16') define(`u3',`r17')
69define(`v0',`r18') define(`v1',`r19') define(`v2',`r20') define(`v3',`r21')
70define(`w0',`r22') define(`w1',`r23') define(`w2',`r24') define(`w3',`r25')
71define(`s0',`r26') define(`s1',`r27') define(`s2',`r28') define(`s3',`r29')
72define(`x0',`r30') define(`x1',`r31') define(`x2',`r3')  define(`x3',`r9')
73
74C r3 r8 r9 r10 r11
C Of the registers listed above, r3 and r9 back x2 and x3, r8 carries the
C return value, and r10 and r11 first hold the initial up and vp limbs and
C later the prefetch addresses.
75
76ASM_START()
C func: for each of the n limbs, combine the limb from up with the vp operand
C shifted left by LSH bits, using ADDSUB, and store the result limb at rp.
C The bits shifted out of one vp limb enter the next one through shrp.
C
C Return value in r8: the bits shifted out of the last vp limb, plus 1 if the
C last limb produced a carry or borrow (minus 1 instead for the DO_rsb
C variant).
C
C ar.lc is saved in r2 on entry and restored at .Lcj4, which every path that
C modifies ar.lc passes through.
C
C NOTE(review): n = 0 is not handled; it takes the .Lb00 path and is treated
C like n = 4.  Callers presumably guarantee n >= 1.
77PROLOGUE(func)
78          .prologue
79          .save     ar.lc, r2
80          .body
C With a 32-bit ABI, convert the three pointers with addp4 and zero-extend n.
81ifdef(`HAVE_ABI_32',`
82          addp4     rp = 0, rp                    C                             M I
83          addp4     up = 0, up                    C                             M I
84          nop.i     0
85          addp4     vp = 0, vp                    C                             M I
86          nop.m     0
87          zxt4      n = n                         C                             I
88          ;;
89')
C Load the first vp and up limbs into r11 and r10 and save ar.lc in r2.
C Compute r14 = n mod 4, p15 = (n > 4), and replace n by n - 5.
90 {.mmi;   ld8       r11 = [vp], 8                 C                             M01
91          ld8       r10 = [up], 8                 C                             M01
92          mov.i     r2 = ar.lc                    C                             I0
93}{.mmi;   and       r14 = 3, n                    C                             M I
94          cmp.lt    p15, p0 = 4, n                C                             M I
95          add       n = -5, n           C                             M I
96          ;;
C Dispatch on n mod 4: 1 goes to .Lb01, 2 to .Lb10, 3 to .Lb11, and 0 falls
C through to .Lb00.
97}{.mmi;   cmp.eq    p6, p0 = 1, r14               C                             M I
98          cmp.eq    p7, p0 = 2, r14               C                             M I
99          cmp.eq    p8, p0 = 3, r14               C                             M I
100}{.bbb
101  (p6)    br.dptk   .Lb01                         C                             B
102  (p7)    br.dptk   .Lb10                         C                             B
103  (p8)    br.dptk   .Lb11                         C                             B
104}
105
C Feed-in for n mod 4 = 0.  The first limb uses a plain shl of r11 since there
C is no previous vp limb.  When p15 is clear the four limbs are finished
C through .Lcj4; otherwise .grt4 takes over.
106.Lb00:
107 {.mmi;   ld8       v0 = [vp], 8                  C                             M01
108          ld8       u0 = [up], 8                  C                             M01
109          shr.u     n = n, 2            C                             I0
110          ;;
111}{.mmi;   ld8       v1 = [vp], 8                  C                             M01
112          ld8       u1 = [up], 8                  C                             M01
113          shl       x3 = r11, LSH                 C                             I0
114          ;;
115}{.mmi;   ld8       v2 = [vp], 8                  C                             M01
116          ld8       u2 = [up], 8                  C                             M01
117          shrp      x0 = v0, r11, 64-LSH          C                             I0
118}{.mmb;   ADDSUB(   w3, r10, x3)                  C                             M I
119          nop       0
120  (p15)   br.dpnt   .grt4                         C                             B
121          ;;
122}{.mii;   CMP(      p7, w3, r10, x3)    C                             M II0
123          shrp      x1 = v1, v0, 64-LSH C                             I0
124          ADDSUB(   w0, u0, x0)                   C                             M I
125          ;;
126}{.mii;   CMP(      p8, w0, u0, x0)               C                             M I
127          shrp      x2 = v2, v1, 64-LSH C                             I0
128          ADDSUB(   w1, u1, x1)                   C                             M I
129}{.mmb;   nop       0
130          nop       0
131          br        .Lcj4                         C                             B
132}
133ALIGN(32)
C n > 4 with n mod 4 = 0: set up the prefetch addresses in r10 and r11, load
C ar.lc with the shifted count, and enter the main loop at .LL00.
134.grt4:
135 {.mii;   ld8       v3 = [vp], 8                  C                             M01
136          shrp      x0 = v0, r11, 64-LSH          C                             I0
137          CMP(      p8, w3, r10, x3)    C                             M I
138          ;;
139}{.mmi;   ld8       u3 = [up], 8                  C                             M01
140          add       r11 = PFDIST, vp
141          shrp      x1 = v1, v0, 64-LSH C                             I0
142}{.mmi;   ld8       v0 = [vp], 8                  C                             M01
143          ADDSUB(   w0, u0, x0)                   C                             M I
144          nop       0
145          ;;
146}{.mmi;   CMP(      p6, w0, u0, x0)               C                             M I
147          add       r10 = PFDIST, up
148          mov.i     ar.lc = n           C                             I0
149}{.mmb;   ADDSUB(   w1, u1, x1)                   C                             M I
150          ld8       u0 = [up], 8                  C                             M01
151          br        .LL00                         C                             B
152}
153
154          ALIGN(32)
155.Lb01:
C Feed-in for n mod 4 = 1.  The add variant forms the first result limb with a
C single shladd; the other variants use shl followed by ADDSUB.  When p15 is
C clear, n is 1: r8 receives the bits shifted out of r11 and the code finishes
C at .Lcj1.
156ifdef(`DO_add',
157`         shladd    w2 = r11, LSH, r10  C                             M I
158          shr.u     r8 = r11, 64-LSH    C retval            I0
159  (p15)   br.dpnt   .grt1                         C                             B
160          ;;
161',`
162          shl       x2 = r11, LSH                 C                             I0
163  (p15)   br.dpnt   .grt1                         C                             B
164          ;;
165          ADDSUB(   w2, r10, x2)                  C                             M I
166          shr.u     r8 = r11, 64-LSH    C retval            I0
167          ;;
168')
169          CMP(      p6, w2, r10, x2)    C                             M I
170          br                  .Lcj1
171
C n > 4 with n mod 4 = 1: load ar.lc with the shifted count and start the
C pipeline.  The br.cloop below is taken to .grt5 when ar.lc is nonzero;
C otherwise the remaining limbs are finished through .Lcj5.
172.grt1:    ld8       v3 = [vp], 8                  C                             M01
173          ld8       u3 = [up], 8                  C                             M01
174          shr.u     n = n, 2            C                             I0
175          ;;
176          ld8       v0 = [vp], 8                  C                             M01
177          ld8       u0 = [up], 8                  C                             M01
178          mov.i     ar.lc = n           C FIXME swap with next        I0
179ifdef(`DO_add',
180`',`
181          ADDSUB(   w2, r10, x2)
182')
183          ;;
184 {.mmi;   ld8       v1 = [vp], 8                  C                             M01
185          ld8       u1 = [up], 8                  C                             M01
186          shrp      x3 = v3, r11, 64-LSH          C                             I0
187          ;;
188}{.mmi;   ld8       v2 = [vp], 8                  C                             M01
189          ld8       u2 = [up], 8                  C                             M01
190          shrp      x0 = v0, v3, 64-LSH C                             I0
191}{.mmb;   CMP(      p6, w2, r10, x2)    C                             M I
192          ADDSUB(   w3, u3, x3)                   C                             M I
193          br.cloop.dptk       .grt5               C                             B
194          ;;
195}{.mmi;   CMP(      p7, w3, u3, x3)               C                             M I
196          ADDSUB(   w0, u0, x0)                   C                             M I
197          shrp      x1 = v1, v0, 64-LSH C                             I0
198}{.mmb;   nop       0
199          nop       0
200          br        .Lcj5                         C                             B
201}
C More loop iterations remain: set up the prefetch addresses and enter the
C main loop at .LL01.
202.grt5:
203 {.mmi;   add       r10 = PFDIST, up
204          add       r11 = PFDIST, vp
205          shrp      x0 = v0, v3, 64-LSH C                             I0
206}{.mmb;   ld8       v3 = [vp], 8                  C                             M01
207          CMP(      p8, w3, u3, x3)               C                             M I
208          br        .LL01                         C                             B
209}
210          ALIGN(32)
C Feed-in for n mod 4 = 2.  When p15 is clear, n is 2: r8 receives the bits
C shifted out of v2, the last vp limb, and the code finishes at .Lcj2.
211.Lb10:
212 {.mmi;   ld8       v2 = [vp], 8                  C                             M01
213          ld8       u2 = [up], 8                  C                             M01
214          shl       x1 = r11, LSH                 C                             I0
215}{.mmb;   nop       0
216          nop       0
217  (p15)   br.dpnt   .grt2                         C                             B
218          ;;
219}{.mmi;   ADDSUB(   w1, r10, x1)                  C                             M I
220          nop       0
221          shrp      x2 = v2, r11, 64-LSH          C                             I0
222          ;;
223}{.mmi;   CMP(      p9, w1, r10, x1)    C                             M I
224          ADDSUB(   w2, u2, x2)                   C                             M I
225          shr.u     r8 = v2, 64-LSH               C retval            I0
226          ;;
227}{.mmb;   CMP(      p6, w2, u2, x2)               C                             M I
228          nop       0
229          br        .Lcj2                         C                             B
230}
C n > 4 with n mod 4 = 2: load ar.lc, set up the prefetch addresses, and go to
C the loop head L(top) if ar.lc is nonzero, else to L(end).
231.grt2:
232 {.mmi;   ld8       v3 = [vp], 8                  C                             M01
233          ld8       u3 = [up], 8                  C                             M01
234          shr.u     n = n, 2            C                             I0
235          ;;
236}{.mmi;   ld8       v0 = [vp], 8                  C                             M01
237          ld8       u0 = [up], 8                  C                             M01
238          mov.i     ar.lc = n           C                             I0
239}{.mmi;   ADDSUB(   w1, r10, x1)                  C                             M I
240          nop       0
241          nop       0
242          ;;
243}{.mii;   ld8       v1 = [vp], 8                  C                             M01
244          shrp      x2 = v2, r11, 64-LSH          C                             I0
245          CMP(      p8, w1, r10, x1)    C                             M I
246          ;;
247}{.mmi;   add       r10 = PFDIST, up
248          ld8       u1 = [up], 8                  C                             M01
249          shrp      x3 = v3, v2, 64-LSH C                             I0
250}{.mmi;   add       r11 = PFDIST, vp
251          ld8       v2 = [vp], 8                  C                             M01
252          ADDSUB(   w2, u2, x2)                   C                             M I
253          ;;
254}{.mmi;   CMP(      p6, w2, u2, x2)               C                             M I
255          ld8       u2 = [up], 8                  C                             M01
256          shrp      x0 = v0, v3, 64-LSH C                             I0
257}{.mib;   ADDSUB(   w3, u3, x3)                   C                             M I
258          nop       0
259          br.cloop.dpnt       L(top)              C                             B
260}
261          br        L(end)                        C                             B
C Feed-in for n mod 4 = 3.  When p15 is clear, n is 3 and the code finishes
C through .Lcj3, which also computes r8.
262.Lb11:
263 {.mmi;   ld8       v1 = [vp], 8                  C                             M01
264          ld8       u1 = [up], 8                  C                             M01
265          shl       x0 = r11, LSH                 C                             I0
266          ;;
267}{.mmi;   ld8       v2 = [vp], 8                  C                             M01
268          ld8       u2 = [up], 8                  C                             M01
269          shr.u     n = n, 2            C                             I0
270}{.mmb;   nop       0
271          nop       0
272  (p15)   br.dpnt   .grt3                         C                             B
273          ;;
274}{.mii;   nop       0
275          shrp      x1 = v1, r11, 64-LSH          C                             I0
276          ADDSUB(   w0, r10, x0)                  C                             M I
277          ;;
278}{.mii;   CMP(      p8, w0, r10, x0)    C                             M I
279          shrp      x2 = v2, v1, 64-LSH C                             I0
280          ADDSUB(   w1, u1, x1)                   C                             M I
281          ;;
282}{.mmb;   CMP(      p9, w1, u1, x1)               C                             M I
283          ADDSUB(   w2, u2, x2)                   C                             M I
284          br        .Lcj3                         C                             B
285}
C n > 4 with n mod 4 = 3: load ar.lc, set up the prefetch addresses, and enter
C the main loop at .LL11.
286.grt3:
287 {.mmi;   ld8       v3 = [vp], 8                  C                             M01
288          ld8       u3 = [up], 8                  C                             M01
289          shrp      x1 = v1, r11, 64-LSH          C                             I0
290}{.mmi;   ADDSUB(   w0, r10, x0)                  C                             M I
291          nop       0
292          nop       0
293          ;;
294}{.mmi;   ld8       v0 = [vp], 8                  C                             M01
295          CMP(      p6, w0, r10, x0)    C                             M I
296          mov.i     ar.lc = n           C                             I0
297}{.mmi;   ld8       u0 = [up], 8                  C                             M01
298          ADDSUB(   w1, u1, x1)                   C                             M I
299          nop       0
300          ;;
301}{.mmi;   add       r10 = PFDIST, up
302          add       r11 = PFDIST, vp
303          shrp      x2 = v2, v1, 64-LSH C                             I0
304}{.mmb;   ld8       v1 = [vp], 8                  C                             M01
305          CMP(      p8, w1, u1, x1)               C                             M I
306          br        .LL11                         C                             B
307}
308
309C *** MAIN LOOP START ***
C Four limbs per iteration, counted by ar.lc through br.cloop.  Each of the
C four instruction groups stores one finished result limb, loads further up
C and vp limbs, and forms a shifted operand with shrp.  Carry handling uses
C p6 and p8 alternately: CMP records the carry or borrow out of a raw ADDSUB
C result, and when the incoming predicate is set the limb gets INCR added
C while cmpeqor flags the case where the raw result equals LIM.
C (cmpeqor is defined outside this file; presumably an or-form of cmp.eq.)
C lfetch prefetches at r10 and r11, advancing each by 32 bytes.
C .LL01, .LL00 and .LL11 are the entry points used by the feed-in code.
310          ALIGN(32)
311L(top):   st8       [rp] = w1, 8                  C                             M23
312          lfetch    [r10], 32
313   (p8)   cmpeqor   p6, p0 = LIM, w2    C                             M I
314   (p8)   add       w2 = INCR, w2                 C                             M I
315          ld8       v3 = [vp], 8                  C                             M01
316          CMP(      p8, w3, u3, x3)               C                             M I
317          ;;
318.LL01:    ld8       u3 = [up], 8                  C                             M01
319          shrp      x1 = v1, v0, 64-LSH C                             I0
320   (p6)   cmpeqor   p8, p0 = LIM, w3    C                             M I
321   (p6)   add       w3 = INCR, w3                 C                             M I
322          ld8       v0 = [vp], 8                  C                             M01
323          ADDSUB(   w0, u0, x0)                   C                             M I
324          ;;
325          st8       [rp] = w2, 8                  C                             M23
326          CMP(      p6, w0, u0, x0)               C                             M I
327          nop.b     0
328          ld8       u0 = [up], 8                  C                             M01
329          lfetch    [r11], 32
330          ADDSUB(   w1, u1, x1)                   C                             M I
331          ;;
332.LL00:    st8       [rp] = w3, 8                  C                             M23
333          shrp      x2 = v2, v1, 64-LSH C                             I0
334   (p8)   cmpeqor   p6, p0 = LIM, w0    C                             M I
335   (p8)   add       w0 = INCR, w0                 C                             M I
336          ld8       v1 = [vp], 8                  C                             M01
337          CMP(      p8, w1, u1, x1)               C                             M I
338          ;;
339.LL11:    ld8       u1 = [up], 8                  C                             M01
340          shrp      x3 = v3, v2, 64-LSH C                             I0
341   (p6)   cmpeqor   p8, p0 = LIM, w1    C                             M I
342   (p6)   add       w1 = INCR, w1                 C                             M I
343          ld8       v2 = [vp], 8                  C                             M01
344          ADDSUB(   w2, u2, x2)                   C                             M I
345          ;;
346 {.mmi;   st8       [rp] = w0, 8                  C                             M23
347          CMP(      p6, w2, u2, x2)               C                             M I
348          shrp      x0 = v0, v3, 64-LSH C                             I0
349}{.mib;
350          ld8       u2 = [up], 8                  C                             M01
351          ADDSUB(   w3, u3, x3)                   C                             M I
352          br.cloop.dptk       L(top)              C                             B
353          ;;
354}
355C *** MAIN LOOP END ***
356
C Wind-down: finish the limbs still in flight without issuing further loads.
C The carry chain continues through p6, p7, p8, p9 and finally p6 again.
C The labels .Lcj5 down to .Lcj1 are entry points for the short feed-in
C paths; entering at .LcjK stores the last K result limbs.
357L(end):
358 {.mmi;   st8       [rp] = w1, 8                  C                             M23
359   (p8)   cmpeqor   p6, p0 = LIM, w2    C                             M I
360          shrp      x1 = v1, v0, 64-LSH C                             I0
361}{.mmi;
362   (p8)   add       w2 = INCR, w2                 C                             M I
363          CMP(      p7, w3, u3, x3)               C                             M I
364          ADDSUB(   w0, u0, x0)                   C                             M I
365          ;;
366}
367.Lcj5:
368 {.mmi;   st8       [rp] = w2, 8                  C                             M23
369   (p6)   cmpeqor   p7, p0 = LIM, w3    C                             M I
370          shrp      x2 = v2, v1, 64-LSH C                             I0
371}{.mmi;
372   (p6)   add       w3 = INCR, w3                 C                             M I
373          CMP(      p8, w0, u0, x0)               C                             M I
374          ADDSUB(   w1, u1, x1)                   C                             M I
375          ;;
376}
C The saved loop count is written back to ar.lc in the next bundle.
377.Lcj4:
378 {.mmi;   st8       [rp] = w3, 8                  C                             M23
379   (p7)   cmpeqor   p8, p0 = LIM, w0    C                             M I
380          mov.i     ar.lc = r2                    C                             I0
381}{.mmi;
382   (p7)   add       w0 = INCR, w0                 C                             M I
383          CMP(      p9, w1, u1, x1)               C                             M I
384          ADDSUB(   w2, u2, x2)                   C                             M I
385          ;;
386}
C r8 receives the bits shifted out of v2, the last vp limb.
387.Lcj3:
388 {.mmi;   st8       [rp] = w0, 8                  C                             M23
389   (p8)   cmpeqor   p9, p0 = LIM, w1    C                             M I
390          shr.u     r8 = v2, 64-LSH               C                             I0
391}{.mmi;
392   (p8)   add       w1 = INCR, w1                 C                             M I
393          CMP(      p6, w2, u2, x2)               C                             M I
394          nop       0
395          ;;
396}
397.Lcj2:
398 {.mmi;   st8       [rp] = w1, 8                  C                             M23
399   (p9)   cmpeqor   p6, p0 = LIM, w2    C                             M I
400   (p9)   add       w2 = INCR, w2                 C                             M I
401          ;;
402}
C Store the last result limb, fold the final carry or borrow in p6 into the
C return value r8, and return.
403.Lcj1:
404 {.mmb;   st8       [rp] = w2           C                             M23
405ifdef(`DO_rsb',`
406   (p6)   add       r8 = -1, r8                   C                             M I
407',`
408   (p6)   add       r8 = 1, r8                    C                             M I
409')        br.ret.sptk.many b0           C                             B
410}
411EPILOGUE()
412ASM_END()
413