1dnl  HP-PA 2.0 64-bit mpn_addmul_1 -- Multiply a limb vector with a limb and
2dnl  add the result to a second limb vector.
3
4dnl  Copyright 1998-2000, 2002, 2003 Free Software Foundation, Inc.
5
6dnl  This file is part of the GNU MP Library.
7dnl
8dnl  The GNU MP Library is free software; you can redistribute it and/or modify
9dnl  it under the terms of either:
10dnl
11dnl    * the GNU Lesser General Public License as published by the Free
12dnl      Software Foundation; either version 3 of the License, or (at your
13dnl      option) any later version.
14dnl
15dnl  or
16dnl
17dnl    * the GNU General Public License as published by the Free Software
18dnl      Foundation; either version 2 of the License, or (at your option) any
19dnl      later version.
20dnl
21dnl  or both in parallel, as here.
22dnl
23dnl  The GNU MP Library is distributed in the hope that it will be useful, but
24dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
26dnl  for more details.
27dnl
28dnl  You should have received copies of the GNU General Public License and the
29dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
30dnl  see https://www.gnu.org/licenses/.
31
32include(`../config.m4')
33
34C                       cycles/limb
35C 8000,8200:                  7
36C 8500,8600,8700:   6.375
37
38C  The feed-in and wind-down code has not yet been scheduled.  Many cycles
39C  could be saved there per call.
40
41C  DESCRIPTION:
42C  The main loop "BIG" is 4-way unrolled, mainly to allow
43C  effective use of ADD,DC.  Delays in moving data via the cache from the FP
44C  registers to the IU registers have demanded a deep software pipeline and
45C  a lot of stack slots for partial products in flight.
46C
47C  CODE STRUCTURE:
48C  save-some-registers
49C  do 0, 1, 2, or 3 limbs
50C  if done, restore-some-regs and return
51C  save-many-regs
52C  do 4, 8, ... limbs
53C  restore-all-regs
54
55C  STACK LAYOUT:
56C  HP-PA stack grows upwards.  We could allocate 8 fewer slots by using the
57C  slots marked FREE, as well as some slots in the caller's "frame marker".
58C
59C -00 <- r30
60C -08  FREE
61C -10  tmp
62C -18  tmp
63C -20  tmp
64C -28  tmp
65C -30  tmp
66C -38  tmp
67C -40  tmp
68C -48  tmp
69C -50  tmp
70C -58  tmp
71C -60  tmp
72C -68  tmp
73C -70  tmp
74C -78  tmp
75C -80  tmp
76C -88  tmp
77C -90  FREE
78C -98  FREE
79C -a0  FREE
80C -a8  FREE
81C -b0  r13
82C -b8  r12
83C -c0  r11
84C -c8  r10
85C -d0  r9
86C -d8  r8
87C -e0  r7
88C -e8  r6
89C -f0  r5
90C -f8  r4
91C -100 r3
92C  Previous frame:
93C  [unused area]
94C -38/-138 vlimb home slot.  For 2.0N, the vlimb arg will arrive here.
95
96
97include(`../config.m4')
98
99C INPUT PARAMETERS:
C  Symbolic names for the argument registers.  The roles noted here are read
C  off from how the code below uses each register.
100define(`rp',`%r26') C vector that is loaded, added to and stored back
101define(`up',`%r25') C vector whose limbs are multiplied by vlimb
102define(`n',`%r24')  C limb count
103define(`vlimb',`%r23')        C multiplier limb (stored from this register under 2.0w)
104
105define(`climb',`%r23')        C running carry limb; shares %r23 with vlimb
106
C  Pick the assembler level that matches the ABI being built.
107ifdef(`HAVE_ABI_2_0w',
108`         .level    2.0w
109',`       .level    2.0
110')
111PROLOGUE(mpn_addmul_1)
C  Entry sequence: open a 0x100 byte frame, save %r3-%r5, clear the carry
C  limb and bring vlimb into FP register %fr8 for the xmpyu multiplies.
112
C  Under 2.0w vlimb is spilled to the home slot at -0x38 first, so that the
C  fldd below can fetch it from memory in either ABI.  This has to happen
C  before climb is cleared, since climb and vlimb are the same register.
113ifdef(`HAVE_ABI_2_0w',
114`         std                 vlimb, -0x38(%r30)  C store vlimb into "home" slot
115')
C  std,ma stores %r3 at the old %r30 and then advances %r30 by 0x100, so all
C  later frame offsets are negative and the home slot is now at -0x138.
116          std,ma              %r3, 0x100(%r30)
117          std                 %r4, -0xf8(%r30)
118          std                 %r5, -0xf0(%r30)
119          ldo                 0(%r0), climb                 C clear climb
120          fldd                -0x138(%r30), %fr8  C put vlimb in fp register
121
C  Register aliases for the one-limb-at-a-time code that handles n mod 4
C  leading limbs.  m032 and ma064 are both %r20, so ma064 overwrites m032.
122define(`p032a1',`%r1')        C first mid product, reloaded from -0x78
123define(`p032a2',`%r19')       C second mid product, reloaded from -0x70
124
125define(`m032',`%r20')         C sum of the two mid products
126define(`m096',`%r21')         C carry out of the mid product sum
127
128define(`p000a',`%r22')        C low product, reloaded from -0x80
129define(`p064a',`%r29')        C high product, reloaded from -0x68
130
131define(`s000',`%r31')         C result limb being accumulated
132
133define(`ma000',`%r4')         C m032 shifted left 32 bits
134define(`ma064',`%r20')        C m032 shifted right 32 bits, m096 in upper half
135
136define(`r000',`%r3')          C limb loaded from rp
137
C  %r5 = low two bits of n.  If they are zero there are no leading limbs and
C  control goes straight to the 4-way unrolled code at BIG.
138          extrd,u             n, 63, 2, %r5
139          cmpb,=              %r5, %r0, L(BIG)
140          nop
141
C  First leading limb: form the four 32x32 partial products of vlimb (%fr8)
C  and the limb in %fr4, and spill them to the stack so that they can be
C  reloaded into integer registers.
142          fldd                0(up), %fr4
143          ldo                 8(up), up
144          xmpyu               %fr8R, %fr4L, %fr22
145          xmpyu               %fr8L, %fr4R, %fr23
146          fstd                %fr22, -0x78(%r30)  C mid product to  -0x78..-0x71
147          xmpyu               %fr8R, %fr4R, %fr24
148          xmpyu               %fr8L, %fr4L, %fr25
149          fstd                %fr23, -0x70(%r30)  C mid product to  -0x70..-0x69
150          fstd                %fr24, -0x80(%r30)  C low product to  -0x80..-0x79
C  Decrement the leading-limb count; more limbs remain if it is still
C  nonzero.  The fstd after the branch sits in its delay slot.
151          addib,<>  -1, %r5, L(two_or_more)
152          fstd                %fr25, -0x68(%r30)  C high product to -0x68..-0x61
153LDEF(one)
C  Exactly one leading limb: reload its four partial products and finish at
C  0_one_out.  The last ldd is in the delay slot of the branch.
154          ldd                 -0x78(%r30), p032a1
155          ldd                 -0x70(%r30), p032a2
156          ldd                 -0x80(%r30), p000a
157          b                   L(0_one_out)
158          ldd                 -0x68(%r30), p064a
159
160LDEF(two_or_more)
C  Second leading limb.  Each stack slot is reloaded (products of the first
C  limb) immediately before being overwritten with the matching product of
C  the second limb, so every ldd must stay ahead of the fstd to that slot.
161          fldd                0(up), %fr4
162          ldo                 8(up), up
163          xmpyu               %fr8R, %fr4L, %fr22
164          xmpyu               %fr8L, %fr4R, %fr23
165          ldd                 -0x78(%r30), p032a1
166          fstd                %fr22, -0x78(%r30)  C mid product to  -0x78..-0x71
167          xmpyu               %fr8R, %fr4R, %fr24
168          xmpyu               %fr8L, %fr4L, %fr25
169          ldd                 -0x70(%r30), p032a2
170          fstd                %fr23, -0x70(%r30)  C mid product to  -0x70..-0x69
171          ldd                 -0x80(%r30), p000a
172          fstd                %fr24, -0x80(%r30)  C low product to  -0x80..-0x79
173          ldd                 -0x68(%r30), p064a
C  Decrement the count again; the fstd is in the branch delay slot.
174          addib,<>  -1, %r5, L(three_or_more)
175          fstd                %fr25, -0x68(%r30)  C high product to -0x68..-0x61
176LDEF(two)
C  Exactly two leading limbs.  Combine the mid products of the first limb:
C  m096:m032 is their sum with carry, and ma064:ma000 is that sum shifted
C  left by 32 bits.  depd,z must read m032 before extrd,u overwrites it,
C  because m032 and ma064 are the same register.
177          add                 p032a1, p032a2, m032
178          add,dc              %r0, %r0, m096
179          depd,z              m032, 31, 32, ma000
180          extrd,u             m032, 31, 32, ma064
181          ldd                 0(rp), r000
C  The final depd executes in the delay slot of the branch.
182          b                   L(0_two_out)
183          depd                m096, 31, 32, ma064
184
185LDEF(three_or_more)
C  Third leading limb: start its load, and meanwhile combine the mid products
C  of the first limb into ma064:ma000 (mid sum shifted left by 32 bits).
186          fldd                0(up), %fr4
187          add                 p032a1, p032a2, m032
188          add,dc              %r0, %r0, m096
189          depd,z              m032, 31, 32, ma000
190          extrd,u             m032, 31, 32, ma064
191          ldd                 0(rp), r000
192C         addib,=             -1, %r5, L(0_out)
193          depd                m096, 31, 32, ma064
194LDEF(loop0)
C  The body of loop0 is commented out, so control falls straight through to
C  0_out.  Presumably it is disabled because %r5 only holds n mod 4, so no
C  more than three limbs are ever handled by this feed-in code.
195C         xmpyu               %fr8R, %fr4L, %fr22
196C         xmpyu               %fr8L, %fr4R, %fr23
197C         ldd                 -0x78(%r30), p032a1
198C         fstd                %fr22, -0x78(%r30)  C mid product to  -0x78..-0x71
199C
200C         xmpyu               %fr8R, %fr4R, %fr24
201C         xmpyu               %fr8L, %fr4L, %fr25
202C         ldd                 -0x70(%r30), p032a2
203C         fstd                %fr23, -0x70(%r30)  C mid product to  -0x70..-0x69
204C
205C         ldo                 8(rp), rp
206C         add                 climb, p000a, s000
207C         ldd                 -0x80(%r30), p000a
208C         fstd                %fr24, -0x80(%r30)  C low product to  -0x80..-0x79
209C
210C         add,dc              p064a, %r0, climb
211C         ldo                 8(up), up
212C         ldd                 -0x68(%r30), p064a
213C         fstd                %fr25, -0x68(%r30)  C high product to -0x68..-0x61
214C
215C         add                 ma000, s000, s000
216C         add,dc              ma064, climb, climb
217C         fldd                0(up), %fr4
218C
219C         add                 r000, s000, s000
220C         add,dc              %r0, climb, climb
221C         std                 s000, -8(rp)
222C
223C         add                 p032a1, p032a2, m032
224C         add,dc              %r0, %r0, m096
225C
226C         depd,z              m032, 31, 32, ma000
227C         extrd,u             m032, 31, 32, ma064
228C         ldd                 0(rp), r000
229C         addib,<>  -1, %r5, L(loop0)
230C         depd                m096, 31, 32, ma064
231LDEF(0_out)
C  Multiply the third limb and rotate the stack slots: each ldd fetches a
C  product of the second limb just before the fstd puts the corresponding
C  product of the third limb into the same slot.
232          ldo                 8(up), up
233          xmpyu               %fr8R, %fr4L, %fr22
234          xmpyu               %fr8L, %fr4R, %fr23
235          ldd                 -0x78(%r30), p032a1
236          fstd                %fr22, -0x78(%r30)  C mid product to  -0x78..-0x71
237          xmpyu               %fr8R, %fr4R, %fr24
238          xmpyu               %fr8L, %fr4L, %fr25
239          ldd                 -0x70(%r30), p032a2
240          fstd                %fr23, -0x70(%r30)  C mid product to  -0x70..-0x69
C  Finish the first limb.  rp is advanced first, hence the store to -8(rp).
C  p000a and p064a still hold the first limb when they are added, and are
C  reloaded with the second limb right afterwards.
241          ldo                 8(rp), rp
242          add                 climb, p000a, s000
243          ldd                 -0x80(%r30), p000a
244          fstd                %fr24, -0x80(%r30)  C low product to  -0x80..-0x79
245          add,dc              p064a, %r0, climb
246          ldd                 -0x68(%r30), p064a
247          fstd                %fr25, -0x68(%r30)  C high product to -0x68..-0x61
248          add                 ma000, s000, s000
249          add,dc              ma064, climb, climb
250          add                 r000, s000, s000
251          add,dc              %r0, climb, climb
252          std                 s000, -8(rp)
C  Combine the mid products of the second limb and fetch its rp limb, then
C  fall through to 0_two_out.
253          add                 p032a1, p032a2, m032
254          add,dc              %r0, %r0, m096
255          depd,z              m032, 31, 32, ma000
256          extrd,u             m032, 31, 32, ma064
257          ldd                 0(rp), r000
258          depd                m096, 31, 32, ma064
259LDEF(0_two_out)
C  Two limbs still in flight.  Reload the mid products of the last limb from
C  the stack, then finish the limb before it: low product plus carry, high
C  product into climb, shifted mid sum, and the rp limb.  rp has already been
C  advanced, so the result goes to -8(rp).
260          ldd                 -0x78(%r30), p032a1
261          ldd                 -0x70(%r30), p032a2
262          ldo                 8(rp), rp
263          add                 climb, p000a, s000
264          ldd                 -0x80(%r30), p000a
265          add,dc              p064a, %r0, climb
266          ldd                 -0x68(%r30), p064a
267          add                 ma000, s000, s000
268          add,dc              ma064, climb, climb
269          add                 r000, s000, s000
270          add,dc              %r0, climb, climb
271          std                 s000, -8(rp)
272LDEF(0_one_out)
C  Last leading limb.  Sum its two mid products (m096 catches the carry) and
C  shift the sum left by 32 bits into ma064:ma000.  depd,z has to come
C  before extrd,u because m032 and ma064 share a register.
273          add                 p032a1, p032a2, m032
274          add,dc              %r0, %r0, m096
275          depd,z              m032, 31, 32, ma000
276          extrd,u             m032, 31, 32, ma064
277          ldd                 0(rp), r000
278          depd                m096, 31, 32, ma064
279
C  s000 = climb + low product + ma000 + r000; every carry out, the high
C  product and ma064 are folded into climb for the next limb.
280          add                 climb, p000a, s000
281          add,dc              p064a, %r0, climb
282          add                 ma000, s000, s000
283          add,dc              ma064, climb, climb
284          add                 r000, s000, s000
285          add,dc              %r0, climb, climb
286          std                 s000, 0(rp)
287
C  Branch to done when 4 >= n, i.e. when no group of four limbs remains;
C  otherwise fall into BIG.  rp is advanced in the delay slot on both paths.
288          cmpib,>=  4, n, L(done)
289          ldo                 8(rp), rp
290
291C 4-way unrolled code.
292
293LDEF(BIG)
C  Set-up for the 4-way unrolled code.  The aliases are redefined for four
C  limbs a, b, c, d per group.  Many aliases map to the same register (for
C  instance p032a1, p000a and r000 are all %r1, and m032 and ma064 are both
C  %r4), so the instruction order in the code below also serves to keep
C  those values from overlapping in time.
294
C  Mid products, two per limb.
295define(`p032a1',`%r1')        C
296define(`p032a2',`%r19')       C
297define(`p096b1',`%r20')       C
298define(`p096b2',`%r21')       C
299define(`p160c1',`%r22')       C
300define(`p160c2',`%r29')       C
301define(`p224d1',`%r31')       C
302define(`p224d2',`%r3')        C
303                              C
C  Mid product sums; m288 receives the final carry of the add,dc chain.
304define(`m032',`%r4')          C
305define(`m096',`%r5')          C
306define(`m160',`%r6')          C
307define(`m224',`%r7')          C
308define(`m288',`%r8')          C
309                              C
C  Low and high products of the four limbs.
310define(`p000a',`%r1')         C
311define(`p064a',`%r19')        C
312define(`p064b',`%r20')        C
313define(`p128b',`%r21')        C
314define(`p128c',`%r22')        C
315define(`p192c',`%r29')        C
316define(`p192d',`%r31')        C
317define(`p256d',`%r3')         C
318                              C
C  The four result limbs being accumulated.
319define(`s000',`%r10')         C
320define(`s064',`%r11')         C
321define(`s128',`%r12')         C
322define(`s192',`%r13')         C
323                              C
C  Mid product sums realigned by 32 bits.
324define(`ma000',`%r9')         C
325define(`ma064',`%r4')         C
326define(`ma128',`%r5')         C
327define(`ma192',`%r6')         C
328define(`ma256',`%r7')         C
329                              C
C  Limbs loaded from rp.
330define(`r000',`%r1')          C
331define(`r064',`%r19')         C
332define(`r128',`%r20')         C
333define(`r192',`%r21')         C
334
C  Save %r6-%r13, which the aliases above make use of.  They are reloaded
C  from the same offsets just before the done label.
335          std                 %r6, -0xe8(%r30)
336          std                 %r7, -0xe0(%r30)
337          std                 %r8, -0xd8(%r30)
338          std                 %r9, -0xd0(%r30)
339          std                 %r10, -0xc8(%r30)
340          std                 %r11, -0xc0(%r30)
341          std                 %r12, -0xb8(%r30)
342          std                 %r13, -0xb0(%r30)
343
C  From here on n counts groups of four limbs.
344ifdef(`HAVE_ABI_2_0w',
345`         extrd,u             n, 61, 62, n                  C right shift 2
346',`       extrd,u             n, 61, 30, n                  C right shift 2, zero extend
347')
348
349LDEF(4_or_more)
C  Feed-in for the first group of four limbs a, b, c, d (%fr4-%fr7).  All
C  sixteen partial products with vlimb (%fr8) are formed.  %fr22-%fr27 are
C  used twice: first for mid products, then, once those have been stored,
C  for low and high products, so each fstd must precede the xmpyu that
C  reuses its source register.
350          fldd                0(up), %fr4
351          fldd                8(up), %fr5
352          fldd                16(up), %fr6
353          fldd                24(up), %fr7
354          xmpyu               %fr8R, %fr4L, %fr22
355          xmpyu               %fr8L, %fr4R, %fr23
356          xmpyu               %fr8R, %fr5L, %fr24
357          xmpyu               %fr8L, %fr5R, %fr25
358          xmpyu               %fr8R, %fr6L, %fr26
359          xmpyu               %fr8L, %fr6R, %fr27
360          fstd                %fr22, -0x78(%r30)  C mid product to  -0x78..-0x71
361          xmpyu               %fr8R, %fr7L, %fr28
362          xmpyu               %fr8L, %fr7R, %fr29
363          fstd                %fr23, -0x70(%r30)  C mid product to  -0x70..-0x69
364          xmpyu               %fr8R, %fr4R, %fr30
365          xmpyu               %fr8L, %fr4L, %fr31
366          fstd                %fr24, -0x38(%r30)  C mid product to  -0x38..-0x31
367          xmpyu               %fr8R, %fr5R, %fr22
368          xmpyu               %fr8L, %fr5L, %fr23
369          fstd                %fr25, -0x30(%r30)  C mid product to  -0x30..-0x29
370          xmpyu               %fr8R, %fr6R, %fr24
371          xmpyu               %fr8L, %fr6L, %fr25
372          fstd                %fr26, -0x58(%r30)  C mid product to  -0x58..-0x51
373          xmpyu               %fr8R, %fr7R, %fr26
374          fstd                %fr27, -0x50(%r30)  C mid product to  -0x50..-0x49
C  Decrement the group count; if it is still nonzero there are at least two
C  groups.  The last xmpyu executes in the delay slot on both paths.
375          addib,<>  -1, n, L(8_or_more)
376          xmpyu               %fr8L, %fr7L, %fr27
C  Only one group: store the remaining products, reload the eight mid
C  products into integer registers and finish at end1.
377          fstd                %fr28, -0x18(%r30)  C mid product to  -0x18..-0x11
378          fstd                %fr29, -0x10(%r30)  C mid product to  -0x10..-0x09
379          fstd                %fr30, -0x80(%r30)  C low product to  -0x80..-0x79
380          fstd                %fr31, -0x68(%r30)  C high product to -0x68..-0x61
381          fstd                %fr22, -0x40(%r30)  C low product to  -0x40..-0x39
382          fstd                %fr23, -0x28(%r30)  C high product to -0x28..-0x21
383          fstd                %fr24, -0x60(%r30)  C low product to  -0x60..-0x59
384          fstd                %fr25, -0x48(%r30)  C high product to -0x48..-0x41
385          fstd                %fr26, -0x20(%r30)  C low product to  -0x20..-0x19
386          fstd                %fr27, -0x88(%r30)  C high product to -0x88..-0x81
387          ldd                 -0x78(%r30), p032a1
388          ldd                 -0x70(%r30), p032a2
389          ldd                 -0x38(%r30), p096b1
390          ldd                 -0x30(%r30), p096b2
391          ldd                 -0x58(%r30), p160c1
392          ldd                 -0x50(%r30), p160c2
393          ldd                 -0x18(%r30), p224d1
394          ldd                 -0x10(%r30), p224d2
395          b                   L(end1)
396          nop
397
398LDEF(8_or_more)
C  At least two groups.  Store the rest of the first group, advance up to
C  the second group and start multiplying it.
399          fstd                %fr28, -0x18(%r30)  C mid product to  -0x18..-0x11
400          fstd                %fr29, -0x10(%r30)  C mid product to  -0x10..-0x09
401          ldo                 32(up), up
402          fstd                %fr30, -0x80(%r30)  C low product to  -0x80..-0x79
403          fstd                %fr31, -0x68(%r30)  C high product to -0x68..-0x61
404          fstd                %fr22, -0x40(%r30)  C low product to  -0x40..-0x39
405          fstd                %fr23, -0x28(%r30)  C high product to -0x28..-0x21
406          fstd                %fr24, -0x60(%r30)  C low product to  -0x60..-0x59
407          fstd                %fr25, -0x48(%r30)  C high product to -0x48..-0x41
408          fstd                %fr26, -0x20(%r30)  C low product to  -0x20..-0x19
409          fstd                %fr27, -0x88(%r30)  C high product to -0x88..-0x81
410          fldd                0(up), %fr4
411          fldd                8(up), %fr5
412          fldd                16(up), %fr6
413          fldd                24(up), %fr7
C  Interleave: the mid products of the first group are reloaded into integer
C  registers while the second group is multiplied.  Every ldd of a mid
C  product slot comes before the fstd that overwrites that slot.
414          xmpyu               %fr8R, %fr4L, %fr22
415          ldd                 -0x78(%r30), p032a1
416          xmpyu               %fr8L, %fr4R, %fr23
417          xmpyu               %fr8R, %fr5L, %fr24
418          ldd                 -0x70(%r30), p032a2
419          xmpyu               %fr8L, %fr5R, %fr25
420          xmpyu               %fr8R, %fr6L, %fr26
421          ldd                 -0x38(%r30), p096b1
422          xmpyu               %fr8L, %fr6R, %fr27
423          fstd                %fr22, -0x78(%r30)  C mid product to  -0x78..-0x71
424          xmpyu               %fr8R, %fr7L, %fr28
425          ldd                 -0x30(%r30), p096b2
426          xmpyu               %fr8L, %fr7R, %fr29
427          fstd                %fr23, -0x70(%r30)  C mid product to  -0x70..-0x69
428          xmpyu               %fr8R, %fr4R, %fr30
429          ldd                 -0x58(%r30), p160c1
430          xmpyu               %fr8L, %fr4L, %fr31
431          fstd                %fr24, -0x38(%r30)  C mid product to  -0x38..-0x31
432          xmpyu               %fr8R, %fr5R, %fr22
433          ldd                 -0x50(%r30), p160c2
434          xmpyu               %fr8L, %fr5L, %fr23
435          fstd                %fr25, -0x30(%r30)  C mid product to  -0x30..-0x29
436          xmpyu               %fr8R, %fr6R, %fr24
437          ldd                 -0x18(%r30), p224d1
438          xmpyu               %fr8L, %fr6L, %fr25
439          fstd                %fr26, -0x58(%r30)  C mid product to  -0x58..-0x51
440          xmpyu               %fr8R, %fr7R, %fr26
441          ldd                 -0x10(%r30), p224d2
442          fstd                %fr27, -0x50(%r30)  C mid product to  -0x50..-0x49
C  Decrement the group count.  If it reaches zero there are exactly two
C  groups and the loop is skipped; the xmpyu is in the delay slot.
443          addib,=             -1, n, L(end2)
444          xmpyu               %fr8L, %fr7L, %fr27
445LDEF(loop)
C  Steady state, one pass per group of four limbs.  At the top the mid
C  products of the current group are in integer registers and its low and
C  high products are in stack slots, while %fr22-%fr31 still hold products
C  of the next group that have not been stored.  Each slot is read with ldd
C  just before fstd overwrites it with the next group.  Because several
C  aliases share one register, a load into an alias must stay after the
C  last read of the alias that previously occupied that register.
C
C  Sum the mid products pairwise as one carry chain; m288 takes the last
C  carry.
446          add                 p032a1, p032a2, m032
447          ldd                 -0x80(%r30), p000a
448          add,dc              p096b1, p096b2, m096
449          fstd                %fr28, -0x18(%r30)  C mid product to  -0x18..-0x11
450
451          add,dc              p160c1, p160c2, m160
452          ldd                 -0x68(%r30), p064a
453          add,dc              p224d1, p224d2, m224
454          fstd                %fr29, -0x10(%r30)  C mid product to  -0x10..-0x09
455
456          add,dc              %r0, %r0, m288
457          ldd                 -0x40(%r30), p064b
458          ldo                 32(up), up
459          fstd                %fr30, -0x80(%r30)  C low product to  -0x80..-0x79
460
C  Realign the mid sums by 32 bits: each maNNN gets the upper half of one
C  mid sum in its lower half and the lower half of the next mid sum in its
C  upper half.  Every extrd,u overwrites the register of the mid sum it
C  reads, so the depd,z or depd that reads that mid sum comes first.
461          depd,z              m032, 31, 32, ma000
462          ldd                 -0x28(%r30), p128b
463          extrd,u             m032, 31, 32, ma064
464          fstd                %fr31, -0x68(%r30)  C high product to -0x68..-0x61
465
466          depd                m096, 31, 32, ma064
467          ldd                 -0x60(%r30), p128c
468          extrd,u             m096, 31, 32, ma128
469          fstd                %fr22, -0x40(%r30)  C low product to  -0x40..-0x39
470
471          depd                m160, 31, 32, ma128
472          ldd                 -0x48(%r30), p192c
473          extrd,u             m160, 31, 32, ma192
474          fstd                %fr23, -0x28(%r30)  C high product to -0x28..-0x21
475
476          depd                m224, 31, 32, ma192
477          ldd                 -0x20(%r30), p192d
478          extrd,u             m224, 31, 32, ma256
479          fstd                %fr24, -0x60(%r30)  C low product to  -0x60..-0x59
480
C  Second carry chain: incoming carry limb plus the low and high products.
C  Each high product is added to the low product of the following limb; the
C  last high product becomes the new climb.
481          depd                m288, 31, 32, ma256
482          ldd                 -0x88(%r30), p256d
483          add                 climb, p000a, s000
484          fstd                %fr25, -0x48(%r30)  C high product to -0x48..-0x41
485
486          add,dc              p064a, p064b, s064
487          ldd                 0(rp), r000
488          add,dc              p128b, p128c, s128
489          fstd                %fr26, -0x20(%r30)  C low product to  -0x20..-0x19
490
491          add,dc              p192c, p192d, s192
492          ldd                 8(rp), r064
493          add,dc              p256d, %r0, climb
494          fstd                %fr27, -0x88(%r30)  C high product to -0x88..-0x81
495
C  Third chain adds the realigned mid sums, and the limbs of the following
C  group start loading from up.
496          ldd                 16(rp), r128
497          add                 ma000, s000, s000   C accum mid 0
498          ldd                 24(rp), r192
499          add,dc              ma064, s064, s064   C accum mid 1
500
501          add,dc              ma128, s128, s128   C accum mid 2
502          fldd                0(up), %fr4
503          add,dc              ma192, s192, s192   C accum mid 3
504          fldd                8(up), %fr5
505
506          add,dc              ma256, climb, climb
507          fldd                16(up), %fr6
C  Fourth chain adds the limbs loaded from rp; results are stored back.
508          add                 r000, s000, s000    C accum rlimb 0
509          fldd                24(up), %fr7
510
511          add,dc              r064, s064, s064    C accum rlimb 1
512          add,dc              r128, s128, s128    C accum rlimb 2
513          std                 s000, 0(rp)
514
515          add,dc              r192, s192, s192    C accum rlimb 3
516          add,dc              %r0, climb, climb
517          std                 s064, 8(rp)
518
C  Multiply the limbs just loaded, while reloading the mid products stored
C  earlier into integer registers for the next pass.
519          xmpyu               %fr8R, %fr4L, %fr22
520          ldd                 -0x78(%r30), p032a1
521          xmpyu               %fr8L, %fr4R, %fr23
522          std                 s128, 16(rp)
523
524          xmpyu               %fr8R, %fr5L, %fr24
525          ldd                 -0x70(%r30), p032a2
526          xmpyu               %fr8L, %fr5R, %fr25
527          std                 s192, 24(rp)
528
529          xmpyu               %fr8R, %fr6L, %fr26
530          ldd                 -0x38(%r30), p096b1
531          xmpyu               %fr8L, %fr6R, %fr27
532          fstd                %fr22, -0x78(%r30)  C mid product to  -0x78..-0x71
533
534          xmpyu               %fr8R, %fr7L, %fr28
535          ldd                 -0x30(%r30), p096b2
536          xmpyu               %fr8L, %fr7R, %fr29
537          fstd                %fr23, -0x70(%r30)  C mid product to  -0x70..-0x69
538
539          xmpyu               %fr8R, %fr4R, %fr30
540          ldd                 -0x58(%r30), p160c1
541          xmpyu               %fr8L, %fr4L, %fr31
542          fstd                %fr24, -0x38(%r30)  C mid product to  -0x38..-0x31
543
544          xmpyu               %fr8R, %fr5R, %fr22
545          ldd                 -0x50(%r30), p160c2
546          xmpyu               %fr8L, %fr5L, %fr23
547          fstd                %fr25, -0x30(%r30)  C mid product to  -0x30..-0x29
548
549          xmpyu               %fr8R, %fr6R, %fr24
550          ldd                 -0x18(%r30), p224d1
551          xmpyu               %fr8L, %fr6L, %fr25
552          fstd                %fr26, -0x58(%r30)  C mid product to  -0x58..-0x51
553
554          xmpyu               %fr8R, %fr7R, %fr26
555          ldd                 -0x10(%r30), p224d2
556          fstd                %fr27, -0x50(%r30)  C mid product to  -0x50..-0x49
557          xmpyu               %fr8L, %fr7L, %fr27
558
C  Count down the groups; rp is advanced in the delay slot on both paths.
559          addib,<>  -1, n, L(loop)
560          ldo                 32(rp), rp
561
562LDEF(end2)
C  Drain step with two groups left in flight.  This is the same arithmetic
C  as one pass of the loop for the older group, but no new limbs are loaded
C  from up and nothing is multiplied.  The products of the newest group are
C  still stored from %fr22-%fr31 into the stack slots as the older values
C  are read out.
563          add                 p032a1, p032a2, m032
564          ldd                 -0x80(%r30), p000a
565          add,dc              p096b1, p096b2, m096
566          fstd                %fr28, -0x18(%r30)  C mid product to  -0x18..-0x11
567          add,dc              p160c1, p160c2, m160
568          ldd                 -0x68(%r30), p064a
569          add,dc              p224d1, p224d2, m224
570          fstd                %fr29, -0x10(%r30)  C mid product to  -0x10..-0x09
571          add,dc              %r0, %r0, m288
572          ldd                 -0x40(%r30), p064b
573          fstd                %fr30, -0x80(%r30)  C low product to  -0x80..-0x79
C  Realign the mid sums by 32 bits.
574          depd,z              m032, 31, 32, ma000
575          ldd                 -0x28(%r30), p128b
576          extrd,u             m032, 31, 32, ma064
577          fstd                %fr31, -0x68(%r30)  C high product to -0x68..-0x61
578          depd                m096, 31, 32, ma064
579          ldd                 -0x60(%r30), p128c
580          extrd,u             m096, 31, 32, ma128
581          fstd                %fr22, -0x40(%r30)  C low product to  -0x40..-0x39
582          depd                m160, 31, 32, ma128
583          ldd                 -0x48(%r30), p192c
584          extrd,u             m160, 31, 32, ma192
585          fstd                %fr23, -0x28(%r30)  C high product to -0x28..-0x21
586          depd                m224, 31, 32, ma192
587          ldd                 -0x20(%r30), p192d
588          extrd,u             m224, 31, 32, ma256
589          fstd                %fr24, -0x60(%r30)  C low product to  -0x60..-0x59
590          depd                m288, 31, 32, ma256
591          ldd                 -0x88(%r30), p256d
C  Carry limb plus low and high products.
592          add                 climb, p000a, s000
593          fstd                %fr25, -0x48(%r30)  C high product to -0x48..-0x41
594          add,dc              p064a, p064b, s064
595          ldd                 0(rp), r000
596          add,dc              p128b, p128c, s128
597          fstd                %fr26, -0x20(%r30)  C low product to  -0x20..-0x19
598          add,dc              p192c, p192d, s192
599          ldd                 8(rp), r064
600          add,dc              p256d, %r0, climb
601          fstd                %fr27, -0x88(%r30)  C high product to -0x88..-0x81
602          ldd                 16(rp), r128
603          add                 ma000, s000, s000   C accum mid 0
604          ldd                 24(rp), r192
605          add,dc              ma064, s064, s064   C accum mid 1
606          add,dc              ma128, s128, s128   C accum mid 2
607          add,dc              ma192, s192, s192   C accum mid 3
608          add,dc              ma256, climb, climb
609          add                 r000, s000, s000    C accum rlimb 0
610          add,dc              r064, s064, s064    C accum rlimb 1
611          add,dc              r128, s128, s128    C accum rlimb 2
612          std                 s000, 0(rp)
613          add,dc              r192, s192, s192    C accum rlimb 3
614          add,dc              %r0, climb, climb
615          std                 s064, 8(rp)
C  Store the remaining results and reload the mid products of the final
C  group for end1, then step rp to that group.
616          ldd                 -0x78(%r30), p032a1
617          std                 s128, 16(rp)
618          ldd                 -0x70(%r30), p032a2
619          std                 s192, 24(rp)
620          ldd                 -0x38(%r30), p096b1
621          ldd                 -0x30(%r30), p096b2
622          ldd                 -0x58(%r30), p160c1
623          ldd                 -0x50(%r30), p160c2
624          ldd                 -0x18(%r30), p224d1
625          ldd                 -0x10(%r30), p224d2
626          ldo                 32(rp), rp
627
628LDEF(end1)
C  Final group.  All of its products are already in memory or in integer
C  registers, so this is pure integer work: sum the mid products, realign
C  them by 32 bits, then add the carry limb, the low and high products, the
C  realigned mid sums and the rp limbs, and store four result limbs.
629          add                 p032a1, p032a2, m032
630          ldd                 -0x80(%r30), p000a
631          add,dc              p096b1, p096b2, m096
632          add,dc              p160c1, p160c2, m160
633          ldd                 -0x68(%r30), p064a
634          add,dc              p224d1, p224d2, m224
635          add,dc              %r0, %r0, m288
636          ldd                 -0x40(%r30), p064b
637          depd,z              m032, 31, 32, ma000
638          ldd                 -0x28(%r30), p128b
639          extrd,u             m032, 31, 32, ma064
640          depd                m096, 31, 32, ma064
641          ldd                 -0x60(%r30), p128c
642          extrd,u             m096, 31, 32, ma128
643          depd                m160, 31, 32, ma128
644          ldd                 -0x48(%r30), p192c
645          extrd,u             m160, 31, 32, ma192
646          depd                m224, 31, 32, ma192
647          ldd                 -0x20(%r30), p192d
648          extrd,u             m224, 31, 32, ma256
649          depd                m288, 31, 32, ma256
650          ldd                 -0x88(%r30), p256d
651          add                 climb, p000a, s000
652          add,dc              p064a, p064b, s064
653          ldd                 0(rp), r000
654          add,dc              p128b, p128c, s128
655          add,dc              p192c, p192d, s192
656          ldd                 8(rp), r064
657          add,dc              p256d, %r0, climb
658          ldd                 16(rp), r128
659          add                 ma000, s000, s000   C accum mid 0
660          ldd                 24(rp), r192
661          add,dc              ma064, s064, s064   C accum mid 1
662          add,dc              ma128, s128, s128   C accum mid 2
663          add,dc              ma192, s192, s192   C accum mid 3
664          add,dc              ma256, climb, climb
665          add                 r000, s000, s000    C accum rlimb 0
666          add,dc              r064, s064, s064    C accum rlimb 1
667          add,dc              r128, s128, s128    C accum rlimb 2
668          std                 s000, 0(rp)
669          add,dc              r192, s192, s192    C accum rlimb 3
670          add,dc              %r0, climb, climb
671          std                 s064, 8(rp)
672          std                 s128, 16(rp)
673          std                 s192, 24(rp)
674
C  Reload %r6-%r13 from the slots they were saved to at BIG.  The short path
C  that jumps directly to done never saved them and skips these loads.
675          ldd                 -0xb0(%r30), %r13
676          ldd                 -0xb8(%r30), %r12
677          ldd                 -0xc0(%r30), %r11
678          ldd                 -0xc8(%r30), %r10
679          ldd                 -0xd0(%r30), %r9
680          ldd                 -0xd8(%r30), %r8
681          ldd                 -0xe0(%r30), %r7
682          ldd                 -0xe8(%r30), %r6
683LDEF(done)
C  Common exit.  The final carry limb is copied to %r28 under 2.0w;
C  otherwise its low 32 bits go to %r29 and its high 32 bits to %r28
C  (presumably the return value registers of the respective ABI).
684ifdef(`HAVE_ABI_2_0w',
685`         copy                climb, %r28
686',`       extrd,u             climb, 63, 32, %r29
687          extrd,u             climb, 31, 32, %r28
688')
C  Restore %r5 and %r4, then return through %r2.  The ldd,mb in the delay
C  slot first moves %r30 back down by 0x100 and then reloads %r3, undoing
C  the std,ma at entry.
689          ldd                 -0xf0(%r30), %r5
690          ldd                 -0xf8(%r30), %r4
691          bve                 (%r2)
692          ldd,mb              -0x100(%r30), %r3
693EPILOGUE(mpn_addmul_1)
694