1dnl  X86-64 mpn_mul_basecase optimised for Intel Nehalem/Westmere.
2dnl  It also seems good for Conroe/Wolfdale.
3
4dnl  Contributed to the GNU project by Torbjörn Granlund.
5
6dnl  Copyright 2008, 2011-2013 Free Software Foundation, Inc.
7
8dnl  This file is part of the GNU MP Library.
9dnl
10dnl  The GNU MP Library is free software; you can redistribute it and/or modify
11dnl  it under the terms of either:
12dnl
13dnl    * the GNU Lesser General Public License as published by the Free
14dnl      Software Foundation; either version 3 of the License, or (at your
15dnl      option) any later version.
16dnl
17dnl  or
18dnl
19dnl    * the GNU General Public License as published by the Free Software
20dnl      Foundation; either version 2 of the License, or (at your option) any
21dnl      later version.
22dnl
23dnl  or both in parallel, as here.
24dnl
25dnl  The GNU MP Library is distributed in the hope that it will be useful, but
26dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
27dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
28dnl  for more details.
29dnl
30dnl  You should have received copies of the GNU General Public License and the
31dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
32dnl  see https://www.gnu.org/licenses/.
33
34include(`../config.m4')
35
36C cycles/limb       mul_1               mul_2               mul_3               addmul_2
37C AMD K8,K9
38C AMD K10
39C AMD bull
40C AMD pile
41C AMD steam
42C AMD bobcat
43C AMD jaguar
44C Intel P4
45C Intel core         4.0                 4.0                 -                  4.18-4.25
46C Intel NHM          3.75                3.8                 -                  4.06-4.2
47C Intel SBR
48C Intel IBR
49C Intel HWL
50C Intel BWL
51C Intel atom
52C VIA nano
53
54C The inner loops of this code are the result of running a code generation and
55C optimisation tool suite written by David Harvey and Torbjörn Granlund.
56
57C Code structure:
58C
59C
60C               m_1(0m4)        m_1(1m4)        m_1(2m4)        m_1(3m4)
61C                  |               |               |               |
62C        m_2(0m4)  |     m_2(1m4)  |     m_2(2m4)  |     m_2(3m4)  |
63C           |      /        |      /        |      /        |      /
64C           |     /         |     /         |     /         |     /
65C           |    /          |    /          |    /          |    /
66C          \|/ |/_         \|/ |/_         \|/ |/_         \|/ |/_
67C             _____           _____           _____           _____
68C            /     \         /     \         /     \         /     \
69C          \|/      |      \|/      |      \|/      |      \|/      |
70C        am_2(0m4)  |    am_2(1m4)  |    am_2(2m4)  |    am_2(3m4)  |
71C           \      /|\      \      /|\      \      /|\      \      /|\
72C            \_____/         \_____/         \_____/         \_____/
73
74C TODO
75C  * Tune.  None done so far.
76C  * Currently 2687 bytes, making it smaller would be nice.
77C  * Implement some basecases, say for un < 4.
78C  * Try zeroing with xor in m2 loops.
79C  * Try re-rolling the m2 loops to avoid the current 9 insn code duplication
80C    between loop header and wind-down code.
81C  * Consider adc reg,reg instead of adc $0,reg in m2 loops.  This saves a byte.
82
83C When playing with pointers, set this to $2 to fall back to conservative
84C indexing in wind-down code.
C As defined here I(a,b) expands to a, its first argument.  The wind-down
C code passes an un-indexed operand as a and an i-indexed one as b.
85define(`I',`$1')
86
87C Define this to $1 to use late loop index variable as zero, $2 to use an
88C explicit $0.
C As defined here Z(a,b) expands to a; it is only invoked as Z(i,$0).
89define(`Z',`$1')
90
C Incoming argument registers.
91define(`rp',       `%rdi')
92define(`up',       `%rsi')
93define(`un_param', `%rdx')
94define(`vp_param', `%rcx')    C FIXME reallocate vp to rcx but watch performance!
95define(`vn_param', `%r8')
96
C un holds the negated limb count (set up at function entry).  vn names the
C top-of-stack slot, which holds the remaining v limb count once vn_param has
C been pushed ahead of the addmul_2 outer loops.
97define(`un',       `%r9')
98define(`vn',       `(%rsp)')
99
C v0/v1 are the current multiplier limbs, w0-w3 rotating accumulators, i the
C inner loop index and vp the running pointer into the v operand.  Note that
C w1 shares %rcx with vp_param, and that %rdx (un_param) is overwritten by
C mul, so both parameters are consumed before the first loop is entered.
100define(`v0',       `%r10')
101define(`v1',       `%r11')
102define(`w0',       `%rbx')
103define(`w1',       `%rcx')
104define(`w2',       `%rbp')
105define(`w3',       `%r12')
106define(`i',        `%r13')
107define(`vp',       `%r14')
108
C Extra accumulators for the addmul_2 loops.  X0 shares %r8 with vn_param,
C which is pushed to the stack before X0 is first written; X1 (%r15) is
C pushed at the same place.
109define(`X0',       `%r8')
110define(`X1',       `%r15')
111
112C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
113
114ABI_SUPPORT(DOS64)
115ABI_SUPPORT(STD64)
116
C Alignment used in front of every inner loop.
117define(`ALIGNx', `ALIGN(16)')
118
C MOV(src,dst,bit) emits lea (src),dst when bit is set in N and mov src,dst
C otherwise.  With N = 85 the lea form is selected for bits 1, 4, 16 and 64.
C The ifdef only supplies a default of 0 when N is undefined, which cannot
C happen here since N is defined on the line just above it.
119define(`N', 85)
120ifdef(`N',,`define(`N',0)')
121define(`MOV', `ifelse(eval(N & $3),0,`mov         $1, $2',`lea        ($1), $2')')
122
123ASM_START()
124          TEXT
125          ALIGN(32)
C mpn_mul_basecase: schoolbook multiplication of the un_param-limb operand
C at up by the vn_param-limb operand at vp_param, with the product stored
C at rp.
C
C The first pass over up uses one v limb (mul_1, the L(m1...) code) when
C vn_param is odd, or two v limbs (mul_2, the L(m2...) code) when it is
C even.  Any remaining v limbs are consumed two at a time by the addmul_2
C outer loops L(olo0)..L(olo3).  un (negated limb count) and i (loop index)
C count up towards zero; up and rp are advanced past the low un_param limbs
C so that (up,un,8) addresses the lowest limb.
126PROLOGUE(mpn_mul_basecase)
127          FUNC_ENTRY(4)
C Under the DOS64 ABI vn_param is fetched from the stack.
128IFDOS(`   mov       56(%rsp), %r8d      ')
129          mov       (up), %rax                    C shared for mul_1 and mul_2
C Save the registers that every path uses and must restore before returning.
130          push      %rbx
131          push      %rbp
132          push      %r12
133          push      %r13
134          push      %r14
135
136          mov       (vp_param), v0                C shared for mul_1 and mul_2
137
138          xor       un, un
139          sub       un_param, un                  C un = -un_param
140
C Point up and rp un_param limbs ahead; negative indices are used from here.
141          lea       (up,un_param,8), up
142          lea       (rp,un_param,8), rp
143
C rdx:rax = up[0] * v0.  This also ends the life of un_param (rdx).
144          mul       v0                            C shared for mul_1 and mul_2
145
C Odd vn_param: start with a mul_1 pass.  Even: start with mul_2 at L(m2).
146          test      $1, R8(vn_param)
147          jz        L(m2)
148
149          lea       8(vp_param), vp               C FIXME: delay until known needed
150
C Dispatch on the low two bits of un to pick the entry point into the 4-way
C unrolled mul_1 loop.  Each L(m1sN) stub stores the low limb of up[0]*v0,
C moves the high limb into the carry register expected at its entry point,
C sets i, and leaves in rbp the address of the addmul_2 code to continue
C with once this pass is done.
151          test      $1, R8(un)
152          jnz       L(m1x1)
153
154L(m1x0):test        $2, R8(un)
155          jnz       L(m1s2)
156
157L(m1s0):
158          lea       (un), i
159          mov       %rax, (rp,un,8)
160          mov       8(up,un,8), %rax
161          mov       %rdx, w0            C FIXME: Use lea?
162          lea       L(do_am0)(%rip), %rbp
163          jmp       L(m1e0)
164
165L(m1s2):
166          lea       2(un), i
167          mov       %rax, (rp,un,8)
168          mov       8(up,un,8), %rax
169          mov       %rdx, w0            C FIXME: Use lea?
170          mul       v0
171          lea       L(do_am2)(%rip), %rbp
172          test      i, i
173          jnz       L(m1e2)
C i is zero, i.e. un_param is 2: finish the product here and return.
C NOTE(review): returning without looking at vn_param presumably relies on
C un_param >= vn_param, making vn_param 1 on this path - confirm with caller
C contract.
174          add       %rax, w0
175          adc       $0, %rdx
176          mov       w0, I(-8(rp),8(rp,un,8))
177          mov       %rdx, I((rp),16(rp,un,8))
178          jmp       L(ret2)
179
180L(m1x1):test        $2, R8(un)
181          jz        L(m1s3)
182
183L(m1s1):
184          lea       1(un), i
185          mov       %rax, (rp,un,8)
186          test      i, i
187          jz        L(1)
188          mov       8(up,un,8), %rax
189          mov       %rdx, w1            C FIXME: Use lea?
190          lea       L(do_am1)(%rip), %rbp
191          jmp       L(m1e1)
C i is zero, i.e. un_param is 1: store the high limb and return.
192L(1):     mov       %rdx, I((rp),8(rp,un,8))
193          jmp       L(ret2)
194
195L(m1s3):
196          lea       -1(un), i
197          mov       %rax, (rp,un,8)
198          mov       8(up,un,8), %rax
199          mov       %rdx, w1            C FIXME: Use lea?
200          lea       L(do_am3)(%rip), %rbp
201          jmp       L(m1e3)
202
C mul_1 inner loop, four limbs per iteration.  w0 and w1 alternate as the
C limb being completed and the carry into the next one; rax is preloaded
C with the next up limb.  The loop runs while i is still negative.
203          ALIGNx
204L(m1top):
205          mul       v0
206          mov       w1, -16(rp,i,8)
207L(m1e2):xor         R32(w1), R32(w1)
208          add       %rax, w0
209          mov       (up,i,8), %rax
210          adc       %rdx, w1
211          mov       w0, -8(rp,i,8)
212L(m1e1):xor         R32(w0), R32(w0)
213          mul       v0
214          add       %rax, w1
215          mov       8(up,i,8), %rax
216          adc       %rdx, w0
217          mov       w1, (rp,i,8)
218L(m1e0):xor         R32(w1), R32(w1)
219          mul       v0
220          add       %rax, w0
221          mov       16(up,i,8), %rax
222          adc       %rdx, w1
223          mov       w0, 8(rp,i,8)
224L(m1e3):xor         R32(w0), R32(w0)
225          mul       v0
226          add       %rax, w1
227          mov       24(up,i,8), %rax
228          adc       %rdx, w0
229          add       $4, i
230          js        L(m1top)
231
C Wind-down: last product, then store the three top limbs of this row.
232          mul       v0
233          mov       w1, I(-16(rp),-16(rp,i,8))
234          add       %rax, w0
235          adc       $0, %rdx
236          mov       w0, I(-8(rp),-8(rp,i,8))
237          mov       %rdx, I((rp),(rp,i,8))
238
C One v limb has been consumed.  If more remain, continue at the addmul_2
C entry selected above.  rp is backed up one limb here because the outer
C loop code advances it by two, giving a net advance of one limb.
239          dec       vn_param
240          jz        L(ret2)
241          lea       -8(rp), rp
242          jmp       *%rbp
243
C mul_2 start-up, reached when vn_param is even.  On entry rdx:rax holds
C up[0]*v0.  v1 gets the second v limb and vp is set past both limbs.
244L(m2):
245          mov       8(vp_param), v1
246          lea       16(vp_param), vp    C FIXME: delay until known needed
247
C Dispatch on the low two bits of un to pick the entry point into the 4-way
C unrolled mul_2 loop.  Every stub sets i, reloads rax with up[0] for the
C multiply by v1, and zeroes the register that will collect the next carry.
C L(b00) and L(b01) store the low product limb themselves; L(b10) and L(b11)
C keep it in a register because their entry points store it.
248          test      $1, R8(un)
249          jnz       L(bx1)
250
251L(bx0):   test      $2, R8(un)
252          jnz       L(b10)
253
254L(b00):   lea       (un), i
255          mov       %rax, (rp,un,8)
256          mov       %rdx, w1            C FIXME: Use lea?
257          mov       (up,un,8), %rax
258          mov       $0, R32(w2)
259          jmp       L(m2e0)
260
261L(b10):   lea       -2(un), i
262          mov       %rax, w2            C FIXME: Use lea?
263          mov       (up,un,8), %rax
264          mov       %rdx, w3            C FIXME: Use lea?
265          mov       $0, R32(w0)
266          jmp       L(m2e2)
267
268L(bx1):   test      $2, R8(un)
269          jz        L(b11)
270
271L(b01):   lea       1(un), i
272          mov       %rax, (rp,un,8)
273          mov       (up,un,8), %rax
274          mov       %rdx, w0            C FIXME: Use lea?
275          mov       $0, R32(w1)
276          jmp       L(m2e1)
277
278L(b11):   lea       -1(un), i
279          mov       %rax, w1            C FIXME: Use lea?
280          mov       (up,un,8), %rax
281          mov       %rdx, w2            C FIXME: Use lea?
282          mov       $0, R32(w3)
283          jmp       L(m2e3)
284
C mul_2 inner loop, variant entered at L(m2e0).  Four up limbs per
C iteration, each multiplied by both v0 and v1.  w0-w3 rotate as a three
C limb accumulator window; registers are cleared with mov $0, which leaves
C the flags alone.  Results are stored to rp without reading it.  The loop
C runs while i is still negative.
285          ALIGNx
286L(m2top0):
287          mul       v0
288          add       %rax, w3
289          mov       -8(up,i,8), %rax
290          mov       w3, -8(rp,i,8)
291          adc       %rdx, w0
292          adc       $0, R32(w1)
293          mul       v1
294          add       %rax, w0
295          adc       %rdx, w1
296          mov       $0, R32(w2)
297          mov       (up,i,8), %rax
298          mul       v0
299          add       %rax, w0
300          mov       w0, (rp,i,8)
301          adc       %rdx, w1
302          mov       (up,i,8), %rax
303          adc       $0, R32(w2)
304L(m2e0):mul         v1
305          add       %rax, w1
306          adc       %rdx, w2
307          mov       8(up,i,8), %rax
308          mul       v0
309          mov       $0, R32(w3)
310          add       %rax, w1
311          adc       %rdx, w2
312          adc       $0, R32(w3)
313          mov       8(up,i,8), %rax
314          mul       v1
315          add       %rax, w2
316          mov       w1, 8(rp,i,8)
317          adc       %rdx, w3
318          mov       $0, R32(w0)
319          mov       16(up,i,8), %rax
320          mul       v0
321          add       %rax, w2
322          mov       16(up,i,8), %rax
323          adc       %rdx, w3
324          adc       $0, R32(w0)
325          mul       v1
326          mov       $0, R32(w1)
327          add       %rax, w3
328          mov       24(up,i,8), %rax
329          mov       w2, 16(rp,i,8)
330          adc       %rdx, w0
331          add       $4, i
332          js        L(m2top0)
333
C Wind-down: the last limb times v0 and v1, then store the top three limbs.
C w1 was zeroed inside the loop, so adc w1,w1 leaves just the carry in it.
334          mul       v0
335          add       %rax, w3
336          mov       I(-8(up),-8(up,i,8)), %rax
337          mov       w3, I(-8(rp),-8(rp,i,8))
338          adc       %rdx, w0
339          adc       R32(w1), R32(w1)
340          mul       v1
341          add       %rax, w0
342          adc       %rdx, w1
343          mov       w0, I((rp),(rp,i,8))
344          mov       w1, I(8(rp),8(rp,i,8))
345
C Two v limbs consumed; return if none are left.
346          add       $-2, vn_param
347          jz        L(ret2)
348
C addmul_2 phase.  r15 (X1) is saved and the remaining v limb count is
C pushed so that it is addressable as vn, freeing r8 for use as X0.
349L(do_am0):
350          push      %r15
351          push      vn_param
352
C Outer loop: fetch the next two v limbs, advance vp and rp by two limbs,
C and form the products of the lowest up limb with v0 and v1 to seed the
C accumulators before entering the inner loop at L(lo0).
353L(olo0):
354          mov       (vp), v0
355          mov       8(vp), v1
356          lea       16(vp), vp
357          lea       16(rp), rp
358          mov       (up,un,8), %rax
359C         lea       0(un), i
360          mov       un, i
361          mul       v0
362          mov       %rax, X0
363          mov       (up,un,8), %rax
364          MOV(      %rdx, X1, 2)
365          mul       v1
366          MOV(      %rdx, w0, 4)
367          mov       (rp,un,8), w2
368          mov       %rax, w3
369          jmp       L(lo0)
370
C addmul_2 inner loop, four up limbs per iteration.  Each limb is multiplied
C by v0 and v1; the existing rp limbs are loaded and added in, and the sums
C are written back to rp.  The loop exits when add $4,i carries out.
371          ALIGNx
372L(am2top0):
373          mul       v1
374          add       w0, w1
375          adc       %rax, w2
376          mov       (up,i,8), %rax
377          MOV(      %rdx, w3, 1)
378          adc       $0, w3
379          mul       v0
380          add       w1, X1
381          mov       X1, -8(rp,i,8)
382          adc       %rax, X0
383          MOV(      %rdx, X1, 2)
384          adc       $0, X1
385          mov       (up,i,8), %rax
386          mul       v1
387          MOV(      %rdx, w0, 4)
388          mov       (rp,i,8), w1
389          add       w1, w2
390          adc       %rax, w3
391          adc       $0, w0
392L(lo0):   mov       8(up,i,8), %rax
393          mul       v0
394          add       w2, X0
395          adc       %rax, X1
396          mov       X0, (rp,i,8)
397          MOV(      %rdx, X0, 8)
398          adc       $0, X0
399          mov       8(up,i,8), %rax
400          mov       8(rp,i,8), w2
401          mul       v1
402          add       w2, w3
403          adc       %rax, w0
404          MOV(      %rdx, w1, 16)
405          adc       $0, w1
406          mov       16(up,i,8), %rax
407          mul       v0
408          add       w3, X1
409          mov       X1, 8(rp,i,8)
410          adc       %rax, X0
411          MOV(      %rdx, X1, 32)
412          mov       16(rp,i,8), w3
413          adc       $0, X1
414          mov       16(up,i,8), %rax
415          mul       v1
416          add       w3, w0
417          MOV(      %rdx, w2, 64)
418          adc       %rax, w1
419          mov       24(up,i,8), %rax
420          adc       $0, w2
421          mul       v0
422          add       w0, X0
423          mov       X0, 16(rp,i,8)
424          MOV(      %rdx, X0, 128)
425          adc       %rax, X1
426          mov       24(up,i,8), %rax
427          mov       24(rp,i,8), w0
428          adc       $0, X0
429          add       $4, i
430          jnc       L(am2top0)
431
C Wind-down: final multiply by v1, fold the pending sums together and store
C the top three limbs of this pass.  Z(i,$0) expands to i, which is used as
C a zero operand for adc at this point.
432          mul       v1
433          add       w0, w1
434          adc       %rax, w2
435          adc       Z(i,$0), %rdx
436          add       w1, X1
437          adc       Z(i,$0), X0
438          mov       X1, I(-8(rp),-8(rp,i,8))
439          add       w2, X0
440          mov       X0, I((rp),(rp,i,8))
441          adc       Z(i,$0), %rdx
442          mov       %rdx, I(8(rp),8(rp,i,8))
443
C Two more v limbs consumed; the counter lives in the stack slot vn.
444          addl      $-2, vn
445          jnz       L(olo0)
446
C L(ret) first discards the vn slot and restores r15, both pushed at
C L(do_am0).  L(ret2) is the exit for paths that never pushed them.
447L(ret):   pop       %rax
448          pop       %r15
449L(ret2):pop         %r14
450          pop       %r13
451          pop       %r12
452          pop       %rbp
453          pop       %rbx
454          FUNC_EXIT()
455          ret
456
457
C mul_2 inner loop, variant entered at L(m2e1).  Four up limbs per
C iteration, each multiplied by both v0 and v1.  w0-w3 rotate as a three
C limb accumulator window; registers are cleared with mov $0, which leaves
C the flags alone.  Results are stored to rp without reading it.  The loop
C runs while i is still negative.
458          ALIGNx
459L(m2top1):
460          mul       v0
461          add       %rax, w3
462          mov       -8(up,i,8), %rax
463          mov       w3, -8(rp,i,8)
464          adc       %rdx, w0
465          adc       $0, R32(w1)
466L(m2e1):mul         v1
467          add       %rax, w0
468          adc       %rdx, w1
469          mov       $0, R32(w2)
470          mov       (up,i,8), %rax
471          mul       v0
472          add       %rax, w0
473          mov       w0, (rp,i,8)
474          adc       %rdx, w1
475          mov       (up,i,8), %rax
476          adc       $0, R32(w2)
477          mul       v1
478          add       %rax, w1
479          adc       %rdx, w2
480          mov       8(up,i,8), %rax
481          mul       v0
482          mov       $0, R32(w3)
483          add       %rax, w1
484          adc       %rdx, w2
485          adc       $0, R32(w3)
486          mov       8(up,i,8), %rax
487          mul       v1
488          add       %rax, w2
489          mov       w1, 8(rp,i,8)
490          adc       %rdx, w3
491          mov       $0, R32(w0)
492          mov       16(up,i,8), %rax
493          mul       v0
494          add       %rax, w2
495          mov       16(up,i,8), %rax
496          adc       %rdx, w3
497          adc       $0, R32(w0)
498          mul       v1
499          mov       $0, R32(w1)
500          add       %rax, w3
501          mov       24(up,i,8), %rax
502          mov       w2, 16(rp,i,8)
503          adc       %rdx, w0
504          add       $4, i
505          js        L(m2top1)
506
C Wind-down: the last limb times v0 and v1, then store the top three limbs.
C w1 was zeroed inside the loop, so adc w1,w1 leaves just the carry in it.
507          mul       v0
508          add       %rax, w3
509          mov       I(-8(up),-8(up,i,8)), %rax
510          mov       w3, I(-8(rp),-8(rp,i,8))
511          adc       %rdx, w0
512          adc       R32(w1), R32(w1)
513          mul       v1
514          add       %rax, w0
515          adc       %rdx, w1
516          mov       w0, I((rp),(rp,i,8))
517          mov       w1, I(8(rp),8(rp,i,8))
518
C Two v limbs consumed; return if none are left.
519          add       $-2, vn_param
520          jz        L(ret2)
521
C addmul_2 phase.  r15 (X1) is saved and the remaining v limb count is
C pushed so that it is addressable as vn, freeing r8 for use as X0.
522L(do_am1):
523          push      %r15
524          push      vn_param
525
C Outer loop: fetch the next two v limbs, advance vp and rp by two limbs,
C and form the products of the lowest up limb with v0 and v1 to seed the
C accumulators before entering the inner loop at L(lo1).
526L(olo1):
527          mov       (vp), v0
528          mov       8(vp), v1
529          lea       16(vp), vp
530          lea       16(rp), rp
531          mov       (up,un,8), %rax
532          lea       1(un), i
533          mul       v0
534          mov       %rax, X1
535          MOV(      %rdx, X0, 128)
536          mov       (up,un,8), %rax
537          mov       (rp,un,8), w1
538          mul       v1
539          mov       %rax, w2
540          mov       8(up,un,8), %rax
541          MOV(      %rdx, w3, 1)
542          jmp       L(lo1)
543
C addmul_2 inner loop, four up limbs per iteration.  Each limb is multiplied
C by v0 and v1; the existing rp limbs are loaded and added in, and the sums
C are written back to rp.  The loop exits when add $4,i carries out.
544          ALIGNx
545L(am2top1):
546          mul       v1
547          add       w0, w1
548          adc       %rax, w2
549          mov       (up,i,8), %rax
550          MOV(      %rdx, w3, 1)
551          adc       $0, w3
552L(lo1):   mul       v0
553          add       w1, X1
554          mov       X1, -8(rp,i,8)
555          adc       %rax, X0
556          MOV(      %rdx, X1, 2)
557          adc       $0, X1
558          mov       (up,i,8), %rax
559          mul       v1
560          MOV(      %rdx, w0, 4)
561          mov       (rp,i,8), w1
562          add       w1, w2
563          adc       %rax, w3
564          adc       $0, w0
565          mov       8(up,i,8), %rax
566          mul       v0
567          add       w2, X0
568          adc       %rax, X1
569          mov       X0, (rp,i,8)
570          MOV(      %rdx, X0, 8)
571          adc       $0, X0
572          mov       8(up,i,8), %rax
573          mov       8(rp,i,8), w2
574          mul       v1
575          add       w2, w3
576          adc       %rax, w0
577          MOV(      %rdx, w1, 16)
578          adc       $0, w1
579          mov       16(up,i,8), %rax
580          mul       v0
581          add       w3, X1
582          mov       X1, 8(rp,i,8)
583          adc       %rax, X0
584          MOV(      %rdx, X1, 32)
585          mov       16(rp,i,8), w3
586          adc       $0, X1
587          mov       16(up,i,8), %rax
588          mul       v1
589          add       w3, w0
590          MOV(      %rdx, w2, 64)
591          adc       %rax, w1
592          mov       24(up,i,8), %rax
593          adc       $0, w2
594          mul       v0
595          add       w0, X0
596          mov       X0, 16(rp,i,8)
597          MOV(      %rdx, X0, 128)
598          adc       %rax, X1
599          mov       24(up,i,8), %rax
600          mov       24(rp,i,8), w0
601          adc       $0, X0
602          add       $4, i
603          jnc       L(am2top1)
604
C Wind-down: final multiply by v1, fold the pending sums together and store
C the top three limbs of this pass.  Z(i,$0) expands to i, which is used as
C a zero operand for adc at this point.
605          mul       v1
606          add       w0, w1
607          adc       %rax, w2
608          adc       Z(i,$0), %rdx
609          add       w1, X1
610          adc       Z(i,$0), X0
611          mov       X1, I(-8(rp),-8(rp,i,8))
612          add       w2, X0
613          mov       X0, I((rp),(rp,i,8))
614          adc       Z(i,$0), %rdx
615          mov       %rdx, I(8(rp),8(rp,i,8))
616
C Two more v limbs consumed; the counter lives in the stack slot vn.
617          addl      $-2, vn
618          jnz       L(olo1)
619
C Exit: discard the vn slot, then restore r15 and the registers pushed at
C function entry, in reverse push order.
620          pop       %rax
621          pop       %r15
622          pop       %r14
623          pop       %r13
624          pop       %r12
625          pop       %rbp
626          pop       %rbx
627          FUNC_EXIT()
628          ret
629
630
C mul_2 inner loop, variant entered at L(m2e2).  Four up limbs per
C iteration, each multiplied by both v0 and v1.  w0-w3 rotate as a three
C limb accumulator window; registers are cleared with mov $0, which leaves
C the flags alone.  Results are stored to rp without reading it.  The loop
C runs while i is still negative.
631          ALIGNx
632L(m2top2):
633          mul       v0
634          add       %rax, w3
635          mov       -8(up,i,8), %rax
636          mov       w3, -8(rp,i,8)
637          adc       %rdx, w0
638          adc       $0, R32(w1)
639          mul       v1
640          add       %rax, w0
641          adc       %rdx, w1
642          mov       $0, R32(w2)
643          mov       (up,i,8), %rax
644          mul       v0
645          add       %rax, w0
646          mov       w0, (rp,i,8)
647          adc       %rdx, w1
648          mov       (up,i,8), %rax
649          adc       $0, R32(w2)
650          mul       v1
651          add       %rax, w1
652          adc       %rdx, w2
653          mov       8(up,i,8), %rax
654          mul       v0
655          mov       $0, R32(w3)
656          add       %rax, w1
657          adc       %rdx, w2
658          adc       $0, R32(w3)
659          mov       8(up,i,8), %rax
660          mul       v1
661          add       %rax, w2
662          mov       w1, 8(rp,i,8)
663          adc       %rdx, w3
664          mov       $0, R32(w0)
665          mov       16(up,i,8), %rax
666          mul       v0
667          add       %rax, w2
668          mov       16(up,i,8), %rax
669          adc       %rdx, w3
670          adc       $0, R32(w0)
671L(m2e2):mul         v1
672          mov       $0, R32(w1)
673          add       %rax, w3
674          mov       24(up,i,8), %rax
675          mov       w2, 16(rp,i,8)
676          adc       %rdx, w0
677          add       $4, i
678          js        L(m2top2)
679
C Wind-down: the last limb times v0 and v1, then store the top three limbs.
C w1 was zeroed inside the loop, so adc w1,w1 leaves just the carry in it.
680          mul       v0
681          add       %rax, w3
682          mov       I(-8(up),-8(up,i,8)), %rax
683          mov       w3, I(-8(rp),-8(rp,i,8))
684          adc       %rdx, w0
685          adc       R32(w1), R32(w1)
686          mul       v1
687          add       %rax, w0
688          adc       %rdx, w1
689          mov       w0, I((rp),(rp,i,8))
690          mov       w1, I(8(rp),8(rp,i,8))
691
C Two v limbs consumed; return if none are left.
692          add       $-2, vn_param
693          jz        L(ret2)
694
C addmul_2 phase.  r15 (X1) is saved and the remaining v limb count is
C pushed so that it is addressable as vn, freeing r8 for use as X0.
695L(do_am2):
696          push      %r15
697          push      vn_param
698
C Outer loop: fetch the next two v limbs, advance vp and rp by two limbs,
C and form the products of the lowest up limb with v0 and v1 to seed the
C accumulators before entering the inner loop at L(lo2).
699L(olo2):
700          mov       (vp), v0
701          mov       8(vp), v1
702          lea       16(vp), vp
703          lea       16(rp), rp
704          mov       (up,un,8), %rax
705          lea       -2(un), i
706          mul       v0
707          mov       %rax, X0
708          MOV(      %rdx, X1, 32)
709          mov       (up,un,8), %rax
710          mov       (rp,un,8), w0
711          mul       v1
712          mov       %rax, w1
C Plain register copy of rdx into w2, spelled as lea.
713          lea       (%rdx), w2
714          mov       8(up,un,8), %rax
715          jmp       L(lo2)
716
C addmul_2 inner loop, four up limbs per iteration.  Each limb is multiplied
C by v0 and v1; the existing rp limbs are loaded and added in, and the sums
C are written back to rp.  The loop exits when add $4,i carries out.
717          ALIGNx
718L(am2top2):
719          mul       v1
720          add       w0, w1
721          adc       %rax, w2
722          mov       (up,i,8), %rax
723          MOV(      %rdx, w3, 1)
724          adc       $0, w3
725          mul       v0
726          add       w1, X1
727          mov       X1, -8(rp,i,8)
728          adc       %rax, X0
729          MOV(      %rdx, X1, 2)
730          adc       $0, X1
731          mov       (up,i,8), %rax
732          mul       v1
733          MOV(      %rdx, w0, 4)
734          mov       (rp,i,8), w1
735          add       w1, w2
736          adc       %rax, w3
737          adc       $0, w0
738          mov       8(up,i,8), %rax
739          mul       v0
740          add       w2, X0
741          adc       %rax, X1
742          mov       X0, (rp,i,8)
743          MOV(      %rdx, X0, 8)
744          adc       $0, X0
745          mov       8(up,i,8), %rax
746          mov       8(rp,i,8), w2
747          mul       v1
748          add       w2, w3
749          adc       %rax, w0
750          MOV(      %rdx, w1, 16)
751          adc       $0, w1
752          mov       16(up,i,8), %rax
753          mul       v0
754          add       w3, X1
755          mov       X1, 8(rp,i,8)
756          adc       %rax, X0
757          MOV(      %rdx, X1, 32)
758          mov       16(rp,i,8), w3
759          adc       $0, X1
760          mov       16(up,i,8), %rax
761          mul       v1
762          add       w3, w0
763          MOV(      %rdx, w2, 64)
764          adc       %rax, w1
765          mov       24(up,i,8), %rax
766          adc       $0, w2
767L(lo2):   mul       v0
768          add       w0, X0
769          mov       X0, 16(rp,i,8)
770          MOV(      %rdx, X0, 128)
771          adc       %rax, X1
772          mov       24(up,i,8), %rax
773          mov       24(rp,i,8), w0
774          adc       $0, X0
775          add       $4, i
776          jnc       L(am2top2)
777
C Wind-down: final multiply by v1, fold the pending sums together and store
C the top three limbs of this pass.  Z(i,$0) expands to i, which is used as
C a zero operand for adc at this point.
778          mul       v1
779          add       w0, w1
780          adc       %rax, w2
781          adc       Z(i,$0), %rdx
782          add       w1, X1
783          adc       Z(i,$0), X0
784          mov       X1, I(-8(rp),-8(rp,i,8))
785          add       w2, X0
786          mov       X0, I((rp),(rp,i,8))
787          adc       Z(i,$0), %rdx
788          mov       %rdx, I(8(rp),8(rp,i,8))
789
C Two more v limbs consumed; the counter lives in the stack slot vn.
790          addl      $-2, vn
791          jnz       L(olo2)
792
C Exit: discard the vn slot, then restore r15 and the registers pushed at
C function entry, in reverse push order.
793          pop       %rax
794          pop       %r15
795          pop       %r14
796          pop       %r13
797          pop       %r12
798          pop       %rbp
799          pop       %rbx
800          FUNC_EXIT()
801          ret
802
803
C mul_2 inner loop, variant entered at L(m2e3).  Four up limbs per
C iteration, each multiplied by both v0 and v1.  w0-w3 rotate as a three
C limb accumulator window; registers are cleared with mov $0, which leaves
C the flags alone.  Results are stored to rp without reading it.  The loop
C runs while i is still negative.
804          ALIGNx
805L(m2top3):
806          mul       v0
807          add       %rax, w3
808          mov       -8(up,i,8), %rax
809          mov       w3, -8(rp,i,8)
810          adc       %rdx, w0
811          adc       $0, R32(w1)
812          mul       v1
813          add       %rax, w0
814          adc       %rdx, w1
815          mov       $0, R32(w2)
816          mov       (up,i,8), %rax
817          mul       v0
818          add       %rax, w0
819          mov       w0, (rp,i,8)
820          adc       %rdx, w1
821          mov       (up,i,8), %rax
822          adc       $0, R32(w2)
823          mul       v1
824          add       %rax, w1
825          adc       %rdx, w2
826          mov       8(up,i,8), %rax
827          mul       v0
828          mov       $0, R32(w3)
829          add       %rax, w1
830          adc       %rdx, w2
831          adc       $0, R32(w3)
832          mov       8(up,i,8), %rax
833L(m2e3):mul         v1
834          add       %rax, w2
835          mov       w1, 8(rp,i,8)
836          adc       %rdx, w3
837          mov       $0, R32(w0)
838          mov       16(up,i,8), %rax
839          mul       v0
840          add       %rax, w2
841          mov       16(up,i,8), %rax
842          adc       %rdx, w3
843          adc       $0, R32(w0)
844          mul       v1
845          mov       $0, R32(w1)
846          add       %rax, w3
847          mov       24(up,i,8), %rax
848          mov       w2, 16(rp,i,8)
849          adc       %rdx, w0
850          add       $4, i
851          js        L(m2top3)
852
C Wind-down: the last limb times v0 and v1, then store the top three limbs.
C w1 was zeroed by the mov $0 just before the loop branch and has not been
C written since, so adc w1,w1 leaves just the carry in it.  This is the same
C (one byte shorter) form used by the wind-down of L(m2top0), L(m2top1) and
C L(m2top2).
853          mul       v0
854          add       %rax, w3
855          mov       I(-8(up),-8(up,i,8)), %rax
856          mov       w3, I(-8(rp),-8(rp,i,8))
857          adc       %rdx, w0
858          adc       R32(w1), R32(w1)
859          mul       v1
860          add       %rax, w0
861          adc       %rdx, w1
862          mov       w0, I((rp),(rp,i,8))
863          mov       w1, I(8(rp),8(rp,i,8))
864
C Two v limbs consumed; return if none are left.
865          add       $-2, vn_param
866          jz        L(ret2)
867
C addmul_2 phase.  r15 (X1) is saved and the remaining v limb count is
C pushed so that it is addressable as vn, freeing r8 for use as X0.
868L(do_am3):
869          push      %r15
870          push      vn_param
871
C Outer loop: fetch the next two v limbs, advance vp and rp by two limbs,
C and form the products of the lowest up limb with v0 and v1 to seed the
C accumulators before entering the inner loop at L(lo3).
872L(olo3):
873          mov       (vp), v0
874          mov       8(vp), v1
875          lea       16(vp), vp
876          lea       16(rp), rp
877          mov       (up,un,8), %rax
878          lea       -1(un), i
879          mul       v0
880          mov       %rax, X1
881          MOV(      %rdx, X0, 8)
882          mov       (up,un,8), %rax
883          mov       (rp,un,8), w3
884          mul       v1
885          mov       %rax, w0
886          MOV(      %rdx, w1, 16)
887          mov       8(up,un,8), %rax
888          jmp       L(lo3)
889
C addmul_2 inner loop, four up limbs per iteration.  Each limb is multiplied
C by v0 and v1; the existing rp limbs are loaded and added in, and the sums
C are written back to rp.  The loop exits when add $4,i carries out.
890          ALIGNx
891L(am2top3):
892          mul       v1
893          add       w0, w1
894          adc       %rax, w2
895          mov       (up,i,8), %rax
896          MOV(      %rdx, w3, 1)
897          adc       $0, w3
898          mul       v0
899          add       w1, X1
900          mov       X1, -8(rp,i,8)
901          adc       %rax, X0
902          MOV(      %rdx, X1, 2)
903          adc       $0, X1
904          mov       (up,i,8), %rax
905          mul       v1
906          MOV(      %rdx, w0, 4)
907          mov       (rp,i,8), w1
908          add       w1, w2
909          adc       %rax, w3
910          adc       $0, w0
911          mov       8(up,i,8), %rax
912          mul       v0
913          add       w2, X0
914          adc       %rax, X1
915          mov       X0, (rp,i,8)
916          MOV(      %rdx, X0, 8)
917          adc       $0, X0
918          mov       8(up,i,8), %rax
919          mov       8(rp,i,8), w2
920          mul       v1
921          add       w2, w3
922          adc       %rax, w0
923          MOV(      %rdx, w1, 16)
924          adc       $0, w1
925          mov       16(up,i,8), %rax
926L(lo3):   mul       v0
927          add       w3, X1
928          mov       X1, 8(rp,i,8)
929          adc       %rax, X0
930          MOV(      %rdx, X1, 32)
931          mov       16(rp,i,8), w3
932          adc       $0, X1
933          mov       16(up,i,8), %rax
934          mul       v1
935          add       w3, w0
936          MOV(      %rdx, w2, 64)
937          adc       %rax, w1
938          mov       24(up,i,8), %rax
939          adc       $0, w2
940          mul       v0
941          add       w0, X0
942          mov       X0, 16(rp,i,8)
943          MOV(      %rdx, X0, 128)
944          adc       %rax, X1
945          mov       24(up,i,8), %rax
946          mov       24(rp,i,8), w0
947          adc       $0, X0
948          add       $4, i
949          jnc       L(am2top3)
950
C Wind-down: final multiply by v1, fold the pending sums together and store
C the top three limbs of this pass.  Z(i,$0) expands to i, which is used as
C a zero operand for adc at this point.
951          mul       v1
952          add       w0, w1
953          adc       %rax, w2
954          adc       Z(i,$0), %rdx
955          add       w1, X1
956          adc       Z(i,$0), X0
957          mov       X1, I(-8(rp),-8(rp,i,8))
958          add       w2, X0
959          mov       X0, I((rp),(rp,i,8))
960          adc       Z(i,$0), %rdx
961          mov       %rdx, I(8(rp),8(rp,i,8))
962
C Two more v limbs consumed; the counter lives in the stack slot vn.
963          addl      $-2, vn
964          jnz       L(olo3)
965
C Exit: discard the vn slot, then restore r15 and the registers pushed at
C function entry, in reverse push order.
966          pop       %rax
967          pop       %r15
968          pop       %r14
969          pop       %r13
970          pop       %r12
971          pop       %rbp
972          pop       %rbx
973          FUNC_EXIT()
974          ret
975EPILOGUE()
976