1dnl  AMD64 mpn_addmul_2 optimised for AMD Bulldozer.
2
3dnl  Copyright 2017 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33C              cycles/limb
34C AMD K8,K9
35C AMD K10
36C AMD bd1  4.2
37C AMD bd2  4.4
38C AMD bd3
39C AMD bd4
40C AMD zen
41C AMD bt1
42C AMD bt2
43C Intel P4
44C Intel PNR
45C Intel NHM
46C Intel SBR
47C Intel IBR
48C Intel HWL
49C Intel BWL
50C Intel SKL
51C Intel atom
52C Intel SLM
53C VIA nano
54
55C The loop of this code is the result of running a code generation and
56C optimisation tool suite written by David Harvey and Torbjorn Granlund.
57
C Incoming arguments.  The register named in each trailing comment (rcx, rdx,
C r8, r9) is where the same argument arrives under the DOS64 convention.
58define(`rp',      `%rdi')   C rcx
59define(`up',      `%rsi')   C rdx
60define(`n_param', `%rdx')   C r8
61define(`vp',      `%rcx')   C r9
62
C Working registers.  n reuses %rcx, the register that also holds vp, so vp
C has to be fully consumed before n is first written.  v0, v1, X0 and X1 are
C placed in %rbx, %rbp, %r12 and %r13, which the function pushes on entry and
C pops before returning.
63define(`n',       `%rcx')
64define(`v0',      `%rbx')
65define(`v1',      `%rbp')
66define(`X0',      `%r12')
67define(`X1',      `%r13')
68
C Four scratch limbs that rotate through the v1 multiply-accumulate chain.
69define(`w0',    `%r8')
70define(`w1',    `%r9')
71define(`w2',    `%r10')
72define(`w3',    `%r11')
73
C Calling conventions this file declares support for.
C NOTE(review): ABI_SUPPORT, ASM_START, TEXT and ALIGN are macros defined
C outside this file (pulled in through config.m4); their exact expansion is
C not visible here.
74ABI_SUPPORT(DOS64)
75ABI_SUPPORT(STD64)
76
77ASM_START()
78          TEXT
79          ALIGN(32)
C mpn_addmul_2 (rp, up, n_param, vp)
C
C Multiplies the n_param-limb operand at up by the two limbs v0 = vp[0] and
C v1 = vp[1] and adds the product to the n_param limbs at rp.  Limbs are 8
C bytes.  The code reads rp[0] .. rp[n_param-1], stores rp[0] .. rp[n_param],
C and returns the limb above that in %rax.
C
C Both pointers are advanced past the operands and indexed with a negative
C count that runs up to zero.  The low two bits of the count select one of
C four entry points into a loop unrolled four ways.
C
C %rbx, %rbp, %r12 and %r13 are saved on entry and restored before return.
C NOTE(review): FUNC_ENTRY and FUNC_EXIT are defined outside this file;
C presumably they adapt DOS64 argument registers to the ones used here --
C confirm.
80PROLOGUE(mpn_addmul_2)
81          FUNC_ENTRY(4)
82          push      %rbx
83          push      %rbp
84          push      %r12
85          push      %r13
86
C vp and n share %rcx, so fetch both multiplier limbs before n is written.
87          mov       (vp), v0
88          mov       8(vp), v1
89
90          mov       (up), %rax                    C rax = up[0]
91          mov       $0, R32(w2)                   C abuse w2
92
C Point up and rp just past n_param limbs; w2 becomes -n_param so that
C (up,w2,8) and (rp,w2,8) address limb 0.
93          lea       (up,n_param,8), up
94          lea       (rp,n_param,8), rp
95          sub       n_param, w2
96          mul       v0                            C rdx:rax = up[0] * v0
97
C Bit 0 of -n_param equals bit 0 of n_param: odd sizes go to L(bx1).
98          test      $1, R8(w2)
99          jnz       L(bx1)
100
C Even size.  X1 = low limb, X0 = high limb of up[0] * v0.
101L(bx0):   mov       %rdx, X0
102          mov       %rax, X1
103          test      $2, R8(w2)
104          jnz       L(b10)
105
C n = -un.  w3 = rp[0], w1:w0 = up[0] * v1, rax = up[1]; enter at L(lo0).
106L(b00):   lea       (w2), n                       C un = 4, 8, 12, ...
107          mov       (up,w2,8), %rax
108          mov       (rp,w2,8), w3
109          mul       v1
110          mov       %rax, w0
111          mov       8(up,w2,8), %rax
112          mov       %rdx, w1
113          jmp       L(lo0)
114
C n = 2 - un.  w1 = rp[0], w3:w2 = up[0] * v1 (the count in w2 is no longer
C needed once n is set), rax = up[1].  For un = 2 the index is already zero
C and the loop is skipped.
115L(b10):   lea       2(w2), n            C un = 2, 6, 10, ...
116          mov       (up,w2,8), %rax
117          mov       (rp,w2,8), w1
118          mul       v1
119          mov       %rdx, w3
120          mov       %rax, w2
121          mov       -8(up,n,8), %rax
122          test      n, n
123          jz        L(end)
124          jmp       L(top)
125
C Odd size.  X0 = low limb, X1 = high limb of up[0] * v0.
126L(bx1):   mov       %rax, X0
127          mov       %rdx, X1
128          test      $2, R8(w2)
129          jz        L(b11)
130
C n = 1 - un.  w2 = rp[0], w0:w3 = up[0] * v1; enter at L(lo1).
C NOTE(review): with un = 1 this sets n = 0, so L(lo1) loads (up,n,8), one
C limb beyond the source operand, and the add $4 at the loop bottom then
C produces no carry.  The routine therefore appears to rely on un >= 2 --
C confirm against the callers.
131L(b01):   lea       1(w2), n            C un = 1, 5, 9, ...
132          mov       (up,w2,8), %rax
133          mul       v1
134          mov       (rp,w2,8), w2
135          mov       %rdx, w0
136          mov       %rax, w3
137          jmp       L(lo1)
138
C n = -1 - un.  w0 = rp[0], w2:w1 = up[0] * v1, rax = up[1]; enter at L(lo3).
139L(b11):   lea       -1(w2), n           C un = 3, 7, 11, ...
140          mov       (up,w2,8), %rax
141          mul       v1
142          mov       (rp,w2,8), w0
143          mov       %rax, w1
144          mov       8(up,w2,8), %rax
145          mov       %rdx, w2
146          jmp       L(lo3)
147
C Main loop, unrolled four ways.  Each quarter handles one limb of up: it
C multiplies that limb by v0, adds the pending w limb into the X0/X1 pair and
C stores one finished result limb, then multiplies the same up limb by v1 and
C adds in the next limb loaded from rp.  X0/X1 swap roles every quarter and
C w0..w3 rotate.  At L(top) rax already holds the up limb at index n-1.
148          ALIGN(32)
149L(top):
150L(lo2):   mul       v0
151          add       w1, X1
152          mov       X1, -16(rp,n,8)
153          mov       %rdx, X1
154          adc       %rax, X0
155          adc       $0, X1
156          mov       -8(up,n,8), %rax
157          mul       v1
158          mov       -8(rp,n,8), w1
159          mov       %rdx, w0
160          add       w1, w2
161          adc       %rax, w3
162          adc       $0, w0
163L(lo1):   mov       (up,n,8), %rax
164          mul       v0
165          add       w2, X0
166          mov       X0, -8(rp,n,8)
167          mov       %rdx, X0
168          adc       %rax, X1
169          mov       (up,n,8), %rax
170          adc       $0, X0
171          mov       (rp,n,8), w2
172          mul       v1
173          add       w2, w3
174          adc       %rax, w0
175          mov       8(up,n,8), %rax
176          mov       %rdx, w1
177          adc       $0, w1
178L(lo0):   mul       v0
179          add       w3, X1
180          mov       X1, (rp,n,8)
181          adc       %rax, X0
182          mov       8(up,n,8), %rax
183          mov       %rdx, X1
184          adc       $0, X1
185          mov       8(rp,n,8), w3
186          mul       v1
187          add       w3, w0
188          adc       %rax, w1
189          mov       16(up,n,8), %rax
190          mov       %rdx, w2
191          adc       $0, w2
192L(lo3):   mul       v0
193          add       w0, X0
194          mov       X0, 8(rp,n,8)
195          mov       %rdx, X0
196          adc       %rax, X1
197          adc       $0, X0
198          mov       16(up,n,8), %rax
199          mov       16(rp,n,8), w0
200          mul       v1
201          mov       %rdx, w3
202          add       w0, w1
203          adc       %rax, w2
204          adc       $0, w3
205          mov       24(up,n,8), %rax              C up limb for the next quarter
C n is a negative multiple of 4 on every entry path, so this add carries out
C only when n reaches zero; the loop repeats while there is no carry.
206          add       $4, n
207          jnc       L(top)
208
C Wind-down.  rax holds the last limb of up (index -1 from the advanced
C pointer).  The three highest stored limbs go to -16(rp), -8(rp) and (rp);
C the limb above them is accumulated in rdx and returned.
209L(end):   mul       v0
210          add       w1, X1
211          mov       X1, -16(rp)
212          mov       %rdx, X1
213          adc       %rax, X0
214          adc       $0, X1
215          mov       -8(up), %rax                  C reload the last up limb
216          mul       v1
217          mov       -8(rp), w1
218          add       w1, w2
219          adc       %rax, w3
220          adc       $0, %rdx                      C carry into high limb of the v1 product
221          add       w2, X0
222          adc       $0, X1
223          mov       X0, -8(rp)
224          add       w3, X1
225          mov       X1, (rp)
226          adc       $0, %rdx
227          mov       %rdx, %rax                    C return value
228
229          pop       %r13
230          pop       %r12
231          pop       %rbp
232          pop       %rbx
233          FUNC_EXIT()
234          ret
235EPILOGUE()
236