1dnl  AMD64 mpn_mullo_basecase optimised for Conroe/Wolfdale/Nehalem/Westmere.
2
3dnl  Contributed to the GNU project by Torbjörn Granlund.
4
5dnl  Copyright 2008, 2009, 2011-2013 Free Software Foundation, Inc.
6
7dnl  This file is part of the GNU MP Library.
8dnl
9dnl  The GNU MP Library is free software; you can redistribute it and/or modify
10dnl  it under the terms of either:
11dnl
12dnl    * the GNU Lesser General Public License as published by the Free
13dnl      Software Foundation; either version 3 of the License, or (at your
14dnl      option) any later version.
15dnl
16dnl  or
17dnl
18dnl    * the GNU General Public License as published by the Free Software
19dnl      Foundation; either version 2 of the License, or (at your option) any
20dnl      later version.
21dnl
22dnl  or both in parallel, as here.
23dnl
24dnl  The GNU MP Library is distributed in the hope that it will be useful, but
25dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
27dnl  for more details.
28dnl
29dnl  You should have received copies of the GNU General Public License and the
30dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
31dnl  see https://www.gnu.org/licenses/.
32
33include(`../config.m4')
34
35C cycles/limb       mul_2               addmul_2
36C AMD K8,K9
37C AMD K10
38C AMD bull
39C AMD pile
40C AMD steam
41C AMD bobcat
42C AMD jaguar
43C Intel P4
44C Intel core         4.0                4.18-4.25
45C Intel NHM          3.75               4.06-4.2
46C Intel SBR
47C Intel IBR
48C Intel HWL
49C Intel BWL
50C Intel atom
51C VIA nano
52
53C The inner loops of this code are the result of running a code generation and
54C optimisation tool suite written by David Harvey and Torbjörn Granlund.
55
56C TODO
57C   * Implement proper cor2, replacing current cor0.
58C   * Offset n by 2 in order to avoid the outer loop cmp.  (And sqr_basecase?)
59C   * Micro-optimise.
60
61C When playing with pointers, set this to $2 to fall back to conservative
62C indexing in wind-down code.
63define(`I',`$1')
C As defined here I(a,b) yields a, the address form without an index register.
64
C Incoming arguments.  vp_param and n_param are only usable early on: every
C mul overwrites %rdx, and %rcx doubles as the accumulator w1.
65define(`rp',       `%rdi')
66define(`up',       `%rsi')
67define(`vp_param', `%rdx')
68define(`n_param',  `%rcx')
69
C v0, v1  the two V limbs handled by the current pass
C w0-w3   rotating accumulators of the inner loops
C n       negated limb count, raised by 2 after every pass
C i       inner loop index, raised by 4 per trip round the unrolled loops
C vp      copy of vp_param that survives the multiplies
70define(`v0',       `%r10')
71define(`v1',       `%r11')
72define(`w0',       `%rbx')
73define(`w1',       `%rcx')
74define(`w2',       `%rbp')
75define(`w3',       `%r12')
76define(`n',        `%r9')
77define(`i',        `%r13')
78define(`vp',       `%r8')
79
C Second accumulator pair, used only by the addmul_2 loop.  %r14 and %r15 are
C pushed just before that loop is first entered, not on function entry.
80define(`X0',       `%r14')
81define(`X1',       `%r15')
82
83C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
84
85ABI_SUPPORT(DOS64)
86ABI_SUPPORT(STD64)
87
C Alignment placed in front of each of the two inner loops.
88define(`ALIGNx', `ALIGN(16)')
89
C MOV(src, dst, bit) copies one register to another, either with a plain mov
C or with an lea of the form lea (src), dst.  Both leave the same value in dst
C and neither touches the flags, which matters because several call sites sit
C between an adc pair.  The form is chosen per call site: if bit is set in the
C mask N the lea form is emitted, otherwise the mov form.  With N = 85 (binary
C 01010101) the sites tagged 1, 4, 16 and 64 become lea and the sites tagged
C 2, 8, 32 and 128 become mov.
C The ifdef only supplies N = 0 (mov everywhere) when N is not yet defined, so
C it has no effect as long as the define just before it is present.
90define(`N', 85)
91ifdef(`N',,`define(`N',0)')
92define(`MOV', `ifelse(eval(N & $3),0,`mov         $1, $2',`lea        ($1), $2')')
93
C mpn_mullo_basecase (rp, up, vp, n)
C
C Writes to rp the n least significant limbs of the product of the n-limb
C operands at up and vp.  The pointers arrive in rp, up and vp_param and the
C limb count in n_param.  FUNC_ENTRY and FUNC_EXIT are defined outside this
C file; presumably they do the DOS64 argument shuffling (confirm there).
C
C Outline for n >= 4:
C   1. A mul_2 pass forms U times the two lowest V limbs, keeping n limbs.
C   2. addmul_2 passes at L(outer) add U times each further pair of V limbs,
C      each pass starting two result limbs higher and stopping at limb n.
C   3. L(cor1) finishes when two V limbs are left, L(cor0) when one is left.
C Counts below 4 go to straight-line code at L(small).
94ASM_START()
95          TEXT
96          ALIGN(32)
97PROLOGUE(mpn_mullo_basecase)
98          FUNC_ENTRY(4)
99
100          mov       (up), %rax          C rax = U[0], needed first on every path
101          mov       vp_param, vp        C keep the V pointer in r8, mul will overwrite rdx
102
103          cmp       $4, n_param
104          jb        L(small)            C unsigned n < 4
105
106          mov       (vp_param), v0      C v0 = V[0]
107          push      %rbx
108          lea       (rp,n_param,8), rp  C point rp at R[un]
109          push      %rbp
110          lea       (up,n_param,8), up  C point up right after U's end
111          push      %r12
112          mov       $0, R32(n)                    C FIXME
113          sub       n_param, n          C n = -n_param, so (rp,n,8) is R[0] and (up,n,8) is U[0]
114          push      %r13
115          mul       v0                  C rdx:rax = U[0] * V[0]
116          mov       8(vp), v1           C v1 = V[1]
117
C Choose one of the four entry points of the unrolled mul_2 loop from the two
C low bits of the limb count.  n_param shares %rcx with w1, so both bit tests
C are done before any path writes w1.
118          test      $1, R8(n_param)
119          jnz       L(m2x1)
120
121L(m2x0):test        $2, R8(n_param)
122          jnz       L(m2b2)
123
C Count is 0 mod 4: the low product limb goes straight to R[0], the high limb
C seeds w1.
124L(m2b0):lea         (n), i
125          mov       %rax, (rp,n,8)
126          mov       %rdx, w1
127          mov       (up,n,8), %rax      C reload U[0] for the multiply by v1
128          xor       R32(w2), R32(w2)
129          jmp       L(m2e0)
130
C Count is 2 mod 4: the low product limb waits in w2 and is stored by the loop
C body at L(m2e2) through 16(rp,i,8), which is R[0] because i starts at n-2.
131L(m2b2):lea         -2(n), i
132          mov       %rax, w2
133          mov       (up,n,8), %rax      C reload U[0] for the multiply by v1
134          mov       %rdx, w3
135          xor       R32(w0), R32(w0)
136          jmp       L(m2e2)
137
138L(m2x1):test        $2, R8(n_param)
139          jnz       L(m2b3)
140
C Count is 1 mod 4: store R[0] now, the high limb seeds w0, i starts at n+1.
141L(m2b1):lea         1(n), i
142          mov       %rax, (rp,n,8)
143          mov       (up,n,8), %rax      C reload U[0] for the multiply by v1
144          mov       %rdx, w0
145          xor       R32(w1), R32(w1)
146          jmp       L(m2e1)
147
C Count is 3 mod 4: the low product limb waits in w1 and is stored at L(m2e3)
C through 8(rp,i,8), which is R[0] because i starts at n-1.
148L(m2b3):lea         -1(n), i
149          xor       R32(w3), R32(w3)
150          mov       %rax, w1
151          mov       %rdx, w2
152          mov       (up,n,8), %rax      C reload U[0] for the multiply by v1
153          jmp       L(m2e3)
154
C mul_2 inner loop, unrolled four times, with entry points m2e1, m2e0, m2e3
C and m2e2.  Each quarter multiplies one U limb by v1 and the following U limb
C by v0, adds both double-limb products into the rotating accumulators w0-w3
C and stores one finished result limb.  Every U limb is loaded twice, once for
C each multiply, because mul overwrites rax.
155          ALIGNx
156L(m2tp):mul         v0                  C U limb preloaded at the loop bottom, times v0
157          add       %rax, w3
158          mov       -8(up,i,8), %rax
159          mov       w3, -8(rp,i,8)      C result limb at index i-1 done
160          adc       %rdx, w0
161          adc       $0, R32(w1)
162L(m2e1):mul         v1
163          add       %rax, w0
164          adc       %rdx, w1
165          mov       $0, R32(w2)
166          mov       (up,i,8), %rax
167          mul       v0
168          add       %rax, w0
169          mov       w0, (rp,i,8)        C result limb at index i done
170          adc       %rdx, w1
171          mov       (up,i,8), %rax
172          adc       $0, R32(w2)
173L(m2e0):mul         v1
174          add       %rax, w1
175          adc       %rdx, w2
176          mov       8(up,i,8), %rax
177          mul       v0
178          mov       $0, R32(w3)
179          add       %rax, w1
180          adc       %rdx, w2
181          adc       $0, R32(w3)
182          mov       8(up,i,8), %rax
183L(m2e3):mul         v1
184          add       %rax, w2
185          mov       w1, 8(rp,i,8)       C result limb at index i+1 done
186          adc       %rdx, w3
187          mov       $0, R32(w0)
188          mov       16(up,i,8), %rax
189          mul       v0
190          add       %rax, w2
191          mov       16(up,i,8), %rax
192          adc       %rdx, w3
193          adc       $0, R32(w0)
194L(m2e2):mul         v1
195          mov       $0, R32(w1)                   C FIXME: dead in last iteration
196          add       %rax, w3
197          mov       24(up,i,8), %rax
198          mov       w2, 16(rp,i,8)
199          adc       %rdx, w0            C FIXME: dead in last iteration
200          add       $4, i
201          js        L(m2tp)             C loop while i is still negative
202
C Top result limb of the mul_2 pass.  rax holds the U limb loaded from
C 24(up,i,8) just before i was bumped.  Only the low 64 bits of its product
C with v0 are kept, so a two-operand imul is used.  i is zero here, which is
C why I() can select the address without the index register.
203L(m2ed):imul        v0, %rax
204          add       w3, %rax
205          mov       %rax, I(-8(rp),-8(rp,i,8))
206
C Step to the next pair of V limbs.  up drops by two limbs while n rises by
C two, so (up,n,8) keeps addressing U[0] while (rp,n,8) moves up two limbs.
207          add       $2, n
208          lea       16(vp), vp
209          lea       -16(up), up
210          cmp       $-2, n
211          jge       L(cor1)             C taken only for an original count of 4 (n is now -2)
212
C The addmul_2 passes use X0 and X1, so r14 and r15 are saved first.
213          push      %r14
214          push      %r15
215
C One pass per pair of V limbs.  On entry (up,n,8) addresses U[0] and (rp,n,8)
C addresses the lowest result limb that this pass updates.  The first product
C U[0] times v0 is formed here, then the two low bits of n select one of the
C four entry points of the unrolled loop, each of which also forms U[0] times
C v1 and picks up the first limb already stored at rp.
216L(outer):
217          mov       (vp), v0
218          mov       8(vp), v1
219          mov       (up,n,8), %rax
220          mul       v0                  C rdx:rax = U[0] * v0
221          test      $1, R8(n)
222          jnz       L(a1x1)
223
C n even: low limb of U[0]*v0 in X1, high limb in X0.
224L(a1x0):mov         %rax, X1
225          MOV(      %rdx, X0, 8)
226          mov       (up,n,8), %rax
227          mul       v1                  C rdx:rax = U[0] * v1
228          test      $2, R8(n)
229          jnz       L(a110)
230
C Both low bits of n clear: enter at lo0 with i = n.
231L(a100):lea         (n), i
232          mov       (rp,n,8), w3        C limb already at rp
233          mov       %rax, w0
234          MOV(      %rdx, w1, 16)
235          jmp       L(lo0)
236
C Bit 1 set, bit 0 clear: enter at lo2 with i = n+2 and U[1] preloaded.
237L(a110):lea         2(n), i
238          mov       (rp,n,8), w1        C limb already at rp
239          mov       %rax, w2
240          mov       8(up,n,8), %rax
241          MOV(      %rdx, w3, 1)
242          jmp       L(lo2)
243
C n odd: low limb of U[0]*v0 in X0, high limb in X1.
244L(a1x1):mov         %rax, X0
245          MOV(      %rdx, X1, 2)
246          mov       (up,n,8), %rax
247          mul       v1                  C rdx:rax = U[0] * v1
248          test      $2, R8(n)
249          jz        L(a111)
250
C Both low bits of n set: enter at lo1 with i = n+1.
251L(a101):lea         1(n), i
252          MOV(      %rdx, w0, 4)
253          mov       (rp,n,8), w2        C limb already at rp
254          mov       %rax, w3
255          jmp       L(lo1)
256
C Bit 0 set, bit 1 clear: enter at lo3 with i = n-1 and U[1] preloaded.
257L(a111):lea         -1(n), i
258          MOV(      %rdx, w2, 64)
259          mov       %rax, w1
260          mov       (rp,n,8), w0        C limb already at rp
261          mov       8(up,n,8), %rax
262          jmp       L(lo3)
263
C addmul_2 inner loop, unrolled four times, with entry points lo2, lo1, lo0
C and lo3.  Two carry chains run side by side: w0-w3 collect the products by
C v1 together with the limbs read back from rp, and X0/X1 collect the products
C by v0 together with the output of the w chain.  One result limb is written
C per quarter.  Each MOV sits between a mul and an adc or inside an adc pair;
C that is safe because neither mov nor lea changes the flags.
264          ALIGNx
265L(top):   mul       v1
266          add       w0, w1
267          adc       %rax, w2
268          mov       -8(up,i,8), %rax
269          MOV(      %rdx, w3, 1)
270          adc       $0, w3
271L(lo2):   mul       v0
272          add       w1, X1
273          mov       X1, -16(rp,i,8)     C result limb at index i-2 done
274          adc       %rax, X0
275          MOV(      %rdx, X1, 2)
276          adc       $0, X1
277          mov       -8(up,i,8), %rax
278          mul       v1
279          MOV(      %rdx, w0, 4)
280          mov       -8(rp,i,8), w1
281          add       w1, w2
282          adc       %rax, w3
283          adc       $0, w0
284L(lo1):   mov       (up,i,8), %rax
285          mul       v0
286          add       w2, X0
287          adc       %rax, X1
288          mov       X0, -8(rp,i,8)      C result limb at index i-1 done
289          MOV(      %rdx, X0, 8)
290          adc       $0, X0
291          mov       (up,i,8), %rax
292          mov       (rp,i,8), w2
293          mul       v1
294          add       w2, w3
295          adc       %rax, w0
296          MOV(      %rdx, w1, 16)
297          adc       $0, w1
298L(lo0):   mov       8(up,i,8), %rax
299          mul       v0
300          add       w3, X1
301          mov       X1, (rp,i,8)        C result limb at index i done
302          adc       %rax, X0
303          MOV(      %rdx, X1, 32)
304          mov       8(rp,i,8), w3
305          adc       $0, X1
306          mov       8(up,i,8), %rax
307          mul       v1
308          add       w3, w0
309          MOV(      %rdx, w2, 64)
310          adc       %rax, w1
311          mov       16(up,i,8), %rax
312          adc       $0, w2
313L(lo3):   mul       v0
314          add       w0, X0
315          mov       X0, 8(rp,i,8)       C result limb at index i+1 done
316          MOV(      %rdx, X0, 128)
317          adc       %rax, X1
318          mov       16(up,i,8), %rax    C preload U limb for L(top) or for the wind-down
319          mov       16(rp,i,8), w0      C preload rp limb likewise
320          adc       $0, X0
321          add       $4, i
322          jnc       L(top)              C no carry out: i is still negative
323
C Wind-down of an addmul_2 pass: finish the two highest result limbs.  The
C last two products feed only the top limb, so two-operand imul (low 64 bits)
C is used and the final additions ignore carry out.  i is zero here, which is
C why I() can select the addresses without the index register.
324L(end):   imul      v1, %rax            C rax was preloaded with a U limb at lo3
325          add       w0, w1
326          adc       %rax, w2
327          mov       I(-8(up),-8(up,i,8)), %rax
328          imul      v0, %rax
329          add       w1, X1
330          mov       X1, I(-16(rp),-16(rp,i,8))
331          adc       X0, %rax
332          mov       I(-8(rp),-8(rp,i,8)), w1
333          add       w1, w2
334          add       w2, %rax
335          mov       %rax, I(-8(rp),-8(rp,i,8))
336
C Step to the next pair of V limbs, exactly as after the mul_2 pass.  -n is
C the number of V limbs not yet used.
337          add       $2, n
338          lea       16(vp), vp
339          lea       -16(up), up
340          cmp       $-2, n
341          jl        L(outer)            C more than two V limbs left
342
343          pop       %r15                C pop leaves the flags of the cmp intact
344          pop       %r14
345
C n is -2 or -1 here.  Not equal to -2 means one V limb is left; rax still
C holds the top result limb stored above, which L(cor0) relies on.
346          jnz       L(cor0)
347
C Finish when two V limbs are left (n == -2).  -16(up) and -8(up) address
C U[0] and U[1]; -16(rp) and -8(rp) are the two highest result limbs.
C   second highest limb += low half of U[0]*v0
C   highest limb        += high half of U[0]*v0 + carry
C                          + low half of U[1]*v0 + low half of U[0]*v1
C Reached both from the mul_2 pass (before r14/r15 are pushed) and by falling
C out of the outer loop (after they are popped), so only four registers are
C restored here.  rbx and rcx serve as plain scratch.
348L(cor1):mov         (vp), v0
349          mov       8(vp), v1
350          mov       -16(up), %rax
351          mul       v0                            C u0 x v2
352          add       -16(rp), %rax                 C FIXME: rp[0] still available in reg?
353          adc       -8(rp), %rdx                  C FIXME: rp[1] still available in reg?
354          mov       -8(up), %rbx
355          imul      v0, %rbx
356          mov       -16(up), %rcx
357          imul      v1, %rcx
358          mov       %rax, -16(rp)
359          add       %rbx, %rcx
360          add       %rdx, %rcx
361          mov       %rcx, -8(rp)
362          pop       %r13
363          pop       %r12
364          pop       %rbp
365          pop       %rbx
366          FUNC_EXIT()
367          ret
368
C Finish when one V limb is left (n == -1).  -8(up) addresses U[0].  rax
C still holds the highest result limb as stored at the end of L(end); the low
C half of U[0] times the last V limb is added to it and stored back.
369L(cor0):mov         (vp), %r11
370          imul      -8(up), %r11
371          add       %rax, %r11
372          mov       %r11, -8(rp)
373          pop       %r13
374          pop       %r12
375          pop       %rbp
376          pop       %rbx
377          FUNC_EXIT()
378          ret
379
C Limb counts below 4, as straight-line code.  Nothing was pushed on this
C path.  On entry rax = U[0] and vp (r8) holds a copy of the V pointer; the
C code below writes only rax, rcx, rdx and r9-r11.
380          ALIGN(16)
381L(small):
382          cmp       $2, n_param
383          jae       L(gt1)
C Count below 2, presumably always 1 (a zero count is not tested for):
C R[0] = low half of U[0]*V[0].
384L(n1):    imul      (vp_param), %rax
385          mov       %rax, (rp)
386          FUNC_EXIT()
387          ret
388L(gt1):   ja        L(gt2)              C flags still from the cmp with 2
C Count 2: R[0] = low half of U[0]*V[0],
C R[1] = high half of U[0]*V[0] + low halves of U[1]*V[0] and U[0]*V[1].
389L(n2):    mov       (vp_param), %r9
390          mul       %r9
391          mov       %rax, (rp)
392          mov       8(up), %rax
393          imul      %r9, %rax
394          add       %rax, %rdx
395          mov       8(vp), %r9          C through the r8 copy, rdx no longer holds the pointer
396          mov       (up), %rcx
397          imul      %r9, %rcx
398          add       %rcx, %rdx
399          mov       %rdx, 8(rp)
400          FUNC_EXIT()
401          ret
C Count 3: r10 accumulates R[1] and r9 accumulates R[2].  Products that feed
C only R[2] use imul because just their low half is needed.
402L(gt2):
403L(n3):    mov       (vp_param), %r9
404          mul       %r9                 C u0 x v0
405          mov       %rax, (rp)
406          mov       %rdx, %r10
407          mov       8(up), %rax
408          mul       %r9                 C u1 x v0
409          imul      16(up), %r9         C u2 x v0
410          add       %rax, %r10
411          adc       %rdx, %r9
412          mov       8(vp), %r11
413          mov       (up), %rax
414          mul       %r11                C u0 x v1
415          add       %rax, %r10
416          adc       %rdx, %r9
417          imul      8(up), %r11         C u1 x v1
418          add       %r11, %r9
419          mov       %r10, 8(rp)
420          mov       16(vp), %r10
421          mov       (up), %rax
422          imul      %rax, %r10          C u0 x v2
423          add       %r10, %r9
424          mov       %r9, 16(rp)
425          FUNC_EXIT()
426          ret
427EPILOGUE()
428