1dnl  Intel P6 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.
2
3dnl  Copyright 1999-2002, 2005 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33
34C                                 cycles/limb
35C P5
36C P6 model 0-8,10-12                     6.44
37C P6 model 9  (Banias)                   6.15
38C P6 model 13 (Dothan)                   6.11
39C P4 model 0  (Willamette)
40C P4 model 1  (?)
41C P4 model 2  (Northwood)
42C P4 model 3  (Prescott)
43C P4 model 4  (Nocona)
44C AMD K6
45C AMD K7
46C AMD K8
47
48
49dnl  P6 UNROLL_COUNT cycles/limb
50dnl          8           6.7
51dnl         16           6.35
52dnl         32           6.3
53dnl         64           6.3
54dnl  Maximum possible with the current code is 64.
dnl
dnl  UNROLL_COUNT is the number of limbs handled by one pass of the unrolled
dnl  loop at L(top).  The loop body is generated by a forloop that emits
dnl  UNROLL_COUNT/CHUNK_COUNT chunks of CHUNK_COUNT (2) limbs each.
dnl
dnl  NOTE(review): UNROLL_LOG2, UNROLL_MASK and UNROLL_BYTES are used further
dnl  down but are not defined in this file; presumably they are derived from
dnl  UNROLL_COUNT by macros pulled in through config.m4 -- confirm.
55
56deflit(UNROLL_COUNT, 16)
57
58
dnl  Select which operation this source is being assembled as.  One of
dnl  OPERATION_addmul_1 or OPERATION_submul_1 must be defined (presumably by
dnl  the build system); if neither is, m4_error is invoked with the message
dnl  below.
dnl
dnl    M4_inst          instruction applied to each dst limb (addl or subl)
dnl    M4_function_1    name of the entry point without a carry-in
dnl    M4_function_1c   name of the entry point taking an initial carry limb
dnl    M4_description   wording used in the function description comment
dnl    M4_desc_retval   wording used in the return value comment
59ifdef(`OPERATION_addmul_1', `
60          define(M4_inst,        addl)
61          define(M4_function_1,  mpn_addmul_1)
62          define(M4_function_1c, mpn_addmul_1c)
63          define(M4_description, add it to)
64          define(M4_desc_retval, carry)
65',`ifdef(`OPERATION_submul_1', `
66          define(M4_inst,        subl)
67          define(M4_function_1,  mpn_submul_1)
68          define(M4_function_1c, mpn_submul_1c)
69          define(M4_description, subtract it from)
70          define(M4_desc_retval, borrow)
71',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
72')')')
73
dnl  Lists the four entry points this one source can be built into (two per
dnl  OPERATION_ setting, see the ifdef selecting M4_function_1/M4_function_1c).
dnl  NOTE(review): MULFUNC_PROLOGUE is defined outside this file; how the
dnl  list is consumed should be confirmed against that definition.
74MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
75
76
77C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
78C                            mp_limb_t mult);
79C mp_limb_t M4_function_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
80C                             mp_limb_t mult, mp_limb_t carry);
81C
82C Calculate src,size multiplied by mult and M4_description dst,size.
83C Return the M4_desc_retval limb from the top of the result.
84C
85C This code is pretty much the same as the K6 code.  The unrolled loop is
86C the same, but there are just a few scheduling tweaks in the setups and the
87C simple loop.
88C
89C A number of variations have been tried for the unrolled loop, with one or
90C two carries, and with loads scheduled earlier, but nothing faster than 6
91C cycles/limb has been found.
92
dnl  UNROLL_THRESHOLD is the smallest size that takes the unrolled loop: the
dnl  entry code compares size against it and branches to L(unroll) on an
dnl  unsigned greater-or-equal, so smaller sizes run the simple loop.
dnl
dnl  Both the PIC and non-PIC branches currently define the same value (5);
dnl  the ifdef presumably exists so the two builds can be tuned separately.
93ifdef(`PIC',`
94deflit(UNROLL_THRESHOLD, 5)
95',`
96deflit(UNROLL_THRESHOLD, 5)
97')
98
dnl  Stack offsets of the arguments, in the same order as the C prototypes
dnl  above: dst, src, size, mult, carry at 4, 8, 12, 16, 20.  PARAM_CARRY is
dnl  read only by the M4_function_1c entry point.
dnl
dnl  NOTE(review): defframe is defined outside this file.  The deflit of
dnl  FRAME after each push in the code below suggests these offsets are
dnl  adjusted by the current FRAME value when used -- confirm.
99defframe(PARAM_CARRY,     20)
100defframe(PARAM_MULTIPLIER,16)
101defframe(PARAM_SIZE,      12)
102defframe(PARAM_SRC,       8)
103defframe(PARAM_DST,       4)
104
105          TEXT
106          ALIGN(32)
107
C M4_function_1c -- entry point with a caller-supplied initial carry limb.
C
C Saves ebx, loads the carry argument into it and jumps into the shared
C body at L(start_nc) inside M4_function_1.  The push here takes the place
C of the push at the top of M4_function_1, so both entry points arrive at
C L(start_nc) with FRAME set to 4 and the initial carry in ebx.
C
C There is no ret in this stub; the function returns through one of the
C two exit sequences of the shared body.
108PROLOGUE(M4_function_1c)
109          pushl     %ebx                C saved here, restored before ret
110deflit(`FRAME',4)
111          movl      PARAM_CARRY, %ebx   C ebx = initial carry
112          jmp       L(start_nc)         C continue in the shared body
113EPILOGUE()
114
C M4_function_1 -- entry point without carry-in, plus the setup shared with
C M4_function_1c from L(start_nc) onwards.
C
C Registers once the setup below has run:
C   ebx  carry limb (zero from here, or the carry argument from the 1c entry)
C   ecx  size
C   esi  src
C   edi  dst
C   ebp  multiplier
C
C ebx, esi, edi and ebp are pushed here and popped again on both exit
C paths.  Parameter loads are interleaved with the pushes, and each deflit
C of FRAME follows the push it accounts for.
C
C Dispatch: size below UNROLL_THRESHOLD falls through to L(simple),
C otherwise control goes to L(unroll).  The comparison is unsigned (jae).
115PROLOGUE(M4_function_1)
116          push      %ebx
117deflit(`FRAME',4)
118          xorl      %ebx, %ebx          C initial carry
119
120L(start_nc):
121          movl      PARAM_SIZE, %ecx
122          pushl     %esi
123deflit(`FRAME',8)
124
125          movl      PARAM_SRC, %esi
126          pushl     %edi
127deflit(`FRAME',12)
128
129          movl      PARAM_DST, %edi
130          pushl     %ebp
131deflit(`FRAME',16)
132          cmpl      $UNROLL_THRESHOLD, %ecx       C flags consumed by the jae below
133
134          movl      PARAM_MULTIPLIER, %ebp        C movl leaves the flags untouched
135          jae       L(unroll)
136
137
138          C simple loop
139          C this is offset 0x22, so close enough to aligned
          C
          C One limb per iteration:
          C   edx:eax = src limb * multiplier + carry
          C   dst limb is updated with eax by M4_inst (addl or subl)
          C   carry   = edx + carry or borrow out of the M4_inst
          C
          C The body runs before the counter is tested, so size is assumed
          C to be at least 1 here.
140L(simple):
141          C eax     scratch
142          C ebx     carry
143          C ecx     counter
144          C edx     scratch
145          C esi     src
146          C edi     dst
147          C ebp     multiplier
148
149          movl      (%esi), %eax
150          addl      $4, %edi            C dst advanced early, hence -4 below
151
152          mull      %ebp
153
154          addl      %ebx, %eax          C add incoming carry to the low half
155          adcl      $0, %edx            C and propagate into the high half
156
157          M4_inst   %eax, -4(%edi)
158          movl      %edx, %ebx          C movl keeps the flags from M4_inst
159
160          adcl      $0, %ebx            C fold the M4_inst carry into the carry limb
161          decl      %ecx                C sets ZF for the jnz
162
163          leal      4(%esi), %esi       C leal steps src without touching flags
164          jnz       L(simple)
165
166
          C restore the saved registers in reverse push order and return
          C the final carry limb in eax
167          popl      %ebp
168          popl      %edi
169
170          popl      %esi
171          movl      %ebx, %eax
172
173          popl      %ebx
174          ret
175
176
177
178C------------------------------------------------------------------------------
179C VAR_JUMP holds the computed jump temporarily because there's not enough
180C registers when doing the mul for the initial two carry limbs.
181C
182C The add/adc for the initial carry in %ebx is necessary only for the
183C mpn_add/submul_1c entry points.  Duplicating the startup code to
184C eliminate this for the plain mpn_add/submul_1 doesn't seem like a good
185C idea.
C
C Overview of the setup below.  The first src limb is multiplied here,
C outside the loop, to form the initial low/high carry pair.  The remaining
C size-1 products are done by the unrolled code, which is entered part way
C through its first pass by a computed jump so that the work ends exactly
C at the end of a pass.
C
C   VAR_COUNTER = (size-2) >> UNROLL_LOG2      further passes after the first
C   skip        = (-(size-1)) & UNROLL_MASK    limbs skipped in the first pass
C   jump target = L(entry) + 16*skip - skip    that is 15 code bytes per limb
C   esi         = src + 4 - 4*skip             plus 128 if UNROLL_BYTES is 256
C   edi         = dst - 4*skip                 plus 128 if UNROLL_BYTES is 256
C
C After the second negl, ecx holds -skip, which is what the two leal
C address calculations and the testl use.  The low bit of -skip equals
C the low bit of skip, so the testl detects an odd skip count; in that
C case the jump lands on the second limb of a two-limb chunk, where ebx
C and ecx have the opposite roles, and the cmovnz pair swaps them.
186
187dnl  overlapping with parameters already fetched
188define(VAR_COUNTER,`PARAM_SIZE')
189define(VAR_JUMP,   `PARAM_DST')
190
191          C this is offset 0x43, so close enough to aligned
192L(unroll):
193          C eax
194          C ebx     initial carry
195          C ecx     size
196          C edx
197          C esi     src
198          C edi     dst
199          C ebp
200
201          movl      %ecx, %edx
202          decl      %ecx
203
204          subl      $2, %edx
205          negl      %ecx
206
207          shrl      $UNROLL_LOG2, %edx
208          andl      $UNROLL_MASK, %ecx
209
210          movl      %edx, VAR_COUNTER
211          movl      %ecx, %edx
212
213          C 15 code bytes per limb
214ifdef(`PIC',`
215          call      L(pic_calc)
216L(here):
217',`
218          shll      $4, %edx
219          negl      %ecx
220
221          leal      L(entry) (%edx,%ecx,1), %edx
222')
223          movl      (%esi), %eax                  C src low limb
224
          C mull overwrites edx, so park the jump target in its stack slot
225          movl      %edx, VAR_JUMP
226          leal      ifelse(UNROLL_BYTES,256,128+) 4(%esi,%ecx,4), %esi
227
228          mull      %ebp
229
230          addl      %ebx, %eax          C initial carry (from _1c)
231          adcl      $0, %edx
232
233          movl      %edx, %ebx          C high carry
234          leal      ifelse(UNROLL_BYTES,256,128) (%edi,%ecx,4), %edi
235
236          movl      VAR_JUMP, %edx
          C ZF from the testl survives the movl and is used by both cmovnz
237          testl     $1, %ecx
238          movl      %eax, %ecx          C low carry
239
240          cmovnz(   %ebx, %ecx)         C high,low carry other way around
241          cmovnz(   %eax, %ebx)
242
243          jmp       *%edx
244
245
C PIC helper, called from L(unroll) with edx and ecx both holding the skip
C count.  It produces the same jump target as the non-PIC leal, but built
C from the return address on the stack, which is the address of L(here),
C rather than from an absolute L(entry):
C
C   edx = 16*skip - skip + (L(entry) - L(here)) + return address
C
C ecx is left negated, matching what the non-PIC path leaves in ecx for
C the code following L(here).
C
C NOTE(review): ret_internal is defined outside this file; presumably it
C is the return used for calls within a function -- confirm.
246ifdef(`PIC',`
247L(pic_calc):
248          shll      $4, %edx
249          negl      %ecx
250
251          C See mpn/x86/README about old gas bugs
252          leal      (%edx,%ecx,1), %edx
253          addl      $L(entry)-L(here), %edx
254
255          addl      (%esp), %edx
256
257          ret_internal
258')
259
260
261C -----------------------------------------------------------
262          ALIGN(32)
263L(top):
264deflit(`FRAME',16)
265          C eax     scratch
266          C ebx     carry hi
267          C ecx     carry lo
268          C edx     scratch
269          C esi     src
270          C edi     dst
271          C ebp     multiplier
272          C
273          C VAR_COUNTER       loop counter
274          C
275          C 15 code bytes per limb
          C
          C Each pass covers UNROLL_COUNT limbs as UNROLL_COUNT/CHUNK_COUNT
          C chunks of two limbs.  Within a chunk ebx and ecx alternate
          C roles: in the first half ecx is applied to dst and ebx
          C accumulates the new product, in the second half it is the other
          C way around.
          C
          C In each half the carry or borrow out of M4_inst is picked up by
          C the adcl that adds the product low limb, and the carry out of
          C that adcl is added to the product high limb by the adcl of zero.
          C
          C The setup biased esi by one limb, so src is read one limb ahead
          C of the dst limb being updated; the first product was formed
          C before entering the loop.
          C
          C edi is advanced here at the top, which the first, computed
          C entry at L(entry) skips; esi is advanced at the bottom.
          C
          C NOTE(review): Zdisp is defined outside this file; presumably it
          C keeps the instruction length fixed when the displacement is
          C zero, as the 15 bytes per limb jump arithmetic needs -- confirm.
          C Likewise the -128 bias applied when UNROLL_BYTES is 256
          C presumably keeps every displacement within a signed byte.
276
277          addl      $UNROLL_BYTES, %edi
278
279L(entry):
280deflit(CHUNK_COUNT,2)
281forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
282          deflit(`disp0', eval(i*4*CHUNK_COUNT ifelse(UNROLL_BYTES,256,-128)))
283          deflit(`disp1', eval(disp0 + 4))
284
285Zdisp(    movl,     disp0,(%esi), %eax)
286          mull      %ebp
287Zdisp(    M4_inst,%ecx, disp0,(%edi))
288          adcl      %eax, %ebx
289          movl      %edx, %ecx
290          adcl      $0, %ecx
291
292          movl      disp1(%esi), %eax
293          mull      %ebp
294          M4_inst   %ebx, disp1(%edi)
295          adcl      %eax, %ecx
296          movl      %edx, %ebx
297          adcl      $0, %ebx
298')
299
          C decl sets the sign flag for the jns; the leal in between does
          C not modify flags.  The loop makes VAR_COUNTER+1 passes in all.
300          decl      VAR_COUNTER
301          leal      UNROLL_BYTES(%esi), %esi
302
303          jns       L(top)
304
305
          C Tail: ecx still holds a low limb that has not been applied.
          C edi was not advanced after the last pass, so that limb goes to
          C the dst limb just past the ones covered by that pass.
306deflit(`disp0',     eval(UNROLL_BYTES ifelse(UNROLL_BYTES,256,-128)))
307
308          M4_inst   %ecx, disp0(%edi)
309          movl      %ebx, %eax
310
          C movl and popl leave the carry flag from the M4_inst above
          C intact, so the final adcl can follow the register restores.
311          popl      %ebp
312          popl      %edi
313
314          popl      %esi
315          popl      %ebx
316          adcl      $0, %eax            C return value = high limb + last carry
317
318          ret
319
320EPILOGUE()
321