1dnl  mpn_sqr_basecase for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F).
2
3dnl  Copyright 2001, 2002, 2007 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33C TODO:
34C  * Improve ad-hoc outer loop code and register handling.  Some feed-in
35C    scheduling could improve things by several cycles per outer iteration.
36C  * In the Lam3...Lam1 code, keep accumulation operands in registers, without
37C    storing intermediates to rp.
38C  * We might want to keep 32 in a free mm register, since the register form is
39C    3 bytes and the immediate form is 4 bytes.  About 80 bytes to save.
40C  * Look into different loop alignment, we now expand the code about 50 bytes
41C    with possibly needless alignment.
42C  * Use OSP, should solve feed-in latency problems.
43C  * Address relative slowness for un<=3 for Pentium M.  The old code is
44C    considerably faster there.  (1:20/14, 2:34/32, 3:66/57)
45
46C INPUT PARAMETERS
47C rp                sp + 4
48C up                sp + 8
49C un                sp + 12
50
51          TEXT
52          ALIGN(16)
C Entry point.  Loads the three stack arguments into registers and dispatches
C on un: dedicated code for un = 1, 2, 3 and 4, and the general path at L(big)
C for larger un.  The branches test the carry and zero flags of an unsigned
C compare, so un = 0 also takes the L(un1) path.
53PROLOGUE(mpn_sqr_basecase)
54          mov       4(%esp), %edx                 C edx = rp
55          mov       8(%esp), %eax                 C eax = up
56          mov       12(%esp), %ecx                C ecx = un
57
58          cmp       $2, %ecx                      C flags from un - 2
59          jc        L(un1)                        C un < 2
60          jz        L(un2)                        C un = 2
61          cmp       $4, %ecx                      C flags from un - 4
62          jc        L(un3)                        C un = 3
63          jz        L(un4)                        C un = 4
64          jmp       L(big)                        C un > 4
65
C un = 1: square the single limb at up with one integer mul and store the
C 64-bit result as two limbs at rp.  No MMX registers are touched on this path.
66L(un1):   mov       (%eax), %eax                  C eax = up[0]
67          mov       %edx, %ecx                    C ecx = rp, since mul overwrites edx
68          mul       %eax                          C edx:eax = up[0] * up[0]
69          mov       %eax, (%ecx)                  C rp[0] = low limb
70          mov       %edx, 4(%ecx)                 C rp[1] = high limb
71          ret
C un = 2: compute rp[0..3] = {up,2}^2 from three pmuludq products, up[0]^2,
C up[0]*up[1] and up[1]^2.  The cross product is counted twice without forming
C the doubled 64-bit value: it is split into its low 31 bits (shifted left by
C 1) and its upper 33 bits (shifted right by 31), which are added in at the
C rp[1] and rp[2] positions respectively.
72L(un2):   movd      (%eax), %mm0                  C mm0 = up[0]                           un=2
73          movd      (%eax), %mm2                  C mm2 = up[0]                           un=2
74          movd      4(%eax), %mm1                 C mm1 = up[1]                           un=2
75          pmuludq   %mm0, %mm0                    C up[0]^2, 64b weight 0                 un=2
76          pmuludq   %mm1, %mm2                    C up[0]*up[1], 64b weight 32            un=2
77          pmuludq   %mm1, %mm1                    C up[1]^2, 64b weight 64                un=2
78          movd      %mm0, (%edx)                  C rp[0] = low limb of up[0]^2           un=2
79          psrlq     $32, %mm0           C 32b weight 32                         un=2
80          pcmpeqd   %mm7, %mm7                    C mm7 = all ones                        un=2
81          psrlq     $33, %mm7           C 0x000000007FFFFFFF                    un=2
82          pand      %mm2, %mm7                    C low 31b of cross product, weight 32   un=2
83          psrlq     $31, %mm2           C upper 33b of cross product; added at weight 64 below   un=2
84          psllq     $1, %mm7            C 31b weight 33                         un=2
85          paddq     %mm7, %mm0                    C sum for the rp[1] position            un=2
86          movd      %mm0, 4(%edx)                 C rp[1]                                 un=2
87          psrlq     $32, %mm0           C carry into the rp[2] position         un=2
88          paddq     %mm2, %mm1                    C up[1]^2 + upper part of doubled cross product   un=2
89          paddq     %mm0, %mm1                    C plus carry from rp[1]                 un=2
90          movd      %mm1, 8(%edx)                 C rp[2]                                 un=2
91          psrlq     $32, %mm1           C                                       un=2
92          movd      %mm1, 12(%edx)                C rp[3]                                 un=2
93          emms                                    C clear MMX state before returning
94          ret
C un = 3: first pass only.  Stores up[0] * {up[1], up[2]} into rp[1..3]
C (rp[0] is not written here), then advances both rp and up by one limb and
C continues at L(am1), which is outside this view.
95L(un3):   movd      (%eax), %mm7                  C mm7 = up[0], the multiplier           un=3
96          movd      4(%eax), %mm6                 C mm6 = up[1]                           un=3
97          pmuludq   %mm7, %mm6                    C mm6 = up[0] * up[1]                   un=3
98          movd      8(%eax), %mm2                 C mm2 = up[2]                           un=3
99          pmuludq   %mm7, %mm2                    C mm2 = up[0] * up[2]                   un=3
100          movd      %mm6, 4(%edx)                 C rp[1]                                 un=3
101          psrlq     $32, %mm6           C keep the high half as carry           un=3
102          paddq     %mm2, %mm6                    C                                       un=3
103          movd      %mm6, 8(%edx)                 C rp[2]                                 un=3
104          psrlq     $32, %mm6           C                                       un=3
105          movd      %mm6, 12(%edx)                C rp[3]                                 un=3
106          lea       4(%edx), %edx                 C rp++                                  un=3
107          lea       4(%eax), %eax                 C up++                                  un=3
108          jmp       L(am1)
C un = 4: first pass only.  Stores up[0] * {up[1], up[2], up[3]} into rp[1..4]
C (rp[0] is not written here), then advances both rp and up by one limb and
C continues at L(am2).
109L(un4):   movd      (%eax), %mm7                  C mm7 = up[0], the multiplier           un=4
110          movd      4(%eax), %mm6                 C mm6 = up[1]                           un=4
111          pmuludq   %mm7, %mm6                    C mm6 = up[0] * up[1]                   un=4
112          movd      8(%eax), %mm0                 C mm0 = up[2]                           un=4
113          pmuludq   %mm7, %mm0                    C mm0 = up[0] * up[2]                   un=4
114          movd      12(%eax), %mm1                C mm1 = up[3]                           un=4
115          pmuludq   %mm7, %mm1                    C mm1 = up[0] * up[3]                   un=4
116          movd      %mm6, 4(%edx)                 C rp[1]                                 un=4
117          psrlq     $32, %mm6           C keep the high half as carry           un=4
118          paddq     %mm0, %mm6                    C                                       un=4
119          movd      %mm6, 8(%edx)                 C rp[2]                                 un=4
120          psrlq     $32, %mm6           C                                       un=4
121          paddq     %mm1, %mm6                    C                                       un=4
122          movd      %mm6, 12(%edx)                C rp[3]                                 un=4
123          psrlq     $32, %mm6           C                                       un=4
124          movd      %mm6, 16(%edx)                C rp[4]                                 un=4
125          lea       4(%edx), %edx                 C rp++                                  un=4
126          lea       4(%eax), %eax                 C up++                                  un=4
127          jmp       L(am2)
128
C General path, un > 4.  Saves esi, ebx and edi on the stack (the matching
C restores are outside this view), clears the carry accumulator mm6, loads
C up[0] into mm7 as the first multiplier, and sets up two pointer pairs:
C esi/edi hold up+1 and rp+1 for the outer loop, eax/edx hold the same values
C as working pointers for the first pass.  ebx = un - 4 is the inner loop
C count reused by every pass.  The first pass variant is then selected from
C un mod 4.
129L(big):   push      %esi
130          push      %ebx
131          push      %edi
132          pxor      %mm6, %mm6                    C mm6 = 0, running carry
133          movd      (%eax), %mm7                  C mm7 = up[0]
134          lea       4(%eax), %esi                 C init up, up++
135          lea       4(%eax), %eax                 C up2++  FIXME: should fix offsets
136          lea       4(%edx), %edi                 C init rp, rp++
137          lea       4(%edx), %edx                 C rp2++
138          lea       -4(%ecx), %ebx                C loop count = un - 4
139          and       $3, %ecx                      C ecx = un mod 4
140          jz        L(3m)                         C un mod 4 = 0
141          cmp       $2, %ecx
142          ja        L(2m)                         C un mod 4 = 3
143          jb        L(0m)                         C un mod 4 = 1; un mod 4 = 2 falls through to L(1m)
144
C First pass, variant reached by fall-through when un mod 4 = 2.  Multiplies
C successive limbs at eax by mm7 and writes result limbs at edx without
C reading the destination: each product is added into mm6, the low 32 bits
C are stored, and mm6 is shifted right by 32 to carry the rest forward.  The
C loop covers four limbs per iteration, with ecx counting down from ebx in
C steps of 4, and is entered mid-loop at L(m01).  After the loop the pass
C continues at L(0), which is outside this view, with the last product in mm4
C and the running sum in mm6 still to be stored.
145L(1m):
146          movd      (%eax), %mm4                  C                                       m 1
147          lea       (%ebx), %ecx                  C inner loop count            m 1
148          pmuludq   %mm7, %mm4                    C                                       m 1
149          movd      4(%eax), %mm3                 C                                       m 1
150          pmuludq   %mm7, %mm3                    C                                       m 1
151          movd      8(%eax), %mm0                 C                                       m 1
152          jmp       L(m01)                        C enter the loop at L(m01)              m 1
153          ALIGN(16)                     C                                       m 1
154L(lpm1):
155          pmuludq   %mm7, %mm4                    C                                       m 1
156          paddq     %mm0, %mm6                    C                                       m 1
157          movd      4(%eax), %mm3                 C                                       m 1
158          movd      %mm6, -8(%edx)                C                                       m 1
159          psrlq     $32, %mm6           C                                       m 1
160          pmuludq   %mm7, %mm3                    C                                       m 1
161          paddq     %mm1, %mm6                    C                                       m 1
162          movd      8(%eax), %mm0                 C                                       m 1
163          movd      %mm6, -4(%edx)                C                                       m 1
164          psrlq     $32, %mm6           C                                       m 1
165L(m01):   pmuludq   %mm7, %mm0                    C                                       m 1
166          paddq     %mm4, %mm6                    C                                       m 1
167          movd      12(%eax), %mm1                C                                       m 1
168          movd      %mm6, (%edx)                  C                                       m 1
169          psrlq     $32, %mm6           C                                       m 1
170          pmuludq   %mm7, %mm1                    C                                       m 1
171          paddq     %mm3, %mm6                    C                                       m 1
172          movd      16(%eax), %mm4                C                                       m 1
173          movd      %mm6, 4(%edx)                 C                                       m 1
174          psrlq     $32, %mm6           C                                       m 1
175          lea       16(%eax), %eax                C advance source by four limbs          m 1
176          lea       16(%edx), %edx                C advance destination by four limbs     m 1
177          sub       $4, %ecx            C                                       m 1
178          ja        L(lpm1)                       C loop while the count stays above zero m 1
179          pmuludq   %mm7, %mm4                    C wind-down: last product               m 1
180          paddq     %mm0, %mm6                    C                                       m 1
181          movd      %mm6, -8(%edx)                C                                       m 1
182          psrlq     $32, %mm6           C                                       m 1
183          paddq     %mm1, %mm6                    C                                       m 1
184          jmp       L(0)
185
C First pass, variant selected when un mod 4 = 3.  Same scheme as the other
C first-pass variants: limbs at eax are multiplied by mm7, each product is
C added into mm6, the low 32 bits are stored at edx, and mm6 is shifted right
C by 32 to carry the rest forward; the destination is not read.  The loop
C covers four limbs per iteration, with ecx counting down from ebx in steps
C of 4, and is entered mid-loop at L(m10).  After the loop the pass continues
C at L(1), which is outside this view, with the last product in mm4 and the
C running sum in mm6 still to be stored.
186L(2m):
187          movd      (%eax), %mm1                  C                                       m 2
188          lea       (%ebx), %ecx                  C inner loop count            m 2
189          pmuludq   %mm7, %mm1                    C                                       m 2
190          movd      4(%eax), %mm4                 C                                       m 2
191          pmuludq   %mm7, %mm4                    C                                       m 2
192          movd      8(%eax), %mm3                 C                                       m 2
193          jmp       L(m10)                        C enter the loop at L(m10)              m 2
194          ALIGN(16)                     C                                       m 2
195L(lpm2):
196          pmuludq   %mm7, %mm4                    C                                       m 2
197          paddq     %mm0, %mm6                    C                                       m 2
198          movd      8(%eax), %mm3                 C                                       m 2
199          movd      %mm6, -4(%edx)                C                                       m 2
200          psrlq     $32, %mm6           C                                       m 2
201L(m10):   pmuludq   %mm7, %mm3                    C                                       m 2
202          paddq     %mm1, %mm6                    C                                       m 2
203          movd      12(%eax), %mm0                C                                       m 2
204          movd      %mm6, (%edx)                  C                                       m 2
205          psrlq     $32, %mm6           C                                       m 2
206          pmuludq   %mm7, %mm0                    C                                       m 2
207          paddq     %mm4, %mm6                    C                                       m 2
208          movd      16(%eax), %mm1                C                                       m 2
209          movd      %mm6, 4(%edx)                 C                                       m 2
210          psrlq     $32, %mm6           C                                       m 2
211          pmuludq   %mm7, %mm1                    C                                       m 2
212          paddq     %mm3, %mm6                    C                                       m 2
213          movd      20(%eax), %mm4                C                                       m 2
214          movd      %mm6, 8(%edx)                 C                                       m 2
215          psrlq     $32, %mm6           C                                       m 2
216          lea       16(%eax), %eax                C advance source by four limbs          m 2
217          lea       16(%edx), %edx                C advance destination by four limbs     m 2
218          sub       $4, %ecx            C                                       m 2
219          ja        L(lpm2)                       C loop while the count stays above zero m 2
220          pmuludq   %mm7, %mm4                    C wind-down: last product               m 2
221          paddq     %mm0, %mm6                    C                                       m 2
222          movd      %mm6, -4(%edx)                C                                       m 2
223          psrlq     $32, %mm6           C                                       m 2
224          paddq     %mm1, %mm6                    C                                       m 2
225          jmp       L(1)
226
C First pass, variant selected when un mod 4 = 0.  Same scheme as the other
C first-pass variants: limbs at eax are multiplied by mm7, each product is
C added into mm6, the low 32 bits are stored at edx, and mm6 is shifted right
C by 32 to carry the rest forward; the destination is not read.  The loop
C covers four limbs per iteration, with ecx counting down from ebx in steps
C of 4, and is entered at its top.  After the loop the pass continues at L(2)
C with the last product in mm4 and the running sum in mm6 still to be stored.
227L(3m):
228          movd      (%eax), %mm0                  C                                       m 3
229          lea       (%ebx), %ecx                  C inner loop count            m 3
230          pmuludq   %mm7, %mm0                    C                                       m 3
231          movd      4(%eax), %mm1                 C                                       m 3
232          pmuludq   %mm7, %mm1                    C                                       m 3
233          movd      8(%eax), %mm4                 C                                       m 3
234          jmp       L(lpm3)                       C skip the alignment padding            m 3
235          ALIGN(16)                     C                                       m 3
236L(lpm3):
237          pmuludq   %mm7, %mm4                    C                                       m 3
238          paddq     %mm0, %mm6                    C                                       m 3
239          movd      12(%eax), %mm3                C                                       m 3
240          movd      %mm6, (%edx)                  C                                       m 3
241          psrlq     $32, %mm6           C                                       m 3
242          pmuludq   %mm7, %mm3                    C                                       m 3
243          paddq     %mm1, %mm6                    C                                       m 3
244          movd      16(%eax), %mm0                C                                       m 3
245          movd      %mm6, 4(%edx)                 C                                       m 3
246          psrlq     $32, %mm6           C                                       m 3
247          pmuludq   %mm7, %mm0                    C                                       m 3
248          paddq     %mm4, %mm6                    C                                       m 3
249          movd      20(%eax), %mm1                C                                       m 3
250          movd      %mm6, 8(%edx)                 C                                       m 3
251          psrlq     $32, %mm6           C                                       m 3
252          pmuludq   %mm7, %mm1                    C                                       m 3
253          paddq     %mm3, %mm6                    C                                       m 3
254          movd      24(%eax), %mm4                C                                       m 3
255          movd      %mm6, 12(%edx)                C                                       m 3
256          psrlq     $32, %mm6           C                                       m 3
257          lea       16(%eax), %eax                C advance source by four limbs          m 3
258          lea       16(%edx), %edx                C advance destination by four limbs     m 3
259          sub       $4, %ecx            C                                       m 3
260          ja        L(lpm3)                       C loop while the count stays above zero m 3
261          pmuludq   %mm7, %mm4                    C wind-down: last product               m 3
262          paddq     %mm0, %mm6                    C                                       m 3
263          movd      %mm6, (%edx)                  C                                       m 3
264          psrlq     $32, %mm6           C                                       m 3
265          paddq     %mm1, %mm6                    C                                       m 3
266          jmp       L(2)
267
C First pass, variant selected when un mod 4 = 1.  Same scheme as the other
C first-pass variants: limbs at eax are multiplied by mm7, each product is
C added into mm6, the low 32 bits are stored at edx, and mm6 is shifted right
C by 32 to carry the rest forward; the destination is not read.  The loop
C covers four limbs per iteration, with ecx counting down from ebx in steps
C of 4, and is entered mid-loop at L(m00).  After the loop the pass continues
C at L(3), which is outside this view, with the last product in mm4 and the
C running sum in mm6 still to be stored.
268L(0m):
269          movd      (%eax), %mm3                  C                                       m 0
270          lea       (%ebx), %ecx                  C inner loop count            m 0
271          pmuludq   %mm7, %mm3                    C                                       m 0
272          movd      4(%eax), %mm0                 C                                       m 0
273          pmuludq   %mm7, %mm0                    C                                       m 0
274          movd      8(%eax), %mm1                 C                                       m 0
275          jmp       L(m00)                        C enter the loop at L(m00)              m 0
276          ALIGN(16)                     C                                       m 0
277L(lpm0):
278          pmuludq   %mm7, %mm4                    C                                       m 0
279          paddq     %mm0, %mm6                    C                                       m 0
280          movd      (%eax), %mm3                  C                                       m 0
281          movd      %mm6, -12(%edx)               C                                       m 0
282          psrlq     $32, %mm6           C                                       m 0
283          pmuludq   %mm7, %mm3                    C                                       m 0
284          paddq     %mm1, %mm6                    C                                       m 0
285          movd      4(%eax), %mm0                 C                                       m 0
286          movd      %mm6, -8(%edx)                C                                       m 0
287          psrlq     $32, %mm6           C                                       m 0
288          pmuludq   %mm7, %mm0                    C                                       m 0
289          paddq     %mm4, %mm6                    C                                       m 0
290          movd      8(%eax), %mm1                 C                                       m 0
291          movd      %mm6, -4(%edx)                C                                       m 0
292          psrlq     $32, %mm6           C                                       m 0
293L(m00):   pmuludq   %mm7, %mm1                    C                                       m 0
294          paddq     %mm3, %mm6                    C                                       m 0
295          movd      12(%eax), %mm4                C                                       m 0
296          movd      %mm6, (%edx)                  C                                       m 0
297          psrlq     $32, %mm6           C                                       m 0
298          lea       16(%eax), %eax                C advance source by four limbs          m 0
299          lea       16(%edx), %edx                C advance destination by four limbs     m 0
300          sub       $4, %ecx            C                                       m 0
301          ja        L(lpm0)                       C loop while the count stays above zero m 0
302          pmuludq   %mm7, %mm4                    C wind-down: last product               m 0
303          paddq     %mm0, %mm6                    C                                       m 0
304          movd      %mm6, -12(%edx)               C                                       m 0
305          psrlq     $32, %mm6           C                                       m 0
306          paddq     %mm1, %mm6                    C                                       m 0
307          jmp       L(3)
308
C Outer loop head followed by the accumulating pass tagged am 3.  The head
C advances rp (edi) by two limbs and up (esi) by one limb, loads the limb at
C the old esi into mm7 as the multiplier, copies the new pointers into the
C working registers edx and eax, and clears the carry accumulator mm6.
C
C Unlike the first pass, this pass reads the destination: each product of mm7
C with a limb at eax is added to the limb already stored at edx (alternating
C through mm4 and mm5), the sum is added into mm6, the low 32 bits are stored
C back, and mm6 is shifted right by 32 to carry the rest forward.  The loop
C covers four limbs per iteration, with ecx counting down from ebx in steps
C of 4.
C
C L(2) is also the join point for the first pass at L(3m), which arrives with
C its pending values in mm4 and mm6.  After the final store, execution falls
C through into the next pass.
309L(outer):
310          lea       8(%edi), %edi                 C rp += 2
311          movd      (%esi), %mm7                  C next multiplier limb                  am 3
312          mov       %edi, %edx                    C rp2 = rp                              am 3
313          lea       4(%esi), %esi                 C up++                                  am 3
314          lea       (%esi), %eax                  C up2 = up                              am 3
315          movd      (%eax), %mm0                  C                                       am 3
316          lea       (%ebx), %ecx                  C inner loop count            am 3
317          pxor      %mm6, %mm6                    C clear running carry                   am 3
318          pmuludq   %mm7, %mm0                    C                                       am 3
319          movd      4(%eax), %mm1                 C                                       am 3
320          movd      (%edx), %mm4                  C load existing result limb             am 3
321          pmuludq   %mm7, %mm1                    C                                       am 3
322          movd      8(%eax), %mm2                 C                                       am 3
323          paddq     %mm0, %mm4                    C product + existing limb               am 3
324          movd      4(%edx), %mm5                 C                                       am 3
325          jmp       L(lam3)                       C skip the alignment padding            am 3
326          ALIGN(16)                     C                                       am 3
327L(lam3):
328          pmuludq   %mm7, %mm2                    C                                       am 3
329          paddq     %mm4, %mm6                    C                                       am 3
330          movd      12(%eax), %mm3                C                                       am 3
331          paddq     %mm1, %mm5                    C                                       am 3
332          movd      8(%edx), %mm4                 C                                       am 3
333          movd      %mm6, (%edx)                  C                                       am 3
334          psrlq     $32, %mm6           C                                       am 3
335          pmuludq   %mm7, %mm3                    C                                       am 3
336          paddq     %mm5, %mm6                    C                                       am 3
337          movd      16(%eax), %mm0                C                                       am 3
338          paddq     %mm2, %mm4                    C                                       am 3
339          movd      12(%edx), %mm5                C                                       am 3
340          movd      %mm6, 4(%edx)                 C                                       am 3
341          psrlq     $32, %mm6           C                                       am 3
342          pmuludq   %mm7, %mm0                    C                                       am 3
343          paddq     %mm4, %mm6                    C                                       am 3
344          movd      20(%eax), %mm1                C                                       am 3
345          paddq     %mm3, %mm5                    C                                       am 3
346          movd      16(%edx), %mm4                C                                       am 3
347          movd      %mm6, 8(%edx)                 C                                       am 3
348          psrlq     $32, %mm6           C                                       am 3
349          pmuludq   %mm7, %mm1                    C                                       am 3
350          paddq     %mm5, %mm6                    C                                       am 3
351          movd      24(%eax), %mm2                C                                       am 3
352          paddq     %mm0, %mm4                    C                                       am 3
353          movd      20(%edx), %mm5                C                                       am 3
354          movd      %mm6, 12(%edx)                C                                       am 3
355          psrlq     $32, %mm6           C                                       am 3
356          lea       16(%eax), %eax                C advance source by four limbs          am 3
357          lea       16(%edx), %edx                C advance destination by four limbs     am 3
358          sub       $4, %ecx            C                                       am 3
359          ja        L(lam3)                       C loop while the count stays above zero am 3
360          pmuludq   %mm7, %mm2                    C wind-down: last product               am 3
361          paddq     %mm4, %mm6                    C                                       am 3
362          paddq     %mm1, %mm5                    C                                       am 3
363          movd      8(%edx), %mm4                 C                                       am 3
364          movd      %mm6, (%edx)                  C                                       am 3
365          psrlq     $32, %mm6           C                                       am 3
366          paddq     %mm5, %mm6                    C                                       am 3
367          paddq     %mm2, %mm4                    C                                       am 3
368L(2):     movd      %mm6, 4(%edx)                 C join point from L(3m)                 am 3
369          psrlq     $32, %mm6           C                                       am 3
370          paddq     %mm4, %mm6                    C                                       am 3
371          movd      %mm6, 8(%edx)                 C                                       am 3
372          psrlq     $32, %mm6           C                                       am 3
373          movd      %mm6, 12(%edx)                C final carry limb of this pass         am 3
374
375          lea       8(%edi), %edi                 C rp += 2
376          movd      (%esi), %mm7                  C                                       am 2
377          mov       %edi, %edx                    C rp2 = rp                              am 2
378          lea       4(%esi), %esi                 C up++                                  am 2
379          lea       (%esi), %eax                  C up2 = up                              am 2
380          movd      (%eax), %mm1                  C                                       am 2
381          lea       (%ebx), %ecx                  C inner loop count            am 2
382          pxor      %mm6, %mm6                    C                                       am 2
383          pmuludq   %mm7, %mm1                    C                                       am 2
384          movd      4(%eax), %mm2                 C                                       am 2
385          movd      (%edx), %mm5                  C                                       am 2
386          pmuludq   %mm7, %mm2                    C                                       am 2
387          movd      8(%eax), %mm3                 C                                       am 2
388          paddq     %mm1, %mm5                    C                                       am 2
389          movd      4(%edx), %mm4                 C                                       am 2
390          jmp       L(am10)                       C                                       am 2
391          ALIGN(16)                     C                                       am 2
392L(lam2):
393          pmuludq   %mm7, %mm2                    C                                       am 2
394          paddq     %mm4, %mm6                    C                                       am 2
395          movd      8(%eax), %mm3                 C                                       am 2
396          paddq     %mm1, %mm5                    C                                       am 2
397          movd      4(%edx), %mm4                 C                                       am 2
398          movd      %mm6, -4(%edx)                C                                       am 2
399          psrlq     $32, %mm6           C                                       am 2
400L(am10):
401          pmuludq   %mm7, %mm3                    C                                       am 2
402          paddq     %mm5, %mm6                    C                                       am 2
403          movd      12(%eax), %mm0                C                                       am 2
404          paddq     %mm2, %mm4                    C                                       am 2
405          movd      8(%edx), %mm5                 C                                       am 2
406          movd      %mm6, (%edx)                  C                                       am 2
407          psrlq     $32, %mm6           C                                       am 2
408          pmuludq   %mm7, %mm0                    C                                       am 2
409          paddq     %mm4, %mm6                    C                                       am 2
410          movd      16(%eax), %mm1                C                                       am 2
411          paddq     %mm3, %mm5                    C                                       am 2
412          movd      12(%edx), %mm4                C                                       am 2
413          movd      %mm6, 4(%edx)                 C                                       am 2
414          psrlq     $32, %mm6           C                                       am 2
415          pmuludq   %mm7, %mm1                    C                                       am 2
416          paddq     %mm5, %mm6                    C                                       am 2
417          movd      20(%eax), %mm2                C                                       am 2
418          paddq     %mm0, %mm4                    C                                       am 2
419          movd      16(%edx), %mm5                C                                       am 2
420          movd      %mm6, 8(%edx)                 C                                       am 2
421          psrlq     $32, %mm6           C                                       am 2
422          lea       16(%eax), %eax                C                                       am 2
423          lea       16(%edx), %edx                C                                       am 2
424          sub       $4, %ecx            C                                       am 2
425          ja        L(lam2)                       C                                       am 2
426          pmuludq   %mm7, %mm2                    C                                       am 2
427          paddq     %mm4, %mm6                    C                                       am 2
428          paddq     %mm1, %mm5                    C                                       am 2
429          movd      4(%edx), %mm4                 C                                       am 2
430          movd      %mm6, -4(%edx)                C                                       am 2
431          psrlq     $32, %mm6           C                                       am 2
432          paddq     %mm5, %mm6                    C                                       am 2
433          paddq     %mm2, %mm4                    C                                       am 2
434L(1):     movd      %mm6, (%edx)                  C                                       am 2
435          psrlq     $32, %mm6           C                                       am 2
436          paddq     %mm4, %mm6                    C                                       am 2
437          movd      %mm6, 4(%edx)                 C                                       am 2
438          psrlq     $32, %mm6           C                                       am 2
439          movd      %mm6, 8(%edx)                 C                                       am 2
440
C "am 1" pass: multiply the limbs at up2 by the single limb held in mm7 and
C add the products into the limbs at rp2, one 32-bit limb at a time.
C   mm7      multiplier, loaded from (%esi) before up is advanced
C   mm0-mm3  source limbs, turned into 64-bit products by pmuludq
C   mm4,mm5  destination limbs with a product added in
C   mm6      running sum; its low half is stored and psrlq $32 keeps the carry
C   ecx      inner loop count, copied from ebx and reduced by 4 per iteration
C The loop body handles four limbs per iteration with loads, multiplies, adds
C and stores of neighbouring limbs interleaved.  The setup preloads the first
C operands and then jumps into the middle of the body at L(am01).
441          lea       8(%edi), %edi                 C rp += 2
442          movd      (%esi), %mm7                  C                                       am 1
443          mov       %edi, %edx                    C rp2 = rp                              am 1
444          lea       4(%esi), %esi                 C up++                                  am 1
445          lea       (%esi), %eax                  C up2 = up                              am 1
446          movd      (%eax), %mm2                  C                                       am 1
447          lea       (%ebx), %ecx                  C inner loop count            am 1
448          pxor      %mm6, %mm6                    C                                       am 1
449          pmuludq   %mm7, %mm2                    C                                       am 1
450          movd      4(%eax), %mm3                 C                                       am 1
451          movd      (%edx), %mm4                  C                                       am 1
452          pmuludq   %mm7, %mm3                    C                                       am 1
453          movd      8(%eax), %mm0                 C                                       am 1
454          paddq     %mm2, %mm4                    C                                       am 1
455          movd      4(%edx), %mm5                 C                                       am 1
456          jmp       L(am01)                       C                                       am 1
457          ALIGN(16)                     C                                       am 1
458L(lam1):
459          pmuludq   %mm7, %mm2                    C                                       am 1
460          paddq     %mm4, %mm6                    C                                       am 1
461          movd      4(%eax), %mm3                 C                                       am 1
462          paddq     %mm1, %mm5                    C                                       am 1
463          movd      (%edx), %mm4                  C                                       am 1
464          movd      %mm6, -8(%edx)                C                                       am 1
465          psrlq     $32, %mm6           C                                       am 1
466          pmuludq   %mm7, %mm3                    C                                       am 1
467          paddq     %mm5, %mm6                    C                                       am 1
468          movd      8(%eax), %mm0                 C                                       am 1
469          paddq     %mm2, %mm4                    C                                       am 1
470          movd      4(%edx), %mm5                 C                                       am 1
471          movd      %mm6, -4(%edx)                C                                       am 1
472          psrlq     $32, %mm6           C                                       am 1
C Entry point for the first trip through the loop (reached by the jmp above).
473L(am01):
474          pmuludq   %mm7, %mm0                    C                                       am 1
475          paddq     %mm4, %mm6                    C                                       am 1
476          movd      12(%eax), %mm1                C                                       am 1
477          paddq     %mm3, %mm5                    C                                       am 1
478          movd      8(%edx), %mm4                 C                                       am 1
479          movd      %mm6, (%edx)                  C                                       am 1
480          psrlq     $32, %mm6           C                                       am 1
481          pmuludq   %mm7, %mm1                    C                                       am 1
482          paddq     %mm5, %mm6                    C                                       am 1
483          movd      16(%eax), %mm2                C                                       am 1
484          paddq     %mm0, %mm4                    C                                       am 1
485          movd      12(%edx), %mm5                C                                       am 1
486          movd      %mm6, 4(%edx)                 C                                       am 1
487          psrlq     $32, %mm6           C                                       am 1
488          lea       16(%eax), %eax                C                                       am 1
489          lea       16(%edx), %edx                C                                       am 1
C lea leaves the flags alone, so ja below tests the result of this sub:
C keep looping while the count is still above zero.
490          sub       $4, %ecx            C                                       am 1
491          ja        L(lam1)                       C                                       am 1
C Loop finished: drain the products and partial sums that are still in flight.
492          pmuludq   %mm7, %mm2                    C                                       am 1
493          paddq     %mm4, %mm6                    C                                       am 1
494          paddq     %mm1, %mm5                    C                                       am 1
495          movd      (%edx), %mm4                  C                                       am 1
496          movd      %mm6, -8(%edx)                C                                       am 1
497          psrlq     $32, %mm6           C                                       am 1
498          paddq     %mm5, %mm6                    C                                       am 1
499          paddq     %mm2, %mm4                    C                                       am 1
C NOTE(review): L(0) is not referenced in this part of the file; presumably it
C is an entry point used by code elsewhere -- confirm.
500L(0):     movd      %mm6, -4(%edx)                C                                       am 1
501          psrlq     $32, %mm6           C                                       am 1
502          paddq     %mm4, %mm6                    C                                       am 1
503          movd      %mm6, (%edx)                  C                                       am 1
504          psrlq     $32, %mm6           C                                       am 1
C What is left in mm6 is the carry out; it is stored as the top limb.
505          movd      %mm6, 4(%edx)                 C                                       am 1
506
C "am 0" pass: multiply the limbs at up2 by the single limb held in mm7 and
C add the products into the limbs at rp2, one 32-bit limb at a time.
C   mm7      multiplier, loaded from (%esi) before up is advanced
C   mm0-mm3  source limbs, turned into 64-bit products by pmuludq
C   mm4,mm5  destination limbs with a product added in
C   mm6      running sum; its low half is stored and psrlq $32 keeps the carry
C   ecx      inner loop count, copied from ebx and reduced by 4 per iteration
C The loop body handles four limbs per iteration with loads, multiplies, adds
C and stores of neighbouring limbs interleaved.  The setup preloads the first
C operands and then jumps into the last quarter of the body at L(am00).
507          lea       8(%edi), %edi                 C rp += 2
508          movd      (%esi), %mm7                  C                                       am 0
509          mov       %edi, %edx                    C rp2 = rp                              am 0
510          lea       4(%esi), %esi                 C up++                                  am 0
511          lea       (%esi), %eax                  C up2 = up                              am 0
512          movd      (%eax), %mm3                  C                                       am 0
513          lea       (%ebx), %ecx                  C inner loop count            am 0
514          pxor      %mm6, %mm6                    C                                       am 0
515          pmuludq   %mm7, %mm3                    C                                       am 0
516          movd      4(%eax), %mm0                 C                                       am 0
517          movd      (%edx), %mm5                  C                                       am 0
518          pmuludq   %mm7, %mm0                    C                                       am 0
519          movd      8(%eax), %mm1                 C                                       am 0
520          paddq     %mm3, %mm5                    C                                       am 0
521          movd      4(%edx), %mm4                 C                                       am 0
522          jmp       L(am00)                       C                                       am 0
523          ALIGN(16)                     C                                       am 0
524L(lam0):
525          pmuludq   %mm7, %mm2                    C                                       am 0
526          paddq     %mm4, %mm6                    C                                       am 0
527          movd      (%eax), %mm3                  C                                       am 0
528          paddq     %mm1, %mm5                    C                                       am 0
529          movd      -4(%edx), %mm4                C                                       am 0
530          movd      %mm6, -12(%edx)               C                                       am 0
531          psrlq     $32, %mm6           C                                       am 0
532          pmuludq   %mm7, %mm3                    C                                       am 0
533          paddq     %mm5, %mm6                    C                                       am 0
534          movd      4(%eax), %mm0                 C                                       am 0
535          paddq     %mm2, %mm4                    C                                       am 0
536          movd      (%edx), %mm5                  C                                       am 0
537          movd      %mm6, -8(%edx)                C                                       am 0
538          psrlq     $32, %mm6           C                                       am 0
539          pmuludq   %mm7, %mm0                    C                                       am 0
540          paddq     %mm4, %mm6                    C                                       am 0
541          movd      8(%eax), %mm1                 C                                       am 0
542          paddq     %mm3, %mm5                    C                                       am 0
543          movd      4(%edx), %mm4                 C                                       am 0
544          movd      %mm6, -4(%edx)                C                                       am 0
545          psrlq     $32, %mm6           C                                       am 0
C Entry point for the first trip through the loop (reached by the jmp above).
546L(am00):
547          pmuludq   %mm7, %mm1                    C                                       am 0
548          paddq     %mm5, %mm6                    C                                       am 0
549          movd      12(%eax), %mm2                C                                       am 0
550          paddq     %mm0, %mm4                    C                                       am 0
551          movd      8(%edx), %mm5                 C                                       am 0
552          movd      %mm6, (%edx)                  C                                       am 0
553          psrlq     $32, %mm6           C                                       am 0
554          lea       16(%eax), %eax                C                                       am 0
555          lea       16(%edx), %edx                C                                       am 0
C lea leaves the flags alone, so ja below tests the result of this sub:
C keep looping while the count is still above zero.
556          sub       $4, %ecx            C                                       am 0
557          ja        L(lam0)                       C                                       am 0
C Loop finished: drain the products and partial sums that are still in flight.
558          pmuludq   %mm7, %mm2                    C                                       am 0
559          paddq     %mm4, %mm6                    C                                       am 0
560          paddq     %mm1, %mm5                    C                                       am 0
561          movd      -4(%edx), %mm4                C                                       am 0
562          movd      %mm6, -12(%edx)               C                                       am 0
563          psrlq     $32, %mm6           C                                       am 0
564          paddq     %mm5, %mm6                    C                                       am 0
565          paddq     %mm2, %mm4                    C                                       am 0
C NOTE(review): L(3) is not referenced in this part of the file; presumably it
C is an entry point used by code elsewhere -- confirm.
566L(3):     movd      %mm6, -8(%edx)                C                                       am 0
567          psrlq     $32, %mm6           C                                       am 0
568          paddq     %mm4, %mm6                    C                                       am 0
569          movd      %mm6, -4(%edx)                C                                       am 0
570          psrlq     $32, %mm6           C                                       am 0
C What is left in mm6 is the carry out; it is stored as the top limb.
571          movd      %mm6, (%edx)                  C                                       am 0
C ebx is the value each pass copies into its inner loop count.  Reduce it by
C 4 and go back to L(outer) (not in this part of the file) while it is still
C above zero.
572          sub       $4, %ebx            C                                       am 0
573          ja        L(outer)                      C                                       am 0
574
C Outer loop finished.  Copy the current rp and up pointers into edx and eax,
C the registers that L(am3), L(am2) and L(am1) below address through, then pop
C edi, ebx and esi.
C NOTE(review): the matching pushes are not in this part of the file;
C presumably these are the registers saved at function entry -- confirm.
575          mov       %edi, %edx
576          mov       %esi, %eax
577          pop       %edi
578          pop       %ebx
579          pop       %esi
580
581L(am3):   C up[un-1..un-3] x up[un-4]
C Straight-line three-limb multiply-accumulate:
C   mm7          multiplier, the limb at (%eax)
C   mm1,mm2,mm3  the next three source limbs, then their products with mm7
C   mm4,mm5,mm6  three destination limbs, then destination + product
C mm4 also serves as the carry chain: after each store it is shifted down 32
C bits and the next sum is added to it.
582          lea       8(%edx), %edx                 C rp2 += 2
583          movd      (%eax), %mm7
584          movd      4(%eax), %mm1
585          movd      8(%eax), %mm2
586          movd      12(%eax), %mm3
587          movd      (%edx), %mm4
588          pmuludq   %mm7, %mm1
589          movd      4(%edx), %mm5
590          pmuludq   %mm7, %mm2
591          movd      8(%edx), %mm6
592          pmuludq   %mm7, %mm3
593          paddq     %mm1, %mm4
594          paddq     %mm2, %mm5
595          paddq     %mm3, %mm6
596          movd      %mm4, (%edx)
597          psrlq     $32, %mm4
598          paddq     %mm5, %mm4
599          movd      %mm4, 4(%edx)
600          psrlq     $32, %mm4
601          paddq     %mm6, %mm4
602          movd      %mm4, 8(%edx)
603          psrlq     $32, %mm4
C The final carry is written out as a fourth limb; L(am2) below reads it back
C from memory as its 4(%edx) operand.
604          movd      %mm4, 12(%edx)                C FIXME feed through!
C Step the source pointer one limb so L(am2) uses the next limb as multiplier.
605          lea       4(%eax), %eax
606
607L(am2):   C up[un-1..un-2] x up[un-3]
C Straight-line two-limb multiply-accumulate:
C   mm7      multiplier, the limb at (%eax)
C   mm1,mm2  the next two source limbs, then their products with mm7
C   mm4,mm5  two destination limbs, then destination + product
C mm4 also serves as the carry chain: after each store it is shifted down 32
C bits and the next sum is added to it.
608          lea       8(%edx), %edx                 C rp2 += 2
609          movd      (%eax), %mm7
610          movd      4(%eax), %mm1
611          movd      8(%eax), %mm2
612          movd      (%edx), %mm4
613          movd      4(%edx), %mm5
614          pmuludq   %mm7, %mm1
615          pmuludq   %mm7, %mm2
616          paddq     %mm1, %mm4
617          paddq     %mm2, %mm5
618          movd      %mm4, (%edx)
619          psrlq     $32, %mm4
620          paddq     %mm5, %mm4
621          movd      %mm4, 4(%edx)
622          psrlq     $32, %mm4
C The final carry is written out as a third limb; L(am1) below reads it back
C from memory as its (%edx) operand.
623          movd      %mm4, 8(%edx)                 C FIXME feed through!
C Step the source pointer one limb so L(am1) uses the next limb as multiplier.
624          lea       4(%eax), %eax
625
626L(am1):   C up[un-1] x up[un-2]
C Last cross product: one source limb times the multiplier in mm7, added to
C one destination limb.  The low half of the sum goes to (%edx) and the high
C half is stored as the limb above it.
627          lea       8(%edx), %edx                 C rp2 += 2
628          movd      (%eax), %mm7
629          movd      4(%eax), %mm2
630          movd      (%edx), %mm4
631          pmuludq   %mm7, %mm2
632          paddq     %mm2, %mm4
633          movd      %mm4, (%edx)
634          psrlq     $32, %mm4
635          movd      %mm4, 4(%edx)
636
637C *** diag stuff, use elementary code for now
C
C Diagonal pass over the destination.  Each destination limb that is read is
C doubled (psllq $1), and the two halves of src[i]^2 are added at limbs 2i and
C 2i+1:
C   dst[0]        = low(src[0]^2)                      (not read, overwritten)
C   dst[1]        = 2*dst[1] + high(src[0]^2)
C   dst[2i]       = 2*dst[2i]   + low(src[i]^2)  + carry
C   dst[2i+1]     = 2*dst[2i+1] + high(src[i]^2) + carry
C   dst[2*size-1] = high(src[size-1]^2) + carry        (not read, overwritten)
C mm2 carries between limbs: after every store it is shifted down 32 bits.
C mm7 is the constant 0x00000000ffffffff, used to isolate the low half of a
C 64-bit square.
638
C The arguments are read again from the stack at 4/8/12(%esp).
639          mov       4(%esp), %edx                 C rp
640          mov       8(%esp), %eax                 C up
641          mov       12(%esp), %ecx                C un
642
643          movd      (%eax), %mm2
644          pmuludq   %mm2, %mm2                    C src[0]^2
645
C pcmpeqd of a register with itself sets every bit; shifting right by 32
C leaves only the low 32 bits set.
646          pcmpeqd   %mm7, %mm7
647          psrlq     $32, %mm7
648
649          movd      4(%edx), %mm3                 C dst[1]
650
651          movd      %mm2, (%edx)
652          psrlq     $32, %mm2
653
654          psllq     $1, %mm3            C 2*dst[1]
655          paddq     %mm3, %mm2
656          movd      %mm2, 4(%edx)
657          psrlq     $32, %mm2
658
C The loop counts ecx down by one per source limb until it reaches zero, so it
C runs un-2 times; the first limb was handled above and the last one is
C handled after the loop.  A count of zero on entry would wrap around.
C NOTE(review): this relies on un >= 3 here; presumably smaller sizes are dealt
C with before this point -- confirm.
659          sub       $2, %ecx
660
661L(diag):
662          movd      4(%eax), %mm0                 C src limb
663          add       $4, %eax
664          pmuludq   %mm0, %mm0
665          movq      %mm7, %mm1
666          pand      %mm0, %mm1                    C diagonal low
667          psrlq     $32, %mm0           C diagonal high
668
669          movd      8(%edx), %mm3
670          psllq     $1, %mm3            C 2*dst[i]
671          paddq     %mm3, %mm1
672          paddq     %mm1, %mm2
673          movd      %mm2, 8(%edx)
674          psrlq     $32, %mm2
675
676          movd      12(%edx), %mm3
677          psllq     $1, %mm3            C 2*dst[i+1]
678          paddq     %mm3, %mm0
679          paddq     %mm0, %mm2
680          movd      %mm2, 12(%edx)
681          add       $8, %edx
682          psrlq     $32, %mm2
683
684          sub       $1, %ecx
685          jnz       L(diag)
686
C Last source limb.  The mask in mm7 is no longer needed, so it is used as the
C destination of the pand.
687          movd      4(%eax), %mm0                 C src[size-1]
688          pmuludq   %mm0, %mm0
689          pand      %mm0, %mm7                    C diagonal low
690          psrlq     $32, %mm0           C diagonal high
691
692          movd      8(%edx), %mm3                 C dst[2*size-2]
693          psllq     $1, %mm3
694          paddq     %mm3, %mm7
695          paddq     %mm7, %mm2
696          movd      %mm2, 8(%edx)
697          psrlq     $32, %mm2
698
C The top limb is not doubled: it gets only the high half of the last square
C plus the carry.
699          paddq     %mm0, %mm2
700          movd      %mm2, 12(%edx)                C dst[2*size-1]
701
C Leave MMX state before returning.
702          emms
703          ret
704
705EPILOGUE()
706