1dnl  AMD K6 mpn_sqr_basecase -- square an mpn number.
2
3dnl  Copyright 1999-2002 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33
34C K6: approx 4.7 cycles per cross product, or 9.2 cycles per triangular
35C     product (measured on the speed difference between 17 and 33 limbs,
36C     which is roughly the Karatsuba recursing range).
37
38
39dnl  SQR_TOOM2_THRESHOLD_MAX is the maximum SQR_TOOM2_THRESHOLD this
40dnl  code supports.  This value is used only by the tune program to know
41dnl  what it can go up to.  (An attempt to compile with a bigger value will
42dnl  trigger some m4_assert()s in the code, making the build fail.)
43dnl
44dnl  The value is determined by requiring the displacements in the unrolled
45dnl  addmul to fit in single bytes.  This means a maximum UNROLL_COUNT of
46dnl  63, giving a maximum SQR_TOOM2_THRESHOLD of 66.
47
48deflit(SQR_TOOM2_THRESHOLD_MAX, 66)
49
50
51dnl  Allow a value from the tune program to override config.m4.
52
53ifdef(`SQR_TOOM2_THRESHOLD_OVERRIDE',
54`define(`SQR_TOOM2_THRESHOLD',SQR_TOOM2_THRESHOLD_OVERRIDE)')
55
56
57dnl  UNROLL_COUNT is the number of code chunks in the unrolled addmul.  The
58dnl  number required is determined by SQR_TOOM2_THRESHOLD, since
59dnl  mpn_sqr_basecase only needs to handle sizes < SQR_TOOM2_THRESHOLD.
60dnl
61dnl  The first addmul is the biggest, and this takes the second least
62dnl  significant limb and multiplies it by the third least significant and
63dnl  up.  Hence for a maximum operand size of SQR_TOOM2_THRESHOLD-1
64dnl  limbs, UNROLL_COUNT needs to be SQR_TOOM2_THRESHOLD-3.
65
66m4_config_gmp_mparam(`SQR_TOOM2_THRESHOLD')
67deflit(UNROLL_COUNT, eval(SQR_TOOM2_THRESHOLD-3))
68
69
70C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
71C
72C The algorithm is essentially the same as mpn/generic/sqr_basecase.c, but a
73C lot of function call overheads are avoided, especially when the given size
74C is small.
75C
76C The code size might look a bit excessive, but not all of it is executed
77C and so won't fill up the code cache.  The 1x1, 2x2 and 3x3 special cases
78C clearly apply only to those sizes; mid sizes like 10x10 only need part of
79C the unrolled addmul; and big sizes like 35x35 that do need all of it will
80C at least be getting value for money, because 35x35 spends something like
81C 5780 cycles here.
82C
83C Different values of UNROLL_COUNT give slightly different speeds, between
84C 9.0 and 9.2 c/tri-prod measured on the difference between 17 and 33 limbs.
85C This isn't a big difference, but it's presumably some alignment effect
86C which if understood could give a simple speedup.
87
88defframe(PARAM_SIZE,12)
89defframe(PARAM_SRC, 8)
90defframe(PARAM_DST, 4)
91
92          TEXT
93          ALIGN(32)
94PROLOGUE(mpn_sqr_basecase)
95deflit(`FRAME',0)
96
97          movl      PARAM_SIZE, %ecx
98          movl      PARAM_SRC, %eax
99
100          cmpl      $2, %ecx
101          je        L(two_limbs)
102
103          movl      PARAM_DST, %edx
104          ja        L(three_or_more)
105
106
107C -----------------------------------------------------------------------------
108C one limb only
109          C eax     src
110          C ebx
111          C ecx     size
112          C edx     dst
113
114          movl      (%eax), %eax
115          movl      %edx, %ecx
116
117          mull      %eax
118
119          movl      %eax, (%ecx)
120          movl      %edx, 4(%ecx)
121          ret
122
123
124C -----------------------------------------------------------------------------
125          ALIGN(16)
126L(two_limbs):
127          C eax     src
128          C ebx
129          C ecx     size
130          C edx     dst
131
132          pushl     %ebx
133          movl      %eax, %ebx          C src
134deflit(`FRAME',4)
135
136          movl      (%ebx), %eax
137          movl      PARAM_DST, %ecx
138
139          mull      %eax                C src[0]^2
140
141          movl      %eax, (%ecx)
142          movl      4(%ebx), %eax
143
144          movl      %edx, 4(%ecx)
145
146          mull      %eax                C src[1]^2
147
148          movl      %eax, 8(%ecx)
149          movl      (%ebx), %eax
150
151          movl      %edx, 12(%ecx)
152          movl      4(%ebx), %edx
153
154          mull      %edx                C src[0]*src[1]
155
156          addl      %eax, 4(%ecx)
157
158          adcl      %edx, 8(%ecx)
159          adcl      $0, 12(%ecx)
160
161          popl      %ebx
162          addl      %eax, 4(%ecx)
163
164          adcl      %edx, 8(%ecx)
165          adcl      $0, 12(%ecx)
166
167          ret
168
169
170C -----------------------------------------------------------------------------
171L(three_or_more):
172deflit(`FRAME',0)
173          cmpl      $4, %ecx
174          jae       L(four_or_more)
175
176
177C -----------------------------------------------------------------------------
178C three limbs
179          C eax     src
180          C ecx     size
181          C edx     dst
182
183          pushl     %ebx
184          movl      %eax, %ebx          C src
185
186          movl      (%ebx), %eax
187          movl      %edx, %ecx          C dst
188
189          mull      %eax                C src[0] ^ 2
190
191          movl      %eax, (%ecx)
192          movl      4(%ebx), %eax
193
194          movl      %edx, 4(%ecx)
195          pushl     %esi
196
197          mull      %eax                C src[1] ^ 2
198
199          movl      %eax, 8(%ecx)
200          movl      8(%ebx), %eax
201
202          movl      %edx, 12(%ecx)
203          pushl     %edi
204
205          mull      %eax                C src[2] ^ 2
206
207          movl      %eax, 16(%ecx)
208          movl      (%ebx), %eax
209
210          movl      %edx, 20(%ecx)
211          movl      4(%ebx), %edx
212
213          mull      %edx                C src[0] * src[1]
214
215          movl      %eax, %esi
216          movl      (%ebx), %eax
217
218          movl      %edx, %edi
219          movl      8(%ebx), %edx
220
221          pushl     %ebp
222          xorl      %ebp, %ebp
223
224          mull      %edx                C src[0] * src[2]
225
226          addl      %eax, %edi
227          movl      4(%ebx), %eax
228
229          adcl      %edx, %ebp
230
231          movl      8(%ebx), %edx
232
233          mull      %edx                C src[1] * src[2]
234
235          addl      %eax, %ebp
236
237          adcl      $0, %edx
238
239
240          C eax     will be dst[5]
241          C ebx
242          C ecx     dst
243          C edx     dst[4]
244          C esi     dst[1]
245          C edi     dst[2]
246          C ebp     dst[3]
247
248          xorl      %eax, %eax
249          addl      %esi, %esi
250          adcl      %edi, %edi
251          adcl      %ebp, %ebp
252          adcl      %edx, %edx
253          adcl      $0, %eax
254
255          addl      %esi, 4(%ecx)
256          adcl      %edi, 8(%ecx)
257          adcl      %ebp, 12(%ecx)
258
259          popl      %ebp
260          popl      %edi
261
262          adcl      %edx, 16(%ecx)
263
264          popl      %esi
265          popl      %ebx
266
267          adcl      %eax, 20(%ecx)
268          ASSERT(nc)
269
270          ret
271
272
273C -----------------------------------------------------------------------------
274
275defframe(SAVE_EBX,   -4)
276defframe(SAVE_ESI,   -8)
277defframe(SAVE_EDI,   -12)
278defframe(SAVE_EBP,   -16)
279defframe(VAR_COUNTER,-20)
280defframe(VAR_JMP,    -24)
281deflit(STACK_SPACE, 24)
282
283          ALIGN(16)
284L(four_or_more):
285
286          C eax     src
287          C ebx
288          C ecx     size
289          C edx     dst
290          C esi
291          C edi
292          C ebp
293
294C First multiply src[0]*src[1..size-1] and store at dst[1..size].
295C
296C A test was done calling mpn_mul_1 here to get the benefit of its unrolled
297C loop, but this was only a tiny speedup; at 35 limbs it took 24 cycles off
298C a 5780 cycle operation, which is not surprising since the loop here is 8
299C c/l and mpn_mul_1 is 6.25 c/l.
300
301          subl      $STACK_SPACE, %esp  deflit(`FRAME',STACK_SPACE)
302
303          movl      %edi, SAVE_EDI
304          leal      4(%edx), %edi
305
306          movl      %ebx, SAVE_EBX
307          leal      4(%eax), %ebx
308
309          movl      %esi, SAVE_ESI
310          xorl      %esi, %esi
311
312          movl      %ebp, SAVE_EBP
313
314          C eax
315          C ebx     src+4
316          C ecx     size
317          C edx
318          C esi
319          C edi     dst+4
320          C ebp
321
322          movl      (%eax), %ebp        C multiplier
323          leal      -1(%ecx), %ecx      C size-1, and pad to a 16 byte boundary
324
325
326          ALIGN(16)
327L(mul_1):
328          C eax     scratch
329          C ebx     src ptr
330          C ecx     counter
331          C edx     scratch
332          C esi     carry
333          C edi     dst ptr
334          C ebp     multiplier
335
336          movl      (%ebx), %eax
337          addl      $4, %ebx
338
339          mull      %ebp
340
341          addl      %esi, %eax
342          movl      $0, %esi
343
344          adcl      %edx, %esi
345
346          movl      %eax, (%edi)
347          addl      $4, %edi
348
349          loop      L(mul_1)
350
351
352C Addmul src[n]*src[n+1..size-1] at dst[2*n-1...], for each n=1..size-2.
353C
354C The last two addmuls, which are the bottom right corner of the product
355C triangle, are left to the end.  These are src[size-3]*src[size-2,size-1]
356C and src[size-2]*src[size-1].  If size is 4 then it's only these corner
357C cases that need to be done.
358C
359C The unrolled code is the same as mpn_addmul_1(), see that routine for some
360C comments.
361C
362C VAR_COUNTER is the outer loop, running from -(size-4) to -1, inclusive.
363C
364C VAR_JMP is the computed jump into the unrolled code, stepped by one code
365C chunk each outer loop.
366C
367C K6 doesn't do any branch prediction on indirect jumps, which is good
368C actually because it's a different target each time.  The unrolled addmul
369C is about 3 cycles/limb faster than a simple loop, so the 6 cycle cost of
370C the indirect jump is quickly recovered.
371
372
373dnl  This value is also implicitly encoded in a shift and add.
374dnl
375deflit(CODE_BYTES_PER_LIMB, 15)
376
377dnl  With the unmodified &src[size] and &dst[size] pointers, the
378dnl  displacements in the unrolled code fit in a byte for UNROLL_COUNT
379dnl  values up to 31.  Above that an offset must be added to them.
380dnl
381deflit(OFFSET,
382ifelse(eval(UNROLL_COUNT>31),1,
383eval((UNROLL_COUNT-31)*4),
3840))
385
386          C eax
387          C ebx     &src[size]
388          C ecx
389          C edx
390          C esi     carry
391          C edi     &dst[size]
392          C ebp
393
394          movl      PARAM_SIZE, %ecx
395          movl      %esi, (%edi)
396
397          subl      $4, %ecx
398          jz        L(corner)
399
400          movl      %ecx, %edx
401ifelse(OFFSET,0,,
402`         subl      $OFFSET, %ebx')
403
404          shll      $4, %ecx
405ifelse(OFFSET,0,,
406`         subl      $OFFSET, %edi')
407
408          negl      %ecx
409
410ifdef(`PIC',`
411          call      L(pic_calc)
412L(here):
413',`
414          leal      L(unroll_inner_end)-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx
415')
416          negl      %edx
417
418
419          C The calculated jump mustn't be before the start of the available
420          C code.  This is the limitation UNROLL_COUNT puts on the src operand
421          C size, but checked here using the jump address directly.
422          C
423          ASSERT(ae,`
424          movl_text_address( L(unroll_inner_start), %eax)
425          cmpl      %eax, %ecx
426          ')
427
428
429C -----------------------------------------------------------------------------
430          ALIGN(16)
431L(unroll_outer_top):
432          C eax
433          C ebx     &src[size], constant
434          C ecx     VAR_JMP
435          C edx     VAR_COUNTER, limbs, negative
436          C esi     high limb to store
437          C edi     dst ptr, high of last addmul
438          C ebp
439
440          movl      -12+OFFSET(%ebx,%edx,4), %ebp C multiplier
441          movl      %edx, VAR_COUNTER
442
443          movl      -8+OFFSET(%ebx,%edx,4), %eax  C first limb of multiplicand
444
445          mull      %ebp
446
447          testb     $1, %cl
448
449          movl      %edx, %esi          C high carry
450          movl      %ecx, %edx          C jump
451
452          movl      %eax, %ecx          C low carry
453          leal      CODE_BYTES_PER_LIMB(%edx), %edx
454
455          movl      %edx, VAR_JMP
456          leal      4(%edi), %edi
457
458          C A branch-free version of this using some xors was found to be a
459          C touch slower than just a conditional jump, despite the jump
460          C switching between taken and not taken on every loop.
461
462ifelse(eval(UNROLL_COUNT%2),0,
463          jz,jnz)   L(unroll_noswap)
464          movl      %esi, %eax          C high,low carry other way around
465
466          movl      %ecx, %esi
467          movl      %eax, %ecx
468L(unroll_noswap):
469
470          jmp       *%edx
471
472
473          C Must be on an even address here so the low bit of the jump address
474          C will indicate which way around ecx/esi should start.
475          C
476          C An attempt was made at padding here to get the end of the unrolled
477          C code to come out on a good alignment, to save padding before
478          C L(corner).  This worked, but turned out to run slower than just an
479          C ALIGN(2).  The reason for this is not clear, it might be related
480          C to the different speeds on different UNROLL_COUNTs noted above.
481
482          ALIGN(2)
483
484L(unroll_inner_start):
485          C eax     scratch
486          C ebx     src
487          C ecx     carry low
488          C edx     scratch
489          C esi     carry high
490          C edi     dst
491          C ebp     multiplier
492          C
493          C 15 code bytes each limb
494          C ecx/esi swapped on each chunk
495
496forloop(`i', UNROLL_COUNT, 1, `
497          deflit(`disp_src', eval(-i*4 + OFFSET))
498          deflit(`disp_dst', eval(disp_src - 4))
499
500          m4_assert(`disp_src>=-128 && disp_src<128')
501          m4_assert(`disp_dst>=-128 && disp_dst<128')
502
503ifelse(eval(i%2),0,`
504Zdisp(    movl,     disp_src,(%ebx), %eax)
505          mull      %ebp
506Zdisp(    addl,     %esi, disp_dst,(%edi))
507          adcl      %eax, %ecx
508          movl      %edx, %esi
509          jadcl0( %esi)
510',`
511          dnl  this one comes out last
512Zdisp(    movl,     disp_src,(%ebx), %eax)
513          mull      %ebp
514Zdisp(    addl,     %ecx, disp_dst,(%edi))
515          adcl      %eax, %esi
516          movl      %edx, %ecx
517          jadcl0( %ecx)
518')
519')
520L(unroll_inner_end):
521
522          addl      %esi, -4+OFFSET(%edi)
523
524          movl      VAR_COUNTER, %edx
525          jadcl0(   %ecx)
526
527          movl      %ecx, m4_empty_if_zero(OFFSET)(%edi)
528          movl      VAR_JMP, %ecx
529
530          incl      %edx
531          jnz       L(unroll_outer_top)
532
533
534ifelse(OFFSET,0,,`
535          addl      $OFFSET, %ebx
536          addl      $OFFSET, %edi
537')
538
539
540C -----------------------------------------------------------------------------
541          ALIGN(16)
542L(corner):
543          C ebx     &src[size]
544          C edi     &dst[2*size-5]
545
546          movl      -12(%ebx), %ebp
547
548          movl      -8(%ebx), %eax
549          movl      %eax, %ecx
550
551          mull      %ebp
552
553          addl      %eax, -4(%edi)
554          adcl      $0, %edx
555
556          movl      -4(%ebx), %eax
557          movl      %edx, %esi
558          movl      %eax, %ebx
559
560          mull      %ebp
561
562          addl      %esi, %eax
563          adcl      $0, %edx
564
565          addl      %eax, (%edi)
566          adcl      $0, %edx
567
568          movl      %edx, %esi
569          movl      %ebx, %eax
570
571          mull      %ecx
572
573          addl      %esi, %eax
574          movl      %eax, 4(%edi)
575
576          adcl      $0, %edx
577
578          movl      %edx, 8(%edi)
579
580
581C -----------------------------------------------------------------------------
582C Left shift of dst[1..2*size-2], the bit shifted out becomes dst[2*size-1].
583C The loop measures about 6 cycles/iteration, though it looks like it should
584C decode in 5.
585
586L(lshift_start):
587          movl      PARAM_SIZE, %ecx
588
589          movl      PARAM_DST, %edi
590          subl      $1, %ecx            C size-1 and clear carry
591
592          movl      PARAM_SRC, %ebx
593          movl      %ecx, %edx
594
595          xorl      %eax, %eax                    C ready for adcl
596
597
598          ALIGN(16)
599L(lshift):
600          C eax
601          C ebx     src (for later use)
602          C ecx     counter, decrementing
603          C edx     size-1 (for later use)
604          C esi
605          C edi     dst, incrementing
606          C ebp
607
608          rcll      4(%edi)
609          rcll      8(%edi)
610          leal      8(%edi), %edi
611          loop      L(lshift)
612
613
614          adcl      %eax, %eax
615
616          movl      %eax, 4(%edi)                 C dst most significant limb
617          movl      (%ebx), %eax                  C src[0]
618
619          leal      4(%ebx,%edx,4), %ebx          C &src[size]
620          subl      %edx, %ecx                    C -(size-1)
621
622
623C -----------------------------------------------------------------------------
624C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ...,
625C src[size-1]^2.  dst[0] hasn't yet been set at all yet, and just gets the
626C low limb of src[0]^2.
627
628
629          mull      %eax
630
631          movl      %eax, (%edi,%ecx,8) C dst[0]
632
633
634          ALIGN(16)
635L(diag):
636          C eax     scratch
637          C ebx     &src[size]
638          C ecx     counter, negative
639          C edx     carry
640          C esi     scratch
641          C edi     dst[2*size-2]
642          C ebp
643
644          movl      (%ebx,%ecx,4), %eax
645          movl      %edx, %esi
646
647          mull      %eax
648
649          addl      %esi, 4(%edi,%ecx,8)
650          adcl      %eax, 8(%edi,%ecx,8)
651          adcl      $0, %edx
652
653          incl      %ecx
654          jnz       L(diag)
655
656
657          movl      SAVE_EBX, %ebx
658          movl      SAVE_ESI, %esi
659
660          addl      %edx, 4(%edi)                 C dst most significant limb
661
662          movl      SAVE_EDI, %edi
663          movl      SAVE_EBP, %ebp
664          addl      $FRAME, %esp
665          ret
666
667
668
669C -----------------------------------------------------------------------------
670ifdef(`PIC',`
671L(pic_calc):
672          C See mpn/x86/README about old gas bugs
673          addl      (%esp), %ecx
674          addl      $L(unroll_inner_end)-L(here)-eval(2*CODE_BYTES_PER_LIMB), %ecx
675          addl      %edx, %ecx
676          ret_internal
677')
678
679
680EPILOGUE()
681