1dnl  AMD64 mpn_mod_34lsub1 -- remainder modulo 2^48-1.
2
3dnl  Copyright 2000-2002, 2004, 2005, 2007, 2010-2012 Free Software Foundation,
4dnl  Inc.
5
6dnl  This file is part of the GNU MP Library.
7dnl
8dnl  The GNU MP Library is free software; you can redistribute it and/or modify
9dnl  it under the terms of either:
10dnl
11dnl    * the GNU Lesser General Public License as published by the Free
12dnl      Software Foundation; either version 3 of the License, or (at your
13dnl      option) any later version.
14dnl
15dnl  or
16dnl
17dnl    * the GNU General Public License as published by the Free Software
18dnl      Foundation; either version 2 of the License, or (at your option) any
19dnl      later version.
20dnl
21dnl  or both in parallel, as here.
22dnl
23dnl  The GNU MP Library is distributed in the hope that it will be useful, but
24dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
26dnl  for more details.
27dnl
28dnl  You should have received copies of the GNU General Public License and the
29dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
30dnl  see https://www.gnu.org/licenses/.
31
32include(`../config.m4')
33
34
35C              cycles/limb
36C AMD K8,K9          1.0
37C AMD K10            1.12
38C Intel P4           3.25
39C Intel core2        1.5
40C Intel corei        1.5
41C Intel atom         2.5
42C VIA nano           1.75
43
44
45C INPUT PARAMETERS
C ap is the source limb pointer (named up in the prototype below) and n is the
C limb count.  The function body in this file refers to %rsi directly rather
C than through n, and it reuses both %rdi and %rsi as scratch registers once
C the incoming values are no longer needed.
46define(`ap',        %rdi)
47define(`n',         %rsi)
48
49C mp_limb_t mpn_mod_34lsub1 (mp_srcptr up, mp_size_t n)
50
51C TODO
52C  * Review feed-in and wind-down code.  In particular, try to avoid adc and
53C    sbb to placate Pentium4.
54C  * It seems possible to reach 2.67 c/l by using a cleaner 6-way unrolling,
55C    without the dual loop exits.
56
C ABI_SUPPORT, ASM_START, TEXT and ALIGN are macros defined outside this file
C (presumably reached through the config.m4 include).
C NOTE(review): DOS64 and STD64 presumably name the Windows and System V
C x86-64 calling conventions this code can be built for - confirm against the
C macro definitions.
57ABI_SUPPORT(DOS64)
58ABI_SUPPORT(STD64)
59
60ASM_START()
61          TEXT
C Alignment request for the function entry that follows.
62          ALIGN(32)
C mpn_mod_34lsub1 (up, n) -- fold the n-limb number at ap into one limb that
C is congruent to it modulo 2^48-1.
C
C Method: limbs are 64 bits wide, and modulo 2^48-1 we have
C     2^64 == 2^16,   2^128 == 2^32,   2^192 == 1.
C Limbs whose index is 0, 1, 2 mod 3 are therefore summed in three separate
C accumulators.  At the end each accumulator is split at the bit position
C that lets both pieces be added at their reduced weight without leaving a
C 48-bit window.
C
C Register use in the n > 2 path:
C     %rax, %rcx, %rdx   sums of the limbs with index 0, 1, 2 mod 3
C     %r10, %r8,  %r9    carry counts out of %rax, %rcx, %rdx respectively
C     %r11               the mask 2^48-1
C     %rsi               limb counter, biased by -2 (see L(end))
C
C The return value in %rax is a plain sum of the pieces with no final
C reduction step, so it can be larger than 2^48-1.
C
C NOTE(review): n == 0 is not handled; the short path loads (ap)
C unconditionally.  Presumably callers guarantee n >= 1 - confirm.
63PROLOGUE(mpn_mod_34lsub1)
C FUNC_ENTRY and FUNC_EXIT are macros defined outside this file.
C NOTE(review): presumably they adapt the 2 arguments to %rdi/%rsi for the
C DOS64 ABI and undo that on exit - confirm against their definitions.
64          FUNC_ENTRY(2)
65
66          mov       $0x0000FFFFFFFFFFFF, %r11
67
C %rsi = n-2.  The flags of this sub are tested twice: by ja here and by jb
C after the load below; the intervening mov and nop do not modify flags.
68          sub       $2, %rsi
69          ja        L(gt2)
70
71          mov       (ap), %rax
C NOTE(review): the purpose of this nop is not evident from the code itself;
C possibly scheduling or alignment padding - confirm before removing.
72          nop
C n < 2: return the first limb as loaded, unreduced.
73          jb        L(1)
74
C n == 2: combine the two limbs directly.  %rsi is reused for src[1].
75          mov       8(ap), %rsi
76          mov       %rax, %rdx
77          shr       $48, %rax           C src[0] high 16 bits
78
79          and       %r11, %rdx                    C src[0] low 48 bits
80          add       %rdx, %rax
81          mov       R32(%rsi), R32(%rdx)
82
83          shr       $32, %rsi           C src[1] high
84          add       %rsi, %rax
85
86          shl       $16, %rdx           C src[1] low
87          add       %rdx, %rax
88
89L(1):     FUNC_EXIT()
90          ret
91
92
93          ALIGN(16)
C n > 2: clear the three sums and the three carry counters.
C R32() is a macro defined elsewhere; presumably it yields the 32-bit register
C name, in which case these xors clear the full 64-bit registers.
94L(gt2):   xor       R32(%rax), R32(%rax)
95          xor       R32(%rcx), R32(%rcx)
96          xor       R32(%rdx), R32(%rdx)
97          xor       %r8, %r8
98          xor       %r9, %r9
99          xor       %r10, %r10
100
C Main loop, two groups of three limbs per iteration.  The first group can
C exit to L(end) with ap not yet advanced.  The second group advances ap by
C 48; on fall-through ap is pulled back by 24.  Either way the next unread
C limb is at 24(ap) when L(end) is reached.
101L(top):   add       (ap), %rax
102          adc       $0, %r10
103          add       8(ap), %rcx
104          adc       $0, %r8
105          add       16(ap), %rdx
106          adc       $0, %r9
107
108          sub       $3, %rsi
109          jng       L(end)
110
111          add       24(ap), %rax
112          adc       $0, %r10
113          add       32(ap), %rcx
114          adc       $0, %r8
115          add       40(ap), %rdx
116          lea       48(ap), ap
117          adc       $0, %r9
118
119          sub       $3, %rsi
120          jg        L(top)
121
122
123          add       $-24, ap
C Here %rsi is -2, -1 or 0, that is (number of unread limbs) - 2.
C Fold each carry counter into the sum it overflows into: carries out of the
C 2mod3 sum have weight 2^192 == 1 and so go into the 0mod3 sum, and so on
C around the ring.  CF after the last adc is a carry out of %rdx.
124L(end):   add       %r9, %rax
125          adc       %r10, %rcx
126          adc       %r8, %rdx
127
C Wind-down for 0, 1 or 2 leftover limbs.  inc and dec set SF for the js
C tests but leave CF alone, and mov does not touch flags, so the pending
C carry flows from one adc into the next and finally into sbb at L(combine).
C %r10 always holds the reduced weight of the carry that will be pending
C when L(combine) is reached: 1, 2^16 or 2^32.
128          inc       %rsi
129          mov       $0x1, R32(%r10)
130          js        L(combine)
131
132          mov       $0x10000, R32(%r10)
133          adc       24(ap), %rax
134          dec       %rsi
135          js        L(combine)
136
137          adc       32(ap), %rcx
138          mov       $0x100000000, %r10
139
C Final combination.  %rsi and %rdi (ap) are no longer needed as inputs and
C are reused as scratch.  sbb turns the pending carry into 0 or all-ones,
C which the and with %r10 converts into 0 or the weight of that carry.
140L(combine):
141          sbb       %rsi, %rsi                    C carry
142          mov       %rax, %rdi                    C 0mod3
143          shr       $48, %rax           C 0mod3 high
144
145          and       %r10, %rsi                    C carry masked
146          and       %r11, %rdi                    C 0mod3 low
147          mov       R32(%rcx), R32(%r10)          C 1mod3
148
149          add       %rsi, %rax                    C apply carry
150          shr       $32, %rcx           C 1mod3 high
151
152          add       %rdi, %rax                    C apply 0mod3 low
153          movzwl    %dx, R32(%rdi)                C 2mod3
154          shl       $16, %r10           C 1mod3 low
155
156          add       %rcx, %rax                    C apply 1mod3 high
157          shr       $16, %rdx           C 2mod3 high
158
159          add       %r10, %rax                    C apply 1mod3 low
160          shl       $32, %rdi           C 2mod3 low
161
162          add       %rdx, %rax                    C apply 2mod3 high
163          add       %rdi, %rax                    C apply 2mod3 low
164
165          FUNC_EXIT()
166          ret
167EPILOGUE()
168