1dnl  S/390-32 mpn_sqr_basecase.
2
3dnl  Copyright 2011 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33C            cycles/limb
34C z900               ?
35C z990              23
36C z9                 ?
37C z10                ?
38C z196               ?
39
40C TODO
41C  * Clean up.
42C  * Stop iterating addmul_1 loop at latest for n = 2, implement longer tail.
43C    This will ask for basecase handling of n = 3.
44C  * Update counters and pointers more straightforwardly, possibly lowering
45C    register usage.
46C  * Should we use this allocation-free style for more sqr_basecase asm
47C    implementations?  The only disadvantage is that it requires R != U.
48C  * Replace loops by faster code.  The mul_1 and addmul_1 loops could be sped
49C    up by about 10%.  The sqr_diag_addlsh1 loop could probably be sped up even
50C    more.
51
52C INPUT PARAMETERS
53define(`rp',        `%r2')          C destination limb pointer
54define(`up',        `%r3')          C source limb pointer
55define(`n',         `%r4')          C limb count; reduced by 2 at function entry
56
C INTERNAL REGISTER ROLES
C zero is loaded with 0 and used as the addend of alcr when only the carry
C bit is to be added.  The *_saved registers are set up in the general path
C (L(gen)) and hold the values needed again by the final sqr_diag_addlsh1
C pass.  n_saved is %r14, the register the function returns through
C (br %r14); the general path stores it with stm and reloads it with lm.
57define(`zero',      `%r8')          C constant 0
58define(`rp_saved',  `%r9')          C copy of rp as passed in
59define(`up_saved',  `%r13')         C copy of up as passed in
60define(`n_saved',   `%r14')         C copy of n after the entry adjustment (n - 2)
61
62ASM_START()
C -----------------------------------------------------------------------------
C mpn_sqr_basecase (rp, up, n)
C
C Store at rp[0..2n-1] the square of the n-limb number up[0..n-1].  Limbs are
C 32 bits wide and stored least significant limb first.
C
C   rp  destination.  It is written while up is still being read, so it must
C       not overlap up (see the "R != U" remark in the TODO list).
C   up  source operand.
C   n   limb count.  NOTE(review): presumably n >= 1 is required; any n < 2
C       falls into the n = 1 code below -- confirm against callers.
C
C Code paths:
C   n = 1   a single multiplication.
C   n = 2   u0^2, u1^2 and the doubled cross product 2*u0*u1, straight-line.
C   n >= 3  L(gen), in three phases:
C           1. mul_1     rp[1..n] = u[0] * u[1..n-1]
C           2. addmul_1  for i = 1..n-3, add u[i] * u[i+1..n-1] at rp[2i+1];
C                        the last product u[n-2] * u[n-1] is done by the
C                        straight-line code at L(outer_end).
C           3. sqr_diag_addlsh1
C                        double the sum built so far and add the squares
C                        u[i]^2 at rp[2i].
C
C mlr/ml produce a 64-bit product in an even:odd register pair (high:low).
C The carry bit is deliberately kept alive across l, st, la, lr, mlr and brct,
C none of which change the condition code; do not insert flag-setting
C instructions into the loops.
C
C %r15 is never modified.  Registers are stored with stm at 24(%r15) and
C reloaded with lm before returning; presumably this is the caller-provided
C register save area of the s390 ELF ABI -- confirm.
C -----------------------------------------------------------------------------
63PROLOGUE(mpn_sqr_basecase)
64          ahi       n, -2               C n -= 2, setting the condition code
65          jhe       L(ge2)              C taken when n - 2 >= 0
66
67C n = 1
68          l         %r5, 0(up)          C u0
69          mlr       %r4, %r5            C %r4:%r5 = u0 * u0 (n is dead here)
70          st        %r5, 0(rp)          C low limb
71          st        %r4, 4(rp)          C high limb
72          br        %r14
73
74L(ge2):   jne       L(gen)              C condition code still from ahi: n - 2 != 0
75
76C n = 2
77          stm       %r6, %r8, 24(%r15)  C save %r6-%r8, which are used below
78          lhi       zero, 0
79
80          l         %r5, 0(up)
81          mlr       %r4, %r5            C u0 * u0
82          l         %r1, 4(up)
83          mlr       %r0, %r1            C u1 * u1
84          st        %r5, 0(rp)          C rp[0] = low limb of u0 * u0
85
86          l         %r7, 0(up)
87          ml        %r6, 4(up)                    C u0 * u1
88          alr       %r7, %r7            C double the cross product, low limb
89          alcr      %r6, %r6            C high limb, plus carry from the low limb
90          alcr      %r0, zero           C carry out goes into the top result limb
91
92          alr       %r4, %r7            C add 2*u0*u1 into the two middle limbs
93          alcr      %r1, %r6
94          alcr      %r0, zero           C and its carry into the top limb
95          st        %r4, 4(rp)
96          st        %r1, 8(rp)
97          st        %r0, 12(rp)
98
99          lm        %r6, %r8, 24(%r15)  C restore %r6-%r8
100          br        %r14
101
102L(gen):
103C mul_1 =======================================================================
C rp[1..n] = u[0] * u[1..n-1].  On entry n holds (limb count) - 2.
104
105          stm       %r6, %r14, 24(%r15)  C save %r6-%r14
106          lhi       zero, 0
107          lr        up_saved, up        C keep the incoming pointers for the
108          lr        rp_saved, rp        C sqr_diag_addlsh1 phase
109          lr        n_saved, n          C (limb count) - 2; overwrites %r14
110
111          l         %r6, 0(up)          C multiplier u[0]
112          l         %r11, 4(up)         C u[1]
113          lhi       %r12, 8             C init index register
114          mlr       %r10, %r6           C %r10:%r11 = u[1] * u[0]
115          lr        %r5, n              C loop count = (limb count) - 2
116          st        %r11, 4(rp)         C rp[1] = low limb; %r10 is the carry limb
117          cr        %r15, %r15                    C clear carry flag
118
119L(tm):    l         %r1, 0(%r12,up)     C next source limb
120          mlr       %r0, %r6            C %r0:%r1 = limb * u[0]
121          alcr      %r1, %r10           C low += carry limb + pending carry bit
122          lr        %r10, %r0           C copy high part to carry limb
123          st        %r1, 0(%r12,rp)
124          la        %r12, 4(%r12)       C index += 4, condition code untouched
125          brct      %r5, L(tm)          C count down, carry bit stays pending
126
127          alcr      %r0, zero           C fold the pending carry bit into the high limb
128          st        %r0, 0(%r12,rp)     C top limb of this phase, rp[n]
129
130C addmul_1 loop ===============================================================
C Each outer pass advances up by one limb and rp by two, then adds
C up[0] * up[1..] into rp[1..].  n is both the outer counter and the inner
C loop count, so each pass is one limb shorter than the previous one.
131
132          ahi       n, -1               C n = (limb count) - 3
133          je        L(outer_end)        C limb count 3: only the tail product is left
134L(outer_loop):
135
136          la        rp, 8(rp)           C rp += 2
137          la        up, 4(up)           C up += 1
138          l         %r6, 0(up)          C multiplier for this pass
139          l         %r11, 4(up)
140          lhi       %r12, 8             C init index register
141          mlr       %r10, %r6           C %r10:%r11 = up[1] * up[0]
142          lr        %r5, n              C inner loop count
143          al        %r11, 4(rp)         C low += rp[1]; carry bit stays pending
144          st        %r11, 4(rp)
145
146L(tam):   l         %r1, 0(%r12,up)     C next source limb
147          l         %r7, 0(%r12,rp)     C current destination limb
148          mlr       %r0, %r6            C %r0:%r1 = limb * multiplier
149          alcr      %r1, %r7            C low += destination limb + pending carry bit
150          alcr      %r0, zero           C carry into the high limb
151          alr       %r1, %r10           C low += carry limb; new carry bit stays pending
152          lr        %r10, %r0           C high limb becomes the next carry limb
153          st        %r1, 0(%r12,rp)
154          la        %r12, 4(%r12)       C index += 4, condition code untouched
155          brct      %r5, L(tam)
156
157          alcr      %r0, zero           C fold the pending carry bit into the high limb
158          st        %r0, 0(%r12,rp)     C %r0 is reused at L(outer_end) after the last pass
159
160          brct      n, L(outer_loop)    C n -= 1, repeat while n != 0
161L(outer_end):
C Tail: add the last off-diagonal product up[1] * up[2] at rp[3].  On both
C ways of getting here %r12 is 12, so %r0 equals the limb just stored at
C 12(rp) and does not need to be reloaded.
162
163          l         %r6, 4(up)
164          l         %r1, 8(up)
165          lr        %r7, %r0            C Same as: l %r7, 12(,rp)
166          mlr       %r0, %r6            C %r0:%r1 = up[2] * up[1]
167          alr       %r1, %r7
168          alcr      %r0, zero           C leaves the carry bit clear for the next phase
169          st        %r1, 12(rp)
170          st        %r0, 16(rp)
171
172C sqr_diag_addlsh1 ============================================================
C From here on up and rp name the saved copies of the incoming pointers.
C Each iteration doubles two limbs of the off-diagonal sum and adds the high
C limb of the previous square and the low limb of the current square.
173
174define(`up', `up_saved')
175define(`rp', `rp_saved')
176          la        n, 1(n_saved)       C loop count = (limb count) - 1
177
178          l         %r1, 0(up)
179          mlr       %r0, %r1            C %r0:%r1 = u[0]^2
180          st        %r1, 0(rp)          C rp[0]; the high limb %r0 enters the loop
181C         clr       %r15, %r15                    C clear carry (already clear per above)
182
183L(top):   l         %r11, 4(up)         C next source limb
184          la        up, 4(up)
185          l         %r6, 4(rp)
186          l         %r7, 8(rp)
187          mlr       %r10, %r11          C %r10:%r11 = square of that limb
188          alcr      %r6, %r6            C double, adding the carry bit left by the previous iteration
189          alcr      %r7, %r7            C double, with carry from the limb below
190          alcr      %r10, zero                    C propagate carry to high product limb
191          alr       %r6, %r0            C add high limb of the previous square
192          alcr      %r7, %r11           C add low limb of this square; carry bit stays pending
193          stm       %r6, %r7, 4(rp)
194          la        rp, 8(rp)
195          lr        %r0, %r10           C copy carry limb
196          brct      n, L(top)
197
198          alcr      %r0, zero           C fold the pending carry bit into the last high limb
199          st        %r0, 4(rp)          C most significant result limb
200
201          lm        %r6, %r14, 24(%r15)  C restore %r6-%r14, including the return address
202          br        %r14
203EPILOGUE()
204