1dnl  S/390-64 mpn_lshift.
2
3dnl  Copyright 2011, 2012, 2014 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33C            cycles/limb
34C z900               7
35C z990           3
36C z9                 ?
37C z10                6
38C z196               ?
39
40C NOTES
41C  * This uses discrete loads and stores in a software pipeline.  Using lmg and
42C    stmg is not faster.
43C  * One could assume more pipelining could approach 2.5 c/l, but we have not
44C    found any 8-way loop that runs better than the current 4-way loop.
45C  * Consider using the same feed-in code for 1 <= n <= 3 as for n mod 4,
46C    similarly to the x86_64 sqr_basecase feed-in.
47
48C INPUT PARAMETERS
C rp: base register for every result store.  %r2 is also loaded with the
C     return value just before each br %r14.
49define(`rp',        `%r2')
C up: base register for every source load.
50define(`up',        `%r3')
C n: limb count on entry.  %r4 is reused as a scratch register once the
C     count has been consumed.
51define(`n',         `%r4')
C cnt: shift count in bits, used as the shift amount of every sllg.
52define(`cnt',       `%r5')
53
C tnc: not an input.  It is set to -cnt with lcgr and used as the shift
C     amount of every srlg.  %r6 is stored at 48(%r15) before it is
C     written and reloaded from there before returning.
54define(`tnc',       `%r6')
55
56ASM_START()
C mpn_lshift (rp, up, n, cnt)
C
C Shift the n 64-bit limbs at up left by cnt bits, store the n result limbs
C at rp, and return in %r2 the bits shifted out of the most significant
C limb, i.e. up[n-1] shifted right by tnc.
C
C Limbs are read and written from the highest address downward.
C
C tnc holds -cnt.  NOTE(review): this relies on srlg using only the low six
C bits of its shift amount, so that a shift by tnc acts as a shift by
C 64-cnt; confirm against the z/Architecture Principles of Operation.  It
C also means cnt is assumed to be in 1..63: with cnt = 0 the right shifts
C would be by 0 and each ogr would merge a whole neighbouring limb into the
C result.
C
C NOTE(review): n is assumed to be >= 1.  With n = 0 the small-operand
C dispatch below would branch to L(tab)-4, which is not a table entry.
C
C Registers %r6-%r7 (n <= 3) or %r6-%r13 (n > 3) are stored at 48(%r15) on
C entry and reloaded from there before returning.  %r0 and %r1 are used as
C scratch without being saved.
57PROLOGUE(mpn_lshift)
C Operands of more than three limbs take the pipelined path at L(gt1).
C Despite the label name, the test is n > 3.
58          cghi      n, 3
59          jh        L(gt1)
60
C n <= 3: dispatch through a table of jumps.  n is scaled by 4 and the base
C is L(tab)-4, so n = 1, 2, 3 select L(n1), L(n2), L(n3).  The scaling
C destroys n; %r4 is scratch from here on.
61          stmg      %r6, %r7, 48(%r15)
62          larl      %r1, L(tab)-4
63          lcgr      tnc, cnt
64          sllg      n, n, 2
65          b         0(n,%r1)
66L(tab):   j         L(n1)
67          j         L(n2)
68          j         L(n3)
69
C n = 1: rp[0] = up[0] << cnt, return value = up[0] >> tnc.  The result is
C stored before %r2 (rp) is overwritten with the return value.
70L(n1):    lg        %r1, 0(up)
71          sllg      %r0, %r1, 0(cnt)
72          stg       %r0, 0(rp)
73          srlg      %r2, %r1, 0(tnc)
74          lg        %r6, 48(%r15)                 C restoring r7 not needed
75          br        %r14
76
C n = 2: %r4 = return value (up[1] >> tnc), %r0 = up[1] << cnt, which is
C the high part of rp[1].  The shared tail at L(cj) does the rest.
77L(n2):    lg        %r1, 8(up)
78          srlg      %r4, %r1, 0(tnc)
79          sllg      %r0, %r1, 0(cnt)
80          j         L(cj)
81
C n = 3: %r4 = return value (up[2] >> tnc).  rp[2] is completed and stored
C here, leaving %r0 = up[1] << cnt for the shared tail.
82L(n3):    lg        %r1, 16(up)
83          srlg      %r4, %r1, 0(tnc)
84          sllg      %r0, %r1, 0(cnt)
85          lg        %r1, 8(up)
86          srlg      %r7, %r1, 0(tnc)
87          ogr       %r7, %r0
88          sllg      %r0, %r1, 0(cnt)
89          stg       %r7, 16(rp)
C Shared tail for n = 2 and n = 3.  On entry %r0 = up[1] << cnt and %r4 =
C return value.  Stores rp[1] and rp[0], then moves %r4 to %r2.
90L(cj):    lg        %r1, 0(up)
91          srlg      %r7, %r1, 0(tnc)
92          ogr       %r7, %r0
93          sllg      %r0, %r1, 0(cnt)
94          stg       %r7, 8(rp)
95          stg       %r0, 0(rp)
96          lgr       %r2, %r4
97          lmg       %r6, %r7, 48(%r15)
98          br        %r14
99
C n > 3: software-pipelined path with a 4-way unrolled loop.
100L(gt1):   stmg      %r6, %r13, 48(%r15)
101          lcgr      tnc, cnt            C tnc = -cnt
102
C %r1 = 8*n, the operand size in bytes.
103          sllg      %r1, n, 3
104          srlg      %r0, n, 2           C loop count
105
C Bias the pointers: after the adjustments below 48(up) addresses the most
C significant source limb up[n-1], and rp addresses result limb n-5.  Each
C feed-in block then adds its own offset before entering the loop.
106          agr       up, %r1                       C point up at end of U
107          agr       rp, %r1                       C point rp at end of R
108          aghi      up, -56
109          aghi      rp, -40
110
C Pick the feed-in block from n mod 4.  The je after ngr is taken when the
C masked value is zero.  Otherwise 1 goes to L(b1), 2 to L(b2), and 3
C falls through to L(b3).
111          lghi      %r7, 3
112          ngr       %r7, n
113          je        L(b0)
114          cghi      %r7, 2
115          jl        L(b1)
116          je        L(b2)
117
C n mod 4 = 3.  %r9 = return value, %r11 = completed top result limb,
C %r13 = high part of the next result limb, %r7 = next source limb.
C Enters the loop at L(lm3).
118L(b3):    lg        %r7, 48(up)
119          srlg      %r9, %r7, 0(tnc)
120          sllg      %r11, %r7, 0(cnt)
121          lg        %r8, 40(up)
122          lg        %r7, 32(up)
123          srlg      %r4, %r8, 0(tnc)
124          sllg      %r13, %r8, 0(cnt)
125          ogr       %r11, %r4
126          la        rp, 16(rp)
127          j         L(lm3)
128
C n mod 4 = 2.  %r9 = return value, %r13 = high part of the top result
C limb, %r7 = next source limb.  Enters the loop at L(lm2).
129L(b2):    lg        %r8, 48(up)
130          lg        %r7, 40(up)
131          srlg      %r9, %r8, 0(tnc)
132          sllg      %r13, %r8, 0(cnt)
133          la        rp, 24(rp)
134          la        up, 8(up)
135          j         L(lm2)
136
C n mod 4 = 1.  %r9 = return value, %r11 = completed top result limb,
C %r10 = high part of the next result limb, %r7 = next source limb.
C Enters the loop at L(lm1).
137L(b1):    lg        %r7, 48(up)
138          srlg      %r9, %r7, 0(tnc)
139          sllg      %r11, %r7, 0(cnt)
140          lg        %r8, 40(up)
141          lg        %r7, 32(up)
142          srlg      %r4, %r8, 0(tnc)
143          sllg      %r10, %r8, 0(cnt)
144          ogr       %r11, %r4
145          la        rp, 32(rp)
146          la        up, 16(up)
147          j         L(lm1)
148
C n mod 4 = 0.  %r9 = return value, %r10 = high part of the top result
C limb, %r7 = next source limb.  Enters the loop at L(lm0).
149L(b0):    lg        %r8, 48(up)
150          lg        %r7, 40(up)
151          srlg      %r9, %r8, 0(tnc)
152          sllg      %r10, %r8, 0(cnt)
153          la        rp, 40(rp)
154          la        up, 24(up)
155          j         L(lm0)
156
C Main loop, four limbs per full pass, entered at L(lm3), L(lm2), L(lm1) or
C L(lm0) by the feed-in blocks.  Register roles:
C   %r7, %r8        source limbs loaded ahead of their use
C   %r4, %r12       right-shifted (low) parts
C   %r10, %r11, %r13  left-shifted (high) parts; each becomes a result limb
C                   once a low part has been OR-ed in
C   %r9             return value, untouched until the end
C up and rp each step down by 32 bytes per pass.  brctg decrements %r0 and
C branches back while it is nonzero.
157          ALIGN(8)
158L(top):   srlg      %r4, %r8, 0(tnc)
159          sllg      %r13, %r8, 0(cnt)
160          ogr       %r11, %r4
161          stg       %r10, 24(rp)
162L(lm3):   stg       %r11, 16(rp)
163L(lm2):   srlg      %r12, %r7, 0(tnc)
164          sllg      %r11, %r7, 0(cnt)
165          lg        %r8, 24(up)
166          lg        %r7, 16(up)
167          ogr       %r13, %r12
168          srlg      %r4, %r8, 0(tnc)
169          sllg      %r10, %r8, 0(cnt)
170          ogr       %r11, %r4
171          stg       %r13, 8(rp)
172L(lm1):   stg       %r11, 0(rp)
173L(lm0):   srlg      %r12, %r7, 0(tnc)
174          aghi      rp, -32
175          sllg      %r11, %r7, 0(cnt)
176          lg        %r8, 8(up)
177          lg        %r7, 0(up)
178          aghi      up, -32
179          ogr       %r10, %r12
180          brctg     %r0, L(top)
181
C Wind-down: no further loads.  Completes and stores the four lowest result
C limbs from the values still in flight.  The limb stored at 0(rp) is a
C left shift only, with nothing OR-ed in.  Then the return value saved in
C %r9 is moved to %r2.
182L(end):   srlg      %r4, %r8, 0(tnc)
183          sllg      %r13, %r8, 0(cnt)
184          ogr       %r11, %r4
185          stg       %r10, 24(rp)
186          stg       %r11, 16(rp)
187          srlg      %r12, %r7, 0(tnc)
188          sllg      %r11, %r7, 0(cnt)
189          ogr       %r13, %r12
190          stg       %r13, 8(rp)
191          stg       %r11, 0(rp)
192          lgr       %r2, %r9
193
194          lmg       %r6, %r13, 48(%r15)
195          br        %r14
196EPILOGUE()
197