1dnl  PowerPC-64 mpn_invert_limb -- Invert a normalized limb.
2
3dnl  Copyright 2004-2006, 2008, 2010, 2013 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33C                  cycles/limb (approximate)
34C POWER3/PPC630         80
35C POWER4/PPC970         86
36C POWER5                86
37C POWER6               170
38C POWER7                66
39
40ASM_START()
C mpn_invert_limb -- straight-line inversion of the 64-bit limb d held in r3.
C
C In:   r3 = d.  The file header calls d a normalized limb; the table index
C       below only reaches entries 0..255 and treats d >> 55 as lying in
C       256..511, so bit 63 of d is assumed to be set.
C Out:  r3 = v4 = v3 - (high64(v3*d) + d + carry(low64(v3*d) + d)).
C       NOTE(review): presumably this is floor((2^128-1)/d) - 2^64 as for
C       the generic invert_limb; confirm against the C reference.
C Writes r0, r6-r12 and the carry bit.  The only memory access in the
C visible code is the halfword load from approx_tab.
C
C The value is refined in stages: v0 (table lookup), v1, v2, v3, and the
C final adjustment that produces v4.
41PROLOGUE(mpn_invert_limb,toc)
42          LEAL(     r12, approx_tab)  C r12 = base of approx_tab (used by lhzx below)
43          srdi      r9, r3, 32  C d >> 32
44          rlwinm    r9, r9, 10, 23, 30  C ((d >> 55) & 0xff) * 2 = (d >> 54) & 0x1fe, byte offset into approx_tab
45          srdi      r10, r3, 24                   C d >> 24
46          lis       r11, 0x1000  C 2^28, shifted up to 2^60 below
47          rldicl    r8, r3, 0, 63                 C d mod 2
48          addi      r10, r10, 1                   C d40 = (d >> 24) + 1
49          sldi      r11, r11, 32                  C 2^60
50          srdi      r7, r3, 1           C d/2
51          add       r7, r7, r8                    C d63 = ceil(d/2)
52          neg       r8, r8                        C mask = -(d mod 2)
53          lhzx      r0, r9, r12  C v0 = 16-bit table entry selected by bits 62..55 of d
54          mullw     r9, r0, r0                    C v0*v0
55          sldi      r6, r0, 11                    C v0 << 11
56          addi      r0, r6, -1                    C (v0 << 11) - 1
57          mulld     r9, r9, r10                   C v0*v0*d40
58          srdi      r9, r9, 40                    C v0*v0*d40 >> 40
59          subf      r9, r9, r0                    C v1 = (v0 << 11) - (v0*v0*d40 >> 40) - 1
60          mulld     r0, r9, r10                   C v1*d40
61          sldi      r6, r9, 13                    C v1 << 13
62          subf      r0, r0, r11                   C 2^60 - v1*d40
63          mulld     r0, r0, r9                    C v1 * (2^60 - v1*d40)
64          srdi      r0, r0, 47                    C v1 * (2^60 - v1*d40) >> 47
65          add       r0, r0, r6                    C v2 = (v1 << 13) + (v1 * (2^60 - v1*d40) >> 47)
66          mulld     r11, r0, r7                   C v2 * d63
67          srdi      r10, r0, 1                    C v2 >> 1
68          sldi      r9, r0, 31                    C v2 << 31
69          and       r8, r10, r8                   C (v2 >> 1) & mask
70          subf      r8, r11, r8                   C ((v2 >> 1) & mask) - v2 * d63
71          mulhdu    r0, r8, r0                    C p1 = high 64 bits of v2 * (((v2 >> 1) & mask) - v2 * d63)
72          srdi      r0, r0, 1           C p1 >> 1
73          add       r0, r0, r9                    C v3 = (v2 << 31) + (p1 >> 1)
74          nop  C NOTE(review): no architectural effect, presumably scheduling padding -- confirm
75          mulld     r11, r0, r3  C low 64 bits of v3*d
76          mulhdu    r9, r0, r3  C high 64 bits of v3*d
77          addc      r10, r11, r3  C low + d, only the carry out is used (r10 is never read)
78          adde      r3, r9, r3  C high + d + carry
79          subf      r3, r3, r0  C v4 = v3 - (high + d + carry)
80          blr  C return with v4 in r3
81EPILOGUE()
82
C approx_tab -- lookup table of 16-bit (.short) initial approximations v0
C read by mpn_invert_limb.  One entry is emitted per value of i, with i
C running from 256 to 511 (forloop bounds assumed inclusive, giving the 256
C entries that the 8-bit index can reach), each equal to the integer
C quotient 0x7fd00/i as computed by m4 eval.  The entries therefore run
C from 2045 (i = 256) down to 1024 (i = 511).
C No comments are placed inside the forloop body because it is a quoted
C macro argument that gets rescanned on every iteration.
83DEF_OBJECT(approx_tab)
84forloop(i,256,512-1,dnl
85`         .short    eval(0x7fd00/i)
86')dnl
87END_OBJECT(approx_tab)
88ASM_END()
89