1dnl  AMD64 mpn_popcount, mpn_hamdist -- population count and hamming distance.
2
3dnl  Copyright 2004, 2005, 2007, 2010-2012, 2017 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31
32include(`../config.m4')
33
34
35C                        popcount             hamdist
36C                       cycles/limb         cycles/limb
37C AMD K8,K9                    6                   7
38C AMD K10                      6                   7
39C Intel P4                    12                  14.3
40C Intel core2                  7                   8
41C Intel corei                  ?                   7.3
42C Intel atom                  16.5                17.5
43C VIA nano                     8.75               10.4
44
45C TODO
46C  * Tune.  It should be possible to reach 5 c/l for popcount and 6 c/l for
47C    hamdist for K8/K9.
48
49
C mpn_popcount configuration.
C   func                  name given to the generated function
C   up, n                 the two arguments, in %rdi and %rsi (the first two
C                         integer argument registers under System V AMD64)
C   h55555555..h01010101  registers that the function body loads with the
C                         bit-mask constants of the same names
C   POP                   expands to its argument, keeping popcount-only lines
C   HAM                   expands to dnl, discarding hamdist-only lines
50ifdef(`OPERATION_popcount',`
51  define(`func',`mpn_popcount')
52  define(`up',                `%rdi')
53  define(`n',                 `%rsi')
54  define(`h55555555',         `%r10')
55  define(`h33333333',         `%r11')
56  define(`h0f0f0f0f',         `%rcx')
57  define(`h01010101',         `%rdx')
58  define(`POP',               `$1')
59  define(`HAM',               `dnl')
60')
C mpn_hamdist configuration.
C   func          name given to the generated function
C   up, vp, n     the three arguments, in %rdi, %rsi and %rdx (the first three
C                 integer argument registers under System V AMD64)
C   h01010101     placed in %r12 because %rdx is taken by n; %r12 is
C                 callee-saved, hence the HAM-only push and pop of %r12 in
C                 the function body
C   other h*      same registers as in the popcount configuration
C   POP           expands to dnl, discarding popcount-only lines
C   HAM           expands to its argument, keeping hamdist-only lines
61ifdef(`OPERATION_hamdist',`
62  define(`func',`mpn_hamdist')
63  define(`up',                `%rdi')
64  define(`vp',                `%rsi')
65  define(`n',                 `%rdx')
66  define(`h55555555',         `%r10')
67  define(`h33333333',         `%r11')
68  define(`h0f0f0f0f',         `%rcx')
69  define(`h01010101',         `%r12')
70  define(`POP',               `dnl')
71  define(`HAM',               `$1')
72')
73
74
C Build directives.  The macros used here are defined outside this file
C (config.m4 is included above), so the notes below are inferred from the
C macro names and arguments -- TODO confirm against the macro definitions.
C
C The next line names the two entry points this one source can be built as.
75MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
76
C Presumably declares that both the DOS64 and the standard 64-bit calling
C conventions are handled by this file.
77ABI_SUPPORT(DOS64)
78ABI_SUPPORT(STD64)
79
C Start of assembler output; the function is placed in the text section and
C its entry is aligned to 32 bytes.
80ASM_START()
81          TEXT
82          ALIGN(32)
C -----------------------------------------------------------------------------
C func: built as mpn_popcount or mpn_hamdist depending on which OPERATION_*
C symbol is defined.
C
C   mpn_popcount   in: up, n        out: %rax = number of 1 bits in the n limbs
C                                   at up
C   mpn_hamdist    in: up, vp, n    out: %rax = number of 1 bits in up[i] XOR
C                                   vp[i], summed over the n limbs
C
C Lines inside the POP or HAM wrappers are emitted only for the matching
C variant.
C
C Method: each 64-bit word is reduced with mask-and-add steps:
C   2-bit fields:  x - ((x >> 1) & 0x5555...)
C   4-bit fields:  (x & 0x3333...) + ((x >> 2) & 0x3333...)
C   8-bit fields:  (x & 0x0f0f...) + ((x >> 4) & 0x0f0f...)
C and the eight byte fields are summed by multiplying with 0x0101... and
C taking the top byte.  The main loop handles two limbs per iteration and
C merges them at the 4-bit stage, so the last two steps run once per pair.
C
C Indexing: up (and vp) are advanced to just past the last limb and n is
C negated, so (up,n,8) walks forward as n counts up towards zero.
C
C Registers: %r8/%r9 work on the first limb, %rbx/%rbp on the second,
C %rax holds the running total.
C
C NOTE: n must be at least 1.  With n = 0 control goes straight to L(top) and
C the carry-based loop exit does not trigger at that point.
C -----------------------------------------------------------------------------
83PROLOGUE(func)
C FUNC_ENTRY is defined outside this file; its argument matches the number of
C arguments of the variant (presumably the DOS64 argument remapping -- TODO
C confirm).
84 POP(`    FUNC_ENTRY(2)                 ')
85 HAM(`    FUNC_ENTRY(3)                 ')
C Save the callee-saved registers used as scratch, interleaved with loading
C the mask constants.
86          push      %rbx
87          mov       $0x5555555555555555, h55555555
88          push      %rbp
89          mov       $0x3333333333333333, h33333333
90 HAM(`    push      %r12                ')
C Point up (and vp) just past the last limb, then negate n so that it serves
C as a negative index counting up to zero.
91          lea       (up,n,8), up
92          mov       $0x0f0f0f0f0f0f0f0f, h0f0f0f0f
93 HAM(`    lea       (vp,n,8), vp        ')
94          neg       n
95          mov       $0x0101010101010101, h01010101
96          xor       R32(%rax), R32(%rax)          C running total = 0
C Negation keeps the low bit, so this tests whether the limb count is odd.
C An even count goes directly to the two-limb loop.
97          test      $1, R8(n)
98          jz        L(top)
99
C Odd count: process the first limb alone up to the 4-bit stage, then join
C the loop at L(mid).
100          mov       (up,n,8), %r8
101 HAM(`    xor       (vp,n,8), %r8       ')
102
C 2-bit fields: x - ((x >> 1) & 0x5555...)
103          mov       %r8, %r9
104          shr       %r8
105          and       h55555555, %r8
106          sub       %r8, %r9
107
C 4-bit fields: (x & 0x3333...) + ((x >> 2) & 0x3333...)
108          mov       %r9, %r8
109          shr       $2, %r9
110          and       h33333333, %r8
111          and       h33333333, %r9
112          add       %r8, %r9            C 16 4-bit fields (0..4)
113
C Pre-decrement n so that the add of 2 at the loop bottom advances it by a
C net one limb for this single-limb pass.
114          dec       n
115          jmp       L(mid)
116
117          ALIGN(16)
C Main loop: two limbs per iteration, first limb in %r8/%r9 and second limb
C in %rbx/%rbp, with the two instruction streams interleaved.
118L(top):   mov       (up,n,8), %r8
119          mov       8(up,n,8), %rbx
120 HAM(`    xor       (vp,n,8), %r8       ')
121 HAM(`    xor       8(vp,n,8), %rbx     ')
122
C 2-bit fields for both limbs.
123          mov       %r8, %r9
124          mov       %rbx, %rbp
125          shr       %r8
126          shr       %rbx
127          and       h55555555, %r8
128          and       h55555555, %rbx
129          sub       %r8, %r9
130          sub       %rbx, %rbp
131
C 4-bit fields for both limbs.
132          mov       %r9, %r8
133          mov       %rbp, %rbx
134          shr       $2, %r9
135          shr       $2, %rbp
136          and       h33333333, %r8
137          and       h33333333, %r9
138          and       h33333333, %rbx
139          and       h33333333, %rbp
140          add       %r8, %r9            C 16 4-bit fields (0..4)
141          add       %rbx, %rbp                    C 16 4-bit fields (0..4)
142
C Merge the two limbs; each 4-bit field can hold the combined maximum of 8.
143          add       %rbp, %r9           C 16 4-bit fields (0..8)
C Common tail, also entered from the odd-count path with fields in 0..4.
144L(mid):   mov       %r9, %r8
145          shr       $4, %r9
146          and       h0f0f0f0f, %r8
147          and       h0f0f0f0f, %r9
148          add       %r8, %r9            C 8 8-bit fields (0..16)
149
150          imul      h01010101, %r9                C sum the 8 fields in high 8 bits
151          shr       $56, %r9
152
153          add       %r9, %rax           C add to total
C n is negative and even here; the add carries out exactly when n reaches
C zero, which ends the loop.
154          add       $2, n
155          jnc       L(top)
156
C Restore the saved registers in reverse order of the pushes and return the
C total in %rax.  FUNC_EXIT is defined outside this file (presumably the
C counterpart of FUNC_ENTRY -- TODO confirm).
157L(end):
158 HAM(`    pop       %r12                ')
159          pop       %rbp
160          pop       %rbx
161          FUNC_EXIT()
162          ret
163EPILOGUE()
164