1dnl AMD64 mpn_mullo_basecase optimised for Conroe/Wolfdale/Nehalem/Westmere. 2 3dnl Contributed to the GNU project by Torbjörn Granlund. 4 5dnl Copyright 2008, 2009, 2011-2013 Free Software Foundation, Inc. 6 7dnl This file is part of the GNU MP Library. 8dnl 9dnl The GNU MP Library is free software; you can redistribute it and/or modify 10dnl it under the terms of either: 11dnl 12dnl * the GNU Lesser General Public License as published by the Free 13dnl Software Foundation; either version 3 of the License, or (at your 14dnl option) any later version. 15dnl 16dnl or 17dnl 18dnl * the GNU General Public License as published by the Free Software 19dnl Foundation; either version 2 of the License, or (at your option) any 20dnl later version. 21dnl 22dnl or both in parallel, as here. 23dnl 24dnl The GNU MP Library is distributed in the hope that it will be useful, but 25dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 26dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 27dnl for more details. 28dnl 29dnl You should have received copies of the GNU General Public License and the 30dnl GNU Lesser General Public License along with the GNU MP Library. If not, 31dnl see https://www.gnu.org/licenses/. 32 33include(`../config.m4') 34 35C cycles/limb mul_2 addmul_2 36C AMD K8,K9 37C AMD K10 38C AMD bull 39C AMD pile 40C AMD steam 41C AMD bobcat 42C AMD jaguar 43C Intel P4 44C Intel core 4.0 4.18-4.25 45C Intel NHM 3.75 4.06-4.2 46C Intel SBR 47C Intel IBR 48C Intel HWL 49C Intel BWL 50C Intel atom 51C VIA nano 52 53C The inner loops of this code are the result of running a code generation and 54C optimisation tool suite written by David Harvey and Torbjörn Granlund. 55 56C TODO 57C * Implement proper cor2, replacing current cor0. 58C * Offset n by 2 in order to avoid the outer loop cmp. (And sqr_basecase?) 59C * Micro-optimise. 
C When playing with pointers, set this to $2 to fall back to conservative
C indexing in wind-down code.
C I(a,b) currently expands to its first argument; the second argument is the
C indexed form of the same operand.
define(`I',`$1')

C Incoming arguments.  NOTE(review): these are the first four integer argument
C registers of the STD64 ABI; FUNC_ENTRY(4) presumably remaps them for DOS64 --
C confirm against config.m4.
define(`rp',       `%rdi')
define(`up',       `%rsi')
define(`vp_param', `%rdx')
define(`n_param',  `%rcx')

C Working registers.  w1 is the same register as n_param (%rcx), so n_param is
C no longer available once w1 has been written.  vp_param lives in %rdx, which
C every mul overwrites, hence the separate vp register.
define(`v0',       `%r10')
define(`v1',       `%r11')
define(`w0',       `%rbx')
define(`w1',       `%rcx')
define(`w2',       `%rbp')
define(`w3',       `%r12')
define(`n',        `%r9')
define(`i',        `%r13')
define(`vp',       `%r8')

C Extra accumulators, used only from L(outer) onwards (pushed just before it).
define(`X0',       `%r14')
define(`X1',       `%r15')

C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

define(`ALIGNx', `ALIGN(16)')

C MOV(src, dst, bit) emits lea (src), dst when bit is set in the mask N, and
C mov src, dst otherwise.  With N = 85 (= 64 + 16 + 4 + 1) the call sites
C tagged 1, 4, 16 and 64 become lea and those tagged 2, 8, 32 and 128 stay mov.
C N is defined on the line before the ifdef, so the default of 0 is not taken.
define(`N', 85)
ifdef(`N',,`define(`N',0)')
define(`MOV', `ifelse(eval(N & $3),0,`mov	$1, $2',`lea	($1), $2')')

C mpn_mullo_basecase (rp, up, vp_param, n_param)
C
C Overview of the code below:
C   * n_param < 4 is handled by the straight-line code at L(small).
C   * Otherwise a first pass (labels m2*) multiplies U by the two limbs at
C     vp[0] and vp[1] and stores the result through rp without reading rp.
C   * L(outer) then repeats with the next two limbs of V, this time loading
C     limbs from rp, adding the products and storing them back.
C   * L(cor1) / L(cor0) finish with the last two / last one limb of V.
C Only the low halves are computed for the topmost limb of each pass (imul).
C NOTE(review): n_param = 0 would take the L(n1) path and store one limb;
C presumably callers guarantee n_param >= 1 -- confirm.

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_mullo_basecase)
	FUNC_ENTRY(4)

	mov	(up), %rax		C rax = up[0], multiplied first on every path
	mov	vp_param, vp		C keep vp outside rdx, which mul overwrites

	cmp	$4, n_param
	jb	L(small)		C n_param < 4: straight-line code, no pushes

	mov	(vp_param), v0
	push	%rbx			C w0
	lea	(rp,n_param,8), rp	C point rp at R[un]
	push	%rbp			C w2
	lea	(up,n_param,8), up	C point up right after U's end
	push	%r12			C w3
	mov	$0, R32(n)		C FIXME
	sub	n_param, n		C n = -n_param, so (up,n,8) is up[0]
	push	%r13			C i
	mul	v0			C rdx:rax = up[0] * v0
	mov	8(vp), v1

C Select one of the four entry points of the L(m2tp) loop from the two low
C bits of n_param.  Each stub sets the loop index i, moves rdx:rax into the
C accumulator pair its entry point expects, and reloads up[0] for the mul v1.
	test	$1, R8(n_param)
	jnz	L(m2x1)

L(m2x0):test	$2, R8(n_param)
	jnz	L(m2b2)

C n_param = 0 mod 4
L(m2b0):lea	(n), i
	mov	%rax, (rp,n,8)		C lowest result limb is final already
	mov	%rdx, w1
	mov	(up,n,8), %rax
	xor	R32(w2), R32(w2)
	jmp	L(m2e0)

C n_param = 2 mod 4
L(m2b2):lea	-2(n), i
	mov	%rax, w2
	mov	(up,n,8), %rax
	mov	%rdx, w3
	xor	R32(w0), R32(w0)
	jmp	L(m2e2)

L(m2x1):test	$2, R8(n_param)
	jnz	L(m2b3)

C n_param = 1 mod 4
L(m2b1):lea	1(n), i
	mov	%rax, (rp,n,8)		C lowest result limb is final already
	mov	(up,n,8), %rax
	mov	%rdx, w0
	xor	R32(w1), R32(w1)
	jmp	L(m2e1)

C n_param = 3 mod 4
L(m2b3):lea	-1(n), i
	xor	R32(w3), R32(w3)
	mov	%rax, w1
	mov	%rdx, w2
	mov	(up,n,8), %rax
	jmp	L(m2e3)

C First pass.  Each limb of U is multiplied by v0 and by v1; the products are
C summed in the rotating accumulators w0..w3 and one limb is stored to rp per
C mul v0 / mul v1 pair.  i is negative and advances by 4 per iteration; the
C loop runs while i is still negative after the add (js).
	ALIGNx
L(m2tp):mul	v0
	add	%rax, w3
	mov	-8(up,i,8), %rax
	mov	w3, -8(rp,i,8)
	adc	%rdx, w0
	adc	$0, R32(w1)
L(m2e1):mul	v1
	add	%rax, w0
	adc	%rdx, w1
	mov	$0, R32(w2)
	mov	(up,i,8), %rax
	mul	v0
	add	%rax, w0
	mov	w0, (rp,i,8)
	adc	%rdx, w1
	mov	(up,i,8), %rax
	adc	$0, R32(w2)
L(m2e0):mul	v1
	add	%rax, w1
	adc	%rdx, w2
	mov	8(up,i,8), %rax
	mul	v0
	mov	$0, R32(w3)
	add	%rax, w1
	adc	%rdx, w2
	adc	$0, R32(w3)
	mov	8(up,i,8), %rax
L(m2e3):mul	v1
	add	%rax, w2
	mov	w1, 8(rp,i,8)
	adc	%rdx, w3
	mov	$0, R32(w0)
	mov	16(up,i,8), %rax
	mul	v0
	add	%rax, w2
	mov	16(up,i,8), %rax
	adc	%rdx, w3
	adc	$0, R32(w0)
L(m2e2):mul	v1
	mov	$0, R32(w1)		C FIXME: dead in last iteration
	add	%rax, w3
	mov	24(up,i,8), %rax
	mov	w2, 16(rp,i,8)
	adc	%rdx, w0		C FIXME: dead in last iteration
	add	$4, i
	js	L(m2tp)

C Wind-down of the first pass: only the low half of the last product is
C needed, so imul is used and no carry is propagated further.
L(m2ed):imul	v0, %rax
	add	w3, %rax
	mov	%rax, I(-8(rp),-8(rp,i,8))

C Step to the next two limbs of V.  n grows by 2 while up shrinks by 16 bytes,
C so (up,n,8) still addresses the same limb of U; rp is unchanged, so (rp,n,8)
C now addresses a limb two positions higher than before.
	add	$2, n
	lea	16(vp), vp
	lea	-16(up), up
	cmp	$-2, n
	jge	L(cor1)			C no room for a full two-limb pass

	push	%r14			C X0
	push	%r15			C X1

C Accumulating passes.  Each iteration of L(outer) handles two more limbs of V
C (v0, v1): limbs are loaded from rp, the products are added, and the sums are
C stored back.  The low two bits of n pick the entry point into L(top).
L(outer):
	mov	(vp), v0
	mov	8(vp), v1
	mov	(up,n,8), %rax
	mul	v0
	test	$1, R8(n)
	jnz	L(a1x1)

L(a1x0):mov	%rax, X1
	MOV(	%rdx, X0, 8)
	mov	(up,n,8), %rax
	mul	v1
	test	$2, R8(n)
	jnz	L(a110)

L(a100):lea	(n), i
	mov	(rp,n,8), w3
	mov	%rax, w0
	MOV(	%rdx, w1, 16)
	jmp	L(lo0)

L(a110):lea	2(n), i
	mov	(rp,n,8), w1
	mov	%rax, w2
	mov	8(up,n,8), %rax
	MOV(	%rdx, w3, 1)
	jmp	L(lo2)

L(a1x1):mov	%rax, X0
	MOV(	%rdx, X1, 2)
	mov	(up,n,8), %rax
	mul	v1
	test	$2, R8(n)
	jz	L(a111)

L(a101):lea	1(n), i
	MOV(	%rdx, w0, 4)
	mov	(rp,n,8), w2
	mov	%rax, w3
	jmp	L(lo1)

L(a111):lea	-1(n), i
	MOV(	%rdx, w2, 64)
	mov	%rax, w1
	mov	(rp,n,8), w0
	mov	8(up,n,8), %rax
	jmp	L(lo3)

C Inner loop of the accumulating pass, four limbs per iteration.  The loop
C continues until add $4, i produces a carry out (jnc), i.e. until the negative
C index i wraps past zero.
	ALIGNx
L(top):	mul	v1
	add	w0, w1
	adc	%rax, w2
	mov	-8(up,i,8), %rax
	MOV(	%rdx, w3, 1)
	adc	$0, w3
L(lo2):	mul	v0
	add	w1, X1
	mov	X1, -16(rp,i,8)
	adc	%rax, X0
	MOV(	%rdx, X1, 2)
	adc	$0, X1
	mov	-8(up,i,8), %rax
	mul	v1
	MOV(	%rdx, w0, 4)
	mov	-8(rp,i,8), w1
	add	w1, w2
	adc	%rax, w3
	adc	$0, w0
L(lo1):	mov	(up,i,8), %rax
	mul	v0
	add	w2, X0
	adc	%rax, X1
	mov	X0, -8(rp,i,8)
	MOV(	%rdx, X0, 8)
	adc	$0, X0
	mov	(up,i,8), %rax
	mov	(rp,i,8), w2
	mul	v1
	add	w2, w3
	adc	%rax, w0
	MOV(	%rdx, w1, 16)
	adc	$0, w1
L(lo0):	mov	8(up,i,8), %rax
	mul	v0
	add	w3, X1
	mov	X1, (rp,i,8)
	adc	%rax, X0
	MOV(	%rdx, X1, 32)
	mov	8(rp,i,8), w3
	adc	$0, X1
	mov	8(up,i,8), %rax
	mul	v1
	add	w3, w0
	MOV(	%rdx, w2, 64)
	adc	%rax, w1
	mov	16(up,i,8), %rax
	adc	$0, w2
L(lo3):	mul	v0
	add	w0, X0
	mov	X0, 8(rp,i,8)
	MOV(	%rdx, X0, 128)
	adc	%rax, X1
	mov	16(up,i,8), %rax
	mov	16(rp,i,8), w0
	adc	$0, X0
	add	$4, i
	jnc	L(top)

C Wind-down of an accumulating pass.  The two topmost products use imul (low
C half only).  The final value stored at -8(rp) is also left in rax, which
C L(cor0) relies on.
L(end):	imul	v1, %rax
	add	w0, w1
	adc	%rax, w2
	mov	I(-8(up),-8(up,i,8)), %rax
	imul	v0, %rax
	add	w1, X1
	mov	X1, I(-16(rp),-16(rp,i,8))
	adc	X0, %rax
	mov	I(-8(rp),-8(rp,i,8)), w1
	add	w1, w2
	add	w2, %rax
	mov	%rax, I(-8(rp),-8(rp,i,8))

C Same stepping as after the first pass; loop while n < -2.
	add	$2, n
	lea	16(vp), vp
	lea	-16(up), up
	cmp	$-2, n
	jl	L(outer)

	pop	%r15
	pop	%r14

C pop leaves the flags alone, so this still tests the cmp $-2, n above:
C n = -2 falls through to L(cor1), anything else goes to L(cor0).
	jnz	L(cor0)

C Two limbs of V remain.  They only affect the two topmost result limbs at
C -16(rp) and -8(rp): one full product plus two low-half products.
C %rbx and %rcx are used as plain scratch registers here.
L(cor1):mov	(vp), v0
	mov	8(vp), v1
	mov	-16(up), %rax
	mul	v0			C u0 x v2
	add	-16(rp), %rax		C FIXME: rp[0] still available in reg?
	adc	-8(rp), %rdx		C FIXME: rp[1] still available in reg?
	mov	-8(up), %rbx
	imul	v0, %rbx
	mov	-16(up), %rcx
	imul	v1, %rcx
	mov	%rax, -16(rp)
	add	%rbx, %rcx
	add	%rdx, %rcx
	mov	%rcx, -8(rp)
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret

C One limb of V remains.  It only affects the topmost result limb: the low
C half of (vp)[0] times the limb at -8(up) is added to rax, which still holds
C the value L(end) stored at -8(rp).
L(cor0):mov	(vp), %r11
	imul	-8(up), %r11
	add	%rax, %r11
	mov	%r11, -8(rp)
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret

C Small operands (n_param < 4).  Entered with rax = up[0], vp = vp_param and
C nothing pushed; only rax, rcx, rdx and r9-r11 are written.
	ALIGN(16)
L(small):
	cmp	$2, n_param
	jae	L(gt1)
C One limb: rp[0] = low half of up[0] * vp[0].
L(n1):	imul	(vp_param), %rax
	mov	%rax, (rp)
	FUNC_EXIT()
	ret
L(gt1):	ja	L(gt2)			C still the flags of cmp $2, n_param
C Two limbs.
L(n2):	mov	(vp_param), %r9
	mul	%r9			C u0 x v0
	mov	%rax, (rp)
	mov	8(up), %rax
	imul	%r9, %rax		C u1 x v0, low half
	add	%rax, %rdx
	mov	8(vp), %r9
	mov	(up), %rcx
	imul	%r9, %rcx		C u0 x v1, low half
	add	%rcx, %rdx
	mov	%rdx, 8(rp)
	FUNC_EXIT()
	ret
C Three limbs (the only remaining case below 4).
L(gt2):
L(n3):	mov	(vp_param), %r9
	mul	%r9		C u0 x v0
	mov	%rax, (rp)
	mov	%rdx, %r10
	mov	8(up), %rax
	mul	%r9		C u1 x v0
	imul	16(up), %r9	C u2 x v0
	add	%rax, %r10
	adc	%rdx, %r9
	mov	8(vp), %r11
	mov	(up), %rax
	mul	%r11		C u0 x v1
	add	%rax, %r10
	adc	%rdx, %r9
	imul	8(up), %r11	C u1 x v1
	add	%r11, %r9
	mov	%r10, 8(rp)
	mov	16(vp), %r10
	mov	(up), %rax
	imul	%rax, %r10	C u0 x v2
	add	%r10, %r9
	mov	%r9, 16(rp)
	FUNC_EXIT()
	ret
EPILOGUE()