1dnl X86-64 mpn_mul_basecase optimised for Intel Nehalem/Westmere. 2dnl It also seems good for Conroe/Wolfdale. 3 4dnl Contributed to the GNU project by Torbjörn Granlund. 5 6dnl Copyright 2008, 2011-2013 Free Software Foundation, Inc. 7 8dnl This file is part of the GNU MP Library. 9dnl 10dnl The GNU MP Library is free software; you can redistribute it and/or modify 11dnl it under the terms of either: 12dnl 13dnl * the GNU Lesser General Public License as published by the Free 14dnl Software Foundation; either version 3 of the License, or (at your 15dnl option) any later version. 16dnl 17dnl or 18dnl 19dnl * the GNU General Public License as published by the Free Software 20dnl Foundation; either version 2 of the License, or (at your option) any 21dnl later version. 22dnl 23dnl or both in parallel, as here. 24dnl 25dnl The GNU MP Library is distributed in the hope that it will be useful, but 26dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 27dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 28dnl for more details. 29dnl 30dnl You should have received copies of the GNU General Public License and the 31dnl GNU Lesser General Public License along with the GNU MP Library. If not, 32dnl see https://www.gnu.org/licenses/. 33 34include(`../config.m4') 35 36C cycles/limb mul_1 mul_2 mul_3 addmul_2 37C AMD K8,K9 38C AMD K10 39C AMD bull 40C AMD pile 41C AMD steam 42C AMD bobcat 43C AMD jaguar 44C Intel P4 45C Intel core 4.0 4.0 - 4.18-4.25 46C Intel NHM 3.75 3.8 - 4.06-4.2 47C Intel SBR 48C Intel IBR 49C Intel HWL 50C Intel BWL 51C Intel atom 52C VIA nano 53 54C The inner loops of this code are the result of running a code generation and 55C optimisation tool suite written by David Harvey and Torbjörn Granlund. 
C Code structure:
C
C Legend: m_1 is the mul_1 pass, m_2 the mul_2 pass and am_2 the addmul_2
C pass.  The (k m4) suffix presumably stands for a un that is k mod 4; it
C matches the four entry classes used by the dispatch code.
C
C
C               m_1(0m4)        m_1(1m4)        m_1(2m4)        m_1(3m4)
C                  |               |               |               |
C        m_2(0m4)  |     m_2(1m4)  |     m_2(2m4)  |     m_2(3m4)  |
C           |      /        |      /        |      /        |      /
C           |     /         |     /         |     /         |     /
C           |    /          |    /          |    /          |    /
C          \|/ |/_         \|/ |/_         \|/ |/_         \|/ |/_
C             _____           _____           _____           _____
C            /     \         /     \         /     \         /     \
C          \|/      |      \|/      |      \|/      |      \|/      |
C        am_2(0m4)  |    am_2(1m4)  |    am_2(2m4)  |    am_2(3m4)  |
C           \      /|\      \      /|\      \      /|\      \      /|\
C            \_____/         \_____/         \_____/         \_____/

C TODO
C  * Tune.  None done so far.
C  * Currently 2687 bytes, making it smaller would be nice.
C  * Implement some basecases, say for un < 4.
C  * Try zeroing with xor in m2 loops.
C  * Try re-rolling the m2 loops to avoid the current 9 insn code duplication
C    between loop header and wind-down code.
C  * Consider adc reg,reg instead of adc $0,reg in m2 loops.  This saves a byte.

C When playing with pointers, set this to $2 to fall back to conservative
C indexing in wind-down code.
C As defined here I(a,b) expands to its first argument, i.e. the wind-down
C stores use the form without the index register.
define(`I',`$1')

C Define this to $1 to use late loop index variable as zero, $2 to use an
C explicit $0.
C As defined here Z(i,$0) expands to the loop index register, so the
C wind-down code after the addmul_2 loops relies on that register being
C zero when the loop exits.
define(`Z',`$1')

C Incoming arguments.  These are the first four integer argument registers
C of the System V AMD64 convention, in order.
define(`rp', `%rdi')
define(`up', `%rsi')
define(`un_param', `%rdx')
define(`vp_param', `%rcx')	C FIXME reallocate vp to rcx but watch performance!
define(`vn_param', `%r8')	C fifth argument; the same register is reused as X0 below

define(`un', `%r9')		C holds -un_param for the whole function
define(`vn', `(%rsp)')		C stack slot; only valid after the push of vn_param at L(do_amN)

define(`v0', `%r10')		C current low limb of the vp operand
define(`v1', `%r11')		C current high limb of the vp operand (mul_2 and addmul_2 only)
define(`w0', `%rbx')
define(`w1', `%rcx')		C same register as vp_param; vp is copied to %r14 before w1 is written
define(`w2', `%rbp')		C also carries the addmul_2 entry address during the mul_1 pass
define(`w3', `%r12')
define(`i', `%r13')		C negative inner loop index, counts up towards zero
define(`vp', `%r14')

define(`X0', `%r8')		C same register as vn_param; vn_param is pushed before X0 is written
define(`X1', `%r15')		C pushed at L(do_amN), so only saved when an addmul_2 pass runs

C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

define(`ALIGNx', `ALIGN(16)')

C MOV(src, dst, id) emits either a mov or an equivalent lea, chosen per use
C site by testing the site id against the bit mask N.  With N = 85 (binary
C 01010101) the sites tagged 1, 4, 16 and 64 become lea and the sites tagged
C 2, 8, 32 and 128 become mov.  Both forms leave the flags untouched, which
C matters because MOV is used inside add/adc carry chains.
C The ifdef line below only supplies a default of 0 when N is undefined; as
C written N is always defined by the line before it.
define(`N', 85)
ifdef(`N',,`define(`N',0)')
define(`MOV', `ifelse(eval(N & $3),0,`mov	$1, $2',`lea	($1), $2')')

C mpn_mul_basecase
C
C In:   rp (%rdi)       destination limb pointer
C       up (%rsi)       first source operand, un_param limbs
C       un_param (%rdx) limb count of the up operand
C       vp_param (%rcx) second source operand, vn_param limbs
C       vn_param (%r8)  limb count of the vp operand
C
C Structure, as visible in the code below:
C   1. If vn_param is odd, one mul_1 pass stores up[] * vp[0] to rp[].
C      If vn_param is even, one mul_2 pass stores up[] * vp[0..1] to rp[].
C   2. The remaining vp limbs are consumed two at a time by addmul_2 passes
C      that add up[] * vp[k..k+1] into rp[], with rp advanced by two limbs
C      per pass.
C Every pass exists in four copies, one per residue of un_param mod 4; the
C copies differ only in where the 4-way unrolled loop is entered.
C
C Saved and restored: rbx, rbp, r12, r13, r14 always; r15 only on the paths
C that reach an addmul_2 pass.  No bounds or overlap checks are performed
C here; un_param >= 1 and vn_param >= 1 are assumed -- TODO confirm against
C the documented contract of the caller.

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_mul_basecase)
	FUNC_ENTRY(4)
C NOTE(review): FUNC_ENTRY and IFDOS come from config.m4, which is not
C visible here.  The IFDOS line fetches the fifth argument from the stack
C into the low half of vn_param for the DOS64 ABI.
IFDOS(`	mov	56(%rsp), %r8d	')
	mov	(up), %rax		C shared for mul_1 and mul_2
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14

	mov	(vp_param), v0		C shared for mul_1 and mul_2

	xor	un, un
	sub	un_param, un		C un = -un_param

C Bias both pointers to one past the last limb of the first row so that
C (up,un,8) and (rp,un,8) address limb 0 with the negative index.
	lea	(up,un_param,8), up
	lea	(rp,un_param,8), rp

	mul	v0			C shared for mul_1 and mul_2

C Odd vn_param starts with a mul_1 pass, even vn_param with a mul_2 pass.
	test	$1, R8(vn_param)
	jz	L(m2)

	lea	8(vp_param), vp		C FIXME: delay until known needed

C Dispatch on the two low bits of un (the negated count).  The m1sK label
C reached is the one with K = un_param mod 4.
	test	$1, R8(un)
	jnz	L(m1x1)

L(m1x0):test	$2, R8(un)
	jnz	L(m1s2)

C Each m1sK stub stores the low limb of up[0] * v0, keeps the high limb in
C w0 or w1, and loads into %rbp the address of the addmul_2 pass for the same
C size class.  The mul_1 loop does not touch %rbp.
L(m1s0):
	lea	(un), i
	mov	%rax, (rp,un,8)
	mov	8(up,un,8), %rax
	mov	%rdx, w0		C FIXME: Use lea?
	lea	L(do_am0)(%rip), %rbp
	jmp	L(m1e0)

L(m1s2):
	lea	2(un), i
	mov	%rax, (rp,un,8)
	mov	8(up,un,8), %rax
	mov	%rdx, w0		C FIXME: Use lea?
	mul	v0
	lea	L(do_am2)(%rip), %rbp
	test	i, i
	jnz	L(m1e2)
C i is zero, i.e. un_param is 2: finish the two-limb row here.  This path
C returns without looking at vn_param again.
	add	%rax, w0
	adc	$0, %rdx
	mov	w0, I(-8(rp),8(rp,un,8))
	mov	%rdx, I((rp),16(rp,un,8))
	jmp	L(ret2)

L(m1x1):test	$2, R8(un)
	jz	L(m1s3)

L(m1s1):
	lea	1(un), i
	mov	%rax, (rp,un,8)
	test	i, i
	jz	L(1)			C un_param is 1
	mov	8(up,un,8), %rax
	mov	%rdx, w1		C FIXME: Use lea?
	lea	L(do_am1)(%rip), %rbp
	jmp	L(m1e1)
C Single-limb up operand: store the high limb and return.  This path also
C returns without looking at vn_param again.
L(1):	mov	%rdx, I((rp),8(rp,un,8))
	jmp	L(ret2)

L(m1s3):
	lea	-1(un), i
	mov	%rax, (rp,un,8)
	mov	8(up,un,8), %rax
	mov	%rdx, w1		C FIXME: Use lea?
	lea	L(do_am3)(%rip), %rbp
	jmp	L(m1e3)

C mul_1 inner loop, four limbs per iteration.  w0 and w1 alternate as the
C limb being completed and the carry limb for the next position.
	ALIGNx
L(m1top):
	mul	v0
	mov	w1, -16(rp,i,8)
L(m1e2):xor	R32(w1), R32(w1)
	add	%rax, w0
	mov	(up,i,8), %rax
	adc	%rdx, w1
	mov	w0, -8(rp,i,8)
L(m1e1):xor	R32(w0), R32(w0)
	mul	v0
	add	%rax, w1
	mov	8(up,i,8), %rax
	adc	%rdx, w0
	mov	w1, (rp,i,8)
L(m1e0):xor	R32(w1), R32(w1)
	mul	v0
	add	%rax, w0
	mov	16(up,i,8), %rax
	adc	%rdx, w1
	mov	w0, 8(rp,i,8)
L(m1e3):xor	R32(w0), R32(w0)
	mul	v0
	add	%rax, w1
	mov	24(up,i,8), %rax
	adc	%rdx, w0
	add	$4, i
	js	L(m1top)

C mul_1 wind-down: last product and the top limb of the row.
	mul	v0
	mov	w1, I(-16(rp),-16(rp,i,8))
	add	%rax, w0
	adc	$0, %rdx
	mov	w0, I(-8(rp),-8(rp,i,8))
	mov	%rdx, I((rp),(rp,i,8))

	dec	vn_param
	jz	L(ret2)
C The addmul_2 outer loop adds 16 to rp on entry; backing up by 8 here makes
C the net advance one limb, matching the single v limb just consumed.
	lea	-8(rp), rp
	jmp	*%rbp

C Even vn_param: first pass multiplies by two v limbs at once.
L(m2):
	mov	8(vp_param), v1
	lea	16(vp_param), vp	C FIXME: delay until known needed

C Same dispatch as for mul_1.  The stubs are b00, b01, b10 and b11 for
C un_param mod 4 equal to 0, 1, 2 and 3, entering at m2e0, m2e1, m2e2 and
C m2e3 respectively.
	test	$1, R8(un)
	jnz	L(bx1)

L(bx0):	test	$2, R8(un)
	jnz	L(b10)

L(b00):	lea	(un), i
	mov	%rax, (rp,un,8)
	mov	%rdx, w1		C FIXME: Use lea?
	mov	(up,un,8), %rax
	mov	$0, R32(w2)
	jmp	L(m2e0)

L(b10):	lea	-2(un), i
	mov	%rax, w2		C FIXME: Use lea?
	mov	(up,un,8), %rax
	mov	%rdx, w3		C FIXME: Use lea?
	mov	$0, R32(w0)
	jmp	L(m2e2)

L(bx1):	test	$2, R8(un)
	jz	L(b11)

L(b01):	lea	1(un), i
	mov	%rax, (rp,un,8)
	mov	(up,un,8), %rax
	mov	%rdx, w0		C FIXME: Use lea?
	mov	$0, R32(w1)
	jmp	L(m2e1)

L(b11):	lea	-1(un), i
	mov	%rax, w1		C FIXME: Use lea?
	mov	(up,un,8), %rax
	mov	%rdx, w2		C FIXME: Use lea?
	mov	$0, R32(w3)
	jmp	L(m2e3)

C mul_2 inner loop, copy for un_param = 0 mod 4 (entered at m2e0).
C Each up limb is loaded twice, once for the product with v0 and once for
C the product with v1.  w0..w3 rotate as a window of partial sums; the
C mov $0 instructions sit between add and adc, which is safe because mov
C does not change the flags.
	ALIGNx
L(m2top0):
	mul	v0
	add	%rax, w3
	mov	-8(up,i,8), %rax
	mov	w3, -8(rp,i,8)
	adc	%rdx, w0
	adc	$0, R32(w1)
	mul	v1
	add	%rax, w0
	adc	%rdx, w1
	mov	$0, R32(w2)
	mov	(up,i,8), %rax
	mul	v0
	add	%rax, w0
	mov	w0, (rp,i,8)
	adc	%rdx, w1
	mov	(up,i,8), %rax
	adc	$0, R32(w2)
L(m2e0):mul	v1
	add	%rax, w1
	adc	%rdx, w2
	mov	8(up,i,8), %rax
	mul	v0
	mov	$0, R32(w3)
	add	%rax, w1
	adc	%rdx, w2
	adc	$0, R32(w3)
	mov	8(up,i,8), %rax
	mul	v1
	add	%rax, w2
	mov	w1, 8(rp,i,8)
	adc	%rdx, w3
	mov	$0, R32(w0)
	mov	16(up,i,8), %rax
	mul	v0
	add	%rax, w2
	mov	16(up,i,8), %rax
	adc	%rdx, w3
	adc	$0, R32(w0)
	mul	v1
	mov	$0, R32(w1)
	add	%rax, w3
	mov	24(up,i,8), %rax
	mov	w2, 16(rp,i,8)
	adc	%rdx, w0
	add	$4, i
	js	L(m2top0)

C mul_2 wind-down.  w1 was zeroed in the loop tail, so adc w1,w1 just
C captures the carry.
	mul	v0
	add	%rax, w3
	mov	I(-8(up),-8(up,i,8)), %rax
	mov	w3, I(-8(rp),-8(rp,i,8))
	adc	%rdx, w0
	adc	R32(w1), R32(w1)
	mul	v1
	add	%rax, w0
	adc	%rdx, w1
	mov	w0, I((rp),(rp,i,8))
	mov	w1, I(8(rp),8(rp,i,8))

	add	$-2, vn_param
	jz	L(ret2)

C addmul_2 phase, size class 0.  Also the target of the indirect jump taken
C after the mul_1 pass.  r15 is saved because X1 lives there; vn_param is
C pushed because its register is reused as X0, and the pushed copy is the
C vn stack slot from here on.
L(do_am0):
	push	%r15
	push	vn_param

C Outer loop: fetch the next two v limbs, advance vp and rp by two limbs,
C start the first products of the row, then jump into the unrolled loop.
L(olo0):
	mov	(vp), v0
	mov	8(vp), v1
	lea	16(vp), vp
	lea	16(rp), rp
	mov	(up,un,8), %rax
C	lea	0(un), i
	mov	un, i
	mul	v0
	mov	%rax, X0
	mov	(up,un,8), %rax
	MOV(	%rdx, X1, 2)
	mul	v1
	MOV(	%rdx, w0, 4)
	mov	(rp,un,8), w2
	mov	%rax, w3
	jmp	L(lo0)

C addmul_2 inner loop, four up limbs per iteration.  w0..w3 rotate through
C the loaded rp limbs and the v1 partial products; X0 and X1 take the v0
C partial products and hold the values stored back to rp.  The loop ends
C when adding 4 to the negative index carries out.
	ALIGNx
L(am2top0):
	mul	v1
	add	w0, w1
	adc	%rax, w2
	mov	(up,i,8), %rax
	MOV(	%rdx, w3, 1)
	adc	$0, w3
	mul	v0
	add	w1, X1
	mov	X1, -8(rp,i,8)
	adc	%rax, X0
	MOV(	%rdx, X1, 2)
	adc	$0, X1
	mov	(up,i,8), %rax
	mul	v1
	MOV(	%rdx, w0, 4)
	mov	(rp,i,8), w1
	add	w1, w2
	adc	%rax, w3
	adc	$0, w0
L(lo0):	mov	8(up,i,8), %rax
	mul	v0
	add	w2, X0
	adc	%rax, X1
	mov	X0, (rp,i,8)
	MOV(	%rdx, X0, 8)
	adc	$0, X0
	mov	8(up,i,8), %rax
	mov	8(rp,i,8), w2
	mul	v1
	add	w2, w3
	adc	%rax, w0
	MOV(	%rdx, w1, 16)
	adc	$0, w1
	mov	16(up,i,8), %rax
	mul	v0
	add	w3, X1
	mov	X1, 8(rp,i,8)
	adc	%rax, X0
	MOV(	%rdx, X1, 32)
	mov	16(rp,i,8), w3
	adc	$0, X1
	mov	16(up,i,8), %rax
	mul	v1
	add	w3, w0
	MOV(	%rdx, w2, 64)
	adc	%rax, w1
	mov	24(up,i,8), %rax
	adc	$0, w2
	mul	v0
	add	w0, X0
	mov	X0, 16(rp,i,8)
	MOV(	%rdx, X0, 128)
	adc	%rax, X1
	mov	24(up,i,8), %rax
	mov	24(rp,i,8), w0
	adc	$0, X0
	add	$4, i
	jnc	L(am2top0)

C addmul_2 wind-down.  With the Z define in effect the adc instructions use
C the index register as a zero operand, so they only fold in the carry.
	mul	v1
	add	w0, w1
	adc	%rax, w2
	adc	Z(i,$0), %rdx
	add	w1, X1
	adc	Z(i,$0), X0
	mov	X1, I(-8(rp),-8(rp,i,8))
	add	w2, X0
	mov	X0, I((rp),(rp,i,8))
	adc	Z(i,$0), %rdx
	mov	%rdx, I(8(rp),8(rp,i,8))

C Two v limbs consumed; the count lives in the stack slot and only its low
C 32 bits are updated.
	addl	$-2, vn
	jnz	L(olo0)

C L(ret) drops the vn stack slot (the value popped into rax is not used
C afterwards in this file) and restores r15.  L(ret2) is the exit for the
C paths that never pushed r15 and vn_param.
L(ret):	pop	%rax
	pop	%r15
L(ret2):pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret


C mul_2 inner loop, copy for un_param = 1 mod 4 (entered at m2e1).  Same
C instruction sequence as the class 0 copy; only the entry label moves.
	ALIGNx
L(m2top1):
	mul	v0
	add	%rax, w3
	mov	-8(up,i,8), %rax
	mov	w3, -8(rp,i,8)
	adc	%rdx, w0
	adc	$0, R32(w1)
L(m2e1):mul	v1
	add	%rax, w0
	adc	%rdx, w1
	mov	$0, R32(w2)
	mov	(up,i,8), %rax
	mul	v0
	add	%rax, w0
	mov	w0, (rp,i,8)
	adc	%rdx, w1
	mov	(up,i,8), %rax
	adc	$0, R32(w2)
	mul	v1
	add	%rax, w1
	adc	%rdx, w2
	mov	8(up,i,8), %rax
	mul	v0
	mov	$0, R32(w3)
	add	%rax, w1
	adc	%rdx, w2
	adc	$0, R32(w3)
	mov	8(up,i,8), %rax
	mul	v1
	add	%rax, w2
	mov	w1, 8(rp,i,8)
	adc	%rdx, w3
	mov	$0, R32(w0)
	mov	16(up,i,8), %rax
	mul	v0
	add	%rax, w2
	mov	16(up,i,8), %rax
	adc	%rdx, w3
	adc	$0, R32(w0)
	mul	v1
	mov	$0, R32(w1)
	add	%rax, w3
	mov	24(up,i,8), %rax
	mov	w2, 16(rp,i,8)
	adc	%rdx, w0
	add	$4, i
	js	L(m2top1)

C mul_2 wind-down, class 1.
	mul	v0
	add	%rax, w3
	mov	I(-8(up),-8(up,i,8)), %rax
	mov	w3, I(-8(rp),-8(rp,i,8))
	adc	%rdx, w0
	adc	R32(w1), R32(w1)
	mul	v1
	add	%rax, w0
	adc	%rdx, w1
	mov	w0, I((rp),(rp,i,8))
	mov	w1, I(8(rp),8(rp,i,8))

	add	$-2, vn_param
	jz	L(ret2)

C addmul_2 phase, size class 1.  Saves r15 and moves vn to the stack slot.
L(do_am1):
	push	%r15
	push	vn_param

L(olo1):
	mov	(vp), v0
	mov	8(vp), v1
	lea	16(vp), vp
	lea	16(rp), rp
	mov	(up,un,8), %rax
	lea	1(un), i
	mul	v0
	mov	%rax, X1
	MOV(	%rdx, X0, 128)
	mov	(up,un,8), %rax
	mov	(rp,un,8), w1
	mul	v1
	mov	%rax, w2
	mov	8(up,un,8), %rax
	MOV(	%rdx, w3, 1)
	jmp	L(lo1)

C addmul_2 inner loop, class 1 (entered at lo1).
	ALIGNx
L(am2top1):
	mul	v1
	add	w0, w1
	adc	%rax, w2
	mov	(up,i,8), %rax
	MOV(	%rdx, w3, 1)
	adc	$0, w3
L(lo1):	mul	v0
	add	w1, X1
	mov	X1, -8(rp,i,8)
	adc	%rax, X0
	MOV(	%rdx, X1, 2)
	adc	$0, X1
	mov	(up,i,8), %rax
	mul	v1
	MOV(	%rdx, w0, 4)
	mov	(rp,i,8), w1
	add	w1, w2
	adc	%rax, w3
	adc	$0, w0
	mov	8(up,i,8), %rax
	mul	v0
	add	w2, X0
	adc	%rax, X1
	mov	X0, (rp,i,8)
	MOV(	%rdx, X0, 8)
	adc	$0, X0
	mov	8(up,i,8), %rax
	mov	8(rp,i,8), w2
	mul	v1
	add	w2, w3
	adc	%rax, w0
	MOV(	%rdx, w1, 16)
	adc	$0, w1
	mov	16(up,i,8), %rax
	mul	v0
	add	w3, X1
	mov	X1, 8(rp,i,8)
	adc	%rax, X0
	MOV(	%rdx, X1, 32)
	mov	16(rp,i,8), w3
	adc	$0, X1
	mov	16(up,i,8), %rax
	mul	v1
	add	w3, w0
	MOV(	%rdx, w2, 64)
	adc	%rax, w1
	mov	24(up,i,8), %rax
	adc	$0, w2
	mul	v0
	add	w0, X0
	mov	X0, 16(rp,i,8)
	MOV(	%rdx, X0, 128)
	adc	%rax, X1
	mov	24(up,i,8), %rax
	mov	24(rp,i,8), w0
	adc	$0, X0
	add	$4, i
	jnc	L(am2top1)

C addmul_2 wind-down, class 1.
	mul	v1
	add	w0, w1
	adc	%rax, w2
	adc	Z(i,$0), %rdx
	add	w1, X1
	adc	Z(i,$0), X0
	mov	X1, I(-8(rp),-8(rp,i,8))
	add	w2, X0
	mov	X0, I((rp),(rp,i,8))
	adc	Z(i,$0), %rdx
	mov	%rdx, I(8(rp),8(rp,i,8))

	addl	$-2, vn
	jnz	L(olo1)

C Private copy of the exit sequence: drop the vn slot, restore registers.
	pop	%rax
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret


C mul_2 inner loop, copy for un_param = 2 mod 4 (entered at m2e2).
	ALIGNx
L(m2top2):
	mul	v0
	add	%rax, w3
	mov	-8(up,i,8), %rax
	mov	w3, -8(rp,i,8)
	adc	%rdx, w0
	adc	$0, R32(w1)
	mul	v1
	add	%rax, w0
	adc	%rdx, w1
	mov	$0, R32(w2)
	mov	(up,i,8), %rax
	mul	v0
	add	%rax, w0
	mov	w0, (rp,i,8)
	adc	%rdx, w1
	mov	(up,i,8), %rax
	adc	$0, R32(w2)
	mul	v1
	add	%rax, w1
	adc	%rdx, w2
	mov	8(up,i,8), %rax
	mul	v0
	mov	$0, R32(w3)
	add	%rax, w1
	adc	%rdx, w2
	adc	$0, R32(w3)
	mov	8(up,i,8), %rax
	mul	v1
	add	%rax, w2
	mov	w1, 8(rp,i,8)
	adc	%rdx, w3
	mov	$0, R32(w0)
	mov	16(up,i,8), %rax
	mul	v0
	add	%rax, w2
	mov	16(up,i,8), %rax
	adc	%rdx, w3
	adc	$0, R32(w0)
L(m2e2):mul	v1
	mov	$0, R32(w1)
	add	%rax, w3
	mov	24(up,i,8), %rax
	mov	w2, 16(rp,i,8)
	adc	%rdx, w0
	add	$4, i
	js	L(m2top2)

C mul_2 wind-down, class 2.
	mul	v0
	add	%rax, w3
	mov	I(-8(up),-8(up,i,8)), %rax
	mov	w3, I(-8(rp),-8(rp,i,8))
	adc	%rdx, w0
	adc	R32(w1), R32(w1)
	mul	v1
	add	%rax, w0
	adc	%rdx, w1
	mov	w0, I((rp),(rp,i,8))
	mov	w1, I(8(rp),8(rp,i,8))

	add	$-2, vn_param
	jz	L(ret2)

C addmul_2 phase, size class 2.  Saves r15 and moves vn to the stack slot.
L(do_am2):
	push	%r15
	push	vn_param

L(olo2):
	mov	(vp), v0
	mov	8(vp), v1
	lea	16(vp), vp
	lea	16(rp), rp
	mov	(up,un,8), %rax
	lea	-2(un), i
	mul	v0
	mov	%rax, X0
	MOV(	%rdx, X1, 32)
	mov	(up,un,8), %rax
	mov	(rp,un,8), w0
	mul	v1
	mov	%rax, w1
	lea	(%rdx), w2
	mov	8(up,un,8), %rax
	jmp	L(lo2)

C addmul_2 inner loop, class 2 (entered at lo2).
	ALIGNx
L(am2top2):
	mul	v1
	add	w0, w1
	adc	%rax, w2
	mov	(up,i,8), %rax
	MOV(	%rdx, w3, 1)
	adc	$0, w3
	mul	v0
	add	w1, X1
	mov	X1, -8(rp,i,8)
	adc	%rax, X0
	MOV(	%rdx, X1, 2)
	adc	$0, X1
	mov	(up,i,8), %rax
	mul	v1
	MOV(	%rdx, w0, 4)
	mov	(rp,i,8), w1
	add	w1, w2
	adc	%rax, w3
	adc	$0, w0
	mov	8(up,i,8), %rax
	mul	v0
	add	w2, X0
	adc	%rax, X1
	mov	X0, (rp,i,8)
	MOV(	%rdx, X0, 8)
	adc	$0, X0
	mov	8(up,i,8), %rax
	mov	8(rp,i,8), w2
	mul	v1
	add	w2, w3
	adc	%rax, w0
	MOV(	%rdx, w1, 16)
	adc	$0, w1
	mov	16(up,i,8), %rax
	mul	v0
	add	w3, X1
	mov	X1, 8(rp,i,8)
	adc	%rax, X0
	MOV(	%rdx, X1, 32)
	mov	16(rp,i,8), w3
	adc	$0, X1
	mov	16(up,i,8), %rax
	mul	v1
	add	w3, w0
	MOV(	%rdx, w2, 64)
	adc	%rax, w1
	mov	24(up,i,8), %rax
	adc	$0, w2
L(lo2):	mul	v0
	add	w0, X0
	mov	X0, 16(rp,i,8)
	MOV(	%rdx, X0, 128)
	adc	%rax, X1
	mov	24(up,i,8), %rax
	mov	24(rp,i,8), w0
	adc	$0, X0
	add	$4, i
	jnc	L(am2top2)

C addmul_2 wind-down, class 2.
	mul	v1
	add	w0, w1
	adc	%rax, w2
	adc	Z(i,$0), %rdx
	add	w1, X1
	adc	Z(i,$0), X0
	mov	X1, I(-8(rp),-8(rp,i,8))
	add	w2, X0
	mov	X0, I((rp),(rp,i,8))
	adc	Z(i,$0), %rdx
	mov	%rdx, I(8(rp),8(rp,i,8))

	addl	$-2, vn
	jnz	L(olo2)

C Private copy of the exit sequence: drop the vn slot, restore registers.
	pop	%rax
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret


C mul_2 inner loop, copy for un_param = 3 mod 4 (entered at m2e3).
	ALIGNx
L(m2top3):
	mul	v0
	add	%rax, w3
	mov	-8(up,i,8), %rax
	mov	w3, -8(rp,i,8)
	adc	%rdx, w0
	adc	$0, R32(w1)
	mul	v1
	add	%rax, w0
	adc	%rdx, w1
	mov	$0, R32(w2)
	mov	(up,i,8), %rax
	mul	v0
	add	%rax, w0
	mov	w0, (rp,i,8)
	adc	%rdx, w1
	mov	(up,i,8), %rax
	adc	$0, R32(w2)
	mul	v1
	add	%rax, w1
	adc	%rdx, w2
	mov	8(up,i,8), %rax
	mul	v0
	mov	$0, R32(w3)
	add	%rax, w1
	adc	%rdx, w2
	adc	$0, R32(w3)
	mov	8(up,i,8), %rax
L(m2e3):mul	v1
	add	%rax, w2
	mov	w1, 8(rp,i,8)
	adc	%rdx, w3
	mov	$0, R32(w0)
	mov	16(up,i,8), %rax
	mul	v0
	add	%rax, w2
	mov	16(up,i,8), %rax
	adc	%rdx, w3
	adc	$0, R32(w0)
	mul	v1
	mov	$0, R32(w1)
	add	%rax, w3
	mov	24(up,i,8), %rax
	mov	w2, 16(rp,i,8)
	adc	%rdx, w0
	add	$4, i
	js	L(m2top3)

C mul_2 wind-down, class 3.  This copy uses adc $0 where the other three
C use adc w1,w1; w1 is zero at this point, so the result is the same.
	mul	v0
	add	%rax, w3
	mov	I(-8(up),-8(up,i,8)), %rax
	mov	w3, I(-8(rp),-8(rp,i,8))
	adc	%rdx, w0
	adc	$0, R32(w1)
	mul	v1
	add	%rax, w0
	adc	%rdx, w1
	mov	w0, I((rp),(rp,i,8))
	mov	w1, I(8(rp),8(rp,i,8))

	add	$-2, vn_param
	jz	L(ret2)

C addmul_2 phase, size class 3.  Saves r15 and moves vn to the stack slot.
L(do_am3):
	push	%r15
	push	vn_param

L(olo3):
	mov	(vp), v0
	mov	8(vp), v1
	lea	16(vp), vp
	lea	16(rp), rp
	mov	(up,un,8), %rax
	lea	-1(un), i
	mul	v0
	mov	%rax, X1
	MOV(	%rdx, X0, 8)
	mov	(up,un,8), %rax
	mov	(rp,un,8), w3
	mul	v1
	mov	%rax, w0
	MOV(	%rdx, w1, 16)
	mov	8(up,un,8), %rax
	jmp	L(lo3)

C addmul_2 inner loop, class 3 (entered at lo3).
	ALIGNx
L(am2top3):
	mul	v1
	add	w0, w1
	adc	%rax, w2
	mov	(up,i,8), %rax
	MOV(	%rdx, w3, 1)
	adc	$0, w3
	mul	v0
	add	w1, X1
	mov	X1, -8(rp,i,8)
	adc	%rax, X0
	MOV(	%rdx, X1, 2)
	adc	$0, X1
	mov	(up,i,8), %rax
	mul	v1
	MOV(	%rdx, w0, 4)
	mov	(rp,i,8), w1
	add	w1, w2
	adc	%rax, w3
	adc	$0, w0
	mov	8(up,i,8), %rax
	mul	v0
	add	w2, X0
	adc	%rax, X1
	mov	X0, (rp,i,8)
	MOV(	%rdx, X0, 8)
	adc	$0, X0
	mov	8(up,i,8), %rax
	mov	8(rp,i,8), w2
	mul	v1
	add	w2, w3
	adc	%rax, w0
	MOV(	%rdx, w1, 16)
	adc	$0, w1
	mov	16(up,i,8), %rax
L(lo3):	mul	v0
	add	w3, X1
	mov	X1, 8(rp,i,8)
	adc	%rax, X0
	MOV(	%rdx, X1, 32)
	mov	16(rp,i,8), w3
	adc	$0, X1
	mov	16(up,i,8), %rax
	mul	v1
	add	w3, w0
	MOV(	%rdx, w2, 64)
	adc	%rax, w1
	mov	24(up,i,8), %rax
	adc	$0, w2
	mul	v0
	add	w0, X0
	mov	X0, 16(rp,i,8)
	MOV(	%rdx, X0, 128)
	adc	%rax, X1
	mov	24(up,i,8), %rax
	mov	24(rp,i,8), w0
	adc	$0, X0
	add	$4, i
	jnc	L(am2top3)

C addmul_2 wind-down, class 3.
	mul	v1
	add	w0, w1
	adc	%rax, w2
	adc	Z(i,$0), %rdx
	add	w1, X1
	adc	Z(i,$0), X0
	mov	X1, I(-8(rp),-8(rp,i,8))
	add	w2, X0
	mov	X0, I((rp),(rp,i,8))
	adc	Z(i,$0), %rdx
	mov	%rdx, I(8(rp),8(rp,i,8))

	addl	$-2, vn
	jnz	L(olo3)

C Private copy of the exit sequence: drop the vn slot, restore registers.
	pop	%rax
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret
EPILOGUE()