# $FreeBSD$
#
# Overview (review notes added next to the unchanged instruction stream).
#   The identification string at the end of this file decodes to
#   Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS.
#   The code looks machine-generated (raw .byte encodings, no symbolic
#   constants); presumably changes belong in the generator and not in this
#   file -- TODO confirm.
#
# Conventions visible in the code:
#   * AT&T operand order.  Arguments are taken from rdi, rsi, rdx, rcx, r8,
#     r9 and the seventh one is read from 8(%rsp) on entry (System V AMD64).
#   * The byte pair 0xf3,0xc3 used as the last instruction of each routine
#     is the encoding of rep ret.
#   * The five-byte .byte sequences starting with 102,72 or 102,73 are 64-bit
#     movq transfers between a general register and an xmm register; each
#     one is decoded in a trailing comment.
#   * A lone .byte 0x66 or .byte 0x67 is a prefix glued onto the instruction
#     that follows; presumably padding for code alignment -- TODO confirm.
#   * Table layout shared by the gather code and bn_scatter5: word j of
#     entry idx is stored at table + 256*j + 8*idx.
.text



# bn_mul_mont_gather5
#   Montgomery multiplication whose second operand is gathered, one word per
#   outer iteration, from a table with the layout written by bn_scatter5.
#   In:   rdi = result words, rsi = first operand words, rdx = table,
#         rcx = modulus words, r8 = pointer to the n0 word, r9d = word count,
#         stack word at 8(%rsp) = table index (32 bits are read).
#         The C-level parameter names are not visible here -- presumably
#         (rp, ap, table, np, n0, num, power); verify against the C header.
#   Out:  rax = 1.
#   Word counts that are a multiple of 8 are handed to the 4x code path.
#   NOTE(review): this path reads the modulus with an 8-byte stride, while
#   the 4x path reads it with a 16-byte stride, so the expected modulus
#   layout differs between the two paths -- confirm against the callers.
.globl	bn_mul_mont_gather5
.type	bn_mul_mont_gather5,@function
.align	64
bn_mul_mont_gather5:
	testl	$7,%r9d			# word count a multiple of 8?
	jnz	.Lmul_enter		# no: generic word-by-word path
	jmp	.Lmul4x_enter		# yes: 4x unrolled path

.align	16
.Lmul_enter:
	movl	%r9d,%r9d		# zero-extend the word count into r9
	movq	%rsp,%rax		# rax = stack pointer at entry
	movl	8(%rsp),%r10d		# r10 = table index (seventh argument)
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	leaq	2(%r9),%r11
	negq	%r11
	leaq	(%rsp,%r11,8),%rsp	# reserve num+2 words of temporary space
	andq	$-1024,%rsp		# round the frame down to a 1024-byte boundary

	movq	%rax,8(%rsp,%r9,8)	# slot num+1 keeps the entry stack pointer
.Lmul_body:
# Gather setup.  r11 = idx & 7 selects the word inside a 64-byte group and
# r10 = ~(idx >> 3) & 3 selects which of xmm4..xmm7 receives the single
# all-ones word of .Lmagic_masks; the other three masks are zero.  Together
# they pick the word at table + 8*(idx & 31) out of four candidate loads.
# NOTE(review): the load addresses below depend on idx & 7.  If the index is
# secret, this looks like the access pattern addressed upstream by the
# CVE-2016-0702 fix -- confirm, and consider regenerating this file.
	movq	%rdx,%r12
	movq	%r10,%r11
	shrq	$3,%r10
	andq	$7,%r11
	notq	%r10
	leaq	.Lmagic_masks(%rip),%rax
	andq	$3,%r10
	leaq	96(%r12,%r11,8),%r12
	movq	0(%rax,%r10,8),%xmm4
	movq	8(%rax,%r10,8),%xmm5
	movq	16(%rax,%r10,8),%xmm6
	movq	24(%rax,%r10,8),%xmm7

	movq	-96(%r12),%xmm0		# candidate from group 0
	movq	-32(%r12),%xmm1		# candidate from group 1
	pand	%xmm4,%xmm0
	movq	32(%r12),%xmm2		# candidate from group 2
	pand	%xmm5,%xmm1
	movq	96(%r12),%xmm3		# candidate from group 3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3
	por	%xmm2,%xmm0
	leaq	256(%r12),%r12		# advance to the next table row
	por	%xmm3,%xmm0

.byte	102,72,15,126,195		# movq %xmm0,%rbx: rbx = gathered word 0

	movq	(%r8),%r8		# r8 = n0 value
	movq	(%rsi),%rax		# rax = first operand word 0

	xorq	%r14,%r14		# r14 = outer loop counter
	xorq	%r15,%r15		# r15 = inner loop counter

# The gather of the next table word is interleaved with the first products.
	movq	-96(%r12),%xmm0
	movq	-32(%r12),%xmm1
	pand	%xmm4,%xmm0
	movq	32(%r12),%xmm2
	pand	%xmm5,%xmm1

	movq	%r8,%rbp
	mulq	%rbx
	movq	%rax,%r10
	movq	(%rcx),%rax

	movq	96(%r12),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3

	imulq	%r10,%rbp		# rbp = low product word * n0 (reduction multiplier)
	movq	%rdx,%r11

	por	%xmm2,%xmm0
	leaq	256(%r12),%r12
	por	%xmm3,%xmm0

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%r13

	leaq	1(%r15),%r15
	jmp	.L1st_enter

# First pass: one operand word and one modulus word per iteration; results
# are stored into the temporary vector at (%rsp).
.align	16
.L1st:
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r13
	movq	%r10,%r11
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

.L1st_enter:
	mulq	%rbx
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	leaq	1(%r15),%r15
	movq	%rdx,%r10

	mulq	%rbp
	cmpq	%r9,%r15
	jne	.L1st

.byte	102,72,15,126,195		# movq %xmm0,%rbx: next gathered word

	addq	%rax,%r13
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13
	movq	%r10,%r11

	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)	# top word of the temporary vector
	movq	%rdx,(%rsp,%r9,8)	# carry word above it

	leaq	1(%r14),%r14
	jmp	.Louter
# Outer loop: one gathered word per iteration, accumulated into the
# temporary vector.
.align	16
.Louter:
	xorq	%r15,%r15
	movq	%r8,%rbp
	movq	(%rsp),%r10

	movq	-96(%r12),%xmm0
	movq	-32(%r12),%xmm1
	pand	%xmm4,%xmm0
	movq	32(%r12),%xmm2
	pand	%xmm5,%xmm1

	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	movq	96(%r12),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3

	imulq	%r10,%rbp
	movq	%rdx,%r11

	por	%xmm2,%xmm0
	leaq	256(%r12),%r12
	por	%xmm3,%xmm0

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	8(%rsp),%r10
	movq	%rdx,%r13

	leaq	1(%r15),%r15
	jmp	.Linner_enter

.align	16
.Linner:
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	movq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

.Linner_enter:
	mulq	%rbx
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11
	leaq	1(%r15),%r15

	mulq	%rbp
	cmpq	%r9,%r15
	jne	.Linner

.byte	102,72,15,126,195		# movq %xmm0,%rbx: next gathered word

	addq	%rax,%r13
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	movq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)
	movq	%rdx,(%rsp,%r9,8)

	leaq	1(%r14),%r14
	cmpq	%r9,%r14
	jb	.Louter

# Final step: subtract the modulus from the temporary vector into the
# result buffer, then keep either that difference or the unsubtracted
# vector depending on the final borrow.
	xorq	%r14,%r14		# index = 0; also clears CF for the sbb chain
	movq	(%rsp),%rax
	leaq	(%rsp),%rsi
	movq	%r9,%r15
	jmp	.Lsub
.align	16
.Lsub:	sbbq	(%rcx,%r14,8),%rax
	movq	%rax,(%rdi,%r14,8)
	movq	8(%rsi,%r14,8),%rax
	leaq	1(%r14),%r14		# lea and dec leave CF untouched
	decq	%r15
	jnz	.Lsub

	sbbq	$0,%rax			# fold the borrow into the top carry word
	xorq	%r14,%r14
	andq	%rax,%rsi		# rsi = temporary vector address AND mask
	notq	%rax
	movq	%rdi,%rcx
	andq	%rax,%rcx		# rcx = result address AND inverted mask
	movq	%r9,%r15
	orq	%rcx,%rsi		# rsi = address of the vector to keep
.align	16
.Lcopy:
	movq	(%rsi,%r14,8),%rax
	movq	%r14,(%rsp,%r14,8)	# overwrite the temporary word with the loop index
	movq	%rax,(%rdi,%r14,8)
	leaq	1(%r14),%r14
	subq	$1,%r15
	jnz	.Lcopy

	movq	8(%rsp,%r9,8),%rsi	# rsi = stack pointer saved at entry
	movq	$1,%rax			# return value 1
	movq	-48(%rsi),%r15
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp
.Lmul_epilogue:
	.byte	0xf3,0xc3		# rep ret
.size	bn_mul_mont_gather5,.-bn_mul_mont_gather5
# bn_mul4x_mont_gather5 (no .globl; reached through .Lmul4x_enter)
#   Same register arguments as bn_mul_mont_gather5.  Builds a stack frame,
#   calls mul4x_internal and returns rax = 1.
.type	bn_mul4x_mont_gather5,@function
.align	32
bn_mul4x_mont_gather5:
.Lmul4x_enter:
.byte	0x67
	movq	%rsp,%rax		# rax = stack pointer at entry
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
.byte	0x67
	movl	%r9d,%r10d
	shll	$3,%r9d			# r9 = word count * 8
	shll	$3+2,%r10d		# r10 = word count * 32
	negq	%r9

# Frame placement.  The distance between the prospective frame and rsi is
# taken modulo 4096 and the frame is moved down by a derived amount.
# Presumably this keeps the frame from aliasing the input at rsi modulo the
# page size -- TODO confirm the exact intent with the generator source.
	leaq	-64(%rsp,%r9,2),%r11
	subq	%rsi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lmul4xsp_alt
	subq	%r11,%rsp
	leaq	-64(%rsp,%r9,2),%rsp
	jmp	.Lmul4xsp_done

.align	32
.Lmul4xsp_alt:
	leaq	4096-64(,%r9,2),%r10
	leaq	-64(%rsp,%r9,2),%rsp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11		# clamp the adjustment to zero on borrow
	subq	%r11,%rsp
.Lmul4xsp_done:
	andq	$-64,%rsp		# 64-byte aligned frame
	negq	%r9			# r9 = word count * 8 again

	movq	%rax,40(%rsp)		# frame slot 40 keeps the entry stack pointer
.Lmul4x_body:

	call	mul4x_internal

	movq	40(%rsp),%rsi
	movq	$1,%rax
	movq	-48(%rsi),%r15
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp
.Lmul4x_epilogue:
	.byte	0xf3,0xc3		# rep ret
.size	bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5

# mul4x_internal (file-local helper, not a C-callable function)
#   In:   rax = stack pointer of the public caller at its entry (the table
#         index is read from 8(%rax)), rdx = table, r9 = word count * 8,
#         rsi = first operand, rcx = modulus (read with a 16-byte stride),
#         r8 = pointer to the n0 word, rdi = result pointer.
#   Frame slots are written as N+8(%rsp) because the return address sits on
#   top of the frame built by the caller.
#   Leaves through a jump to .Lsqr4x_sub, which performs the final
#   subtraction and returns to the caller of this helper.
.type	mul4x_internal,@function
.align	32
mul4x_internal:
	shlq	$5,%r9
	movl	8(%rax),%r10d		# r10 = table index
	leaq	256(%rdx,%r9,1),%r13	# r13 = table pointer value that ends the outer loop
	shrq	$5,%r9
	movq	%r10,%r11
	shrq	$3,%r10
	andq	$7,%r11
	notq	%r10
	leaq	.Lmagic_masks(%rip),%rax
	andq	$3,%r10
	leaq	96(%rdx,%r11,8),%r12
	movq	0(%rax,%r10,8),%xmm4	# same mask selection as in bn_mul_mont_gather5
	movq	8(%rax,%r10,8),%xmm5
	addq	$7,%r11
	movq	16(%rax,%r10,8),%xmm6
	movq	24(%rax,%r10,8),%xmm7
	andq	$7,%r11			# r11 = (idx + 7) & 7, used below as a frame offset

	movq	-96(%r12),%xmm0
	leaq	256(%r12),%r14
	movq	-32(%r12),%xmm1
	pand	%xmm4,%xmm0
	movq	32(%r12),%xmm2
	pand	%xmm5,%xmm1
	movq	96(%r12),%xmm3
	pand	%xmm6,%xmm2
.byte	0x67
	por	%xmm1,%xmm0
	movq	-96(%r14),%xmm1
.byte	0x67
	pand	%xmm7,%xmm3
.byte	0x67
	por	%xmm2,%xmm0
	movq	-32(%r14),%xmm2
.byte	0x67
	pand	%xmm4,%xmm1
.byte	0x67
	por	%xmm3,%xmm0
	movq	32(%r14),%xmm3

.byte	102,72,15,126,195		# movq %xmm0,%rbx: rbx = gathered word 0
	movq	96(%r14),%xmm0
	movq	%r13,16+8(%rsp)		# save the table end marker
	movq	%rdi,56+8(%rsp)		# save the result pointer

	movq	(%r8),%r8		# r8 = n0 value
	movq	(%rsi),%rax
	leaq	(%rsi,%r9,1),%rsi	# rsi = end of the first operand
	negq	%r9			# r9 = negative byte count, used as a running index

	movq	%r8,%rbp
	mulq	%rbx
	movq	%rax,%r10
	movq	(%rcx),%rax

	pand	%xmm5,%xmm2
	pand	%xmm6,%xmm3
	por	%xmm2,%xmm1

	imulq	%r10,%rbp		# rbp = reduction multiplier for this row

	leaq	64+8(%rsp,%r11,8),%r14	# r14 = write pointer into the temporary vector
	movq	%rdx,%r11

	pand	%xmm7,%xmm0
	por	%xmm3,%xmm1
	leaq	512(%r12),%r12		# two table rows have been consumed
	por	%xmm1,%xmm0

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	16(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	32(%r9),%r15
	leaq	64(%rcx),%rcx
	adcq	$0,%rdx
	movq	%rdi,(%r14)
	movq	%rdx,%r13
	jmp	.L1st4x

# First row, four words per iteration; r15 counts bytes up to zero.
.align	32
.L1st4x:
	mulq	%rbx
	addq	%rax,%r10
	movq	-32(%rcx),%rax
	leaq	32(%r14),%r14
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-16(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%r14)
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	0(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	16(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	64(%rcx),%rcx
	adcq	$0,%rdx
	movq	%rdi,(%r14)
	movq	%rdx,%r13

	addq	$32,%r15
	jnz	.L1st4x

# Tail of the first row.
	mulq	%rbx
	addq	%rax,%r10
	movq	-32(%rcx),%rax
	leaq	32(%r14),%r14
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-16(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r9,1),%rax	# reload operand word 0 for the next row
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%r14)
	movq	%rdx,%r13

.byte	102,72,15,126,195		# movq %xmm0,%rbx: next gathered word
	leaq	(%rcx,%r9,2),%rcx	# rewind the modulus pointer (16 bytes per word)

	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi
	movq	%r13,-8(%r14)

	jmp	.Louter4x

# Remaining rows: gather the next table word while accumulating the current
# row into the temporary vector.
.align	32
.Louter4x:
	movq	(%r14,%r9,1),%r10
	movq	%r8,%rbp
	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	movq	-96(%r12),%xmm0
	movq	-32(%r12),%xmm1
	pand	%xmm4,%xmm0
	movq	32(%r12),%xmm2
	pand	%xmm5,%xmm1
	movq	96(%r12),%xmm3

	imulq	%r10,%rbp
.byte	0x67
	movq	%rdx,%r11
	movq	%rdi,(%r14)

	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3
	por	%xmm2,%xmm0
	leaq	(%r14,%r9,1),%r14	# rewind the temporary vector pointer
	leaq	256(%r12),%r12
	por	%xmm3,%xmm0

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	16(%rcx),%rax
	adcq	$0,%rdx
	addq	8(%r14),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	32(%r9),%r15
	leaq	64(%rcx),%rcx
	adcq	$0,%rdx
	movq	%rdx,%r13
	jmp	.Linner4x

.align	32
.Linner4x:
	mulq	%rbx
	addq	%rax,%r10
	movq	-32(%rcx),%rax
	adcq	$0,%rdx
	addq	16(%r14),%r10
	leaq	32(%r14),%r14
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%rdi,-32(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-16(%rcx),%rax
	adcq	$0,%rdx
	addq	-8(%r14),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%r13,-24(%r14)
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	0(%rcx),%rax
	adcq	$0,%rdx
	addq	(%r14),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%rdi,-16(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	16(%rcx),%rax
	adcq	$0,%rdx
	addq	8(%r14),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	64(%rcx),%rcx
	adcq	$0,%rdx
	movq	%r13,-8(%r14)
	movq	%rdx,%r13

	addq	$32,%r15
	jnz	.Linner4x

# Tail of the row.
	mulq	%rbx
	addq	%rax,%r10
	movq	-32(%rcx),%rax
	adcq	$0,%rdx
	addq	16(%r14),%r10
	leaq	32(%r14),%r14
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%rdi,-32(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	%rbp,%rax		# the multiplier moves to rax ...
	movq	-16(%rcx),%rbp		# ... and rbp takes the modulus word; it is compared after the loop
	adcq	$0,%rdx
	addq	-8(%r14),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%r13,-24(%r14)
	movq	%rdx,%r13

.byte	102,72,15,126,195		# movq %xmm0,%rbx: next gathered word
	movq	%rdi,-16(%r14)
	leaq	(%rcx,%r9,2),%rcx

	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi
	addq	(%r14),%r13
	adcq	$0,%rdi
	movq	%r13,-8(%r14)

	cmpq	16+8(%rsp),%r12		# all table rows consumed?
	jb	.Louter4x
# Set up the shared final subtraction.  rdi becomes 0 or 1 from the carry
# word and a compare of the top modulus word with the top result word, and
# is then used as a word offset into the modulus buffer.
# NOTE(review): with the 16-byte modulus stride, offset 1 presumably points
# at an interleaved padding word -- confirm against the caller layout.
	subq	%r13,%rbp
	adcq	%r15,%r15
	orq	%r15,%rdi
	xorq	$1,%rdi
	leaq	(%r14,%r9,1),%rbx	# rbx = start of the temporary vector
	leaq	(%rcx,%rdi,8),%rbp
	movq	%r9,%rcx
	sarq	$3+2,%rcx		# rcx = negative count of 4-word groups
	movq	56+8(%rsp),%rdi		# rdi = saved result pointer
	jmp	.Lsqr4x_sub
.size	mul4x_internal,.-mul4x_internal
# bn_power5
#   Same register arguments as bn_mul_mont_gather5.  Performs five calls to
#   __bn_sqr8x_internal followed by one gathered multiplication through
#   mul4x_internal.  Returns rax = 1.
.globl	bn_power5
.type	bn_power5,@function
.align	32
bn_power5:
	movq	%rsp,%rax
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	movl	%r9d,%r10d
	shll	$3,%r9d
	shll	$3+2,%r10d
	negq	%r9
	movq	(%r8),%r8		# r8 = n0 value

# Same frame placement sequence as in bn_mul4x_mont_gather5.
	leaq	-64(%rsp,%r9,2),%r11
	subq	%rsi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lpwr_sp_alt
	subq	%r11,%rsp
	leaq	-64(%rsp,%r9,2),%rsp
	jmp	.Lpwr_sp_done

.align	32
.Lpwr_sp_alt:
	leaq	4096-64(,%r9,2),%r10
	leaq	-64(%rsp,%r9,2),%rsp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rsp
.Lpwr_sp_done:
	andq	$-64,%rsp
	movq	%r9,%r10		# r10 = negative byte count
	negq	%r9			# r9 = positive byte count

	movq	%r8,32(%rsp)		# frame slot 32 = n0 value
	movq	%rax,40(%rsp)		# frame slot 40 = entry stack pointer
.Lpower5_body:
.byte	102,72,15,110,207		# movq %rdi,%xmm1: park the result pointer
.byte	102,72,15,110,209		# movq %rcx,%xmm2: park the modulus pointer
.byte	102,73,15,110,218		# movq %r10,%xmm3: park the negative byte count
.byte	102,72,15,110,226		# movq %rdx,%xmm4: park the table pointer

	call	__bn_sqr8x_internal
	call	__bn_sqr8x_internal
	call	__bn_sqr8x_internal
	call	__bn_sqr8x_internal
	call	__bn_sqr8x_internal

.byte	102,72,15,126,209		# movq %xmm2,%rcx: modulus pointer
.byte	102,72,15,126,226		# movq %xmm4,%rdx: table pointer
	movq	%rsi,%rdi		# result pointer for the multiply = current rsi
	movq	40(%rsp),%rax		# mul4x_internal reads the index from 8(%rax)
	leaq	32(%rsp),%r8		# pointer to the saved n0 value

	call	mul4x_internal

	movq	40(%rsp),%rsi
	movq	$1,%rax
	movq	-48(%rsi),%r15
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp
.Lpower5_epilogue:
	.byte	0xf3,0xc3		# rep ret
.size	bn_power5,.-bn_power5

# bn_sqr8x_internal / __bn_sqr8x_internal (hidden visibility)
#   Squaring followed by the reduction at sqr8x_reduction and the final
#   subtraction at .Lsqr4x_sub.
#   In (as set up by bn_power5): rsi = operand, r9 = byte count, r10 =
#   negative byte count, xmm1 = result pointer, xmm2 = modulus pointer,
#   xmm3 = negative byte count, n0 value in caller frame slot 32.
#   Out: r9 = byte count and r10 = negative byte count again; rdi and rsi
#   are reloaded from xmm1 before the subtraction.
#   The product is built in the caller frame starting at 48+8(%rsp).
.globl	bn_sqr8x_internal
.hidden	bn_sqr8x_internal
.type	bn_sqr8x_internal,@function
.align	32
bn_sqr8x_internal:
__bn_sqr8x_internal:

# Phase 1: products of distinct operand words, two multiplier words (r14
# and r15) per outer pass.
	leaq	32(%r10),%rbp
	leaq	(%rsi,%r9,1),%rsi	# rsi = end of the operand

	movq	%r9,%rcx

	movq	-32(%rsi,%rbp,1),%r14
	leaq	48+8(%rsp,%r9,2),%rdi
	movq	-24(%rsi,%rbp,1),%rax
	leaq	-32(%rdi,%rbp,1),%rdi
	movq	-16(%rsi,%rbp,1),%rbx
	movq	%rax,%r15

	mulq	%r14
	movq	%rax,%r10
	movq	%rbx,%rax
	movq	%rdx,%r11
	movq	%r10,-24(%rdi,%rbp,1)

	mulq	%r14
	addq	%rax,%r11
	movq	%rbx,%rax
	adcq	$0,%rdx
	movq	%r11,-16(%rdi,%rbp,1)
	movq	%rdx,%r10


	movq	-8(%rsi,%rbp,1),%rbx
	mulq	%r15
	movq	%rax,%r12
	movq	%rbx,%rax
	movq	%rdx,%r13

	leaq	(%rbp),%rcx
	mulq	%r14
	addq	%rax,%r10
	movq	%rbx,%rax
	movq	%rdx,%r11
	adcq	$0,%r11
	addq	%r12,%r10
	adcq	$0,%r11
	movq	%r10,-8(%rdi,%rcx,1)
	jmp	.Lsqr4x_1st

.align	32
.Lsqr4x_1st:
	movq	(%rsi,%rcx,1),%rbx
	mulq	%r15
	addq	%rax,%r13
	movq	%rbx,%rax
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%r14
	addq	%rax,%r11
	movq	%rbx,%rax
	movq	8(%rsi,%rcx,1),%rbx
	movq	%rdx,%r10
	adcq	$0,%r10
	addq	%r13,%r11
	adcq	$0,%r10


	mulq	%r15
	addq	%rax,%r12
	movq	%rbx,%rax
	movq	%r11,(%rdi,%rcx,1)
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%r14
	addq	%rax,%r10
	movq	%rbx,%rax
	movq	16(%rsi,%rcx,1),%rbx
	movq	%rdx,%r11
	adcq	$0,%r11
	addq	%r12,%r10
	adcq	$0,%r11

	mulq	%r15
	addq	%rax,%r13
	movq	%rbx,%rax
	movq	%r10,8(%rdi,%rcx,1)
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%r14
	addq	%rax,%r11
	movq	%rbx,%rax
	movq	24(%rsi,%rcx,1),%rbx
	movq	%rdx,%r10
	adcq	$0,%r10
	addq	%r13,%r11
	adcq	$0,%r10


	mulq	%r15
	addq	%rax,%r12
	movq	%rbx,%rax
	movq	%r11,16(%rdi,%rcx,1)
	movq	%rdx,%r13
	adcq	$0,%r13
	leaq	32(%rcx),%rcx

	mulq	%r14
	addq	%rax,%r10
	movq	%rbx,%rax
	movq	%rdx,%r11
	adcq	$0,%r11
	addq	%r12,%r10
	adcq	$0,%r11
	movq	%r10,-8(%rdi,%rcx,1)

	cmpq	$0,%rcx
	jne	.Lsqr4x_1st

	mulq	%r15
	addq	%rax,%r13
	leaq	16(%rbp),%rbp
	adcq	$0,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx

	movq	%r13,(%rdi)
	movq	%rdx,%r12
	movq	%rdx,8(%rdi)
	jmp	.Lsqr4x_outer

# Later passes add their products onto what is already stored.
.align	32
.Lsqr4x_outer:
	movq	-32(%rsi,%rbp,1),%r14
	leaq	48+8(%rsp,%r9,2),%rdi
	movq	-24(%rsi,%rbp,1),%rax
	leaq	-32(%rdi,%rbp,1),%rdi
	movq	-16(%rsi,%rbp,1),%rbx
	movq	%rax,%r15

	mulq	%r14
	movq	-24(%rdi,%rbp,1),%r10
	addq	%rax,%r10
	movq	%rbx,%rax
	adcq	$0,%rdx
	movq	%r10,-24(%rdi,%rbp,1)
	movq	%rdx,%r11

	mulq	%r14
	addq	%rax,%r11
	movq	%rbx,%rax
	adcq	$0,%rdx
	addq	-16(%rdi,%rbp,1),%r11
	movq	%rdx,%r10
	adcq	$0,%r10
	movq	%r11,-16(%rdi,%rbp,1)

	xorq	%r12,%r12

	movq	-8(%rsi,%rbp,1),%rbx
	mulq	%r15
	addq	%rax,%r12
	movq	%rbx,%rax
	adcq	$0,%rdx
	addq	-8(%rdi,%rbp,1),%r12
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%r14
	addq	%rax,%r10
	movq	%rbx,%rax
	adcq	$0,%rdx
	addq	%r12,%r10
	movq	%rdx,%r11
	adcq	$0,%r11
	movq	%r10,-8(%rdi,%rbp,1)

	leaq	(%rbp),%rcx
	jmp	.Lsqr4x_inner

.align	32
.Lsqr4x_inner:
	movq	(%rsi,%rcx,1),%rbx
	mulq	%r15
	addq	%rax,%r13
	movq	%rbx,%rax
	movq	%rdx,%r12
	adcq	$0,%r12
	addq	(%rdi,%rcx,1),%r13
	adcq	$0,%r12

.byte	0x67
	mulq	%r14
	addq	%rax,%r11
	movq	%rbx,%rax
	movq	8(%rsi,%rcx,1),%rbx
	movq	%rdx,%r10
	adcq	$0,%r10
	addq	%r13,%r11
	adcq	$0,%r10

	mulq	%r15
	addq	%rax,%r12
	movq	%r11,(%rdi,%rcx,1)
	movq	%rbx,%rax
	movq	%rdx,%r13
	adcq	$0,%r13
	addq	8(%rdi,%rcx,1),%r12
	leaq	16(%rcx),%rcx
	adcq	$0,%r13

	mulq	%r14
	addq	%rax,%r10
	movq	%rbx,%rax
	adcq	$0,%rdx
	addq	%r12,%r10
	movq	%rdx,%r11
	adcq	$0,%r11
	movq	%r10,-8(%rdi,%rcx,1)

	cmpq	$0,%rcx
	jne	.Lsqr4x_inner

.byte	0x67
	mulq	%r15
	addq	%rax,%r13
	adcq	$0,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx

	movq	%r13,(%rdi)
	movq	%rdx,%r12
	movq	%rdx,8(%rdi)

	addq	$16,%rbp
	jnz	.Lsqr4x_outer

# Last pass, covering the four top operand words.
	movq	-32(%rsi),%r14
	leaq	48+8(%rsp,%r9,2),%rdi
	movq	-24(%rsi),%rax
	leaq	-32(%rdi,%rbp,1),%rdi
	movq	-16(%rsi),%rbx
	movq	%rax,%r15

	mulq	%r14
	addq	%rax,%r10
	movq	%rbx,%rax
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%r14
	addq	%rax,%r11
	movq	%rbx,%rax
	movq	%r10,-24(%rdi)
	movq	%rdx,%r10
	adcq	$0,%r10
	addq	%r13,%r11
	movq	-8(%rsi),%rbx
	adcq	$0,%r10

	mulq	%r15
	addq	%rax,%r12
	movq	%rbx,%rax
	movq	%r11,-16(%rdi)
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%r14
	addq	%rax,%r10
	movq	%rbx,%rax
	movq	%rdx,%r11
	adcq	$0,%r11
	addq	%r12,%r10
	adcq	$0,%r11
	movq	%r10,-8(%rdi)

	mulq	%r15
	addq	%rax,%r13
	movq	-16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx

	movq	%r13,(%rdi)
	movq	%rdx,%r12
	movq	%rdx,8(%rdi)

	mulq	%rbx
	addq	$16,%rbp
	xorq	%r14,%r14
	subq	%r9,%rbp
	xorq	%r15,%r15

	addq	%r12,%rax
	adcq	$0,%rdx
	movq	%rax,8(%rdi)
	movq	%rdx,16(%rdi)
	movq	%r15,24(%rdi)

# Phase 2: walk the stored words from the bottom; each pair is doubled (the
# lea with scale 2 plus the bit shifted out of the previous word) and the
# square of one operand word (mulq %rax) is added.  r15 carries CF between
# groups as 0 or -1.
	movq	-16(%rsi,%rbp,1),%rax
	leaq	48+8(%rsp),%rdi
	xorq	%r10,%r10
	movq	8(%rdi),%r11

	leaq	(%r14,%r10,2),%r12
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r13
	shrq	$63,%r11
	orq	%r10,%r13
	movq	16(%rdi),%r10
	movq	%r11,%r14
	mulq	%rax
	negq	%r15
	movq	24(%rdi),%r11
	adcq	%rax,%r12
	movq	-8(%rsi,%rbp,1),%rax
	movq	%r12,(%rdi)
	adcq	%rdx,%r13

	leaq	(%r14,%r10,2),%rbx
	movq	%r13,8(%rdi)
	sbbq	%r15,%r15
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r8
	shrq	$63,%r11
	orq	%r10,%r8
	movq	32(%rdi),%r10
	movq	%r11,%r14
	mulq	%rax
	negq	%r15
	movq	40(%rdi),%r11
	adcq	%rax,%rbx
	movq	0(%rsi,%rbp,1),%rax
	movq	%rbx,16(%rdi)
	adcq	%rdx,%r8
	leaq	16(%rbp),%rbp
	movq	%r8,24(%rdi)
	sbbq	%r15,%r15
	leaq	64(%rdi),%rdi
	jmp	.Lsqr4x_shift_n_add

.align	32
.Lsqr4x_shift_n_add:
	leaq	(%r14,%r10,2),%r12
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r13
	shrq	$63,%r11
	orq	%r10,%r13
	movq	-16(%rdi),%r10
	movq	%r11,%r14
	mulq	%rax
	negq	%r15
	movq	-8(%rdi),%r11
	adcq	%rax,%r12
	movq	-8(%rsi,%rbp,1),%rax
	movq	%r12,-32(%rdi)
	adcq	%rdx,%r13

	leaq	(%r14,%r10,2),%rbx
	movq	%r13,-24(%rdi)
	sbbq	%r15,%r15
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r8
	shrq	$63,%r11
	orq	%r10,%r8
	movq	0(%rdi),%r10
	movq	%r11,%r14
	mulq	%rax
	negq	%r15
	movq	8(%rdi),%r11
	adcq	%rax,%rbx
	movq	0(%rsi,%rbp,1),%rax
	movq	%rbx,-16(%rdi)
	adcq	%rdx,%r8

	leaq	(%r14,%r10,2),%r12
	movq	%r8,-8(%rdi)
	sbbq	%r15,%r15
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r13
	shrq	$63,%r11
	orq	%r10,%r13
	movq	16(%rdi),%r10
	movq	%r11,%r14
	mulq	%rax
	negq	%r15
	movq	24(%rdi),%r11
	adcq	%rax,%r12
	movq	8(%rsi,%rbp,1),%rax
	movq	%r12,0(%rdi)
	adcq	%rdx,%r13

	leaq	(%r14,%r10,2),%rbx
	movq	%r13,8(%rdi)
	sbbq	%r15,%r15
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r8
	shrq	$63,%r11
	orq	%r10,%r8
	movq	32(%rdi),%r10
	movq	%r11,%r14
	mulq	%rax
	negq	%r15
	movq	40(%rdi),%r11
	adcq	%rax,%rbx
	movq	16(%rsi,%rbp,1),%rax
	movq	%rbx,16(%rdi)
	adcq	%rdx,%r8
	movq	%r8,24(%rdi)
	sbbq	%r15,%r15
	leaq	64(%rdi),%rdi
	addq	$32,%rbp
	jnz	.Lsqr4x_shift_n_add

	leaq	(%r14,%r10,2),%r12
.byte	0x67
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r13
	shrq	$63,%r11
	orq	%r10,%r13
	movq	-16(%rdi),%r10
	movq	%r11,%r14
	mulq	%rax
	negq	%r15
	movq	-8(%rdi),%r11
	adcq	%rax,%r12
	movq	-8(%rsi),%rax
	movq	%r12,-32(%rdi)
	adcq	%rdx,%r13

	leaq	(%r14,%r10,2),%rbx
	movq	%r13,-24(%rdi)
	sbbq	%r15,%r15
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r8
	shrq	$63,%r11
	orq	%r10,%r8
	mulq	%rax
	negq	%r15
	adcq	%rax,%rbx
	adcq	%rdx,%r8
	movq	%rbx,-16(%rdi)
	movq	%r8,-8(%rdi)
.byte	102,72,15,126,213		# movq %xmm2,%rbp: rbp = modulus pointer
# sqr8x_reduction
#   Reduction of the double-length value stored at 48+8(%rsp), eight words
#   per pass.  Also called directly by bn_from_mont8x.
#   In:  rbp = modulus (16-byte stride), r9 = byte count, xmm1 = result
#        pointer, xmm2 = modulus pointer, xmm3 = negative byte count,
#        n0 value in caller frame slot 32.
#   Caller frame slots 0 and 8 are used for the two end-of-buffer markers.
sqr8x_reduction:
	xorq	%rax,%rax
	leaq	(%rbp,%r9,2),%rcx
	leaq	48+8(%rsp,%r9,2),%rdx
	movq	%rcx,0+8(%rsp)		# end of the modulus buffer
	leaq	48+8(%rsp,%r9,1),%rdi
	movq	%rdx,8+8(%rsp)		# end of the double-length value
	negq	%r9
	jmp	.L8x_reduction_loop

.align	32
.L8x_reduction_loop:
	leaq	(%rdi,%r9,1),%rdi
.byte	0x66
	movq	0(%rdi),%rbx		# load eight words of the value into rbx, r9..r15
	movq	8(%rdi),%r9
	movq	16(%rdi),%r10
	movq	24(%rdi),%r11
	movq	32(%rdi),%r12
	movq	40(%rdi),%r13
	movq	48(%rdi),%r14
	movq	56(%rdi),%r15
	movq	%rax,(%rdx)		# store the carry kept from the previous pass
	leaq	64(%rdi),%rdi

.byte	0x67
	movq	%rbx,%r8
	imulq	32+8(%rsp),%rbx		# rbx = low word * n0 (reduction multiplier)
	movq	0(%rbp),%rax
	movl	$8,%ecx
	jmp	.L8x_reduce

.align	32
.L8x_reduce:
	mulq	%rbx
	movq	16(%rbp),%rax		# the low product word is not used further
	negq	%r8			# CF = (r8 != 0) stands in for the carry of the dropped word
	movq	%rdx,%r8
	adcq	$0,%r8

	mulq	%rbx
	addq	%rax,%r9
	movq	32(%rbp),%rax
	adcq	$0,%rdx
	addq	%r9,%r8
	movq	%rbx,48-8+8(%rsp,%rcx,8)	# remember this multiplier for the tail passes
	movq	%rdx,%r9
	adcq	$0,%r9

	mulq	%rbx
	addq	%rax,%r10
	movq	48(%rbp),%rax
	adcq	$0,%rdx
	addq	%r10,%r9
	movq	32+8(%rsp),%rsi		# rsi = n0 value
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r11
	movq	64(%rbp),%rax
	adcq	$0,%rdx
	imulq	%r8,%rsi		# rsi = multiplier for the next iteration
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%rbx
	addq	%rax,%r12
	movq	80(%rbp),%rax
	adcq	$0,%rdx
	addq	%r12,%r11
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r13
	movq	96(%rbp),%rax
	adcq	$0,%rdx
	addq	%r13,%r12
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r14
	movq	112(%rbp),%rax
	adcq	$0,%rdx
	addq	%r14,%r13
	movq	%rdx,%r14
	adcq	$0,%r14

	mulq	%rbx
	movq	%rsi,%rbx
	addq	%rax,%r15
	movq	0(%rbp),%rax
	adcq	$0,%rdx
	addq	%r15,%r14
	movq	%rdx,%r15
	adcq	$0,%r15

	decl	%ecx
	jnz	.L8x_reduce

	leaq	128(%rbp),%rbp		# next eight modulus words
	xorq	%rax,%rax
	movq	8+8(%rsp),%rdx
	cmpq	0+8(%rsp),%rbp		# modulus exhausted?
	jae	.L8x_no_tail

.byte	0x66
	addq	0(%rdi),%r8
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	sbbq	%rsi,%rsi		# rsi = 0 or -1, the saved carry

	movq	48+56+8(%rsp),%rbx	# first multiplier stored by .L8x_reduce
	movl	$8,%ecx
	movq	0(%rbp),%rax
	jmp	.L8x_tail

# Tail: apply the eight stored multipliers to the remaining modulus words.
.align	32
.L8x_tail:
	mulq	%rbx
	addq	%rax,%r8
	movq	16(%rbp),%rax
	movq	%r8,(%rdi)
	movq	%rdx,%r8
	adcq	$0,%r8

	mulq	%rbx
	addq	%rax,%r9
	movq	32(%rbp),%rax
	adcq	$0,%rdx
	addq	%r9,%r8
	leaq	8(%rdi),%rdi
	movq	%rdx,%r9
	adcq	$0,%r9

	mulq	%rbx
	addq	%rax,%r10
	movq	48(%rbp),%rax
	adcq	$0,%rdx
	addq	%r10,%r9
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r11
	movq	64(%rbp),%rax
	adcq	$0,%rdx
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%rbx
	addq	%rax,%r12
	movq	80(%rbp),%rax
	adcq	$0,%rdx
	addq	%r12,%r11
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r13
	movq	96(%rbp),%rax
	adcq	$0,%rdx
	addq	%r13,%r12
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r14
	movq	112(%rbp),%rax
	adcq	$0,%rdx
	addq	%r14,%r13
	movq	%rdx,%r14
	adcq	$0,%r14

	mulq	%rbx
	movq	48-16+8(%rsp,%rcx,8),%rbx	# next stored multiplier
	addq	%rax,%r15
	adcq	$0,%rdx
	addq	%r15,%r14
	movq	0(%rbp),%rax
	movq	%rdx,%r15
	adcq	$0,%r15

	decl	%ecx
	jnz	.L8x_tail

	leaq	128(%rbp),%rbp
	movq	8+8(%rsp),%rdx
	cmpq	0+8(%rsp),%rbp
	jae	.L8x_tail_done

	movq	48+56+8(%rsp),%rbx
	negq	%rsi			# restore the saved carry into CF
	movq	0(%rbp),%rax
	adcq	0(%rdi),%r8
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	sbbq	%rsi,%rsi

	movl	$8,%ecx
	jmp	.L8x_tail

.align	32
.L8x_tail_done:
	addq	(%rdx),%r8		# add the carry word stored at the end marker
	adcq	$0,%r9
	adcq	$0,%r10
	adcq	$0,%r11
	adcq	$0,%r12
	adcq	$0,%r13
	adcq	$0,%r14
	adcq	$0,%r15


	xorq	%rax,%rax

	negq	%rsi
.L8x_no_tail:
	adcq	0(%rdi),%r8
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	adcq	$0,%rax			# rax = carry out of this pass
	movq	-16(%rbp),%rcx		# rcx = last modulus word read
	xorq	%rsi,%rsi

.byte	102,72,15,126,213		# movq %xmm2,%rbp: rewind the modulus pointer

	movq	%r8,0(%rdi)
	movq	%r9,8(%rdi)
.byte	102,73,15,126,217		# movq %xmm3,%r9: r9 = negative byte count
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)
	movq	%r12,32(%rdi)
	movq	%r13,40(%rdi)
	movq	%r14,48(%rdi)
	movq	%r15,56(%rdi)
	leaq	64(%rdi),%rdi

	cmpq	%rdx,%rdi		# end of the double-length value reached?
	jb	.L8x_reduction_loop

# Same 0-or-1 modulus offset selection as at the end of mul4x_internal.
	subq	%r15,%rcx
	leaq	(%rdi,%r9,1),%rbx	# rbx = start of the upper half of the value
	adcq	%rsi,%rsi
	movq	%r9,%rcx
	orq	%rsi,%rax
.byte	102,72,15,126,207		# movq %xmm1,%rdi: result pointer
	xorq	$1,%rax
.byte	102,72,15,126,206		# movq %xmm1,%rsi: rsi = result pointer as well
	leaq	(%rbp,%rax,8),%rbp
	sarq	$3+2,%rcx		# rcx = negative count of 4-word groups
	jmp	.Lsqr4x_sub

# Shared final subtraction, four words per iteration.
#   In: rbx = source words, rbp = modulus pointer (16-byte stride),
#       rdi = destination, rcx = negative group count, r9 = negative byte
#       count.
#   CF on entry comes from the sarq just before each jump here.  inc and lea
#   leave CF alone, so the borrow chain survives across iterations.
.align	32
.Lsqr4x_sub:
.byte	0x66
	movq	0(%rbx),%r12
	movq	8(%rbx),%r13
	sbbq	0(%rbp),%r12
	movq	16(%rbx),%r14
	sbbq	16(%rbp),%r13
	movq	24(%rbx),%r15
	leaq	32(%rbx),%rbx
	sbbq	32(%rbp),%r14
	movq	%r12,0(%rdi)
	sbbq	48(%rbp),%r15
	leaq	64(%rbp),%rbp
	movq	%r13,8(%rdi)
	movq	%r14,16(%rdi)
	movq	%r15,24(%rdi)
	leaq	32(%rdi),%rdi

	incq	%rcx
	jnz	.Lsqr4x_sub
	movq	%r9,%r10		# r10 = negative byte count
	negq	%r9			# r9 = positive byte count
	.byte	0xf3,0xc3		# rep ret
.size	bn_sqr8x_internal,.-bn_sqr8x_internal
# bn_from_montgomery
#   Dispatch stub: when the word count in r9d is a multiple of 8 control
#   transfers to bn_from_mont8x, otherwise the routine returns eax = 0
#   without touching memory.
.globl	bn_from_montgomery
.type	bn_from_montgomery,@function
.align	32
bn_from_montgomery:
	testl	$7,%r9d
	jz	bn_from_mont8x
	xorl	%eax,%eax
	.byte	0xf3,0xc3		# rep ret
.size	bn_from_montgomery,.-bn_from_montgomery

# bn_from_mont8x (file-local)
#   Copies the input words at rsi into the lower half of a double-length
#   stack buffer, zeroes the upper half, runs sqr8x_reduction on it and then
#   clears the buffer.  Returns rax = 1.
#   In: rdi = result, rsi = input, rcx = modulus, r8 = pointer to the n0
#       word, r9d = word count.  rdx is not read in this routine.
.type	bn_from_mont8x,@function
.align	32
bn_from_mont8x:
.byte	0x67
	movq	%rsp,%rax
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
.byte	0x67
	movl	%r9d,%r10d
	shll	$3,%r9d
	shll	$3+2,%r10d
	negq	%r9
	movq	(%r8),%r8		# r8 = n0 value

# Same frame placement sequence as in bn_mul4x_mont_gather5.
	leaq	-64(%rsp,%r9,2),%r11
	subq	%rsi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lfrom_sp_alt
	subq	%r11,%rsp
	leaq	-64(%rsp,%r9,2),%rsp
	jmp	.Lfrom_sp_done

.align	32
.Lfrom_sp_alt:
	leaq	4096-64(,%r9,2),%r10
	leaq	-64(%rsp,%r9,2),%rsp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rsp
.Lfrom_sp_done:
	andq	$-64,%rsp
	movq	%r9,%r10		# r10 = negative byte count
	negq	%r9			# r9 = positive byte count

	movq	%r8,32(%rsp)		# frame slot 32 = n0 value
	movq	%rax,40(%rsp)		# frame slot 40 = entry stack pointer
.Lfrom_body:
	movq	%r9,%r11
	leaq	48(%rsp),%rax
	pxor	%xmm0,%xmm0
	jmp	.Lmul_by_1

# Copy 64 input bytes per iteration into the lower half of the buffer and
# write 64 zero bytes at the same offset in the upper half.
.align	32
.Lmul_by_1:
	movdqu	(%rsi),%xmm1
	movdqu	16(%rsi),%xmm2
	movdqu	32(%rsi),%xmm3
	movdqa	%xmm0,(%rax,%r9,1)
	movdqu	48(%rsi),%xmm4
	movdqa	%xmm0,16(%rax,%r9,1)
.byte	0x48,0x8d,0xb6,0x40,0x00,0x00,0x00	# leaq 64(%rsi),%rsi with a 32-bit displacement
	movdqa	%xmm1,(%rax)
	movdqa	%xmm0,32(%rax,%r9,1)
	movdqa	%xmm2,16(%rax)
	movdqa	%xmm0,48(%rax,%r9,1)
	movdqa	%xmm3,32(%rax)
	movdqa	%xmm4,48(%rax)
	leaq	64(%rax),%rax
	subq	$64,%r11
	jnz	.Lmul_by_1

.byte	102,72,15,110,207		# movq %rdi,%xmm1: park the result pointer
.byte	102,72,15,110,209		# movq %rcx,%xmm2: park the modulus pointer
.byte	0x67
	movq	%rcx,%rbp		# rbp = modulus pointer for the reduction
.byte	102,73,15,110,218		# movq %r10,%xmm3: park the negative byte count
	call	sqr8x_reduction

	pxor	%xmm0,%xmm0
	leaq	48(%rsp),%rax
	movq	40(%rsp),%rsi
	jmp	.Lfrom_mont_zero

# Clear the stack buffer: 64 bytes are zeroed per iteration while r9 drops
# by 32, so twice the byte count is cleared in total.
.align	32
.Lfrom_mont_zero:
	movdqa	%xmm0,0(%rax)
	movdqa	%xmm0,16(%rax)
	movdqa	%xmm0,32(%rax)
	movdqa	%xmm0,48(%rax)
	leaq	64(%rax),%rax
	subq	$32,%r9
	jnz	.Lfrom_mont_zero

	movq	$1,%rax
	movq	-48(%rsi),%r15
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp
.Lfrom_epilogue:
	.byte	0xf3,0xc3		# rep ret
.size	bn_from_mont8x,.-bn_from_mont8x
# bn_get_bits5
#   Returns in eax the 5-bit field that starts at bit offset esi of the
#   buffer at rdi.
#   The field is read through one 16-bit load.  When the offset within the
#   16-bit unit is above 11 the load address moves up one byte and the shift
#   count drops by 8, so the five bits always fit into the loaded halfword.
.globl	bn_get_bits5
.type	bn_get_bits5,@function
.align	16
bn_get_bits5:
	leaq	0(%rdi),%r10		# r10 = base
	leaq	1(%rdi),%r11		# r11 = base + 1 byte
	movl	%esi,%ecx
	shrl	$4,%esi			# esi = index of the 16-bit unit
	andl	$15,%ecx		# ecx = bit offset inside that unit
	leal	-8(%rcx),%eax
	cmpl	$11,%ecx
	cmovaq	%r11,%r10		# offset above 11: use base + 1 ...
	cmoval	%eax,%ecx		# ... and shift by offset - 8
	movzwl	(%r10,%rsi,2),%eax
	shrl	%cl,%eax
	andl	$31,%eax
	.byte	0xf3,0xc3		# rep ret
.size	bn_get_bits5,.-bn_get_bits5

# bn_scatter5
#   Stores esi words from rdi into the table at rdx: word j goes to
#   table + 8*rcx + 256*j.  Does nothing when esi is zero.
#   In: rdi = source words, esi = word count, rdx = table, rcx = entry index.
.globl	bn_scatter5
.type	bn_scatter5,@function
.align	16
bn_scatter5:
	cmpl	$0,%esi
	jz	.Lscatter_epilogue
	leaq	(%rdx,%rcx,8),%rdx	# column of this entry
.Lscatter:
	movq	(%rdi),%rax
	leaq	8(%rdi),%rdi
	movq	%rax,(%rdx)
	leaq	256(%rdx),%rdx		# next row
	subl	$1,%esi
	jnz	.Lscatter
.Lscatter_epilogue:
	.byte	0xf3,0xc3		# rep ret
.size	bn_scatter5,.-bn_scatter5

# bn_gather5
#   Reads entry ecx back out of a table written by bn_scatter5 into esi
#   words at rdi, using the same four-load mask selection as the multiply
#   routines.
#   In: rdi = destination, esi = word count, rdx = table, ecx = entry index.
#   NOTE(review): unlike bn_scatter5 there is no guard for a zero count; the
#   loop body runs before the count is tested, so callers must pass esi >= 1.
#   NOTE(review): load addresses depend on ecx & 7, as described at
#   .Lmul_body above.
.globl	bn_gather5
.type	bn_gather5,@function
.align	16
bn_gather5:
	movl	%ecx,%r11d
	shrl	$3,%ecx
	andq	$7,%r11			# r11 = idx & 7
	notl	%ecx
	leaq	.Lmagic_masks(%rip),%rax
	andl	$3,%ecx			# ecx = ~(idx >> 3) & 3
	leaq	128(%rdx,%r11,8),%rdx
	movq	0(%rax,%rcx,8),%xmm4
	movq	8(%rax,%rcx,8),%xmm5
	movq	16(%rax,%rcx,8),%xmm6
	movq	24(%rax,%rcx,8),%xmm7
	jmp	.Lgather
.align	16
.Lgather:
	movq	-128(%rdx),%xmm0
	movq	-64(%rdx),%xmm1
	pand	%xmm4,%xmm0
	movq	0(%rdx),%xmm2
	pand	%xmm5,%xmm1
	movq	64(%rdx),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3
.byte	0x67,0x67
	por	%xmm2,%xmm0
	leaq	256(%rdx),%rdx		# next row
	por	%xmm3,%xmm0

	movq	%xmm0,(%rdi)
	leaq	8(%rdi),%rdi
	subl	$1,%esi
	jnz	.Lgather
	.byte	0xf3,0xc3		# rep ret
.LSEH_end_bn_gather5:
.size	bn_gather5,.-bn_gather5
# Mask table for the gather code: eight 64-bit words of which only word 3
# is all-ones.  Loading four consecutive words starting at word 0..3 yields
# four masks with exactly one of them set.
.align	64
.Lmagic_masks:
.long	0,0, 0,0, 0,0, -1,-1
.long	0,0, 0,0, 0,0, 0,0
# NUL-terminated ASCII identification string:
# Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro@openssl.org>
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0