xref: /NextBSD/secure/lib/libcrypto/amd64/x86_64-mont5.S (revision 4557fabb34e865d7f40be64b39c9e34fa41dbb60)
1	# $FreeBSD$
2.text
3
4
5
# bn_mul_mont_gather5 -- Montgomery multiplication whose multiplier words are
# gathered from a scattered table (see the identification string at the end
# of this file).
#
# Registers on entry, as used by the code below:
#   %rdi   result pointer (words are stored through it in .Lsub and .Lcopy)
#   %rsi   first operand, read one 64-bit word at a time
#   %rdx   table base; the code advances through it 256 bytes per word
#   %rcx   modulus words (multiplied by %rbp, subtracted in .Lsub)
#   %r8    pointer; only the qword it points to is used
#   %r9d   word count
#   stack slot 8(%rsp): 32-bit table index
# Returns 1 in %rax.
# NOTE(review): presumably matches the OpenSSL C prototype
#   bn_mul_mont_gather5(rp, ap, table, np, n0, num, power) -- confirm.
#
# Word counts with the low three bits clear are handed to .Lmul4x_enter;
# anything else runs the one-word-at-a-time loops starting at .Lmul_enter.
6.globl	bn_mul_mont_gather5
7.type	bn_mul_mont_gather5,@function
8.align	64
9bn_mul_mont_gather5:
10	testl	$7,%r9d
11	jnz	.Lmul_enter
12	jmp	.Lmul4x_enter
13
14.align	16
15.Lmul_enter:
# Zero-extend the count, remember the incoming %rsp in %rax and fetch the
# table index from the first stack argument slot before anything is pushed.
16	movl	%r9d,%r9d
17	movq	%rsp,%rax
18	movl	8(%rsp),%r10d
19	pushq	%rbx
20	pushq	%rbp
21	pushq	%r12
22	pushq	%r13
23	pushq	%r14
24	pushq	%r15
# Reserve (count + 2) qwords of scratch below the saved registers and round
# %rsp down to a 1024-byte boundary.
25	leaq	2(%r9),%r11
26	negq	%r11
27	leaq	(%rsp,%r11,8),%rsp
28	andq	$-1024,%rsp
29
# Keep the incoming %rsp in scratch slot count+1; the epilogue reloads it.
30	movq	%rax,8(%rsp,%r9,8)
31.Lmul_body:
# Split the index: %r11 = index & 7 picks the qword inside a 64-byte line;
# %r10 = ~(index >> 3) & 3 positions a four-qword window over .Lmagic_masks
# so that exactly one of %xmm4..%xmm7 is all-ones and the rest are zero.
32	movq	%rdx,%r12
33	movq	%r10,%r11
34	shrq	$3,%r10
35	andq	$7,%r11
36	notq	%r10
37	leaq	.Lmagic_masks(%rip),%rax
38	andq	$3,%r10
39	leaq	96(%r12,%r11,8),%r12
40	movq	0(%rax,%r10,8),%xmm4
41	movq	8(%rax,%r10,8),%xmm5
42	movq	16(%rax,%r10,8),%xmm6
43	movq	24(%rax,%r10,8),%xmm7
44
# Gather multiplier word 0: four candidate qwords spaced 64 bytes apart are
# all loaded, masked and OR-ed together; %r12 then steps to the next row.
45	movq	-96(%r12),%xmm0
46	movq	-32(%r12),%xmm1
47	pand	%xmm4,%xmm0
48	movq	32(%r12),%xmm2
49	pand	%xmm5,%xmm1
50	movq	96(%r12),%xmm3
51	pand	%xmm6,%xmm2
52	por	%xmm1,%xmm0
53	pand	%xmm7,%xmm3
54	por	%xmm2,%xmm0
55	leaq	256(%r12),%r12
56	por	%xmm3,%xmm0
57
# Encoded movq %xmm0,%rbx: %rbx = gathered multiplier word.
58.byte	102,72,15,126,195
59
# %r8 = the qword %r8 pointed at; %rax = operand word 0.
60	movq	(%r8),%r8
61	movq	(%rsi),%rax
62
# %r14 = outer (multiplier word) counter, %r15 = inner (operand word) counter.
63	xorq	%r14,%r14
64	xorq	%r15,%r15
65
# The gather of the next multiplier word is interleaved with the first
# multiplications below.
66	movq	-96(%r12),%xmm0
67	movq	-32(%r12),%xmm1
68	pand	%xmm4,%xmm0
69	movq	32(%r12),%xmm2
70	pand	%xmm5,%xmm1
71
# %r10 = low half of operand[0] * multiplier word, %r11 = high half.
72	movq	%r8,%rbp
73	mulq	%rbx
74	movq	%rax,%r10
75	movq	(%rcx),%rax
76
77	movq	96(%r12),%xmm3
78	pand	%xmm6,%xmm2
79	por	%xmm1,%xmm0
80	pand	%xmm7,%xmm3
81
# %rbp = low 64 bits of %r10 * %r8; it multiplies every modulus word in this
# pass.
82	imulq	%r10,%rbp
83	movq	%rdx,%r11
84
85	por	%xmm2,%xmm0
86	leaq	256(%r12),%r12
87	por	%xmm3,%xmm0
88
# modulus[0] * %rbp is added to %r10; only the carry (%r13) is kept, the low
# word itself is never stored.
89	mulq	%rbp
90	addq	%rax,%r10
91	movq	8(%rsi),%rax
92	adcq	$0,%rdx
93	movq	%rdx,%r13
94
95	leaq	1(%r15),%r15
96	jmp	.L1st_enter
97
98.align	16
99.L1st:
# First pass, loop body: fold the modulus product (%rax:%rdx) and the operand
# product carried in %r11 into %r13, store the finished word two slots behind
# the running index, and carry on with %rdx.
100	addq	%rax,%r13
101	movq	(%rsi,%r15,8),%rax
102	adcq	$0,%rdx
103	addq	%r11,%r13
104	movq	%r10,%r11
105	adcq	$0,%rdx
106	movq	%r13,-16(%rsp,%r15,8)
107	movq	%rdx,%r13
108
109.L1st_enter:
# operand[j] * multiplier word, then modulus[j] * %rbp; the loop ends when
# the index reaches the word count in %r9.
110	mulq	%rbx
111	addq	%rax,%r11
112	movq	(%rcx,%r15,8),%rax
113	adcq	$0,%rdx
114	leaq	1(%r15),%r15
115	movq	%rdx,%r10
116
117	mulq	%rbp
118	cmpq	%r9,%r15
119	jne	.L1st
120
# Encoded movq %xmm0,%rbx: %rbx = next gathered multiplier word.
121.byte	102,72,15,126,195
122
# Drain the first pass: store the last two words and the top carry in
# scratch slots count-2, count-1 and count.
123	addq	%rax,%r13
124	movq	(%rsi),%rax
125	adcq	$0,%rdx
126	addq	%r11,%r13
127	adcq	$0,%rdx
128	movq	%r13,-16(%rsp,%r15,8)
129	movq	%rdx,%r13
130	movq	%r10,%r11
131
132	xorq	%rdx,%rdx
133	addq	%r11,%r13
134	adcq	$0,%rdx
135	movq	%r13,-8(%rsp,%r9,8)
136	movq	%rdx,(%rsp,%r9,8)
137
138	leaq	1(%r14),%r14
139	jmp	.Louter
140.align	16
141.Louter:
# One pass per remaining multiplier word.  Same shape as the first pass, but
# the previous scratch contents are added in (loaded through %r10).
142	xorq	%r15,%r15
143	movq	%r8,%rbp
144	movq	(%rsp),%r10
145
146	movq	-96(%r12),%xmm0
147	movq	-32(%r12),%xmm1
148	pand	%xmm4,%xmm0
149	movq	32(%r12),%xmm2
150	pand	%xmm5,%xmm1
151
152	mulq	%rbx
153	addq	%rax,%r10
154	movq	(%rcx),%rax
155	adcq	$0,%rdx
156
157	movq	96(%r12),%xmm3
158	pand	%xmm6,%xmm2
159	por	%xmm1,%xmm0
160	pand	%xmm7,%xmm3
161
162	imulq	%r10,%rbp
163	movq	%rdx,%r11
164
165	por	%xmm2,%xmm0
166	leaq	256(%r12),%r12
167	por	%xmm3,%xmm0
168
169	mulq	%rbp
170	addq	%rax,%r10
171	movq	8(%rsi),%rax
172	adcq	$0,%rdx
173	movq	8(%rsp),%r10
174	movq	%rdx,%r13
175
176	leaq	1(%r15),%r15
177	jmp	.Linner_enter
178
179.align	16
180.Linner:
181	addq	%rax,%r13
182	movq	(%rsi,%r15,8),%rax
183	adcq	$0,%rdx
184	addq	%r10,%r13
185	movq	(%rsp,%r15,8),%r10
186	adcq	$0,%rdx
187	movq	%r13,-16(%rsp,%r15,8)
188	movq	%rdx,%r13
189
190.Linner_enter:
191	mulq	%rbx
192	addq	%rax,%r11
193	movq	(%rcx,%r15,8),%rax
194	adcq	$0,%rdx
195	addq	%r11,%r10
196	movq	%rdx,%r11
197	adcq	$0,%r11
198	leaq	1(%r15),%r15
199
200	mulq	%rbp
201	cmpq	%r9,%r15
202	jne	.Linner
203
# Encoded movq %xmm0,%rbx: %rbx = next gathered multiplier word.
204.byte	102,72,15,126,195
205
206	addq	%rax,%r13
207	movq	(%rsi),%rax
208	adcq	$0,%rdx
209	addq	%r10,%r13
210	movq	(%rsp,%r15,8),%r10
211	adcq	$0,%rdx
212	movq	%r13,-16(%rsp,%r15,8)
213	movq	%rdx,%r13
214
215	xorq	%rdx,%rdx
216	addq	%r11,%r13
217	adcq	$0,%rdx
218	addq	%r10,%r13
219	adcq	$0,%rdx
220	movq	%r13,-8(%rsp,%r9,8)
221	movq	%rdx,(%rsp,%r9,8)
222
223	leaq	1(%r14),%r14
224	cmpq	%r9,%r14
225	jb	.Louter
226
# Final step: result = scratch - modulus, written to the result buffer.
# The xorq clears CF for the first sbbq; leaq and decq inside the loop leave
# CF untouched so the borrow chains across iterations.
227	xorq	%r14,%r14
228	movq	(%rsp),%rax
229	leaq	(%rsp),%rsi
230	movq	%r9,%r15
231	jmp	.Lsub
232.align	16
233.Lsub:	sbbq	(%rcx,%r14,8),%rax
234	movq	%rax,(%rdi,%r14,8)
235	movq	8(%rsi,%r14,8),%rax
236	leaq	1(%r14),%r14
237	decq	%r15
238	jnz	.Lsub
239
# %rax holds the top scratch word; subtracting the final borrow turns it into
# a select value.  %rsi becomes (scratch & %rax) | (result & ~%rax), i.e. the
# copy below reads from the scratch vector when %rax is all-ones and from the
# already-subtracted result when %rax is zero, without branching.
240	sbbq	$0,%rax
241	xorq	%r14,%r14
242	andq	%rax,%rsi
243	notq	%rax
244	movq	%rdi,%rcx
245	andq	%rax,%rcx
246	movq	%r9,%r15
247	orq	%rcx,%rsi
248.align	16
249.Lcopy:
# Copy the selected vector to the result and overwrite each scratch word
# with the loop index.
250	movq	(%rsi,%r14,8),%rax
251	movq	%r14,(%rsp,%r14,8)
252	movq	%rax,(%rdi,%r14,8)
253	leaq	1(%r14),%r14
254	subq	$1,%r15
255	jnz	.Lcopy
256
# Epilogue: reload the incoming %rsp saved at .Lmul_enter, return 1, and
# restore the six pushed registers from just below it.
257	movq	8(%rsp,%r9,8),%rsi
258	movq	$1,%rax
259	movq	-48(%rsi),%r15
260	movq	-40(%rsi),%r14
261	movq	-32(%rsi),%r13
262	movq	-24(%rsi),%r12
263	movq	-16(%rsi),%rbp
264	movq	-8(%rsi),%rbx
265	leaq	(%rsi),%rsp
266.Lmul_epilogue:
# Encoded rep ret.
267	.byte	0xf3,0xc3
268.size	bn_mul_mont_gather5,.-bn_mul_mont_gather5
# bn_mul4x_mont_gather5 -- frame setup and teardown around mul4x_internal.
# Entered through .Lmul4x_enter from bn_mul_mont_gather5 when the low three
# bits of the word count are clear, with the argument registers untouched.
# Returns 1 in %rax.
269.type	bn_mul4x_mont_gather5,@function
270.align	32
271bn_mul4x_mont_gather5:
272.Lmul4x_enter:
# The lone 0x67 bytes are address-size prefixes glued onto the following
# instruction.  NOTE(review): presumably code-alignment padding -- confirm.
273.byte	0x67
274	movq	%rsp,%rax
275	pushq	%rbx
276	pushq	%rbp
277	pushq	%r12
278	pushq	%r13
279	pushq	%r14
280	pushq	%r15
281.byte	0x67
# %r9 = -(count * 8), %r10 = count * 32.
282	movl	%r9d,%r10d
283	shll	$3,%r9d
284	shll	$3+2,%r10d
285	negq	%r9
286
287
288
289
290
291
292
293
# Size the frame at 64 + 2 * count * 8 bytes and choose its position from the
# distance to %rsi modulo 4096.
# NOTE(review): presumably keeps the scratch area from aliasing the operand
# within a page -- confirm against the generator script.
294	leaq	-64(%rsp,%r9,2),%r11
295	subq	%rsi,%r11
296	andq	$4095,%r11
297	cmpq	%r11,%r10
298	jb	.Lmul4xsp_alt
299	subq	%r11,%rsp
300	leaq	-64(%rsp,%r9,2),%rsp
301	jmp	.Lmul4xsp_done
302
303.align	32
304.Lmul4xsp_alt:
# Alternative placement: the extra displacement is clamped to zero when the
# subtraction borrows.
305	leaq	4096-64(,%r9,2),%r10
306	leaq	-64(%rsp,%r9,2),%rsp
307	subq	%r10,%r11
308	movq	$0,%r10
309	cmovcq	%r10,%r11
310	subq	%r11,%rsp
311.Lmul4xsp_done:
# Align the frame to 64 bytes and make %r9 positive again (count * 8).
312	andq	$-64,%rsp
313	negq	%r9
314
# Slot 40(%rsp) keeps the incoming %rsp; %rax still holds it for the callee.
315	movq	%rax,40(%rsp)
316.Lmul4x_body:
317
318	call	mul4x_internal
319
# Epilogue: return 1 and restore the pushed registers through the saved %rsp.
320	movq	40(%rsp),%rsi
321	movq	$1,%rax
322	movq	-48(%rsi),%r15
323	movq	-40(%rsi),%r14
324	movq	-32(%rsi),%r13
325	movq	-24(%rsi),%r12
326	movq	-16(%rsi),%rbp
327	movq	-8(%rsi),%rbx
328	leaq	(%rsi),%rsp
329.Lmul4x_epilogue:
# Encoded rep ret.
330	.byte	0xf3,0xc3
331.size	bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
332
# mul4x_internal -- shared multiply core, four words per loop iteration.
# Reached by call from bn_mul4x_mont_gather5 and bn_power5, so frame offsets
# below are written as N+8 to step over the return address.
#
# Expects:
#   %rax  stack pointer value at entry of the public function; the table
#         index is read from 8(%rax)
#   %rdx  table base, %rsi operand, %rcx modulus, %rdi result pointer
#   %r8   pointer to a qword that is loaded into %r8
#   %r9   byte length (word count * 8)
# Modulus words are read at a 16-byte stride in this routine (offsets 0, 16,
# 32, 48 per 64-byte step of %rcx).
# Leaves by jumping into .Lsqr4x_sub; the return instruction there returns to
# the caller of this routine.
333.type	mul4x_internal,@function
334.align	32
335mul4x_internal:
# %r13 = table + 256 + 32 * %r9: the bound that %r12 is compared against at
# the bottom of .Louter4x.  %r9 is restored right after.
336	shlq	$5,%r9
337	movl	8(%rax),%r10d
338	leaq	256(%rdx,%r9,1),%r13
339	shrq	$5,%r9
# Index split and mask selection, same scheme as in bn_mul_mont_gather5:
# %r11 = index & 7, %r10 = ~(index >> 3) & 3 selects the mask window.
340	movq	%r10,%r11
341	shrq	$3,%r10
342	andq	$7,%r11
343	notq	%r10
344	leaq	.Lmagic_masks(%rip),%rax
345	andq	$3,%r10
346	leaq	96(%rdx,%r11,8),%r12
347	movq	0(%rax,%r10,8),%xmm4
348	movq	8(%rax,%r10,8),%xmm5
# %r11 = (%r11 + 7) & 7; used further down to offset the scratch pointer.
349	addq	$7,%r11
350	movq	16(%rax,%r10,8),%xmm6
351	movq	24(%rax,%r10,8),%xmm7
352	andq	$7,%r11
353
# Gather multiplier word 0 from the row at %r12 and start gathering word 1
# from the following row (%r14 = %r12 + 256).
354	movq	-96(%r12),%xmm0
355	leaq	256(%r12),%r14
356	movq	-32(%r12),%xmm1
357	pand	%xmm4,%xmm0
358	movq	32(%r12),%xmm2
359	pand	%xmm5,%xmm1
360	movq	96(%r12),%xmm3
361	pand	%xmm6,%xmm2
362.byte	0x67
363	por	%xmm1,%xmm0
364	movq	-96(%r14),%xmm1
365.byte	0x67
366	pand	%xmm7,%xmm3
367.byte	0x67
368	por	%xmm2,%xmm0
369	movq	-32(%r14),%xmm2
370.byte	0x67
371	pand	%xmm4,%xmm1
372.byte	0x67
373	por	%xmm3,%xmm0
374	movq	32(%r14),%xmm3
375
# Encoded movq %xmm0,%rbx: %rbx = multiplier word 0.
376.byte	102,72,15,126,195
377	movq	96(%r14),%xmm0
# Spill the table bound and the result pointer into the frame.
378	movq	%r13,16+8(%rsp)
379	movq	%rdi,56+8(%rsp)
380
# %r8 = qword at (%r8); %rsi is moved to the end of the operand and %r9 is
# negated so that (%rsi,%r9) addresses word 0.
381	movq	(%r8),%r8
382	movq	(%rsi),%rax
383	leaq	(%rsi,%r9,1),%rsi
384	negq	%r9
385
386	movq	%r8,%rbp
387	mulq	%rbx
388	movq	%rax,%r10
389	movq	(%rcx),%rax
390
391	pand	%xmm5,%xmm2
392	pand	%xmm6,%xmm3
393	por	%xmm2,%xmm1
394
# %rbp = low 64 bits of %r10 * %r8; multiplies the modulus words in this pass.
395	imulq	%r10,%rbp
396
397
398
399
400
401
402
# %r14 = scratch write pointer; its start inside the frame depends on %r11.
403	leaq	64+8(%rsp,%r11,8),%r14
404	movq	%rdx,%r11
405
406	pand	%xmm7,%xmm0
407	por	%xmm3,%xmm1
408	leaq	512(%r12),%r12
409	por	%xmm1,%xmm0
410
411	mulq	%rbp
412	addq	%rax,%r10
413	movq	8(%rsi,%r9,1),%rax
414	adcq	$0,%rdx
415	movq	%rdx,%rdi
416
417	mulq	%rbx
418	addq	%rax,%r11
419	movq	16(%rcx),%rax
420	adcq	$0,%rdx
421	movq	%rdx,%r10
422
423	mulq	%rbp
424	addq	%rax,%rdi
425	movq	16(%rsi,%r9,1),%rax
426	adcq	$0,%rdx
427	addq	%r11,%rdi
# %r15 = running byte offset, counts up from %r9 + 32 towards zero.
428	leaq	32(%r9),%r15
429	leaq	64(%rcx),%rcx
430	adcq	$0,%rdx
431	movq	%rdi,(%r14)
432	movq	%rdx,%r13
433	jmp	.L1st4x
434
435.align	32
436.L1st4x:
# First pass, four operand words per iteration.  Each word costs one mulq by
# %rbx (operand * multiplier word) and one mulq by %rbp (modulus product);
# the carries alternate between %r10/%r11 and %r13/%rdi.
437	mulq	%rbx
438	addq	%rax,%r10
439	movq	-32(%rcx),%rax
440	leaq	32(%r14),%r14
441	adcq	$0,%rdx
442	movq	%rdx,%r11
443
444	mulq	%rbp
445	addq	%rax,%r13
446	movq	-8(%rsi,%r15,1),%rax
447	adcq	$0,%rdx
448	addq	%r10,%r13
449	adcq	$0,%rdx
450	movq	%r13,-24(%r14)
451	movq	%rdx,%rdi
452
453	mulq	%rbx
454	addq	%rax,%r11
455	movq	-16(%rcx),%rax
456	adcq	$0,%rdx
457	movq	%rdx,%r10
458
459	mulq	%rbp
460	addq	%rax,%rdi
461	movq	(%rsi,%r15,1),%rax
462	adcq	$0,%rdx
463	addq	%r11,%rdi
464	adcq	$0,%rdx
465	movq	%rdi,-16(%r14)
466	movq	%rdx,%r13
467
468	mulq	%rbx
469	addq	%rax,%r10
470	movq	0(%rcx),%rax
471	adcq	$0,%rdx
472	movq	%rdx,%r11
473
474	mulq	%rbp
475	addq	%rax,%r13
476	movq	8(%rsi,%r15,1),%rax
477	adcq	$0,%rdx
478	addq	%r10,%r13
479	adcq	$0,%rdx
480	movq	%r13,-8(%r14)
481	movq	%rdx,%rdi
482
483	mulq	%rbx
484	addq	%rax,%r11
485	movq	16(%rcx),%rax
486	adcq	$0,%rdx
487	movq	%rdx,%r10
488
489	mulq	%rbp
490	addq	%rax,%rdi
491	movq	16(%rsi,%r15,1),%rax
492	adcq	$0,%rdx
493	addq	%r11,%rdi
494	leaq	64(%rcx),%rcx
495	adcq	$0,%rdx
496	movq	%rdi,(%r14)
497	movq	%rdx,%r13
498
499	addq	$32,%r15
500	jnz	.L1st4x
501
# Tail of the first pass: the last two words, then reload operand word 0.
502	mulq	%rbx
503	addq	%rax,%r10
504	movq	-32(%rcx),%rax
505	leaq	32(%r14),%r14
506	adcq	$0,%rdx
507	movq	%rdx,%r11
508
509	mulq	%rbp
510	addq	%rax,%r13
511	movq	-8(%rsi),%rax
512	adcq	$0,%rdx
513	addq	%r10,%r13
514	adcq	$0,%rdx
515	movq	%r13,-24(%r14)
516	movq	%rdx,%rdi
517
518	mulq	%rbx
519	addq	%rax,%r11
520	movq	-16(%rcx),%rax
521	adcq	$0,%rdx
522	movq	%rdx,%r10
523
524	mulq	%rbp
525	addq	%rax,%rdi
526	movq	(%rsi,%r9,1),%rax
527	adcq	$0,%rdx
528	addq	%r11,%rdi
529	adcq	$0,%rdx
530	movq	%rdi,-16(%r14)
531	movq	%rdx,%r13
532
# Encoded movq %xmm0,%rbx: %rbx = next multiplier word.  %rcx is rewound to
# the start of the modulus (%r9 is negative here).
533.byte	102,72,15,126,195
534	leaq	(%rcx,%r9,2),%rcx
535
# %rdi = top carry of this pass.
536	xorq	%rdi,%rdi
537	addq	%r10,%r13
538	adcq	$0,%rdi
539	movq	%r13,-8(%r14)
540
541	jmp	.Louter4x
542
543.align	32
544.Louter4x:
# One pass per remaining multiplier word: same structure as the first pass,
# with the previous scratch words added in.  The gather of the following
# multiplier word is interleaved with the arithmetic.
545	movq	(%r14,%r9,1),%r10
546	movq	%r8,%rbp
547	mulq	%rbx
548	addq	%rax,%r10
549	movq	(%rcx),%rax
550	adcq	$0,%rdx
551
552	movq	-96(%r12),%xmm0
553	movq	-32(%r12),%xmm1
554	pand	%xmm4,%xmm0
555	movq	32(%r12),%xmm2
556	pand	%xmm5,%xmm1
557	movq	96(%r12),%xmm3
558
559	imulq	%r10,%rbp
560.byte	0x67
561	movq	%rdx,%r11
# Park the top carry of the previous pass in the word above the scratch
# vector before %r14 is rewound.
562	movq	%rdi,(%r14)
563
564	pand	%xmm6,%xmm2
565	por	%xmm1,%xmm0
566	pand	%xmm7,%xmm3
567	por	%xmm2,%xmm0
568	leaq	(%r14,%r9,1),%r14
569	leaq	256(%r12),%r12
570	por	%xmm3,%xmm0
571
572	mulq	%rbp
573	addq	%rax,%r10
574	movq	8(%rsi,%r9,1),%rax
575	adcq	$0,%rdx
576	movq	%rdx,%rdi
577
578	mulq	%rbx
579	addq	%rax,%r11
580	movq	16(%rcx),%rax
581	adcq	$0,%rdx
582	addq	8(%r14),%r11
583	adcq	$0,%rdx
584	movq	%rdx,%r10
585
586	mulq	%rbp
587	addq	%rax,%rdi
588	movq	16(%rsi,%r9,1),%rax
589	adcq	$0,%rdx
590	addq	%r11,%rdi
591	leaq	32(%r9),%r15
592	leaq	64(%rcx),%rcx
593	adcq	$0,%rdx
594	movq	%rdx,%r13
595	jmp	.Linner4x
596
597.align	32
598.Linner4x:
# Results are stored one word late: each store writes the word finished in
# the previous step (note the -32/-24/-16/-8 offsets).
599	mulq	%rbx
600	addq	%rax,%r10
601	movq	-32(%rcx),%rax
602	adcq	$0,%rdx
603	addq	16(%r14),%r10
604	leaq	32(%r14),%r14
605	adcq	$0,%rdx
606	movq	%rdx,%r11
607
608	mulq	%rbp
609	addq	%rax,%r13
610	movq	-8(%rsi,%r15,1),%rax
611	adcq	$0,%rdx
612	addq	%r10,%r13
613	adcq	$0,%rdx
614	movq	%rdi,-32(%r14)
615	movq	%rdx,%rdi
616
617	mulq	%rbx
618	addq	%rax,%r11
619	movq	-16(%rcx),%rax
620	adcq	$0,%rdx
621	addq	-8(%r14),%r11
622	adcq	$0,%rdx
623	movq	%rdx,%r10
624
625	mulq	%rbp
626	addq	%rax,%rdi
627	movq	(%rsi,%r15,1),%rax
628	adcq	$0,%rdx
629	addq	%r11,%rdi
630	adcq	$0,%rdx
631	movq	%r13,-24(%r14)
632	movq	%rdx,%r13
633
634	mulq	%rbx
635	addq	%rax,%r10
636	movq	0(%rcx),%rax
637	adcq	$0,%rdx
638	addq	(%r14),%r10
639	adcq	$0,%rdx
640	movq	%rdx,%r11
641
642	mulq	%rbp
643	addq	%rax,%r13
644	movq	8(%rsi,%r15,1),%rax
645	adcq	$0,%rdx
646	addq	%r10,%r13
647	adcq	$0,%rdx
648	movq	%rdi,-16(%r14)
649	movq	%rdx,%rdi
650
651	mulq	%rbx
652	addq	%rax,%r11
653	movq	16(%rcx),%rax
654	adcq	$0,%rdx
655	addq	8(%r14),%r11
656	adcq	$0,%rdx
657	movq	%rdx,%r10
658
659	mulq	%rbp
660	addq	%rax,%rdi
661	movq	16(%rsi,%r15,1),%rax
662	adcq	$0,%rdx
663	addq	%r11,%rdi
664	leaq	64(%rcx),%rcx
665	adcq	$0,%rdx
666	movq	%r13,-8(%r14)
667	movq	%rdx,%r13
668
669	addq	$32,%r15
670	jnz	.Linner4x
671
# Tail of the pass.
672	mulq	%rbx
673	addq	%rax,%r10
674	movq	-32(%rcx),%rax
675	adcq	$0,%rdx
676	addq	16(%r14),%r10
677	leaq	32(%r14),%r14
678	adcq	$0,%rdx
679	movq	%rdx,%r11
680
681	mulq	%rbp
682	addq	%rax,%r13
683	movq	-8(%rsi),%rax
684	adcq	$0,%rdx
685	addq	%r10,%r13
686	adcq	$0,%rdx
687	movq	%rdi,-32(%r14)
688	movq	%rdx,%rdi
689
690	mulq	%rbx
691	addq	%rax,%r11
# Operand swap for the last modulus product: %rax takes the multiplier from
# %rbp and %rbp takes the last modulus word, which stays live for the
# comparison after the loop.
692	movq	%rbp,%rax
693	movq	-16(%rcx),%rbp
694	adcq	$0,%rdx
695	addq	-8(%r14),%r11
696	adcq	$0,%rdx
697	movq	%rdx,%r10
698
699	mulq	%rbp
700	addq	%rax,%rdi
701	movq	(%rsi,%r9,1),%rax
702	adcq	$0,%rdx
703	addq	%r11,%rdi
704	adcq	$0,%rdx
705	movq	%r13,-24(%r14)
706	movq	%rdx,%r13
707
# Encoded movq %xmm0,%rbx: %rbx = next multiplier word.
708.byte	102,72,15,126,195
709	movq	%rdi,-16(%r14)
710	leaq	(%rcx,%r9,2),%rcx
711
712	xorq	%rdi,%rdi
713	addq	%r10,%r13
714	adcq	$0,%rdi
715	addq	(%r14),%r13
716	adcq	$0,%rdi
717	movq	%r13,-8(%r14)
718
# Loop until the gather pointer %r12 reaches the bound spilled at entry.
719	cmpq	16+8(%rsp),%r12
720	jb	.Louter4x
# Decide what .Lsqr4x_sub subtracts.  CF from the subq is set when the top
# result word (%r13) exceeds the last modulus word (%rbp); it is OR-ed with
# the top carry in %rdi and inverted, giving 0 or 1.  %rbp = %rcx + 8 * that
# value, i.e. one of two 8-byte lanes of the 16-byte-strided modulus array.
# NOTE(review): presumably the second lane holds zeros so that the
# subtraction becomes a no-op -- confirm against the caller-side layout.
721	subq	%r13,%rbp
722	adcq	%r15,%r15
723	orq	%r15,%rdi
724	xorq	$1,%rdi
# %rbx = start of the scratch vector, %rcx = %r9 >> 5 (negative iteration
# count, four words per iteration), %rdi = result pointer spilled at entry.
725	leaq	(%r14,%r9,1),%rbx
726	leaq	(%rcx,%rdi,8),%rbp
727	movq	%r9,%rcx
728	sarq	$3+2,%rcx
729	movq	56+8(%rsp),%rdi
730	jmp	.Lsqr4x_sub
731.size	mul4x_internal,.-mul4x_internal
# bn_power5 -- five calls to __bn_sqr8x_internal followed by one call to
# mul4x_internal with a gathered table entry.
#
# Registers on entry, as used below:
#   %rdi  result pointer, %rsi operand, %rdx table, %rcx modulus
#   %r8   pointer whose qword is stored in frame slot 32(%rsp)
#   %r9d  word count
#   stack slot 8(%rsp) is read later by mul4x_internal through %rax
# Returns 1 in %rax.
# NOTE(review): no check on the word count is made here; the routines called
# appear to assume a multiple of 8 -- confirm at the call sites.
732.globl	bn_power5
733.type	bn_power5,@function
734.align	32
735bn_power5:
736	movq	%rsp,%rax
737	pushq	%rbx
738	pushq	%rbp
739	pushq	%r12
740	pushq	%r13
741	pushq	%r14
742	pushq	%r15
# %r9 = -(count * 8), %r10 = count * 32, %r8 = qword at (%r8).
743	movl	%r9d,%r10d
744	shll	$3,%r9d
745	shll	$3+2,%r10d
746	negq	%r9
747	movq	(%r8),%r8
748
749
750
751
752
753
754
# Frame sizing and placement, same sequence as in bn_mul4x_mont_gather5.
755	leaq	-64(%rsp,%r9,2),%r11
756	subq	%rsi,%r11
757	andq	$4095,%r11
758	cmpq	%r11,%r10
759	jb	.Lpwr_sp_alt
760	subq	%r11,%rsp
761	leaq	-64(%rsp,%r9,2),%rsp
762	jmp	.Lpwr_sp_done
763
764.align	32
765.Lpwr_sp_alt:
766	leaq	4096-64(,%r9,2),%r10
767	leaq	-64(%rsp,%r9,2),%rsp
768	subq	%r10,%r11
769	movq	$0,%r10
770	cmovcq	%r10,%r11
771	subq	%r11,%rsp
772.Lpwr_sp_done:
# After this: %r10 = -(count * 8), %r9 = count * 8.
773	andq	$-64,%rsp
774	movq	%r9,%r10
775	negq	%r9
776
777
778
779
780
781
782
783
784
785
# Frame slots: 32(%rsp) = value loaded from (%r8), 40(%rsp) = incoming %rsp.
786	movq	%r8,32(%rsp)
787	movq	%rax,40(%rsp)
788.Lpower5_body:
# Encoded moves that park arguments in SSE registers across the calls:
#   movq %rdi,%xmm1   movq %rcx,%xmm2   movq %r10,%xmm3   movq %rdx,%xmm4
789.byte	102,72,15,110,207
790.byte	102,72,15,110,209
791.byte	102,73,15,110,218
792.byte	102,72,15,110,226
793
# Five squarings in a row.
794	call	__bn_sqr8x_internal
795	call	__bn_sqr8x_internal
796	call	__bn_sqr8x_internal
797	call	__bn_sqr8x_internal
798	call	__bn_sqr8x_internal
799
# Encoded movq %xmm2,%rcx and movq %xmm4,%rdx restore modulus and table.
800.byte	102,72,15,126,209
801.byte	102,72,15,126,226
# Set up mul4x_internal: result pointer = %rsi as left by the squaring code,
# %rax = incoming %rsp, %r8 = address of the frame slot written above.
802	movq	%rsi,%rdi
803	movq	40(%rsp),%rax
804	leaq	32(%rsp),%r8
805
806	call	mul4x_internal
807
# Epilogue: return 1 and restore the pushed registers through the saved %rsp.
808	movq	40(%rsp),%rsi
809	movq	$1,%rax
810	movq	-48(%rsi),%r15
811	movq	-40(%rsi),%r14
812	movq	-32(%rsi),%r13
813	movq	-24(%rsi),%r12
814	movq	-16(%rsi),%rbp
815	movq	-8(%rsi),%rbx
816	leaq	(%rsi),%rsp
817.Lpower5_epilogue:
# Encoded rep ret.
818	.byte	0xf3,0xc3
819.size	bn_power5,.-bn_power5
820
# bn_sqr8x_internal / __bn_sqr8x_internal -- squaring core.  This part
# builds the double-length square in the scratch area at 48+8(%rsp) and then
# falls through into sqr8x_reduction.
#
# Expects (as set up by bn_power5 in this file):
#   %rsi  operand, %r9 = byte length (count * 8), %r10 = -(count * 8)
#   %xmm2 modulus pointer (reloaded into %rbp on the last line of this part)
#   a frame on %rsp; offsets are written N+8 to step over the return address
#
# Phases:
#   1. .Lsqr4x_1st / .Lsqr4x_outer / .Lsqr4x_inner and the straight-line code
#      after them accumulate products of distinct operand words, two
#      multiplicand words (%r14, %r15) at a time.
#   2. .Lsqr4x_shift_n_add doubles that sum (the leaq ...,2 / shrq $63 pairs)
#      and adds the square of each operand word (mulq %rax).
821.globl	bn_sqr8x_internal
822.hidden	bn_sqr8x_internal
823.type	bn_sqr8x_internal,@function
824.align	32
825bn_sqr8x_internal:
826__bn_sqr8x_internal:
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
# %rbp = 32 - byte length (negative running offset); %rsi is moved to the end
# of the operand so that (%rsi,%rbp) walks forward as %rbp rises to zero.
900	leaq	32(%r10),%rbp
901	leaq	(%rsi,%r9,1),%rsi
902
903	movq	%r9,%rcx
904
905
# %r14 and %r15 = the two multiplicand words of this round; %rdi = scratch
# pointer based at 48+8(%rsp,%r9,2).
906	movq	-32(%rsi,%rbp,1),%r14
907	leaq	48+8(%rsp,%r9,2),%rdi
908	movq	-24(%rsi,%rbp,1),%rax
909	leaq	-32(%rdi,%rbp,1),%rdi
910	movq	-16(%rsi,%rbp,1),%rbx
911	movq	%rax,%r15
912
913	mulq	%r14
914	movq	%rax,%r10
915	movq	%rbx,%rax
916	movq	%rdx,%r11
917	movq	%r10,-24(%rdi,%rbp,1)
918
919	mulq	%r14
920	addq	%rax,%r11
921	movq	%rbx,%rax
922	adcq	$0,%rdx
923	movq	%r11,-16(%rdi,%rbp,1)
924	movq	%rdx,%r10
925
926
927	movq	-8(%rsi,%rbp,1),%rbx
928	mulq	%r15
929	movq	%rax,%r12
930	movq	%rbx,%rax
931	movq	%rdx,%r13
932
933	leaq	(%rbp),%rcx
934	mulq	%r14
935	addq	%rax,%r10
936	movq	%rbx,%rax
937	movq	%rdx,%r11
938	adcq	$0,%r11
939	addq	%r12,%r10
940	adcq	$0,%r11
941	movq	%r10,-8(%rdi,%rcx,1)
942	jmp	.Lsqr4x_1st
943
944.align	32
945.Lsqr4x_1st:
# First round: products are stored without reading the scratch area first.
# %rcx is the running offset and reaches zero at the end of the operand.
946	movq	(%rsi,%rcx,1),%rbx
947	mulq	%r15
948	addq	%rax,%r13
949	movq	%rbx,%rax
950	movq	%rdx,%r12
951	adcq	$0,%r12
952
953	mulq	%r14
954	addq	%rax,%r11
955	movq	%rbx,%rax
956	movq	8(%rsi,%rcx,1),%rbx
957	movq	%rdx,%r10
958	adcq	$0,%r10
959	addq	%r13,%r11
960	adcq	$0,%r10
961
962
963	mulq	%r15
964	addq	%rax,%r12
965	movq	%rbx,%rax
966	movq	%r11,(%rdi,%rcx,1)
967	movq	%rdx,%r13
968	adcq	$0,%r13
969
970	mulq	%r14
971	addq	%rax,%r10
972	movq	%rbx,%rax
973	movq	16(%rsi,%rcx,1),%rbx
974	movq	%rdx,%r11
975	adcq	$0,%r11
976	addq	%r12,%r10
977	adcq	$0,%r11
978
979	mulq	%r15
980	addq	%rax,%r13
981	movq	%rbx,%rax
982	movq	%r10,8(%rdi,%rcx,1)
983	movq	%rdx,%r12
984	adcq	$0,%r12
985
986	mulq	%r14
987	addq	%rax,%r11
988	movq	%rbx,%rax
989	movq	24(%rsi,%rcx,1),%rbx
990	movq	%rdx,%r10
991	adcq	$0,%r10
992	addq	%r13,%r11
993	adcq	$0,%r10
994
995
996	mulq	%r15
997	addq	%rax,%r12
998	movq	%rbx,%rax
999	movq	%r11,16(%rdi,%rcx,1)
1000	movq	%rdx,%r13
1001	adcq	$0,%r13
1002	leaq	32(%rcx),%rcx
1003
1004	mulq	%r14
1005	addq	%rax,%r10
1006	movq	%rbx,%rax
1007	movq	%rdx,%r11
1008	adcq	$0,%r11
1009	addq	%r12,%r10
1010	adcq	$0,%r11
1011	movq	%r10,-8(%rdi,%rcx,1)
1012
1013	cmpq	$0,%rcx
1014	jne	.Lsqr4x_1st
1015
1016	mulq	%r15
1017	addq	%rax,%r13
1018	leaq	16(%rbp),%rbp
1019	adcq	$0,%rdx
1020	addq	%r11,%r13
1021	adcq	$0,%rdx
1022
1023	movq	%r13,(%rdi)
1024	movq	%rdx,%r12
1025	movq	%rdx,8(%rdi)
1026	jmp	.Lsqr4x_outer
1027
1028.align	32
1029.Lsqr4x_outer:
# Later rounds: same pattern, but each product is added to what the scratch
# area already holds.  %rbp advances 16 bytes (two words) per round.
1030	movq	-32(%rsi,%rbp,1),%r14
1031	leaq	48+8(%rsp,%r9,2),%rdi
1032	movq	-24(%rsi,%rbp,1),%rax
1033	leaq	-32(%rdi,%rbp,1),%rdi
1034	movq	-16(%rsi,%rbp,1),%rbx
1035	movq	%rax,%r15
1036
1037	mulq	%r14
1038	movq	-24(%rdi,%rbp,1),%r10
1039	addq	%rax,%r10
1040	movq	%rbx,%rax
1041	adcq	$0,%rdx
1042	movq	%r10,-24(%rdi,%rbp,1)
1043	movq	%rdx,%r11
1044
1045	mulq	%r14
1046	addq	%rax,%r11
1047	movq	%rbx,%rax
1048	adcq	$0,%rdx
1049	addq	-16(%rdi,%rbp,1),%r11
1050	movq	%rdx,%r10
1051	adcq	$0,%r10
1052	movq	%r11,-16(%rdi,%rbp,1)
1053
1054	xorq	%r12,%r12
1055
1056	movq	-8(%rsi,%rbp,1),%rbx
1057	mulq	%r15
1058	addq	%rax,%r12
1059	movq	%rbx,%rax
1060	adcq	$0,%rdx
1061	addq	-8(%rdi,%rbp,1),%r12
1062	movq	%rdx,%r13
1063	adcq	$0,%r13
1064
1065	mulq	%r14
1066	addq	%rax,%r10
1067	movq	%rbx,%rax
1068	adcq	$0,%rdx
1069	addq	%r12,%r10
1070	movq	%rdx,%r11
1071	adcq	$0,%r11
1072	movq	%r10,-8(%rdi,%rbp,1)
1073
1074	leaq	(%rbp),%rcx
1075	jmp	.Lsqr4x_inner
1076
1077.align	32
1078.Lsqr4x_inner:
1079	movq	(%rsi,%rcx,1),%rbx
1080	mulq	%r15
1081	addq	%rax,%r13
1082	movq	%rbx,%rax
1083	movq	%rdx,%r12
1084	adcq	$0,%r12
1085	addq	(%rdi,%rcx,1),%r13
1086	adcq	$0,%r12
1087
1088.byte	0x67
1089	mulq	%r14
1090	addq	%rax,%r11
1091	movq	%rbx,%rax
1092	movq	8(%rsi,%rcx,1),%rbx
1093	movq	%rdx,%r10
1094	adcq	$0,%r10
1095	addq	%r13,%r11
1096	adcq	$0,%r10
1097
1098	mulq	%r15
1099	addq	%rax,%r12
1100	movq	%r11,(%rdi,%rcx,1)
1101	movq	%rbx,%rax
1102	movq	%rdx,%r13
1103	adcq	$0,%r13
1104	addq	8(%rdi,%rcx,1),%r12
1105	leaq	16(%rcx),%rcx
1106	adcq	$0,%r13
1107
1108	mulq	%r14
1109	addq	%rax,%r10
1110	movq	%rbx,%rax
1111	adcq	$0,%rdx
1112	addq	%r12,%r10
1113	movq	%rdx,%r11
1114	adcq	$0,%r11
1115	movq	%r10,-8(%rdi,%rcx,1)
1116
1117	cmpq	$0,%rcx
1118	jne	.Lsqr4x_inner
1119
1120.byte	0x67
1121	mulq	%r15
1122	addq	%rax,%r13
1123	adcq	$0,%rdx
1124	addq	%r11,%r13
1125	adcq	$0,%rdx
1126
1127	movq	%r13,(%rdi)
1128	movq	%rdx,%r12
1129	movq	%rdx,8(%rdi)
1130
1131	addq	$16,%rbp
1132	jnz	.Lsqr4x_outer
1133
1134
# Straight-line finish for the last four operand words (%rbp is zero here).
1135	movq	-32(%rsi),%r14
1136	leaq	48+8(%rsp,%r9,2),%rdi
1137	movq	-24(%rsi),%rax
1138	leaq	-32(%rdi,%rbp,1),%rdi
1139	movq	-16(%rsi),%rbx
1140	movq	%rax,%r15
1141
1142	mulq	%r14
1143	addq	%rax,%r10
1144	movq	%rbx,%rax
1145	movq	%rdx,%r11
1146	adcq	$0,%r11
1147
1148	mulq	%r14
1149	addq	%rax,%r11
1150	movq	%rbx,%rax
1151	movq	%r10,-24(%rdi)
1152	movq	%rdx,%r10
1153	adcq	$0,%r10
1154	addq	%r13,%r11
1155	movq	-8(%rsi),%rbx
1156	adcq	$0,%r10
1157
1158	mulq	%r15
1159	addq	%rax,%r12
1160	movq	%rbx,%rax
1161	movq	%r11,-16(%rdi)
1162	movq	%rdx,%r13
1163	adcq	$0,%r13
1164
1165	mulq	%r14
1166	addq	%rax,%r10
1167	movq	%rbx,%rax
1168	movq	%rdx,%r11
1169	adcq	$0,%r11
1170	addq	%r12,%r10
1171	adcq	$0,%r11
1172	movq	%r10,-8(%rdi)
1173
1174	mulq	%r15
1175	addq	%rax,%r13
1176	movq	-16(%rsi),%rax
1177	adcq	$0,%rdx
1178	addq	%r11,%r13
1179	adcq	$0,%rdx
1180
1181	movq	%r13,(%rdi)
1182	movq	%rdx,%r12
1183	movq	%rdx,8(%rdi)
1184
# Last product; %rbp becomes 16 - byte length for the next phase and
# %r14/%r15 are cleared (shift carry-in and add carry-in respectively).
1185	mulq	%rbx
1186	addq	$16,%rbp
1187	xorq	%r14,%r14
1188	subq	%r9,%rbp
1189	xorq	%r15,%r15
1190
1191	addq	%r12,%rax
1192	adcq	$0,%rdx
1193	movq	%rax,8(%rdi)
1194	movq	%rdx,16(%rdi)
1195	movq	%r15,24(%rdi)
1196
# Phase 2 preamble.  Per pair of scratch words: shift left by one with the
# bit carried in %r14 (leaq base,index*2 plus shrq $63), square one operand
# word with mulq %rax, and add the square with carry.  The carry between
# pairs is saved with sbbq %r15,%r15 and recreated with negq %r15.
1197	movq	-16(%rsi,%rbp,1),%rax
1198	leaq	48+8(%rsp),%rdi
1199	xorq	%r10,%r10
1200	movq	8(%rdi),%r11
1201
1202	leaq	(%r14,%r10,2),%r12
1203	shrq	$63,%r10
1204	leaq	(%rcx,%r11,2),%r13
1205	shrq	$63,%r11
1206	orq	%r10,%r13
1207	movq	16(%rdi),%r10
1208	movq	%r11,%r14
1209	mulq	%rax
1210	negq	%r15
1211	movq	24(%rdi),%r11
1212	adcq	%rax,%r12
1213	movq	-8(%rsi,%rbp,1),%rax
1214	movq	%r12,(%rdi)
1215	adcq	%rdx,%r13
1216
1217	leaq	(%r14,%r10,2),%rbx
1218	movq	%r13,8(%rdi)
1219	sbbq	%r15,%r15
1220	shrq	$63,%r10
1221	leaq	(%rcx,%r11,2),%r8
1222	shrq	$63,%r11
1223	orq	%r10,%r8
1224	movq	32(%rdi),%r10
1225	movq	%r11,%r14
1226	mulq	%rax
1227	negq	%r15
1228	movq	40(%rdi),%r11
1229	adcq	%rax,%rbx
1230	movq	0(%rsi,%rbp,1),%rax
1231	movq	%rbx,16(%rdi)
1232	adcq	%rdx,%r8
1233	leaq	16(%rbp),%rbp
1234	movq	%r8,24(%rdi)
1235	sbbq	%r15,%r15
1236	leaq	64(%rdi),%rdi
1237	jmp	.Lsqr4x_shift_n_add
1238
1239.align	32
1240.Lsqr4x_shift_n_add:
# Four operand words (eight scratch words) per iteration; %rbp rises by 32
# per iteration and the loop ends when it reaches zero.
1241	leaq	(%r14,%r10,2),%r12
1242	shrq	$63,%r10
1243	leaq	(%rcx,%r11,2),%r13
1244	shrq	$63,%r11
1245	orq	%r10,%r13
1246	movq	-16(%rdi),%r10
1247	movq	%r11,%r14
1248	mulq	%rax
1249	negq	%r15
1250	movq	-8(%rdi),%r11
1251	adcq	%rax,%r12
1252	movq	-8(%rsi,%rbp,1),%rax
1253	movq	%r12,-32(%rdi)
1254	adcq	%rdx,%r13
1255
1256	leaq	(%r14,%r10,2),%rbx
1257	movq	%r13,-24(%rdi)
1258	sbbq	%r15,%r15
1259	shrq	$63,%r10
1260	leaq	(%rcx,%r11,2),%r8
1261	shrq	$63,%r11
1262	orq	%r10,%r8
1263	movq	0(%rdi),%r10
1264	movq	%r11,%r14
1265	mulq	%rax
1266	negq	%r15
1267	movq	8(%rdi),%r11
1268	adcq	%rax,%rbx
1269	movq	0(%rsi,%rbp,1),%rax
1270	movq	%rbx,-16(%rdi)
1271	adcq	%rdx,%r8
1272
1273	leaq	(%r14,%r10,2),%r12
1274	movq	%r8,-8(%rdi)
1275	sbbq	%r15,%r15
1276	shrq	$63,%r10
1277	leaq	(%rcx,%r11,2),%r13
1278	shrq	$63,%r11
1279	orq	%r10,%r13
1280	movq	16(%rdi),%r10
1281	movq	%r11,%r14
1282	mulq	%rax
1283	negq	%r15
1284	movq	24(%rdi),%r11
1285	adcq	%rax,%r12
1286	movq	8(%rsi,%rbp,1),%rax
1287	movq	%r12,0(%rdi)
1288	adcq	%rdx,%r13
1289
1290	leaq	(%r14,%r10,2),%rbx
1291	movq	%r13,8(%rdi)
1292	sbbq	%r15,%r15
1293	shrq	$63,%r10
1294	leaq	(%rcx,%r11,2),%r8
1295	shrq	$63,%r11
1296	orq	%r10,%r8
1297	movq	32(%rdi),%r10
1298	movq	%r11,%r14
1299	mulq	%rax
1300	negq	%r15
1301	movq	40(%rdi),%r11
1302	adcq	%rax,%rbx
1303	movq	16(%rsi,%rbp,1),%rax
1304	movq	%rbx,16(%rdi)
1305	adcq	%rdx,%r8
1306	movq	%r8,24(%rdi)
1307	sbbq	%r15,%r15
1308	leaq	64(%rdi),%rdi
1309	addq	$32,%rbp
1310	jnz	.Lsqr4x_shift_n_add
1311
# Last two operand words.
1312	leaq	(%r14,%r10,2),%r12
1313.byte	0x67
1314	shrq	$63,%r10
1315	leaq	(%rcx,%r11,2),%r13
1316	shrq	$63,%r11
1317	orq	%r10,%r13
1318	movq	-16(%rdi),%r10
1319	movq	%r11,%r14
1320	mulq	%rax
1321	negq	%r15
1322	movq	-8(%rdi),%r11
1323	adcq	%rax,%r12
1324	movq	-8(%rsi),%rax
1325	movq	%r12,-32(%rdi)
1326	adcq	%rdx,%r13
1327
1328	leaq	(%r14,%r10,2),%rbx
1329	movq	%r13,-24(%rdi)
1330	sbbq	%r15,%r15
1331	shrq	$63,%r10
1332	leaq	(%rcx,%r11,2),%r8
1333	shrq	$63,%r11
1334	orq	%r10,%r8
1335	mulq	%rax
1336	negq	%r15
1337	adcq	%rax,%rbx
1338	adcq	%rdx,%r8
1339	movq	%rbx,-16(%rdi)
1340	movq	%r8,-8(%rdi)
# Encoded movq %xmm2,%rbp: %rbp = modulus pointer for the reduction that
# follows by fall-through.
1341.byte	102,72,15,126,213
# sqr8x_reduction -- reduces the double-length scratch value at 48+8(%rsp)
# eight words per pass, then runs the shared subtraction loop .Lsqr4x_sub.
# Entered by fall-through from the squaring code and by call from
# bn_from_mont8x.
#
# Expects:
#   %rbp  modulus pointer (words are read at a 16-byte stride)
#   %r9   byte length (count * 8)
#   %xmm1 result pointer, %xmm2 modulus pointer, %xmm3 negated byte length
#   frame slot 32+8(%rsp) = per-modulus multiplier constant
# On return %r9 = byte length, %r10 = its negation, %rsi = result pointer.
1342sqr8x_reduction:
# Frame slot 0+8(%rsp) = end of the modulus (%rbp + 2 * %r9);
# frame slot 8+8(%rsp) = end of the scratch value, also kept in %rdx.
# %rax = carry between passes, %r9 is negated for the pointer rewind below.
1343	xorq	%rax,%rax
1344	leaq	(%rbp,%r9,2),%rcx
1345	leaq	48+8(%rsp,%r9,2),%rdx
1346	movq	%rcx,0+8(%rsp)
1347	leaq	48+8(%rsp,%r9,1),%rdi
1348	movq	%rdx,8+8(%rsp)
1349	negq	%r9
1350	jmp	.L8x_reduction_loop
1351
1352.align	32
1353.L8x_reduction_loop:
# Rewind %rdi by the byte length and load the next eight scratch words into
# %rbx and %r9..%r15.  %r9 is reused as a data register from here on and is
# reloaded from %xmm3 at the end of the pass.
1354	leaq	(%rdi,%r9,1),%rdi
1355.byte	0x66
1356	movq	0(%rdi),%rbx
1357	movq	8(%rdi),%r9
1358	movq	16(%rdi),%r10
1359	movq	24(%rdi),%r11
1360	movq	32(%rdi),%r12
1361	movq	40(%rdi),%r13
1362	movq	48(%rdi),%r14
1363	movq	56(%rdi),%r15
# Park the carry from the previous pass at the address held in %rdx.
1364	movq	%rax,(%rdx)
1365	leaq	64(%rdi),%rdi
1366
1367.byte	0x67
# %r8 = lowest word, %rbx = that word times the frame constant (low 64 bits).
1368	movq	%rbx,%r8
1369	imulq	32+8(%rsp),%rbx
1370	movq	0(%rbp),%rax
1371	movl	$8,%ecx
1372	jmp	.L8x_reduce
1373
1374.align	32
1375.L8x_reduce:
# Eight iterations, each adding (multiplier %rbx) * (eight modulus words) to
# the window %r8..%r15 and sliding the window down by one word.
# The low product word is not added explicitly: negq sets CF exactly when
# %r8 is non-zero and that carry is folded into the high half.
# NOTE(review): relies on low product + %r8 being zero modulo 2^64, which
# presumably follows from how the frame constant is chosen -- confirm.
1376	mulq	%rbx
1377	movq	16(%rbp),%rax
1378	negq	%r8
1379	movq	%rdx,%r8
1380	adcq	$0,%r8
1381
1382	mulq	%rbx
1383	addq	%rax,%r9
1384	movq	32(%rbp),%rax
1385	adcq	$0,%rdx
1386	addq	%r9,%r8
# Save this multiplier in the frame; .L8x_tail reads the eight saved values
# back in the same order.
1387	movq	%rbx,48-8+8(%rsp,%rcx,8)
1388	movq	%rdx,%r9
1389	adcq	$0,%r9
1390
1391	mulq	%rbx
1392	addq	%rax,%r10
1393	movq	48(%rbp),%rax
1394	adcq	$0,%rdx
1395	addq	%r10,%r9
1396	movq	32+8(%rsp),%rsi
1397	movq	%rdx,%r10
1398	adcq	$0,%r10
1399
1400	mulq	%rbx
1401	addq	%rax,%r11
1402	movq	64(%rbp),%rax
1403	adcq	$0,%rdx
# %rsi = next multiplier, computed early from the new lowest word.
1404	imulq	%r8,%rsi
1405	addq	%r11,%r10
1406	movq	%rdx,%r11
1407	adcq	$0,%r11
1408
1409	mulq	%rbx
1410	addq	%rax,%r12
1411	movq	80(%rbp),%rax
1412	adcq	$0,%rdx
1413	addq	%r12,%r11
1414	movq	%rdx,%r12
1415	adcq	$0,%r12
1416
1417	mulq	%rbx
1418	addq	%rax,%r13
1419	movq	96(%rbp),%rax
1420	adcq	$0,%rdx
1421	addq	%r13,%r12
1422	movq	%rdx,%r13
1423	adcq	$0,%r13
1424
1425	mulq	%rbx
1426	addq	%rax,%r14
1427	movq	112(%rbp),%rax
1428	adcq	$0,%rdx
1429	addq	%r14,%r13
1430	movq	%rdx,%r14
1431	adcq	$0,%r14
1432
1433	mulq	%rbx
1434	movq	%rsi,%rbx
1435	addq	%rax,%r15
1436	movq	0(%rbp),%rax
1437	adcq	$0,%rdx
1438	addq	%r15,%r14
1439	movq	%rdx,%r15
1440	adcq	$0,%r15
1441
1442	decl	%ecx
1443	jnz	.L8x_reduce
1444
# Advance to the next eight modulus words.  When the modulus is only eight
# words long there is nothing more to multiply and the tail is skipped; the
# xorq has cleared CF for the adcq chain at .L8x_no_tail.
1445	leaq	128(%rbp),%rbp
1446	xorq	%rax,%rax
1447	movq	8+8(%rsp),%rdx
1448	cmpq	0+8(%rsp),%rbp
1449	jae	.L8x_no_tail
1450
1451.byte	0x66
# Add the next eight scratch words into the window; the carry out is kept
# as 0 or -1 in %rsi.
1452	addq	0(%rdi),%r8
1453	adcq	8(%rdi),%r9
1454	adcq	16(%rdi),%r10
1455	adcq	24(%rdi),%r11
1456	adcq	32(%rdi),%r12
1457	adcq	40(%rdi),%r13
1458	adcq	48(%rdi),%r14
1459	adcq	56(%rdi),%r15
1460	sbbq	%rsi,%rsi
1461
# %rbx = first multiplier saved by .L8x_reduce.
1462	movq	48+56+8(%rsp),%rbx
1463	movl	$8,%ecx
1464	movq	0(%rbp),%rax
1465	jmp	.L8x_tail
1466
1467.align	32
1468.L8x_tail:
# Apply the eight saved multipliers to the current eight modulus words; one
# finished word is stored through %rdi per iteration.
1469	mulq	%rbx
1470	addq	%rax,%r8
1471	movq	16(%rbp),%rax
1472	movq	%r8,(%rdi)
1473	movq	%rdx,%r8
1474	adcq	$0,%r8
1475
1476	mulq	%rbx
1477	addq	%rax,%r9
1478	movq	32(%rbp),%rax
1479	adcq	$0,%rdx
1480	addq	%r9,%r8
1481	leaq	8(%rdi),%rdi
1482	movq	%rdx,%r9
1483	adcq	$0,%r9
1484
1485	mulq	%rbx
1486	addq	%rax,%r10
1487	movq	48(%rbp),%rax
1488	adcq	$0,%rdx
1489	addq	%r10,%r9
1490	movq	%rdx,%r10
1491	adcq	$0,%r10
1492
1493	mulq	%rbx
1494	addq	%rax,%r11
1495	movq	64(%rbp),%rax
1496	adcq	$0,%rdx
1497	addq	%r11,%r10
1498	movq	%rdx,%r11
1499	adcq	$0,%r11
1500
1501	mulq	%rbx
1502	addq	%rax,%r12
1503	movq	80(%rbp),%rax
1504	adcq	$0,%rdx
1505	addq	%r12,%r11
1506	movq	%rdx,%r12
1507	adcq	$0,%r12
1508
1509	mulq	%rbx
1510	addq	%rax,%r13
1511	movq	96(%rbp),%rax
1512	adcq	$0,%rdx
1513	addq	%r13,%r12
1514	movq	%rdx,%r13
1515	adcq	$0,%r13
1516
1517	mulq	%rbx
1518	addq	%rax,%r14
1519	movq	112(%rbp),%rax
1520	adcq	$0,%rdx
1521	addq	%r14,%r13
1522	movq	%rdx,%r14
1523	adcq	$0,%r14
1524
1525	mulq	%rbx
# Fetch the next saved multiplier.
1526	movq	48-16+8(%rsp,%rcx,8),%rbx
1527	addq	%rax,%r15
1528	adcq	$0,%rdx
1529	addq	%r15,%r14
1530	movq	0(%rbp),%rax
1531	movq	%rdx,%r15
1532	adcq	$0,%r15
1533
1534	decl	%ecx
1535	jnz	.L8x_tail
1536
1537	leaq	128(%rbp),%rbp
1538	movq	8+8(%rsp),%rdx
1539	cmpq	0+8(%rsp),%rbp
1540	jae	.L8x_tail_done
1541
# More modulus words remain: recreate the saved carry with negq and add the
# next eight scratch words before another round of .L8x_tail.
1542	movq	48+56+8(%rsp),%rbx
1543	negq	%rsi
1544	movq	0(%rbp),%rax
1545	adcq	0(%rdi),%r8
1546	adcq	8(%rdi),%r9
1547	adcq	16(%rdi),%r10
1548	adcq	24(%rdi),%r11
1549	adcq	32(%rdi),%r12
1550	adcq	40(%rdi),%r13
1551	adcq	48(%rdi),%r14
1552	adcq	56(%rdi),%r15
1553	sbbq	%rsi,%rsi
1554
1555	movl	$8,%ecx
1556	jmp	.L8x_tail
1557
1558.align	32
1559.L8x_tail_done:
# Add the carry word parked at (%rdx) by the loop head and ripple it through
# the window.
1560	addq	(%rdx),%r8
1561	adcq	$0,%r9
1562	adcq	$0,%r10
1563	adcq	$0,%r11
1564	adcq	$0,%r12
1565	adcq	$0,%r13
1566	adcq	$0,%r14
1567	adcq	$0,%r15
1568
1569
1570	xorq	%rax,%rax
1571
# Recreate the saved carry for the adcq chain below.
1572	negq	%rsi
1573.L8x_no_tail:
# Add the top eight scratch words; %rax collects the carry into the next
# pass.  %rcx = last modulus word, read before %rbp is reset.
1574	adcq	0(%rdi),%r8
1575	adcq	8(%rdi),%r9
1576	adcq	16(%rdi),%r10
1577	adcq	24(%rdi),%r11
1578	adcq	32(%rdi),%r12
1579	adcq	40(%rdi),%r13
1580	adcq	48(%rdi),%r14
1581	adcq	56(%rdi),%r15
1582	adcq	$0,%rax
1583	movq	-16(%rbp),%rcx
1584	xorq	%rsi,%rsi
1585
# Encoded movq %xmm2,%rbp: rewind the modulus pointer.
1586.byte	102,72,15,126,213
1587
1588	movq	%r8,0(%rdi)
1589	movq	%r9,8(%rdi)
# Encoded movq %xmm3,%r9: %r9 = negated byte length again.
1590.byte	102,73,15,126,217
1591	movq	%r10,16(%rdi)
1592	movq	%r11,24(%rdi)
1593	movq	%r12,32(%rdi)
1594	movq	%r13,40(%rdi)
1595	movq	%r14,48(%rdi)
1596	movq	%r15,56(%rdi)
1597	leaq	64(%rdi),%rdi
1598
1599	cmpq	%rdx,%rdi
1600	jb	.L8x_reduction_loop
1601
# Decide what .Lsqr4x_sub subtracts: CF from the subq is set when the top
# result word (%r15) exceeds the last modulus word (%rcx); it is OR-ed with
# the top carry in %rax and inverted, giving 0 or 1, and %rbp is offset by
# 8 times that value, i.e. onto one of two 8-byte lanes of the modulus array.
# NOTE(review): presumably the second lane holds zeros so that the
# subtraction becomes a no-op -- confirm against the caller-side layout.
1602	subq	%r15,%rcx
1603	leaq	(%rdi,%r9,1),%rbx
1604	adcq	%rsi,%rsi
1605	movq	%r9,%rcx
1606	orq	%rsi,%rax
# Encoded movq %xmm1,%rdi: result pointer.
1607.byte	102,72,15,126,207
1608	xorq	$1,%rax
# Encoded movq %xmm1,%rsi: result pointer again, left in %rsi on return.
1609.byte	102,72,15,126,206
1610	leaq	(%rbp,%rax,8),%rbp
# %rcx = %r9 >> 5 = negative iteration count (four words per iteration).
# NOTE(review): the first sbbq below consumes CF as left by this sarq; it is
# zero provided the low five bits of %r9 are zero -- confirm that every
# caller passes a byte length that is a multiple of 64.
1611	sarq	$3+2,%rcx
1612	jmp	.Lsqr4x_sub
1613
1614.align	32
1615.Lsqr4x_sub:
# Shared tail, also reached from mul4x_internal.  Four words per iteration:
# result[j] = (%rbx)[j] - modulus word at 16-byte stride from %rbp, with the
# borrow carried in CF.  leaq and incq leave CF alone, so the borrow chain
# survives the loop bookkeeping.
1616.byte	0x66
1617	movq	0(%rbx),%r12
1618	movq	8(%rbx),%r13
1619	sbbq	0(%rbp),%r12
1620	movq	16(%rbx),%r14
1621	sbbq	16(%rbp),%r13
1622	movq	24(%rbx),%r15
1623	leaq	32(%rbx),%rbx
1624	sbbq	32(%rbp),%r14
1625	movq	%r12,0(%rdi)
1626	sbbq	48(%rbp),%r15
1627	leaq	64(%rbp),%rbp
1628	movq	%r13,8(%rdi)
1629	movq	%r14,16(%rdi)
1630	movq	%r15,24(%rdi)
1631	leaq	32(%rdi),%rdi
1632
1633	incq	%rcx
1634	jnz	.Lsqr4x_sub
# Hand back %r10 = negated byte length and %r9 = byte length, then rep ret.
1635	movq	%r9,%r10
1636	negq	%r9
1637	.byte	0xf3,0xc3
1638.size	bn_sqr8x_internal,.-bn_sqr8x_internal
# bn_from_montgomery -- dispatcher.  When the low three bits of the word
# count in %r9d are clear it tail-jumps to bn_from_mont8x with all argument
# registers untouched; otherwise it returns 0 in %eax without doing any work.
1639.globl	bn_from_montgomery
1640.type	bn_from_montgomery,@function
1641.align	32
1642bn_from_montgomery:
1643	testl	$7,%r9d
1644	jz	bn_from_mont8x
1645	xorl	%eax,%eax
# Encoded rep ret.
1646	.byte	0xf3,0xc3
1647.size	bn_from_montgomery,.-bn_from_montgomery
1648
# bn_from_mont8x -- copies the operand into the low half of a double-length
# scratch value, zeroes the high half, and runs sqr8x_reduction on it.
#
# Registers on entry, as used below:
#   %rdi  result pointer, %rsi operand, %rcx modulus
#   %r8   pointer whose qword is stored in frame slot 32(%rsp)
#   %r9d  word count
#   %rdx  is not read by this routine
# Returns 1 in %rax.
1649.type	bn_from_mont8x,@function
1650.align	32
1651bn_from_mont8x:
1652.byte	0x67
1653	movq	%rsp,%rax
1654	pushq	%rbx
1655	pushq	%rbp
1656	pushq	%r12
1657	pushq	%r13
1658	pushq	%r14
1659	pushq	%r15
1660.byte	0x67
# %r9 = -(count * 8), %r10 = count * 32, %r8 = qword at (%r8).
1661	movl	%r9d,%r10d
1662	shll	$3,%r9d
1663	shll	$3+2,%r10d
1664	negq	%r9
1665	movq	(%r8),%r8
1666
1667
1668
1669
1670
1671
1672
# Frame sizing and placement, same sequence as in bn_mul4x_mont_gather5.
1673	leaq	-64(%rsp,%r9,2),%r11
1674	subq	%rsi,%r11
1675	andq	$4095,%r11
1676	cmpq	%r11,%r10
1677	jb	.Lfrom_sp_alt
1678	subq	%r11,%rsp
1679	leaq	-64(%rsp,%r9,2),%rsp
1680	jmp	.Lfrom_sp_done
1681
1682.align	32
1683.Lfrom_sp_alt:
1684	leaq	4096-64(,%r9,2),%r10
1685	leaq	-64(%rsp,%r9,2),%rsp
1686	subq	%r10,%r11
1687	movq	$0,%r10
1688	cmovcq	%r10,%r11
1689	subq	%r11,%rsp
1690.Lfrom_sp_done:
# After this: %r10 = -(count * 8), %r9 = count * 8.
1691	andq	$-64,%rsp
1692	movq	%r9,%r10
1693	negq	%r9
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
# Frame slots: 32(%rsp) = value loaded from (%r8), 40(%rsp) = incoming %rsp.
1704	movq	%r8,32(%rsp)
1705	movq	%rax,40(%rsp)
1706.Lfrom_body:
# %r11 = bytes left to copy, %rax = scratch write pointer, %xmm0 = zero.
1707	movq	%r9,%r11
1708	leaq	48(%rsp),%rax
1709	pxor	%xmm0,%xmm0
1710	jmp	.Lmul_by_1
1711
1712.align	32
1713.Lmul_by_1:
# 64 bytes per iteration: unaligned loads from the operand, aligned stores
# into the low half, and zero stores %r9 bytes further up (the high half).
1714	movdqu	(%rsi),%xmm1
1715	movdqu	16(%rsi),%xmm2
1716	movdqu	32(%rsi),%xmm3
1717	movdqa	%xmm0,(%rax,%r9,1)
1718	movdqu	48(%rsi),%xmm4
1719	movdqa	%xmm0,16(%rax,%r9,1)
# Encoded leaq 64(%rsi),%rsi using a 32-bit displacement.
1720.byte	0x48,0x8d,0xb6,0x40,0x00,0x00,0x00
1721	movdqa	%xmm1,(%rax)
1722	movdqa	%xmm0,32(%rax,%r9,1)
1723	movdqa	%xmm2,16(%rax)
1724	movdqa	%xmm0,48(%rax,%r9,1)
1725	movdqa	%xmm3,32(%rax)
1726	movdqa	%xmm4,48(%rax)
1727	leaq	64(%rax),%rax
1728	subq	$64,%r11
1729	jnz	.Lmul_by_1
1730
# Encoded movq %rdi,%xmm1 and movq %rcx,%xmm2; %rbp = modulus pointer;
# encoded movq %r10,%xmm3.  These are the inputs sqr8x_reduction expects.
1731.byte	102,72,15,110,207
1732.byte	102,72,15,110,209
1733.byte	0x67
1734	movq	%rcx,%rbp
1735.byte	102,73,15,110,218
1736	call	sqr8x_reduction
1737
# Wipe the scratch value: 64 bytes are zeroed per iteration while %r9 (the
# byte length on return from the call) drops by 32, so twice the byte length
# is cleared in total.
1738	pxor	%xmm0,%xmm0
1739	leaq	48(%rsp),%rax
1740	movq	40(%rsp),%rsi
1741	jmp	.Lfrom_mont_zero
1742
1743.align	32
1744.Lfrom_mont_zero:
1745	movdqa	%xmm0,0(%rax)
1746	movdqa	%xmm0,16(%rax)
1747	movdqa	%xmm0,32(%rax)
1748	movdqa	%xmm0,48(%rax)
1749	leaq	64(%rax),%rax
1750	subq	$32,%r9
1751	jnz	.Lfrom_mont_zero
1752
# Epilogue: return 1 and restore the pushed registers through the saved %rsp.
1753	movq	$1,%rax
1754	movq	-48(%rsi),%r15
1755	movq	-40(%rsi),%r14
1756	movq	-32(%rsi),%r13
1757	movq	-24(%rsi),%r12
1758	movq	-16(%rsi),%rbp
1759	movq	-8(%rsi),%rbx
1760	leaq	(%rsi),%rsp
1761.Lfrom_epilogue:
# Encoded rep ret.
1762	.byte	0xf3,0xc3
1763.size	bn_from_mont8x,.-bn_from_mont8x
# bn_get_bits5 -- returns in %eax the five bits that start at bit offset
# %esi of the little-endian byte array at %rdi.
#
# The array is read as one 16-bit load at halfword index (offset >> 4).
# When the bit position inside that halfword is above 11, the five bits
# would run past the halfword, so the load address is moved up by one byte
# and the shift count reduced by 8.  Both adjustments use cmov, not branches.
1764.globl	bn_get_bits5
1765.type	bn_get_bits5,@function
1766.align	16
1767bn_get_bits5:
# %r10 = base, %r11 = base + 1 (the alternative load address).
1768	leaq	0(%rdi),%r10
1769	leaq	1(%rdi),%r11
# %esi = halfword index, %ecx = bit position within the halfword (0..15),
# %eax = that position minus 8 (the alternative shift count).
1770	movl	%esi,%ecx
1771	shrl	$4,%esi
1772	andl	$15,%ecx
1773	leal	-8(%rcx),%eax
1774	cmpl	$11,%ecx
1775	cmovaq	%r11,%r10
1776	cmoval	%eax,%ecx
# Load 16 bits, shift the field down and keep its low five bits.
1777	movzwl	(%r10,%rsi,2),%eax
1778	shrl	%cl,%eax
1779	andl	$31,%eax
# Encoded rep ret.
1780	.byte	0xf3,0xc3
1781.size	bn_get_bits5,.-bn_get_bits5
1782
# bn_scatter5 -- spreads %esi consecutive qwords from (%rdi) into the table
# at %rdx, one qword per 256-byte row, starting at qword slot %rcx of the
# first row.  Does nothing when %esi is zero.  No return value is set up;
# %rax is left holding the last qword copied.
1783.globl	bn_scatter5
1784.type	bn_scatter5,@function
1785.align	16
1786bn_scatter5:
1787	cmpl	$0,%esi
1788	jz	.Lscatter_epilogue
# %rdx = address of slot %rcx in the first row.
1789	leaq	(%rdx,%rcx,8),%rdx
1790.Lscatter:
# Copy one qword, then advance the source by 8 and the table by one row.
1791	movq	(%rdi),%rax
1792	leaq	8(%rdi),%rdi
1793	movq	%rax,(%rdx)
1794	leaq	256(%rdx),%rdx
1795	subl	$1,%esi
1796	jnz	.Lscatter
1797.Lscatter_epilogue:
# Encoded rep ret.
1798	.byte	0xf3,0xc3
1799.size	bn_scatter5,.-bn_scatter5
1800
# bn_gather5 -- reads %esi qwords back out of the table at %rdx, one per
# 256-byte row, into consecutive qwords at (%rdi).  %ecx is the table index.
#
# Within a row the entry sits at byte offset (index & 7) * 8 plus
# ((index >> 3) & 3) * 64.  The four candidates spaced 64 bytes apart are
# all loaded and combined with and/or masks, so bits 3-4 of the index never
# reach an address; the low three bits do.
# NOTE(review): unlike bn_scatter5 there is no guard for a zero count; with
# %esi == 0 the subl/jnz pair would wrap around -- confirm callers never
# pass zero.
1801.globl	bn_gather5
1802.type	bn_gather5,@function
1803.align	16
1804bn_gather5:
# %r11 = index & 7; %ecx = ~(index >> 3) & 3 positions a four-qword window
# over .Lmagic_masks so that exactly one of %xmm4..%xmm7 is all-ones.
1805	movl	%ecx,%r11d
1806	shrl	$3,%ecx
1807	andq	$7,%r11
1808	notl	%ecx
1809	leaq	.Lmagic_masks(%rip),%rax
1810	andl	$3,%ecx
# %rdx is biased by 128 so the four candidates sit at -128, -64, 0 and 64.
1811	leaq	128(%rdx,%r11,8),%rdx
1812	movq	0(%rax,%rcx,8),%xmm4
1813	movq	8(%rax,%rcx,8),%xmm5
1814	movq	16(%rax,%rcx,8),%xmm6
1815	movq	24(%rax,%rcx,8),%xmm7
1816	jmp	.Lgather
1817.align	16
1818.Lgather:
# One output qword per iteration.
1819	movq	-128(%rdx),%xmm0
1820	movq	-64(%rdx),%xmm1
1821	pand	%xmm4,%xmm0
1822	movq	0(%rdx),%xmm2
1823	pand	%xmm5,%xmm1
1824	movq	64(%rdx),%xmm3
1825	pand	%xmm6,%xmm2
1826	por	%xmm1,%xmm0
1827	pand	%xmm7,%xmm3
# Two address-size prefix bytes glued onto the following por.
# NOTE(review): presumably code-alignment padding -- confirm.
1828.byte	0x67,0x67
1829	por	%xmm2,%xmm0
1830	leaq	256(%rdx),%rdx
1831	por	%xmm3,%xmm0
1832
1833	movq	%xmm0,(%rdi)
1834	leaq	8(%rdi),%rdi
1835	subl	$1,%esi
1836	jnz	.Lgather
# Encoded rep ret.
1837	.byte	0xf3,0xc3
1838.LSEH_end_bn_gather5:
1839.size	bn_gather5,.-bn_gather5
# .Lmagic_masks -- eight qwords, all zero except the fourth, which is
# all-ones.  The gather code loads four consecutive qwords starting at qword
# index 0..3, so exactly one of the four loaded masks is all-ones.
1840.align	64
1841.Lmagic_masks:
1842.long	0,0, 0,0, 0,0, -1,-1
1843.long	0,0, 0,0, 0,0,  0,0
# NUL-terminated ASCII identification string: "Montgomery Multiplication
# with scatter/gather for x86_64, CRYPTOGAMS by <appro@openssl.org>".
1844.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
1845
1845