1#! /usr/bin/env perl
2# Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9
# Specific modes implementations for SPARC Architecture 2011. There
# is a T4 dependency though: an ASI value that is not specified in the
# Architecture Manual. But as the SPARC universe is rather monocultural,
# we assume that a processor capable of executing crypto instructions
# can handle the ASI in question as well. This means that we ought to
# keep our eyes open when new processors emerge...
16#
# As for the above-mentioned ASI: it's the so-called "block initializing
# store", which cancels the "read" in "read-update-write" on cache lines.
# This is a "cooperative" optimization, as it reduces overall pressure
# on the memory interface. Benefits can't be observed/quantified with
# usual benchmarks; on the contrary, you can notice that single-thread
# performance for parallelizable modes is ~1.5% worse for the largest
# block sizes [though a few percent better for not so long ones]. All
# this is based on suggestions from David Miller.
25
# Symbolic placeholder strings for the stack bias, the frame size and the
# condition-code register used for size_t comparisons.  asm_init() below
# overwrites all three with concrete values.
($::bias, $::frame, $::size_t_cc) = ("STACK_BIAS", "STACK_FRAME", "SIZE_T_CC");
29
# Pick the ABI-dependent parameters from the compiler flags.
# To be called with @ARGV as argument: if any argument contains "-m64" or
# "-xarch=v9", $::abibits is set to 64.  When $::abibits is 64 the globals
# become bias 2047, frame 192 and "%xcc"; otherwise 0, 112 and "%icc".
sub asm_init {
    $::abibits = 64 if grep { /\-m64/ || /\-xarch\=v9/ } @_;

    if ($::abibits == 64) {
        $::bias      = 2047;
        $::frame     = 192;
        $::size_t_cc = "%xcc";
    }
    else {
        $::bias      = 0;
        $::frame     = 112;
        $::size_t_cc = "%icc";
    }
}
35
# unified interface: the five arguments live in the first five %i registers
my ($inp,$out,$len,$key,$ivec) = qw(%i0 %i1 %i2 %i3 %i4);
# local variables: working state kept in %l0..%l5
my ($ileft,$iright,$ooff,$omask,$ivoff,$blk_init) = qw(%l0 %l1 %l2 %l3 %l4 %l5);
40
# Append the assembly for the routine <alg><bits>_t4_cbc_encrypt to $::code.
#
#   $alg   first component of the emitted symbol names
#   $bits  second component of the symbol names; also part of every .L label
#
# The generated code calls _<alg><bits>_load_enckey and
# _<alg><bits>_encrypt_1x, neither of which is emitted here.
#
# $::evp selects how ivec is read and written back: when true it is moved
# with four 32-bit ld/st; when false it is moved with 64-bit ldd/std plus an
# alignaddr/faligndata fix-up for an ivec that is not 8-byte aligned.
sub alg_cbc_encrypt_implement {
my ($alg,$bits) = @_;

# EVP-flavoured "write ivec back" sequence; appended at each of the three
# places where the emitted routine finishes.
my $ivec_writeback_evp=<<___;
          st                  %f0, [$ivec + 0]
          st                  %f1, [$ivec + 4]
          st                  %f2, [$ivec + 8]
          st                  %f3, [$ivec + 12]
___

# Entry point: bail out on zero length, remember whether inp differs from out.
$::code.=<<___;
.globl    ${alg}${bits}_t4_cbc_encrypt
.align    32
${alg}${bits}_t4_cbc_encrypt:
          save                %sp, -$::frame, %sp
          cmp                 $len, 0
          be,pn               $::size_t_cc, .L${bits}_cbc_enc_abort
          srln                $len, 0, $len                 ! needed on v8+, "nop" on v9
          sub                 $inp, $out, $blk_init         ! $inp!=$out
___

# Load ivec into %f0..%f3.
if ($::evp) {
$::code.=<<___;
          ld                  [$ivec + 0], %f0
          ld                  [$ivec + 4], %f1
          ld                  [$ivec + 8], %f2
          ld                  [$ivec + 12], %f3
___
} else {
$::code.=<<___;
          andcc               $ivec, 7, $ivoff
          alignaddr $ivec, %g0, $ivec

          ldd                 [$ivec + 0], %f0    ! load ivec
          bz,pt               %icc, 1f
          ldd                 [$ivec + 8], %f2
          ldd                 [$ivec + 16], %f4
          faligndata          %f0, %f2, %f0
          faligndata          %f2, %f4, %f2
1:
___
}

# Key load, input/output alignment bookkeeping, choice between the regular
# loop and the block-initializing-store loop, then the regular loop itself.
$::code.=<<___;
          prefetch  [$inp], 20
          prefetch  [$inp + 63], 20
          call                _${alg}${bits}_load_enckey
          and                 $inp, 7, $ileft
          andn                $inp, 7, $inp
          sll                 $ileft, 3, $ileft
          mov                 64, $iright
          mov                 0xff, $omask
          sub                 $iright, $ileft, $iright
          and                 $out, 7, $ooff
          cmp                 $len, 127
          movrnz              $ooff, 0, $blk_init           ! if (    $out&7 ||
          movleu              $::size_t_cc, 0, $blk_init    !         $len<128 ||
          brnz,pn             $blk_init, .L${bits}cbc_enc_blk         !         $inp==$out)
          srl                 $omask, $ooff, $omask

          alignaddrl          $out, %g0, $out
          srlx                $len, 4, $len
          prefetch  [$out], 22

.L${bits}_cbc_enc_loop:
          ldx                 [$inp + 0], %o0
          brz,pt              $ileft, 4f
          ldx                 [$inp + 8], %o1

          ldx                 [$inp + 16], %o2
          sllx                %o0, $ileft, %o0
          srlx                %o1, $iright, %g1
          sllx                %o1, $ileft, %o1
          or                  %g1, %o0, %o0
          srlx                %o2, $iright, %o2
          or                  %o2, %o1, %o1
4:
          xor                 %g4, %o0, %o0                 ! ^= rk[0]
          xor                 %g5, %o1, %o1
          movxtod             %o0, %f12
          movxtod             %o1, %f14

          fxor                %f12, %f0, %f0                ! ^= ivec
          fxor                %f14, %f2, %f2
          prefetch  [$out + 63], 22
          prefetch  [$inp + 16+63], 20
          call                _${alg}${bits}_encrypt_1x
          add                 $inp, 16, $inp

          brnz,pn             $ooff, 2f
          sub                 $len, 1, $len

          std                 %f0, [$out + 0]
          std                 %f2, [$out + 8]
          brnz,pt             $len, .L${bits}_cbc_enc_loop
          add                 $out, 16, $out
___

# Fall-through exit of the aligned-output loop: write ivec back.
if ($::evp) {
$::code.=$ivec_writeback_evp;
} else {
$::code.=<<___;
          brnz,pn             $ivoff, 3f
          nop

          std                 %f0, [$ivec + 0]    ! write out ivec
          std                 %f2, [$ivec + 8]
___
}

# Return (also the zero-length exit), followed by the unaligned-output
# store path, which re-enters the loop one instruction past its first ldx.
$::code.=<<___;
.L${bits}_cbc_enc_abort:
          ret
          restore

.align    16
2:        ldxa                [$inp]0x82, %o0               ! avoid read-after-write hazard
                                                            ! and ~3x deterioration
                                                            ! in inp==out case
          faligndata          %f0, %f0, %f4                 ! handle unaligned output
          faligndata          %f0, %f2, %f6
          faligndata          %f2, %f2, %f8

          stda                %f4, [$out + $omask]0xc0      ! partial store
          std                 %f6, [$out + 8]
          add                 $out, 16, $out
          orn                 %g0, $omask, $omask
          stda                %f8, [$out + $omask]0xc0      ! partial store

          brnz,pt             $len, .L${bits}_cbc_enc_loop+4
          orn                 %g0, $omask, $omask
___

# Exit of the unaligned-output path.  The non-EVP flavour also carries the
# local label 3: handler that stores an unaligned ivec with partial stores.
if ($::evp) {
$::code.=$ivec_writeback_evp;
} else {
$::code.=<<___;
          brnz,pn             $ivoff, 3f
          nop

          std                 %f0, [$ivec + 0]    ! write out ivec
          std                 %f2, [$ivec + 8]
          ret
          restore

.align    16
3:        alignaddrl          $ivec, $ivoff, %g0  ! handle unaligned ivec
          mov                 0xff, $omask
          srl                 $omask, $ivoff, $omask
          faligndata          %f0, %f0, %f4
          faligndata          %f0, %f2, %f6
          faligndata          %f2, %f2, %f8
          stda                %f4, [$ivec + $omask]0xc0
          std                 %f6, [$ivec + 8]
          add                 $ivec, 16, $ivec
          orn                 %g0, $omask, $omask
          stda                %f8, [$ivec + $omask]0xc0
___
}

# Return, then the block-initializing-store loop; the tail that does not
# fill a whole 64-byte line is handed back to the regular loop.
$::code.=<<___;
          ret
          restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align    32
.L${bits}cbc_enc_blk:
          add       $out, $len, $blk_init
          and       $blk_init, 63, $blk_init      ! tail
          sub       $len, $blk_init, $len
          add       $blk_init, 15, $blk_init      ! round up to 16n
          srlx      $len, 4, $len
          srl       $blk_init, 4, $blk_init

.L${bits}_cbc_enc_blk_loop:
          ldx                 [$inp + 0], %o0
          brz,pt              $ileft, 5f
          ldx                 [$inp + 8], %o1

          ldx                 [$inp + 16], %o2
          sllx                %o0, $ileft, %o0
          srlx                %o1, $iright, %g1
          sllx                %o1, $ileft, %o1
          or                  %g1, %o0, %o0
          srlx                %o2, $iright, %o2
          or                  %o2, %o1, %o1
5:
          xor                 %g4, %o0, %o0                 ! ^= rk[0]
          xor                 %g5, %o1, %o1
          movxtod             %o0, %f12
          movxtod             %o1, %f14

          fxor                %f12, %f0, %f0                ! ^= ivec
          fxor                %f14, %f2, %f2
          prefetch  [$inp + 16+63], 20
          call                _${alg}${bits}_encrypt_1x
          add                 $inp, 16, $inp
          sub                 $len, 1, $len

          stda                %f0, [$out]0xe2               ! ASI_BLK_INIT, T4-specific
          add                 $out, 8, $out
          stda                %f2, [$out]0xe2               ! ASI_BLK_INIT, T4-specific
          brnz,pt             $len, .L${bits}_cbc_enc_blk_loop
          add                 $out, 8, $out

          membar              #StoreLoad|#StoreStore
          brnz,pt             $blk_init, .L${bits}_cbc_enc_loop
          mov                 $blk_init, $len
___

# Exit of the block-store path; "3b" is the handler emitted above.
if ($::evp) {
$::code.=$ivec_writeback_evp;
} else {
$::code.=<<___;
          brnz,pn             $ivoff, 3b
          nop

          std                 %f0, [$ivec + 0]    ! write out ivec
          std                 %f2, [$ivec + 8]
___
}

$::code.=<<___;
          ret
          restore
.type     ${alg}${bits}_t4_cbc_encrypt,#function
.size     ${alg}${bits}_t4_cbc_encrypt,.-${alg}${bits}_t4_cbc_encrypt
___
}
258
# Append the assembly for the routine <alg><bits>_t4_cbc_decrypt to $::code.
#
#   $alg   first component of the emitted symbol names
#   $bits  second component of the symbol names; also part of every .L label
#
# The generated code calls _<alg><bits>_load_deckey, _<alg><bits>_decrypt_1x
# and _<alg><bits>_decrypt_2x, none of which is emitted here.
#
# The running ivec is kept in %f12/%f14 and refreshed from the raw input
# words after each decryption.  $::evp selects how ivec is read and written
# back: when true it is moved with four 32-bit ld/st; when false it is moved
# with 64-bit ldd/std plus an alignaddr/faligndata fix-up, and an unaligned
# ivec is stored by the handler at .L<bits>_cbc_dec_unaligned_ivec.
sub alg_cbc_decrypt_implement {
my ($alg,$bits) = @_;

# Entry point: bail out on zero length, remember whether inp differs from out.
$::code.=<<___;
.globl    ${alg}${bits}_t4_cbc_decrypt
.align    32
${alg}${bits}_t4_cbc_decrypt:
          save                %sp, -$::frame, %sp
          cmp                 $len, 0
          be,pn               $::size_t_cc, .L${bits}_cbc_dec_abort
          srln                $len, 0, $len                 ! needed on v8+, "nop" on v9
          sub                 $inp, $out, $blk_init         ! $inp!=$out
___
# Load ivec into %f12..%f15 (exactly one of the next two chunks is emitted).
$::code.=<<___ if (!$::evp);
          andcc               $ivec, 7, $ivoff
          alignaddr $ivec, %g0, $ivec

          ldd                 [$ivec + 0], %f12   ! load ivec
          bz,pt               %icc, 1f
          ldd                 [$ivec + 8], %f14
          ldd                 [$ivec + 16], %f0
          faligndata          %f12, %f14, %f12
          faligndata          %f14, %f0, %f14
1:
___
$::code.=<<___ if ($::evp);
          ld                  [$ivec + 0], %f12   ! load ivec
          ld                  [$ivec + 4], %f13
          ld                  [$ivec + 8], %f14
          ld                  [$ivec + 12], %f15
___
# Key load, alignment bookkeeping, choice between the regular loops and the
# block-initializing-store loop, then the one-block loop (taken first when
# the block count is odd).
$::code.=<<___;
          prefetch  [$inp], 20
          prefetch  [$inp + 63], 20
          call                _${alg}${bits}_load_deckey
          and                 $inp, 7, $ileft
          andn                $inp, 7, $inp
          sll                 $ileft, 3, $ileft
          mov                 64, $iright
          mov                 0xff, $omask
          sub                 $iright, $ileft, $iright
          and                 $out, 7, $ooff
          cmp                 $len, 255
          movrnz              $ooff, 0, $blk_init           ! if (    $out&7 ||
          movleu              $::size_t_cc, 0, $blk_init    !         $len<256 ||
          brnz,pn             $blk_init, .L${bits}cbc_dec_blk         !         $inp==$out)
          srl                 $omask, $ooff, $omask

          andcc               $len, 16, %g0                 ! is number of blocks even?
          srlx                $len, 4, $len
          alignaddrl          $out, %g0, $out
          bz                  %icc, .L${bits}_cbc_dec_loop2x
          prefetch  [$out], 22
.L${bits}_cbc_dec_loop:
          ldx                 [$inp + 0], %o0
          brz,pt              $ileft, 4f
          ldx                 [$inp + 8], %o1

          ldx                 [$inp + 16], %o2
          sllx                %o0, $ileft, %o0
          srlx                %o1, $iright, %g1
          sllx                %o1, $ileft, %o1
          or                  %g1, %o0, %o0
          srlx                %o2, $iright, %o2
          or                  %o2, %o1, %o1
4:
          xor                 %g4, %o0, %o2                 ! ^= rk[0]
          xor                 %g5, %o1, %o3
          movxtod             %o2, %f0
          movxtod             %o3, %f2

          prefetch  [$out + 63], 22
          prefetch  [$inp + 16+63], 20
          call                _${alg}${bits}_decrypt_1x
          add                 $inp, 16, $inp

          fxor                %f12, %f0, %f0                ! ^= ivec
          fxor                %f14, %f2, %f2
          movxtod             %o0, %f12
          movxtod             %o1, %f14

          brnz,pn             $ooff, 2f
          sub                 $len, 1, $len

          std                 %f0, [$out + 0]
          std                 %f2, [$out + 8]
          brnz,pt             $len, .L${bits}_cbc_dec_loop2x
          add                 $out, 16, $out
___
# Fall-through exit of the one-block loop: write ivec back.
$::code.=<<___ if ($::evp);
          st                  %f12, [$ivec + 0]
          st                  %f13, [$ivec + 4]
          st                  %f14, [$ivec + 8]
          st                  %f15, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
          brnz,pn             $ivoff, .L${bits}_cbc_dec_unaligned_ivec
          nop

          std                 %f12, [$ivec + 0]   ! write out ivec
          std                 %f14, [$ivec + 8]
___
# Return (also the zero-length exit), followed by the unaligned-output store
# path of the one-block loop.
$::code.=<<___;
.L${bits}_cbc_dec_abort:
          ret
          restore

.align    16
2:        ldxa                [$inp]0x82, %o0               ! avoid read-after-write hazard
                                                            ! and ~3x deterioration
                                                            ! in inp==out case
          faligndata          %f0, %f0, %f4                 ! handle unaligned output
          faligndata          %f0, %f2, %f6
          faligndata          %f2, %f2, %f8

          stda                %f4, [$out + $omask]0xc0      ! partial store
          std                 %f6, [$out + 8]
          add                 $out, 16, $out
          orn                 %g0, $omask, $omask
          stda                %f8, [$out + $omask]0xc0      ! partial store

          brnz,pt             $len, .L${bits}_cbc_dec_loop2x+4
          orn                 %g0, $omask, $omask
___
$::code.=<<___ if ($::evp);
          st                  %f12, [$ivec + 0]
          st                  %f13, [$ivec + 4]
          st                  %f14, [$ivec + 8]
          st                  %f15, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
          brnz,pn             $ivoff, .L${bits}_cbc_dec_unaligned_ivec
          nop

          std                 %f12, [$ivec + 0]   ! write out ivec
          std                 %f14, [$ivec + 8]
___
# Return, then the two-blocks-per-iteration loop.
$::code.=<<___;
          ret
          restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align    32
.L${bits}_cbc_dec_loop2x:
          ldx                 [$inp + 0], %o0
          ldx                 [$inp + 8], %o1
          ldx                 [$inp + 16], %o2
          brz,pt              $ileft, 4f
          ldx                 [$inp + 24], %o3

          ldx                 [$inp + 32], %o4
          sllx                %o0, $ileft, %o0
          srlx                %o1, $iright, %g1
          or                  %g1, %o0, %o0
          sllx                %o1, $ileft, %o1
          srlx                %o2, $iright, %g1
          or                  %g1, %o1, %o1
          sllx                %o2, $ileft, %o2
          srlx                %o3, $iright, %g1
          or                  %g1, %o2, %o2
          sllx                %o3, $ileft, %o3
          srlx                %o4, $iright, %o4
          or                  %o4, %o3, %o3
4:
          xor                 %g4, %o0, %o4                 ! ^= rk[0]
          xor                 %g5, %o1, %o5
          movxtod             %o4, %f0
          movxtod             %o5, %f2
          xor                 %g4, %o2, %o4
          xor                 %g5, %o3, %o5
          movxtod             %o4, %f4
          movxtod             %o5, %f6

          prefetch  [$out + 63], 22
          prefetch  [$inp + 32+63], 20
          call                _${alg}${bits}_decrypt_2x
          add                 $inp, 32, $inp

          movxtod             %o0, %f8
          movxtod             %o1, %f10
          fxor                %f12, %f0, %f0                ! ^= ivec
          fxor                %f14, %f2, %f2
          movxtod             %o2, %f12
          movxtod             %o3, %f14
          fxor                %f8, %f4, %f4
          fxor                %f10, %f6, %f6

          brnz,pn             $ooff, 2f
          sub                 $len, 2, $len

          std                 %f0, [$out + 0]
          std                 %f2, [$out + 8]
          std                 %f4, [$out + 16]
          std                 %f6, [$out + 24]
          brnz,pt             $len, .L${bits}_cbc_dec_loop2x
          add                 $out, 32, $out
___
# Fall-through exit of the two-block loop: write ivec back.
$::code.=<<___ if ($::evp);
          st                  %f12, [$ivec + 0]
          st                  %f13, [$ivec + 4]
          st                  %f14, [$ivec + 8]
          st                  %f15, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
          brnz,pn             $ivoff, .L${bits}_cbc_dec_unaligned_ivec
          nop

          std                 %f12, [$ivec + 0]   ! write out ivec
          std                 %f14, [$ivec + 8]
___
# Return, then the unaligned-output store path of the two-block loop.
$::code.=<<___;
          ret
          restore

.align    16
2:        ldxa                [$inp]0x82, %o0               ! avoid read-after-write hazard
                                                            ! and ~3x deterioration
                                                            ! in inp==out case
          faligndata          %f0, %f0, %f8                 ! handle unaligned output
          faligndata          %f0, %f2, %f0
          faligndata          %f2, %f4, %f2
          faligndata          %f4, %f6, %f4
          faligndata          %f6, %f6, %f6
          stda                %f8, [$out + $omask]0xc0      ! partial store
          std                 %f0, [$out + 8]
          std                 %f2, [$out + 16]
          std                 %f4, [$out + 24]
          add                 $out, 32, $out
          orn                 %g0, $omask, $omask
          stda                %f6, [$out + $omask]0xc0      ! partial store

          brnz,pt             $len, .L${bits}_cbc_dec_loop2x+4
          orn                 %g0, $omask, $omask
___
$::code.=<<___ if ($::evp);
          st                  %f12, [$ivec + 0]
          st                  %f13, [$ivec + 4]
          st                  %f14, [$ivec + 8]
          st                  %f15, [$ivec + 12]
___
# The non-EVP flavour also carries the handler that stores an unaligned ivec
# with partial stores; every non-EVP exit of this routine branches to it.
$::code.=<<___ if (!$::evp);
          brnz,pn             $ivoff, .L${bits}_cbc_dec_unaligned_ivec
          nop

          std                 %f12, [$ivec + 0]   ! write out ivec
          std                 %f14, [$ivec + 8]
          ret
          restore

.align    16
.L${bits}_cbc_dec_unaligned_ivec:
          alignaddrl          $ivec, $ivoff, %g0  ! handle unaligned ivec
          mov                 0xff, $omask
          srl                 $omask, $ivoff, $omask
          faligndata          %f12, %f12, %f0
          faligndata          %f12, %f14, %f2
          faligndata          %f14, %f14, %f4
          stda                %f0, [$ivec + $omask]0xc0
          std                 %f2, [$ivec + 8]
          add                 $ivec, 16, $ivec
          orn                 %g0, $omask, $omask
          stda                %f4, [$ivec + $omask]0xc0
___
# Return, then the block-initializing-store loop; whatever does not fill a
# whole 64-byte line is handed back to the regular loops.
$::code.=<<___;
          ret
          restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align    32
.L${bits}cbc_dec_blk:
          add       $out, $len, $blk_init
          and       $blk_init, 63, $blk_init      ! tail
          sub       $len, $blk_init, $len
          add       $blk_init, 15, $blk_init      ! round up to 16n
          srlx      $len, 4, $len
          srl       $blk_init, 4, $blk_init
          sub       $len, 1, $len
          add       $blk_init, 1, $blk_init

.L${bits}_cbc_dec_blk_loop2x:
          ldx                 [$inp + 0], %o0
          ldx                 [$inp + 8], %o1
          ldx                 [$inp + 16], %o2
          brz,pt              $ileft, 5f
          ldx                 [$inp + 24], %o3

          ldx                 [$inp + 32], %o4
          sllx                %o0, $ileft, %o0
          srlx                %o1, $iright, %g1
          or                  %g1, %o0, %o0
          sllx                %o1, $ileft, %o1
          srlx                %o2, $iright, %g1
          or                  %g1, %o1, %o1
          sllx                %o2, $ileft, %o2
          srlx                %o3, $iright, %g1
          or                  %g1, %o2, %o2
          sllx                %o3, $ileft, %o3
          srlx                %o4, $iright, %o4
          or                  %o4, %o3, %o3
5:
          xor                 %g4, %o0, %o4                 ! ^= rk[0]
          xor                 %g5, %o1, %o5
          movxtod             %o4, %f0
          movxtod             %o5, %f2
          xor                 %g4, %o2, %o4
          xor                 %g5, %o3, %o5
          movxtod             %o4, %f4
          movxtod             %o5, %f6

          prefetch  [$inp + 32+63], 20
          call                _${alg}${bits}_decrypt_2x
          add                 $inp, 32, $inp
          subcc               $len, 2, $len

          movxtod             %o0, %f8
          movxtod             %o1, %f10
          fxor                %f12, %f0, %f0                ! ^= ivec
          fxor                %f14, %f2, %f2
          movxtod             %o2, %f12
          movxtod             %o3, %f14
          fxor                %f8, %f4, %f4
          fxor                %f10, %f6, %f6

          stda                %f0, [$out]0xe2               ! ASI_BLK_INIT, T4-specific
          add                 $out, 8, $out
          stda                %f2, [$out]0xe2               ! ASI_BLK_INIT, T4-specific
          add                 $out, 8, $out
          stda                %f4, [$out]0xe2               ! ASI_BLK_INIT, T4-specific
          add                 $out, 8, $out
          stda                %f6, [$out]0xe2               ! ASI_BLK_INIT, T4-specific
          bgu,pt              $::size_t_cc, .L${bits}_cbc_dec_blk_loop2x
          add                 $out, 8, $out

          add                 $blk_init, $len, $len
          andcc               $len, 1, %g0                  ! is number of blocks even?
          membar              #StoreLoad|#StoreStore
          bnz,pt              %icc, .L${bits}_cbc_dec_loop
          srl                 $len, 0, $len
          brnz,pn             $len, .L${bits}_cbc_dec_loop2x
          nop
___
$::code.=<<___ if ($::evp);
          st                  %f12, [$ivec + 0]   ! write out ivec
          st                  %f13, [$ivec + 4]
          st                  %f14, [$ivec + 8]
          st                  %f15, [$ivec + 12]
___
# The branch target here must be the named handler, not the local label
# "3b": this routine defines no label 3, so "3b" would bind to whichever
# "3:" was emitted earlier in the output (the CBC encrypt routine's handler
# stores %f0/%f2, not the %f12/%f14 pair that holds ivec here).
$::code.=<<___ if (!$::evp);
          brnz,pn             $ivoff, .L${bits}_cbc_dec_unaligned_ivec
          nop

          std                 %f12, [$ivec + 0]   ! write out ivec
          std                 %f14, [$ivec + 8]
___
$::code.=<<___;
          ret
          restore
.type     ${alg}${bits}_t4_cbc_decrypt,#function
.size     ${alg}${bits}_t4_cbc_decrypt,.-${alg}${bits}_t4_cbc_decrypt
___
}
620
621sub alg_ctr32_implement {
622my ($alg,$bits) = @_;
623
624$::code.=<<___;
625.globl    ${alg}${bits}_t4_ctr32_encrypt
626.align    32
627${alg}${bits}_t4_ctr32_encrypt:
628          save                %sp, -$::frame, %sp
629          srln                $len, 0, $len                 ! needed on v8+, "nop" on v9
630
631          prefetch  [$inp], 20
632          prefetch  [$inp + 63], 20
633          call                _${alg}${bits}_load_enckey
634          sllx                $len, 4, $len
635
636          ld                  [$ivec + 0], %l4    ! counter
637          ld                  [$ivec + 4], %l5
638          ld                  [$ivec + 8], %l6
639          ld                  [$ivec + 12], %l7
640
641          sllx                %l4, 32, %o5
642          or                  %l5, %o5, %o5
643          sllx                %l6, 32, %g1
644          xor                 %o5, %g4, %g4                 ! ^= rk[0]
645          xor                 %g1, %g5, %g5
646          movxtod             %g4, %f14           ! most significant 64 bits
647
648          sub                 $inp, $out, $blk_init         ! $inp!=$out
649          and                 $inp, 7, $ileft
650          andn                $inp, 7, $inp
651          sll                 $ileft, 3, $ileft
652          mov                 64, $iright
653          mov                 0xff, $omask
654          sub                 $iright, $ileft, $iright
655          and                 $out, 7, $ooff
656          cmp                 $len, 255
657          movrnz              $ooff, 0, $blk_init           ! if (    $out&7 ||
658          movleu              $::size_t_cc, 0, $blk_init    !         $len<256 ||
659          brnz,pn             $blk_init, .L${bits}_ctr32_blk          !         $inp==$out)
660          srl                 $omask, $ooff, $omask
661
662          andcc               $len, 16, %g0                 ! is number of blocks even?
663          alignaddrl          $out, %g0, $out
664          bz                  %icc, .L${bits}_ctr32_loop2x
665          srlx                $len, 4, $len
666.L${bits}_ctr32_loop:
667          ldx                 [$inp + 0], %o0
668          brz,pt              $ileft, 4f
669          ldx                 [$inp + 8], %o1
670
671          ldx                 [$inp + 16], %o2
672          sllx                %o0, $ileft, %o0
673          srlx                %o1, $iright, %g1
674          sllx                %o1, $ileft, %o1
675          or                  %g1, %o0, %o0
676          srlx                %o2, $iright, %o2
677          or                  %o2, %o1, %o1
6784:
679          xor                 %g5, %l7, %g1                 ! ^= rk[0]
680          add                 %l7, 1, %l7
681          movxtod             %g1, %f2
682          srl                 %l7, 0, %l7                   ! clruw
683          prefetch  [$out + 63], 22
684          prefetch  [$inp + 16+63], 20
685___
686$::code.=<<___ if ($alg eq "aes");
687          aes_eround01        %f16, %f14, %f2, %f4
688          aes_eround23        %f18, %f14, %f2, %f2
689___
690$::code.=<<___ if ($alg eq "cmll");
691          camellia_f          %f16, %f2, %f14, %f2
692          camellia_f          %f18, %f14, %f2, %f0
693___
694$::code.=<<___;
695          call                _${alg}${bits}_encrypt_1x+8
696          add                 $inp, 16, $inp
697
698          movxtod             %o0, %f10
699          movxtod             %o1, %f12
700          fxor                %f10, %f0, %f0                ! ^= inp
701          fxor                %f12, %f2, %f2
702
703          brnz,pn             $ooff, 2f
704          sub                 $len, 1, $len
705
706          std                 %f0, [$out + 0]
707          std                 %f2, [$out + 8]
708          brnz,pt             $len, .L${bits}_ctr32_loop2x
709          add                 $out, 16, $out
710
711          ret
712          restore
713
714.align    16
7152:        ldxa                [$inp]0x82, %o0               ! avoid read-after-write hazard
716                                                            ! and ~3x deterioration
717                                                            ! in inp==out case
718          faligndata          %f0, %f0, %f4                 ! handle unaligned output
719          faligndata          %f0, %f2, %f6
720          faligndata          %f2, %f2, %f8
721          stda                %f4, [$out + $omask]0xc0      ! partial store
722          std                 %f6, [$out + 8]
723          add                 $out, 16, $out
724          orn                 %g0, $omask, $omask
725          stda                %f8, [$out + $omask]0xc0      ! partial store
726
727          brnz,pt             $len, .L${bits}_ctr32_loop2x+4
728          orn                 %g0, $omask, $omask
729
730          ret
731          restore
732
733!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
734.align    32
735.L${bits}_ctr32_loop2x:
736          ldx                 [$inp + 0], %o0
737          ldx                 [$inp + 8], %o1
738          ldx                 [$inp + 16], %o2
739          brz,pt              $ileft, 4f
740          ldx                 [$inp + 24], %o3
741
742          ldx                 [$inp + 32], %o4
743          sllx                %o0, $ileft, %o0
744          srlx                %o1, $iright, %g1
745          or                  %g1, %o0, %o0
746          sllx                %o1, $ileft, %o1
747          srlx                %o2, $iright, %g1
748          or                  %g1, %o1, %o1
749          sllx                %o2, $ileft, %o2
750          srlx                %o3, $iright, %g1
751          or                  %g1, %o2, %o2
752          sllx                %o3, $ileft, %o3
753          srlx                %o4, $iright, %o4
754          or                  %o4, %o3, %o3
7554:
756          xor                 %g5, %l7, %g1                 ! ^= rk[0]
757          add                 %l7, 1, %l7
758          movxtod             %g1, %f2
759          srl                 %l7, 0, %l7                   ! clruw
760          xor                 %g5, %l7, %g1
761          add                 %l7, 1, %l7
762          movxtod             %g1, %f6
763          srl                 %l7, 0, %l7                   ! clruw
764          prefetch  [$out + 63], 22
765          prefetch  [$inp + 32+63], 20
766___
767$::code.=<<___ if ($alg eq "aes");
768          aes_eround01        %f16, %f14, %f2, %f8
769          aes_eround23        %f18, %f14, %f2, %f2
770          aes_eround01        %f16, %f14, %f6, %f10
771          aes_eround23        %f18, %f14, %f6, %f6
772___
773$::code.=<<___ if ($alg eq "cmll");
774          camellia_f          %f16, %f2, %f14, %f2
775          camellia_f          %f16, %f6, %f14, %f6
776          camellia_f          %f18, %f14, %f2, %f0
777          camellia_f          %f18, %f14, %f6, %f4
778___
779$::code.=<<___;
780          call                _${alg}${bits}_encrypt_2x+16
781          add                 $inp, 32, $inp
782
783          movxtod             %o0, %f8
784          movxtod             %o1, %f10
785          movxtod             %o2, %f12
786          fxor                %f8, %f0, %f0                 ! ^= inp
787          movxtod             %o3, %f8
788          fxor                %f10, %f2, %f2
789          fxor                %f12, %f4, %f4
790          fxor                %f8, %f6, %f6
791
792          brnz,pn             $ooff, 2f
793          sub                 $len, 2, $len
794
795          std                 %f0, [$out + 0]
796          std                 %f2, [$out + 8]
797          std                 %f4, [$out + 16]
798          std                 %f6, [$out + 24]
799          brnz,pt             $len, .L${bits}_ctr32_loop2x
800          add                 $out, 32, $out
801
802          ret
803          restore
804
805.align    16
8062:        ldxa                [$inp]0x82, %o0               ! avoid read-after-write hazard
807                                                            ! and ~3x deterioration
808                                                            ! in inp==out case
809          faligndata          %f0, %f0, %f8                 ! handle unaligned output
810          faligndata          %f0, %f2, %f0
811          faligndata          %f2, %f4, %f2
812          faligndata          %f4, %f6, %f4
813          faligndata          %f6, %f6, %f6
814
815          stda                %f8, [$out + $omask]0xc0      ! partial store
816          std                 %f0, [$out + 8]
817          std                 %f2, [$out + 16]
818          std                 %f4, [$out + 24]
819          add                 $out, 32, $out
820          orn                 %g0, $omask, $omask
821          stda                %f6, [$out + $omask]0xc0      ! partial store
822
823          brnz,pt             $len, .L${bits}_ctr32_loop2x+4
824          orn                 %g0, $omask, $omask
825
826          ret
827          restore
828
829!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
830.align    32
831.L${bits}_ctr32_blk:
832          add       $out, $len, $blk_init
833          and       $blk_init, 63, $blk_init      ! tail
834          sub       $len, $blk_init, $len
835          add       $blk_init, 15, $blk_init      ! round up to 16n
836          srlx      $len, 4, $len
837          srl       $blk_init, 4, $blk_init
838          sub       $len, 1, $len
839          add       $blk_init, 1, $blk_init
840
841.L${bits}_ctr32_blk_loop2x:
842          ldx                 [$inp + 0], %o0
843          ldx                 [$inp + 8], %o1
844          ldx                 [$inp + 16], %o2
845          brz,pt              $ileft, 5f
846          ldx                 [$inp + 24], %o3
847
848          ldx                 [$inp + 32], %o4
849          sllx                %o0, $ileft, %o0
850          srlx                %o1, $iright, %g1
851          or                  %g1, %o0, %o0
852          sllx                %o1, $ileft, %o1
853          srlx                %o2, $iright, %g1
854          or                  %g1, %o1, %o1
855          sllx                %o2, $ileft, %o2
856          srlx                %o3, $iright, %g1
857          or                  %g1, %o2, %o2
858          sllx                %o3, $ileft, %o3
859          srlx                %o4, $iright, %o4
860          or                  %o4, %o3, %o3
8615:
862          xor                 %g5, %l7, %g1                 ! ^= rk[0]
863          add                 %l7, 1, %l7
864          movxtod             %g1, %f2
865          srl                 %l7, 0, %l7                   ! clruw
866          xor                 %g5, %l7, %g1
867          add                 %l7, 1, %l7
868          movxtod             %g1, %f6
869          srl                 %l7, 0, %l7                   ! clruw
870          prefetch  [$inp + 32+63], 20
871___
872$::code.=<<___ if ($alg eq "aes");
873          aes_eround01        %f16, %f14, %f2, %f8
874          aes_eround23        %f18, %f14, %f2, %f2
875          aes_eround01        %f16, %f14, %f6, %f10
876          aes_eround23        %f18, %f14, %f6, %f6
877___
878$::code.=<<___ if ($alg eq "cmll");
879          camellia_f          %f16, %f2, %f14, %f2
880          camellia_f          %f16, %f6, %f14, %f6
881          camellia_f          %f18, %f14, %f2, %f0
882          camellia_f          %f18, %f14, %f6, %f4
883___
884$::code.=<<___;
885          call                _${alg}${bits}_encrypt_2x+16
886          add                 $inp, 32, $inp
887          subcc               $len, 2, $len
888
889          movxtod             %o0, %f8
890          movxtod             %o1, %f10
891          movxtod             %o2, %f12
892          fxor                %f8, %f0, %f0                 ! ^= inp
893          movxtod             %o3, %f8
894          fxor                %f10, %f2, %f2
895          fxor                %f12, %f4, %f4
896          fxor                %f8, %f6, %f6
897
898          stda                %f0, [$out]0xe2               ! ASI_BLK_INIT, T4-specific
899          add                 $out, 8, $out
900          stda                %f2, [$out]0xe2               ! ASI_BLK_INIT, T4-specific
901          add                 $out, 8, $out
902          stda                %f4, [$out]0xe2               ! ASI_BLK_INIT, T4-specific
903          add                 $out, 8, $out
904          stda                %f6, [$out]0xe2               ! ASI_BLK_INIT, T4-specific
905          bgu,pt              $::size_t_cc, .L${bits}_ctr32_blk_loop2x
906          add                 $out, 8, $out
907
908          add                 $blk_init, $len, $len
909          andcc               $len, 1, %g0                  ! is number of blocks even?
910          membar              #StoreLoad|#StoreStore
911          bnz,pt              %icc, .L${bits}_ctr32_loop
912          srl                 $len, 0, $len
913          brnz,pn             $len, .L${bits}_ctr32_loop2x
914          nop
915
916          ret
917          restore
918.type     ${alg}${bits}_t4_ctr32_encrypt,#function
919.size     ${alg}${bits}_t4_ctr32_encrypt,.-${alg}${bits}_t4_ctr32_encrypt
920___
921}
922
# Append the SPARC T4 XTS routine ${alg}${bits}_t4_xts_${dir}crypt to
# $::code.
#
# Arguments:
#   $alg   cipher prefix used to build symbol names (e.g. the callee
#          ${alg}_t4_encrypt and _${alg}${bits}_${dir}crypt_1x/_2x)
#   $bits  key size, used only to build symbol and label names
#   $dir   "en" or "de"; selects the encrypt or decrypt flavour and the
#          matching ciphertext-stealing tail
#
# The generated routine takes (inp, out, len, key1, key2, ivec) in
# %i0..%i5.  It first encrypts ivec with key2 into a 16-byte scratch slot
# on the stack to obtain the initial tweak, then processes the data one or
# two blocks per iteration.  $rem shares the register of $ivec: once the
# tweak has been computed that register holds len%16.
#
# All text is appended to $::code explicitly, so the output never depends
# on which package happens to be current when this file is compiled.
sub alg_xts_implement {
my ($alg,$bits,$dir) = @_;
my ($inp,$out,$len,$key1,$key2,$ivec)=map("%i$_",(0..5));
my $rem=$ivec;

# Prologue: compute the initial tweak, set up the byte-swap mask, load the
# key schedule and split the length into whole blocks and remainder.
$::code.=<<___;
.globl    ${alg}${bits}_t4_xts_${dir}crypt
.align    32
${alg}${bits}_t4_xts_${dir}crypt:
          save                %sp, -$::frame-16, %sp
          srln                $len, 0, $len                 ! needed on v8+, "nop" on v9

          mov                 $ivec, %o0
          add                 %fp, $::bias-16, %o1
          call                ${alg}_t4_encrypt
          mov                 $key2, %o2

          add                 %fp, $::bias-16, %l7
          ldxa                [%l7]0x88, %g2
          add                 %fp, $::bias-8, %l7
          ldxa                [%l7]0x88, %g3                ! %g3:%g2 is tweak

          sethi               %hi(0x76543210), %l7
          or                  %l7, %lo(0x76543210), %l7
          bmask               %l7, %g0, %g0                 ! byte swap mask

          prefetch  [$inp], 20
          prefetch  [$inp + 63], 20
          call                _${alg}${bits}_load_${dir}ckey
          and                 $len, 15,  $rem
          and                 $len, -16, $len
___
# Decrypt only: when there is a partial trailing block, hold back one full
# block so that the stealing code can process it.
$::code.=<<___ if ($dir eq "de");
          mov                 0, %l7
          movrnz              $rem, 16,  %l7
          sub                 $len, %l7, $len
___
# Alignment bookkeeping and choice between the regular loops and the
# block-initializing-store path.
$::code.=<<___;

          sub                 $inp, $out, $blk_init         ! $inp!=$out
          and                 $inp, 7, $ileft
          andn                $inp, 7, $inp
          sll                 $ileft, 3, $ileft
          mov                 64, $iright
          mov                 0xff, $omask
          sub                 $iright, $ileft, $iright
          and                 $out, 7, $ooff
          cmp                 $len, 255
          movrnz              $ooff, 0, $blk_init           ! if (    $out&7 ||
          movleu              $::size_t_cc, 0, $blk_init    !         $len<256 ||
          brnz,pn             $blk_init, .L${bits}_xts_${dir}blk !    $inp==$out)
          srl                 $omask, $ooff, $omask

          andcc               $len, 16, %g0                 ! is number of blocks even?
___
# Decrypt only: with no whole block left to process, go straight to the
# stealing code.
$::code.=<<___ if ($dir eq "de");
          brz,pn              $len, .L${bits}_xts_${dir}steal
___
# Main body: one-block loop, two-block loop and the block-store variant.
$::code.=<<___;
          alignaddrl          $out, %g0, $out
          bz                  %icc, .L${bits}_xts_${dir}loop2x
          srlx                $len, 4, $len
.L${bits}_xts_${dir}loop:
          ldx                 [$inp + 0], %o0
          brz,pt              $ileft, 4f
          ldx                 [$inp + 8], %o1

          ldx                 [$inp + 16], %o2
          sllx                %o0, $ileft, %o0
          srlx                %o1, $iright, %g1
          sllx                %o1, $ileft, %o1
          or                  %g1, %o0, %o0
          srlx                %o2, $iright, %o2
          or                  %o2, %o1, %o1
4:
          movxtod             %g2, %f12
          movxtod             %g3, %f14
          bshuffle  %f12, %f12, %f12
          bshuffle  %f14, %f14, %f14

          xor                 %g4, %o0, %o0                 ! ^= rk[0]
          xor                 %g5, %o1, %o1
          movxtod             %o0, %f0
          movxtod             %o1, %f2

          fxor                %f12, %f0, %f0                ! ^= tweak[0]
          fxor                %f14, %f2, %f2

          prefetch  [$out + 63], 22
          prefetch  [$inp + 16+63], 20
          call                _${alg}${bits}_${dir}crypt_1x
          add                 $inp, 16, $inp

          fxor                %f12, %f0, %f0                ! ^= tweak[0]
          fxor                %f14, %f2, %f2

          srax                %g3, 63, %l7                  ! next tweak value
          addcc               %g2, %g2, %g2
          and                 %l7, 0x87, %l7
          addxc               %g3, %g3, %g3
          xor                 %l7, %g2, %g2

          brnz,pn             $ooff, 2f
          sub                 $len, 1, $len

          std                 %f0, [$out + 0]
          std                 %f2, [$out + 8]
          brnz,pt             $len, .L${bits}_xts_${dir}loop2x
          add                 $out, 16, $out

          brnz,pn             $rem, .L${bits}_xts_${dir}steal
          nop

          ret
          restore

.align    16
2:        ldxa                [$inp]0x82, %o0               ! avoid read-after-write hazard
                                                            ! and ~3x deterioration
                                                            ! in inp==out case
          faligndata          %f0, %f0, %f4                 ! handle unaligned output
          faligndata          %f0, %f2, %f6
          faligndata          %f2, %f2, %f8
          stda                %f4, [$out + $omask]0xc0      ! partial store
          std                 %f6, [$out + 8]
          add                 $out, 16, $out
          orn                 %g0, $omask, $omask
          stda                %f8, [$out + $omask]0xc0      ! partial store

          brnz,pt             $len, .L${bits}_xts_${dir}loop2x+4
          orn                 %g0, $omask, $omask

          brnz,pn             $rem, .L${bits}_xts_${dir}steal
          nop

          ret
          restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align    32
.L${bits}_xts_${dir}loop2x:
          ldx                 [$inp + 0], %o0
          ldx                 [$inp + 8], %o1
          ldx                 [$inp + 16], %o2
          brz,pt              $ileft, 4f
          ldx                 [$inp + 24], %o3

          ldx                 [$inp + 32], %o4
          sllx                %o0, $ileft, %o0
          srlx                %o1, $iright, %g1
          or                  %g1, %o0, %o0
          sllx                %o1, $ileft, %o1
          srlx                %o2, $iright, %g1
          or                  %g1, %o1, %o1
          sllx                %o2, $ileft, %o2
          srlx                %o3, $iright, %g1
          or                  %g1, %o2, %o2
          sllx                %o3, $ileft, %o3
          srlx                %o4, $iright, %o4
          or                  %o4, %o3, %o3
4:
          movxtod             %g2, %f12
          movxtod             %g3, %f14
          bshuffle  %f12, %f12, %f12
          bshuffle  %f14, %f14, %f14

          srax                %g3, 63, %l7                  ! next tweak value
          addcc               %g2, %g2, %g2
          and                 %l7, 0x87, %l7
          addxc               %g3, %g3, %g3
          xor                 %l7, %g2, %g2

          movxtod             %g2, %f8
          movxtod             %g3, %f10
          bshuffle  %f8,  %f8,  %f8
          bshuffle  %f10, %f10, %f10

          xor                 %g4, %o0, %o0                 ! ^= rk[0]
          xor                 %g5, %o1, %o1
          xor                 %g4, %o2, %o2                 ! ^= rk[0]
          xor                 %g5, %o3, %o3
          movxtod             %o0, %f0
          movxtod             %o1, %f2
          movxtod             %o2, %f4
          movxtod             %o3, %f6

          fxor                %f12, %f0, %f0                ! ^= tweak[0]
          fxor                %f14, %f2, %f2
          fxor                %f8,  %f4, %f4                ! ^= tweak[0]
          fxor                %f10, %f6, %f6

          prefetch  [$out + 63], 22
          prefetch  [$inp + 32+63], 20
          call                _${alg}${bits}_${dir}crypt_2x
          add                 $inp, 32, $inp

          movxtod             %g2, %f8
          movxtod             %g3, %f10

          srax                %g3, 63, %l7                  ! next tweak value
          addcc               %g2, %g2, %g2
          and                 %l7, 0x87, %l7
          addxc               %g3, %g3, %g3
          xor                 %l7, %g2, %g2

          bshuffle  %f8,  %f8,  %f8
          bshuffle  %f10, %f10, %f10

          fxor                %f12, %f0, %f0                ! ^= tweak[0]
          fxor                %f14, %f2, %f2
          fxor                %f8,  %f4, %f4
          fxor                %f10, %f6, %f6

          brnz,pn             $ooff, 2f
          sub                 $len, 2, $len

          std                 %f0, [$out + 0]
          std                 %f2, [$out + 8]
          std                 %f4, [$out + 16]
          std                 %f6, [$out + 24]
          brnz,pt             $len, .L${bits}_xts_${dir}loop2x
          add                 $out, 32, $out

          fsrc2               %f4, %f0
          fsrc2               %f6, %f2
          brnz,pn             $rem, .L${bits}_xts_${dir}steal
          nop

          ret
          restore

.align    16
2:        ldxa                [$inp]0x82, %o0               ! avoid read-after-write hazard
                                                            ! and ~3x deterioration
                                                            ! in inp==out case
          faligndata          %f0, %f0, %f8                 ! handle unaligned output
          faligndata          %f0, %f2, %f10
          faligndata          %f2, %f4, %f12
          faligndata          %f4, %f6, %f14
          faligndata          %f6, %f6, %f0

          stda                %f8, [$out + $omask]0xc0      ! partial store
          std                 %f10, [$out + 8]
          std                 %f12, [$out + 16]
          std                 %f14, [$out + 24]
          add                 $out, 32, $out
          orn                 %g0, $omask, $omask
          stda                %f0, [$out + $omask]0xc0      ! partial store

          brnz,pt             $len, .L${bits}_xts_${dir}loop2x+4
          orn                 %g0, $omask, $omask

          fsrc2               %f4, %f0
          fsrc2               %f6, %f2
          brnz,pn             $rem, .L${bits}_xts_${dir}steal
          nop

          ret
          restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align    32
.L${bits}_xts_${dir}blk:
          add       $out, $len, $blk_init
          and       $blk_init, 63, $blk_init      ! tail
          sub       $len, $blk_init, $len
          add       $blk_init, 15, $blk_init      ! round up to 16n
          srlx      $len, 4, $len
          srl       $blk_init, 4, $blk_init
          sub       $len, 1, $len
          add       $blk_init, 1, $blk_init

.L${bits}_xts_${dir}blk2x:
          ldx                 [$inp + 0], %o0
          ldx                 [$inp + 8], %o1
          ldx                 [$inp + 16], %o2
          brz,pt              $ileft, 5f
          ldx                 [$inp + 24], %o3

          ldx                 [$inp + 32], %o4
          sllx                %o0, $ileft, %o0
          srlx                %o1, $iright, %g1
          or                  %g1, %o0, %o0
          sllx                %o1, $ileft, %o1
          srlx                %o2, $iright, %g1
          or                  %g1, %o1, %o1
          sllx                %o2, $ileft, %o2
          srlx                %o3, $iright, %g1
          or                  %g1, %o2, %o2
          sllx                %o3, $ileft, %o3
          srlx                %o4, $iright, %o4
          or                  %o4, %o3, %o3
5:
          movxtod             %g2, %f12
          movxtod             %g3, %f14
          bshuffle  %f12, %f12, %f12
          bshuffle  %f14, %f14, %f14

          srax                %g3, 63, %l7                  ! next tweak value
          addcc               %g2, %g2, %g2
          and                 %l7, 0x87, %l7
          addxc               %g3, %g3, %g3
          xor                 %l7, %g2, %g2

          movxtod             %g2, %f8
          movxtod             %g3, %f10
          bshuffle  %f8,  %f8,  %f8
          bshuffle  %f10, %f10, %f10

          xor                 %g4, %o0, %o0                 ! ^= rk[0]
          xor                 %g5, %o1, %o1
          xor                 %g4, %o2, %o2                 ! ^= rk[0]
          xor                 %g5, %o3, %o3
          movxtod             %o0, %f0
          movxtod             %o1, %f2
          movxtod             %o2, %f4
          movxtod             %o3, %f6

          fxor                %f12, %f0, %f0                ! ^= tweak[0]
          fxor                %f14, %f2, %f2
          fxor                %f8,  %f4, %f4                ! ^= tweak[0]
          fxor                %f10, %f6, %f6

          prefetch  [$inp + 32+63], 20
          call                _${alg}${bits}_${dir}crypt_2x
          add                 $inp, 32, $inp

          movxtod             %g2, %f8
          movxtod             %g3, %f10

          srax                %g3, 63, %l7                  ! next tweak value
          addcc               %g2, %g2, %g2
          and                 %l7, 0x87, %l7
          addxc               %g3, %g3, %g3
          xor                 %l7, %g2, %g2

          bshuffle  %f8,  %f8,  %f8
          bshuffle  %f10, %f10, %f10

          fxor                %f12, %f0, %f0                ! ^= tweak[0]
          fxor                %f14, %f2, %f2
          fxor                %f8,  %f4, %f4
          fxor                %f10, %f6, %f6

          subcc               $len, 2, $len
          stda                %f0, [$out]0xe2               ! ASI_BLK_INIT, T4-specific
          add                 $out, 8, $out
          stda                %f2, [$out]0xe2               ! ASI_BLK_INIT, T4-specific
          add                 $out, 8, $out
          stda                %f4, [$out]0xe2               ! ASI_BLK_INIT, T4-specific
          add                 $out, 8, $out
          stda                %f6, [$out]0xe2               ! ASI_BLK_INIT, T4-specific
          bgu,pt              $::size_t_cc, .L${bits}_xts_${dir}blk2x
          add                 $out, 8, $out

          add                 $blk_init, $len, $len
          andcc               $len, 1, %g0                  ! is number of blocks even?
          membar              #StoreLoad|#StoreStore
          bnz,pt              %icc, .L${bits}_xts_${dir}loop
          srl                 $len, 0, $len
          brnz,pn             $len, .L${bits}_xts_${dir}loop2x
          nop

          fsrc2               %f4, %f0
          fsrc2               %f6, %f2
          brnz,pn             $rem, .L${bits}_xts_${dir}steal
          nop

          ret
          restore
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
___
# Encrypt-side ciphertext stealing: save the last output block on the
# stack, swap its leading bytes with the trailing input bytes, then run
# the one-block loop once more over the stack copy.
$::code.=<<___ if ($dir eq "en");
.align    32
.L${bits}_xts_${dir}steal:
          std                 %f0, [%fp + $::bias-16]       ! copy of output
          std                 %f2, [%fp + $::bias-8]

          srl                 $ileft, 3, $ileft
          add                 %fp, $::bias-16, %l7
          add                 $inp, $ileft, $inp  ! original $inp+$len&-15
          add                 $out, $ooff, $out   ! original $out+$len&-15
          mov                 0, $ileft
          nop                                               ! align

.L${bits}_xts_${dir}stealing:
          ldub                [$inp + $ileft], %o0
          ldub                [%l7  + $ileft], %o1
          dec                 $rem
          stb                 %o0, [%l7  + $ileft]
          stb                 %o1, [$out + $ileft]
          brnz                $rem, .L${bits}_xts_${dir}stealing
          inc                 $ileft

          mov                 %l7, $inp
          sub                 $out, 16, $out
          mov                 0, $ileft
          sub                 $out, $ooff, $out
          ba                  .L${bits}_xts_${dir}loop      ! one more time
          mov                 1, $len                                 ! $rem is 0
___
# Decrypt-side ciphertext stealing: process the held-back block with the
# tweak that follows the current one (computed into %o3:%o2 so that
# %g3:%g2 stays intact), then swap bytes as on the encrypt side and run
# the one-block loop once more.
$::code.=<<___ if ($dir eq "de");
.align    32
.L${bits}_xts_${dir}steal:
          ldx                 [$inp + 0], %o0
          brz,pt              $ileft, 8f
          ldx                 [$inp + 8], %o1

          ldx                 [$inp + 16], %o2
          sllx                %o0, $ileft, %o0
          srlx                %o1, $iright, %g1
          sllx                %o1, $ileft, %o1
          or                  %g1, %o0, %o0
          srlx                %o2, $iright, %o2
          or                  %o2, %o1, %o1
8:
          srax                %g3, 63, %l7                  ! next tweak value
          addcc               %g2, %g2, %o2
          and                 %l7, 0x87, %l7
          addxc               %g3, %g3, %o3
          xor                 %l7, %o2, %o2

          movxtod             %o2, %f12
          movxtod             %o3, %f14
          bshuffle  %f12, %f12, %f12
          bshuffle  %f14, %f14, %f14

          xor                 %g4, %o0, %o0                 ! ^= rk[0]
          xor                 %g5, %o1, %o1
          movxtod             %o0, %f0
          movxtod             %o1, %f2

          fxor                %f12, %f0, %f0                ! ^= tweak[0]
          fxor                %f14, %f2, %f2

          call                _${alg}${bits}_${dir}crypt_1x
          add                 $inp, 16, $inp

          fxor                %f12, %f0, %f0                ! ^= tweak[0]
          fxor                %f14, %f2, %f2

          std                 %f0, [%fp + $::bias-16]
          std                 %f2, [%fp + $::bias-8]

          srl                 $ileft, 3, $ileft
          add                 %fp, $::bias-16, %l7
          add                 $inp, $ileft, $inp  ! original $inp+$len&-15
          add                 $out, $ooff, $out   ! original $out+$len&-15
          mov                 0, $ileft
          add                 $out, 16, $out
          nop                                               ! align

.L${bits}_xts_${dir}stealing:
          ldub                [$inp + $ileft], %o0
          ldub                [%l7  + $ileft], %o1
          dec                 $rem
          stb                 %o0, [%l7  + $ileft]
          stb                 %o1, [$out + $ileft]
          brnz                $rem, .L${bits}_xts_${dir}stealing
          inc                 $ileft

          mov                 %l7, $inp
          sub                 $out, 16, $out
          mov                 0, $ileft
          sub                 $out, $ooff, $out
          ba                  .L${bits}_xts_${dir}loop      ! one more time
          mov                 1, $len                                 ! $rem is 0
___
# Common epilogue and symbol bookkeeping.
$::code.=<<___;
          ret
          restore
.type     ${alg}${bits}_t4_xts_${dir}crypt,#function
.size     ${alg}${bits}_t4_xts_${dir}crypt,.-${alg}${bits}_t4_xts_${dir}crypt
___
}
1398
# The purpose of these subroutines is to encode VIS instructions
# explicitly, so that the module can be compiled without specifying VIS
# extensions on the compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
# The idea is to keep open the option of producing a "universal" binary
# and letting the programmer detect at run-time whether the current CPU
# is VIS-capable.
# Hand-encode a three-operand instruction from the VIS table below.
#
# Arguments: mnemonic, rs1, rs2, rd (operands given as "%fN" text).
# Returns ".word\t0x........ !<original text>" when the instruction can
# be encoded.  Returns the plain "mnemonic\trs1,rs2,rd" text unchanged
# when the mnemonic is not in the table, when an operand is not an %f
# register, or when a register numbered 32 or above is odd.
sub unvis {
    my ($mnemonic, $rs1, $rs2, $rd) = @_;
    my $asm_text = "$mnemonic\t$rs1,$rs2,$rd";

    my %opf_of = (
        "faligndata" => 0x048,
        "bshuffle"   => 0x04c,
        "fnot2"      => 0x066,
        "fxor"       => 0x06c,
        "fsrc2"      => 0x078,
    );
    my $opf = $opf_of{$mnemonic};
    return $asm_text unless $opf;

    my @field;
    for my $operand ($rs1, $rs2, $rd) {
        return $asm_text unless $operand =~ /%f([0-9]{1,2})/;
        my $regno = $1;
        if ($regno >= 32) {
            return $asm_text if $regno & 1;
            # re-encode for upper double register addressing
            $regno = ($regno | $regno >> 5) & 31;
        }
        push @field, $regno;
    }
    my ($rs1_field, $rs2_field, $rd_field) = @field;

    return sprintf ".word\t0x%08x !%s",
        0x81b00000 | $rd_field << 25 | $rs1_field << 14 | $opf << 5 | $rs2_field,
        $asm_text;
}
1433
# Hand-encode a three-operand instruction from the table below whose
# operands are integer registers (%g, %o, %l or %i followed by a digit).
#
# Arguments: mnemonic, rs1, rs2, rd.
# Returns ".word\t0x........ !<original text>" when the instruction can
# be encoded, otherwise the plain "mnemonic\trs1,rs2,rd" text (unknown
# mnemonic, or an operand that is not one of the register forms above).
sub unvis3 {
    my ($mnemonic, $rs1, $rs2, $rd) = @_;
    my $asm_text = "$mnemonic\t$rs1,$rs2,$rd";

    my %opf_of = (
        "addxc"      => 0x011,
        "addxccc"    => 0x013,
        "umulxhi"    => 0x016,
        "alignaddr"  => 0x018,
        "bmask"      => 0x019,
        "alignaddrl" => 0x01a,
    );
    # first register number of each register group
    my %group_base = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );

    my $opf = $opf_of{$mnemonic};
    return $asm_text unless $opf;

    # translate every operand; an operand that does not parse yields nothing
    my @field = map { /%([goli])([0-9])/ ? $group_base{$1} + $2 : () }
                    ($rs1, $rs2, $rd);
    return $asm_text if @field != 3;

    my ($rs1_field, $rs2_field, $rd_field) = @field;
    return sprintf ".word\t0x%08x !%s",
        0x81b00000 | $rd_field << 25 | $rs1_field << 14 | $opf << 5 | $rs2_field,
        $asm_text;
}
1460
# Hand-encode a four-operand AES instruction from the table below.
#
# Arguments: mnemonic, rs1, rs2, rs3, rd.  rs1, rs2 and rd are "%fN"
# registers.  rs3 is either an even-numbered "%fN" register or a plain
# number that is placed into the rs3 field as is.
#
# Returns ".word\t0x........ !<original text>" when the instruction can
# be encoded.  Returns the plain "mnemonic\trs1,rs2,rs3,rd" text when the
# mnemonic is unknown, when rs1/rs2/rd is not an %f register, when a
# register numbered 32 or above is odd, or when rs3 is an %f register
# with an odd number.
sub unaes_round {   # 4-argument instructions
    my ($mnemonic, $rs1, $rs2, $rs3, $rd) = @_;
    my %aesopf = (
        "aes_eround01"   => 0,
        "aes_eround23"   => 1,
        "aes_dround01"   => 2,
        "aes_dround23"   => 3,
        "aes_eround01_l" => 4,
        "aes_eround23_l" => 5,
        "aes_dround01_l" => 6,
        "aes_dround23_l" => 7,
        "aes_kexpand1"   => 8,
    );

    my $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";

    # opf 0 is a valid value, hence defined() rather than a truth test
    my $opf = $aesopf{$mnemonic};
    return $ref unless defined $opf;

    if ($rs3 =~ /%f/) {
        # Take the whole register number and insist that it is even.
        # Matching only a prefix would turn e.g. %f21 into %f2, and an
        # unparsed register string would count as 0 in the opcode.
        return $ref
            unless $rs3 =~ /%f([0-9]{1,2})(?![0-9])/ && !($1 & 1);
        $rs3 = ($1 | $1 >> 5) & 31;
    }
    # otherwise rs3 is a number and goes into the field unchanged

    foreach ($rs1, $rs2, $rd) {
        return $ref if (!/%f([0-9]{1,2})/);
        $_ = $1;
        if ($1 >= 32) {
            return $ref if ($1 & 1);
            # re-encode for upper double register addressing
            $_ = ($1 | $1 >> 5) & 31;
        }
    }

    return sprintf ".word\t0x%08x !%s",
        2 << 30 | $rd << 25 | 0x19 << 19 | $rs1 << 14 | $rs3 << 9 | $opf << 5 | $rs2,
        $ref;
}
1495
# Hand-encode a three-operand AES key-expansion instruction
# (aes_kexpand0 / aes_kexpand2).
#
# Arguments: mnemonic, rs1, rs2, rd (operands given as "%fN" text).
# Returns ".word\t0x........ !<original text>" when the instruction can
# be encoded, otherwise the plain "mnemonic\trs1,rs2,rd" text (unknown
# mnemonic, operand that is not an %f register, or odd register number
# of 32 or above).
sub unaes_kexpand { # 3-argument instructions
    my ($mnemonic, $rs1, $rs2, $rd) = @_;
    my $asm_text = "$mnemonic\t$rs1,$rs2,$rd";

    my %opf_of = (
        "aes_kexpand0" => 0x130,
        "aes_kexpand2" => 0x131,
    );
    return $asm_text unless exists $opf_of{$mnemonic};
    my $opf = $opf_of{$mnemonic};

    my @pending = ($rs1, $rs2, $rd);
    my @field;
    while (@pending) {
        my $operand = shift @pending;
        return $asm_text unless $operand =~ /%f([0-9]{1,2})/;
        my $regno = $1;
        if ($regno >= 32) {
            return $asm_text if $regno & 1;
            # re-encode for upper double register addressing
            $regno = ($regno | $regno >> 5) & 31;
        }
        push @field, $regno;
    }
    my ($rs1_field, $rs2_field, $rd_field) = @field;

    return sprintf ".word\t0x%08x !%s",
        2 << 30 | $rd_field << 25 | 0x36 << 19 | $rs1_field << 14 | $opf << 5 | $rs2_field,
        $asm_text;
}
1522
sub uncamellia_f {  # 4-argument instructions
# Encode a 4-operand instruction (the dispatcher uses this for
# "camellia_f") as a raw ".word" directive.  The mnemonic itself is not
# inspected; it only appears in the trailing comment.  If rs1, rs2 or rd
# is not an encodable %f register the original text is returned.
my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
my $text = "$mnemonic\t$rs1,$rs2,$rs3,$rd";

    # Third source operand: converted only when it looks like an
    # even-numbered %f register, otherwise left exactly as passed in.
    if ($rs3 =~ /%f([0-6]*[02468])/) {
        $rs3 = ($1|$1>>5)&31;
    }

    # $reg aliases the operand variable, so the assignment at the end of
    # the loop body overwrites the operand with its register number.
    for my $reg ($rs1,$rs2,$rd) {
        return $text unless $reg =~ /%f([0-9]{1,2})/;
        my $num = $1;
        if ($num >= 32) {
            return $text if ($num & 1);
            # re-encode for upper double register addressing
            $num = ($num | $num>>5) & 31;
        }
        $reg = $num;
    }

    my $word = 2<<30 | $rd<<25 | 0x19<<19 | $rs1<<14 | $rs3<<9 | 0xc<<5 | $rs2;

    return sprintf ".word\t0x%08x !%s", $word, $text;
}
1548
sub uncamellia3 {   # 3-argument instructions
# Encode "camellia_fl"/"camellia_fli" with three %f operands as a raw
# ".word" directive; hand back the original text for any other mnemonic
# or for an operand that cannot be encoded.
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my $text = "$mnemonic\t$rs1,$rs2,$rd";
my %cmllopf = (     "camellia_fl"       => 0x13c,
                    "camellia_fli"      => 0x13d  );

    my $opf = $cmllopf{$mnemonic};
    if (!defined($opf)) {
        return $text;
    }

    # Each operand variable is rewritten in place (via the alias $operand)
    # from "%fN" to the number that goes into the instruction word.
    for my $operand ($rs1,$rs2,$rd) {
        if ($operand !~ /%f([0-9]{1,2})/) {
            return $text;
        }
        my $regno = $1;
        if ($regno < 32) {
            $operand = $regno;
            next;
        }
        # odd registers in the upper bank are left for the assembler
        return $text if ($regno & 1);
        # re-encode for upper double register addressing
        $operand = ($regno | $regno>>5) & 31;
    }

    return sprintf(".word\t0x%08x !%s",
                   2<<30 | $rd<<25 | 0x36<<19 | $rs1<<14 | $opf<<5 | $rs2,
                   $text);
}
1575
sub unmovxtox {               # 2-argument instructions
# Encode a register-to-register move listed in %movxopf as a raw ".word"
# directive.  Operands may be %f, %g, %o, %l or %i registers; a register
# is numbered as its bank offset from %bias plus its index.  Unknown
# mnemonics and operands that cannot be encoded yield the original text.
my ($mnemonic,$rs,$rd)=@_;
my $text = "$mnemonic\t$rs,$rd";
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24, "f" => 0 );
my %movxopf = (     "movdtox" => 0x110,
                    "movstouw"          => 0x111,
                    "movstosw"          => 0x113,
                    "movxtod" => 0x118,
                    "movwtos" => 0x119  );

    my $opf = $movxopf{$mnemonic};
    return $text unless defined($opf);

    # $reg is an alias, so assigning to it replaces $rs / $rd themselves.
    for my $reg ($rs,$rd) {
        return $text unless $reg =~ /%([fgoli])([0-9]{1,2})/;
        my ($bank,$index) = ($1,$2);

        if ($index < 32) {
            $reg = $bias{$bank}+$index;
            next;
        }
        return $text if ($index & 1);
        # re-encode for upper double register addressing
        $reg = ($index | $index>>5) & 31;
    }

    my $word = 2<<30 | $rd<<25 | 0x36<<19 | $opf<<5 | $rs;

    return sprintf ".word\t0x%08x !%s", $word, $text;
}
1606
sub undes {
# Translate a DES instruction into a raw ".word" directive.
#
# Arguments: mnemonic followed by its operands.  "des_round" takes four
#            %f registers, "des_kexpand" takes three operands (each one
#            either "%fN" or a bare number), every other mnemonic in
#            %desopf takes two %f registers.  Surplus or undefined
#            trailing operands (the dispatcher passes undef for optional
#            captures that did not match) are ignored.
# Returns:   ".word\t0x<encoding> !<original text>" on success, or the
#            original instruction text when the mnemonic is unknown or
#            an operand cannot be encoded.
my ($mnemonic)=shift;
my @args=@_;
my ($ref,$opf);
my %desopf = (      "des_round"         => 0b1001,
                    "des_ip"  => 0b100110100,
                    "des_iip" => 0b100110101,
                    "des_kexpand"       => 0b100110110      );

    # Skip undefined operands so that the text carries no trailing
    # commas; it doubles as the fallback handed to the assembler.
    $ref = "$mnemonic\t".join(",",grep { defined } @_);

    return $ref if (!defined($opf=$desopf{$mnemonic}));

    # In each loop below $_ aliases an element of @args, so assigning to
    # it replaces the operand string with its numeric encoding.
    if ($mnemonic eq "des_round") {             # 4-arg
        foreach (@args[0..3]) {
            return $ref if (!/%f([0-9]{1,2})/);
            $_=$1;
            if ($1>=32) {
                return $ref if ($1&1);
                # re-encode for upper double register addressing
                $_=($1|$1>>5)&31;
            }
        }
        return  sprintf ".word\t0x%08x !%s",
                        2<<30|0b011001<<19|$opf<<5|$args[0]<<14|$args[1]|$args[2]<<9|$args[3]<<25,
                        $ref;
    } elsif ($mnemonic eq "des_kexpand") {      # 3-arg
        foreach (@args[0..2]) {
            # the "%f" prefix is optional: an operand may be a number
            return $ref if (!/(%f)?([0-9]{1,2})/);
            $_=$2;
            if ($2>=32) {
                return $ref if ($2&1);
                # re-encode for upper double register addressing
                $_=($2|$2>>5)&31;
            }
        }
        return  sprintf ".word\t0x%08x !%s",
                        2<<30|0b110110<<19|$opf<<5|$args[0]<<14|$args[1]|$args[2]<<25,
                        $ref;
    } else {                                    # 2-arg
        foreach (@args[0..1]) {
            return $ref if (!/%f([0-9]{1,2})/);
            $_=$1;
            if ($1>=32) {
                # The register number is in $1; this pattern has no
                # second capture group.
                return $ref if ($1&1);
                # re-encode for upper double register addressing
                $_=($1|$1>>5)&31;
            }
        }
        return  sprintf ".word\t0x%08x !%s",
                        2<<30|0b110110<<19|$opf<<5|$args[0]<<14|$args[1]<<25,
                        $ref;
    }
}
1663
sub emit_assembler {
    # Post-process the accumulated $::code line by line and print the
    # result to the currently selected output handle:
    #   1. every `...` span is evaluated as Perl and replaced by its value;
    #   2. a two-operand instruction matching f[a-z]+2[sd]* at the end of
    #      the line gets %f0 inserted as its first operand;
    #   3. the first translation pattern that matches rewrites the
    #      instruction through its helper sub; later patterns are then
    #      not tried.  The order is significant: the specific patterns
    #      (e.g. camellia_f) come before the broader ones.
    foreach (split("\n",$::code)) {
        s/\`([^\`]*)\`/eval $1/ge;

        s/\b(f[a-z]+2[sd]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})\s*$/$1\t%f0,$2,$3/g;

        TRANSLATE: {
            s{\b(aes_[edk][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})}
             {unaes_round($1,$2,$3,$4,$5)}ge
                and last TRANSLATE;

            s{\b(aes_kexpand[02])\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})}
             {unaes_kexpand($1,$2,$3,$4)}ge
                and last TRANSLATE;

            s{\b(camellia_f)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})}
             {uncamellia_f($1,$2,$3,$4,$5)}ge
                and last TRANSLATE;

            s{\b(camellia_[^s]+)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})}
             {uncamellia3($1,$2,$3,$4)}ge
                and last TRANSLATE;

            # the third and fourth operands are optional, so $4 and $5
            # may be undefined when handed to undes
            s{\b(des_\w+)\s+(%f[0-9]{1,2}),\s*([%fx0-9]+)(?:,\s*(%f[0-9]{1,2})(?:,\s*(%f[0-9]{1,2}))?)?}
             {undes($1,$2,$3,$4,$5)}ge
                and last TRANSLATE;

            s{\b(mov[ds]to\w+)\s+(%f[0-9]{1,2}),\s*(%[goli][0-7])}
             {unmovxtox($1,$2,$3)}ge
                and last TRANSLATE;

            s{\b(mov[xw]to[ds])\s+(%[goli][0-7]),\s*(%f[0-9]{1,2})}
             {unmovxtox($1,$2,$3)}ge
                and last TRANSLATE;

            s{\b([fb][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})}
             {unvis($1,$2,$3,$4)}ge
                and last TRANSLATE;

            s{\b(umulxhi|bmask|addxc[c]{0,2}|alignaddr[l]*)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])}
             {unvis3($1,$2,$3,$4)}ge;
        }

        print $_,"\n";
    }
}
1701
17021;
1703