/* $NetBSD: bcopy.S,v 1.4 2024/02/07 04:20:25 msaitoh Exp $ */

/*
 * Copyright (c) 2018 Ryo Shimizu
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <machine/asm.h>

/* Embed the RCS id string only when the build defines LIBC_SCCS. */
#if defined(LIBC_SCCS)
RCSID("$NetBSD: bcopy.S,v 1.4 2024/02/07 04:20:25 msaitoh Exp $")
#endif

/*
 * Select the entry point this file assembles into and the argument
 * registers that carry source, destination and length:
 *
 *   MEMCOPY defined  -> memcpy   dst=x0, src=x1, len=x2, no overlap support
 *   MEMMOVE defined  -> memmove  dst=x0, src=x1, len=x2, overlap supported
 *   neither          -> bcopy    src=x0, dst=x1, len=x2, overlap supported
 *
 * When NO_OVERLAP is defined the backward (high-to-low) copy routines in
 * this file are compiled out, so it must only be defined for an interface
 * whose contract forbids overlapping buffers.  bcopy(3) explicitly allows
 * overlap, so it must NOT define NO_OVERLAP.
 */
#if defined(MEMCOPY)

/*
 * void *memcpy(void * restrict dst, const void * restrict src, size_t len);
 */
#define FUNCTION              memcpy
#define NO_OVERLAP
#define SRC0                            x1
#define DST0                            x0
#define LEN                             x2

#elif defined(MEMMOVE)

/*
 * void *memmove(void *dst, const void *src, size_t len);
 */
#define FUNCTION              memmove
#undef NO_OVERLAP
#define SRC0                            x1
#define DST0                            x0
#define LEN                             x2

#else /* !MEMCOPY && !MEMMOVE */

/*
 * void bcopy(const void *src, void *dst, size_t len);
 */
#define FUNCTION              bcopy
#undef NO_OVERLAP
#define SRC0                            x0
#define DST0                            x1
#define LEN                             x2

#endif /* MEMCOPY/MEMMOVE/BCOPY */

/*
 * Scratch registers.  All of x3-x14 are caller-saved under AAPCS64, so the
 * copy routines may clobber them without saving.
 *
 *   TMP_X / TMP_Xw    general temporary (64-bit / 32-bit view of x3)
 *   TMP_D, TMP_S      temporary destination / source pointers
 *   DST, SRC          working destination / source pointers
 *   DATA0..DATA2      data words in flight (DATA0w/DATA1w are 32-bit views)
 *   *_ALIGNBIT        misalignment of a pointer expressed in bits
 */
#define TMP_X                           x3
#define TMP_Xw                          w3
#define TMP_D                           x4
#define TMP_S                           x5
#define DST                             x6
#define SRC                             x7
#define DATA0                           x8
#define DATA0w                          w8
#define DATA1                           x9
#define DATA1w                          w9
#define DATA2                           x10
#define SRC_ALIGNBIT                    x11       /* (SRC & 7) * 8 */
#define DST_ALIGNBIT                    x12       /* (DST & 7) * 8 */
#define SRC_DST_ALIGNBIT      x13       /* = SRC_ALIGNBIT - DST_ALIGNBIT */
#define DST_SRC_ALIGNBIT      x14       /* = -SRC_DST_ALIGNBIT */

#define STP_ALIGN             16        /* align before stp/ldp. 8 or 16 */
#define SMALLSIZE             32        /* below this, use the small-copy paths */

          .text
          .align    5

#ifndef NO_OVERLAP
#ifndef STRICT_ALIGNMENT
/*
 * backward_ignore_align: high-to-low copy for builds where unaligned
 * accesses are allowed (STRICT_ALIGNMENT not defined).
 *
 * In:   SRC0 = source start, DST0 = destination start, LEN = byte count.
 * SRC0 and DST are first advanced to one past the end, so every access
 * below uses pre-decrement addressing.  LEN >= SMALLSIZE continues at
 * copy_backward; smaller lengths are finished here by testing the bits
 * of LEN from the most significant relevant bit down to bit 0.
 * Clobbers: SRC0, DST, TMP_X, DATA0, DATA1, flags.
 */
backward_ignore_align:
          prfm      PLDL1KEEP, [SRC0]   /* issued before SRC0 is advanced */
          add       SRC0, SRC0, LEN
          add       DST, DST0, LEN
          cmp       LEN, #SMALLSIZE
          bcs       copy_backward
copy_backward_small:
          cmp       LEN, #8
          bcs       9f

          /* 0 <= len < 8 */
          /* if (len & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
          tbz       LEN, #2, 1f
          ldr       TMP_Xw, [SRC0, #-4]!
          str       TMP_Xw, [DST, #-4]!
1:
          /* if (len & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
          tbz       LEN, #1, 1f
          ldrh      TMP_Xw, [SRC0, #-2]!
          strh      TMP_Xw, [DST, #-2]!
1:
          /* if (len & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
          tbz       LEN, #0, 1f
          ldrb      TMP_Xw, [SRC0, #-1]!
          strb      TMP_Xw, [DST, #-1]!
1:
          ret
9:

          cmp       LEN, #16
          bcs       9f

          /* 8 <= len < 16 */
          /* *--(uint64_t *)dst = *--(uint64_t *)src; */
          ldr       TMP_X, [SRC0, #-8]!
          str       TMP_X, [DST, #-8]!
          /* if (len & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
          tbz       LEN, #2, 1f
          ldr       TMP_Xw, [SRC0, #-4]!
          str       TMP_Xw, [DST, #-4]!
1:
          /* if (len & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
          tbz       LEN, #1, 1f
          ldrh      TMP_Xw, [SRC0, #-2]!
          strh      TMP_Xw, [DST, #-2]!
1:
          /* if (len & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
          tbz       LEN, #0, 1f
          ldrb      TMP_Xw, [SRC0, #-1]!
          strb      TMP_Xw, [DST, #-1]!
1:
          ret
9:

          /* 16 <= len < 32 */
          ldp       DATA0, DATA1, [SRC0, #-16]!
          stp       DATA0, DATA1, [DST, #-16]!
          /* if (len & 8) { *--(uint64_t *)dst = *--(uint64_t *)src; } */
          tbz       LEN, #3, 1f
          ldr       TMP_X, [SRC0, #-8]!
          str       TMP_X, [DST, #-8]!
1:
          /* if (len & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
          tbz       LEN, #2, 1f
          ldr       TMP_Xw, [SRC0, #-4]!
          str       TMP_Xw, [DST, #-4]!
1:
          /* if (len & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
          tbz       LEN, #1, 1f
          ldrh      TMP_Xw, [SRC0, #-2]!
          strh      TMP_Xw, [DST, #-2]!
1:
          /* if (len & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
          tbz       LEN, #0, 1f
          ldrb      TMP_Xw, [SRC0, #-1]!
          strb      TMP_Xw, [DST, #-1]!
1:
          ret
#endif /* !STRICT_ALIGNMENT */

/*
 * copy_backward: bulk high-to-low copy.
 *
 * In:   SRC0 / DST = one past the last source / destination byte,
 *       LEN = bytes remaining.
 * Step 1: copy 1, 2, 4 (and 8 when STP_ALIGN > 8) bytes as needed until DST
 *         is STP_ALIGN-aligned.  Without STRICT_ALIGNMENT this step is
 *         skipped for LEN < 512.  With STRICT_ALIGNMENT this label is
 *         reached from strict_backward only when SRC0 and DST share the same
 *         alignment within 8 bytes, so the same steps align SRC0 too.
 * Step 2: copy 1024-byte chunks while LEN >= 1024.
 * Step 3: copy 512, 256, ... 1 bytes according to bits 9..0 of LEN.
 * Clobbers: SRC0, DST, LEN, TMP_X, DATA0, DATA1, flags.
 */
          .align    4
copy_backward:
          /* DST is not aligned at this point */
#ifndef STRICT_ALIGNMENT
          cmp       LEN, #512 /* pre-alignment can be overhead when small */
          bcc       9f
#endif
          /* if (DST & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
          tbz       DST, #0, 1f
          ldrb      TMP_Xw, [SRC0, #-1]!
          strb      TMP_Xw, [DST, #-1]!
          sub       LEN, LEN, #1
1:
          /* if (DST & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
          tbz       DST, #1, 1f
          ldrh      TMP_Xw, [SRC0, #-2]!
          strh      TMP_Xw, [DST, #-2]!
          sub       LEN, LEN, #2
1:
          /* if (DST & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
          tbz       DST, #2, 1f
          ldr       TMP_Xw, [SRC0, #-4]!
          str       TMP_Xw, [DST, #-4]!
          sub       LEN, LEN, #4
1:
#if (STP_ALIGN > 8)
          /* if (DST & 8) { *--(uint64_t *)dst = *--(uint64_t *)src; } */
          tbz       DST, #3, 1f
          ldr       TMP_X, [SRC0, #-8]!
          str       TMP_X, [DST, #-8]!
          sub       LEN, LEN, #8
1:
#endif /* (STP_ALIGN > 8) */
9:

backward_copy1k:
          /* while (len >= 1024) */
          /* { src -= 1024; dst -= 1024; copy1024(dst, src); len -= 1024; } */
          cmp       LEN, #1024
          blo       9f
1:
          sub       LEN, LEN, #1024
          .rept     (1024 / 16)
          ldp       DATA0, DATA1, [SRC0, #-16]!   /* *--dst = *--src; */
          stp       DATA0, DATA1, [DST, #-16]!
          .endr
          cmp       LEN, #1024
          bhs       1b
9:

          /* if (len & 512) { src -= 512; dst -= 512; copy512(dst, src); } */
          tbz       LEN, #9, 1f
          .rept     (512 / 16)
          ldp       DATA0, DATA1, [SRC0, #-16]!
          stp       DATA0, DATA1, [DST, #-16]!
          .endr
1:
          /* if (len & 256) { src -= 256; dst -= 256; copy256(dst, src); } */
          tbz       LEN, #8, 1f
          .rept     (256 / 16)
          ldp       DATA0, DATA1, [SRC0, #-16]!
          stp       DATA0, DATA1, [DST, #-16]!
          .endr
1:
          /* if (len & 128) { src -= 128; dst -= 128; copy128(dst, src); } */
          tbz       LEN, #7, 1f
          .rept     (128 / 16)
          ldp       DATA0, DATA1, [SRC0, #-16]!
          stp       DATA0, DATA1, [DST, #-16]!
          .endr
1:
          /* if (len & 64) { src -= 64; dst -= 64; copy64(dst, src); } */
          tbz       LEN, #6, 1f
          .rept     (64 / 16)
          ldp       DATA0, DATA1, [SRC0, #-16]!
          stp       DATA0, DATA1, [DST, #-16]!
          .endr
1:
          /* if (len & 32) { src -= 32; dst -= 32; copy32(dst, src); } */
          tbz       LEN, #5, 1f
          .rept     (32 / 16)
          ldp       DATA0, DATA1, [SRC0, #-16]!
          stp       DATA0, DATA1, [DST, #-16]!
          .endr
1:
          /* if (len & 16) { *--(uint128_t *)dst = *--(uint128_t *)src; } */
          tbz       LEN, #4, 1f
          ldp       DATA0, DATA1, [SRC0, #-16]!
          stp       DATA0, DATA1, [DST, #-16]!
1:
          /* if (len & 8) { *--(uint64_t *)dst = *--(uint64_t *)src; } */
          tbz       LEN, #3, 1f
          ldr       TMP_X, [SRC0, #-8]!
          str       TMP_X, [DST, #-8]!
1:
          /* if (len & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
          tbz       LEN, #2, 1f
          ldr       TMP_Xw, [SRC0, #-4]!
          str       TMP_Xw, [DST, #-4]!
1:
          /* if (len & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
          tbz       LEN, #1, 1f
          ldrh      TMP_Xw, [SRC0, #-2]!
          strh      TMP_Xw, [DST, #-2]!
1:
          /* if (len & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
          tbz       LEN, #0, 1f
          ldrb      TMP_Xw, [SRC0, #-1]!
          strb      TMP_Xw, [DST, #-1]!
1:
          ret
#endif /* !NO_OVERLAP */


#if defined(STRICT_ALIGNMENT) && !defined(NO_OVERLAP)
/*
 * backward_copy: high-to-low copy entry for STRICT_ALIGNMENT builds.
 *
 * In:   SRC0 = source start, DST0 = destination start, LEN = byte count.
 *       The entry point branches to done for LEN == 0 before coming here,
 *       so LEN >= 1.
 * SRC0 and DST are advanced to one past the end, then:
 *   LEN >= SMALLSIZE                -> strict_backward
 *   LEN <  10                       -> byte loop (backward_tiny)
 *   otherwise, same (addr & 7)      -> samealign_backward_small
 *   otherwise                       -> notaligned_backward_small
 * Clobbers: SRC0, DST, LEN, TMP_X, flags.
 */
          .align    5
backward_copy:
          prfm      PLDL1KEEP, [SRC0]   /* issued before SRC0 is advanced */
          add       DST, DST0, LEN
          add       SRC0, SRC0, LEN
          cmp       LEN, #SMALLSIZE
          bcs       strict_backward

          cmp       LEN, #10
          bcs       9f
backward_tiny:
          /* copy 1-9 bytes: do { *--dst = *--src; } while (--len != 0); */
1:        sub       LEN, LEN, #1
          ldrb      TMP_Xw, [SRC0, #-1]!
          strb      TMP_Xw, [DST, #-1]!
          cbnz      LEN, 1b             /* loop while bytes remain */
          ret
9:
          /* length is small(<32), and src or dst may be unaligned */
          eor       TMP_X, SRC0, DST
          ands      TMP_X, TMP_X, #7
          bne       notaligned_backward_small

/*
 * samealign_backward_small: backward copy of 10..31 bytes when SRC0 and DST
 * (both one past the end) have the same low three address bits.
 *
 * First moves 1/2/4 bytes, driven by the low bits of DST, until DST (and
 * therefore SRC0) is 8-byte aligned, decrementing LEN as it goes; then
 * copies 16/8/4/2/1 bytes according to bits 4..0 of the remaining LEN.
 * Clobbers: SRC0, DST, LEN, TMP_X, DATA0, DATA1.
 */
samealign_backward_small:
          /* if (dst & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
          tbz       DST, #0, 1f
          ldrb      TMP_Xw, [SRC0, #-1]!
          strb      TMP_Xw, [DST, #-1]!
          sub       LEN, LEN, #1
1:
          /* if (dst & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
          tbz       DST, #1, 1f
          ldrh      TMP_Xw, [SRC0, #-2]!
          strh      TMP_Xw, [DST, #-2]!
          sub       LEN, LEN, #2
1:
          /* if (dst & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
          tbz       DST, #2, 1f
          ldr       TMP_Xw, [SRC0, #-4]!
          str       TMP_Xw, [DST, #-4]!
          sub       LEN, LEN, #4
1:
          /* if (len & 16) { *--(uint128_t *)dst = *--(uint128_t *)src; } */
          tbz       LEN, #4, 1f
          ldp       DATA0, DATA1, [SRC0, #-16]!
          stp       DATA0, DATA1, [DST, #-16]!
1:
          /* if (len & 8) { *--(uint64_t *)dst = *--(uint64_t *)src; } */
          tbz       LEN, #3, 1f
          ldr       TMP_X, [SRC0, #-8]!
          str       TMP_X, [DST, #-8]!
1:
          /* if (len & 4) { *--(uint32_t *)dst = *--(uint32_t *)src; } */
          tbz       LEN, #2, 1f
          ldr       TMP_Xw, [SRC0, #-4]!
          str       TMP_Xw, [DST, #-4]!
1:
          /* if (len & 2) { *--(uint16_t *)dst = *--(uint16_t *)src; } */
          tbz       LEN, #1, 1f
          ldrh      TMP_Xw, [SRC0, #-2]!
          strh      TMP_Xw, [DST, #-2]!
1:
          /* if (len & 1) { *--(uint8_t *)dst = *--(uint8_t *)src; } */
          tbz       LEN, #0, 1f
          ldrb      TMP_Xw, [SRC0, #-1]!
          strb      TMP_Xw, [DST, #-1]!
1:
          ret

/*
 * notaligned_backward_small: byte-at-a-time backward copy for short lengths
 * whose source and destination differ in alignment.
 *
 * In:   SRC0 / DST = one past the last source / destination byte, LEN > 0.
 * TMP_S is set to the source start and the loop runs until SRC0 has been
 * walked back down to it.
 * Clobbers: SRC0, DST, TMP_S, TMP_X, flags.
 */
notaligned_backward_small:
          /* length is small, and src or dst may be unaligned */
          sub       TMP_S, SRC0, LEN    /* tmp_s = src - len */
1:                                                /* do { */
          ldrb      TMP_Xw, [SRC0, #-1]!
          strb      TMP_Xw, [DST, #-1]! /*  *--(char *)dst = *--(char *)src */
          cmp       TMP_S, SRC0                   /* } while (tmp_s < src) */
          blo       1b
          ret

/*
 * strict_backward: set-up for the backward copy of LEN >= SMALLSIZE bytes
 * using only naturally aligned accesses.
 *
 * In:   SRC0 / DST = one past the last source / destination byte.
 * Computes the misalignment of both pointers in bits.  If they are equal
 * the plain copy_backward routine is used.  Otherwise SRC and DST are
 * rounded down to 8-byte boundaries and the partial destination word at
 * DST (its first DST_ALIGNBIT/8 bytes) is assembled from one or two aligned
 * source words and stored with 4/2/1-byte stores; LEN is reduced by that
 * many bytes.
 *
 * Out (fall through to backward_shifting_copy_loop):
 *       SRC, DST 8-byte aligned; DATA0 = most recently loaded source word;
 *       SRC_DST_ALIGNBIT / DST_SRC_ALIGNBIT = shift amounts.
 * Register-specified shifts use the amount modulo 64, so a negative
 * alignbit value acts as (64 - |value|).
 */
strict_backward:
          /* src or dst may be unaligned */
          and       SRC_ALIGNBIT, SRC0, #7
          and       DST_ALIGNBIT, DST, #7
          lsl       SRC_ALIGNBIT, SRC_ALIGNBIT, #3
          lsl       DST_ALIGNBIT, DST_ALIGNBIT, #3
          sub       SRC_DST_ALIGNBIT, SRC_ALIGNBIT, DST_ALIGNBIT
          cbz       SRC_DST_ALIGNBIT, copy_backward         /* same alignment? */

          and       SRC, SRC0, #~7
          and       DST, DST, #~7
          neg       DST_SRC_ALIGNBIT, SRC_DST_ALIGNBIT

#if BYTE_ORDER == LITTLE_ENDIAN
          tbz       SRC_DST_ALIGNBIT, #63, 5f     /* if(SRC_DST_ALIGNBIT < 0) { */

          /*
           * When SRC == SRC0 the word at [SRC] lies wholly past the end of
           * the source and is not read.
           */
          cmp       SRC, SRC0                     /* don't access out of range */
          beq       1f
          ldr       DATA1, [SRC]
1:
          ldr       DATA0, [SRC, #-8]!

          lsl       DATA1, DATA1, DST_SRC_ALIGNBIT          /* data1 =                    */
          lsr       TMP_X, DATA0, SRC_DST_ALIGNBIT          /* (data1<<dst_src_alignbit)| */
          orr       DATA1, DATA1, TMP_X           /* (data0>>src_dst_alignbit); */

          b         9f                                      /* }                          */
5:                                                          /* else {                     */
          ldr       DATA0, [SRC]                            /*  data0 = *src;             */
          lsr       DATA1, DATA0, SRC_DST_ALIGNBIT          /*  data1=data0>>src_dst_abit;*/
9:                                                          /* }                          */

          cbz       DST_ALIGNBIT, 9f    /* if (dst_alignbit != 0) {           */
          mov       TMP_D, DST                    /*   tmp_d = dst;                     */

          tbz       DST_ALIGNBIT, #(2+3), 1f /*   if (dst_alignbit & (4<<3)) {    */
          str       DATA1w, [TMP_D], #4 /*      *(uint32_t *)tmp_d++ = data1; */
          lsr       DATA1, DATA1, #32   /*      data1 >>= 32;                 */
1:                                                /*    }                               */
          tbz       DST_ALIGNBIT, #(1+3), 1f /*   if (dst_alignbit & (2<<3)) {    */
          strh      DATA1w, [TMP_D], #2 /*      *(uint16_t *)tmp_d++ = data1; */
          lsr       DATA1, DATA1, #16   /*      data1 >>= 16;                 */
1:                                                /*    }                               */
          tbz       DST_ALIGNBIT, #(0+3), 1f /*   if (dst_alignbit & (1<<3)) {    */
          strb      DATA1w, [TMP_D]               /*      *(uint8_t *)tmp_d = data1;    */
1:                                                /*    }                               */

          sub       LEN, LEN, DST_ALIGNBIT, lsr #3          /* len -=(dst_alignbit>>3);   */
9:                                                /* }                                  */
#else /* BYTE_ORDER */
          tbz       SRC_DST_ALIGNBIT, #63, 5f     /* if(SRC_DST_ALIGNBIT < 0) { */

          cmp       SRC, SRC0                     /* don't access out of range */
          beq       1f
          ldr       DATA1, [SRC]
1:
          ldr       DATA0, [SRC, #-8]!

          lsr       DATA1, DATA1, DST_SRC_ALIGNBIT          /* data1 =                    */
          lsl       TMP_X, DATA0, SRC_DST_ALIGNBIT          /* (data1>>dst_src_alignbit)| */
          orr       DATA1, DATA1, TMP_X           /* (data0<<src_dst_alignbit); */

          b         9f                                      /* }                          */
5:                                                          /* else {                     */
          ldr       DATA0, [SRC]                            /*  data0 = *src;             */
          /*
           * NOTE(review): the instruction below is a right shift while its
           * comment describes a left shift, and the little-endian branch
           * shifts by SRC_DST_ALIGNBIT.  The stores that follow also do not
           * consume DATA1 progressively as the little-endian path does.
           * Verify the big-endian path on real hardware before relying on it.
           */
          lsr       DATA1, DATA0, DST_SRC_ALIGNBIT          /*  data1=data0<<dst_src_abit;*/
9:                                                          /* }                          */

          cbz       DST_ALIGNBIT, 9f    /* if (dst_alignbit != 0) {           */
          mov       TMP_D, DST                    /*   tmp_d = dst;                     */

          tbz       DST_ALIGNBIT, #(2+3), 1f /*   if (dst_alignbit & (4<<3)) {    */
          lsr       TMP_X, DATA1, #32   /*      x = data1 >> 32;              */
          str       TMP_Xw, [TMP_D], #4 /*      *(uint32_t *)tmp_d++ = x;     */
1:                                                /*    }                               */
          tbz       DST_ALIGNBIT, #(1+3), 1f /*   if (dst_alignbit & (2<<3)) {    */
          lsr       TMP_X, DATA1, #16   /*      x = data1 >> 16;              */
          strh      TMP_Xw, [TMP_D], #2 /*      *(uint16_t *)tmp_d++ = x;     */
1:                                                /*    }                               */
          tbz       DST_ALIGNBIT, #(0+3), 1f /*   if (dst_alignbit & (1<<3)) {    */
          lsr       TMP_X, DATA1, #8    /*      x = data1 >> 8;               */
          strb      TMP_Xw, [TMP_D], #1 /*      *(uint8_t *)tmp_d++ = x;      */
1:                                                /*    }                               */

          sub       LEN, LEN, DST_ALIGNBIT, lsr #3          /* len -=(dst_alignbit>>3);   */
9:                                                /* }                                  */
#endif /* BYTE_ORDER */


/*
 * backward_shifting_copy_loop: main loop of the differently-aligned
 * backward copy.  Each pass loads two aligned source words below SRC,
 * merges them with the carried word DATA0 by shifting, and stores two
 * aligned destination words below DST.
 *
 * In/out: SRC, DST 8-byte aligned and moving downwards by 16 per pass;
 *         DATA0 = source word carried over from the previous pass;
 *         LEN = bytes remaining (decremented by 16 per pass).
 * The body runs at least once and repeats while LEN >= 16.
 * Clobbers: DATA1, DATA2, TMP_X, flags.
 */
backward_shifting_copy_loop:
          ldp       DATA2, DATA1, [SRC, #-16]!
#if BYTE_ORDER == LITTLE_ENDIAN
          /* data0 = (data1 >> src_dst_alignbit) | (data0 << dst_src_alignbit); */
          lsl       DATA0, DATA0, DST_SRC_ALIGNBIT
          lsr       TMP_X, DATA1, SRC_DST_ALIGNBIT
          orr       DATA0, DATA0, TMP_X
          /* data1 = (data2 >> src_dst_alignbit) | (data1 << dst_src_alignbit); */
          lsl       DATA1, DATA1, DST_SRC_ALIGNBIT
          lsr       TMP_X, DATA2, SRC_DST_ALIGNBIT
          orr       DATA1, DATA1, TMP_X
#else /* BYTE_ORDER */
          /* data0 = (data1 << src_dst_alignbit) | (data0 >> dst_src_alignbit); */
          lsr       DATA0, DATA0, DST_SRC_ALIGNBIT
          lsl       TMP_X, DATA1, SRC_DST_ALIGNBIT
          orr       DATA0, DATA0, TMP_X
          /* data1 = (data2 << src_dst_alignbit) | (data1 >> dst_src_alignbit); */
          lsr       DATA1, DATA1, DST_SRC_ALIGNBIT
          lsl       TMP_X, DATA2, SRC_DST_ALIGNBIT
          orr       DATA1, DATA1, TMP_X
#endif /* BYTE_ORDER */
          stp       DATA1, DATA0, [DST, #-16]!    /* DATA0 is the higher word */
          mov       DATA0, DATA2                  /* carry the lowest word over */
          sub       LEN, LEN, #16
          cmp       LEN, #16
          bhs       backward_shifting_copy_loop


/*
 * Tail of the differently-aligned backward copy (LEN < 16 on entry).
 * Bit 3 of LEN selects one more full 8-byte destination word; the last
 * 1-7 bytes are then taken from the top of one more merged word and
 * written with 4/2/1-byte stores below DST.
 * In: SRC, DST 8-byte aligned; DATA0 = carried source word.
 */
          /* write 8 bytes */
          tbz       LEN, #3, 9f

          ldr       DATA1, [SRC, #-8]!
#if BYTE_ORDER == LITTLE_ENDIAN
          /* data0 = (data1 >> src_dst_alignbit) | (data0 << dst_src_alignbit); */
          lsl       DATA0, DATA0, DST_SRC_ALIGNBIT
          lsr       TMP_X, DATA1, SRC_DST_ALIGNBIT
          orr       DATA0, DATA0, TMP_X
#else /* BYTE_ORDER */
          /* data0 = (data1 << src_dst_alignbit) | (data0 >> dst_src_alignbit); */
          lsr       DATA0, DATA0, DST_SRC_ALIGNBIT
          lsl       TMP_X, DATA1, SRC_DST_ALIGNBIT
          orr       DATA0, DATA0, TMP_X
#endif /* BYTE_ORDER */
          str       DATA0, [DST, #-8]!
          mov       DATA0, DATA1
          sub       LEN, LEN, #8
9:

          cbz       LEN, backward_shifting_copy_done

          /* copy last 1-7 bytes */
          /*
           * Load one more source word only if the carried word cannot
           * supply all LEN bytes, i.e. if len > ((src_dst_alignbit & 63) >> 3).
           */
          and       TMP_X, SRC_DST_ALIGNBIT, #63
          cmp       LEN, TMP_X, lsr #3
          bls       1f
          ldr       DATA1, [SRC, #-8]!  /* don't access out of range */
1:

#if BYTE_ORDER == LITTLE_ENDIAN
          /* data0 = (data1 >> src_dst_alignbit) | (data0 << dst_src_alignbit); */
          lsl       DATA0, DATA0, DST_SRC_ALIGNBIT
          lsr       TMP_X, DATA1, SRC_DST_ALIGNBIT
          orr       DATA0, DATA0, TMP_X
#else /* BYTE_ORDER */
          /* data0 = (data1 << src_dst_alignbit) | (data0 >> dst_src_alignbit); */
          lsr       DATA0, DATA0, DST_SRC_ALIGNBIT
          lsl       TMP_X, DATA1, SRC_DST_ALIGNBIT
          orr       DATA0, DATA0, TMP_X
#endif /* BYTE_ORDER */

#if BYTE_ORDER == LITTLE_ENDIAN
          /*
           * The wanted bytes sit at the top of data0; each rotate brings the
           * next-highest unwritten chunk into the low bits for the store.
           */
          tbz       LEN, #2, 1f
          ror       DATA0, DATA0, #32
          str       DATA0w, [DST, #-4]!
1:
          tbz       LEN, #1, 1f
          ror       DATA0, DATA0, #48
          strh      DATA0w, [DST, #-2]!
1:
          tbz       LEN, #0, 1f
          ror       DATA0, DATA0, #56
          strb      DATA0w, [DST, #-1]!
1:
#else /* BYTE_ORDER */
          tbz       LEN, #2, 1f
          str       DATA0w, [DST, #-4]!
          lsr       DATA0, DATA0, #32
1:
          tbz       LEN, #1, 1f
          strh      DATA0w, [DST, #-2]!
          lsr       DATA0, DATA0, #16
1:
          tbz       LEN, #0, 1f
          strb      DATA0w, [DST, #-1]!
1:
#endif /* BYTE_ORDER */
backward_shifting_copy_done:
          ret
#endif /* defined(STRICT_ALIGNMENT) && !defined(NO_OVERLAP) */


/*
 * FUNCTION entry point (the name is chosen by the FUNCTION macro).
 *
 * In:   SRC0 = source, DST0 = destination, LEN = byte count.
 * STRICT_ALIGNMENT prologue:
 *   LEN == 0                        -> done
 *   overlap support and src == dst  -> done
 *   overlap support and src <  dst  -> backward_copy
 *   LEN >= SMALLSIZE                -> strict_forward
 *   LEN <  10                       -> byte loop (forward_tiny)
 *   otherwise, same (addr & 7)      -> falls through to the same-alignment
 *                                      small copy that follows
 *   otherwise                       -> notaligned_forward_small
 * DST0 is left untouched; the working destination pointer is DST.
 */
          .align    5
ENTRY(FUNCTION)
#ifdef STRICT_ALIGNMENT
          cbz       LEN, done
#ifndef NO_OVERLAP
          cmp       SRC0, DST0
          beq       done
          bcc       backward_copy       /* src < dst: copy high-to-low */
#endif /* !NO_OVERLAP */
          mov       DST, DST0
          cmp       LEN, #SMALLSIZE
          bcs       strict_forward

          cmp       LEN, #10
          bcs       9f
forward_tiny:
          /* copy 1-9 bytes: do { *dst++ = *src++; } while (--len != 0); */
1:        sub       LEN, LEN, #1
          ldrb      TMP_Xw, [SRC0], #1
          strb      TMP_Xw, [DST], #1
          cbnz      LEN, 1b             /* loop while bytes remain */
          ret
9:
          /* length is small(<32), and src or dst may be unaligned */
          eor       TMP_X, SRC0, DST0
          ands      TMP_X, TMP_X, #7
          bne       notaligned_forward_small
/*
 * samealign_forward_small: forward copy of 10..31 bytes when SRC0 and DST
 * have the same low three address bits.
 *
 * First moves 1/2/4 bytes, driven by the low bits of DST, until DST (and
 * therefore SRC0) is 8-byte aligned, decrementing LEN as it goes; then
 * copies 16/8/4/2/1 bytes according to bits 4..0 of the remaining LEN.
 * Clobbers: SRC0, DST, LEN, TMP_X, DATA0, DATA1.
 */
samealign_forward_small:
          /* if (dst & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
          tbz       DST, #0, 1f
          ldrb      TMP_Xw, [SRC0], #1
          strb      TMP_Xw, [DST], #1
          sub       LEN, LEN, #1
1:
          /* if (dst & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
          tbz       DST, #1, 1f
          ldrh      TMP_Xw, [SRC0], #2
          strh      TMP_Xw, [DST], #2
          sub       LEN, LEN, #2
1:
          /* if (dst & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
          tbz       DST, #2, 1f
          ldr       TMP_Xw, [SRC0], #4
          str       TMP_Xw, [DST], #4
          sub       LEN, LEN, #4
1:
          /* if (len & 16) { *(uint128_t *)dst++ = *(uint128_t *)src++; } */
          tbz       LEN, #4, 1f
          ldp       DATA0, DATA1, [SRC0], #16
          stp       DATA0, DATA1, [DST], #16
1:
          /* if (len & 8) { *(uint64_t *)dst++ = *(uint64_t *)src++; } */
          tbz       LEN, #3, 1f
          ldr       TMP_X, [SRC0], #8
          str       TMP_X, [DST], #8
1:
          /* if (len & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
          tbz       LEN, #2, 1f
          ldr       TMP_Xw, [SRC0], #4
          str       TMP_Xw, [DST], #4
1:
          /* if (len & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
          tbz       LEN, #1, 1f
          ldrh      TMP_Xw, [SRC0], #2
          strh      TMP_Xw, [DST], #2
1:
          /* if (len & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
          tbz       LEN, #0, 1f
          ldrb      TMP_Xw, [SRC0], #1
          strb      TMP_Xw, [DST], #1
1:
          ret

/*
 * notaligned_forward_small: byte-at-a-time forward copy for short lengths
 * whose source and destination differ in alignment.
 *
 * In:   SRC0 = source, DST = destination, LEN > 0.
 * TMP_S is set to the source end and the loop runs until SRC0 reaches it.
 * Clobbers: SRC0, DST, TMP_S, TMP_X, flags.
 */
notaligned_forward_small:
          /* src and dst are not aligned... */
          prfm      PLDL1KEEP, [SRC0]
          prfm      PLDL1KEEP, [SRC0, #8]
          prfm      PLDL1KEEP, [SRC0, #16]
          add       TMP_S, SRC0, LEN    /* tmp_s = src + len */
1:                                                /* do { */
          ldrb      TMP_Xw, [SRC0], #1
          strb      TMP_Xw, [DST], #1   /*  *(char *)dst++ = *(char *)src++ */
          cmp       SRC0, TMP_S                   /* } while (src < tmp_s); */
          blo       1b
          ret

/*
 * strict_forward: set-up for the forward copy of LEN >= SMALLSIZE bytes
 * using only naturally aligned accesses.
 *
 * In:   SRC0 = source, DST0 = destination, LEN = byte count.
 * Computes the misalignment of both pointers in bits.  If they are equal
 * copy_forward is used.  Otherwise SRC and DST are rounded down to 8-byte
 * boundaries, one or two aligned source words are loaded, and the bytes
 * from DST0 up to the next 8-byte boundary are written with 1/2/4-byte
 * stores (or one 8-byte store when DST0 is already aligned).  LEN is
 * reduced by the number of bytes written, 8 - DST_ALIGNBIT/8.
 *
 * Out (fall through to the shifting copy loop):
 *       SRC, DST 8-byte aligned; DATA0 = most recently loaded source word;
 *       SRC_DST_ALIGNBIT / DST_SRC_ALIGNBIT = shift amounts.
 * Register-specified shifts use the amount modulo 64, so a negative
 * alignbit value acts as (64 - |value|).
 */
strict_forward:
          /* src or dst may be unaligned */
          and       SRC_ALIGNBIT, SRC0, #7
          and       DST_ALIGNBIT, DST0, #7
          lsl       SRC_ALIGNBIT, SRC_ALIGNBIT, #3
          lsl       DST_ALIGNBIT, DST_ALIGNBIT, #3
          sub       SRC_DST_ALIGNBIT, SRC_ALIGNBIT, DST_ALIGNBIT
          cbz       SRC_DST_ALIGNBIT, copy_forward          /* same alignment? */

          and       SRC, SRC0, #~7
          and       DST, DST0, #~7
          neg       DST_SRC_ALIGNBIT, SRC_DST_ALIGNBIT

#if BYTE_ORDER == LITTLE_ENDIAN
          tbz       DST_SRC_ALIGNBIT, #63, 5f     /* if(DST_SRC_ALIGNBIT < 0) { */
          ldp       DATA1, DATA0, [SRC], #16
          neg       TMP_X, SRC_ALIGNBIT
          lsr       DATA1, DATA1, SRC_ALIGNBIT    /* data1 =                    */
          lsl       TMP_X, DATA0, TMP_X           /*  (data1 >> src_alignbit) | */
          orr       DATA1, DATA1, TMP_X           /*  (data0 << -src_alignbit); */
          b         9f
5:
          ldr       DATA0, [SRC], #8
          lsr       DATA1, DATA0, SRC_ALIGNBIT
9:

          cbz       DST_ALIGNBIT, 5f
          mov       TMP_D, DST0
          /* the tests below see TMP_D as updated by the preceding store */
          /* if (tmp_d & 1) { *(uint8_t *)tmp_d++ = data1; } */
          tbz       TMP_D, #0, 1f
          strb      DATA1w, [TMP_D], #1
          lsr       DATA1, DATA1, #8
1:
          /* if (tmp_d & 2) { *(uint16_t *)tmp_d++ = data1; } */
          tbz       TMP_D, #1, 1f
          strh      DATA1w, [TMP_D], #2
          lsr       DATA1, DATA1, #16
1:
          /* if (tmp_d & 4) { *(uint32_t *)tmp_d++ = data1; } */
          tbz       TMP_D, #2, 1f
          str       DATA1w, [TMP_D], #4
1:
          add       DST, DST, #8
          b         9f
5:
          str       DATA1, [DST], #8
9:
          /* len -= 8 - (dst_alignbit >> 3) */
          sub       LEN, LEN, #8
          add       LEN, LEN, DST_ALIGNBIT, lsr #3
#else /* BYTE_ORDER */
          tbz       DST_SRC_ALIGNBIT, #63, 5f     /* if(DST_SRC_ALIGNBIT < 0) { */
          ldp       DATA1, DATA0, [SRC], #16
          neg       TMP_X, SRC_ALIGNBIT
          lsl       DATA1, DATA1, SRC_ALIGNBIT    /* data1 =                    */
          lsr       TMP_X, DATA0, TMP_X           /*  (data1 << src_alignbit) | */
          orr       DATA1, DATA1, TMP_X           /*  (data0 >> -src_alignbit); */
          b         9f
5:
          ldr       DATA0, [SRC], #8
          lsl       DATA1, DATA0, SRC_ALIGNBIT
9:

          cbz       DST_ALIGNBIT, 5f
          mov       TMP_D, DST0
          /*
           * NOTE(review): unlike the little-endian path, DATA1 is not
           * shifted between these stores, so each store takes its bytes
           * from the top of the same word.  Verify the big-endian path.
           */
          /* if (tmp_d & 1) { *(uint8_t *)tmp_d++ = data1 >> 56; } */
          tbz       TMP_D, #0, 1f
          lsr       TMP_X, DATA1, #56
          strb      TMP_Xw, [TMP_D], #1
1:
          /* if (tmp_d & 2) { *(uint16_t *)tmp_d++ = data1 >> 48; } */
          tbz       TMP_D, #1, 1f
          lsr       TMP_X, DATA1, #48
          strh      TMP_Xw, [TMP_D], #2
1:
          /* if (tmp_d & 4) { *(uint32_t *)tmp_d++ = data1 >> 32; } */
          tbz       TMP_D, #2, 1f
          lsr       TMP_X, DATA1, #32
          str       TMP_Xw, [TMP_D], #4
1:
          add       DST, DST, #8
          b         9f
5:
          str       DATA1, [DST], #8
9:
          /* len -= 8 - (dst_alignbit >> 3) */
          sub       LEN, LEN, #8
          add       LEN, LEN, DST_ALIGNBIT, lsr #3
#endif /* BYTE_ORDER */

/*
 * shifting_copy_loop: main loop of the differently-aligned forward copy.
 * Each pass loads two aligned source words at SRC, merges them with the
 * carried word DATA0 by shifting, and stores two aligned destination words
 * at DST.
 *
 * In/out: SRC, DST 8-byte aligned and advancing by 16 per pass;
 *         DATA0 = source word carried over from the previous pass;
 *         LEN = bytes remaining (decremented by 16 per pass).
 * The body runs at least once and repeats while LEN >= 16.  Afterwards
 * bit 3 of LEN selects one more full 8-byte destination word.
 * Clobbers: DATA1, DATA2, TMP_X, flags.
 */
shifting_copy_loop:
          ldp       DATA1, DATA2, [SRC], #16
#if BYTE_ORDER == LITTLE_ENDIAN
          /* data0 = (data0 >> src_dst_alignbit) | (data1 << dst_src_alignbit) */
          lsr       DATA0, DATA0, SRC_DST_ALIGNBIT
          lsl       TMP_X, DATA1, DST_SRC_ALIGNBIT
          orr       DATA0, DATA0, TMP_X
          /* data1 = (data1 >> src_dst_alignbit) | (data2 << dst_src_alignbit) */
          lsr       DATA1, DATA1, SRC_DST_ALIGNBIT
          lsl       TMP_X, DATA2, DST_SRC_ALIGNBIT
          orr       DATA1, DATA1, TMP_X
#else /* BYTE_ORDER */
          /* data0 = (data0 << src_dst_alignbit) | (data1 >> dst_src_alignbit) */
          lsl       DATA0, DATA0, SRC_DST_ALIGNBIT
          lsr       TMP_X, DATA1, DST_SRC_ALIGNBIT
          orr       DATA0, DATA0, TMP_X
          /* data1 = (data1 << src_dst_alignbit) | (data2 >> dst_src_alignbit) */
          lsl       DATA1, DATA1, SRC_DST_ALIGNBIT
          lsr       TMP_X, DATA2, DST_SRC_ALIGNBIT
          orr       DATA1, DATA1, TMP_X
#endif /* BYTE_ORDER */
          stp       DATA0, DATA1, [DST], #16
          mov       DATA0, DATA2                  /* carry the last word over */
          sub       LEN, LEN, #16
          cmp       LEN, #16
          bhs       shifting_copy_loop


          /* write 8 bytes */
          tbz       LEN, #3, 9f
          ldr       DATA1, [SRC], #8
#if BYTE_ORDER == LITTLE_ENDIAN
          /* data0 = (data0 >> src_dst_alignbit) | (data1 << dst_src_alignbit) */
          lsr       DATA0, DATA0, SRC_DST_ALIGNBIT
          lsl       TMP_X, DATA1, DST_SRC_ALIGNBIT
          orr       DATA0, DATA0, TMP_X
#else /* BYTE_ORDER */
          /* data0 = (data0 << src_dst_alignbit) | (data1 >> dst_src_alignbit) */
          lsl       DATA0, DATA0, SRC_DST_ALIGNBIT
          lsr       TMP_X, DATA1, DST_SRC_ALIGNBIT
          orr       DATA0, DATA0, TMP_X
#endif /* BYTE_ORDER */
          str       DATA0, [DST], #8
          mov       DATA0, DATA1
          sub       LEN, LEN, #8
9:

779          cbz       LEN, shifting_copy_done
780
781          /* copy last 1-7 bytes */
782          and       TMP_X, DST_SRC_ALIGNBIT, #63
783          cmp       LEN, TMP_X, lsr #3
784          bls       1f
785          ldr       DATA1, [SRC], #8    /* don't access out of range */
7861:
787
788#if BYTE_ORDER == LITTLE_ENDIAN
789          /* data0 = (data0 >> src_dst_alignbit) | (data1 << dst_src_alignbit) */
790          lsr       DATA0, DATA0, SRC_DST_ALIGNBIT
791          lsl       TMP_X, DATA1, DST_SRC_ALIGNBIT
792          orr       DATA0, DATA0, TMP_X
793#else /* BYTE_ORDER */
794          /* data0 = (data0 << src_dst_alignbit) | (data1 >> dst_src_alignbit) */
795          lsl       DATA0, DATA0, SRC_DST_ALIGNBIT
796          lsr       TMP_X, DATA1, DST_SRC_ALIGNBIT
797          orr       DATA0, DATA0, TMP_X
798#endif /* BYTE_ORDER */
799
800#if BYTE_ORDER == LITTLE_ENDIAN
801          /* if (len & 4) { *(uint32_t *)dst++ = data0; } */
802          tbz       LEN, #2, 1f
803          str       DATA0w, [DST], #4
804          lsr       DATA0, DATA0, #32
8051:
806          /* if (len & 2) { *(uint16_t *)dst++ = data0; } */
807          tbz       LEN, #1, 1f
808          strh      DATA0w, [DST], #2
809          lsr       DATA0, DATA0, #16
8101:
811          /* if (len & 1) { *(uint8_t *)dst++ = data0; } */
812          tbz       LEN, #0, 1f
813          strb      DATA0w, [DST], #1
8141:
815#else /* BYTE_ORDER */
816          /* if (len & 4) { *(uint32_t *)dst++ = data0 >> 32; } */
817          tbz       LEN, #2, 1f
818          lsr       TMP_X, DATA0, #32
819          str       TMP_Xw, [DST], #4
8201:
821          /* if (len & 2) { *(uint16_t *)dst++ = data0 >> 16; } */
822          tbz       LEN, #1, 1f
823          lsr       TMP_X, DATA0, #16
824          strh      TMP_Xw, [DST], #2
8251:
826          /* if (len & 1) { *(uint8_t *)dst++ = data0 >> 8; } */
827          tbz       LEN, #0, 1f
828          lsr       TMP_X, DATA0, #8
829          strb      TMP_Xw, [DST], #1
8301:
831#endif /* BYTE_ORDER */
832shifting_copy_done:
833          ret
834
835#else /* STRICT_ALIGNMENT */
/*
 * Entry path used when STRICT_ALIGNMENT is not defined, followed by the
 * copy of short buffers (LEN < SMALLSIZE).  No alignment fix-up is done
 * here: the loads and stores below are issued at whatever alignment
 * SRC0 and DST happen to have.
 */
#ifndef NO_OVERLAP
	/* if (len == 0 || src == dst) return; */
	cbz	LEN, done
	cmp	SRC0, DST0
	beq	done
	/* if (src < dst) goto backward_ignore_align;  (unsigned compare) */
	bcc	backward_ignore_align
#endif /* NO_OVERLAP */

	prfm	PLDL1KEEP, [SRC0]
	/* if (len >= SMALLSIZE) goto copy_forward; */
	cmp	LEN, #SMALLSIZE
	bcs	copy_forward
	mov	DST, DST0		/* DST is the working pointer from here on */

copy_forward_small:
	/*
	 * Three size classes follow, each finishing with its own ret.
	 * LEN is never decremented: below the first fixed-size transfer
	 * of a class, each low bit of LEN selects one more transfer.
	 */
	cmp	LEN, #8
	bcs	9f

	/* 0 <= len < 8 */
	/* if (len & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
1:
	/* if (len & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
1:
	/* if (len & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
1:
	ret
9:

	prfm	PLDL1KEEP, [SRC0, #8]
	cmp	LEN, #16
	bcs	9f

	/* 8 <= len < 16 */
	/* *(uint64_t *)dst++ = *(uint64_t *)src++; */
	ldr	TMP_X, [SRC0], #8
	str	TMP_X, [DST], #8
	/* if (len & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
1:
	/* if (len & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
1:
	/* if (len & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
1:
	ret
9:

	/*
	 * 16 <= len < 32
	 * (the upper bound assumes SMALLSIZE is 32; it is defined outside
	 * this part of the file -- TODO confirm)
	 */
	prfm	PLDL1KEEP, [SRC0, #16]
	prfm	PLDL1KEEP, [SRC0, #24]
	/* *(uint128_t *)dst++ = *(uint128_t *)src++; */
	ldp	DATA0, DATA1, [SRC0], #16
	stp	DATA0, DATA1, [DST], #16
	/* if (len & 8) { *(uint64_t *)dst++ = *(uint64_t *)src++; } */
	tbz	LEN, #3, 1f
	ldr	TMP_X, [SRC0], #8
	str	TMP_X, [DST], #8
1:
	/* if (len & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
1:
	/* if (len & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
1:
	/* if (len & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
1:
	ret
923#endif /* !STRICT_ALIGNMENT */
924
/*
 * copy_forward: copy LEN bytes forward from SRC0 to DST0.
 *
 *  1. Optionally advance DST to an 8-byte (16-byte when STP_ALIGN > 8)
 *     boundary with 1/2/4(/8)-byte transfers, reducing LEN accordingly.
 *     When STRICT_ALIGNMENT is not defined this step is skipped for
 *     LEN < 512.
 *  2. Copy 1024-byte chunks while LEN >= 1024.
 *  3. Copy the remainder by testing bits 9..0 of LEN, one power-of-two
 *     sized transfer per set bit.  LEN is not updated during this step.
 *
 * Numeric local labels are reused throughout; every "1f"/"9f" refers to
 * the nearest following definition of that label.
 */
	.align	4
copy_forward:
	/* DST may be unaligned at this point */
	mov	DST, DST0
#ifndef STRICT_ALIGNMENT
	cmp	LEN, #512	/* pre-alignment can be overhead when small */
	bcc	9f
#endif /* STRICT_ALIGNMENT */
	/* if (DST & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	DST, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
	sub	LEN, LEN, #1
1:
	/* if (DST & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	DST, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
	sub	LEN, LEN, #2
1:
	/* if (DST & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	DST, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
	sub	LEN, LEN, #4
1:
#if (STP_ALIGN > 8)
	/* if (DST & 8) { *(uint64_t *)dst++ = *(uint64_t *)src++; } */
	tbz	DST, #3, 1f
	ldr	TMP_X, [SRC0], #8
	str	TMP_X, [DST], #8
	sub	LEN, LEN, #8
1:
#endif /* (STP_ALIGN > 8) */
9:

forward_copy1k:
	/* while (len >= 1024) */
	/* { copy1024(dst, src); src += 1024; dst += 1024; len -= 1024; } */
	cmp	LEN, #1024
	blo	9f
1:
	sub	LEN, LEN, #1024
	.rept	(1024 / 16)
	ldp	DATA0, DATA1, [SRC0], #16	/* *dst++ = *src++; */
	stp	DATA0, DATA1, [DST], #16
	.endr
	cmp	LEN, #1024
	bhs	1b
9:

	/*
	 * Here len < 1024.  The remaining length is consumed bit by bit;
	 * LEN itself is left unchanged from now on.
	 */

	/* if (len & 512) { copy512(dst, src); src += 512; dst += 512; } */
	tbz	LEN, #9, 1f
	.rept	(512 / 16)
	ldp	DATA0, DATA1, [SRC0], #16
	stp	DATA0, DATA1, [DST], #16
	.endr
1:
	/* if (len & 256) { copy256(dst, src); src += 256; dst += 256; } */
	tbz	LEN, #8, 1f
	.rept	(256 / 16)
	ldp	DATA0, DATA1, [SRC0], #16
	stp	DATA0, DATA1, [DST], #16
	.endr
1:
	/* if (len & 128) { copy128(dst, src); src += 128; dst += 128; } */
	tbz	LEN, #7, 1f
	.rept	(128 / 16)
	ldp	DATA0, DATA1, [SRC0], #16
	stp	DATA0, DATA1, [DST], #16
	.endr
1:
	/* if (len & 64) { copy64(dst, src); src += 64; dst += 64; } */
	tbz	LEN, #6, 1f
	.rept	(64 / 16)
	ldp	DATA0, DATA1, [SRC0], #16
	stp	DATA0, DATA1, [DST], #16
	.endr
1:
	/* if (len & 32) { copy32(dst, src); src += 32; dst += 32; } */
	tbz	LEN, #5, 1f
	.rept	(32 / 16)
	ldp	DATA0, DATA1, [SRC0], #16
	stp	DATA0, DATA1, [DST], #16
	.endr
1:
	/* if (len & 16) { *(uint128_t *)dst++ = *(uint128_t *)src++; } */
	tbz	LEN, #4, 1f
	ldp	DATA0, DATA1, [SRC0], #16
	stp	DATA0, DATA1, [DST], #16
1:
	/* if (len & 8) { *(uint64_t *)dst++ = *(uint64_t *)src++; } */
	tbz	LEN, #3, 1f
	ldr	TMP_X, [SRC0], #8
	str	TMP_X, [DST], #8
1:
	/* if (len & 4) { *(uint32_t *)dst++ = *(uint32_t *)src++; } */
	tbz	LEN, #2, 1f
	ldr	TMP_Xw, [SRC0], #4
	str	TMP_Xw, [DST], #4
1:
	/* if (len & 2) { *(uint16_t *)dst++ = *(uint16_t *)src++; } */
	tbz	LEN, #1, 1f
	ldrh	TMP_Xw, [SRC0], #2
	strh	TMP_Xw, [DST], #2
1:
	/* if (len & 1) { *(uint8_t *)dst++ = *(uint8_t *)src++; } */
	tbz	LEN, #0, 1f
	ldrb	TMP_Xw, [SRC0], #1
	strb	TMP_Xw, [DST], #1
1:
done:
	ret
END(FUNCTION)
1039