1 /*-
2 * Copyright 2014 Garrett D'Amore <garrett@damore.org>
3 * Copyright 2010 Nexenta Systems, Inc. All rights reserved.
4 * Copyright (c) 1995 Alex Tatmanjants <alex@elvisti.kiev.ua>
5 * at Electronni Visti IA, Kiev, Ukraine.
6 * All rights reserved.
7 *
8 * Copyright (c) 2011 The FreeBSD Foundation
9 * All rights reserved.
10 * Portions of this software were developed by David Chisnall
11 * under sponsorship from the FreeBSD Foundation.
12 *
13 * Redistribution and use in source and binary forms, with or without
14 * modification, are permitted provided that the following conditions
15 * are met:
16 * 1. Redistributions of source code must retain the above copyright
17 * notice, this list of conditions and the following disclaimer.
18 * 2. Redistributions in binary form must reproduce the above copyright
19 * notice, this list of conditions and the following disclaimer in the
20 * documentation and/or other materials provided with the distribution.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 * Adapted to xlocale by John Marino <draco@marino.st>
35 */
36
37 #include <sys/cdefs.h>
38 __FBSDID("$FreeBSD$");
39
40 #include "namespace.h"
41
42 #include <sys/types.h>
43 #include <sys/stat.h>
44 #include <sys/mman.h>
45
46 #include <assert.h>
47 #include <stdio.h>
48 #include <stdlib.h>
49 #include <string.h>
50 #include <wchar.h>
51 #include <errno.h>
52 #include <unistd.h>
53 #include <fcntl.h>
54 #include "un-namespace.h"
55
56 #include "collate.h"
57 #include "setlocale.h"
58 #include "ldpart.h"
59 #include "libc_private.h"
60
61 struct xlocale_collate __xlocale_global_collate = {
62 {{0}, "C"}, 1, 0, 0, 0
63 };
64
65 struct xlocale_collate __xlocale_C_collate = {
66 {{0}, "C"}, 1, 0, 0, 0
67 };
68
69 static int
70 __collate_load_tables_l(const char *encoding, struct xlocale_collate *table);
71
72 static void
destruct_collate(void * t)73 destruct_collate(void *t)
74 {
75 struct xlocale_collate *table = t;
76 if (table->map && (table->maplen > 0)) {
77 (void) munmap(table->map, table->maplen);
78 }
79 free(t);
80 }
81
82 void *
__collate_load(const char * encoding,__unused locale_t unused)83 __collate_load(const char *encoding, __unused locale_t unused)
84 {
85 if (strcmp(encoding, "C") == 0 || strcmp(encoding, "POSIX") == 0) {
86 return &__xlocale_C_collate;
87 }
88 struct xlocale_collate *table = calloc(sizeof(struct xlocale_collate), 1);
89 table->header.header.destructor = destruct_collate;
90 // FIXME: Make sure that _LDP_CACHE is never returned. We should be doing
91 // the caching outside of this section
92 if (__collate_load_tables_l(encoding, table) != _LDP_LOADED) {
93 xlocale_release(table);
94 return NULL;
95 }
96 return table;
97 }
98
99 /**
100 * Load the collation tables for the specified encoding into the global table.
101 */
102 int
__collate_load_tables(const char * encoding)103 __collate_load_tables(const char *encoding)
104 {
105
106 return (__collate_load_tables_l(encoding, &__xlocale_global_collate));
107 }
108
109 int
__collate_load_tables_l(const char * encoding,struct xlocale_collate * table)110 __collate_load_tables_l(const char *encoding, struct xlocale_collate *table)
111 {
112 int i, chains, z;
113 char *buf;
114 char *TMP;
115 char *map;
116 collate_info_t *info;
117 struct stat sbuf;
118 int fd;
119
120 table->__collate_load_error = 1;
121
122 /* 'encoding' must be already checked. */
123 if (strcmp(encoding, "C") == 0 || strcmp(encoding, "POSIX") == 0) {
124 return (_LDP_CACHE);
125 }
126
127 asprintf(&buf, "%s/%s/LC_COLLATE", _PathLocale, encoding);
128 if (buf == NULL)
129 return (_LDP_ERROR);
130
131 if ((fd = _open(buf, O_RDONLY)) < 0) {
132 free(buf);
133 return (_LDP_ERROR);
134 }
135 free(buf);
136 if (_fstat(fd, &sbuf) < 0) {
137 (void) _close(fd);
138 return (_LDP_ERROR);
139 }
140 if (sbuf.st_size < (COLLATE_STR_LEN + sizeof (info))) {
141 (void) _close(fd);
142 errno = EINVAL;
143 return (_LDP_ERROR);
144 }
145 map = mmap(NULL, sbuf.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
146 (void) _close(fd);
147 if ((TMP = map) == NULL) {
148 return (_LDP_ERROR);
149 }
150
151 if (strncmp(TMP, COLLATE_VERSION, COLLATE_STR_LEN) != 0) {
152 (void) munmap(map, sbuf.st_size);
153 errno = EINVAL;
154 return (_LDP_ERROR);
155 }
156 TMP += COLLATE_STR_LEN;
157
158 info = (void *)TMP;
159 TMP += sizeof (*info);
160
161 if ((info->directive_count < 1) ||
162 (info->directive_count >= COLL_WEIGHTS_MAX) ||
163 ((chains = info->chain_count) < 0)) {
164 (void) munmap(map, sbuf.st_size);
165 errno = EINVAL;
166 return (_LDP_ERROR);
167 }
168
169 i = (sizeof (collate_char_t) * (UCHAR_MAX + 1)) +
170 (sizeof (collate_chain_t) * chains) +
171 (sizeof (collate_large_t) * info->large_count);
172 for (z = 0; z < info->directive_count; z++) {
173 i += sizeof (collate_subst_t) * info->subst_count[z];
174 }
175 if (i != (sbuf.st_size - (TMP - map))) {
176 (void) munmap(map, sbuf.st_size);
177 errno = EINVAL;
178 return (_LDP_ERROR);
179 }
180
181 table->info = info;
182 table->char_pri_table = (void *)TMP;
183 TMP += sizeof (collate_char_t) * (UCHAR_MAX + 1);
184
185 for (z = 0; z < info->directive_count; z++) {
186 if (info->subst_count[z] > 0) {
187 table->subst_table[z] = (void *)TMP;
188 TMP += info->subst_count[z] * sizeof (collate_subst_t);
189 } else {
190 table->subst_table[z] = NULL;
191 }
192 }
193
194 if (chains > 0) {
195 table->chain_pri_table = (void *)TMP;
196 TMP += chains * sizeof (collate_chain_t);
197 } else
198 table->chain_pri_table = NULL;
199 if (info->large_count > 0)
200 table->large_pri_table = (void *)TMP;
201 else
202 table->large_pri_table = NULL;
203
204 table->__collate_load_error = 0;
205 return (_LDP_LOADED);
206 }
207
208 static const int32_t *
substsearch(struct xlocale_collate * table,const wchar_t key,int pass)209 substsearch(struct xlocale_collate *table, const wchar_t key, int pass)
210 {
211 const collate_subst_t *p;
212 int n = table->info->subst_count[pass];
213
214 if (n == 0)
215 return (NULL);
216
217 if (pass >= table->info->directive_count)
218 return (NULL);
219
220 if (!(key & COLLATE_SUBST_PRIORITY))
221 return (NULL);
222
223 p = table->subst_table[pass] + (key & ~COLLATE_SUBST_PRIORITY);
224 assert(p->key == key);
225 return (p->pri);
226 }
227
228 static collate_chain_t *
chainsearch(struct xlocale_collate * table,const wchar_t * key,int * len)229 chainsearch(struct xlocale_collate *table, const wchar_t *key, int *len)
230 {
231 int low = 0;
232 int high = table->info->chain_count - 1;;
233 int next, compar, l;
234 collate_chain_t *p;
235 collate_chain_t *tab = table->chain_pri_table;
236
237 if (high < 0)
238 return (NULL);
239
240 while (low <= high) {
241 next = (low + high) / 2;
242 p = tab + next;
243 compar = *key - *p->str;
244 if (compar == 0) {
245 l = wcsnlen(p->str, COLLATE_STR_LEN);
246 compar = wcsncmp(key, p->str, l);
247 if (compar == 0) {
248 *len = l;
249 return (p);
250 }
251 }
252 if (compar > 0)
253 low = next + 1;
254 else
255 high = next - 1;
256 }
257 return (NULL);
258 }
259
260 static collate_large_t *
largesearch(struct xlocale_collate * table,const wchar_t key)261 largesearch(struct xlocale_collate *table, const wchar_t key)
262 {
263 int low = 0;
264 int high = table->info->large_count - 1;
265 int next, compar;
266 collate_large_t *p;
267 collate_large_t *tab = table->large_pri_table;
268
269 if (high < 0)
270 return (NULL);
271
272 while (low <= high) {
273 next = (low + high) / 2;
274 p = tab + next;
275 compar = key - p->val;
276 if (compar == 0)
277 return (p);
278 if (compar > 0)
279 low = next + 1;
280 else
281 high = next - 1;
282 }
283 return (NULL);
284 }
285
286 void
_collate_lookup(struct xlocale_collate * table,const wchar_t * t,int * len,int * pri,int which,const int ** state)287 _collate_lookup(struct xlocale_collate *table, const wchar_t *t, int *len,
288 int *pri, int which, const int **state)
289 {
290 collate_chain_t *p2;
291 collate_large_t *match;
292 int p, l;
293 const int *sptr;
294
295 /*
296 * If this is the "last" pass for the UNDEFINED, then
297 * we just return the priority itself.
298 */
299 if (which >= table->info->directive_count) {
300 *pri = *t;
301 *len = 1;
302 *state = NULL;
303 return;
304 }
305
306 /*
307 * If we have remaining substitution data from a previous
308 * call, consume it first.
309 */
310 if ((sptr = *state) != NULL) {
311 *pri = *sptr;
312 sptr++;
313 if ((sptr == *state) || (sptr == NULL))
314 *state = NULL;
315 else
316 *state = sptr;
317 *len = 0;
318 return;
319 }
320
321 /* No active substitutions */
322 *len = 1;
323
324 /*
325 * Check for composites such as dipthongs that collate as a
326 * single element (aka chains or collating-elements).
327 */
328 if (((p2 = chainsearch(table, t, &l)) != NULL) &&
329 ((p = p2->pri[which]) >= 0)) {
330
331 *len = l;
332 *pri = p;
333
334 } else if (*t <= UCHAR_MAX) {
335
336 /*
337 * Character is a small (8-bit) character.
338 * We just look these up directly for speed.
339 */
340 *pri = table->char_pri_table[*t].pri[which];
341
342 } else if ((table->info->large_count > 0) &&
343 ((match = largesearch(table, *t)) != NULL)) {
344
345 /*
346 * Character was found in the extended table.
347 */
348 *pri = match->pri.pri[which];
349
350 } else {
351 /*
352 * Character lacks a specific definition.
353 */
354 if (table->info->directive[which] & DIRECTIVE_UNDEFINED) {
355 /* Mask off sign bit to prevent ordering confusion. */
356 *pri = (*t & COLLATE_MAX_PRIORITY);
357 } else {
358 *pri = table->info->undef_pri[which];
359 }
360 /* No substitutions for undefined characters! */
361 return;
362 }
363
364 /*
365 * Try substituting (expanding) the character. We are
366 * currently doing this *after* the chain compression. I
367 * think it should not matter, but this way might be slightly
368 * faster.
369 *
370 * We do this after the priority search, as this will help us
371 * to identify a single key value. In order for this to work,
372 * its important that the priority assigned to a given element
373 * to be substituted be unique for that level. The localedef
374 * code ensures this for us.
375 */
376 if ((sptr = substsearch(table, *pri, which)) != NULL) {
377 if ((*pri = *sptr) > 0) {
378 sptr++;
379 *state = *sptr ? sptr : NULL;
380 }
381 }
382
383 }
384
385 /*
386 * This is the meaty part of wcsxfrm & strxfrm. Note that it does
387 * NOT NULL terminate. That is left to the caller.
388 */
389 size_t
_collate_wxfrm(struct xlocale_collate * table,const wchar_t * src,wchar_t * xf,size_t room)390 _collate_wxfrm(struct xlocale_collate *table, const wchar_t *src, wchar_t *xf,
391 size_t room)
392 {
393 int pri;
394 int len;
395 const wchar_t *t;
396 wchar_t *tr = NULL;
397 int direc;
398 int pass;
399 const int32_t *state;
400 size_t want = 0;
401 size_t need = 0;
402 int ndir = table->info->directive_count;
403
404 assert(src);
405
406 for (pass = 0; pass <= ndir; pass++) {
407
408 state = NULL;
409
410 if (pass != 0) {
411 /* insert level separator from the previous pass */
412 if (room) {
413 *xf++ = 1;
414 room--;
415 }
416 want++;
417 }
418
419 /* special pass for undefined */
420 if (pass == ndir) {
421 direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED;
422 } else {
423 direc = table->info->directive[pass];
424 }
425
426 t = src;
427
428 if (direc & DIRECTIVE_BACKWARD) {
429 wchar_t *bp, *fp, c;
430 free(tr);
431 if ((tr = wcsdup(t)) == NULL) {
432 errno = ENOMEM;
433 goto fail;
434 }
435 bp = tr;
436 fp = tr + wcslen(tr) - 1;
437 while (bp < fp) {
438 c = *bp;
439 *bp++ = *fp;
440 *fp-- = c;
441 }
442 t = (const wchar_t *)tr;
443 }
444
445 if (direc & DIRECTIVE_POSITION) {
446 while (*t || state) {
447 _collate_lookup(table, t, &len, &pri, pass, &state);
448 t += len;
449 if (pri <= 0) {
450 if (pri < 0) {
451 errno = EINVAL;
452 goto fail;
453 }
454 pri = COLLATE_MAX_PRIORITY;
455 }
456 if (room) {
457 *xf++ = pri;
458 room--;
459 }
460 want++;
461 need = want;
462 }
463 } else {
464 while (*t || state) {
465 _collate_lookup(table, t, &len, &pri, pass, &state);
466 t += len;
467 if (pri <= 0) {
468 if (pri < 0) {
469 errno = EINVAL;
470 goto fail;
471 }
472 continue;
473 }
474 if (room) {
475 *xf++ = pri;
476 room--;
477 }
478 want++;
479 need = want;
480 }
481 }
482 }
483 free(tr);
484 return (need);
485
486 fail:
487 free(tr);
488 return ((size_t)(-1));
489 }
490
491 /*
492 * In the non-POSIX case, we transform each character into a string of
493 * characters representing the character's priority. Since char is usually
494 * signed, we are limited by 7 bits per byte. To avoid zero, we need to add
495 * XFRM_OFFSET, so we can't use a full 7 bits. For simplicity, we choose 6
496 * bits per byte.
497 *
498 * It turns out that we sometimes have real priorities that are
499 * 31-bits wide. (But: be careful using priorities where the high
500 * order bit is set -- i.e. the priority is negative. The sort order
501 * may be surprising!)
502 *
503 * TODO: This would be a good area to optimize somewhat. It turns out
504 * that real prioririties *except for the last UNDEFINED pass* are generally
505 * very small. We need the localedef code to precalculate the max
506 * priority for us, and ideally also give us a mask, and then we could
507 * severely limit what we expand to.
508 */
509 #define XFRM_BYTES 6
510 #define XFRM_OFFSET ('0') /* make all printable characters */
511 #define XFRM_SHIFT 6
512 #define XFRM_MASK ((1 << XFRM_SHIFT) - 1)
513 #define XFRM_SEP ('.') /* chosen to be less than XFRM_OFFSET */
514
515 static int
xfrm(struct xlocale_collate * table,unsigned char * p,int pri,int pass)516 xfrm(struct xlocale_collate *table, unsigned char *p, int pri, int pass)
517 {
518 /* we use unsigned to ensure zero fill on right shift */
519 uint32_t val = (uint32_t)table->info->pri_count[pass];
520 int nc = 0;
521
522 while (val) {
523 *p = (pri & XFRM_MASK) + XFRM_OFFSET;
524 pri >>= XFRM_SHIFT;
525 val >>= XFRM_SHIFT;
526 p++;
527 nc++;
528 }
529 return (nc);
530 }
531
532 size_t
_collate_sxfrm(struct xlocale_collate * table,const wchar_t * src,char * xf,size_t room)533 _collate_sxfrm(struct xlocale_collate *table, const wchar_t *src, char *xf,
534 size_t room)
535 {
536 int pri;
537 int len;
538 const wchar_t *t;
539 wchar_t *tr = NULL;
540 int direc;
541 int pass;
542 const int32_t *state;
543 size_t want = 0;
544 size_t need = 0;
545 int b;
546 uint8_t buf[XFRM_BYTES];
547 int ndir = table->info->directive_count;
548
549 assert(src);
550
551 for (pass = 0; pass <= ndir; pass++) {
552
553 state = NULL;
554
555 if (pass != 0) {
556 /* insert level separator from the previous pass */
557 if (room) {
558 *xf++ = XFRM_SEP;
559 room--;
560 }
561 want++;
562 }
563
564 /* special pass for undefined */
565 if (pass == ndir) {
566 direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED;
567 } else {
568 direc = table->info->directive[pass];
569 }
570
571 t = src;
572
573 if (direc & DIRECTIVE_BACKWARD) {
574 wchar_t *bp, *fp, c;
575 free(tr);
576 if ((tr = wcsdup(t)) == NULL) {
577 errno = ENOMEM;
578 goto fail;
579 }
580 bp = tr;
581 fp = tr + wcslen(tr) - 1;
582 while (bp < fp) {
583 c = *bp;
584 *bp++ = *fp;
585 *fp-- = c;
586 }
587 t = (const wchar_t *)tr;
588 }
589
590 if (direc & DIRECTIVE_POSITION) {
591 while (*t || state) {
592
593 _collate_lookup(table, t, &len, &pri, pass, &state);
594 t += len;
595 if (pri <= 0) {
596 if (pri < 0) {
597 errno = EINVAL;
598 goto fail;
599 }
600 pri = COLLATE_MAX_PRIORITY;
601 }
602
603 b = xfrm(table, buf, pri, pass);
604 want += b;
605 if (room) {
606 while (b) {
607 b--;
608 if (room) {
609 *xf++ = buf[b];
610 room--;
611 }
612 }
613 }
614 need = want;
615 }
616 } else {
617 while (*t || state) {
618 _collate_lookup(table, t, &len, &pri, pass, &state);
619 t += len;
620 if (pri <= 0) {
621 if (pri < 0) {
622 errno = EINVAL;
623 goto fail;
624 }
625 continue;
626 }
627
628 b = xfrm(table, buf, pri, pass);
629 want += b;
630 if (room) {
631
632 while (b) {
633 b--;
634 if (room) {
635 *xf++ = buf[b];
636 room--;
637 }
638 }
639 }
640 need = want;
641 }
642 }
643 }
644 free(tr);
645 return (need);
646
647 fail:
648 free(tr);
649 return ((size_t)(-1));
650 }
651
652 /*
653 * __collate_equiv_value returns the primary collation value for the given
654 * collating symbol specified by str and len. Zero or negative is returned
655 * if the collating symbol was not found. This function is used by bracket
656 * code in the TRE regex library.
657 */
658 int
__collate_equiv_value(locale_t locale,const wchar_t * str,size_t len)659 __collate_equiv_value(locale_t locale, const wchar_t *str, size_t len)
660 {
661 int32_t e;
662
663 if (len < 1 || len >= COLLATE_STR_LEN)
664 return (-1);
665
666 FIX_LOCALE(locale);
667 struct xlocale_collate *table =
668 (struct xlocale_collate*)locale->components[XLC_COLLATE];
669
670 if (table->__collate_load_error)
671 return ((len == 1 && *str <= UCHAR_MAX) ? *str : -1);
672
673 if (len == 1) {
674 e = -1;
675 if (*str <= UCHAR_MAX)
676 e = table->char_pri_table[*str].pri[0];
677 else if (table->info->large_count > 0) {
678 collate_large_t *match_large;
679 match_large = largesearch(table, *str);
680 if (match_large)
681 e = match_large->pri.pri[0];
682 }
683 if (e == 0)
684 return (1);
685 return (e > 0 ? e : 0);
686 }
687 if (table->info->chain_count > 0) {
688 wchar_t name[COLLATE_STR_LEN];
689 collate_chain_t *match_chain;
690 int clen;
691
692 wcsncpy (name, str, len);
693 name[len] = 0;
694 match_chain = chainsearch(table, name, &clen);
695 if (match_chain) {
696 e = match_chain->pri[0];
697 if (e == 0)
698 return (1);
699 return (e < 0 ? -e : e);
700 }
701 }
702 return (0);
703 }
704