1 /*
2 * Copyright 2010 Nexenta Systems, Inc. All rights reserved.
3 * Copyright 2015 John Marino <draco@marino.st>
4 *
5 * This source code is derived from the illumos localedef command, and
6 * provided under BSD-style license terms by Nexenta Systems, Inc.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 *
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
22 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28 * POSSIBILITY OF SUCH DAMAGE.
29 */
30
31 /*
32 * LC_COLLATE database generation routines for localedef.
33 */
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD$");
36
37 #include <sys/types.h>
38 #include <sys/tree.h>
39
40 #include <stdio.h>
41 #include <stddef.h>
42 #include <stdlib.h>
43 #include <errno.h>
44 #include <string.h>
45 #include <unistd.h>
46 #include <wchar.h>
47 #include <limits.h>
48 #include "localedef.h"
49 #include "parser.h"
50 #include "collate.h"
51
52 /*
53 * Design notes.
54 *
55 * It will be extremely helpful to the reader if they have access to
56 * the localedef and locale file format specifications available.
57 * Latest versions of these are available from www.opengroup.org.
58 *
59 * The design for the collation code is a bit complex. The goal is a
60 * single collation database as described in collate.h (in
61 * libc/port/locale). However, there are some other tidbits:
62 *
63 * a) The substitution entries are now a directly indexable array. A
64 * priority elsewhere in the table is taken as an index into the
65 * substitution table if it has a high bit (COLLATE_SUBST_PRIORITY)
66 * set. (The bit is cleared and the result is the index into the
67 * table.
68 *
69 * b) We eliminate duplicate entries into the substitution table.
70 * This saves a lot of space.
71 *
72 * c) The priorities for each level are "compressed", so that each
73 * sorting level has consecutively numbered priorities starting at 1.
74 * (O is reserved for the ignore priority.) This means sort levels
75 * which only have a few distinct priorities can represent the
76 * priority level in fewer bits, which makes the strxfrm output
77 * smaller.
78 *
79 * d) We record the total number of priorities so that strxfrm can
80 * figure out how many bytes to expand a numeric priority into.
81 *
82 * e) For the UNDEFINED pass (the last pass), we record the maximum
83 * number of bits needed to uniquely prioritize these entries, so that
84 * the last pass can also use smaller strxfrm output when possible.
85 *
86 * f) Priorities with the sign bit set are verboten. This works out
87 * because no active character set needs that bit to carry significant
88 * information once the character is in wide form.
89 *
90 * To process the entire data to make the database, we actually run
91 * multiple passes over the data.
92 *
93 * The first pass, which is done at parse time, identifies elements,
94 * substitutions, and such, and records them in priority order. As
95 * some priorities can refer to other priorities, using forward
96 * references, we use a table of references indicating whether the
97 * priority's value has been resolved, or whether it is still a
98 * reference.
99 *
100 * The second pass walks over all the items in priority order, noting
101 * that they are used directly, and not just an indirect reference.
102 * This is done by creating a "weight" structure for the item. The
103 * weights are stashed in an RB tree sorted by relative "priority".
104 *
105 * The third pass walks over all the weight structures, in priority
106 * order, and assigns a new monotonically increasing (per sort level)
107 * weight value to them. These are the values that will actually be
108 * written to the file.
109 *
110 * The fourth pass just writes the data out.
111 */
112
113 /*
114 * In order to resolve the priorities, we create a table of priorities.
115 * Entries in the table can be in one of three states.
116 *
117 * UNKNOWN is for newly allocated entries, and indicates that nothing
118 * is known about the priority. (For example, when new entries are created
119 * for collating-symbols, this is the value assigned for them until the
120 * collating symbol's order has been determined.
121 *
122 * RESOLVED is used for an entry where the priority indicates the final
123 * numeric weight.
124 *
125 * REFER is used for entries that reference other entries. Typically
126 * this is used for forward references. A collating-symbol can never
127 * have this value.
128 *
129 * The "pass" field is used during final resolution to aid in detection
130 * of referencing loops. (For example <A> depends on <B>, but <B> has its
131 * priority dependent on <A>.)
132 */
133 typedef enum {
134 UNKNOWN, /* priority is totally unknown */
135 RESOLVED, /* priority value fully resolved */
136 REFER /* priority is a reference (index) */
137 } res_t;
138
139 typedef struct weight {
140 int32_t pri;
141 int opt;
142 RB_ENTRY(weight) entry;
143 } weight_t;
144
145 typedef struct priority {
146 res_t res;
147 int32_t pri;
148 int pass;
149 int lineno;
150 } collpri_t;
151
152 #define NUM_WT collinfo.directive_count
153
154 /*
155 * These are the abstract collating symbols, which are just a symbolic
156 * way to reference a priority.
157 */
158 struct collsym {
159 char *name;
160 int32_t ref;
161 RB_ENTRY(collsym) entry;
162 };
163
164 /*
165 * These are also abstract collating symbols, but we allow them to have
166 * different priorities at different levels.
167 */
168 typedef struct collundef {
169 char *name;
170 int32_t ref[COLL_WEIGHTS_MAX];
171 RB_ENTRY(collundef) entry;
172 } collundef_t;
173
174 /*
175 * These are called "chains" in libc. This records the fact that two
176 * more characters should be treated as a single collating entity when
177 * they appear together. For example, in Spanish <C><h> gets collated
178 * as a character between <C> and <D>.
179 */
180 struct collelem {
181 char *symbol;
182 wchar_t *expand;
183 int32_t ref[COLL_WEIGHTS_MAX];
184 RB_ENTRY(collelem) rb_bysymbol;
185 RB_ENTRY(collelem) rb_byexpand;
186 };
187
188 /*
189 * Individual characters have a sequence of weights as well.
190 */
191 typedef struct collchar {
192 wchar_t wc;
193 int32_t ref[COLL_WEIGHTS_MAX];
194 RB_ENTRY(collchar) entry;
195 } collchar_t;
196
197 /*
198 * Substitution entries. The key is itself a priority. Note that
199 * when we create one of these, we *automatically* wind up with a
200 * fully resolved priority for the key, because creation of
201 * substitutions creates a resolved priority at the same time.
202 */
203 typedef struct subst{
204 int32_t key;
205 int32_t ref[COLLATE_STR_LEN];
206 RB_ENTRY(subst) entry;
207 RB_ENTRY(subst) entry_ref;
208 } subst_t;
209
210 static RB_HEAD(collsyms, collsym) collsyms;
211 static RB_HEAD(collundefs, collundef) collundefs;
212 static RB_HEAD(elem_by_symbol, collelem) elem_by_symbol;
213 static RB_HEAD(elem_by_expand, collelem) elem_by_expand;
214 static RB_HEAD(collchars, collchar) collchars;
215 static RB_HEAD(substs, subst) substs[COLL_WEIGHTS_MAX];
216 static RB_HEAD(substs_ref, subst) substs_ref[COLL_WEIGHTS_MAX];
217 static RB_HEAD(weights, weight) weights[COLL_WEIGHTS_MAX];
218 static int32_t nweight[COLL_WEIGHTS_MAX];
219
220 /*
221 * This is state tracking for the ellipsis token. Note that we start
222 * the initial values so that the ellipsis logic will think we got a
223 * magic starting value of NUL. It starts at minus one because the
224 * starting point is exclusive -- i.e. the starting point is not
225 * itself handled by the ellipsis code.
226 */
227 static int currorder = EOF;
228 static int lastorder = EOF;
229 static collelem_t *currelem;
230 static collchar_t *currchar;
231 static collundef_t *currundef;
232 static wchar_t ellipsis_start = 0;
233 static int32_t ellipsis_weights[COLL_WEIGHTS_MAX];
234
235 /*
236 * We keep a running tally of weights.
237 */
238 static int nextpri = 1;
239 static int nextsubst[COLL_WEIGHTS_MAX] = { 0 };
240
241 /*
242 * This array collects up the weights for each level.
243 */
244 static int32_t order_weights[COLL_WEIGHTS_MAX];
245 static int curr_weight = 0;
246 static int32_t subst_weights[COLLATE_STR_LEN];
247 static int curr_subst = 0;
248
249 /*
250 * Some initial priority values.
251 */
252 static int32_t pri_undefined[COLL_WEIGHTS_MAX];
253 static int32_t pri_ignore;
254
255 static collate_info_t collinfo;
256
257 static collpri_t *prilist = NULL;
258 static int numpri = 0;
259 static int maxpri = 0;
260
261 static void start_order(int);
262
263 static int32_t
new_pri(void)264 new_pri(void)
265 {
266 int i;
267
268 if (numpri >= maxpri) {
269 maxpri = maxpri ? maxpri * 2 : 1024;
270 prilist = realloc(prilist, sizeof (collpri_t) * maxpri);
271 if (prilist == NULL) {
272 fprintf(stderr,"out of memory");
273 return (-1);
274 }
275 for (i = numpri; i < maxpri; i++) {
276 prilist[i].res = UNKNOWN;
277 prilist[i].pri = 0;
278 prilist[i].pass = 0;
279 }
280 }
281 return (numpri++);
282 }
283
284 static collpri_t *
get_pri(int32_t ref)285 get_pri(int32_t ref)
286 {
287 if ((ref < 0) || (ref > numpri)) {
288 INTERR;
289 return (NULL);
290 }
291 return (&prilist[ref]);
292 }
293
294 static void
set_pri(int32_t ref,int32_t v,res_t res)295 set_pri(int32_t ref, int32_t v, res_t res)
296 {
297 collpri_t *pri;
298
299 pri = get_pri(ref);
300
301 if ((res == REFER) && ((v < 0) || (v >= numpri))) {
302 INTERR;
303 }
304
305 /* Resolve self references */
306 if ((res == REFER) && (ref == v)) {
307 v = nextpri;
308 res = RESOLVED;
309 }
310
311 if (pri->res != UNKNOWN) {
312 warn("repeated item in order list (first on %d)",
313 pri->lineno);
314 return;
315 }
316 pri->lineno = lineno;
317 pri->pri = v;
318 pri->res = res;
319 }
320
321 static int32_t
resolve_pri(int32_t ref)322 resolve_pri(int32_t ref)
323 {
324 collpri_t *pri;
325 static int32_t pass = 0;
326
327 pri = get_pri(ref);
328 pass++;
329 while (pri->res == REFER) {
330 if (pri->pass == pass) {
331 /* report a line with the circular symbol */
332 lineno = pri->lineno;
333 fprintf(stderr,"circular reference in order list");
334 return (-1);
335 }
336 if ((pri->pri < 0) || (pri->pri >= numpri)) {
337 INTERR;
338 return (-1);
339 }
340 pri->pass = pass;
341 pri = &prilist[pri->pri];
342 }
343
344 if (pri->res == UNKNOWN) {
345 return (-1);
346 }
347 if (pri->res != RESOLVED)
348 INTERR;
349
350 return (pri->pri);
351 }
352
353 static int
weight_compare(const void * n1,const void * n2)354 weight_compare(const void *n1, const void *n2)
355 {
356 int32_t k1 = ((const weight_t *)n1)->pri;
357 int32_t k2 = ((const weight_t *)n2)->pri;
358
359 return (k1 < k2 ? -1 : k1 > k2 ? 1 : 0);
360 }
361
362 RB_GENERATE_STATIC(weights, weight, entry, weight_compare);
363
364 static int
collsym_compare(const void * n1,const void * n2)365 collsym_compare(const void *n1, const void *n2)
366 {
367 const collsym_t *c1 = n1;
368 const collsym_t *c2 = n2;
369 int rv;
370
371 rv = strcmp(c1->name, c2->name);
372 return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
373 }
374
375 RB_GENERATE_STATIC(collsyms, collsym, entry, collsym_compare);
376
377 static int
collundef_compare(const void * n1,const void * n2)378 collundef_compare(const void *n1, const void *n2)
379 {
380 const collundef_t *c1 = n1;
381 const collundef_t *c2 = n2;
382 int rv;
383
384 rv = strcmp(c1->name, c2->name);
385 return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
386 }
387
388 RB_GENERATE_STATIC(collundefs, collundef, entry, collundef_compare);
389
390 static int
element_compare_symbol(const void * n1,const void * n2)391 element_compare_symbol(const void *n1, const void *n2)
392 {
393 const collelem_t *c1 = n1;
394 const collelem_t *c2 = n2;
395 int rv;
396
397 rv = strcmp(c1->symbol, c2->symbol);
398 return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
399 }
400
401 RB_GENERATE_STATIC(elem_by_symbol, collelem, rb_bysymbol, element_compare_symbol);
402
403 static int
element_compare_expand(const void * n1,const void * n2)404 element_compare_expand(const void *n1, const void *n2)
405 {
406 const collelem_t *c1 = n1;
407 const collelem_t *c2 = n2;
408 int rv;
409
410 rv = wcscmp(c1->expand, c2->expand);
411 return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
412 }
413
414 RB_GENERATE_STATIC(elem_by_expand, collelem, rb_byexpand, element_compare_expand);
415
416 static int
collchar_compare(const void * n1,const void * n2)417 collchar_compare(const void *n1, const void *n2)
418 {
419 wchar_t k1 = ((const collchar_t *)n1)->wc;
420 wchar_t k2 = ((const collchar_t *)n2)->wc;
421
422 return (k1 < k2 ? -1 : k1 > k2 ? 1 : 0);
423 }
424
425 RB_GENERATE_STATIC(collchars, collchar, entry, collchar_compare);
426
427 static int
subst_compare(const void * n1,const void * n2)428 subst_compare(const void *n1, const void *n2)
429 {
430 int32_t k1 = ((const subst_t *)n1)->key;
431 int32_t k2 = ((const subst_t *)n2)->key;
432
433 return (k1 < k2 ? -1 : k1 > k2 ? 1 : 0);
434 }
435
436 RB_GENERATE_STATIC(substs, subst, entry, subst_compare);
437
438 static int
subst_compare_ref(const void * n1,const void * n2)439 subst_compare_ref(const void *n1, const void *n2)
440 {
441 const wchar_t *c1 = ((const subst_t *)n1)->ref;
442 const wchar_t *c2 = ((const subst_t *)n2)->ref;
443 int rv;
444
445 rv = wcscmp(c1, c2);
446 return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
447 }
448
449 RB_GENERATE_STATIC(substs_ref, subst, entry_ref, subst_compare_ref);
450
451 void
init_collate(void)452 init_collate(void)
453 {
454 int i;
455
456 RB_INIT(&collsyms);
457
458 RB_INIT(&collundefs);
459
460 RB_INIT(&elem_by_symbol);
461
462 RB_INIT(&elem_by_expand);
463
464 RB_INIT(&collchars);
465
466 for (i = 0; i < COLL_WEIGHTS_MAX; i++) {
467 RB_INIT(&substs[i]);
468 RB_INIT(&substs_ref[i]);
469 RB_INIT(&weights[i]);
470 nweight[i] = 1;
471 }
472
473 (void) memset(&collinfo, 0, sizeof (collinfo));
474
475 /* allocate some initial priorities */
476 pri_ignore = new_pri();
477
478 set_pri(pri_ignore, 0, RESOLVED);
479
480 for (i = 0; i < COLL_WEIGHTS_MAX; i++) {
481 pri_undefined[i] = new_pri();
482
483 /* we will override this later */
484 set_pri(pri_undefined[i], COLLATE_MAX_PRIORITY, UNKNOWN);
485 }
486 }
487
488 void
define_collsym(char * name)489 define_collsym(char *name)
490 {
491 collsym_t *sym;
492
493 if ((sym = calloc(sizeof (*sym), 1)) == NULL) {
494 fprintf(stderr,"out of memory");
495 return;
496 }
497 sym->name = name;
498 sym->ref = new_pri();
499
500 if (RB_FIND(collsyms, &collsyms, sym) != NULL) {
501 /*
502 * This should never happen because we are only called
503 * for undefined symbols.
504 */
505 INTERR;
506 return;
507 }
508 RB_INSERT(collsyms, &collsyms, sym);
509 }
510
511 collsym_t *
lookup_collsym(char * name)512 lookup_collsym(char *name)
513 {
514 collsym_t srch;
515
516 srch.name = name;
517 return (RB_FIND(collsyms, &collsyms, &srch));
518 }
519
520 collelem_t *
lookup_collelem(char * symbol)521 lookup_collelem(char *symbol)
522 {
523 collelem_t srch;
524
525 srch.symbol = symbol;
526 return (RB_FIND(elem_by_symbol, &elem_by_symbol, &srch));
527 }
528
529 static collundef_t *
get_collundef(char * name)530 get_collundef(char *name)
531 {
532 collundef_t srch;
533 collundef_t *ud;
534 int i;
535
536 srch.name = name;
537 if ((ud = RB_FIND(collundefs, &collundefs, &srch)) == NULL) {
538 if (((ud = calloc(sizeof (*ud), 1)) == NULL) ||
539 ((ud->name = strdup(name)) == NULL)) {
540 fprintf(stderr,"out of memory");
541 return (NULL);
542 }
543 for (i = 0; i < NUM_WT; i++) {
544 ud->ref[i] = new_pri();
545 }
546 RB_INSERT(collundefs, &collundefs, ud);
547 }
548 add_charmap_undefined(name);
549 return (ud);
550 }
551
552 static collchar_t *
get_collchar(wchar_t wc,int create)553 get_collchar(wchar_t wc, int create)
554 {
555 collchar_t srch;
556 collchar_t *cc;
557 int i;
558
559 srch.wc = wc;
560 cc = RB_FIND(collchars, &collchars, &srch);
561 if ((cc == NULL) && create) {
562 if ((cc = calloc(sizeof (*cc), 1)) == NULL) {
563 fprintf(stderr, "out of memory");
564 return (NULL);
565 }
566 for (i = 0; i < NUM_WT; i++) {
567 cc->ref[i] = new_pri();
568 }
569 cc->wc = wc;
570 RB_INSERT(collchars, &collchars, cc);
571 }
572 return (cc);
573 }
574
575 void
end_order_collsym(collsym_t * sym)576 end_order_collsym(collsym_t *sym)
577 {
578 start_order(T_COLLSYM);
579 /* update the weight */
580
581 set_pri(sym->ref, nextpri, RESOLVED);
582 nextpri++;
583 }
584
585 void
end_order(void)586 end_order(void)
587 {
588 int i;
589 int32_t pri;
590 int32_t ref;
591 collpri_t *p;
592
593 /* advance the priority/weight */
594 pri = nextpri;
595
596 switch (currorder) {
597 case T_CHAR:
598 for (i = 0; i < NUM_WT; i++) {
599 if (((ref = order_weights[i]) < 0) ||
600 ((p = get_pri(ref)) == NULL) ||
601 (p->pri == -1)) {
602 /* unspecified weight is a self reference */
603 set_pri(currchar->ref[i], pri, RESOLVED);
604 } else {
605 set_pri(currchar->ref[i], ref, REFER);
606 }
607 order_weights[i] = -1;
608 }
609
610 /* leave a cookie trail in case next symbol is ellipsis */
611 ellipsis_start = currchar->wc + 1;
612 currchar = NULL;
613 break;
614
615 case T_ELLIPSIS:
616 /* save off the weights were we can find them */
617 for (i = 0; i < NUM_WT; i++) {
618 ellipsis_weights[i] = order_weights[i];
619 order_weights[i] = -1;
620 }
621 break;
622
623 case T_COLLELEM:
624 if (currelem == NULL) {
625 INTERR;
626 } else {
627 for (i = 0; i < NUM_WT; i++) {
628
629 if (((ref = order_weights[i]) < 0) ||
630 ((p = get_pri(ref)) == NULL) ||
631 (p->pri == -1)) {
632 set_pri(currelem->ref[i], pri,
633 RESOLVED);
634 } else {
635 set_pri(currelem->ref[i], ref, REFER);
636 }
637 order_weights[i] = -1;
638 }
639 }
640 break;
641
642 case T_UNDEFINED:
643 for (i = 0; i < NUM_WT; i++) {
644 if (((ref = order_weights[i]) < 0) ||
645 ((p = get_pri(ref)) == NULL) ||
646 (p->pri == -1)) {
647 set_pri(pri_undefined[i], -1, RESOLVED);
648 } else {
649 set_pri(pri_undefined[i], ref, REFER);
650 }
651 order_weights[i] = -1;
652 }
653 break;
654
655 case T_SYMBOL:
656 for (i = 0; i < NUM_WT; i++) {
657 if (((ref = order_weights[i]) < 0) ||
658 ((p = get_pri(ref)) == NULL) ||
659 (p->pri == -1)) {
660 set_pri(currundef->ref[i], pri, RESOLVED);
661 } else {
662 set_pri(currundef->ref[i], ref, REFER);
663 }
664 order_weights[i] = -1;
665 }
666 break;
667
668 default:
669 INTERR;
670 }
671
672 nextpri++;
673 }
674
675 static void
start_order(int type)676 start_order(int type)
677 {
678 int i;
679
680 lastorder = currorder;
681 currorder = type;
682
683 /* this is used to protect ELLIPSIS processing */
684 if ((lastorder == T_ELLIPSIS) && (type != T_CHAR)) {
685 fprintf(stderr, "character value expected");
686 }
687
688 for (i = 0; i < COLL_WEIGHTS_MAX; i++) {
689 order_weights[i] = -1;
690 }
691 curr_weight = 0;
692 }
693
694 void
start_order_undefined(void)695 start_order_undefined(void)
696 {
697 start_order(T_UNDEFINED);
698 }
699
700 void
start_order_symbol(char * name)701 start_order_symbol(char *name)
702 {
703 currundef = get_collundef(name);
704 start_order(T_SYMBOL);
705 }
706
707 void
start_order_char(wchar_t wc)708 start_order_char(wchar_t wc)
709 {
710 collchar_t *cc;
711 int32_t ref;
712
713 start_order(T_CHAR);
714
715 /*
716 * If we last saw an ellipsis, then we need to close the range.
717 * Handle that here. Note that we have to be careful because the
718 * items *inside* the range are treated exclusiveley to the items
719 * outside of the range. The ends of the range can have quite
720 * different weights than the range members.
721 */
722 if (lastorder == T_ELLIPSIS) {
723 int i;
724
725 if (wc < ellipsis_start) {
726 fprintf(stderr, "malformed range!");
727 return;
728 }
729 while (ellipsis_start < wc) {
730 /*
731 * pick all of the saved weights for the
732 * ellipsis. note that -1 encodes for the
733 * ellipsis itself, which means to take the
734 * current relative priority.
735 */
736 if ((cc = get_collchar(ellipsis_start, 1)) == NULL) {
737 INTERR;
738 return;
739 }
740 for (i = 0; i < NUM_WT; i++) {
741 collpri_t *p;
742 if (((ref = ellipsis_weights[i]) == -1) ||
743 ((p = get_pri(ref)) == NULL) ||
744 (p->pri == -1)) {
745 set_pri(cc->ref[i], nextpri, RESOLVED);
746 } else {
747 set_pri(cc->ref[i], ref, REFER);
748 }
749 ellipsis_weights[i] = 0;
750 }
751 ellipsis_start++;
752 nextpri++;
753 }
754 }
755
756 currchar = get_collchar(wc, 1);
757 }
758
759 void
start_order_collelem(collelem_t * e)760 start_order_collelem(collelem_t *e)
761 {
762 start_order(T_COLLELEM);
763 currelem = e;
764 }
765
766 void
start_order_ellipsis(void)767 start_order_ellipsis(void)
768 {
769 int i;
770
771 start_order(T_ELLIPSIS);
772
773 if (lastorder != T_CHAR) {
774 fprintf(stderr, "illegal starting point for range");
775 return;
776 }
777
778 for (i = 0; i < NUM_WT; i++) {
779 ellipsis_weights[i] = order_weights[i];
780 }
781 }
782
783 void
define_collelem(char * name,wchar_t * wcs)784 define_collelem(char *name, wchar_t *wcs)
785 {
786 collelem_t *e;
787 int i;
788
789 if (wcslen(wcs) >= COLLATE_STR_LEN) {
790 fprintf(stderr,"expanded collation element too long");
791 return;
792 }
793
794 if ((e = calloc(sizeof (*e), 1)) == NULL) {
795 fprintf(stderr, "out of memory");
796 return;
797 }
798 e->expand = wcs;
799 e->symbol = name;
800
801 /*
802 * This is executed before the order statement, so we don't
803 * know how many priorities we *really* need. We allocate one
804 * for each possible weight. Not a big deal, as collating-elements
805 * prove to be quite rare.
806 */
807 for (i = 0; i < COLL_WEIGHTS_MAX; i++) {
808 e->ref[i] = new_pri();
809 }
810
811 /* A character sequence can only reduce to one element. */
812 if ((RB_FIND(elem_by_symbol, &elem_by_symbol, e) != NULL) ||
813 (RB_FIND(elem_by_expand, &elem_by_expand, e) != NULL)) {
814 fprintf(stderr, "duplicate collating element definition");
815 return;
816 }
817 RB_INSERT(elem_by_symbol, &elem_by_symbol, e);
818 RB_INSERT(elem_by_expand, &elem_by_expand, e);
819 }
820
821 void
add_order_bit(int kw)822 add_order_bit(int kw)
823 {
824 uint8_t bit = DIRECTIVE_UNDEF;
825
826 switch (kw) {
827 case T_FORWARD:
828 bit = DIRECTIVE_FORWARD;
829 break;
830 case T_BACKWARD:
831 bit = DIRECTIVE_BACKWARD;
832 break;
833 case T_POSITION:
834 bit = DIRECTIVE_POSITION;
835 break;
836 default:
837 INTERR;
838 break;
839 }
840 collinfo.directive[collinfo.directive_count] |= bit;
841 }
842
843 void
add_order_directive(void)844 add_order_directive(void)
845 {
846 if (collinfo.directive_count >= COLL_WEIGHTS_MAX) {
847 fprintf(stderr,"too many directives (max %d)", COLL_WEIGHTS_MAX);
848 }
849 collinfo.directive_count++;
850 }
851
852 static void
add_order_pri(int32_t ref)853 add_order_pri(int32_t ref)
854 {
855 if (curr_weight >= NUM_WT) {
856 fprintf(stderr,"too many weights (max %d)", NUM_WT);
857 return;
858 }
859 order_weights[curr_weight] = ref;
860 curr_weight++;
861 }
862
863 void
add_order_collsym(collsym_t * s)864 add_order_collsym(collsym_t *s)
865 {
866 add_order_pri(s->ref);
867 }
868
869 void
add_order_char(wchar_t wc)870 add_order_char(wchar_t wc)
871 {
872 collchar_t *cc;
873
874 if ((cc = get_collchar(wc, 1)) == NULL) {
875 INTERR;
876 return;
877 }
878
879 add_order_pri(cc->ref[curr_weight]);
880 }
881
882 void
add_order_collelem(collelem_t * e)883 add_order_collelem(collelem_t *e)
884 {
885 add_order_pri(e->ref[curr_weight]);
886 }
887
888 void
add_order_ignore(void)889 add_order_ignore(void)
890 {
891 add_order_pri(pri_ignore);
892 }
893
894 void
add_order_symbol(char * sym)895 add_order_symbol(char *sym)
896 {
897 collundef_t *c;
898 if ((c = get_collundef(sym)) == NULL) {
899 INTERR;
900 return;
901 }
902 add_order_pri(c->ref[curr_weight]);
903 }
904
905 void
add_order_ellipsis(void)906 add_order_ellipsis(void)
907 {
908 /* special NULL value indicates self reference */
909 add_order_pri(0);
910 }
911
912 void
add_order_subst(void)913 add_order_subst(void)
914 {
915 subst_t srch;
916 subst_t *s;
917 int i;
918
919 (void) memset(&srch, 0, sizeof (srch));
920 for (i = 0; i < curr_subst; i++) {
921 srch.ref[i] = subst_weights[i];
922 subst_weights[i] = 0;
923 }
924 s = RB_FIND(substs_ref, &substs_ref[curr_weight], &srch);
925
926 if (s == NULL) {
927 if ((s = calloc(sizeof (*s), 1)) == NULL) {
928 fprintf(stderr,"out of memory");
929 return;
930 }
931 s->key = new_pri();
932
933 /*
934 * We use a self reference for our key, but we set a
935 * high bit to indicate that this is a substitution
936 * reference. This will expedite table lookups later,
937 * and prevent table lookups for situations that don't
938 * require it. (In short, its a big win, because we
939 * can skip a lot of binary searching.)
940 */
941 set_pri(s->key,
942 (nextsubst[curr_weight] | COLLATE_SUBST_PRIORITY),
943 RESOLVED);
944 nextsubst[curr_weight] += 1;
945
946 for (i = 0; i < curr_subst; i++) {
947 s->ref[i] = srch.ref[i];
948 }
949
950 RB_INSERT(substs_ref, &substs_ref[curr_weight], s);
951
952 if (RB_FIND(substs, &substs[curr_weight], s) != NULL) {
953 INTERR;
954 return;
955 }
956 RB_INSERT(substs, &substs[curr_weight], s);
957 }
958 curr_subst = 0;
959
960
961 /*
962 * We are using the current (unique) priority as a search key
963 * in the substitution table.
964 */
965 add_order_pri(s->key);
966 }
967
968 static void
add_subst_pri(int32_t ref)969 add_subst_pri(int32_t ref)
970 {
971 if (curr_subst >= COLLATE_STR_LEN) {
972 fprintf(stderr,"substitution string is too long");
973 return;
974 }
975 subst_weights[curr_subst] = ref;
976 curr_subst++;
977 }
978
979 void
add_subst_char(wchar_t wc)980 add_subst_char(wchar_t wc)
981 {
982 collchar_t *cc;
983
984
985 if (((cc = get_collchar(wc, 1)) == NULL) ||
986 (cc->wc != wc)) {
987 INTERR;
988 return;
989 }
990 /* we take the weight for the character at that position */
991 add_subst_pri(cc->ref[curr_weight]);
992 }
993
994 void
add_subst_collelem(collelem_t * e)995 add_subst_collelem(collelem_t *e)
996 {
997 add_subst_pri(e->ref[curr_weight]);
998 }
999
1000 void
add_subst_collsym(collsym_t * s)1001 add_subst_collsym(collsym_t *s)
1002 {
1003 add_subst_pri(s->ref);
1004 }
1005
1006 void
add_subst_symbol(char * ptr)1007 add_subst_symbol(char *ptr)
1008 {
1009 collundef_t *cu;
1010
1011 if ((cu = get_collundef(ptr)) != NULL) {
1012 add_subst_pri(cu->ref[curr_weight]);
1013 }
1014 }
1015
1016 void
add_weight(int32_t ref,int pass)1017 add_weight(int32_t ref, int pass)
1018 {
1019 weight_t srch;
1020 weight_t *w;
1021
1022 srch.pri = resolve_pri(ref);
1023
1024 /* No translation of ignores */
1025 if (srch.pri == 0)
1026 return;
1027
1028 /* Substitution priorities are not weights */
1029 if (srch.pri & COLLATE_SUBST_PRIORITY)
1030 return;
1031
1032 if (RB_FIND(weights, &weights[pass], &srch) != NULL)
1033 return;
1034
1035 if ((w = calloc(sizeof (*w), 1)) == NULL) {
1036 fprintf(stderr, "out of memory");
1037 return;
1038 }
1039 w->pri = srch.pri;
1040 RB_INSERT(weights, &weights[pass], w);
1041 }
1042
1043 void
add_weights(int32_t * refs)1044 add_weights(int32_t *refs)
1045 {
1046 int i;
1047 for (i = 0; i < NUM_WT; i++) {
1048 add_weight(refs[i], i);
1049 }
1050 }
1051
1052 int32_t
get_weight(int32_t ref,int pass)1053 get_weight(int32_t ref, int pass)
1054 {
1055 weight_t srch;
1056 weight_t *w;
1057 int32_t pri;
1058
1059 pri = resolve_pri(ref);
1060 if (pri & COLLATE_SUBST_PRIORITY) {
1061 return (pri);
1062 }
1063 if (pri <= 0) {
1064 return (pri);
1065 }
1066 srch.pri = pri;
1067 if ((w = RB_FIND(weights, &weights[pass], &srch)) == NULL) {
1068 INTERR;
1069 return (-1);
1070 }
1071 return (w->opt);
1072 }
1073
1074 wchar_t *
wsncpy(wchar_t * s1,const wchar_t * s2,size_t n)1075 wsncpy(wchar_t *s1, const wchar_t *s2, size_t n)
1076 {
1077 wchar_t *os1 = s1;
1078
1079 n++;
1080 while (--n > 0 && (*s1++ = *s2++) != 0)
1081 continue;
1082 if (n > 0)
1083 while (--n > 0)
1084 *s1++ = 0;
1085 return (os1);
1086 }
1087
1088 #define RB_COUNT(x, name, head, cnt) do { \
1089 (cnt) = 0; \
1090 RB_FOREACH(x, name, (head)) { \
1091 (cnt)++; \
1092 } \
1093 } while (0)
1094
1095 #define RB_NUMNODES(type, name, head, cnt) do { \
1096 type *t; \
1097 cnt = 0; \
1098 RB_FOREACH(t, name, head) { \
1099 cnt++; \
1100 } \
1101 } while (0)
1102
1103 void
dump_collate(void)1104 dump_collate(void)
1105 {
1106 FILE *f;
1107 int i, j, n;
1108 size_t sz;
1109 int32_t pri;
1110 collelem_t *ce;
1111 collchar_t *cc;
1112 subst_t *sb;
1113 char vers[COLLATE_STR_LEN];
1114 collate_char_t chars[UCHAR_MAX + 1];
1115 collate_large_t *large;
1116 collate_subst_t *subst[COLL_WEIGHTS_MAX];
1117 collate_chain_t *chain;
1118
1119 /*
1120 * We have to run throught a preliminary pass to identify all the
1121 * weights that we use for each sorting level.
1122 */
1123 for (i = 0; i < NUM_WT; i++) {
1124 add_weight(pri_ignore, i);
1125 }
1126 for (i = 0; i < NUM_WT; i++) {
1127 RB_FOREACH(sb, substs, &substs[i]) {
1128 for (j = 0; sb->ref[j]; j++) {
1129 add_weight(sb->ref[j], i);
1130 }
1131 }
1132 }
1133 RB_FOREACH(ce, elem_by_expand, &elem_by_expand) {
1134 add_weights(ce->ref);
1135 }
1136 RB_FOREACH(cc, collchars, &collchars) {
1137 add_weights(cc->ref);
1138 }
1139
1140 /*
1141 * Now we walk the entire set of weights, removing the gaps
1142 * in the weights. This gives us optimum usage. The walk
1143 * occurs in priority.
1144 */
1145 for (i = 0; i < NUM_WT; i++) {
1146 weight_t *w;
1147 RB_FOREACH(w, weights, &weights[i]) {
1148 w->opt = nweight[i];
1149 nweight[i] += 1;
1150 }
1151 }
1152
1153 (void) memset(&chars, 0, sizeof (chars));
1154 (void) memset(vers, 0, COLLATE_STR_LEN);
1155 (void) strlcpy(vers, COLLATE_VERSION, sizeof (vers));
1156
1157 /*
1158 * We need to make sure we arrange for the UNDEFINED field
1159 * to show up. Also, set the total weight counts.
1160 */
1161 for (i = 0; i < NUM_WT; i++) {
1162 if (resolve_pri(pri_undefined[i]) == -1) {
1163 set_pri(pri_undefined[i], -1, RESOLVED);
1164 /* they collate at the end of everything else */
1165 collinfo.undef_pri[i] = COLLATE_MAX_PRIORITY;
1166 }
1167 collinfo.pri_count[i] = nweight[i];
1168 }
1169
1170 collinfo.pri_count[NUM_WT] = max_wide();
1171 collinfo.undef_pri[NUM_WT] = COLLATE_MAX_PRIORITY;
1172 collinfo.directive[NUM_WT] = DIRECTIVE_UNDEFINED;
1173
1174 /*
1175 * Ordinary character priorities
1176 */
1177 for (i = 0; i <= UCHAR_MAX; i++) {
1178 if ((cc = get_collchar(i, 0)) != NULL) {
1179 for (j = 0; j < NUM_WT; j++) {
1180 chars[i].pri[j] = get_weight(cc->ref[j], j);
1181 }
1182 } else {
1183 for (j = 0; j < NUM_WT; j++) {
1184 chars[i].pri[j] =
1185 get_weight(pri_undefined[j], j);
1186 }
1187 /*
1188 * Per POSIX, for undefined characters, we
1189 * also have to add a last item, which is the
1190 * character code.
1191 */
1192 chars[i].pri[NUM_WT] = i;
1193 }
1194 }
1195
1196 /*
1197 * Substitution tables
1198 */
1199 for (i = 0; i < NUM_WT; i++) {
1200 collate_subst_t *st = NULL;
1201 subst_t *temp;
1202 RB_COUNT(temp, substs, &substs[i], n);
1203 collinfo.subst_count[i] = n;
1204 if ((st = calloc(sizeof (collate_subst_t) * n, 1)) == NULL) {
1205 fprintf(stderr, "out of memory");
1206 return;
1207 }
1208 n = 0;
1209 RB_FOREACH(sb, substs, &substs[i]) {
1210 if ((st[n].key = resolve_pri(sb->key)) < 0) {
1211 /* by definition these resolve! */
1212 INTERR;
1213 }
1214 if (st[n].key != (n | COLLATE_SUBST_PRIORITY)) {
1215 INTERR;
1216 }
1217 for (j = 0; sb->ref[j]; j++) {
1218 st[n].pri[j] = get_weight(sb->ref[j], i);
1219 }
1220 n++;
1221 }
1222 if (n != collinfo.subst_count[i])
1223 INTERR;
1224 subst[i] = st;
1225 }
1226
1227
1228 /*
1229 * Chains, i.e. collating elements
1230 */
1231 RB_NUMNODES(collelem_t, elem_by_expand, &elem_by_expand,
1232 collinfo.chain_count);
1233 chain = calloc(sizeof (collate_chain_t), collinfo.chain_count);
1234 if (chain == NULL) {
1235 fprintf(stderr, "out of memory");
1236 return;
1237 }
1238 n = 0;
1239 RB_FOREACH(ce, elem_by_expand, &elem_by_expand) {
1240 (void) wsncpy(chain[n].str, ce->expand, COLLATE_STR_LEN);
1241 for (i = 0; i < NUM_WT; i++) {
1242 chain[n].pri[i] = get_weight(ce->ref[i], i);
1243 }
1244 n++;
1245 }
1246 if (n != collinfo.chain_count)
1247 INTERR;
1248
1249 /*
1250 * Large (> UCHAR_MAX) character priorities
1251 */
1252 RB_NUMNODES(collchar_t, collchars, &collchars, n);
1253 large = calloc(n, sizeof (collate_large_t));
1254 if (large == NULL) {
1255 fprintf(stderr, "out of memory");
1256 return;
1257 }
1258
1259 i = 0;
1260 RB_FOREACH(cc, collchars, &collchars) {
1261 int undef = 0;
1262 /* we already gathered those */
1263 if (cc->wc <= UCHAR_MAX)
1264 continue;
1265 for (j = 0; j < NUM_WT; j++) {
1266 if ((pri = get_weight(cc->ref[j], j)) < 0) {
1267 undef = 1;
1268 }
1269 if (undef && (pri >= 0)) {
1270 /* if undefined, then all priorities are */
1271 INTERR;
1272 } else {
1273 large[i].pri.pri[j] = pri;
1274 }
1275 }
1276 if (!undef) {
1277 large[i].val = cc->wc;
1278 collinfo.large_count = i++;
1279 }
1280 }
1281
1282 if ((f = open_category()) == NULL) {
1283 return;
1284 }
1285
1286 /* Time to write the entire data set out */
1287
1288 if ((wr_category(vers, COLLATE_STR_LEN, f) < 0) ||
1289 (wr_category(&collinfo, sizeof (collinfo), f) < 0) ||
1290 (wr_category(&chars, sizeof (chars), f) < 0)) {
1291 return;
1292 }
1293
1294 for (i = 0; i < NUM_WT; i++) {
1295 sz = sizeof (collate_subst_t) * collinfo.subst_count[i];
1296 if (wr_category(subst[i], sz, f) < 0) {
1297 return;
1298 }
1299 }
1300 sz = sizeof (collate_chain_t) * collinfo.chain_count;
1301 if (wr_category(chain, sz, f) < 0) {
1302 return;
1303 }
1304 sz = sizeof (collate_large_t) * collinfo.large_count;
1305 if (wr_category(large, sz, f) < 0) {
1306 return;
1307 }
1308
1309 close_category(f);
1310 }
1311