xref: /dragonfly/usr.bin/localedef/wide.c (revision adb6cc9d6514221e978fa839ea26f7070fa1ac7a)
1 /*
2  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
3  * Copyright 2012 Garrett D'Amore <garrett@damore.org>  All rights reserved.
4  * Copyright 2015 John Marino <draco@marino.st>
5  *
6  * This source code is derived from the illumos localedef command, and
7  * provided under BSD-style license terms by Nexenta Systems, Inc.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  *
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
23  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*
33  * The functions in this file convert from the standard multibyte forms
34  * to the wide character forms used internally by libc.  Unfortunately,
35  * this approach means that we need a method for each and every encoding.
36  */
37 
38 #include <ctype.h>
39 #include <stdint.h>
40 #include <stdlib.h>
41 #include <wchar.h>
42 #include <string.h>
43 #include <sys/types.h>
44 #include "localedef.h"
45 
/*
 * Per-encoding decoders.  Each one converts the character at the start of
 * `mb' (of which at most `n' bytes may be examined) into `*wc' and returns
 * the number of bytes consumed, or -1 after recording a message via werr().
 */
static int towide_none(wchar_t *wc, const char *mb, unsigned n);
static int towide_utf8(wchar_t *wc, const char *mb, unsigned n);
static int towide_big5(wchar_t *wc, const char *mb, unsigned n);
static int towide_gbk(wchar_t *wc, const char *mb, unsigned n);
static int towide_gb2312(wchar_t *wc, const char *mb, unsigned n);
static int towide_gb18030(wchar_t *wc, const char *mb, unsigned n);
static int towide_mskanji(wchar_t *wc, const char *mb, unsigned n);
static int towide_euccn(wchar_t *wc, const char *mb, unsigned n);
static int towide_eucjp(wchar_t *wc, const char *mb, unsigned n);
static int towide_euckr(wchar_t *wc, const char *mb, unsigned n);
static int towide_euctw(wchar_t *wc, const char *mb, unsigned n);

/*
 * Per-encoding encoders.  Each one writes the NUL-terminated multibyte form
 * of `wc' into `mb' and returns its length in bytes (a negative value means
 * the character could not be encoded).
 */
static int tomb_none(char *mb, wchar_t wc);
static int tomb_utf8(char *mb, wchar_t wc);
static int tomb_mbs(char *mb, wchar_t wc);

/*
 * Currently selected conversion state.  Until set_wide_encoding() is called
 * the single-byte "NONE" handlers are in effect.
 */
static int (*_towide)(wchar_t *, const char *, unsigned) = towide_none;
static int (*_tomb)(char *, wchar_t) = tomb_none;
/* backing store for the "NONE:<name>" string built by set_wide_encoding() */
static char _encoding_buffer[20] = "NONE";
static const char *_encoding = _encoding_buffer;
static int _nbits = 7;
67 
68 /*
69  * Table of supported encodings.  We only bother to list the multibyte
70  * encodings here, because single byte locales are handed by "NONE".
71  */
72 static struct {
73           const char *name;
74           /* the name that the underlying libc implemenation uses */
75           const char *cname;
76           /* the maximum number of bits required for priorities */
77           int nbits;
78           int (*towide)(wchar_t *, const char *, unsigned);
79           int (*tomb)(char *, wchar_t);
80 } mb_encodings[] = {
81           /*
82            * UTF8 values max out at 0x1fffff (although in theory there could
83            * be later extensions, but it won't happen.)  This means we only need
84            * 21 bits to be able to encode the entire range of priorities.
85            */
86           { "UTF-8",          "UTF-8",  21, towide_utf8, tomb_utf8 },
87           { "UTF8", "UTF-8",  21, towide_utf8, tomb_utf8 },
88           { "utf8", "UTF-8",  21, towide_utf8, tomb_utf8 },
89           { "utf-8",          "UTF-8",  21, towide_utf8, tomb_utf8 },
90 
91           { "EUC-CN",         "EUC-CN", 16, towide_euccn, tomb_mbs },
92           { "eucCN",          "EUC-CN", 16, towide_euccn, tomb_mbs },
93           /*
94            * Becuase the 3-byte form of EUC-JP use the same leading byte,
95            * only 17 bits required to provide unique priorities.  (The low
96            * bit of that first byte is set.)  By setting this value low,
97            * we can get by with only 3 bytes in the strxfrm expansion.
98            */
99           { "EUC-JP",         "EUC-JP", 17, towide_eucjp, tomb_mbs },
100           { "eucJP",          "EUC-JP", 17, towide_eucjp, tomb_mbs },
101 
102           { "EUC-KR",         "EUC-KR", 16, towide_euckr, tomb_mbs },
103           { "eucKR",          "EUC-KR", 16, towide_euckr, tomb_mbs },
104           /*
105            * EUC-TW uses 2 bytes most of the time, but 4 bytes if the
106            * high order byte is 0x8E.  However, with 4 byte encodings,
107            * the third byte will be A0-B0.  So we only need to consider
108            * the lower order 24 bits for collation.
109            */
110           { "EUC-TW",         "EUC-TW", 24, towide_euctw, tomb_mbs },
111           { "eucTW",          "EUC-TW", 24, towide_euctw, tomb_mbs },
112 
113           { "MS_Kanji",       "MSKanji",          16, towide_mskanji, tomb_mbs },
114           { "MSKanji",        "MSKanji",          16, towide_mskanji, tomb_mbs },
115           { "PCK",  "MSKanji",          16, towide_mskanji, tomb_mbs },
116           { "SJIS", "MSKanji",          16, towide_mskanji, tomb_mbs },
117           { "Shift_JIS",      "MSKanji",          16, towide_mskanji, tomb_mbs },
118 
119           { "BIG5", "BIG5",             16, towide_big5, tomb_mbs },
120           { "big5", "BIG5",             16, towide_big5, tomb_mbs },
121           { "Big5", "BIG5",             16, towide_big5, tomb_mbs },
122 
123           { "GBK",  "GBK",              16, towide_gbk,     tomb_mbs },
124 
125           /*
126            * GB18030 can get away with just 31 bits.  This is because the
127            * high order bit is always set for 4 byte values, and the
128            * at least one of the other bits in that 4 byte value will
129            * be non-zero.
130            */
131           { "GB18030",        "GB18030",          31, towide_gb18030, tomb_mbs },
132 
133           /*
134            * This should probably be an aliase for euc-cn, or vice versa.
135            */
136           { "GB2312",         "GB2312", 16, towide_gb2312, tomb_mbs },
137 
138           { NULL, NULL, 0, 0, 0 },
139 };
140 
/*
 * Render a multibyte sequence in a form suitable for diagnostics.
 *
 * If the first byte is a printable (graphic) ASCII character, only that
 * one character is returned.  Otherwise every byte of the NUL-terminated
 * sequence is shown as a "\xNN" escape.  Bytes are treated as unsigned so
 * that values of 0x80 and above print as two hex digits rather than as a
 * sign-extended (and then truncated) value.
 *
 * The result lives in a static buffer that is overwritten by the next
 * call.  Output is cut off at the last escape that fits in the buffer.
 */
static char *
show_mb(const char *mb)
{
	static char buf[64];
	const unsigned char *p = (const unsigned char *)mb;
	size_t off = 0;

	/* ASCII stuff we just print */
	if (*p < 0x80 && isgraph(*p)) {
		buf[0] = (char)*p;
		buf[1] = '\0';
		return (buf);
	}

	buf[0] = '\0';
	/* each escape is exactly 4 characters; stop while one still fits */
	while (*p != 0 && off + 4 < sizeof (buf)) {
		(void) snprintf(buf + off, sizeof (buf) - off, "\\x%02x",
		    (unsigned)*p);
		off += 4;
		p++;
	}
	return (buf);
}
161 
162 static char         *widemsg;
163 
164 __printflike(1, 2) void
werr(const char * fmt,...)165 werr(const char *fmt, ...)
166 {
167           char      *msg;
168 
169           va_list   va;
170           va_start(va, fmt);
171           (void) vasprintf(&msg, fmt, va);
172           va_end(va);
173 
174           free(widemsg);
175           widemsg = msg;
176 }
177 
178 /*
179  * This is used for 8-bit encodings.
180  */
181 int
towide_none(wchar_t * c,const char * mb,unsigned n __unused)182 towide_none(wchar_t *c, const char *mb, unsigned n __unused)
183 {
184           if (mb_cur_max != 1) {
185                     werr("invalid or unsupported multibyte locale");
186                     return (-1);
187           }
188           *c = (uint8_t)*mb;
189           return (1);
190 }
191 
192 int
tomb_none(char * mb,wchar_t wc)193 tomb_none(char *mb, wchar_t wc)
194 {
195           if (mb_cur_max != 1) {
196                     werr("invalid or unsupported multibyte locale");
197                     return (-1);
198           }
199           *(uint8_t *)mb = (wc & 0xff);
200           mb[1] = 0;
201           return (1);
202 }
203 
/*
 * UTF-8 stores wide characters in UTF-32 form.
 *
 * Decodes the sequence at mb, of which at most n bytes may be examined,
 * into *wc.  Returns the number of bytes consumed (1 to 4), or -1 after
 * recording a message through werr() when the lead byte is not a valid
 * 1-4 byte introducer, the sequence is longer than n, a continuation
 * byte is malformed, or the value could have been encoded in fewer bytes.
 */
int
towide_utf8(wchar_t *wc, const char *mb, unsigned n)
{
	wchar_t	c;
	int	nb;
	wchar_t	lv;	/* lowest legal value */
	int	i;
	const uint8_t *s = (const uint8_t *)mb;

	c = *s;

	if ((c & 0x80) == 0) {
		/* 7-bit ASCII */
		*wc = c;
		return (1);
	} else if ((c & 0xe0) == 0xc0) {
		/* u80-u7ff - two bytes encoded */
		nb = 2;
		lv = 0x80;
		c &= 0x1f;
	} else if ((c & 0xf0) == 0xe0) {
		/* u800-uffff - three bytes encoded */
		nb = 3;
		lv = 0x800;
		c &= 0x0f;
	} else if ((c & 0xf8) == 0xf0) {
		/*
		 * u10000-u1fffff - four bytes encoded.  Anything below
		 * 0x10000 fits in three bytes, so a four byte form of it
		 * is a redundant (overlong) encoding.
		 */
		nb = 4;
		lv = 0x10000;
		c &= 0x07;
	} else {
		/* 5 and 6 byte encodings are not legal unicode */
		werr("utf8 encoding too large (%s)", show_mb(mb));
		return (-1);
	}
	if (nb > (int)n) {
		werr("incomplete utf8 sequence (%s)", show_mb(mb));
		return (-1);
	}

	/* fold in six payload bits from each continuation byte */
	for (i = 1; i < nb; i++) {
		if (((s[i]) & 0xc0) != 0x80) {
			werr("illegal utf8 byte (%x)", s[i]);
			return (-1);
		}
		c <<= 6;
		c |= (s[i] & 0x3f);
	}

	if (c < lv) {
		werr("illegal redundant utf8 encoding (%s)", show_mb(mb));
		return (-1);
	}
	*wc = c;
	return (nb);
}
263 
/*
 * Encode wc as UTF-8 into mb, followed by a NUL terminator.
 *
 * Returns the number of bytes written, not counting the terminator
 * (1 to 4).  Values above 0x1fffff are rejected with -1 after a message
 * is recorded through werr().  The value is handled as unsigned, so a
 * negative wchar_t is rejected as well instead of being emitted as a
 * single ASCII byte.
 */
int
tomb_utf8(char *mb, wchar_t wc)
{
	uint8_t *s = (uint8_t *)mb;
	uint32_t v = (uint32_t)wc;
	uint8_t lead;	/* length marker bits of the first byte */
	int cnt;
	int i;

	if (v <= 0x7f) {
		s[0] = (uint8_t)v;
		s[1] = 0;
		return (1);
	}
	if (v <= 0x7ff) {
		cnt = 2;
		lead = 0xc0;
	} else if (v <= 0xffff) {
		cnt = 3;
		lead = 0xe0;
	} else if (v <= 0x1fffff) {
		cnt = 4;
		lead = 0xf0;
	} else {
		werr("illegal utf8 char (%x)", (unsigned)v);
		return (-1);
	}

	/* continuation bytes are filled last to first, six bits each */
	for (i = cnt - 1; i > 0; i--) {
		s[i] = (uint8_t)((v & 0x3f) | 0x80);
		v >>= 6;
	}
	/* whatever is left fits beside the marker bits in the lead byte */
	s[0] = (uint8_t)(lead | v);
	s[cnt] = 0;
	return (cnt);
}
298 
/*
 * Several encodings share a simplistic dual byte encoding.  In these
 * forms, a two byte sequence is used whenever the first byte has its
 * high bit set.  The pair is stored as a single 16-bit value, although
 * a great many of the possible code points are not used in most
 * character sets.  This gives a possible set of just over 32,000 valid
 * code points.
 *
 * 0x00 - 0x7f		- 1 byte encoding
 * 0x80 - 0x7fff	- illegal
 * 0x8000 - 0xffff	- 2 byte encoding
 *
 * Returns the number of bytes consumed (1 or 2), or -1 after recording
 * a message through werr() when a second byte is needed but n < 2.
 */
static int
towide_dbcs(wchar_t *wc, const char *mb, unsigned n)
{
	const uint8_t *bytes = (const uint8_t *)mb;
	wchar_t lead = bytes[0];

	/* high bit clear: a plain 7-bit character */
	if (lead < 0x80) {
		*wc = lead;
		return (1);
	}

	if (n < 2) {
		werr("incomplete character sequence (%s)", show_mb(mb));
		return (-1);
	}

	/* lead byte in the high half, trail byte in the low half */
	*wc = (lead << 8) | bytes[1];
	return (2);
}
335 
/*
 * Most multibyte locales just convert the wide character to the multibyte
 * form by stripping leading null bytes, and writing the 32-bit quantity
 * in big-endian order.
 *
 * At least one byte is always written (so a zero value produces a single
 * zero byte), and a NUL terminator is stored after the last byte.
 * Returns the number of bytes written, not counting the terminator.
 */
int
tomb_mbs(char *mb, wchar_t wc)
{
	uint8_t *out = (uint8_t *)mb;
	uint32_t v = (uint32_t)wc;
	int len;
	int i;

	/* the length is set by the most significant non-zero byte */
	if (v > 0x00ffffffU)
		len = 4;
	else if (v > 0x0000ffffU)
		len = 3;
	else if (v > 0x000000ffU)
		len = 2;
	else
		len = 1;

	/* emit from the least significant byte backwards */
	for (i = len - 1; i >= 0; i--) {
		out[i] = (uint8_t)(v & 0xff);
		v >>= 8;
	}

	/* ensure null termination */
	out[len] = 0;
	return (len);
}
366 
367 
/*
 * Big5 is a simple dual byte character set, so decoding is handed
 * straight to the shared DBCS routine.
 */
int
towide_big5(wchar_t *wc, const char *mb, unsigned n)
{
	int used = towide_dbcs(wc, mb, n);

	return (used);
}
376 
/*
 * GBK encodes wides in the same way that Big5 does: the high order bit
 * of the first byte indicates a double byte character.  Decoding is
 * handed straight to the shared DBCS routine.
 */
int
towide_gbk(wchar_t *wc, const char *mb, unsigned n)
{
	int used = towide_dbcs(wc, mb, n);

	return (used);
}
386 
/*
 * GB2312 is another DBCS.  It is described as cleaner than the others in
 * that its second byte does not encode ASCII; this decoder does not
 * check that and simply uses the shared DBCS routine.
 */
int
towide_gb2312(wchar_t *wc, const char *mb, unsigned n)
{
	int used = towide_dbcs(wc, mb, n);

	return (used);
}
396 
/*
 * GB18030.  This encodes as 8, 16, or 32 bits.
 * 7-bit values are in 1 byte, 4 byte sequences are used when
 * the second byte is in 0x30-0x39, and all other sequences are 2 bytes.
 *
 * Returns the number of bytes consumed (1, 2 or 4), or -1 after recording
 * a message through werr() when fewer than the required bytes are
 * available in the n bytes at mb.
 *
 * The value is assembled in an unsigned 32-bit accumulator: a 4 byte
 * sequence always has its top bit set, and shifting that into the sign
 * bit of a signed wchar_t would be undefined behaviour.
 */
int
towide_gb18030(wchar_t *wc, const char *mb, unsigned n)
{
	const uint8_t *s = (const uint8_t *)mb;
	uint32_t c = s[0];

	if ((c & 0x80) == 0) {
		/* 7-bit */
		*wc = (wchar_t)c;
		return (1);
	}
	if (n < 2) {
		werr("incomplete character sequence (%s)", show_mb(mb));
		return (-1);
	}

	/* pull in the second byte */
	c = (c << 8) | s[1];

	/* a second byte of 0x30-0x39 marks the 4 byte form */
	if (s[1] >= 0x30 && s[1] <= 0x39) {
		if (n < 4) {
			werr("incomplete 4-byte character sequence (%s)",
			    show_mb(mb));
			return (-1);
		}
		c = (c << 8) | s[2];
		c = (c << 8) | s[3];
		*wc = (wchar_t)c;
		return (4);
	}

	*wc = (wchar_t)c;
	return (2);
}
440 
/*
 * MS-Kanji (aka SJIS) is almost a clean DBCS like the others, but it
 * also has a range of single byte characters above 0x80 (0xa1-0xdf).
 *
 * Returns the number of bytes consumed (1 or 2), or -1 after recording
 * a message through werr() when a second byte is needed but n < 2.
 */
int
towide_mskanji(wchar_t *wc, const char *mb, unsigned n)
{
	const uint8_t *bytes = (const uint8_t *)mb;
	wchar_t lead = bytes[0];
	int single;

	/* 7-bit values and the 0xa1-0xdf range stand alone */
	single = (lead <= 0x7f) || (lead >= 0xa1 && lead <= 0xdf);
	if (single) {
		*wc = lead;
		return (1);
	}

	if (n < 2) {
		werr("incomplete character sequence (%s)", show_mb(mb));
		return (-1);
	}

	/* Store both bytes as a single 16-bit wide. */
	*wc = (lead << 8) | bytes[1];
	return (2);
}
469 
/*
 * EUC forms.  EUC encodings are "variable".  FreeBSD carries some additional
 * variable data to encode these, but we're going to treat each as independent
 * instead.  It's the only way we can sensibly move forward.
 *
 * Note that the way in which the different EUC forms vary is how wide
 * CS2 and CS3 are and what the first byte of them is.
 *
 * cs2/cs3 are the lead bytes that introduce code sets 2 and 3, and
 * cs2width/cs3width are the total sequence lengths (lead byte included)
 * for each.  A code set that is not used is passed as 0.
 *
 * Returns the number of bytes consumed, or -1 after recording a message
 * through werr() when the n bytes at mb are too few for the sequence.
 *
 * The value is assembled in an unsigned 32-bit accumulator: 4 byte forms
 * have their top bit set, and shifting that into the sign bit of a signed
 * wchar_t would be undefined behaviour.
 */
static int
towide_euc_impl(wchar_t *wc, const char *mb, unsigned n,
    uint8_t cs2, uint8_t cs2width, uint8_t cs3, uint8_t cs3width)
{
	const uint8_t *s = (const uint8_t *)mb;
	uint32_t c = s[0];
	/* high-bit lead bytes matching none of the cases below stay 2 wide */
	int width = 2;
	int i;

	/*
	 * All variations of EUC encode 7-bit ASCII as one byte, and use
	 * additional bytes for more than that.
	 */
	if ((c & 0x80) == 0) {
		/* 7-bit */
		*wc = (wchar_t)c;
		return (1);
	}

	/*
	 * All EUC variants reserve 0xa1-0xff to identify CS1, which
	 * is always two bytes wide.  Note that an unused CS will be zero,
	 * and that can never match here because we know that the high
	 * order bit of c is set.
	 */
	if (c >= 0xa1) {
		width = 2;
	} else if (c == cs2) {
		width = cs2width;
	} else if (c == cs3) {
		width = cs3width;
	}

	if ((int)n < width) {
		werr("incomplete character sequence (%s)", show_mb(mb));
		return (-1);
	}

	/* pull in the remaining bytes, most significant first */
	for (i = 1; i < width; i++)
		c = (c << 8) | s[i];

	*wc = (wchar_t)c;
	return (width);
}
526 
/*
 * EUC-CN encodes as follows:
 *
 * Code set 0 (ASCII):				0x21-0x7E
 * Code set 1 (CNS 11643-1992 Plane 1):		0xA1A1-0xFEFE
 * Code set 2:					unused
 * Code set 3:					unused
 *
 * NOTE(review): the code set 1 label above is the same as the one given
 * for EUC-TW and may have been copied from it - confirm.  Also, although
 * code set 2 is listed as unused, a lead byte of 0x8e is still decoded
 * here as a 4 byte sequence.
 */
int
towide_euccn(wchar_t *wc, const char *mb, unsigned n)
{
	const uint8_t ss2 = 0x8e;	/* lead byte handed over as CS2 */

	return (towide_euc_impl(wc, mb, n, ss2, 4, 0, 0));
}
540 
/*
 * EUC-JP encodes as follows:
 *
 * Code set 0 (ASCII or JIS X 0201-1976 Roman):	0x21-0x7E
 * Code set 1 (JIS X 0208):			0xA1A1-0xFEFE
 * Code set 2 (half-width katakana):		0x8EA1-0x8EDF
 * Code set 3 (JIS X 0212-1990):		0x8FA1A1-0x8FFEFE
 *
 * So 0x8e introduces a 2 byte sequence and 0x8f a 3 byte sequence.
 */
int
towide_eucjp(wchar_t *wc, const char *mb, unsigned n)
{
	const uint8_t ss2 = 0x8e;	/* lead byte of code set 2 */
	const uint8_t ss3 = 0x8f;	/* lead byte of code set 3 */

	return (towide_euc_impl(wc, mb, n, ss2, 2, ss3, 3));
}
554 
/*
 * EUC-KR encodes as follows:
 *
 * Code set 0 (ASCII or KS C 5636-1993):	0x21-0x7E
 * Code set 1 (KS C 5601-1992):			0xA1A1-0xFEFE
 * Code set 2:					unused
 * Code set 3:					unused
 *
 * Both optional code sets are passed as 0, so every sequence is either
 * 1 or 2 bytes long.
 */
int
towide_euckr(wchar_t *wc, const char *mb, unsigned n)
{
	const uint8_t unused = 0;	/* no CS2 or CS3 lead byte */

	return (towide_euc_impl(wc, mb, n, unused, 0, unused, 0));
}
568 
/*
 * EUC-TW encodes as follows:
 *
 * Code set 0 (ASCII):				0x21-0x7E
 * Code set 1 (CNS 11643-1992 Plane 1):		0xA1A1-0xFEFE
 * Code set 2 (CNS 11643-1992 Planes 1-16):	0x8EA1A1A1-0x8EB0FEFE
 * Code set 3:					unused
 *
 * So 0x8e introduces a 4 byte sequence.
 */
int
towide_euctw(wchar_t *wc, const char *mb, unsigned n)
{
	const uint8_t ss2 = 0x8e;	/* lead byte of code set 2 */

	return (towide_euc_impl(wc, mb, n, ss2, 4, 0, 0));
}
582 
583 /*
584  * Public entry points.
585  */
586 
/*
 * Decode the first character of the NUL-terminated string mb into *wc
 * using the currently selected encoding.  Returns whatever the decoder
 * returns: the number of bytes consumed, or a negative value.
 */
int
to_wide(wchar_t *wc, const char *mb)
{
	unsigned avail = strlen(mb);

	/*
	 * This won't fail hard: unlike to_mb(), a failure is not passed
	 * to errf() here, the caller only sees the return value.
	 */
	return (_towide(wc, mb, avail));
}
593 
594 int
to_mb(char * mb,wchar_t wc)595 to_mb(char *mb, wchar_t wc)
596 {
597           int       rv;
598 
599           if ((rv = _tomb(mb, wc)) < 0) {
600                     errf(widemsg);
601                     free(widemsg);
602                     widemsg = NULL;
603           }
604           return (rv);
605 }
606 
607 char *
to_mb_string(const wchar_t * wcs)608 to_mb_string(const wchar_t *wcs)
609 {
610           char      *mbs;
611           char      *ptr;
612           int       len;
613 
614           mbs = malloc((wcslen(wcs) * mb_cur_max) + 1);
615           if (mbs == NULL) {
616                     errf("out of memory");
617                     return (NULL);
618           }
619           ptr = mbs;
620           while (*wcs) {
621                     if ((len = to_mb(ptr, *wcs)) < 0) {
622                               INTERR;
623                               free(mbs);
624                               return (NULL);
625                     }
626                     wcs++;
627                     ptr += len;
628           }
629           *ptr = 0;
630           return (mbs);
631 }
632 
633 void
set_wide_encoding(const char * encoding)634 set_wide_encoding(const char *encoding)
635 {
636           int i;
637 
638           _towide = towide_none;
639           _tomb = tomb_none;
640           _nbits = 8;
641 
642           snprintf(_encoding_buffer, sizeof(_encoding_buffer), "NONE:%s",
643               encoding);
644           for (i = 0; mb_encodings[i].name; i++) {
645                     if (strcasecmp(encoding, mb_encodings[i].name) == 0) {
646                               _towide = mb_encodings[i].towide;
647                               _tomb = mb_encodings[i].tomb;
648                               _encoding = mb_encodings[i].cname;
649                               _nbits = mb_encodings[i].nbits;
650                               break;
651                     }
652           }
653 }
654 
655 const char *
get_wide_encoding(void)656 get_wide_encoding(void)
657 {
658           return (_encoding);
659 }
660 
661 int
max_wide(void)662 max_wide(void)
663 {
664           return ((int)((1U << _nbits) - 1));
665 }
666