1 /*        $NetBSD: utf-8-conv.c,v 1.3 2021/08/14 16:14:56 christos Exp $        */
2 
3 /* $OpenLDAP$ */
4 /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
5  *
6  * Copyright 1998-2021 The OpenLDAP Foundation.
7  * All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted only as authorized by the OpenLDAP
11  * Public License.
12  *
13  * A copy of this license is available in the file LICENSE in the
14  * top-level directory of the distribution or, alternatively, at
15  * <http://www.OpenLDAP.org/license.html>.
16  */
17 /* Portions Copyright (C) 1999, 2000 Novell, Inc. All Rights Reserved.
18  *
19  * THIS WORK IS SUBJECT TO U.S. AND INTERNATIONAL COPYRIGHT LAWS AND
20  * TREATIES. USE, MODIFICATION, AND REDISTRIBUTION OF THIS WORK IS SUBJECT
21  * TO VERSION 2.0.1 OF THE OPENLDAP PUBLIC LICENSE, A COPY OF WHICH IS
22  * AVAILABLE AT HTTP://WWW.OPENLDAP.ORG/LICENSE.HTML OR IN THE FILE "LICENSE"
23  * IN THE TOP-LEVEL DIRECTORY OF THE DISTRIBUTION. ANY USE OR EXPLOITATION
24  * OF THIS WORK OTHER THAN AS AUTHORIZED IN VERSION 2.0.1 OF THE OPENLDAP
25  * PUBLIC LICENSE, OR OTHER PRIOR WRITTEN CONSENT FROM NOVELL, COULD SUBJECT
26  * THE PERPETRATOR TO CRIMINAL AND CIVIL LIABILITY.
27  *---
28  * Note: A verbatim copy of version 2.0.1 of the OpenLDAP Public License
29  * can be found in the file "build/LICENSE-2.0.1" in this distribution
30  * of OpenLDAP Software.
31  */
32 
33 /*
34  * UTF-8 Conversion Routines
35  *
36  * These routines convert between Wide Character and UTF-8,
37  * or between MultiByte and UTF-8 encodings.
38  *
39  * Both single character and string versions of the functions are provided.
40  * All functions return -1 if the character or string cannot be converted.
41  */
42 
43 #include <sys/cdefs.h>
44 __RCSID("$NetBSD: utf-8-conv.c,v 1.3 2021/08/14 16:14:56 christos Exp $");
45 
46 #include "portable.h"
47 
48 #if SIZEOF_WCHAR_T >= 4
49 /* These routines assume ( sizeof(wchar_t) >= 4 ) */
50 
#include <stdio.h>
#include <limits.h>			/* For MB_LEN_MAX */
#include <ac/stdlib.h>			/* For wctomb, wcstombs, mbtowc, mbstowcs */
#include <ac/string.h>
#include <ac/time.h>			/* for time_t */

#include "ldap-int.h"

#include <ldap_utf8.h>
59 
/* Payload-bit masks for the first byte of a UTF-8 sequence, indexed by the
 * sequence length in bytes (1..6).  Slot 0 is a placeholder so the length
 * can be used directly as the index.  The table is read-only. */
static const unsigned char mask[] = { 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
61 
62 
63 /*-----------------------------------------------------------------------------
64                                                   UTF-8 Format Summary
65 
66 ASCII chars                                                           7 bits
67     0xxxxxxx
68 
69 2-character UTF-8 sequence:        11 bits
70     110xxxxx  10xxxxxx
71 
72 3-character UTF-8                  16 bits
73     1110xxxx  10xxxxxx  10xxxxxx
74 
75 4-char UTF-8                       21 bits
76     11110xxx  10xxxxxx  10xxxxxx  10xxxxxx
77 
78 5-char UTF-8                       26 bits
79     111110xx  10xxxxxx  10xxxxxx  10xxxxxx  10xxxxxx
80 
81 6-char UTF-8                       31 bits
82     1111110x  10xxxxxx  10xxxxxx  10xxxxxx  10xxxxxx  10xxxxxx
83 
84 Unicode address space   (0 - 0x10FFFF)    21 bits
85 ISO-10646 address space (0 - 0x7FFFFFFF)  31 bits
86 
87 Note: This code does not prevent UTF-8 sequences which are longer than
88       necessary from being decoded.
89 */
90 
91 /*-----------------------------------------------------------------------------
92    Convert a UTF-8 character to a wide char.
93    Return the length of the UTF-8 input character in bytes.
94 */
95 int
ldap_x_utf8_to_wc(wchar_t * wchar,const char * utf8char)96 ldap_x_utf8_to_wc ( wchar_t *wchar, const char *utf8char )
97 {
98           int utflen, i;
99           wchar_t ch;
100 
101           if (utf8char == NULL) return -1;
102 
103           /* Get UTF-8 sequence length from 1st byte */
104           utflen = LDAP_UTF8_CHARLEN2(utf8char, utflen);
105 
106           if( utflen==0 || utflen > (int)LDAP_MAX_UTF8_LEN ) return -1;
107 
108           /* First byte minus length tag */
109           ch = (wchar_t)(utf8char[0] & mask[utflen]);
110 
111           for(i=1; i < utflen; i++) {
112                     /* Subsequent bytes must start with 10 */
113                     if ((utf8char[i] & 0xc0) != 0x80) return -1;
114 
115                     ch <<= 6;                     /* 6 bits of data in each subsequent byte */
116                     ch |= (wchar_t)(utf8char[i] & 0x3f);
117           }
118 
119           if (wchar) *wchar = ch;
120 
121           return utflen;
122 }
123 
124 /*-----------------------------------------------------------------------------
125    Convert a UTF-8 string to a wide char string.
126    No more than 'count' wide chars will be written to the output buffer.
127    Return the size of the converted string in wide chars, excl null terminator.
128 */
129 int
ldap_x_utf8s_to_wcs(wchar_t * wcstr,const char * utf8str,size_t count)130 ldap_x_utf8s_to_wcs ( wchar_t *wcstr, const char *utf8str, size_t count )
131 {
132           size_t wclen = 0;
133           int utflen, i;
134           wchar_t ch;
135 
136 
137           /* If input ptr is NULL or empty... */
138           if (utf8str == NULL || !*utf8str) {
139                     if ( wcstr )
140                               *wcstr = 0;
141                     return 0;
142           }
143 
144           /* Examine next UTF-8 character.  If output buffer is NULL, ignore count */
145           while ( *utf8str && (wcstr==NULL || wclen<count) ) {
146                     /* Get UTF-8 sequence length from 1st byte */
147                     utflen = LDAP_UTF8_CHARLEN2(utf8str, utflen);
148 
149                     if( utflen==0 || utflen > (int)LDAP_MAX_UTF8_LEN ) return -1;
150 
151                     /* First byte minus length tag */
152                     ch = (wchar_t)(utf8str[0] & mask[utflen]);
153 
154                     for(i=1; i < utflen; i++) {
155                               /* Subsequent bytes must start with 10 */
156                               if ((utf8str[i] & 0xc0) != 0x80) return -1;
157 
158                               ch <<= 6;                     /* 6 bits of data in each subsequent byte */
159                               ch |= (wchar_t)(utf8str[i] & 0x3f);
160                     }
161 
162                     if (wcstr) wcstr[wclen] = ch;
163 
164                     utf8str += utflen;  /* Move to next UTF-8 character */
165                     wclen++;                      /* Count number of wide chars stored/required */
166           }
167 
168           /* Add null terminator if there's room in the buffer. */
169           if (wcstr && wclen < count) wcstr[wclen] = 0;
170 
171           return wclen;
172 }
173 
174 
/*-----------------------------------------------------------------------------
   Convert one wide char to a UTF-8 character.

   utf8char  Output buffer; may be NULL to just determine the number of
             bytes the encoding needs (in that case 'count' is ignored).
   wchar     Value to encode; must lie in 0 .. 0x7FFFFFFF.
   count     Size of 'utf8char' in bytes; no more than this is written.

   Returns the length of the UTF-8 character in bytes (1..6), 0 if the
   character does not fit in 'count' bytes (nothing is written), or -1 if
   'wchar' is outside the encodable range.  No null terminator is written.
*/
int
ldap_x_wc_to_utf8 ( char *utf8char, wchar_t wchar, size_t count )
{
	/* Length tag OR'ed into the first byte, indexed by sequence length. */
	static const unsigned char lead_tag[] =
		{ 0, 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc };
	int len, i;

	/* Reject anything outside 0..0x7FFFFFFF.  The test must not depend on
	 * wchar_t being signed or exactly 32 bits wide: with an unsigned
	 * 32-bit wchar_t "wchar < 0" alone never fires, and values with bit 31
	 * set would be emitted as an invalid 0xFE/0xFF lead byte.  The shift
	 * is only evaluated for non-negative values. */
	if ( wchar < 0 || ( wchar >> 31 ) != 0 )
		return -1;

	if ( wchar < 0x80 )
		len = 1;
	else if ( wchar < 0x800 )
		len = 2;
	else if ( wchar < 0x10000 )
		len = 3;
	else if ( wchar < 0x200000 )
		len = 4;
	else if ( wchar < 0x4000000 )
		len = 5;
	else
		len = 6;

	if ( utf8char == NULL )		/* Caller only wants the length. */
		return len;

	if ( count < (size_t)len )	/* Won't fit: write nothing. */
		return 0;

	/* Fill the continuation bytes from last to first, 6 bits each. */
	for ( i = len - 1; i > 0; i-- ) {
		utf8char[i] = (char)( 0x80 | ( wchar & 0x3f ) );
		wchar >>= 6;
	}

	/* What remains of the value goes into the first byte with its tag. */
	utf8char[0] = (char)( lead_tag[len] | wchar );

	return len;
}
270 
271 
272 /*-----------------------------------------------------------------------------
273    Convert a wide char string to a UTF-8 string.
274    No more than 'count' bytes will be written to the output buffer.
275    Return the # of bytes written to the output buffer, excl null terminator.
276 */
277 int
ldap_x_wcs_to_utf8s(char * utf8str,const wchar_t * wcstr,size_t count)278 ldap_x_wcs_to_utf8s ( char *utf8str, const wchar_t *wcstr, size_t count )
279 {
280           int len = 0;
281           int n;
282           char *p = utf8str;
283           wchar_t empty = 0;            /* To avoid use of L"" construct */
284 
285           if (wcstr == NULL)            /* Treat input ptr NULL as an empty string */
286                     wcstr = &empty;
287 
288           if (utf8str == NULL)          /* Just compute size of output, excl null */
289           {
290                     while (*wcstr)
291                     {
292                               /* Get UTF-8 size of next wide char */
293                               n = ldap_x_wc_to_utf8( NULL, *wcstr++, LDAP_MAX_UTF8_LEN);
294                               if (n == -1)
295                                         return -1;
296                               len += n;
297                     }
298 
299                     return len;
300           }
301 
302 
303           /* Do the actual conversion. */
304 
305           n = 1;                                            /* In case of empty wcstr */
306           while (*wcstr)
307           {
308                     n = ldap_x_wc_to_utf8( p, *wcstr++, count);
309 
310                     if (n <= 0)                   /* If encoding error (-1) or won't fit (0), quit */
311                               break;
312 
313                     p += n;
314                     count -= n;                             /* Space left in output buffer */
315           }
316 
317           /* If not enough room for last character, pad remainder with null
318              so that return value = original count, indicating buffer full. */
319           if (n == 0)
320           {
321                     while (count--)
322                               *p++ = 0;
323           }
324 
325           /* Add a null terminator if there's room. */
326           else if (count)
327                     *p = 0;
328 
329           if (n == -1)                            /* Conversion encountered invalid wide char. */
330                     return -1;
331 
332           /* Return the number of bytes written to output buffer, excl null. */
333           return (p - utf8str);
334 }
335 
#ifdef ANDROID
/* ANDROID builds define wctomb() and mbtowc() here, forwarding to the
 * restartable variants with a NULL conversion-state argument. */
int
wctomb( char *s, wchar_t wc )
{
	return wcrtomb( s, wc, NULL );
}

int
mbtowc( wchar_t *pwc, const char *s, size_t n )
{
	return mbrtowc( pwc, s, n, NULL );
}
#endif
340 
/*-----------------------------------------------------------------------------
   Convert a UTF-8 character to a MultiByte character.

   mbchar    Output buffer; may be NULL, in which case the converted bytes
             go to a local scratch buffer and only the size is reported.
   utf8char  UTF-8 character to convert.
   f_wctomb  Wide-char to multibyte converter; NULL selects wctomb().

   Returns the value returned by the converter (the size of the converted
   character in bytes for wctomb()), or -1 if 'utf8char' is not a valid
   UTF-8 character.
*/
int
ldap_x_utf8_to_mb ( char *mbchar, const char *utf8char,
		int (*f_wctomb)(char *mbchar, wchar_t wchar) )
{
	wchar_t wchar;
	/* Scratch space for the mbchar == NULL case.  wctomb() may store up
	 * to MB_CUR_MAX bytes, which is bounded by MB_LEN_MAX, so size the
	 * buffer from that rather than a fixed 6; never go below 6 so a
	 * caller-supplied converter gets at least as much room as before. */
	char scratch[ MB_LEN_MAX > 6 ? MB_LEN_MAX : 6 ];

	if ( f_wctomb == NULL )		/* If no conversion function was given... */
		f_wctomb = wctomb;	/*    use the local ANSI C function */

	/* First convert the UTF-8 char to a wide char. */
	if ( ldap_x_utf8_to_wc( &wchar, utf8char ) == -1 )
		return -1;		/* Invalid UTF-8 character */

	/* Then let the converter produce the multibyte form. */
	return f_wctomb( mbchar != NULL ? mbchar : scratch, wchar );
}
369 
/*-----------------------------------------------------------------------------
   Convert a UTF-8 string to a MultiByte string.

   mbstr       Output buffer, passed straight to the converter.
   utf8str     Input string; NULL or "" yields an empty result.
   count       Output limit in bytes, passed straight to the converter.
   f_wcstombs  Wide-string to multibyte converter; NULL selects wcstombs().

   Returns the converter's result (the size of the converted string in
   bytes, excl null terminator, for wcstombs()), 0 for NULL/empty input,
   or -1 on invalid UTF-8 or memory allocation failure.
*/
int
ldap_x_utf8s_to_mbs ( char *mbstr, const char *utf8str, size_t count,
		size_t (*f_wcstombs)(char *mbstr, const wchar_t *wcstr, size_t count) )
{
	wchar_t *wide;
	size_t wide_cap;
	int rc;

	if ( f_wcstombs == NULL ) {
		f_wcstombs = wcstombs;	/* Fall back to the local ANSI C function */
	}

	/* NULL or empty input string: produce an empty output. */
	if ( utf8str == NULL || *utf8str == 0 ) {
		if ( mbstr != NULL ) {
			*mbstr = 0;
		}
		return 0;
	}

	/* One wide char per input byte plus a terminator is the most the
	 * intermediate string can ever need. */
	wide_cap = strlen( utf8str ) + 1;
	wide = (wchar_t *)LDAP_MALLOC( wide_cap * sizeof(wchar_t) );
	if ( wide == NULL ) {
		return -1;		/* Memory allocation failure. */
	}

	/* Step 1: UTF-8 string -> wide char string. */
	rc = ldap_x_utf8s_to_wcs( wide, utf8str, wide_cap );

	/* Step 2: wide char string -> multi-byte string. */
	if ( rc != -1 ) {
		rc = f_wcstombs( mbstr, wide, count );
	}

	LDAP_FREE( wide );

	return rc;
}
412 
413 /*-----------------------------------------------------------------------------
414    Convert a MultiByte character to a UTF-8 character.
415    'mbsize' indicates the number of bytes of 'mbchar' to check.
416    Returns the number of bytes written to the output character.
417 */
418 int
ldap_x_mb_to_utf8(char * utf8char,const char * mbchar,size_t mbsize,int (* f_mbtowc)(wchar_t * wchar,const char * mbchar,size_t count))419 ldap_x_mb_to_utf8 ( char *utf8char, const char *mbchar, size_t mbsize,
420                     int (*f_mbtowc)(wchar_t *wchar, const char *mbchar, size_t count) )
421 {
422     wchar_t wchar;
423     int n;
424 
425           if (f_mbtowc == NULL)                   /* If no conversion function was given... */
426                     f_mbtowc = mbtowc;            /*    use the local ANSI C function */
427 
428     if (mbsize == 0)                                        /* 0 is not valid. */
429         return -1;
430 
431     if (mbchar == NULL || *mbchar == 0)
432     {
433         if (utf8char)
434             *utf8char = 0;
435         return 1;
436     }
437 
438           /* First convert the MB char to a Wide Char */
439           n = f_mbtowc( &wchar, mbchar, mbsize);
440 
441           if (n == -1)
442                     return -1;
443 
444           /* Convert the Wide Char to a UTF-8 character. */
445           n = ldap_x_wc_to_utf8( utf8char, wchar, LDAP_MAX_UTF8_LEN);
446 
447           return n;
448 }
449 
450 
/*-----------------------------------------------------------------------------
   Convert a MultiByte string to a UTF-8 string.

   utf8str     Output buffer, passed on to ldap_x_wcs_to_utf8s().
   mbstr       Input string; NULL is treated as an empty string.
   count       Output limit in bytes, passed on to ldap_x_wcs_to_utf8s().
   f_mbstowcs  Multibyte to wide-string converter; NULL selects mbstowcs().

   Returns the result of ldap_x_wcs_to_utf8s() (the size of the converted
   string in bytes, excl null terminator), or -1 if memory allocation
   fails or the converter reports -1.
*/
int
ldap_x_mbs_to_utf8s ( char *utf8str, const char *mbstr, size_t count,
		size_t (*f_mbstowcs)(wchar_t *wcstr, const char *mbstr, size_t count) )
{
	wchar_t *wide;
	size_t wide_cap;
	int rc;

	if ( mbstr == NULL ) {		/* Treat NULL input as an empty string */
		mbstr = "";
	}

	if ( f_mbstowcs == NULL ) {
		f_mbstowcs = mbstowcs;	/* Fall back to the local ANSI C function */
	}

	/* One wide char per input byte plus a terminator is the most the
	 * intermediate string can ever need. */
	wide_cap = strlen( mbstr ) + 1;
	wide = (wchar_t *)LDAP_MALLOC( wide_cap * sizeof(wchar_t) );
	if ( wide == NULL ) {
		return -1;
	}

	/* Step 1: multi-byte string -> wide char string. */
	rc = f_mbstowcs( wide, mbstr, wide_cap );

	/* Step 2: wide char string -> UTF-8 string. */
	if ( rc != -1 ) {
		rc = ldap_x_wcs_to_utf8s( utf8str, wide, count );
	}

	LDAP_FREE( wide );

	return rc;
}
489 
490 #endif /* SIZEOF_WCHAR_T >= 4 */
491