1 /*        $NetBSD: utf-8.c,v 1.3 2021/08/14 16:14:56 christos Exp $   */
2 
3 /* utf-8.c -- Basic UTF-8 routines */
4 /* $OpenLDAP$ */
5 /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
6  *
7  * Copyright 1998-2021 The OpenLDAP Foundation.
8  * All rights reserved.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted only as authorized by the OpenLDAP
12  * Public License.
13  *
14  * A copy of this license is available in the file LICENSE in the
15  * top-level directory of the distribution or, alternatively, at
16  * <http://www.OpenLDAP.org/license.html>.
17  */
18 /* Basic UTF-8 routines
19  *
20  * These routines are "dumb".  Though they understand UTF-8,
21  * they don't grok Unicode.  That is, they can push bits,
22  * but don't have a clue what the bits represent.  That's
23  * good enough for use with the LDAP Client SDK.
24  *
25  * These routines are not optimized.
26  */
27 
28 #include <sys/cdefs.h>
29 __RCSID("$NetBSD: utf-8.c,v 1.3 2021/08/14 16:14:56 christos Exp $");
30 
31 #include "portable.h"
32 
33 #include <stdio.h>
34 
35 #include <ac/stdlib.h>
36 
37 #include <ac/socket.h>
38 #include <ac/string.h>
39 #include <ac/time.h>
40 
41 #include "ldap_utf8.h"
42 
43 #include "ldap-int.h"
44 #include "ldap_defaults.h"
45 
46 /*
47  * return the number of bytes required to hold the
48  * NULL-terminated UTF-8 string NOT INCLUDING the
49  * termination.
50  */
ldap_utf8_bytes(const char * p)51 ber_len_t ldap_utf8_bytes( const char * p )
52 {
53           ber_len_t bytes;
54 
55           for( bytes=0; p[bytes]; bytes++ ) {
56                     /* EMPTY */ ;
57           }
58 
59           return bytes;
60 }
61 
ldap_utf8_chars(const char * p)62 ber_len_t ldap_utf8_chars( const char * p )
63 {
64           /* could be optimized and could check for invalid sequences */
65           ber_len_t chars=0;
66 
67           for( ; *p ; LDAP_UTF8_INCR(p) ) {
68                     chars++;
69           }
70 
71           return chars;
72 }
73 
/*
 * Return the distance, in bytes, from p to the position that
 * LDAP_UTF8_NEXT reports as the start of the following character.
 */
int ldap_utf8_offset( const char * p )
{
	const char *following = LDAP_UTF8_NEXT(p);

	return following - p;
}
79 
/*
 * Sequence length indicated by the first octet of a multi-octet
 * character.  ldap_utf8_charlen() indexes this table with
 * (octet ^ 0x80), so entry 0 corresponds to octet 0x80 and entry 127
 * to octet 0xFF.  A value of 0 marks an octet that cannot start a
 * sequence.
 */
const char ldap_utf8_lentab[] = {
	/* 0x80-0x8F */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x90-0x9F */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
	/* 0xA0-0xAF */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
	/* 0xB0-0xBF */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
	/* 0xC0-0xCF */ 0, 0, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xD0-0xDF */ 2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xE0-0xEF */ 3, 3, 3, 3, 3, 3, 3, 3,  3, 3, 3, 3, 3, 3, 3, 3,
	/* 0xF0-0xFF */ 4, 4, 4, 4, 4, 4, 4, 4,  5, 5, 5, 5, 6, 6, 0, 0
};
92 
ldap_utf8_charlen(const char * p)93 int ldap_utf8_charlen( const char * p )
94 {
95           if (!(*p & 0x80))
96                     return 1;
97 
98           return ldap_utf8_lentab[*(const unsigned char *)p ^ 0x80];
99 }
100 
101 /*
102  * Make sure the UTF-8 char used the shortest possible encoding
103  * returns charlen if valid, 0 if not.
104  *
105  * Here are the valid UTF-8 encodings, taken from RFC 2279 page 4.
106  * The table is slightly modified from that of the RFC.
107  *
108  * UCS-4 range (hex)      UTF-8 sequence (binary)
109  * 0000 0000-0000 007F   0.......
110  * 0000 0080-0000 07FF   110++++. 10......
111  * 0000 0800-0000 FFFF   1110++++ 10+..... 10......
112  * 0001 0000-001F FFFF   11110+++ 10++.... 10...... 10......
113  * 0020 0000-03FF FFFF   111110++ 10+++... 10...... 10...... 10......
114  * 0400 0000-7FFF FFFF   1111110+ 10++++.. 10...... 10...... 10...... 10......
115  *
116  * The '.' bits are "don't cares". When validating a UTF-8 sequence,
117  * at least one of the '+' bits must be set, otherwise the character
118  * should have been encoded in fewer octets. Note that in the two-octet
119  * case, only the first octet needs to be validated, and this is done
120  * in the ldap_utf8_lentab[] above.
121  */
122 
/*
 * Mask of required bits in the second octet, indexed by the low five
 * bits of the first octet.  ldap_utf8_charlen2() ANDs the entry with
 * the second octet and rejects the character when the result is zero.
 */
#undef c	/* as before, no macro named c is left defined past this table */
const char ldap_utf8_mintab[] = {
	(const char)0x20, (const char)0x80, (const char)0x80, (const char)0x80,
	(const char)0x80, (const char)0x80, (const char)0x80, (const char)0x80,
	(const char)0x80, (const char)0x80, (const char)0x80, (const char)0x80,
	(const char)0x80, (const char)0x80, (const char)0x80, (const char)0x80,
	(const char)0x30, (const char)0x80, (const char)0x80, (const char)0x80,
	(const char)0x80, (const char)0x80, (const char)0x80, (const char)0x80,
	(const char)0x38, (const char)0x80, (const char)0x80, (const char)0x80,
	(const char)0x3c, (const char)0x80, (const char)0x00, (const char)0x00
};
132 
ldap_utf8_charlen2(const char * p)133 int ldap_utf8_charlen2( const char * p )
134 {
135           int i = LDAP_UTF8_CHARLEN( p );
136 
137           if ( i > 2 ) {
138                     if ( !( ldap_utf8_mintab[*p & 0x1f] & p[1] ) )
139                               i = 0;
140           }
141           return i;
142 }
143 
144 /* conv UTF-8 to UCS-4, useful for comparisons */
ldap_x_utf8_to_ucs4(const char * p)145 ldap_ucs4_t ldap_x_utf8_to_ucs4( const char * p )
146 {
147     const unsigned char *c = (const unsigned char *) p;
148     ldap_ucs4_t ch;
149           int len, i;
150           static unsigned char mask[] = {
151                     0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
152 
153           len = LDAP_UTF8_CHARLEN2(p, len);
154 
155           if( len == 0 ) return LDAP_UCS4_INVALID;
156 
157           ch = c[0] & mask[len];
158 
159           for(i=1; i < len; i++) {
160                     if ((c[i] & 0xc0) != 0x80) {
161                               return LDAP_UCS4_INVALID;
162                     }
163 
164                     ch <<= 6;
165                     ch |= c[i] & 0x3f;
166           }
167 
168           return ch;
169 }
170 
171 /* conv UCS-4 to UTF-8, not used */
ldap_x_ucs4_to_utf8(ldap_ucs4_t c,char * buf)172 int ldap_x_ucs4_to_utf8( ldap_ucs4_t c, char *buf )
173 {
174           int len=0;
175           unsigned char* p = (unsigned char *) buf;
176 
177           /* not a valid Unicode character */
178           if ( c < 0 ) return 0;
179 
180           /* Just return length, don't convert */
181           if(buf == NULL) {
182                     if( c < 0x80 ) return 1;
183                     else if( c < 0x800 ) return 2;
184                     else if( c < 0x10000 ) return 3;
185                     else if( c < 0x200000 ) return 4;
186                     else if( c < 0x4000000 ) return 5;
187                     else return 6;
188           }
189 
190           if( c < 0x80 ) {
191                     p[len++] = c;
192 
193           } else if( c < 0x800 ) {
194                     p[len++] = 0xc0 | ( c >> 6 );
195                     p[len++] = 0x80 | ( c & 0x3f );
196 
197           } else if( c < 0x10000 ) {
198                     p[len++] = 0xe0 | ( c >> 12 );
199                     p[len++] = 0x80 | ( (c >> 6) & 0x3f );
200                     p[len++] = 0x80 | ( c & 0x3f );
201 
202           } else if( c < 0x200000 ) {
203                     p[len++] = 0xf0 | ( c >> 18 );
204                     p[len++] = 0x80 | ( (c >> 12) & 0x3f );
205                     p[len++] = 0x80 | ( (c >> 6) & 0x3f );
206                     p[len++] = 0x80 | ( c & 0x3f );
207 
208           } else if( c < 0x4000000 ) {
209                     p[len++] = 0xf8 | ( c >> 24 );
210                     p[len++] = 0x80 | ( (c >> 18) & 0x3f );
211                     p[len++] = 0x80 | ( (c >> 12) & 0x3f );
212                     p[len++] = 0x80 | ( (c >> 6) & 0x3f );
213                     p[len++] = 0x80 | ( c & 0x3f );
214 
215           } else /* if( c < 0x80000000 ) */ {
216                     p[len++] = 0xfc | ( c >> 30 );
217                     p[len++] = 0x80 | ( (c >> 24) & 0x3f );
218                     p[len++] = 0x80 | ( (c >> 18) & 0x3f );
219                     p[len++] = 0x80 | ( (c >> 12) & 0x3f );
220                     p[len++] = 0x80 | ( (c >> 6) & 0x3f );
221                     p[len++] = 0x80 | ( c & 0x3f );
222           }
223 
224           return len;
225 }
226 
/*
 * Number of octets needed to encode UCS-4 value c in UTF-8 (1..6), or
 * 0 when c is negative.
 *
 * The argument and the whole expansion are parenthesized so the macro
 * can appear inside a larger expression and can take an expression as
 * its argument.  c is evaluated more than once, so it must not have
 * side effects.
 */
#define LDAP_UCS_UTF8LEN(c)	\
	((c) < 0 ? 0 : ((c) < 0x80 ? 1 : ((c) < 0x800 ? 2 : ((c) < 0x10000 ? 3 : \
	((c) < 0x200000 ? 4 : ((c) < 0x4000000 ? 5 : 6))))))
230 
231 /* Convert a string to UTF-8 format. The input string is expected to
232  * have characters of 1, 2, or 4 octets (in network byte order)
233  * corresponding to the ASN.1 T61STRING, BMPSTRING, and UNIVERSALSTRING
234  * types respectively. (Here T61STRING just means that there is one
235  * octet per character and characters may use the high bit of the octet.
236  * The characters are assumed to use ISO mappings, no provision is made
237  * for converting from T.61 coding rules to Unicode.)
238  */
239 
240 int
ldap_ucs_to_utf8s(struct berval * ucs,int csize,struct berval * utf8s)241 ldap_ucs_to_utf8s( struct berval *ucs, int csize, struct berval *utf8s )
242 {
243           unsigned char *in, *end;
244           char *ptr;
245           ldap_ucs4_t u;
246           int i, l = 0;
247 
248           utf8s->bv_val = NULL;
249           utf8s->bv_len = 0;
250 
251           in = (unsigned char *)ucs->bv_val;
252 
253           /* Make sure we stop at an even multiple of csize */
254           end = in + ( ucs->bv_len & ~(csize-1) );
255 
256           for (; in < end; ) {
257                     u = *in++;
258                     if (csize > 1) {
259                               u <<= 8;
260                               u |= *in++;
261                     }
262                     if (csize > 2) {
263                               u <<= 8;
264                               u |= *in++;
265                               u <<= 8;
266                               u |= *in++;
267                     }
268                     i = LDAP_UCS_UTF8LEN(u);
269                     if (i == 0)
270                               return LDAP_INVALID_SYNTAX;
271                     l += i;
272           }
273 
274           utf8s->bv_val = LDAP_MALLOC( l+1 );
275           if (utf8s->bv_val == NULL)
276                     return LDAP_NO_MEMORY;
277           utf8s->bv_len = l;
278 
279           ptr = utf8s->bv_val;
280           for (in = (unsigned char *)ucs->bv_val; in < end; ) {
281                     u = *in++;
282                     if (csize > 1) {
283                               u <<= 8;
284                               u |= *in++;
285                     }
286                     if (csize > 2) {
287                               u <<= 8;
288                               u |= *in++;
289                               u <<= 8;
290                               u |= *in++;
291                     }
292                     ptr += ldap_x_ucs4_to_utf8(u, ptr);
293           }
294           *ptr = '\0';
295           return LDAP_SUCCESS;
296 }
297 
/*
 * Advance to the next UTF-8 character.
 *
 * The length implied by the first octet is ignored; instead the
 * continuation markers are used to find the start of the next
 * character.  This allows resyncing after invalid input, provided the
 * start of the next character appears within the 6 bytes examined.
 */
char* ldap_utf8_next( const char * p )
{
	const unsigned char *octets = (const unsigned char *) p;
	int off = 1;

	if( !LDAP_UTF8_ISASCII(octets) ) {
		/* skip continuation octets, looking no further than offset 5 */
		while ( off < 6 && ( octets[off] & 0xc0 ) == 0x80 ) {
			off++;
		}
	}

	return (char *) &p[off];
}
324 
/*
 * Step back to the previous UTF-8 character.
 *
 * The octets before p are scanned backwards for one that is not a
 * continuation octet (10xxxxxx); its address is returned.  At most the
 * five octets p[-1]..p[-5] are examined; when all of them are
 * continuation octets, &p[-6] is returned.  This allows resyncing
 * after invalid input.
 */
char* ldap_utf8_prev( const char * p )
{
	const unsigned char *octets = (const unsigned char *) p;
	int back = -1;

	while ( back > -6 && ( octets[back] & 0xc0 ) == 0x80 ) {
		back--;
	}

	return (char *) &p[back];
}
347 
/*
 * Copy one UTF-8 character from src to dst, returning the number of
 * bytes copied.  No terminator is written to dst.
 *
 * The length implied by the first octet is ignored; instead the
 * continuation markers are used to find the end of the character.
 * This allows resyncing after invalid input, provided the start of the
 * next character appears within the 6 bytes examined.
 */
int ldap_utf8_copy( char* dst, const char *src )
{
	const unsigned char *octets = (const unsigned char *) src;
	int n = 1;

	/* the first octet is always copied */
	dst[0] = src[0];

	if( !LDAP_UTF8_ISASCII(octets) ) {
		/* copy the continuation octets that follow, at most five */
		while ( n < 6 && ( octets[n] & 0xc0 ) == 0x80 ) {
			dst[n] = src[n];
			n++;
		}
	}

	return n;
}
378 
379 #ifndef UTF8_ALPHA_CTYPE
380 /*
381  * UTF-8 ctype routines
382  * Only deals with characters < 0x80 (ie: US-ASCII)
383  */
384 
/* Apply LDAP_ASCII to the first octet at p. */
int ldap_utf8_isascii( const char * p )
{
	const unsigned octet = * (const unsigned char *) p;

	return LDAP_ASCII(octet);
}
390 
/* LDAP_DIGIT of the first octet at p; 0 when that octet fails LDAP_ASCII. */
int ldap_utf8_isdigit( const char * p )
{
	const unsigned octet = * (const unsigned char *) p;

	return LDAP_ASCII(octet) ? LDAP_DIGIT( octet ) : 0;
}
399 
/* LDAP_HEX of the first octet at p; 0 when that octet fails LDAP_ASCII. */
int ldap_utf8_isxdigit( const char * p )
{
	const unsigned octet = * (const unsigned char *) p;

	return LDAP_ASCII(octet) ? LDAP_HEX(octet) : 0;
}
408 
/*
 * Returns 1 when the first octet at p is a space, tab, newline,
 * carriage return, vertical tab or form feed, and 0 otherwise.
 */
int ldap_utf8_isspace( const char * p )
{
	const unsigned octet = * (const unsigned char *) p;

	if( !LDAP_ASCII(octet) ) {
		return 0;
	}

	return octet == ' '  || octet == '\t' || octet == '\n'
		|| octet == '\r' || octet == '\v' || octet == '\f';
}
427 
428 /*
429  * These are not needed by the C SDK and are
430  * not "good enough" for general use.
431  */
/* LDAP_ALPHA of the first octet at p; 0 when that octet fails LDAP_ASCII. */
int ldap_utf8_isalpha( const char * p )
{
	const unsigned octet = * (const unsigned char *) p;

	return LDAP_ASCII(octet) ? LDAP_ALPHA(octet) : 0;
}
440 
/* LDAP_ALNUM of the first octet at p; 0 when that octet fails LDAP_ASCII. */
int ldap_utf8_isalnum( const char * p )
{
	const unsigned octet = * (const unsigned char *) p;

	return LDAP_ASCII(octet) ? LDAP_ALNUM(octet) : 0;
}
449 
/* LDAP_LOWER of the first octet at p; 0 when that octet fails LDAP_ASCII. */
int ldap_utf8_islower( const char * p )
{
	const unsigned octet = * (const unsigned char *) p;

	return LDAP_ASCII(octet) ? LDAP_LOWER(octet) : 0;
}
458 
/* LDAP_UPPER of the first octet at p; 0 when that octet fails LDAP_ASCII. */
int ldap_utf8_isupper( const char * p )
{
	const unsigned octet = * (const unsigned char *) p;

	return LDAP_ASCII(octet) ? LDAP_UPPER(octet) : 0;
}
467 #endif
468 
469 
470 /*
471  * UTF-8 string routines
472  */
473 
/*
 * Like strchr(): return a pointer to the first character of str whose
 * UCS-4 value equals that of the character at chr, or NULL when str
 * contains no such character.
 */
char * (ldap_utf8_strchr)( const char *str, const char *chr )
{
	const char *cur = str;

	while ( *cur != '\0' ) {
		if( ldap_x_utf8_to_ucs4( cur ) == ldap_x_utf8_to_ucs4( chr ) ) {
			return (char *) cur;
		}
		LDAP_UTF8_INCR(cur);
	}

	return NULL;
}
485 
486 /* like strcspn() but returns number of bytes, not characters */
ber_len_t(ldap_utf8_strcspn)487 ber_len_t (ldap_utf8_strcspn)( const char *str, const char *set )
488 {
489           const char *cstr;
490           const char *cset;
491 
492           for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) {
493                     for( cset = set; *cset != '\0'; LDAP_UTF8_INCR(cset) ) {
494                               if( ldap_x_utf8_to_ucs4( cstr ) == ldap_x_utf8_to_ucs4( cset ) ) {
495                                         return cstr - str;
496                               }
497                     }
498           }
499 
500           return cstr - str;
501 }
502 
503 /* like strspn() but returns number of bytes, not characters */
ber_len_t(ldap_utf8_strspn)504 ber_len_t (ldap_utf8_strspn)( const char *str, const char *set )
505 {
506           const char *cstr;
507           const char *cset;
508 
509           for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) {
510                     for( cset = set; ; LDAP_UTF8_INCR(cset) ) {
511                               if( *cset == '\0' ) {
512                                         return cstr - str;
513                               }
514 
515                               if( ldap_x_utf8_to_ucs4( cstr ) == ldap_x_utf8_to_ucs4( cset ) ) {
516                                         break;
517                               }
518                     }
519           }
520 
521           return cstr - str;
522 }
523 
/*
 * Like strpbrk() (replaces strchr() as well): return a pointer to the
 * first character of str that also occurs in set, or NULL when there
 * is none.
 */
char *(ldap_utf8_strpbrk)( const char *str, const char *set )
{
	const char *scan = str;

	while ( *scan != '\0' ) {
		const char *member = set;

		while ( *member != '\0' ) {
			if( ldap_x_utf8_to_ucs4( scan ) == ldap_x_utf8_to_ucs4( member ) ) {
				return (char *) scan;
			}
			LDAP_UTF8_INCR(member);
		}
		LDAP_UTF8_INCR(scan);
	}

	return NULL;
}
539 
/*
 * Like strtok_r(), not strtok(): split a string into tokens separated
 * by any of the characters in sep, keeping the scan position in *last
 * instead of in static state.
 *
 * str   string to tokenize on the first call, NULL on later calls to
 *       continue from *last; the string is modified in place
 * sep   set of separator characters
 * last  caller-provided storage for the scan position
 *
 * Returns a pointer to the next token, or NULL when last is NULL or no
 * token remains.  When the tokens run out *last is set to NULL; calling
 * again with str == NULL after that point simply returns NULL.
 */
char *(ldap_utf8_strtok)(char *str, const char *sep, char **last)
{
	char *begin;
	char *end;

	if( last == NULL ) return NULL;

	begin = str ? str : *last;

	/*
	 * *last is NULL once a previous call has exhausted the string (or
	 * if the caller never started a scan); there is nothing to look at,
	 * and passing NULL on to ldap_utf8_strspn() would dereference it.
	 */
	if( begin == NULL ) return NULL;

	/* skip leading separators */
	begin += ldap_utf8_strspn( begin, sep );

	if( *begin == '\0' ) {
		*last = NULL;
		return NULL;
	}

	/* the token extends up to the next separator or the end of string */
	end = &begin[ ldap_utf8_strcspn( begin, sep ) ];

	if( *end != '\0' ) {
		/*
		 * Locate the character after the separator before the
		 * separator's first byte is overwritten with the terminator.
		 */
		char *next = LDAP_UTF8_NEXT( end );
		*end = '\0';
		end = next;
	}

	*last = end;
	return begin;
}
568