1 /*        $NetBSD: casefold.c,v 1.4 2025/02/25 19:15:51 christos Exp $          */
2 
3 /*++
4 /* NAME
5 /*        casefold 3
6 /* SUMMARY
7 /*        casefold text for caseless comparison
8 /* SYNOPSIS
9 /*        #include <stringops.h>
10 /*
11 /*        char      *casefold(
12 /*        VSTRING *dst,
13 /*        const char *src)
14 /*
15 /*        char      *casefold_append(
16 /*        VSTRING *dst,
17 /*        const char *src)
18 /*
19 /*        char      *casefold_len(
20 /*        VSTRING *dst,
21 /*        const char *src,
22 /*        ssize_t   src_len)
23 /* AUXILIARY FUNCTIONS
24 /*        char      *casefoldx(
25 /*        int       flags,
26 /*        VSTRING *dst,
27 /*        const char *src,
28 /*        ssize_t   src_len)
29 /* DESCRIPTION
30 /*        casefold() converts text to a form that is suitable for
31 /*        caseless comparison, rather than presentation to humans.
32 /*
33 /*        When compiled without EAI support or util_utf8_enable is
34 /*        zero, casefold() implements ASCII case folding, leaving
35 /*        non-ASCII byte values unchanged.
36 /*
37 /*        When compiled with EAI support and util_utf8_enable is
38 /*        non-zero, casefold() implements UTF-8 case folding using
39 /*        the en_US locale, as recommended when the conversion result
40 /*        is not meant to be presented to humans.
41 /*
42 /*        casefold_len() implements casefold() with a source length
43 /*        argument.
44 /*
45 /*        casefold_append() implements casefold() without overwriting
46 /*        the result.
47 /*
48 /*        casefoldx() implements a more complex API that implements
49 /*        all of the above and more.
50 /*
51 /*        Arguments:
52 /* .IP src
53 /*        Null-terminated input string.
54 /* .IP dest
55 /*        Output buffer, null-terminated. Specify a null pointer to
56 /*        use an internal buffer that is overwritten upon each call.
57 /* .IP src_len
58 /*        The string length, -1 to determine the length dynamically.
59 /* .IP flags
60 /*        Bitwise OR of zero or more of the following:
61 /* .RS
62 /* .IP CASEF_FLAG_UTF8
63 /*        Enable UTF-8 support. This flag has no effect when compiled
64 /*        without EAI support.
65 /* .IP CASEF_FLAG_APPEND
66 /*        Append the result to the buffer, instead of overwriting it.
67 /* DIAGNOSTICS
68 /*        All errors are fatal. There appear to be no input-dependent
69 /*        errors.
70 /*
71 /*        With the ICU 4.8 library, there is no casefold error for
72 /*        UTF-8 code points U+0000..U+10FFFF (including surrogate
73 /*        range), not even when running inside an empty chroot jail.
74 /*        Nor does malformed UTF-8 trigger errors; non-UTF-8 bytes
75 /*        are copied verbatim. Based on ICU 4.8 source-code review
76 /*        and experimentation(!) we conclude that UTF-8 casefolding
77 /*        has no data-dependent error cases, and that it is safe to
78 /*        treat all casefolding errors as fatal runtime errors.
79 /* LICENSE
80 /* .ad
81 /* .fi
82 /*        The Secure Mailer license must be distributed with this software.
83 /* AUTHOR(S)
84 /*        Wietse Venema
85 /*        IBM T.J. Watson Research
86 /*        P.O. Box 704
87 /*        Yorktown Heights, NY 10598, USA
88 /*
89 /*        Wietse Venema
90 /*        Google, Inc.
91 /*        111 8th Avenue
92 /*        New York, NY 10011, USA
93 /*--*/
94 
95 /* System library. */
96 
97 #include <sys_defs.h>
98 #include <string.h>
99 #include <ctype.h>
100 #ifndef NO_EAI
101 #include <unicode/ucasemap.h>
102 #include <unicode/ustring.h>
103 #include <unicode/uchar.h>
104 #endif
105 
106 /* Utility library. */
107 
108 #include <msg.h>
109 #include <stringops.h>
110 
111 #define STR(x) vstring_str(x)
112 #define LEN(x) VSTRING_LEN(x)
113 
/* casefoldx - casefold a UTF-8 string */
115 
/*
 * casefoldx() - fold the first len bytes of src into dest and return a
 * pointer to the start of dest's string storage.
 *
 * flags: bitwise OR of CASEF_FLAG_UTF8 (use ICU case folding when the input
 * is not all-ASCII; ignored when compiled with NO_EAI) and CASEF_FLAG_APPEND
 * (append to dest instead of overwriting it).
 *
 * dest: output buffer, or a null pointer to use a function-private static
 * buffer that is reused by later calls.
 *
 * src: input text.
 *
 * len: number of src bytes to fold, or a negative value to use strlen(src).
 *
 * All errors are reported with msg_fatal() or msg_panic().
 */
char   *casefoldx(int flags, VSTRING *dest, const char *src, ssize_t len)
{
    static VSTRING *fold_buf = 0;
    size_t  old_len;

#ifndef NO_EAI
    const char myname[] = "casefold";
    static UCaseMap *csm = 0;
    UErrorCode error;
    ssize_t space_needed;
    int     n;

#endif

    /*
     * Handle special cases. This preamble is shared by the ASCII-only and
     * the Unicode build, so that a null dest pointer selects the internal
     * buffer regardless of how this file was compiled.
     */
    if (len < 0)
        len = strlen(src);
    if (dest == 0)
        dest = (fold_buf != 0 ? fold_buf : (fold_buf = vstring_alloc(100)));
    if ((flags & CASEF_FLAG_APPEND) == 0)
        VSTRING_RESET(dest);
    old_len = VSTRING_LEN(dest);

#ifdef NO_EAI

    /*
     * ASCII mode only. Lowercase only the text that was just appended, not
     * any content that was already in the buffer.
     */
    vstring_strncat(dest, src, len);
    lowercase(STR(dest) + old_len);
    return (STR(dest));
#else

    /*
     * All-ASCII input, or ASCII mode only.
     * 
     * NOTE(review): allascii() receives src without len; presumably it scans up
     * to the null terminator. Non-ASCII bytes beyond len would then only
     * cost a trip through the ICU code path below - confirm.
     */
    if ((flags & CASEF_FLAG_UTF8) == 0 || allascii(src)) {
        vstring_strncat(dest, src, len);
        lowercase(STR(dest) + old_len);
        return (STR(dest));
    }

    /*
     * ICU 4.8 ucasemap_utf8FoldCase() does not complain about UTF-8 syntax
     * errors. XXX Based on source-code review we conclude that non-UTF-8
     * bytes are copied verbatim, and experiments confirm this. Given that
     * this behavior is intentional, we assume that it will stay that way.
     * For this reason the input is not validated here.
     */

    /*
     * One-time initialization. With ICU 4.8 this works while chrooted.
     */
    if (csm == 0) {
        error = U_ZERO_ERROR;
        csm = ucasemap_open("en_US", U_FOLD_CASE_DEFAULT, &error);
        if (U_SUCCESS(error) == 0)
            msg_fatal("ucasemap_open error: %s", u_errorName(error));
    }

    /*
     * Fold the input, adjusting the buffer size if needed. Safety: don't
     * loop forever.
     * 
     * Note: the requested amount of space for casemapped output (as reported
     * with space_needed below) does not include storage for the null
     * terminator. The terminator is written only when the output buffer is
     * large enough. This is why we overallocate space when the output does
     * not fit. But if the output fits exactly, then the output will be
     * unterminated, and we have to terminate the output ourselves.
     */
    for (n = 0; n < 3; n++) {
        error = U_ZERO_ERROR;
        space_needed = ucasemap_utf8FoldCase(csm, STR(dest) + old_len,
                                             vstring_avail(dest), src, len,
                                             &error);
        if (U_SUCCESS(error)) {
            vstring_set_payload_size(dest, old_len + space_needed);
            if (vstring_avail(dest) == 0)       /* exact fit, no terminator */
                VSTRING_TERMINATE(dest);        /* add terminator */
            break;
        } else if (error == U_BUFFER_OVERFLOW_ERROR) {
            VSTRING_SPACE(dest, space_needed + 1);      /* for terminator */
        } else {
            /* Report only the bytes that were actually submitted. */
            msg_fatal("%s: conversion error for \"%.*s\": %s",
                      myname, (int) len, src, u_errorName(error));
        }
    }

    /*
     * Do not return a buffer that was never successfully written. Running
     * out of attempts means that the library kept asking for more space.
     */
    if (n >= 3)
        msg_panic("%s: output does not fit after %d attempts", myname, n);
    return (STR(dest));
#endif                                          /* NO_EAI */
}
219 
220 #ifdef TEST
221 
/*
 * encode_utf8() - overwrite buffer with the UTF-8 encoding of one codepoint,
 * followed by a null terminator.
 *
 * Codepoints in 0..0x10FFFF are encoded as one to four bytes. Anything
 * outside that range, including negative values, is a caller error and
 * results in a panic. Surrogate values are not treated specially here.
 */
static void encode_utf8(VSTRING *buffer, int codepoint)
{
    const char myname[] = "encode_utf8";

    VSTRING_RESET(buffer);
    if (codepoint < 0) {
        /* Without this test a negative value would pass the < 0x80 test. */
        msg_panic("%s: negative codepoint %d", myname, codepoint);
    } else if (codepoint < 0x80) {
        /* One byte: 0xxxxxxx. */
        VSTRING_ADDCH(buffer, codepoint);
    } else if (codepoint < 0x800) {
        /* Two bytes: 110xxxxx 10xxxxxx. */
        VSTRING_ADDCH(buffer, 0xc0 | (codepoint >> 6));
        VSTRING_ADDCH(buffer, 0x80 | (codepoint & 0x3f));
    } else if (codepoint < 0x10000) {
        /* Three bytes: 1110xxxx 10xxxxxx 10xxxxxx. */
        VSTRING_ADDCH(buffer, 0xe0 | (codepoint >> 12));
        VSTRING_ADDCH(buffer, 0x80 | ((codepoint >> 6) & 0x3f));
        VSTRING_ADDCH(buffer, 0x80 | (codepoint & 0x3f));
    } else if (codepoint <= 0x10FFFF) {
        /* Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx. */
        VSTRING_ADDCH(buffer, 0xf0 | (codepoint >> 18));
        VSTRING_ADDCH(buffer, 0x80 | ((codepoint >> 12) & 0x3f));
        VSTRING_ADDCH(buffer, 0x80 | ((codepoint >> 6) & 0x3f));
        VSTRING_ADDCH(buffer, 0x80 | (codepoint & 0x3f));
    } else {
        msg_panic("%s: out-of-range codepoint U+%X", myname, codepoint);
    }
    VSTRING_TERMINATE(buffer);
}
246 
247 #include <stdlib.h>
248 #include <stdio.h>
249 #include <locale.h>
250 
251 #include <vstream.h>
252 #include <vstring_vstream.h>
253 #include <msg_vstream.h>
254 
main(int argc,char ** argv)255 int     main(int argc, char **argv)
256 {
257     VSTRING *buffer = vstring_alloc(1);
258     VSTRING *dest = vstring_alloc(1);
259     char   *bp;
260     char   *conv_res;
261     char   *cmd;
262     int     codepoint, first, last;
263     VSTREAM *fp;
264 
265     if (setlocale(LC_ALL, "C") == 0)
266           msg_fatal("setlocale(LC_ALL, C) failed: %m");
267 
268     msg_vstream_init(argv[0], VSTREAM_ERR);
269 
270     util_utf8_enable = 1;
271 
272     VSTRING_SPACE(buffer, 256);                             /* chroot/file pathname */
273 
274     while (vstring_fgets_nonl(buffer, VSTREAM_IN)) {
275           bp = STR(buffer);
276           vstream_printf("> %s\n", bp);
277           cmd = mystrtok(&bp, CHARS_SPACE);
278           if (cmd == 0 || *cmd == '#')
279               continue;
280           while (ISSPACE(*bp))
281               bp++;
282 
283           /*
284            * Null-terminated string.
285            */
286           if (strcmp(cmd, "fold") == 0) {
287               conv_res = casefold(dest, bp);
288               vstream_printf("\"%s\" ->fold \"%s\"\n", bp, conv_res);
289           }
290 
291           /*
292            * Codepoint range.
293            */
294           else if (strcmp(cmd, "range") == 0
295                      && sscanf(bp, "%i %i", &first, &last) == 2
296                      && first <= last) {
297               for (codepoint = first; codepoint <= last; codepoint++) {
298                     if (codepoint >= 0xD800 && codepoint <= 0xDFFF) {
299                         vstream_printf("skipping surrogate range\n");
300                         codepoint = 0xDFFF;
301                     } else {
302                         encode_utf8(buffer, codepoint);
303                         if (msg_verbose)
304                               vstream_printf("U+%X -> %s\n", codepoint, STR(buffer));
305                         if (valid_utf8_stringz(STR(buffer)) == 0)
306                               msg_fatal("bad utf-8 encoding for U+%X\n", codepoint);
307                         casefold(dest, STR(buffer));
308                     }
309               }
310               vstream_printf("range completed: 0x%x..0x%x\n", first, last);
311           }
312 
313           /*
314            * Chroot directory.
315            */
316           else if (strcmp(cmd, "chroot") == 0
317                      && sscanf(bp, "%255s", STR(buffer)) == 1) {
318               if (geteuid() == 0) {
319                     if (chdir(STR(buffer)) < 0)
320                         msg_fatal("chdir(%s): %m", STR(buffer));
321                     if (chroot(STR(buffer)) < 0)
322                         msg_fatal("chroot(%s): %m", STR(buffer));
323                     vstream_printf("chroot %s completed\n", STR(buffer));
324               }
325           }
326 
327           /*
328            * File.
329            */
330           else if (strcmp(cmd, "file") == 0
331                      && sscanf(bp, "%255s", STR(buffer)) == 1) {
332               if ((fp = vstream_fopen(STR(buffer), O_RDONLY, 0)) == 0)
333                     msg_fatal("open(%s): %m", STR(buffer));
334               while (vstring_fgets_nonl(buffer, fp))
335                     vstream_printf("%s\n", casefold(dest, STR(buffer)));
336               vstream_fclose(fp);
337           }
338 
339           /*
340            * Verbose.
341            */
342           else if (strcmp(cmd, "verbose") == 0
343                      && sscanf(bp, "%i", &msg_verbose) == 1) {
344                /* void */ ;
345           }
346 
347           /*
348            * Usage
349            */
350           else {
351               vstream_printf("Usage: %s chroot <path> | file <path> | fold <text> | range <first> <last> | verbose <int>\n",
352                                  argv[0]);
353           }
354           vstream_fflush(VSTREAM_OUT);
355     }
356     vstring_free(buffer);
357     vstring_free(dest);
358     exit(0);
359 }
360 
361 #endif                                            /* TEST */
362