1 /*        $NetBSD: valid_utf8_string.c,v 1.3 2025/02/25 19:15:52 christos Exp $ */
2 
3 /*++
4 /* NAME
5 /*        valid_utf8_string 3
6 /* SUMMARY
7 /*        predicate if string is valid UTF-8
8 /* SYNOPSIS
9 /*        #include <stringops.h>
10 /*
11 /*        int       valid_utf8_string(str, len)
12 /*        const char *str;
13 /*        ssize_t   len;
14 /*
15 /*        int       valid_utf8_stringz(str)
/*        const char *str;
18 /* DESCRIPTION
19 /*        valid_utf8_string() determines if all bytes in a string
20 /*        satisfy parse_utf8_char(3h) checks. See there for any
21 /*        implementation limitations.
22 /*
23 /*        valid_utf8_stringz() determines the same for zero-terminated
24 /*        strings.
25 /*
26 /*        A zero-length string is considered valid.
27 /* DIAGNOSTICS
28 /*        The result value is zero when the caller specifies a negative
29 /*        length, or a string that does not pass parse_utf8_char(3h) checks.
30 /* SEE ALSO
31 /*        parse_utf8_char(3h), parse one UTF-8 multibyte character
32 /* LICENSE
33 /* .ad
34 /* .fi
35 /*        The Secure Mailer license must be distributed with this software.
36 /* AUTHOR(S)
37 /*        Wietse Venema
38 /*        IBM T.J. Watson Research
39 /*        P.O. Box 704
40 /*        Yorktown Heights, NY 10598, USA
41 /*
42 /*        Wietse Venema
43 /*        porcupine.org
44 /*        Amawalk, NY 10501, USA
45 /*--*/
46 
47 /* System library. */
48 
49 #include <sys_defs.h>
50 
51 /* Utility library. */
52 
53 #include <stringops.h>
54 #include <parse_utf8_char.h>
55 
/*
 * valid_utf8_string - validate a counted string with parse_utf8_char()
 *
 * str: first byte of the string; len: number of bytes to examine.
 *
 * Returns 1 when len is zero, or when parse_utf8_char() accepts every
 * character in the first len bytes. Returns 0 when len is negative, or when
 * parse_utf8_char() rejects a character.
 */

int     valid_utf8_string(const char *str, ssize_t len)
{
    const char *ep;
    const char *cp;
    const char *last;

    if (len < 0)
        return (0);
    if (len == 0)
        return (1);

    /*
     * Form the end pointer only after the length has been vetted. Computing
     * str + len with a negative len would be pointer arithmetic outside the
     * caller's object, which is undefined behavior even when the result is
     * never dereferenced.
     */
    ep = str + len;

    /*
     * Ideally, the compiler will inline parse_utf8_char(). A null result
     * means the character at cp was rejected; otherwise resume from the
     * returned position, and let the loop increment step past it.
     */
    for (cp = str; cp < ep; cp++) {
        if ((last = parse_utf8_char(cp, ep)) == 0)
            return (0);
        cp = last;
    }
    return (1);
}
80 
/*
 * valid_utf8_stringz - validate a null-terminated string with
 * parse_utf8_char()
 *
 * Returns 1 when the string is empty or when parse_utf8_char() accepts every
 * character up to the terminator; returns 0 as soon as a character is
 * rejected.
 */

int     valid_utf8_stringz(const char *str)
{
    const char *pos = str;
    const char *tail;

    /*
     * The end pointer argument is a null pointer constant. Ideally, the
     * compiler will inline parse_utf8_char(), propagate that constant, and
     * eliminate code branches that test whether 0 != 0.
     */
    while (*pos != 0) {
        tail = parse_utf8_char(pos, 0);
        if (tail == 0)
            return (0);
        /* Continue with the byte that follows the returned position. */
        pos = tail + 1;
    }
    return (1);
}
101 
102  /*
103   * Stand-alone test program. Each string is a line without line terminator.
104   */
105 #ifdef TEST
106 #include <stdlib.h>
107 #include <string.h>
108 #include <msg.h>
109 #include <vstream.h>
110 #include <msg_vstream.h>
111 
112  /*
113   * Test cases for 1-, 2-, and 3-byte encodings. See printable.c for UTF8
  * parser resynchronization tests.
115   *
116   * XXX Need a test for 4-byte encodings, preferably with strings that can be
117   * displayed.
118   *
119   * XXX Need tests with hand-crafted over-long encodings and surrogates.
120   */
/*
 * One test case: a label, the input string, and the expected verdict.
 */
struct testcase {
    const char *name;                   /* label shown in RUN/PASS/FAIL */
    const char *input;                  /* null-terminated input string */
    int     expected;                   /* T_VALID or T_INVALID */
};

/*
 * Expected verdicts, and a helper that renders a verdict as text.
 */
#define T_VALID             (1)
#define T_INVALID           (0)
#define valid_to_str(v)     ((v) != 0 ? "VALID" : "INVALID")
130 
131 static const struct testcase testcases[] = {
132     {"Printable ASCII",
133           "printable", T_VALID,
134     },
135     {"Latin script, accented, no error",
136           "na\303\257ve", T_VALID,
137     },
138     {"Latin script, accented, missing non-leading byte",
139           "na\303ve", T_INVALID,
140     },
141     {"Latin script, accented, missing leading byte",
142           "na\257ve", T_INVALID,
143     },
144     {"Viktor, Cyrillic, no error",
145           "\320\262\320\270\320\272\321\202\320\276\321\200", T_VALID,
146     },
147     {"Viktor, Cyrillic, missing non-leading byte",
148           "\320\262\320\320\272\321\202\320\276\321\200", T_INVALID,
149     },
150     {"Viktor, Cyrillic, missing leading byte",
151           "\320\262\270\320\272\321\202\320\276\321\200", T_INVALID,
152     },
153     {"Viktor, Cyrillic, truncated",
154           "\320\262\320\270\320\272\321\202\320\276\321", T_INVALID,
155     },
156     {"Viktor, Hebrew, no error",
157           "\327\225\327\231\327\247\327\230\327\225\326\274\327\250", T_VALID,
158     },
159     {"Viktor, Hebrew, missing leading byte",
160           "\327\225\231\327\247\327\230\327\225\326\274\327\250", T_INVALID,
161     },
162     {"Chinese (Simplified), no error",
163           "\344\270\255\345\233\275\344\272\222\350\201\224\347\275\221\347"
164           "\273\234\345\217\221\345\261\225\347\212\266\345\206\265\347\273"
165           "\237\350\256\241\346\212\245\345\221\212", T_VALID,
166     },
167     {"Chinese (Simplified), missing leading byte",
168           "\344\270\255\345\233\275\344\272\222\350\201\224\275\221\347"
169           "\273\234\345\217\221\345\261\225\347\212\266\345\206\265\347\273"
170           "\237\350\256\241\346\212\245\345\221\212", T_INVALID,
171     },
172     {"Chinese (Simplified), missing first non-leading byte",
173           "\344\270\255\345\233\275\344\272\222\350\201\224\347\221\347"
174           "\273\234\345\217\221\345\261\225\347\212\266\345\206\265\347\273"
175           "\237\350\256\241\346\212\245\345\221\212", T_INVALID,
176     },
177     {"Chinese (Simplified), missing second non-leading byte",
178           "\344\270\255\345\233\275\344\272\222\350\201\224\347\275\347"
179           "\273\234\345\217\221\345\261\225\347\212\266\345\206\265\347\273"
180           "\237\350\256\241\346\212\245\345\221\212", T_INVALID,
181     },
182     {"Chinese (Simplified), truncated",
183           "\344\270\255\345\233\275\344\272\222\350\201\224\347\275\221\347"
184           "\273\234\345\217\221\345\261\225\347\212\266\345\206\265\347\273"
185           "\237\350\256\241\346\212\245\345", T_INVALID,
186     },
187 };
188 
main(int argc,char ** argv)189 int     main(int argc, char **argv)
190 {
191     const struct testcase *tp;
192     int     pass;
193     int     fail;
194 
195 #define NUM_TESTS       sizeof(testcases)/sizeof(testcases[0])
196 
197     msg_vstream_init(basename(argv[0]), VSTREAM_ERR);
198     util_utf8_enable = 1;
199 
200     for (pass = fail = 0, tp = testcases; tp < testcases + NUM_TESTS; tp++) {
201           int     actual_l;
202           int     actual_z;
203           int     ok = 0;
204 
205           /*
206            * Notes:
207            *
208            * - The msg(3) functions use printable() which interferes when logging
209            * inputs and outputs. Use vstream_fprintf() instead.
210            */
211           vstream_fprintf(VSTREAM_ERR, "RUN  %s\n", tp->name);
212           actual_l = valid_utf8_string(tp->input, strlen(tp->input));
213           actual_z = valid_utf8_stringz(tp->input);
214 
215           if (actual_l != tp->expected) {
216               vstream_fprintf(VSTREAM_ERR,
217                                 "input: >%s<, 'actual_l' got: >%s<, want: >%s<\n",
218                                   tp->input, valid_to_str(actual_l),
219                                   valid_to_str(tp->expected));
220           } else if (actual_z != tp->expected) {
221               vstream_fprintf(VSTREAM_ERR,
222                                 "input: >%s<, 'actual_z' got: >%s<, want: >%s<\n",
223                                   tp->input, valid_to_str(actual_z),
224                                   valid_to_str(tp->expected));
225           } else {
226               vstream_fprintf(VSTREAM_ERR, "input: >%s<, got and want: >%s<\n",
227                                   tp->input, valid_to_str(actual_l));
228               ok = 1;
229           }
230           if (ok) {
231               vstream_fprintf(VSTREAM_ERR, "PASS %s\n", tp->name);
232               pass++;
233           } else {
234               vstream_fprintf(VSTREAM_ERR, "FAIL %s\n", tp->name);
235               fail++;
236           }
237     }
238     msg_info("PASS=%d FAIL=%d", pass, fail);
239     return (fail > 0);
240 }
241 
242 #endif
243