1 /* $NetBSD: valid_utf8_string.c,v 1.3 2025/02/25 19:15:52 christos Exp $ */
2
3 /*++
4 /* NAME
5 /* valid_utf8_string 3
6 /* SUMMARY
7 /* predicate if string is valid UTF-8
8 /* SYNOPSIS
9 /* #include <stringops.h>
10 /*
11 /* int valid_utf8_string(str, len)
12 /* const char *str;
13 /* ssize_t len;
14 /*
/* int valid_utf8_stringz(str)
/* const char *str;
18 /* DESCRIPTION
19 /* valid_utf8_string() determines if all bytes in a string
20 /* satisfy parse_utf8_char(3h) checks. See there for any
21 /* implementation limitations.
22 /*
23 /* valid_utf8_stringz() determines the same for zero-terminated
24 /* strings.
25 /*
26 /* A zero-length string is considered valid.
27 /* DIAGNOSTICS
28 /* The result value is zero when the caller specifies a negative
29 /* length, or a string that does not pass parse_utf8_char(3h) checks.
30 /* SEE ALSO
31 /* parse_utf8_char(3h), parse one UTF-8 multibyte character
32 /* LICENSE
33 /* .ad
34 /* .fi
35 /* The Secure Mailer license must be distributed with this software.
36 /* AUTHOR(S)
37 /* Wietse Venema
38 /* IBM T.J. Watson Research
39 /* P.O. Box 704
40 /* Yorktown Heights, NY 10598, USA
41 /*
42 /* Wietse Venema
43 /* porcupine.org
44 /* Amawalk, NY 10501, USA
45 /*--*/
46
47 /* System library. */
48
49 #include <sys_defs.h>
50
51 /* Utility library. */
52
53 #include <stringops.h>
54 #include <parse_utf8_char.h>
55
/* valid_utf8_string - validate string according to RFC 3629 */

int     valid_utf8_string(const char *str, ssize_t len)
{
    const char *ep;
    const char *cp;
    const char *last;

    /*
     * Screen the length before doing any pointer arithmetic. Forming
     * str + len with a negative len would create a pointer in front of the
     * buffer, which is undefined behavior even when that pointer is never
     * dereferenced. A negative length is reported as invalid; a zero-length
     * string is considered valid.
     */
    if (len < 0)
        return (0);
    if (len == 0)
        return (1);
    ep = str + len;

    /*
     * Ideally, the compiler will inline parse_utf8_char(). A null result
     * means that the bytes at cp did not pass its checks; otherwise the
     * scan resumes at the byte that follows the returned pointer.
     */
    for (cp = str; cp < ep; cp = last + 1) {
        if ((last = parse_utf8_char(cp, ep)) == 0)
            return (0);
    }
    return (1);
}
80
/* valid_utf8_stringz - validate null-terminated string according to RFC 3629 */

int     valid_utf8_stringz(const char *str)
{
    const char *pos = str;
    const char *tail;

    /*
     * There is no end pointer for a null-terminated string, so a null
     * pointer constant is passed to parse_utf8_char() instead. Ideally, the
     * compiler will inline parse_utf8_char(), propagate that constant, and
     * eliminate code branches that test whether 0 != 0.
     */
    while (*pos != 0) {
        tail = parse_utf8_char(pos, 0);
        if (tail == 0)
            return (0);
        /* Resume at the byte that follows the returned pointer. */
        pos = tail + 1;
    }
    return (1);
}
101
102 /*
103 * Stand-alone test program. Each string is a line without line terminator.
104 */
105 #ifdef TEST
106 #include <stdlib.h>
107 #include <string.h>
108 #include <msg.h>
109 #include <vstream.h>
110 #include <msg_vstream.h>
111
/*
 * Test cases for 1-, 2-, and 3-byte encodings. See printable.c for UTF-8
 * parser resynchronization tests.
 *
 * XXX Need a test for 4-byte encodings, preferably with strings that can be
 * displayed.
 *
 * XXX Need tests with hand-crafted over-long encodings and surrogates.
 */
/*
 * Expected verdicts, and a helper that renders a verdict for log output.
 */
#define T_VALID         (1)
#define T_INVALID       (0)
#define valid_to_str(v) ((v) ? "VALID" : "INVALID")

/*
 * One table-driven test: a label, the input bytes, and the verdict that
 * both validators are expected to return for that input.
 */
struct testcase {
    const char *name;                   /* label logged with each result */
    const char *input;                  /* null-terminated input bytes */
    int     expected;                   /* T_VALID or T_INVALID */
};
130
131 static const struct testcase testcases[] = {
132 {"Printable ASCII",
133 "printable", T_VALID,
134 },
135 {"Latin script, accented, no error",
136 "na\303\257ve", T_VALID,
137 },
138 {"Latin script, accented, missing non-leading byte",
139 "na\303ve", T_INVALID,
140 },
141 {"Latin script, accented, missing leading byte",
142 "na\257ve", T_INVALID,
143 },
144 {"Viktor, Cyrillic, no error",
145 "\320\262\320\270\320\272\321\202\320\276\321\200", T_VALID,
146 },
147 {"Viktor, Cyrillic, missing non-leading byte",
148 "\320\262\320\320\272\321\202\320\276\321\200", T_INVALID,
149 },
150 {"Viktor, Cyrillic, missing leading byte",
151 "\320\262\270\320\272\321\202\320\276\321\200", T_INVALID,
152 },
153 {"Viktor, Cyrillic, truncated",
154 "\320\262\320\270\320\272\321\202\320\276\321", T_INVALID,
155 },
156 {"Viktor, Hebrew, no error",
157 "\327\225\327\231\327\247\327\230\327\225\326\274\327\250", T_VALID,
158 },
159 {"Viktor, Hebrew, missing leading byte",
160 "\327\225\231\327\247\327\230\327\225\326\274\327\250", T_INVALID,
161 },
162 {"Chinese (Simplified), no error",
163 "\344\270\255\345\233\275\344\272\222\350\201\224\347\275\221\347"
164 "\273\234\345\217\221\345\261\225\347\212\266\345\206\265\347\273"
165 "\237\350\256\241\346\212\245\345\221\212", T_VALID,
166 },
167 {"Chinese (Simplified), missing leading byte",
168 "\344\270\255\345\233\275\344\272\222\350\201\224\275\221\347"
169 "\273\234\345\217\221\345\261\225\347\212\266\345\206\265\347\273"
170 "\237\350\256\241\346\212\245\345\221\212", T_INVALID,
171 },
172 {"Chinese (Simplified), missing first non-leading byte",
173 "\344\270\255\345\233\275\344\272\222\350\201\224\347\221\347"
174 "\273\234\345\217\221\345\261\225\347\212\266\345\206\265\347\273"
175 "\237\350\256\241\346\212\245\345\221\212", T_INVALID,
176 },
177 {"Chinese (Simplified), missing second non-leading byte",
178 "\344\270\255\345\233\275\344\272\222\350\201\224\347\275\347"
179 "\273\234\345\217\221\345\261\225\347\212\266\345\206\265\347\273"
180 "\237\350\256\241\346\212\245\345\221\212", T_INVALID,
181 },
182 {"Chinese (Simplified), truncated",
183 "\344\270\255\345\233\275\344\272\222\350\201\224\347\275\221\347"
184 "\273\234\345\217\221\345\261\225\347\212\266\345\206\265\347\273"
185 "\237\350\256\241\346\212\245\345", T_INVALID,
186 },
187 };
188
main(int argc,char ** argv)189 int main(int argc, char **argv)
190 {
191 const struct testcase *tp;
192 int pass;
193 int fail;
194
195 #define NUM_TESTS sizeof(testcases)/sizeof(testcases[0])
196
197 msg_vstream_init(basename(argv[0]), VSTREAM_ERR);
198 util_utf8_enable = 1;
199
200 for (pass = fail = 0, tp = testcases; tp < testcases + NUM_TESTS; tp++) {
201 int actual_l;
202 int actual_z;
203 int ok = 0;
204
205 /*
206 * Notes:
207 *
208 * - The msg(3) functions use printable() which interferes when logging
209 * inputs and outputs. Use vstream_fprintf() instead.
210 */
211 vstream_fprintf(VSTREAM_ERR, "RUN %s\n", tp->name);
212 actual_l = valid_utf8_string(tp->input, strlen(tp->input));
213 actual_z = valid_utf8_stringz(tp->input);
214
215 if (actual_l != tp->expected) {
216 vstream_fprintf(VSTREAM_ERR,
217 "input: >%s<, 'actual_l' got: >%s<, want: >%s<\n",
218 tp->input, valid_to_str(actual_l),
219 valid_to_str(tp->expected));
220 } else if (actual_z != tp->expected) {
221 vstream_fprintf(VSTREAM_ERR,
222 "input: >%s<, 'actual_z' got: >%s<, want: >%s<\n",
223 tp->input, valid_to_str(actual_z),
224 valid_to_str(tp->expected));
225 } else {
226 vstream_fprintf(VSTREAM_ERR, "input: >%s<, got and want: >%s<\n",
227 tp->input, valid_to_str(actual_l));
228 ok = 1;
229 }
230 if (ok) {
231 vstream_fprintf(VSTREAM_ERR, "PASS %s\n", tp->name);
232 pass++;
233 } else {
234 vstream_fprintf(VSTREAM_ERR, "FAIL %s\n", tp->name);
235 fail++;
236 }
237 }
238 msg_info("PASS=%d FAIL=%d", pass, fail);
239 return (fail > 0);
240 }
241
242 #endif
243