1 /* $NetBSD: parse_utf8_char.h,v 1.2 2025/02/25 19:15:52 christos Exp $ */
2
3 /*++
4 /* NAME
5 /* parse_utf8_char 3h
6 /* SUMMARY
7 /* parse one UTF-8 multibyte character
8 /* SYNOPSIS
9 /* #include <parse_utf8_char.h>
10 /*
11 /* char *parse_utf8_char(str, end)
12 /* const char *str;
13 /* const char *end;
14 /* DESCRIPTION
15 /* parse_utf8_char() determines if the byte sequence starting
16 /* at \fBstr\fR begins with a complete UTF-8 character as
17 /* defined in RFC 3629. That is, a proper encoding of code
18 /* points U+0000..U+10FFFF, excluding over-long encodings and
19 /* excluding U+D800..U+DFFF surrogates.
20 /*
21 /* When the byte sequence starting at \fBstr\fR begins with a
22 /* complete UTF-8 character, this function returns a pointer
23 /* to the last byte in that character. Otherwise, it returns
24 /* a null pointer.
25 /*
26 /* The \fBend\fR argument is either null (the byte sequence
27 /* starting at \fBstr\fR must be null terminated), or \fBend
28 /* - str\fR specifies the length of the byte sequence.
29 /* BUGS
/* Code points in the range U+FDD0..U+FDEF, and code points
/* ending in FFFE or FFFF (in any plane), are non-characters
/* in Unicode. This function does not reject these.
33 /* LICENSE
34 /* .ad
35 /* .fi
36 /* The Secure Mailer license must be distributed with this software.
37 /* AUTHOR(S)
38 /* Wietse Venema
39 /* IBM T.J. Watson Research
40 /* P.O. Box 704
41 /* Yorktown Heights, NY 10598, USA
42 /*
43 /* Wietse Venema
44 /* porcupine.org
45 /* Amawalk, NY 10501, USA
46 /*--*/
47
48 /*
49 * System library.
50 */
51 #include <sys_defs.h>
52
53 #ifdef NO_INLINE
54 #define inline /* */
55 #endif
56
57 /* parse_utf8_char - parse and validate one UTF8 multibyte sequence */
58
static inline char *parse_utf8_char(const char *str, const char *end)
{
    const unsigned char *cp = (const unsigned char *) str;
    const unsigned char *ep = (const unsigned char *) end;
    unsigned char c0, ch;

    /*
     * Returns a pointer to the last byte of the UTF-8 character that starts
     * at str, or a null pointer when the bytes at str do not start with a
     * complete, valid character. When end is null, the input must be null
     * terminated; otherwise only the bytes in [str, end) are examined.
     * 
     * Optimized for correct input, time, space, and for CPUs that have a
     * decent number of registers. Other implementation considerations:
     * 
     * - In the UTF-8 encoding, a non-leading byte is never null. Therefore,
     * this function will correctly reject a partial UTF-8 character at the
     * end of a null-terminated string.
     * 
     * - If the "end" argument is a null constant, and if this function is
     * inlined, then an optimizing compiler should propagate the constant
     * through the "ep" variable, and eliminate any code branches that
     * require ep != 0.
     * 
     * - Length checks are written as "ep - cp <= N" instead of "cp + N >=
     * ep". The latter may form a pointer more than one element past the end
     * of the caller's buffer, which is undefined behavior in C.
     */

    /* A bounded sequence of zero length contains no character at all. */
    if (UNEXPECTED(ep && cp >= ep))
	return (0);

    /* Single-byte encodings. */
    if (EXPECTED((c0 = *cp) <= 0x7f) /* we know that c0 >= 0x0 */ ) {
	return ((char *) cp);
    }
    /* Two-byte encodings. */
    else if (EXPECTED(c0 <= 0xdf) /* we know that c0 >= 0x80 */ ) {
	/* Exclude over-long encodings (also rejects stray tail bytes). */
	if (UNEXPECTED(c0 < 0xc2)
	/* Require one more byte inside the bounded sequence. */
	    || UNEXPECTED(ep && ep - cp <= 1)
	/* Require UTF-8 tail byte. */
	    || UNEXPECTED(((ch = *++cp) & 0xc0) != 0x80))
	    return (0);
	return ((char *) cp);
    }
    /* Three-byte encodings. */
    else if (EXPECTED(c0 <= 0xef) /* we know that c0 >= 0xe0 */ ) {
	/* Require two more bytes inside the bounded sequence. */
	if (UNEXPECTED(ep && ep - cp <= 2)
	/* Exclude over-long encodings. */
	    || UNEXPECTED((ch = *++cp) < (c0 == 0xe0 ? 0xa0 : 0x80))
	/* Exclude U+D800..U+DFFF. */
	    || UNEXPECTED(ch > (c0 == 0xed ? 0x9f : 0xbf))
	/* Require UTF-8 tail byte. */
	    || UNEXPECTED(((ch = *++cp) & 0xc0) != 0x80))
	    return (0);
	return ((char *) cp);
    }
    /* Four-byte encodings. */
    else if (EXPECTED(c0 <= 0xf4) /* we know that c0 >= 0xf0 */ ) {
	/* Require three more bytes inside the bounded sequence. */
	if (UNEXPECTED(ep && ep - cp <= 3)
	/* Exclude over-long encodings. */
	    || UNEXPECTED((ch = *++cp) < (c0 == 0xf0 ? 0x90 : 0x80))
	/* Exclude code points above U+10FFFF. */
	    || UNEXPECTED(ch > (c0 == 0xf4 ? 0x8f : 0xbf))
	/* Require UTF-8 tail byte. */
	    || UNEXPECTED(((ch = *++cp) & 0xc0) != 0x80)
	/* Require UTF-8 tail byte. */
	    || UNEXPECTED(((ch = *++cp) & 0xc0) != 0x80))
	    return (0);
	return ((char *) cp);
    }
    /* Invalid: c0 >= 0xf5 */
    else {
	return (0);
    }
}
123
124 #undef inline
125