1 /*        $NetBSD: parse_utf8_char.h,v 1.2 2025/02/25 19:15:52 christos Exp $   */
2 
3 /*++
4 /* NAME
5 /*        parse_utf8_char 3h
6 /* SUMMARY
7 /*        parse one UTF-8 multibyte character
8 /* SYNOPSIS
9 /*        #include <parse_utf8_char.h>
10 /*
11 /*        char      *parse_utf8_char(str, end)
12 /*        const char *str;
13 /*        const char *end;
14 /* DESCRIPTION
15 /*        parse_utf8_char() determines if the byte sequence starting
16 /*        at \fBstr\fR begins with a complete UTF-8 character as
17 /*        defined in RFC 3629. That is, a proper encoding of code
18 /*        points U+0000..U+10FFFF, excluding over-long encodings and
19 /*        excluding U+D800..U+DFFF surrogates.
20 /*
21 /*        When the byte sequence starting at \fBstr\fR begins with a
22 /*        complete UTF-8 character, this function returns a pointer
23 /*        to the last byte in that character. Otherwise, it returns
24 /*        a null pointer.
25 /*
26 /*        The \fBend\fR argument is either null (the byte sequence
27 /*        starting at \fBstr\fR must be null terminated), or \fBend
28 /*        - str\fR specifies the length of the byte sequence.
29 /* BUGS
30 /*        Code points in the range U+FDD0..U+FDEF and ending in FFFE
31 /*        or FFFF are non-characters in UNICODE. This function does
32 /*        not reject these.
33 /* LICENSE
34 /* .ad
35 /* .fi
36 /*        The Secure Mailer license must be distributed with this software.
37 /* AUTHOR(S)
38 /*        Wietse Venema
39 /*        IBM T.J. Watson Research
40 /*        P.O. Box 704
41 /*        Yorktown Heights, NY 10598, USA
42 /*
43 /*        Wietse Venema
44 /*        porcupine.org
45 /*        Amawalk, NY 10501, USA
46 /*--*/
47 
48  /*
49   * System library.
50   */
51 #include <sys_defs.h>
52 
53 #ifdef NO_INLINE
54 #define inline                                    /* */
55 #endif
56 
57 /* parse_utf8_char - parse and validate one UTF8 multibyte sequence */
58 
static inline char *parse_utf8_char(const char *str, const char *end)
{
    const unsigned char *cp = (const unsigned char *) str;
    const unsigned char *ep = (const unsigned char *) end;
    unsigned char c0, ch;

    /*
     * Result: a pointer to the last byte of the UTF-8 character that starts
     * at "str", or a null pointer when the input does not start with a
     * complete, well-formed character (over-long form, surrogate, value
     * above U+10FFFF, bad or missing tail byte, or truncated input).
     *
     * Optimized for correct input, time, space, and for CPUs that have a
     * decent number of registers. Other implementation considerations:
     *
     * - In the UTF-8 encoding, a non-leading byte is never null. Therefore,
     * this function will correctly reject a partial UTF-8 character at the
     * end of a null-terminated string.
     *
     * - If the "end" argument is a null constant, and if this function is
     * inlined, then an optimizing compiler should propagate the constant
     * through the "ep" variable, and eliminate any code branches that
     * require ep != 0.
     *
     * - The order of the tests inside each "if" matters: every length check
     * must be evaluated before the "*++cp" reads that follow it, and each
     * tail byte is read only after the preceding byte was accepted.
     */

    /*
     * An explicitly bounded, empty byte sequence contains no character. Do
     * not read the byte at "end".
     */
    if (UNEXPECTED(ep && cp >= ep))
        return (0);

    /* Single-byte encodings. */
    if (EXPECTED((c0 = *cp) <= 0x7f) /* we know that c0 >= 0x0 */ ) {
        return ((char *) cp);
    }
    /* Two-byte encodings. */
    else if (EXPECTED(c0 <= 0xdf) /* we know that c0 >= 0x80 */ ) {
        /* Exclude stray tail bytes (0x80..0xbf) and over-long encodings. */
        if (UNEXPECTED(c0 < 0xc2)
        /* Require one more byte before "end". */
            || UNEXPECTED(ep && cp + 1 >= ep)
        /* Require UTF-8 tail byte. */
            || UNEXPECTED(((ch = *++cp) & 0xc0) != 0x80))
            return (0);
        return ((char *) cp);
    }
    /* Three-byte encodings. */
    else if (EXPECTED(c0 <= 0xef) /* we know that c0 >= 0xe0 */ ) {
        /* Require two more bytes before "end". */
        if (UNEXPECTED(ep && cp + 2 >= ep)
        /* Exclude over-long encodings. */
            || UNEXPECTED((ch = *++cp) < (c0 == 0xe0 ? 0xa0 : 0x80))
        /* Exclude U+D800..U+DFFF. */
            || UNEXPECTED(ch > (c0 == 0xed ? 0x9f : 0xbf))
        /* Require UTF-8 tail byte. */
            || UNEXPECTED(((ch = *++cp) & 0xc0) != 0x80))
            return (0);
        return ((char *) cp);
    }
    /* Four-byte encodings. */
    else if (EXPECTED(c0 <= 0xf4) /* we know that c0 >= 0xf0 */ ) {
        /* Require three more bytes before "end". */
        if (UNEXPECTED(ep && cp + 3 >= ep)
        /* Exclude over-long encodings. */
            || UNEXPECTED((ch = *++cp) < (c0 == 0xf0 ? 0x90 : 0x80))
        /* Exclude code points above U+10FFFF. */
            || UNEXPECTED(ch > (c0 == 0xf4 ? 0x8f : 0xbf))
        /* Require UTF-8 tail byte. */
            || UNEXPECTED(((ch = *++cp) & 0xc0) != 0x80)
        /* Require UTF-8 tail byte. */
            || UNEXPECTED(((ch = *++cp) & 0xc0) != 0x80))
            return (0);
        return ((char *) cp);
    }
    /* Invalid: c0 >= 0xf5 */
    else {
        return (0);
    }
}
123 
124 #undef inline
125