xref: /trueos/contrib/jansson/src/utf.c (revision 47b2a07a74bf982bec730c936e92feeef05c7575)
1 /*
2  * Copyright (c) 2009-2014 Petri Lehtinen <petri@digip.org>
3  *
4  * Jansson is free software; you can redistribute it and/or modify
5  * it under the terms of the MIT license. See LICENSE for details.
6  */
7 
8 #include <string.h>
9 #include "utf.h"
10 
/*
 * Encode a single code point as UTF-8.
 *
 * codepoint: value to encode; must lie in 0 .. 0x10FFFF.
 * buffer:    destination for the encoded bytes; up to 4 bytes are written.
 * size:      receives the number of bytes written (1 to 4).
 *
 * Returns 0 on success. Returns -1 when codepoint is negative or above
 * 0x10FFFF; in that case neither buffer nor *size is touched.
 *
 * Note that values in the surrogate range 0xD800 .. 0xDFFF are not
 * filtered out here; they are encoded as ordinary 3-byte sequences.
 */
int utf8_encode(int32_t codepoint, char *buffer, size_t *size)
{
    size_t len;

    if(codepoint < 0 || codepoint > 0x10FFFF)
        return -1;

    if(codepoint < 0x80)
    {
        /* 7 bits: plain ASCII */
        buffer[0] = (char)codepoint;
        len = 1;
    }
    else if(codepoint < 0x800)
    {
        /* 11 bits: 110xxxxx 10xxxxxx */
        buffer[0] = 0xC0 | (codepoint >> 6);
        buffer[1] = 0x80 | (codepoint & 0x3F);
        len = 2;
    }
    else if(codepoint < 0x10000)
    {
        /* 16 bits: 1110xxxx 10xxxxxx 10xxxxxx */
        buffer[0] = 0xE0 | (codepoint >> 12);
        buffer[1] = 0x80 | ((codepoint >> 6) & 0x3F);
        buffer[2] = 0x80 | (codepoint & 0x3F);
        len = 3;
    }
    else
    {
        /* 21 bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
        buffer[0] = 0xF0 | (codepoint >> 18);
        buffer[1] = 0x80 | ((codepoint >> 12) & 0x3F);
        buffer[2] = 0x80 | ((codepoint >> 6) & 0x3F);
        buffer[3] = 0x80 | (codepoint & 0x3F);
        len = 4;
    }

    *size = len;
    return 0;
}
46 
/*
 * Classify the first byte of a UTF-8 sequence.
 *
 * Returns the total length in bytes (1 to 4) of the sequence that the
 * given byte starts, or 0 if the byte cannot start a valid sequence.
 */
size_t utf8_check_first(char byte)
{
    /* Work on the unsigned value so the comparisons below do not depend
       on the signedness of plain char. */
    unsigned char lead = (unsigned char)byte;

    /* 0x00 .. 0x7F: single-byte (ASCII) */
    if(lead < 0x80)
        return 1;

    /* 0x80 .. 0xBF: continuation byte, i.e. the second, third or fourth
       byte of a multi-byte sequence;
       0xC0 .. 0xC1: would start an overlong encoding of an ASCII byte */
    if(lead < 0xC2)
        return 0;

    /* 0xC2 .. 0xDF: 2-byte sequence */
    if(lead < 0xE0)
        return 2;

    /* 0xE0 .. 0xEF: 3-byte sequence */
    if(lead < 0xF0)
        return 3;

    /* 0xF0 .. 0xF4: 4-byte sequence */
    if(lead < 0xF5)
        return 4;

    /* 0xF5 and above: restricted (start of 4-, 5- or 6-byte sequence)
       or invalid UTF-8 */
    return 0;
}
82 
/*
 * Validate one complete multi-byte UTF-8 sequence and decode it.
 *
 * buffer:    points at the first byte of the sequence.
 * size:      length of the sequence in bytes; only 2, 3 and 4 are
 *            accepted, any other value yields 0.
 * codepoint: if non-NULL, receives the decoded value on success. It is
 *            left untouched on failure.
 *
 * Returns 1 if the sequence is valid, 0 otherwise. A sequence is rejected
 * when any trailing byte is not a continuation byte, when the decoded
 * value is above 0x10FFFF, when it is a UTF-16 surrogate half
 * (0xD800 .. 0xDFFF), or when it is an overlong encoding.
 *
 * The first byte is only masked, not validated.
 */
size_t utf8_check_full(const char *buffer, size_t size, int32_t *codepoint)
{
    unsigned char byte = (unsigned char)buffer[0];
    int32_t decoded;
    int32_t smallest; /* lowest value that needs `size` bytes */
    size_t pos;

    /* Extract the payload bits of the first byte and remember the
       minimum value that legitimately requires this many bytes. */
    switch(size)
    {
        case 2:
            decoded = byte & 0x1F;
            smallest = 0x80;
            break;
        case 3:
            decoded = byte & 0x0F;
            smallest = 0x800;
            break;
        case 4:
            decoded = byte & 0x07;
            smallest = 0x10000;
            break;
        default:
            return 0;
    }

    /* Fold in six payload bits from each trailing byte. */
    for(pos = 1; pos < size; pos++)
    {
        byte = (unsigned char)buffer[pos];

        /* continuation bytes look like 10xxxxxx */
        if((byte & 0xC0) != 0x80)
            return 0;

        decoded = (decoded << 6) | (byte & 0x3F);
    }

    /* not in Unicode range */
    if(decoded > 0x10FFFF)
        return 0;

    /* invalid code point (UTF-16 surrogate halves) */
    if(decoded >= 0xD800 && decoded <= 0xDFFF)
        return 0;

    /* overlong encoding */
    if(decoded < smallest)
        return 0;

    if(codepoint)
        *codepoint = decoded;

    return 1;
}
138 
/*
 * Decode the UTF-8 sequence at the start of buffer and step past it.
 *
 * buffer:    input bytes.
 * bufsize:   number of bytes available in buffer.
 * codepoint: if non-NULL, receives the decoded value on success.
 *
 * Returns a pointer to the byte following the decoded sequence. If bufsize
 * is 0, buffer itself is returned and *codepoint is not written. Returns
 * NULL if the first byte cannot start a sequence, if the sequence would
 * extend past bufsize, or if utf8_check_full() rejects it.
 */
const char *utf8_iterate(const char *buffer, size_t bufsize, int32_t *codepoint)
{
    size_t len;
    int32_t decoded;

    if(bufsize == 0)
        return buffer;

    len = utf8_check_first(buffer[0]);
    if(len == 0)
        return NULL;

    if(len == 1)
    {
        /* single byte: the byte value is the code point */
        decoded = (unsigned char)buffer[0];
    }
    else if(len > bufsize || !utf8_check_full(buffer, len, &decoded))
    {
        /* truncated or malformed multi-byte sequence */
        return NULL;
    }

    if(codepoint)
        *codepoint = decoded;

    return buffer + len;
}
164 
/*
 * Check that a byte string of known length is valid UTF-8.
 *
 * string: bytes to validate.
 * length: number of bytes to examine.
 *
 * Returns 1 if all `length` bytes form valid sequences (an empty string is
 * valid), 0 as soon as an invalid or truncated sequence is found.
 */
int utf8_check_string(const char *string, size_t length)
{
    size_t pos = 0;

    while(pos < length)
    {
        size_t len = utf8_check_first(string[pos]);

        if(len == 0)
            return 0;

        if(len > 1)
        {
            /* the sequence must fit in what is left of the input;
               pos < length here, so the subtraction cannot wrap */
            if(len > length - pos)
                return 0;

            if(!utf8_check_full(string + pos, len, NULL))
                return 0;
        }

        /* step over the whole sequence */
        pos += len;
    }

    return 1;
}
188