1 /*        $NetBSD: normalize.c,v 1.3 2023/06/19 21:41:45 christos Exp $         */
2 
3 /*
4  * Copyright (c) 2004 Kungliga Tekniska Högskolan
5  * (Royal Institute of Technology, Stockholm, Sweden).
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  *
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * 3. Neither the name of the Institute nor the names of its contributors
20  *    may be used to endorse or promote products derived from this software
21  *    without specific prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE INSTITUTE AND CONTRIBUTORS ``AS IS'' AND
24  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26  * ARE DISCLAIMED.  IN NO EVENT SHALL THE INSTITUTE OR CONTRIBUTORS BE LIABLE
27  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 
36 #ifdef HAVE_CONFIG_H
37 #include <config.h>
38 #endif
39 #include "windlocl.h"
40 
41 #include <assert.h>
42 #include <stdlib.h>
43 #include <errno.h>
44 #include <stdio.h>
45 
46 #include <krb5/roken.h>
47 
48 #include "normalize_table.h"
49 
50 static int
translation_cmp(const void * key,const void * data)51 translation_cmp(const void *key, const void *data)
52 {
53     const struct translation *t1 = (const struct translation *)key;
54     const struct translation *t2 = (const struct translation *)data;
55 
56     return t1->key - t2->key;
57 }
58 
59 enum { s_base  = 0xAC00};
60 enum { s_count = 11172};
61 enum { l_base  = 0x1100};
62 enum { l_count = 19};
63 enum { v_base  = 0x1161};
64 enum { v_count = 21};
65 enum { t_base  = 0x11A7};
66 enum { t_count = 28};
67 enum { n_count = v_count * t_count};
68 
69 static int
hangul_decomp(const uint32_t * in,size_t in_len,uint32_t * out,size_t * out_len)70 hangul_decomp(const uint32_t *in, size_t in_len,
71                 uint32_t *out, size_t *out_len)
72 {
73     uint32_t u = *in;
74     unsigned s_index;
75     unsigned l, v, t;
76     unsigned o;
77 
78     if (u < s_base || u >= s_base + s_count)
79           return 0;
80     s_index = u - s_base;
81     l = l_base + s_index / n_count;
82     v = v_base + (s_index % n_count) / t_count;
83     t = t_base + s_index % t_count;
84     o = 2;
85     if (t != t_base)
86           ++o;
87     if (*out_len < o)
88           return WIND_ERR_OVERRUN;
89     out[0] = l;
90     out[1] = v;
91     if (t != t_base)
92           out[2] = t;
93     *out_len = o;
94     return 1;
95 }
96 
97 static uint32_t
hangul_composition(const uint32_t * in,size_t in_len)98 hangul_composition(const uint32_t *in, size_t in_len)
99 {
100     if (in_len < 2)
101           return 0;
102     if (in[0] >= l_base && in[0] < l_base + l_count) {
103           unsigned l_index = in[0] - l_base;
104           unsigned v_index;
105 
106           if (in[1] < v_base || in[1] >= v_base + v_count)
107               return 0;
108           v_index = in[1] - v_base;
109           return (l_index * v_count + v_index) * t_count + s_base;
110     } else if (in[0] >= s_base && in[0] < s_base + s_count) {
111           unsigned s_index = in[0] - s_base;
112           unsigned t_index;
113 
114           if (s_index % t_count != 0)
115               return 0;
116           if (in[1] < t_base || in[1] >= t_base + t_count)
117               return 0;
118           t_index = in[1] - t_base;
119           return in[0] + t_index;
120     }
121     return 0;
122 }
123 
124 static int
compat_decomp(const uint32_t * in,size_t in_len,uint32_t * out,size_t * out_len)125 compat_decomp(const uint32_t *in, size_t in_len,
126                 uint32_t *out, size_t *out_len)
127 {
128     unsigned i;
129     unsigned o = 0;
130 
131     for (i = 0; i < in_len; ++i) {
132           struct translation ts = {in[i], 0, 0};
133           size_t sub_len = *out_len - o;
134           int ret;
135 
136           ret = hangul_decomp(in + i, in_len - i,
137                                   out + o, &sub_len);
138           if (ret) {
139               if (ret == WIND_ERR_OVERRUN)
140                     return ret;
141               o += sub_len;
142           } else {
143               void *s = bsearch(&ts,
144                                     _wind_normalize_table,
145                                     _wind_normalize_table_size,
146                                     sizeof(_wind_normalize_table[0]),
147                                     translation_cmp);
148               if (s != NULL) {
149                     const struct translation *t = (const struct translation *)s;
150 
151                     ret = compat_decomp(_wind_normalize_val_table + t->val_offset,
152                                             t->val_len,
153                                             out + o, &sub_len);
154                     if (ret)
155                         return ret;
156                     o += sub_len;
157               } else {
158                     if (o >= *out_len)
159                         return WIND_ERR_OVERRUN;
160                     out[o++] = in[i];
161 
162               }
163           }
164     }
165     *out_len = o;
166     return 0;
167 }
168 
169 static void
swap_char(uint32_t * a,uint32_t * b)170 swap_char(uint32_t * a, uint32_t * b)
171 {
172     uint32_t t;
173     t = *a;
174     *a = *b;
175     *b = t;
176 }
177 
178 /* Unicode 5.2.0 D109 Canonical Ordering for a sequence of code points
179  * that all have Canonical_Combining_Class > 0 */
180 static void
canonical_reorder_sequence(uint32_t * a,size_t len)181 canonical_reorder_sequence(uint32_t * a, size_t len)
182 {
183     size_t i, j;
184 
185     if (len <= 1)
186           return;
187 
188     for (i = 1; i < len; i++) {
189           for (j = i;
190                j > 0 &&
191                      _wind_combining_class(a[j]) < _wind_combining_class(a[j-1]);
192                j--)
193               swap_char(&a[j], &a[j-1]);
194     }
195 }
196 
197 static void
canonical_reorder(uint32_t * tmp,size_t tmp_len)198 canonical_reorder(uint32_t *tmp, size_t tmp_len)
199 {
200     size_t i;
201 
202     for (i = 0; i < tmp_len; ++i) {
203           int cc = _wind_combining_class(tmp[i]);
204           if (cc) {
205               size_t j;
206               for (j = i + 1;
207                      j < tmp_len && _wind_combining_class(tmp[j]);
208                      ++j)
209                     ;
210               canonical_reorder_sequence(&tmp[i], j - i);
211               i = j;
212           }
213     }
214 }
215 
216 static uint32_t
find_composition(const uint32_t * in,unsigned in_len)217 find_composition(const uint32_t *in, unsigned in_len)
218 {
219     unsigned short canon_index = 0;
220     uint32_t cur;
221     unsigned n = 0;
222 
223     cur = hangul_composition(in, in_len);
224     if (cur)
225           return cur;
226 
227     do {
228           const struct canon_node *c = &_wind_canon_table[canon_index];
229           unsigned i;
230 
231           if (n % 5 == 0) {
232               if (in_len-- == 0)
233                     return c->val;
234               cur = *in++;
235           }
236 
237           i = cur >> 16;
238           if (i < c->next_start || i >= c->next_end)
239               canon_index = 0;
240           else
241               canon_index =
242                     _wind_canon_next_table[c->next_offset + i - c->next_start];
243           if (canon_index != 0) {
244               cur = (cur << 4) & 0xFFFFF;
245               ++n;
246           }
247     } while (canon_index != 0);
248     return 0;
249 }
250 
251 static int
combine(const uint32_t * in,size_t in_len,uint32_t * out,size_t * out_len)252 combine(const uint32_t *in, size_t in_len,
253           uint32_t *out, size_t *out_len)
254 {
255     unsigned i;
256     int ostarter;
257     unsigned o = 0;
258     int old_cc;
259 
260     for (i = 0; i < in_len;) {
261           while (i < in_len && _wind_combining_class(in[i]) != 0) {
262               out[o++] = in[i++];
263           }
264           if (i < in_len) {
265               if (o >= *out_len)
266                     return WIND_ERR_OVERRUN;
267               ostarter = o;
268               out[o++] = in[i++];
269               old_cc   = -1;
270 
271               while (i < in_len) {
272                     uint32_t comb;
273                     uint32_t v[2];
274                     int cc;
275 
276                     v[0] = out[ostarter];
277                     v[1] = in[i];
278 
279                     cc = _wind_combining_class(in[i]);
280                     if (old_cc != cc && (comb = find_composition(v, 2))) {
281                         out[ostarter] = comb;
282                     } else if (cc == 0) {
283                         break;
284                     } else {
285                         if (o >= *out_len)
286                               return WIND_ERR_OVERRUN;
287                         out[o++] = in[i];
288                         old_cc   = cc;
289                     }
290                     ++i;
291               }
292           }
293     }
294     *out_len = o;
295     return 0;
296 }
297 
298 int
_wind_stringprep_normalize(const uint32_t * in,size_t in_len,uint32_t * out,size_t * out_len)299 _wind_stringprep_normalize(const uint32_t *in, size_t in_len,
300                                  uint32_t *out, size_t *out_len)
301 {
302     size_t tmp_len;
303     uint32_t *tmp;
304     int ret;
305 
306     if (in_len == 0) {
307           *out_len = 0;
308           return 0;
309     }
310 
311     tmp_len = in_len * 4;
312     if (tmp_len < MAX_LENGTH_CANON)
313           tmp_len = MAX_LENGTH_CANON;
314     tmp = malloc(tmp_len * sizeof(uint32_t));
315     if (tmp == NULL)
316           return ENOMEM;
317 
318     ret = compat_decomp(in, in_len, tmp, &tmp_len);
319     if (ret) {
320           free(tmp);
321           return ret;
322     }
323     canonical_reorder(tmp, tmp_len);
324     ret = combine(tmp, tmp_len, out, out_len);
325     free(tmp);
326     return ret;
327 }
328