1 /* $OpenBSD$ */
2 
3 /*
4  * Copyright (c) 2008 Nicholas Marriott <nicholas.marriott@gmail.com>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER
15  * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
16  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 #include <sys/types.h>
20 
21 #include <ctype.h>
22 #include <errno.h>
23 #include <stdlib.h>
24 #include <string.h>
25 #include <wchar.h>
26 
27 #include "compat.h"
28 #include "tmux.h"
29 
/*
 * Code points that are always given a width of two, whatever the platform
 * width function would say. The table is searched with bsearch(), so the
 * entries must stay in ascending order.
 */
static const wchar_t utf8_force_wide[] = {
	0x0261D, 0x026F9, 0x0270A, 0x0270B, 0x0270C, 0x0270D,
	0x1F1E6, 0x1F1E7, 0x1F1E8, 0x1F1E9, 0x1F1EA, 0x1F1EB, 0x1F1EC,
	0x1F1ED, 0x1F1EE, 0x1F1EF, 0x1F1F0, 0x1F1F1, 0x1F1F2, 0x1F1F3,
	0x1F1F4, 0x1F1F5, 0x1F1F6, 0x1F1F7, 0x1F1F8, 0x1F1F9, 0x1F1FA,
	0x1F1FB, 0x1F1FC, 0x1F1FD, 0x1F1FE, 0x1F1FF,
	0x1F385,
	0x1F3C2, 0x1F3C3, 0x1F3C4, 0x1F3C7, 0x1F3CA, 0x1F3CB, 0x1F3CC,
	0x1F3FB, 0x1F3FC, 0x1F3FD, 0x1F3FE, 0x1F3FF,
	0x1F442, 0x1F443, 0x1F446, 0x1F447, 0x1F448, 0x1F449, 0x1F44A,
	0x1F44B, 0x1F44C, 0x1F44D, 0x1F44E, 0x1F44F, 0x1F450,
	0x1F466, 0x1F467, 0x1F468, 0x1F469, 0x1F46B, 0x1F46C, 0x1F46D,
	0x1F46E,
	0x1F470, 0x1F471, 0x1F472, 0x1F473, 0x1F474, 0x1F475, 0x1F476,
	0x1F477, 0x1F478, 0x1F47C,
	0x1F481, 0x1F482, 0x1F483, 0x1F485, 0x1F486, 0x1F487, 0x1F48F,
	0x1F491, 0x1F4AA,
	0x1F574, 0x1F575, 0x1F57A, 0x1F590, 0x1F595, 0x1F596,
	0x1F645, 0x1F646, 0x1F647, 0x1F64B, 0x1F64C, 0x1F64D, 0x1F64E,
	0x1F64F,
	0x1F6A3, 0x1F6B4, 0x1F6B5, 0x1F6B6, 0x1F6C0, 0x1F6CC,
	0x1F90C, 0x1F90F, 0x1F918, 0x1F919, 0x1F91A, 0x1F91B, 0x1F91C,
	0x1F91D, 0x1F91E, 0x1F91F,
	0x1F926, 0x1F930, 0x1F931, 0x1F932, 0x1F933, 0x1F934, 0x1F935,
	0x1F936, 0x1F937, 0x1F938, 0x1F939, 0x1F93D, 0x1F93E,
	0x1F977, 0x1F9B5, 0x1F9B6, 0x1F9B8, 0x1F9B9, 0x1F9BB, 0x1F9CD,
	0x1F9CE, 0x1F9CF,
	0x1F9D1, 0x1F9D2, 0x1F9D3, 0x1F9D4, 0x1F9D5, 0x1F9D6, 0x1F9D7,
	0x1F9D8, 0x1F9D9, 0x1F9DA, 0x1F9DB, 0x1F9DC, 0x1F9DD,
	0x1FAC3, 0x1FAC4, 0x1FAC5,
	0x1FAF0, 0x1FAF1, 0x1FAF2, 0x1FAF3, 0x1FAF4, 0x1FAF5, 0x1FAF6,
	0x1FAF7, 0x1FAF8
};
194 
195 struct utf8_item {
196           RB_ENTRY(utf8_item) index_entry;
197           u_int                         index;
198 
199           RB_ENTRY(utf8_item) data_entry;
200           char                          data[UTF8_SIZE];
201           u_char                        size;
202 };
203 
204 static int
utf8_data_cmp(struct utf8_item * ui1,struct utf8_item * ui2)205 utf8_data_cmp(struct utf8_item *ui1, struct utf8_item *ui2)
206 {
207           if (ui1->size < ui2->size)
208                     return (-1);
209           if (ui1->size > ui2->size)
210                     return (1);
211           return (memcmp(ui1->data, ui2->data, ui1->size));
212 }
213 RB_HEAD(utf8_data_tree, utf8_item);
214 RB_GENERATE_STATIC(utf8_data_tree, utf8_item, data_entry, utf8_data_cmp);
215 static struct utf8_data_tree utf8_data_tree = RB_INITIALIZER(utf8_data_tree);
216 
217 static int
utf8_index_cmp(struct utf8_item * ui1,struct utf8_item * ui2)218 utf8_index_cmp(struct utf8_item *ui1, struct utf8_item *ui2)
219 {
220           if (ui1->index < ui2->index)
221                     return (-1);
222           if (ui1->index > ui2->index)
223                     return (1);
224           return (0);
225 }
226 RB_HEAD(utf8_index_tree, utf8_item);
227 RB_GENERATE_STATIC(utf8_index_tree, utf8_item, index_entry, utf8_index_cmp);
228 static struct utf8_index_tree utf8_index_tree = RB_INITIALIZER(utf8_index_tree);
229 
230 static u_int utf8_next_index;
231 
/*
 * Accessors for a packed utf8_char. The size occupies the five bits starting
 * at bit 24 and the width, stored plus one, sits in the bits from 29 upwards.
 * The low 24 bits are not touched by these macros and carry the payload.
 */
#define UTF8_SET_SIZE(size) (((utf8_char)(size)) << 24)
#define UTF8_SET_WIDTH(width) ((((utf8_char)(width)) + 1) << 29)

#define UTF8_GET_SIZE(uc) (((uc) >> 24) & 0x1f)
#define UTF8_GET_WIDTH(uc) (((uc) >> 29) - 1)
237 
238 /* Get a UTF-8 item from data. */
239 static struct utf8_item *
utf8_item_by_data(const u_char * data,size_t size)240 utf8_item_by_data(const u_char *data, size_t size)
241 {
242           struct utf8_item    ui;
243 
244           memcpy(ui.data, data, size);
245           ui.size = size;
246 
247           return (RB_FIND(utf8_data_tree, &utf8_data_tree, &ui));
248 }
249 
250 /* Get a UTF-8 item from data. */
251 static struct utf8_item *
utf8_item_by_index(u_int index)252 utf8_item_by_index(u_int index)
253 {
254           struct utf8_item    ui;
255 
256           ui.index = index;
257 
258           return (RB_FIND(utf8_index_tree, &utf8_index_tree, &ui));
259 }
260 
261 /* Add a UTF-8 item. */
262 static int
utf8_put_item(const u_char * data,size_t size,u_int * index)263 utf8_put_item(const u_char *data, size_t size, u_int *index)
264 {
265           struct utf8_item    *ui;
266 
267           ui = utf8_item_by_data((const unsigned char *)data, size);
268           if (ui != NULL) {
269                     *index = ui->index;
270                     log_debug("%s: found %.*s = %u", __func__, (int)size, data,
271                         *index);
272                     return (0);
273           }
274 
275           if (utf8_next_index == 0xffffff + 1)
276                     return (-1);
277 
278           ui = xcalloc(1, sizeof *ui);
279           ui->index = utf8_next_index++;
280           RB_INSERT(utf8_index_tree, &utf8_index_tree, ui);
281 
282           memcpy(ui->data, data, size);
283           ui->size = size;
284           RB_INSERT(utf8_data_tree, &utf8_data_tree, ui);
285 
286           *index = ui->index;
287           log_debug("%s: added %.*s = %u", __func__, (int)size, data, *index);
288           return (0);
289 }
290 
/* bsearch() comparison callback: orders two wchar_t values, giving -1, 0 or 1. */
static int
utf8_table_cmp(const void *vp1, const void *vp2)
{
	const wchar_t	*lhs = vp1;
	const wchar_t	*rhs = vp2;

	return ((*lhs > *rhs) - (*lhs < *rhs));
}
302 
303 /* Check if character in table. */
304 int
utf8_in_table(wchar_t find,const wchar_t * table,u_int count)305 utf8_in_table(wchar_t find, const wchar_t *table, u_int count)
306 {
307           wchar_t   *found;
308 
309           found = bsearch(&find, table, count, sizeof *table, utf8_table_cmp);
310           return (found != NULL);
311 }
312 
313 /* Get UTF-8 character from data. */
314 enum utf8_state
utf8_from_data(const struct utf8_data * ud,utf8_char * uc)315 utf8_from_data(const struct utf8_data *ud, utf8_char *uc)
316 {
317           u_int     index;
318 
319           if (ud->width > 2)
320                     fatalx("invalid UTF-8 width: %u", ud->width);
321 
322           if (ud->size > UTF8_SIZE)
323                     goto fail;
324           if (ud->size <= 3) {
325                     index = (((utf8_char)ud->data[2] << 16)|
326                                 ((utf8_char)ud->data[1] << 8)|
327                                 ((utf8_char)ud->data[0]));
328           } else if (utf8_put_item(ud->data, ud->size, &index) != 0)
329                     goto fail;
330           *uc = UTF8_SET_SIZE(ud->size)|UTF8_SET_WIDTH(ud->width)|index;
331           log_debug("%s: (%d %d %.*s) -> %08x", __func__, ud->width, ud->size,
332               (int)ud->size, ud->data, *uc);
333           return (UTF8_DONE);
334 
335 fail:
336           if (ud->width == 0)
337                     *uc = UTF8_SET_SIZE(0)|UTF8_SET_WIDTH(0);
338           else if (ud->width == 1)
339                     *uc = UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|0x20;
340           else
341                     *uc = UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|0x2020;
342           return (UTF8_ERROR);
343 }
344 
345 /* Get UTF-8 data from character. */
346 void
utf8_to_data(utf8_char uc,struct utf8_data * ud)347 utf8_to_data(utf8_char uc, struct utf8_data *ud)
348 {
349           struct utf8_item    *ui;
350           u_int                          index;
351 
352           memset(ud, 0, sizeof *ud);
353           ud->size = ud->have = UTF8_GET_SIZE(uc);
354           ud->width = UTF8_GET_WIDTH(uc);
355 
356           if (ud->size <= 3) {
357                     ud->data[2] = (uc >> 16);
358                     ud->data[1] = ((uc >> 8) & 0xff);
359                     ud->data[0] = (uc & 0xff);
360           } else {
361                     index = (uc & 0xffffff);
362                     if ((ui = utf8_item_by_index(index)) == NULL)
363                               memset(ud->data, ' ', ud->size);
364                     else
365                               memcpy(ud->data, ui->data, ud->size);
366           }
367 
368           log_debug("%s: %08x -> (%d %d %.*s)", __func__, uc, ud->width, ud->size,
369               (int)ud->size, ud->data);
370 }
371 
372 /* Get UTF-8 character from a single ASCII character. */
373 u_int
utf8_build_one(u_char ch)374 utf8_build_one(u_char ch)
375 {
376           return (UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|ch);
377 }
378 
379 /* Set a single character. */
380 void
utf8_set(struct utf8_data * ud,u_char ch)381 utf8_set(struct utf8_data *ud, u_char ch)
382 {
383           static const struct utf8_data empty = { { 0 }, 1, 1, 1 };
384 
385           memcpy(ud, &empty, sizeof *ud);
386           *ud->data = ch;
387 }
388 
389 /* Copy UTF-8 character. */
390 void
utf8_copy(struct utf8_data * to,const struct utf8_data * from)391 utf8_copy(struct utf8_data *to, const struct utf8_data *from)
392 {
393           u_int     i;
394 
395           memcpy(to, from, sizeof *to);
396 
397           for (i = to->size; i < sizeof to->data; i++)
398                     to->data[i] = '\0';
399 }
400 
401 /* Get width of Unicode character. */
402 static enum utf8_state
utf8_width(struct utf8_data * ud,int * width)403 utf8_width(struct utf8_data *ud, int *width)
404 {
405           wchar_t   wc;
406 
407           if (utf8_towc(ud, &wc) != UTF8_DONE)
408                     return (UTF8_ERROR);
409           if (utf8_in_table(wc, utf8_force_wide, nitems(utf8_force_wide))) {
410                     *width = 2;
411                     return (UTF8_DONE);
412           }
413 #ifdef HAVE_UTF8PROC
414           *width = utf8proc_wcwidth(wc);
415           log_debug("utf8proc_wcwidth(%05X) returned %d", (u_int)wc, *width);
416 #else
417           *width = wcwidth(wc);
418           log_debug("wcwidth(%05X) returned %d", (u_int)wc, *width);
419           if (*width < 0) {
420                     /*
421                      * C1 control characters are nonprintable, so they are always
422                      * zero width.
423                      */
424                     *width = (wc >= 0x80 && wc <= 0x9f) ? 0 : 1;
425           }
426 #endif
427           if (*width >= 0 && *width <= 0xff)
428                     return (UTF8_DONE);
429           return (UTF8_ERROR);
430 }
431 
432 /* Convert UTF-8 character to wide character. */
433 enum utf8_state
utf8_towc(const struct utf8_data * ud,wchar_t * wc)434 utf8_towc(const struct utf8_data *ud, wchar_t *wc)
435 {
436 #ifdef HAVE_UTF8PROC
437           switch (utf8proc_mbtowc(wc, ud->data, ud->size)) {
438 #else
439           switch (mbtowc(wc, __UNCONST(ud->data), ud->size)) {
440 #endif
441           case -1:
442                     log_debug("UTF-8 %.*s, mbtowc() %d", (int)ud->size, ud->data,
443                         errno);
444                     mbtowc(NULL, NULL, MB_CUR_MAX);
445                     return (UTF8_ERROR);
446           case 0:
447                     return (UTF8_ERROR);
448           }
449           log_debug("UTF-8 %.*s is %05X", (int)ud->size, ud->data, (u_int)*wc);
450           return (UTF8_DONE);
451 }
452 
453 /* Convert wide character to UTF-8 character. */
454 enum utf8_state
455 utf8_fromwc(wchar_t wc, struct utf8_data *ud)
456 {
457           int       size, width;
458 
459 #ifdef HAVE_UTF8PROC
460           size = utf8proc_wctomb(ud->data, wc);
461 #else
462           size = wctomb((char *)ud->data, wc);
463 #endif
464           if (size < 0) {
465                     log_debug("UTF-8 %d, wctomb() %d", wc, errno);
466                     wctomb(NULL, 0);
467                     return (UTF8_ERROR);
468           }
469           if (size == 0)
470                     return (UTF8_ERROR);
471           ud->size = ud->have = size;
472           if (utf8_width(ud, &width) == UTF8_DONE) {
473                     ud->width = width;
474                     return (UTF8_DONE);
475           }
476           return (UTF8_ERROR);
477 }
478 
479 /*
480  * Open UTF-8 sequence.
481  *
482  * 11000010-11011111 C2-DF start of 2-byte sequence
483  * 11100000-11101111 E0-EF start of 3-byte sequence
484  * 11110000-11110100 F0-F4 start of 4-byte sequence
485  */
486 enum utf8_state
487 utf8_open(struct utf8_data *ud, u_char ch)
488 {
489           memset(ud, 0, sizeof *ud);
490           if (ch >= 0xc2 && ch <= 0xdf)
491                     ud->size = 2;
492           else if (ch >= 0xe0 && ch <= 0xef)
493                     ud->size = 3;
494           else if (ch >= 0xf0 && ch <= 0xf4)
495                     ud->size = 4;
496           else
497                     return (UTF8_ERROR);
498           utf8_append(ud, ch);
499           return (UTF8_MORE);
500 }
501 
502 /* Append character to UTF-8, closing if finished. */
503 enum utf8_state
504 utf8_append(struct utf8_data *ud, u_char ch)
505 {
506           int       width;
507 
508           if (ud->have >= ud->size)
509                     fatalx("UTF-8 character overflow");
510           if (ud->size > sizeof ud->data)
511                     fatalx("UTF-8 character size too large");
512 
513           if (ud->have != 0 && (ch & 0xc0) != 0x80)
514                     ud->width = 0xff;
515 
516           ud->data[ud->have++] = ch;
517           if (ud->have != ud->size)
518                     return (UTF8_MORE);
519 
520           if (ud->width == 0xff)
521                     return (UTF8_ERROR);
522           if (utf8_width(ud, &width) != UTF8_DONE)
523                     return (UTF8_ERROR);
524           ud->width = width;
525 
526           return (UTF8_DONE);
527 }
528 
529 /*
530  * Encode len characters from src into dst, which is guaranteed to have four
531  * bytes available for each character from src (for \abc or UTF-8) plus space
532  * for \0.
533  */
534 int
535 utf8_strvis(char *dst, const char *src, size_t len, int flag)
536 {
537           struct utf8_data     ud;
538           const char                    *start = dst, *end = src + len;
539           enum utf8_state                more;
540           size_t                         i;
541 
542           while (src < end) {
543                     if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
544                               while (++src < end && more == UTF8_MORE)
545                                         more = utf8_append(&ud, *src);
546                               if (more == UTF8_DONE) {
547                                         /* UTF-8 character finished. */
548                                         for (i = 0; i < ud.size; i++)
549                                                   *dst++ = ud.data[i];
550                                         continue;
551                               }
552                               /* Not a complete, valid UTF-8 character. */
553                               src -= ud.have;
554                     }
555                     if ((flag & VIS_DQ) && src[0] == '$' && src < end - 1) {
556                               if (isalpha((u_char)src[1]) ||
557                                   src[1] == '_' ||
558                                   src[1] == '{')
559                                         *dst++ = '\\';
560                               *dst++ = '$';
561                     } else if (src < end - 1)
562                               dst = vis(dst, src[0], flag, src[1]);
563                     else if (src < end)
564                               dst = vis(dst, src[0], flag, '\0');
565                     src++;
566           }
567           *dst = '\0';
568           return (dst - start);
569 }
570 
/*
 * Same as utf8_strvis for a \0-terminated string, but the output buffer is
 * allocated here and stored in *dst. Returns the encoded length, not counting
 * the \0.
 */
int
utf8_stravis(char **dst, const char *src, int flag)
{
	size_t	 srclen = strlen(src);
	char	*out;
	int	 outlen;

	/* Worst case is four output bytes for each input byte. */
	out = xreallocarray(NULL, 4, srclen + 1);
	outlen = utf8_strvis(out, src, srclen, flag);

	/* Trim the buffer down to what was actually used. */
	*dst = xrealloc(out, outlen + 1);
	return (outlen);
}
584 
/*
 * Same as utf8_strvis with an explicit source length, but the output buffer
 * is allocated here and stored in *dst. Returns the encoded length, not
 * counting the \0.
 */
int
utf8_stravisx(char **dst, const char *src, size_t srclen, int flag)
{
	char	*out;
	int	 outlen;

	/* Worst case is four output bytes for each input byte. */
	out = xreallocarray(NULL, 4, srclen + 1);
	outlen = utf8_strvis(out, src, srclen, flag);

	/* Trim the buffer down to what was actually used. */
	*dst = xrealloc(out, outlen + 1);
	return (outlen);
}
598 
599 /* Does this string contain anything that isn't valid UTF-8? */
600 int
601 utf8_isvalid(const char *s)
602 {
603           struct utf8_data ud;
604           const char          *end;
605           enum utf8_state      more;
606 
607           end = s + strlen(s);
608           while (s < end) {
609                     if ((more = utf8_open(&ud, *s)) == UTF8_MORE) {
610                               while (++s < end && more == UTF8_MORE)
611                                         more = utf8_append(&ud, *s);
612                               if (more == UTF8_DONE)
613                                         continue;
614                               return (0);
615                     }
616                     if (*s < 0x20 || *s > 0x7e)
617                               return (0);
618                     s++;
619           }
620           return (1);
621 }
622 
623 /*
624  * Sanitize a string, changing any UTF-8 characters to '_'. Caller should free
625  * the returned string. Anything not valid printable ASCII or UTF-8 is
626  * stripped.
627  */
628 char *
629 utf8_sanitize(const char *src)
630 {
631           char                *dst = NULL;
632           size_t               n = 0;
633           enum utf8_state      more;
634           struct utf8_data ud;
635           u_int                i;
636 
637           while (*src != '\0') {
638                     dst = xreallocarray(dst, n + 1, sizeof *dst);
639                     if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
640                               while (*++src != '\0' && more == UTF8_MORE)
641                                         more = utf8_append(&ud, *src);
642                               if (more == UTF8_DONE) {
643                                         dst = xreallocarray(dst, n + ud.width,
644                                             sizeof *dst);
645                                         for (i = 0; i < ud.width; i++)
646                                                   dst[n++] = '_';
647                                         continue;
648                               }
649                               src -= ud.have;
650                     }
651                     if (*src > 0x1f && *src < 0x7f)
652                               dst[n++] = *src;
653                     else
654                               dst[n++] = '_';
655                     src++;
656           }
657           dst = xreallocarray(dst, n + 1, sizeof *dst);
658           dst[n] = '\0';
659           return (dst);
660 }
661 
662 /* Get UTF-8 buffer length. */
663 size_t
664 utf8_strlen(const struct utf8_data *s)
665 {
666           size_t    i;
667 
668           for (i = 0; s[i].size != 0; i++)
669                     /* nothing */;
670           return (i);
671 }
672 
673 /* Get UTF-8 string width. */
674 u_int
675 utf8_strwidth(const struct utf8_data *s, ssize_t n)
676 {
677           ssize_t   i;
678           u_int     width = 0;
679 
680           for (i = 0; s[i].size != 0; i++) {
681                     if (n != -1 && n == i)
682                               break;
683                     width += s[i].width;
684           }
685           return (width);
686 }
687 
688 /*
689  * Convert a string into a buffer of UTF-8 characters. Terminated by size == 0.
690  * Caller frees.
691  */
692 struct utf8_data *
693 utf8_fromcstr(const char *src)
694 {
695           struct utf8_data    *dst = NULL;
696           size_t                         n = 0;
697           enum utf8_state                more;
698 
699           while (*src != '\0') {
700                     dst = xreallocarray(dst, n + 1, sizeof *dst);
701                     if ((more = utf8_open(&dst[n], *src)) == UTF8_MORE) {
702                               while (*++src != '\0' && more == UTF8_MORE)
703                                         more = utf8_append(&dst[n], *src);
704                               if (more == UTF8_DONE) {
705                                         n++;
706                                         continue;
707                               }
708                               src -= dst[n].have;
709                     }
710                     utf8_set(&dst[n], *src);
711                     n++;
712                     src++;
713           }
714           dst = xreallocarray(dst, n + 1, sizeof *dst);
715           dst[n].size = 0;
716           return (dst);
717 }
718 
719 /* Convert from a buffer of UTF-8 characters into a string. Caller frees. */
720 char *
721 utf8_tocstr(struct utf8_data *src)
722 {
723           char      *dst = NULL;
724           size_t     n = 0;
725 
726           for(; src->size != 0; src++) {
727                     dst = xreallocarray(dst, n + src->size, 1);
728                     memcpy(dst + n, src->data, src->size);
729                     n += src->size;
730           }
731           dst = xreallocarray(dst, n + 1, 1);
732           dst[n] = '\0';
733           return (dst);
734 }
735 
736 /* Get width of UTF-8 string. */
737 u_int
738 utf8_cstrwidth(const char *s)
739 {
740           struct utf8_data    tmp;
741           u_int                         width;
742           enum utf8_state               more;
743 
744           width = 0;
745           while (*s != '\0') {
746                     if ((more = utf8_open(&tmp, *s)) == UTF8_MORE) {
747                               while (*++s != '\0' && more == UTF8_MORE)
748                                         more = utf8_append(&tmp, *s);
749                               if (more == UTF8_DONE) {
750                                         width += tmp.width;
751                                         continue;
752                               }
753                               s -= tmp.have;
754                     }
755                     if (*s > 0x1f && *s != 0x7f)
756                               width++;
757                     s++;
758           }
759           return (width);
760 }
761 
762 /* Pad UTF-8 string to width on the left. Caller frees. */
763 char *
764 utf8_padcstr(const char *s, u_int width)
765 {
766           size_t     slen;
767           char      *out;
768           u_int      n, i;
769 
770           n = utf8_cstrwidth(s);
771           if (n >= width)
772                     return (xstrdup(s));
773 
774           slen = strlen(s);
775           out = xmalloc(slen + 1 + (width - n));
776           memcpy(out, s, slen);
777           for (i = n; i < width; i++)
778                     out[slen++] = ' ';
779           out[slen] = '\0';
780           return (out);
781 }
782 
783 /* Pad UTF-8 string to width on the right. Caller frees. */
784 char *
785 utf8_rpadcstr(const char *s, u_int width)
786 {
787           size_t     slen;
788           char      *out;
789           u_int      n, i;
790 
791           n = utf8_cstrwidth(s);
792           if (n >= width)
793                     return (xstrdup(s));
794 
795           slen = strlen(s);
796           out = xmalloc(slen + 1 + (width - n));
797           for (i = 0; i < width - n; i++)
798                     out[i] = ' ';
799           memcpy(out + i, s, slen);
800           out[i + slen] = '\0';
801           return (out);
802 }
803 
804 int
805 utf8_cstrhas(const char *s, const struct utf8_data *ud)
806 {
807           struct utf8_data    *copy, *loop;
808           int                            found = 0;
809 
810           copy = utf8_fromcstr(s);
811           for (loop = copy; loop->size != 0; loop++) {
812                     if (loop->size != ud->size)
813                               continue;
814                     if (memcmp(loop->data, ud->data, loop->size) == 0) {
815                               found = 1;
816                               break;
817                     }
818           }
819           free(copy);
820 
821           return (found);
822 }
823