xref: /dragonfly/lib/libc/gen/unvis.c (revision 38bb838d867031cc472f452979fc220d01cca3fa)
1 /*        @(#)unvis.c         8.1 (Berkeley) 6/4/93         */
2 /*        $NetBSD: unvis.c,v 1.44 2014/09/26 15:43:36 roy Exp $       */
3 
4 /*-
5  * Copyright (c) 1989, 1993
6  *        The Regents of the University of California.  All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  */
32 
33 #include "namespace.h"
34 #include <sys/types.h>
35 
36 #include <assert.h>
37 #include <ctype.h>
38 #include <stdint.h>
39 #include <stdio.h>
40 #include <errno.h>
41 #include <vis.h>
42 #include "un-namespace.h"
43 
/*
 * Number of elements in the statically-allocated array __x.
 * The argument must be a real array, not a pointer.
 */
#define	__arraycount(__x)	(sizeof(__x) / sizeof(*(__x)))
49 
50 #ifdef __weak_alias
51 __weak_alias(strnunvisx,_strnunvisx)
52 #endif
53 
54 #if !HAVE_VIS
/*
 * States of the decoding state machine driven by unvis().
 */
enum {
	S_GROUND	= 0,	/* not inside an escape sequence */
	S_START		= 1,	/* escape seen, special sequence begins */
	S_META		= 2,	/* 'M' seen, metachar started */
	S_META1		= 3,	/* "M-" seen, regular char follows */
	S_CTRL		= 4,	/* '^' seen, control char started */
	S_OCTAL2	= 5,	/* waiting for octal digit 2 */
	S_OCTAL3	= 6,	/* waiting for octal digit 3 */
	S_HEX		= 7,	/* hex digit is mandatory here */
	S_HEX1		= 8,	/* http hex digit 1 */
	S_HEX2		= 9,	/* http hex digit 2 */
	S_MIME1		= 10,	/* mime hex digit 1 */
	S_MIME2		= 11,	/* mime hex digit 2 */
	S_EATCRNL	= 12,	/* mime: swallowing CR/NL */
	S_AMP		= 13,	/* '&' seen */
	S_NUMBER	= 14,	/* collecting a number */
	S_STRING	= 15	/* collecting an entity name */
};
74 
/*
 * Digit helpers.  Each macro evaluates its argument more than once, so
 * only pass expressions without side effects.
 */
/* true if c is an octal digit */
#define	isoctal(c)	\
	(((unsigned char)(c)) >= '0' && ((unsigned char)(c)) <= '7')
/* value of the hex digit c; letters may be either case */
#define	xtod(c)		\
	(isdigit(c) ? ((c) - '0') : ((tolower(c) - 'a') + 10))
/* value of the hex digit c; letters must be upper case */
#define	XTOD(c)		\
	(isdigit(c) ? ((c) - '0') : (((c) - 'A') + 10))
78 
/*
 * Named character entities of RFC 1866 and the character code each one
 * stands for.
 *
 * The entries are sorted by name in byte order.  The "&name;" decoder in
 * unvis() only ever walks this table forwards while the characters of a
 * name arrive, so new entries must keep the ordering intact.
 */
static const struct nv {
	char name[7];
	uint8_t value;
} nv[] = {
	{ .name = "AElig",  .value = 198 },	/* capital AE diphthong (ligature) */
	{ .name = "Aacute", .value = 193 },	/* capital A, acute accent */
	{ .name = "Acirc",  .value = 194 },	/* capital A, circumflex accent */
	{ .name = "Agrave", .value = 192 },	/* capital A, grave accent */
	{ .name = "Aring",  .value = 197 },	/* capital A, ring */
	{ .name = "Atilde", .value = 195 },	/* capital A, tilde */
	{ .name = "Auml",   .value = 196 },	/* capital A, dieresis or umlaut mark */
	{ .name = "Ccedil", .value = 199 },	/* capital C, cedilla */
	{ .name = "ETH",    .value = 208 },	/* capital Eth, Icelandic */
	{ .name = "Eacute", .value = 201 },	/* capital E, acute accent */
	{ .name = "Ecirc",  .value = 202 },	/* capital E, circumflex accent */
	{ .name = "Egrave", .value = 200 },	/* capital E, grave accent */
	{ .name = "Euml",   .value = 203 },	/* capital E, dieresis or umlaut mark */
	{ .name = "Iacute", .value = 205 },	/* capital I, acute accent */
	{ .name = "Icirc",  .value = 206 },	/* capital I, circumflex accent */
	{ .name = "Igrave", .value = 204 },	/* capital I, grave accent */
	{ .name = "Iuml",   .value = 207 },	/* capital I, dieresis or umlaut mark */
	{ .name = "Ntilde", .value = 209 },	/* capital N, tilde */
	{ .name = "Oacute", .value = 211 },	/* capital O, acute accent */
	{ .name = "Ocirc",  .value = 212 },	/* capital O, circumflex accent */
	{ .name = "Ograve", .value = 210 },	/* capital O, grave accent */
	{ .name = "Oslash", .value = 216 },	/* capital O, slash */
	{ .name = "Otilde", .value = 213 },	/* capital O, tilde */
	{ .name = "Ouml",   .value = 214 },	/* capital O, dieresis or umlaut mark */
	{ .name = "THORN",  .value = 222 },	/* capital THORN, Icelandic */
	{ .name = "Uacute", .value = 218 },	/* capital U, acute accent */
	{ .name = "Ucirc",  .value = 219 },	/* capital U, circumflex accent */
	{ .name = "Ugrave", .value = 217 },	/* capital U, grave accent */
	{ .name = "Uuml",   .value = 220 },	/* capital U, dieresis or umlaut mark */
	{ .name = "Yacute", .value = 221 },	/* capital Y, acute accent */
	{ .name = "aacute", .value = 225 },	/* small a, acute accent */
	{ .name = "acirc",  .value = 226 },	/* small a, circumflex accent */
	{ .name = "acute",  .value = 180 },	/* acute accent */
	{ .name = "aelig",  .value = 230 },	/* small ae diphthong (ligature) */
	{ .name = "agrave", .value = 224 },	/* small a, grave accent */
	{ .name = "amp",    .value =  38 },	/* ampersand */
	{ .name = "aring",  .value = 229 },	/* small a, ring */
	{ .name = "atilde", .value = 227 },	/* small a, tilde */
	{ .name = "auml",   .value = 228 },	/* small a, dieresis or umlaut mark */
	{ .name = "brvbar", .value = 166 },	/* broken (vertical) bar */
	{ .name = "ccedil", .value = 231 },	/* small c, cedilla */
	{ .name = "cedil",  .value = 184 },	/* cedilla */
	{ .name = "cent",   .value = 162 },	/* cent sign */
	{ .name = "copy",   .value = 169 },	/* copyright sign */
	{ .name = "curren", .value = 164 },	/* general currency sign */
	{ .name = "deg",    .value = 176 },	/* degree sign */
	{ .name = "divide", .value = 247 },	/* divide sign */
	{ .name = "eacute", .value = 233 },	/* small e, acute accent */
	{ .name = "ecirc",  .value = 234 },	/* small e, circumflex accent */
	{ .name = "egrave", .value = 232 },	/* small e, grave accent */
	{ .name = "eth",    .value = 240 },	/* small eth, Icelandic */
	{ .name = "euml",   .value = 235 },	/* small e, dieresis or umlaut mark */
	{ .name = "frac12", .value = 189 },	/* fraction one-half */
	{ .name = "frac14", .value = 188 },	/* fraction one-quarter */
	{ .name = "frac34", .value = 190 },	/* fraction three-quarters */
	{ .name = "gt",     .value =  62 },	/* greater than */
	{ .name = "iacute", .value = 237 },	/* small i, acute accent */
	{ .name = "icirc",  .value = 238 },	/* small i, circumflex accent */
	{ .name = "iexcl",  .value = 161 },	/* inverted exclamation mark */
	{ .name = "igrave", .value = 236 },	/* small i, grave accent */
	{ .name = "iquest", .value = 191 },	/* inverted question mark */
	{ .name = "iuml",   .value = 239 },	/* small i, dieresis or umlaut mark */
	{ .name = "laquo",  .value = 171 },	/* angle quotation mark, left */
	{ .name = "lt",     .value =  60 },	/* less than */
	{ .name = "macr",   .value = 175 },	/* macron */
	{ .name = "micro",  .value = 181 },	/* micro sign */
	{ .name = "middot", .value = 183 },	/* middle dot */
	{ .name = "nbsp",   .value = 160 },	/* no-break space */
	{ .name = "not",    .value = 172 },	/* not sign */
	{ .name = "ntilde", .value = 241 },	/* small n, tilde */
	{ .name = "oacute", .value = 243 },	/* small o, acute accent */
	{ .name = "ocirc",  .value = 244 },	/* small o, circumflex accent */
	{ .name = "ograve", .value = 242 },	/* small o, grave accent */
	{ .name = "ordf",   .value = 170 },	/* ordinal indicator, feminine */
	{ .name = "ordm",   .value = 186 },	/* ordinal indicator, masculine */
	{ .name = "oslash", .value = 248 },	/* small o, slash */
	{ .name = "otilde", .value = 245 },	/* small o, tilde */
	{ .name = "ouml",   .value = 246 },	/* small o, dieresis or umlaut mark */
	{ .name = "para",   .value = 182 },	/* pilcrow (paragraph sign) */
	{ .name = "plusmn", .value = 177 },	/* plus-or-minus sign */
	{ .name = "pound",  .value = 163 },	/* pound sterling sign */
	{ .name = "quot",   .value =  34 },	/* double quote */
	{ .name = "raquo",  .value = 187 },	/* angle quotation mark, right */
	{ .name = "reg",    .value = 174 },	/* registered sign */
	{ .name = "sect",   .value = 167 },	/* section sign */
	{ .name = "shy",    .value = 173 },	/* soft hyphen */
	{ .name = "sup1",   .value = 185 },	/* superscript one */
	{ .name = "sup2",   .value = 178 },	/* superscript two */
	{ .name = "sup3",   .value = 179 },	/* superscript three */
	{ .name = "szlig",  .value = 223 },	/* small sharp s, German (sz ligature) */
	{ .name = "thorn",  .value = 254 },	/* small thorn, Icelandic */
	{ .name = "times",  .value = 215 },	/* multiply sign */
	{ .name = "uacute", .value = 250 },	/* small u, acute accent */
	{ .name = "ucirc",  .value = 251 },	/* small u, circumflex accent */
	{ .name = "ugrave", .value = 249 },	/* small u, grave accent */
	{ .name = "uml",    .value = 168 },	/* umlaut (dieresis) */
	{ .name = "uuml",   .value = 252 },	/* small u, dieresis or umlaut mark */
	{ .name = "yacute", .value = 253 },	/* small y, acute accent */
	{ .name = "yen",    .value = 165 },	/* yen sign */
	{ .name = "yuml",   .value = 255 },	/* small y, dieresis or umlaut mark */
};
187 
188 /*
189  * unvis - decode characters previously encoded by vis
190  */
191 int
unvis(char * cp,int c,int * astate,int flag)192 unvis(char *cp, int c, int *astate, int flag)
193 {
194           unsigned char uc = (unsigned char)c;
195           unsigned char st, ia, is, lc;
196 
197 /*
198  * Bottom 8 bits of astate hold the state machine state.
199  * Top 8 bits hold the current character in the http 1866 nv string decoding
200  */
201 #define GS(a)                 ((a) & 0xff)
202 #define SS(a, b)    (((uint32_t)(a) << 24) | (b))
203 #define GI(a)                 ((uint32_t)(a) >> 24)
204 
205           _DIAGASSERT(cp != NULL);
206           _DIAGASSERT(astate != NULL);
207           st = GS(*astate);
208 
209           if (flag & UNVIS_END) {
210                     switch (st) {
211                     case S_OCTAL2:
212                     case S_OCTAL3:
213                     case S_HEX2:
214                               *astate = SS(0, S_GROUND);
215                               return UNVIS_VALID;
216                     case S_GROUND:
217                               return UNVIS_NOCHAR;
218                     default:
219                               return UNVIS_SYNBAD;
220                     }
221           }
222 
223           switch (st) {
224 
225           case S_GROUND:
226                     *cp = 0;
227                     if ((flag & VIS_NOESCAPE) == 0 && c == '\\') {
228                               *astate = SS(0, S_START);
229                               return UNVIS_NOCHAR;
230                     }
231                     if ((flag & VIS_HTTP1808) && c == '%') {
232                               *astate = SS(0, S_HEX1);
233                               return UNVIS_NOCHAR;
234                     }
235                     if ((flag & VIS_HTTP1866) && c == '&') {
236                               *astate = SS(0, S_AMP);
237                               return UNVIS_NOCHAR;
238                     }
239                     if ((flag & VIS_MIMESTYLE) && c == '=') {
240                               *astate = SS(0, S_MIME1);
241                               return UNVIS_NOCHAR;
242                     }
243                     *cp = c;
244                     return UNVIS_VALID;
245 
246           case S_START:
247                     switch(c) {
248                     case '-':
249                               *cp = 0;
250                               *astate = SS(0, S_GROUND);
251                               return UNVIS_NOCHAR;
252                     case '\\':
253                               *cp = c;
254                               *astate = SS(0, S_GROUND);
255                               return UNVIS_VALID;
256                     case '0': case '1': case '2': case '3':
257                     case '4': case '5': case '6': case '7':
258                               *cp = (c - '0');
259                               *astate = SS(0, S_OCTAL2);
260                               return UNVIS_NOCHAR;
261                     case 'M':
262                               *cp = (char)0200;
263                               *astate = SS(0, S_META);
264                               return UNVIS_NOCHAR;
265                     case '^':
266                               *astate = SS(0, S_CTRL);
267                               return UNVIS_NOCHAR;
268                     case 'n':
269                               *cp = '\n';
270                               *astate = SS(0, S_GROUND);
271                               return UNVIS_VALID;
272                     case 'r':
273                               *cp = '\r';
274                               *astate = SS(0, S_GROUND);
275                               return UNVIS_VALID;
276                     case 'b':
277                               *cp = '\b';
278                               *astate = SS(0, S_GROUND);
279                               return UNVIS_VALID;
280                     case 'a':
281                               *cp = '\007';
282                               *astate = SS(0, S_GROUND);
283                               return UNVIS_VALID;
284                     case 'v':
285                               *cp = '\v';
286                               *astate = SS(0, S_GROUND);
287                               return UNVIS_VALID;
288                     case 't':
289                               *cp = '\t';
290                               *astate = SS(0, S_GROUND);
291                               return UNVIS_VALID;
292                     case 'f':
293                               *cp = '\f';
294                               *astate = SS(0, S_GROUND);
295                               return UNVIS_VALID;
296                     case 's':
297                               *cp = ' ';
298                               *astate = SS(0, S_GROUND);
299                               return UNVIS_VALID;
300                     case 'E':
301                               *cp = '\033';
302                               *astate = SS(0, S_GROUND);
303                               return UNVIS_VALID;
304                     case 'x':
305                               *astate = SS(0, S_HEX);
306                               return UNVIS_NOCHAR;
307                     case '\n':
308                               /*
309                                * hidden newline
310                                */
311                               *astate = SS(0, S_GROUND);
312                               return UNVIS_NOCHAR;
313                     case '$':
314                               /*
315                                * hidden marker
316                                */
317                               *astate = SS(0, S_GROUND);
318                               return UNVIS_NOCHAR;
319                     default:
320                               if (isgraph(c)) {
321                                         *cp = c;
322                                         *astate = SS(0, S_GROUND);
323                                         return UNVIS_VALID;
324                               }
325                     }
326                     goto bad;
327 
328           case S_META:
329                     if (c == '-')
330                               *astate = SS(0, S_META1);
331                     else if (c == '^')
332                               *astate = SS(0, S_CTRL);
333                     else
334                               goto bad;
335                     return UNVIS_NOCHAR;
336 
337           case S_META1:
338                     *astate = SS(0, S_GROUND);
339                     *cp |= c;
340                     return UNVIS_VALID;
341 
342           case S_CTRL:
343                     if (c == '?')
344                               *cp |= 0177;
345                     else
346                               *cp |= c & 037;
347                     *astate = SS(0, S_GROUND);
348                     return UNVIS_VALID;
349 
350           case S_OCTAL2:      /* second possible octal digit */
351                     if (isoctal(uc)) {
352                               /*
353                                * yes - and maybe a third
354                                */
355                               *cp = (*cp << 3) + (c - '0');
356                               *astate = SS(0, S_OCTAL3);
357                               return UNVIS_NOCHAR;
358                     }
359                     /*
360                      * no - done with current sequence, push back passed char
361                      */
362                     *astate = SS(0, S_GROUND);
363                     return UNVIS_VALIDPUSH;
364 
365           case S_OCTAL3:      /* third possible octal digit */
366                     *astate = SS(0, S_GROUND);
367                     if (isoctal(uc)) {
368                               *cp = (*cp << 3) + (c - '0');
369                               return UNVIS_VALID;
370                     }
371                     /*
372                      * we were done, push back passed char
373                      */
374                     return UNVIS_VALIDPUSH;
375 
376           case S_HEX:
377                     if (!isxdigit(uc))
378                               goto bad;
379                     /*FALLTHROUGH*/
380           case S_HEX1:
381                     if (isxdigit(uc)) {
382                               *cp = xtod(uc);
383                               *astate = SS(0, S_HEX2);
384                               return UNVIS_NOCHAR;
385                     }
386                     /*
387                      * no - done with current sequence, push back passed char
388                      */
389                     *astate = SS(0, S_GROUND);
390                     return UNVIS_VALIDPUSH;
391 
392           case S_HEX2:
393                     *astate = S_GROUND;
394                     if (isxdigit(uc)) {
395                               *cp = xtod(uc) | (*cp << 4);
396                               return UNVIS_VALID;
397                     }
398                     return UNVIS_VALIDPUSH;
399 
400           case S_MIME1:
401                     if (uc == '\n' || uc == '\r') {
402                               *astate = SS(0, S_EATCRNL);
403                               return UNVIS_NOCHAR;
404                     }
405                     if (isxdigit(uc) && (isdigit(uc) || isupper(uc))) {
406                               *cp = XTOD(uc);
407                               *astate = SS(0, S_MIME2);
408                               return UNVIS_NOCHAR;
409                     }
410                     goto bad;
411 
412           case S_MIME2:
413                     if (isxdigit(uc) && (isdigit(uc) || isupper(uc))) {
414                               *astate = SS(0, S_GROUND);
415                               *cp = XTOD(uc) | (*cp << 4);
416                               return UNVIS_VALID;
417                     }
418                     goto bad;
419 
420           case S_EATCRNL:
421                     switch (uc) {
422                     case '\r':
423                     case '\n':
424                               return UNVIS_NOCHAR;
425                     case '=':
426                               *astate = SS(0, S_MIME1);
427                               return UNVIS_NOCHAR;
428                     default:
429                               *cp = uc;
430                               *astate = SS(0, S_GROUND);
431                               return UNVIS_VALID;
432                     }
433 
434           case S_AMP:
435                     *cp = 0;
436                     if (uc == '#') {
437                               *astate = SS(0, S_NUMBER);
438                               return UNVIS_NOCHAR;
439                     }
440                     *astate = SS(0, S_STRING);
441                     /*FALLTHROUGH*/
442 
443           case S_STRING:
444                     ia = *cp;           /* index in the array */
445                     is = GI(*astate);   /* index in the string */
446                     lc = is == 0 ? 0 : nv[ia].name[is - 1]; /* last character */
447 
448                     if (uc == ';')
449                               uc = '\0';
450 
451                     for (; ia < __arraycount(nv); ia++) {
452                               if (is != 0 && nv[ia].name[is - 1] != lc)
453                                         goto bad;
454                               if (nv[ia].name[is] == uc)
455                                         break;
456                     }
457 
458                     if (ia == __arraycount(nv))
459                               goto bad;
460 
461                     if (uc != 0) {
462                               *cp = ia;
463                               *astate = SS(is + 1, S_STRING);
464                               return UNVIS_NOCHAR;
465                     }
466 
467                     *cp = nv[ia].value;
468                     *astate = SS(0, S_GROUND);
469                     return UNVIS_VALID;
470 
471           case S_NUMBER:
472                     if (uc == ';')
473                               return UNVIS_VALID;
474                     if (!isdigit(uc))
475                               goto bad;
476                     *cp += (*cp * 10) + uc - '0';
477                     return UNVIS_NOCHAR;
478 
479           default:
480           bad:
481                     /*
482                      * decoder in unknown state - (probably uninitialized)
483                      */
484                     *astate = SS(0, S_GROUND);
485                     return UNVIS_SYNBAD;
486           }
487 }
488 
489 /*
490  * strnunvisx - decode src into dst
491  *
492  *        Number of chars decoded into dst is returned, -1 on error.
493  *        Dst is null terminated.
494  */
495 
496 int
strnunvisx(char * dst,size_t dlen,const char * src,int flag)497 strnunvisx(char *dst, size_t dlen, const char *src, int flag)
498 {
499           char c;
500           char t = '\0', *start = dst;
501           int state = 0;
502 
503           _DIAGASSERT(src != NULL);
504           _DIAGASSERT(dst != NULL);
505 #define CHECKSPACE() \
506           do { \
507                     if (dlen-- == 0) { \
508                               errno = ENOSPC; \
509                               return -1; \
510                     } \
511           } while (/*CONSTCOND*/0)
512 
513           while ((c = *src++) != '\0') {
514  again:
515                     switch (unvis(&t, c, &state, flag)) {
516                     case UNVIS_VALID:
517                               CHECKSPACE();
518                               *dst++ = t;
519                               break;
520                     case UNVIS_VALIDPUSH:
521                               CHECKSPACE();
522                               *dst++ = t;
523                               goto again;
524                     case 0:
525                     case UNVIS_NOCHAR:
526                               break;
527                     case UNVIS_SYNBAD:
528                               errno = EINVAL;
529                               return -1;
530                     default:
531                               _DIAGASSERT(/*CONSTCOND*/0);
532                               errno = EINVAL;
533                               return -1;
534                     }
535           }
536           if (unvis(&t, c, &state, UNVIS_END) == UNVIS_VALID) {
537                     CHECKSPACE();
538                     *dst++ = t;
539           }
540           CHECKSPACE();
541           *dst = '\0';
542           return (int)(dst - start);
543 }
544 
/*
 * strunvisx - strnunvisx() with no limit placed on the size of dst
 */
int
strunvisx(char *dst, const char *src, int flag)
{
	const size_t unbounded = ~(size_t)0;

	return strnunvisx(dst, unbounded, src, flag);
}
550 
/*
 * strunvis - strnunvisx() with no flags and no limit on the size of dst
 */
int
strunvis(char *dst, const char *src)
{
	const size_t unbounded = ~(size_t)0;
	const int noflags = 0;

	return strnunvisx(dst, unbounded, src, noflags);
}
556 
/*
 * strnunvis - strnunvisx() with no flags; dst holds at most dlen bytes
 */
int
strnunvis(char *dst, size_t dlen, const char *src)
{
	const int noflags = 0;

	return strnunvisx(dst, dlen, src, noflags);
}
562 #endif
563