1 /*        $NetBSD: str.c,v 1.30 2018/05/26 11:20:30 leot Exp $        */
2 
3 /*-
4  * Copyright (c) 1991, 1993
5  *        The Regents of the University of California.  All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 3. Neither the name of the University nor the names of its contributors
16  *    may be used to endorse or promote products derived from this software
17  *    without specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  */
31 
32 #include <sys/cdefs.h>
33 #ifndef lint
34 #if 0
35 static char sccsid[] = "@(#)str.c       8.2 (Berkeley) 4/28/95";
36 #endif
37 __RCSID("$NetBSD: str.c,v 1.30 2018/05/26 11:20:30 leot Exp $");
38 #endif /* not lint */
39 
40 #include <sys/types.h>
41 
42 #include <err.h>
43 #include <errno.h>
44 #include <stddef.h>
45 #include <stdio.h>
46 #include <stdlib.h>
47 #include <string.h>
48 #include <ctype.h>
49 #include <assert.h>
50 
51 #include "extern.h"
52 
53 struct str {
54           enum { STRING1, STRING2 } which;
55           enum { EOS, INFINITE, NORMAL, RANGE, SEQUENCE, SET } state;
56           int cnt;                      /* character count */
57           int lastch;                             /* last character */
58           int equiv[2];                           /* equivalence set */
59           int *set;                     /* set of characters */
60           const char *str;              /* user's string */
61 };
62 
63 static int backslash(STR *);
64 static int bracket(STR *);
65 static int c_class(const void *, const void *);
66 static int *genclass(const char *, size_t);
67 static void genequiv(STR *);
68 static int genrange(STR *);
69 static void genseq(STR *);
70 
71 STR *
str_create(int whichstring,const char * txt)72 str_create(int whichstring, const char *txt)
73 {
74           STR *s;
75 
76           s = malloc(sizeof(*s));
77           if (s == NULL) {
78                     err(1, "Out of memory");
79           }
80 
81           s->which = whichstring == 2 ? STRING2 : STRING1;
82           s->state = NORMAL;
83           s->cnt = 0;
84           s->lastch = OOBCH;
85           s->equiv[0] = 0;
86           s->equiv[1] = OOBCH;
87           s->set = NULL;
88           s->str = txt;
89 
90           return s;
91 }
92 
93 void
str_destroy(STR * s)94 str_destroy(STR *s)
95 {
96           if (s->set != NULL && s->set != s->equiv) {
97                     free(s->set);
98           }
99           free(s);
100 }
101 
102 int
next(STR * s,int * ret)103 next(STR *s, int *ret)
104 {
105           int ch;
106 
107           switch (s->state) {
108           case EOS:
109                     *ret = s->lastch;
110                     return 0;
111           case INFINITE:
112                     *ret = s->lastch;
113                     return 1;
114           case NORMAL:
115                     ch = (unsigned char)s->str[0];
116                     switch (ch) {
117                     case '\0':
118                               s->state = EOS;
119                               *ret = s->lastch;
120                               return 0;
121                     case '\\':
122                               s->lastch = backslash(s);
123                               break;
124                     case '[':
125                               if (bracket(s)) {
126                                         return next(s, ret);
127                               }
128                               /* FALLTHROUGH */
129                     default:
130                               ++s->str;
131                               s->lastch = ch;
132                               break;
133                     }
134 
135                     /* We can start a range at any time. */
136                     if (s->str[0] == '-' && genrange(s)) {
137                               return next(s, ret);
138                     }
139                     *ret = s->lastch;
140                     return 1;
141           case RANGE:
142                     if (s->cnt == 0) {
143                               s->state = NORMAL;
144                               return next(s, ret);
145                     }
146                     s->cnt--;
147                     ++s->lastch;
148                     *ret = s->lastch;
149                     return 1;
150           case SEQUENCE:
151                     if (s->cnt == 0) {
152                               s->state = NORMAL;
153                               return next(s, ret);
154                     }
155                     s->cnt--;
156                     *ret = s->lastch;
157                     return 1;
158           case SET:
159                     s->lastch = s->set[s->cnt++];
160                     if (s->lastch == OOBCH) {
161                               s->state = NORMAL;
162                               if (s->set != s->equiv) {
163                                         free(s->set);
164                               }
165                               s->set = NULL;
166                               return next(s, ret);
167                     }
168                     *ret = s->lastch;
169                     return 1;
170           }
171           /* NOTREACHED */
172           assert(0);
173           *ret = s->lastch;
174           return 0;
175 }
176 
177 static int
bracket(STR * s)178 bracket(STR *s)
179 {
180           const char *p;
181           int *q;
182 
183           switch (s->str[1]) {
184           case ':':                               /* "[:class:]" */
185                     if ((p = strstr(s->str + 2, ":]")) == NULL)
186                               return 0;
187                     s->str += 2;
188                     q = genclass(s->str, p - s->str);
189                     s->state = SET;
190                     s->set = q;
191                     s->cnt = 0;
192                     s->str = p + 2;
193                     return 1;
194           case '=':                               /* "[=equiv=]" */
195                     if ((p = strstr(s->str + 2, "=]")) == NULL)
196                               return 0;
197                     s->str += 2;
198                     genequiv(s);
199                     s->str = p + 2;
200                     return 1;
201           default:                                /* "[\###*n]" or "[#*n]" */
202                     if ((p = strpbrk(s->str + 2, "*]")) == NULL)
203                               return 0;
204                     if (p[0] != '*' || strchr(p, ']') == NULL)
205                               return 0;
206                     s->str += 1;
207                     genseq(s);
208                     return 1;
209           }
210           /* NOTREACHED */
211 }
212 
213 typedef struct {
214           const char *name;
215           int (*func)(int);
216 } CLASS;
217 
218 static const CLASS classes[] = {
219           { "alnum",  isalnum  },
220           { "alpha",  isalpha  },
221           { "blank",  isblank  },
222           { "cntrl",  iscntrl  },
223           { "digit",  isdigit  },
224           { "graph",  isgraph  },
225           { "lower",  islower  },
226           { "print",  isprint  },
227           { "punct",  ispunct  },
228           { "space",  isspace  },
229           { "upper",  isupper  },
230           { "xdigit", isxdigit },
231 };
232 
233 typedef struct {
234           const char *name;
235           size_t len;
236 } CLASSKEY;
237 
238 static int *
genclass(const char * class,size_t len)239 genclass(const char *class, size_t len)
240 {
241           int ch;
242           const CLASS *cp;
243           CLASSKEY key;
244           int *p;
245           unsigned pos, num;
246 
247           /* Find the class */
248           key.name = class;
249           key.len = len;
250           cp = bsearch(&key, classes, __arraycount(classes), sizeof(classes[0]),
251                          c_class);
252           if (cp == NULL) {
253                     errx(1, "unknown class %.*s", (int)len, class);
254           }
255 
256           /*
257            * Figure out what characters are in the class
258            */
259 
260           num = NCHARS + 1;
261           p = malloc(num * sizeof(*p));
262           if (p == NULL) {
263                     err(1, "malloc");
264           }
265 
266           pos = 0;
267           for (ch = 0; ch < NCHARS; ch++) {
268                     if (cp->func(ch)) {
269                               p[pos++] = ch;
270                     }
271           }
272 
273           p[pos++] = OOBCH;
274           for (; pos < num; pos++) {
275                     p[pos] = 0;
276           }
277 
278           return p;
279 }
280 
281 static int
c_class(const void * av,const void * bv)282 c_class(const void *av, const void *bv)
283 {
284           const CLASSKEY *a = av;
285           const CLASS *b = bv;
286           size_t blen;
287           int r;
288 
289           blen = strlen(b->name);
290           r = strncmp(a->name, b->name, a->len);
291           if (r != 0) {
292                     return r;
293           }
294           if (a->len < blen) {
295                     /* someone gave us a prefix of the right name */
296                     return -1;
297           }
298           assert(a-> len == blen);
299           return 0;
300 }
301 
302 /*
303  * English doesn't have any equivalence classes, so for now
304  * we just syntax check and grab the character.
305  */
306 static void
genequiv(STR * s)307 genequiv(STR *s)
308 {
309           int ch;
310 
311           ch = (unsigned char)s->str[0];
312           if (ch == '\\') {
313                     s->equiv[0] = backslash(s);
314           } else {
315                     s->equiv[0] = ch;
316                     s->str++;
317           }
318           if (s->str[0] != '=') {
319                     errx(1, "Misplaced equivalence equals sign");
320           }
321           s->str++;
322           if (s->str[0] != ']') {
323                     errx(1, "Misplaced equivalence right bracket");
324           }
325           s->str++;
326 
327           s->cnt = 0;
328           s->state = SET;
329           s->set = s->equiv;
330 }
331 
332 static int
genrange(STR * s)333 genrange(STR *s)
334 {
335           int stopval;
336           const char *savestart;
337 
338           savestart = s->str++;
339           stopval = s->str[0] == '\\' ? backslash(s) : (unsigned char)*s->str++;
340           if (stopval < (unsigned char)s->lastch) {
341                     s->str = savestart;
342                     return 0;
343           }
344           s->cnt = stopval - s->lastch + 1;
345           s->state = RANGE;
346           --s->lastch;
347           return 1;
348 }
349 
350 static void
genseq(STR * s)351 genseq(STR *s)
352 {
353           char *ep;
354 
355           if (s->which == STRING1) {
356                     errx(1, "Sequences only valid in string2");
357           }
358 
359           if (*s->str == '\\') {
360                     s->lastch = backslash(s);
361           } else {
362                     s->lastch = (unsigned char)*s->str++;
363           }
364           if (*s->str != '*') {
365                     errx(1, "Misplaced sequence asterisk");
366           }
367 
368           s->str++;
369           switch (s->str[0]) {
370           case '\\':
371                     s->cnt = backslash(s);
372                     break;
373           case ']':
374                     s->cnt = 0;
375                     ++s->str;
376                     break;
377           default:
378                     if (isdigit((unsigned char)s->str[0])) {
379                               s->cnt = strtol(s->str, &ep, 0);
380                               if (*ep == ']') {
381                                         s->str = ep + 1;
382                                         break;
383                               }
384                     }
385                     errx(1, "illegal sequence count");
386                     /* NOTREACHED */
387           }
388 
389           s->state = s->cnt ? SEQUENCE : INFINITE;
390 }
391 
392 /*
393  * Translate \??? into a character.  Up to 3 octal digits, if no digits either
394  * an escape code or a literal character.
395  */
396 static int
backslash(STR * s)397 backslash(STR *s)
398 {
399           int ch, cnt, val;
400 
401           cnt = val = 0;
402           for (;;) {
403                     /* Consume the character we're already on. */
404                     s->str++;
405 
406                     /* Look at the next character. */
407                     ch = (unsigned char)s->str[0];
408                     if (!isascii(ch) || !isdigit(ch)) {
409                               break;
410                     }
411                     val = val * 8 + ch - '0';
412                     if (++cnt == 3) {
413                               /* Enough digits; consume this one and stop */
414                               ++s->str;
415                               break;
416                     }
417           }
418           if (cnt) {
419                     /* We saw digits, so return their value */
420                     if (val >= OOBCH)
421                               errx(1, "Invalid octal character value");
422                     return val;
423           }
424           if (ch == '\0') {
425                     /* \<end> -> \ */
426                     s->state = EOS;
427                     return '\\';
428           }
429 
430           /* Consume the escaped character */
431           s->str++;
432 
433           switch (ch) {
434           case 'a':                     /* escape characters */
435                     return '\7';
436           case 'b':
437                     return '\b';
438           case 'e':
439                     return '\033';
440           case 'f':
441                     return '\f';
442           case 'n':
443                     return '\n';
444           case 'r':
445                     return '\r';
446           case 't':
447                     return '\t';
448           case 'v':
449                     return '\13';
450           default:                      /* \q -> q */
451                     return ch;
452           }
453 }
454