xref: /dragonfly/contrib/awk/b.c (revision e2ee60a4f1757f9ded9e1041053222b631f387b6)
1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
3 All Rights Reserved
4 
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
13 permission.
14 
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 THIS SOFTWARE.
23 ****************************************************************/
24 
25 /* lasciate ogne speranza, voi ch'intrate. */
26 
27 #define   DEBUG
28 
29 #include <ctype.h>
30 #include <limits.h>
31 #include <stdio.h>
32 #include <string.h>
33 #include <stdlib.h>
34 #include "awk.h"
35 #include "awkgram.tab.h"
36 
37 #define MAXLIN 22
38 
39 #define type(v)               (v)->nobj /* badly overloaded here */
40 #define info(v)               (v)->ntype          /* badly overloaded here */
41 #define left(v)               (v)->narg[0]
42 #define right(v)    (v)->narg[1]
43 #define parent(v)   (v)->nnext
44 
45 #define LEAF        case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
46 #define ELEAF       case EMPTYRE:                 /* empty string in regexp */
47 #define UNARY       case STAR: case PLUS: case QUEST:
48 
49 /* encoding in tree Nodes:
50           leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE):
51                     left is index, right contains value or pointer to value
52           unary (STAR, PLUS, QUEST): left is child, right is null
53           binary (CAT, OR): left and right are children
54           parent contains pointer to parent
55 */
56 
57 
58 int       *setvec;
59 int       *tmpset;
60 int       maxsetvec = 0;
61 
62 int       rtok;               /* next token in current re */
63 int       rlxval;
64 static const uschar *rlxstr;
65 static const uschar *prestr;  /* current position in current re */
66 static const uschar *lastre;  /* origin of last re */
67 static const uschar *lastatom;          /* origin of last Atom */
68 static const uschar *starttok;
69 static const uschar           *basestr; /* starts with original, replaced during
70                                            repetition processing */
71 static const uschar           *firstbasestr;
72 
73 static    int setcnt;
74 static    int poscnt;
75 
76 const char          *patbeg;
77 int       patlen;
78 
79 #define   NFA       128       /* cache this many dynamic fa's */
80 fa        *fatab[NFA];
81 int       nfatab    = 0;      /* entries in fatab */
82 
83 extern int u8_nextlen(const char *s);
84 
85 
86 /* utf-8 mechanism:
87 
88    For most of Awk, utf-8 strings just "work", since they look like
89    null-terminated sequences of 8-bit bytes.
90 
91    Functions like length(), index(), and substr() have to operate
92    in units of utf-8 characters.  The u8_* functions in run.c
93    handle this.
94 
95    Regular expressions are more complicated, since the basic
96    mechanism of the goto table used 8-bit byte indices into the
97    gototab entries to compute the next state.  Unicode is a lot
98    bigger, so the gototab entries are now structs with a character
99    and a next state. These are sorted by code point and binary
100    searched.
101 
102    Throughout the RE mechanism in b.c, utf-8 characters are
103    converted to their utf-32 value.  This mostly shows up in
104    cclenter, which expands character class ranges like a-z and now
105    alpha-omega.  The size of a gototab array is still about 256.
106    This should be dynamic, but for now things work ok for a single
107    code page of Unicode, which is the most likely case.
108 
109    The code changes are localized in run.c and b.c.  I have added a
110    handful of functions to somewhat better hide the implementation,
111    but a lot more could be done.
112 
113  */
114 
115 static int entry_cmp(const void *l, const void *r);
116 static int get_gototab(fa*, int, int);
117 static int set_gototab(fa*, int, int, int);
118 static void clear_gototab(fa*, int);
119 extern int u8_rune(int *, const char *);
120 
121 static int *
intalloc(size_t n,const char * f)122 intalloc(size_t n, const char *f)
123 {
124           int *p = (int *) calloc(n, sizeof(int));
125           if (p == NULL)
126                     overflo(f);
127           return p;
128 }
129 
130 static void
resizesetvec(const char * f)131 resizesetvec(const char *f)
132 {
133           if (maxsetvec == 0)
134                     maxsetvec = MAXLIN;
135           else
136                     maxsetvec *= 4;
137           setvec = (int *) realloc(setvec, maxsetvec * sizeof(*setvec));
138           tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(*tmpset));
139           if (setvec == NULL || tmpset == NULL)
140                     overflo(f);
141 }
142 
143 static void
resize_state(fa * f,int state)144 resize_state(fa *f, int state)
145 {
146           gtt *p;
147           uschar *p2;
148           int **p3;
149           int i, new_count;
150 
151           if (++state < f->state_count)
152                     return;
153 
154           new_count = state + 10; /* needs to be tuned */
155 
156           p = (gtt *) realloc(f->gototab, new_count * sizeof(gtt));
157           if (p == NULL)
158                     goto out;
159           f->gototab = p;
160 
161           p2 = (uschar *) realloc(f->out, new_count * sizeof(f->out[0]));
162           if (p2 == NULL)
163                     goto out;
164           f->out = p2;
165 
166           p3 = (int **) realloc(f->posns, new_count * sizeof(f->posns[0]));
167           if (p3 == NULL)
168                     goto out;
169           f->posns = p3;
170 
171           for (i = f->state_count; i < new_count; ++i) {
172                     f->gototab[i].entries = (gtte *) calloc(NCHARS, sizeof(gtte));
173                     if (f->gototab[i].entries == NULL)
174                               goto out;
175                     f->gototab[i].allocated = NCHARS;
176                     f->gototab[i].inuse = 0;
177                     f->out[i] = 0;
178                     f->posns[i] = NULL;
179           }
180           f->state_count = new_count;
181           return;
182 out:
183           overflo(__func__);
184 }
185 
makedfa(const char * s,bool anchor)186 fa *makedfa(const char *s, bool anchor) /* returns dfa for reg expr s */
187 {
188           int i, use, nuse;
189           fa *pfa;
190           static int now = 1;
191 
192           if (setvec == NULL) {         /* first time through any RE */
193                     resizesetvec(__func__);
194           }
195 
196           if (compile_time != RUNNING)  /* a constant for sure */
197                     return mkdfa(s, anchor);
198           for (i = 0; i < nfatab; i++)  /* is it there already? */
199                     if (fatab[i]->anchor == anchor
200                       && strcmp((const char *) fatab[i]->restr, s) == 0) {
201                               fatab[i]->use = now++;
202                               return fatab[i];
203                     }
204           pfa = mkdfa(s, anchor);
205           if (nfatab < NFA) { /* room for another */
206                     fatab[nfatab] = pfa;
207                     fatab[nfatab]->use = now++;
208                     nfatab++;
209                     return pfa;
210           }
211           use = fatab[0]->use;          /* replace least-recently used */
212           nuse = 0;
213           for (i = 1; i < nfatab; i++)
214                     if (fatab[i]->use < use) {
215                               use = fatab[i]->use;
216                               nuse = i;
217                     }
218           freefa(fatab[nuse]);
219           fatab[nuse] = pfa;
220           pfa->use = now++;
221           return pfa;
222 }
223 
mkdfa(const char * s,bool anchor)224 fa *mkdfa(const char *s, bool anchor)   /* does the real work of making a dfa */
225                                         /* anchor = true for anchored matches, else false */
226 {
227           Node *p, *p1;
228           fa *f;
229 
230           firstbasestr = (const uschar *) s;
231           basestr = firstbasestr;
232           p = reparse(s);
233           p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
234                     /* put ALL STAR in front of reg.  exp. */
235           p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
236                     /* put FINAL after reg.  exp. */
237 
238           poscnt = 0;
239           penter(p1);         /* enter parent pointers and leaf indices */
240           if ((f = (fa *) calloc(1, sizeof(fa) + poscnt * sizeof(rrow))) == NULL)
241                     overflo(__func__);
242           f->accept = poscnt-1;         /* penter has computed number of positions in re */
243           cfoll(f, p1);       /* set up follow sets */
244           freetr(p1);
245           resize_state(f, 1);
246           f->posns[0] = intalloc(*(f->re[0].lfollow), __func__);
247           f->posns[1] = intalloc(1, __func__);
248           *f->posns[1] = 0;
249           f->initstat = makeinit(f, anchor);
250           f->anchor = anchor;
251           f->restr = (uschar *) tostring(s);
252           if (firstbasestr != basestr) {
253                     if (basestr)
254                               xfree(basestr);
255           }
256           return f;
257 }
258 
makeinit(fa * f,bool anchor)259 int makeinit(fa *f, bool anchor)
260 {
261           int i, k;
262 
263           f->curstat = 2;
264           f->out[2] = 0;
265           k = *(f->re[0].lfollow);
266           xfree(f->posns[2]);
267           f->posns[2] = intalloc(k + 1,  __func__);
268           for (i = 0; i <= k; i++) {
269                     (f->posns[2])[i] = (f->re[0].lfollow)[i];
270           }
271           if ((f->posns[2])[1] == f->accept)
272                     f->out[2] = 1;
273           clear_gototab(f, 2);
274           f->curstat = cgoto(f, 2, HAT);
275           if (anchor) {
276                     *f->posns[2] = k-1; /* leave out position 0 */
277                     for (i = 0; i < k; i++) {
278                               (f->posns[0])[i] = (f->posns[2])[i];
279                     }
280 
281                     f->out[0] = f->out[2];
282                     if (f->curstat != 2)
283                               --(*f->posns[f->curstat]);
284           }
285           return f->curstat;
286 }
287 
penter(Node * p)288 void penter(Node *p)          /* set up parent pointers and leaf indices */
289 {
290           switch (type(p)) {
291           ELEAF
292           LEAF
293                     info(p) = poscnt;
294                     poscnt++;
295                     break;
296           UNARY
297                     penter(left(p));
298                     parent(left(p)) = p;
299                     break;
300           case CAT:
301           case OR:
302                     penter(left(p));
303                     penter(right(p));
304                     parent(left(p)) = p;
305                     parent(right(p)) = p;
306                     break;
307           case ZERO:
308                     break;
309           default:  /* can't happen */
310                     FATAL("can't happen: unknown type %d in penter", type(p));
311                     break;
312           }
313 }
314 
freetr(Node * p)315 void freetr(Node *p)          /* free parse tree */
316 {
317           switch (type(p)) {
318           ELEAF
319           LEAF
320                     xfree(p);
321                     break;
322           UNARY
323           case ZERO:
324                     freetr(left(p));
325                     xfree(p);
326                     break;
327           case CAT:
328           case OR:
329                     freetr(left(p));
330                     freetr(right(p));
331                     xfree(p);
332                     break;
333           default:  /* can't happen */
334                     FATAL("can't happen: unknown type %d in freetr", type(p));
335                     break;
336           }
337 }
338 
339 /* in the parsing of regular expressions, metacharacters like . have */
340 /* to be seen literally;  \056 is not a metacharacter. */
341 
hexstr(const uschar ** pp,int max)342 int hexstr(const uschar **pp, int max)  /* find and eval hex string at pp, return new p */
343 {                             /* only pick up one 8-bit byte (2 chars) */
344           const uschar *p;
345           int n = 0;
346           int i;
347 
348           for (i = 0, p = *pp; i < max && isxdigit(*p); i++, p++) {
349                     if (isdigit((int) *p))
350                               n = 16 * n + *p - '0';
351                     else if (*p >= 'a' && *p <= 'f')
352                               n = 16 * n + *p - 'a' + 10;
353                     else if (*p >= 'A' && *p <= 'F')
354                               n = 16 * n + *p - 'A' + 10;
355           }
356           *pp = p;
357           return n;
358 }
359 
360 
361 
362 #define isoctdigit(c) ((c) >= '0' && (c) <= '7')  /* multiple use of arg */
363 
quoted(const uschar ** pp)364 int quoted(const uschar **pp) /* pick up next thing after a \\ */
365                               /* and increment *pp */
366 {
367           const uschar *p = *pp;
368           int c;
369 
370 /* BUG: should advance by utf-8 char even if makes no sense */
371 
372           if ((c = *p++) == 't') {
373                     c = '\t';
374           } else if (c == 'n') {
375                     c = '\n';
376           } else if (c == 'f') {
377                     c = '\f';
378           } else if (c == 'r') {
379                     c = '\r';
380           } else if (c == 'b') {
381                     c = '\b';
382           } else if (c == 'v') {
383                     c = '\v';
384           } else if (c == 'a') {
385                     c = '\a';
386           } else if (c == '\\') {
387                     c = '\\';
388           } else if (c == 'x') {        /* 2 hex digits follow */
389                     c = hexstr(&p, 2);  /* this adds a null if number is invalid */
390           } else if (c == 'u') {        /* unicode char number up to 8 hex digits */
391                     c = hexstr(&p, 8);
392           } else if (isoctdigit(c)) {   /* \d \dd \ddd */
393                     int n = c - '0';
394                     if (isoctdigit(*p)) {
395                               n = 8 * n + *p++ - '0';
396                               if (isoctdigit(*p))
397                                         n = 8 * n + *p++ - '0';
398                     }
399                     c = n;
400           } /* else */
401                     /* c = c; */
402           *pp = p;
403           return c;
404 }
405 
cclenter(const char * argp)406 int *cclenter(const char *argp)         /* add a character class */
407 {
408           int i, c, c2;
409           int n;
410           const uschar *p = (const uschar *) argp;
411           int *bp, *retp;
412           static int *buf = NULL;
413           static int bufsz = 100;
414 
415           if (buf == NULL && (buf = (int *) calloc(bufsz, sizeof(int))) == NULL)
416                     FATAL("out of space for character class [%.10s...] 1", p);
417           bp = buf;
418           for (i = 0; *p != 0; ) {
419                     n = u8_rune(&c, (const char *) p);
420                     p += n;
421                     if (c == '\\') {
422                               c = quoted(&p);
423                     } else if (c == '-' && i > 0 && bp[-1] != 0) {
424                               if (*p != 0) {
425                                         c = bp[-1];
426                                         /* c2 = *p++; */
427                                         n = u8_rune(&c2, (const char *) p);
428                                         p += n;
429                                         if (c2 == '\\')
430                                                   c2 = quoted(&p); /* BUG: sets p, has to be u8 size */
431                                         if (c > c2) {       /* empty; ignore */
432                                                   bp--;
433                                                   i--;
434                                                   continue;
435                                         }
436                                         while (c < c2) {
437                                                   if (i >= bufsz) {
438                                                             bufsz *= 2;
439                                                             buf = (int *) realloc(buf, bufsz * sizeof(int));
440                                                             if (buf == NULL)
441                                                                       FATAL("out of space for character class [%.10s...] 2", p);
442                                                             bp = buf + i;
443                                                   }
444                                                   *bp++ = ++c;
445                                                   i++;
446                                         }
447                                         continue;
448                               }
449                     }
450                     if (i >= bufsz) {
451                               bufsz *= 2;
452                               buf = (int *) realloc(buf, bufsz * sizeof(int));
453                               if (buf == NULL)
454                                         FATAL("out of space for character class [%.10s...] 2", p);
455                               bp = buf + i;
456                     }
457                     *bp++ = c;
458                     i++;
459           }
460           *bp = 0;
461           /* DPRINTF("cclenter: in = |%s|, out = |%s|\n", op, buf); BUG: can't print array of int */
462           /* xfree(op);  BUG: what are we freeing here? */
463           retp = (int *) calloc(bp-buf+1, sizeof(int));
464           for (i = 0; i < bp-buf+1; i++)
465                     retp[i] = buf[i];
466           return retp;
467 }
468 
overflo(const char * s)469 void overflo(const char *s)
470 {
471           FATAL("regular expression too big: out of space in %.30s...", s);
472 }
473 
cfoll(fa * f,Node * v)474 void cfoll(fa *f, Node *v)    /* enter follow set of each leaf of vertex v into lfollow[leaf] */
475 {
476           int i;
477           int *p;
478 
479           switch (type(v)) {
480           ELEAF
481           LEAF
482                     f->re[info(v)].ltype = type(v);
483                     f->re[info(v)].lval.np = right(v);
484                     while (f->accept >= maxsetvec) {        /* guessing here! */
485                               resizesetvec(__func__);
486                     }
487                     for (i = 0; i <= f->accept; i++)
488                               setvec[i] = 0;
489                     setcnt = 0;
490                     follow(v);          /* computes setvec and setcnt */
491                     p = intalloc(setcnt + 1, __func__);
492                     f->re[info(v)].lfollow = p;
493                     *p = setcnt;
494                     for (i = f->accept; i >= 0; i--)
495                               if (setvec[i] == 1)
496                                         *++p = i;
497                     break;
498           UNARY
499                     cfoll(f,left(v));
500                     break;
501           case CAT:
502           case OR:
503                     cfoll(f,left(v));
504                     cfoll(f,right(v));
505                     break;
506           case ZERO:
507                     break;
508           default:  /* can't happen */
509                     FATAL("can't happen: unknown type %d in cfoll", type(v));
510           }
511 }
512 
first(Node * p)513 int first(Node *p)  /* collects initially active leaves of p into setvec */
514                               /* returns 0 if p matches empty string */
515 {
516           int b, lp;
517 
518           switch (type(p)) {
519           ELEAF
520           LEAF
521                     lp = info(p);       /* look for high-water mark of subscripts */
522                     while (setcnt >= maxsetvec || lp >= maxsetvec) {  /* guessing here! */
523                               resizesetvec(__func__);
524                     }
525                     if (type(p) == EMPTYRE) {
526                               setvec[lp] = 0;
527                               return(0);
528                     }
529                     if (setvec[lp] != 1) {
530                               setvec[lp] = 1;
531                               setcnt++;
532                     }
533                     if (type(p) == CCL && (*(int *) right(p)) == 0)
534                               return(0);                    /* empty CCL */
535                     return(1);
536           case PLUS:
537                     if (first(left(p)) == 0)
538                               return(0);
539                     return(1);
540           case STAR:
541           case QUEST:
542                     first(left(p));
543                     return(0);
544           case CAT:
545                     if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
546                     return(1);
547           case OR:
548                     b = first(right(p));
549                     if (first(left(p)) == 0 || b == 0) return(0);
550                     return(1);
551           case ZERO:
552                     return 0;
553           }
554           FATAL("can't happen: unknown type %d in first", type(p));   /* can't happen */
555           return(-1);
556 }
557 
follow(Node * v)558 void follow(Node *v)          /* collects leaves that can follow v into setvec */
559 {
560           Node *p;
561 
562           if (type(v) == FINAL)
563                     return;
564           p = parent(v);
565           switch (type(p)) {
566           case STAR:
567           case PLUS:
568                     first(v);
569                     follow(p);
570                     return;
571 
572           case OR:
573           case QUEST:
574                     follow(p);
575                     return;
576 
577           case CAT:
578                     if (v == left(p)) { /* v is left child of p */
579                               if (first(right(p)) == 0) {
580                                         follow(p);
581                                         return;
582                               }
583                     } else              /* v is right child */
584                               follow(p);
585                     return;
586           }
587 }
588 
member(int c,int * sarg)589 int member(int c, int *sarg)  /* is c in s? */
590 {
591           int *s = (int *) sarg;
592 
593           while (*s)
594                     if (c == *s++)
595                               return(1);
596           return(0);
597 }
598 
resize_gototab(fa * f,int state)599 static void resize_gototab(fa *f, int state)
600 {
601           size_t new_size = f->gototab[state].allocated * 2;
602           gtte *p = (gtte *) realloc(f->gototab[state].entries, new_size * sizeof(gtte));
603           if (p == NULL)
604                     overflo(__func__);
605 
606           // need to initialized the new memory to zero
607           size_t orig_size = f->gototab[state].allocated;             // 2nd half of new mem is this size
608           memset(p + orig_size, 0, orig_size * sizeof(gtte));         // clean it out
609 
610           f->gototab[state].allocated = new_size;                     // update gototab info
611           f->gototab[state].entries = p;
612 }
613 
get_gototab(fa * f,int state,int ch)614 static int get_gototab(fa *f, int state, int ch) /* hide gototab implementation */
615 {
616           gtte key;
617           gtte *item;
618 
619           key.ch = ch;
620           key.state = 0;      /* irrelevant */
621           item = (gtte *) bsearch(& key, f->gototab[state].entries,
622                               f->gototab[state].inuse, sizeof(gtte),
623                               entry_cmp);
624 
625           if (item == NULL)
626                     return 0;
627           else
628                     return item->state;
629 }
630 
entry_cmp(const void * l,const void * r)631 static int entry_cmp(const void *l, const void *r)
632 {
633           const gtte *left, *right;
634 
635           left = (const gtte *) l;
636           right = (const gtte *) r;
637 
638           return left->ch - right->ch;
639 }
640 
set_gototab(fa * f,int state,int ch,int val)641 static int set_gototab(fa *f, int state, int ch, int val) /* hide gototab implementation */
642 {
643           if (f->gototab[state].inuse == 0) {
644                     f->gototab[state].entries[0].ch = ch;
645                     f->gototab[state].entries[0].state = val;
646                     f->gototab[state].inuse++;
647                     return val;
648           } else if (ch > f->gototab[state].entries[f->gototab[state].inuse-1].ch) {
649                     // not seen yet, insert and return
650                     gtt *tab = & f->gototab[state];
651                     if (tab->inuse + 1 >= tab->allocated)
652                               resize_gototab(f, state);
653 
654                     f->gototab[state].entries[f->gototab[state].inuse-1].ch = ch;
655                     f->gototab[state].entries[f->gototab[state].inuse-1].state = val;
656                     f->gototab[state].inuse++;
657                     return val;
658           } else {
659                     // maybe we have it, maybe we don't
660                     gtte key;
661                     gtte *item;
662 
663                     key.ch = ch;
664                     key.state = 0;      /* irrelevant */
665                     item = (gtte *) bsearch(& key, f->gototab[state].entries,
666                                         f->gototab[state].inuse, sizeof(gtte),
667                                         entry_cmp);
668 
669                     if (item != NULL) {
670                               // we have it, update state and return
671                               item->state = val;
672                               return item->state;
673                     }
674                     // otherwise, fall through to insert and reallocate.
675           }
676 
677           gtt *tab = & f->gototab[state];
678           if (tab->inuse + 1 >= tab->allocated)
679                     resize_gototab(f, state);
680           ++tab->inuse;
681           f->gototab[state].entries[tab->inuse].ch = ch;
682           f->gototab[state].entries[tab->inuse].state = val;
683 
684           qsort(f->gototab[state].entries,
685                     f->gototab[state].inuse, sizeof(gtte), entry_cmp);
686 
687           return val; /* not used anywhere at the moment */
688 }
689 
clear_gototab(fa * f,int state)690 static void clear_gototab(fa *f, int state)
691 {
692           memset(f->gototab[state].entries, 0,
693                     f->gototab[state].allocated * sizeof(gtte));
694           f->gototab[state].inuse = 0;
695 }
696 
match(fa * f,const char * p0)697 int match(fa *f, const char *p0)        /* shortest match ? */
698 {
699           int s, ns;
700           int n;
701           int rune;
702           const uschar *p = (const uschar *) p0;
703 
704           /* return pmatch(f, p0); does it matter whether longest or shortest? */
705 
706           s = f->initstat;
707           assert (s < f->state_count);
708 
709           if (f->out[s])
710                     return(1);
711           do {
712                     /* assert(*p < NCHARS); */
713                     n = u8_rune(&rune, (const char *) p);
714                     if ((ns = get_gototab(f, s, rune)) != 0)
715                               s = ns;
716                     else
717                               s = cgoto(f, s, rune);
718                     if (f->out[s])
719                               return(1);
720                     if (*p == 0)
721                               break;
722                     p += n;
723           } while (1);  /* was *p++ != 0 */
724           return(0);
725 }
726 
pmatch(fa * f,const char * p0)727 int pmatch(fa *f, const char *p0)       /* longest match, for sub */
728 {
729           int s, ns;
730           int n;
731           int rune;
732           const uschar *p = (const uschar *) p0;
733           const uschar *q;
734 
735           s = f->initstat;
736           assert(s < f->state_count);
737 
738           patbeg = (const char *)p;
739           patlen = -1;
740           do {
741                     q = p;
742                     do {
743                               if (f->out[s])                /* final state */
744                                         patlen = q-p;
745                               /* assert(*q < NCHARS); */
746                               n = u8_rune(&rune, (const char *) q);
747                               if ((ns = get_gototab(f, s, rune)) != 0)
748                                         s = ns;
749                               else
750                                         s = cgoto(f, s, rune);
751 
752                               assert(s < f->state_count);
753 
754                               if (s == 1) {       /* no transition */
755                                         if (patlen >= 0) {
756                                                   patbeg = (const char *) p;
757                                                   return(1);
758                                         }
759                                         else
760                                                   goto nextin;        /* no match */
761                               }
762                               if (*q == 0)
763                                         break;
764                               q += n;
765                     } while (1);
766                     q++;  /* was *q++ */
767                     if (f->out[s])
768                               patlen = q-p-1;     /* don't count $ */
769                     if (patlen >= 0) {
770                               patbeg = (const char *) p;
771                               return(1);
772                     }
773           nextin:
774                     s = 2;
775                     if (*p == 0)
776                               break;
777                     n = u8_rune(&rune, (const char *) p);
778                     p += n;
779           } while (1); /* was *p++ */
780           return (0);
781 }
782 
nematch(fa * f,const char * p0)783 int nematch(fa *f, const char *p0)      /* non-empty match, for sub */
784 {
785           int s, ns;
786         int n;
787         int rune;
788           const uschar *p = (const uschar *) p0;
789           const uschar *q;
790 
791           s = f->initstat;
792           assert(s < f->state_count);
793 
794           patbeg = (const char *)p;
795           patlen = -1;
796           while (*p) {
797                     q = p;
798                     do {
799                               if (f->out[s])                /* final state */
800                                         patlen = q-p;
801                               /* assert(*q < NCHARS); */
802                               n = u8_rune(&rune, (const char *) q);
803                               if ((ns = get_gototab(f, s, rune)) != 0)
804                                         s = ns;
805                               else
806                                         s = cgoto(f, s, rune);
807                               if (s == 1) {       /* no transition */
808                                         if (patlen > 0) {
809                                                   patbeg = (const char *) p;
810                                                   return(1);
811                                         } else
812                                                   goto nnextin;       /* no nonempty match */
813                               }
814                               if (*q == 0)
815                                         break;
816                               q += n;
817                     } while (1);
818                     q++;
819                     if (f->out[s])
820                               patlen = q-p-1;     /* don't count $ */
821                     if (patlen > 0 ) {
822                               patbeg = (const char *) p;
823                               return(1);
824                     }
825           nnextin:
826                     s = 2;
827                     p++;
828           }
829           return (0);
830 }
831 
832 
833 #define MAX_UTF_BYTES         4         // UTF-8 is up to 4 bytes long
834 
835 /*
836  * NAME
837  *     fnematch
838  *
839  * DESCRIPTION
840  *     A stream-fed version of nematch which transfers characters to a
841  *     null-terminated buffer. All characters up to and including the last
842  *     character of the matching text or EOF are placed in the buffer. If
843  *     a match is found, patbeg and patlen are set appropriately.
844  *
845  * RETURN VALUES
846  *     false    No match found.
847  *     true     Match found.
848  */
849 
fnematch(fa * pfa,FILE * f,char ** pbuf,int * pbufsize,int quantum)850 bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
851 {
852           char *i, *j, *k, *buf = *pbuf;
853           int bufsize = *pbufsize;
854           int c, n, ns, s;
855 
856           s = pfa->initstat;
857           patlen = 0;
858 
859           /*
860            * buf <= i <= j <= k <= buf+bufsize
861            *
862            * i: origin of active substring
863            * j: current character
864            * k: destination of the next getc
865            */
866 
867           i = j = k = buf;
868 
869           do {
870                     /*
871                      * Call u8_rune with at least MAX_UTF_BYTES ahead in
872                      * the buffer until EOF interferes.
873                      */
874                     if (k - j < MAX_UTF_BYTES) {
875                               if (k + MAX_UTF_BYTES > buf + bufsize) {
876                                         adjbuf((char **) &buf, &bufsize,
877                                             bufsize + MAX_UTF_BYTES,
878                                             quantum, 0, "fnematch");
879                               }
880                               for (n = MAX_UTF_BYTES ; n > 0; n--) {
881                                         *k++ = (c = getc(f)) != EOF ? c : 0;
882                                         if (c == EOF) {
883                                                   if (ferror(f))
884                                                             FATAL("fnematch: getc error");
885                                                   break;
886                                         }
887                               }
888                     }
889 
890                     j += u8_rune(&c, j);
891 
892                     if ((ns = get_gototab(pfa, s, c)) != 0)
893                               s = ns;
894                     else
895                               s = cgoto(pfa, s, c);
896 
897                     if (pfa->out[s]) {  /* final state */
898                               patbeg = i;
899                               patlen = j - i;
900                               if (c == 0)         /* don't count $ */
901                                         patlen--;
902                     }
903 
904                     if (c && s != 1)
905                               continue;  /* origin i still viable, next j */
906                     if (patlen)
907                               break;     /* best match found */
908 
909                     /* no match at origin i, next i and start over */
910                     i += u8_rune(&c, i);
911                     if (c == 0)
912                               break;    /* no match */
913                     j = i;
914                     s = 2;
915           } while (1);
916 
917           /* adjbuf() may have relocated a resized buffer. Inform the world. */
918           *pbuf = buf;
919           *pbufsize = bufsize;
920 
921           if (patlen) {
922                     /*
923                      * Under no circumstances is the last character fed to
924                      * the automaton part of the match. It is EOF's nullbyte,
925                      * or it sent the automaton into a state with no further
926                      * transitions available (s==1), or both. Room for a
927                      * terminating nullbyte is guaranteed.
928                      *
929                      * ungetc any chars after the end of matching text
930                      * (except for EOF's nullbyte, if present) and null
931                      * terminate the buffer.
932                      */
933                     do
934                               if (*--k && ungetc(*k, f) == EOF)
935                                         FATAL("unable to ungetc '%c'", *k);
936                     while (k > patbeg + patlen);
937                     *k = '\0';
938                     return true;
939           }
940           else
941                     return false;
942 }
943 
reparse(const char * p)944 Node *reparse(const char *p)  /* parses regular expression pointed to by p */
945 {                             /* uses relex() to scan regular expression */
946           Node *np;
947 
948           DPRINTF("reparse <%s>\n", p);
949           lastre = prestr = (const uschar *) p;   /* prestr points to string to be parsed */
950           rtok = relex();
951           /* GNU compatibility: an empty regexp matches anything */
952           if (rtok == '\0') {
953                     /* FATAL("empty regular expression"); previous */
954                     return(op2(EMPTYRE, NIL, NIL));
955           }
956           np = regexp();
957           if (rtok != '\0')
958                     FATAL("syntax error in regular expression %s at %s", lastre, prestr);
959           return(np);
960 }
961 
regexp(void)962 Node *regexp(void)  /* top-level parse of reg expr */
963 {
964           return (alt(concat(primary())));
965 }
966 
primary(void)967 Node *primary(void)
968 {
969           Node *np;
970           int savelastatom;
971 
972           switch (rtok) {
973           case CHAR:
974                     lastatom = starttok;
975                     np = op2(CHAR, NIL, itonp(rlxval));
976                     rtok = relex();
977                     return (unary(np));
978           case ALL:
979                     rtok = relex();
980                     return (unary(op2(ALL, NIL, NIL)));
981           case EMPTYRE:
982                     rtok = relex();
983                     return (unary(op2(EMPTYRE, NIL, NIL)));
984           case DOT:
985                     lastatom = starttok;
986                     rtok = relex();
987                     return (unary(op2(DOT, NIL, NIL)));
988           case CCL:
989                     np = op2(CCL, NIL, (Node*) cclenter((const char *) rlxstr));
990                     lastatom = starttok;
991                     rtok = relex();
992                     return (unary(np));
993           case NCCL:
994                     np = op2(NCCL, NIL, (Node *) cclenter((const char *) rlxstr));
995                     lastatom = starttok;
996                     rtok = relex();
997                     return (unary(np));
998           case '^':
999                     rtok = relex();
1000                     return (unary(op2(CHAR, NIL, itonp(HAT))));
1001           case '$':
1002                     rtok = relex();
1003                     return (unary(op2(CHAR, NIL, NIL)));
1004           case '(':
1005                     lastatom = starttok;
1006                     savelastatom = starttok - basestr; /* Retain over recursion */
1007                     rtok = relex();
1008                     if (rtok == ')') {  /* special pleading for () */
1009                               rtok = relex();
1010                               return unary(op2(CCL, NIL, (Node *) cclenter("")));
1011                     }
1012                     np = regexp();
1013                     if (rtok == ')') {
1014                               lastatom = basestr + savelastatom; /* Restore */
1015                               rtok = relex();
1016                               return (unary(np));
1017                     }
1018                     else
1019                               FATAL("syntax error in regular expression %s at %s", lastre, prestr);
1020           default:
1021                     FATAL("illegal primary in regular expression %s at %s", lastre, prestr);
1022           }
1023           return 0; /*NOTREACHED*/
1024 }
1025 
concat(Node * np)1026 Node *concat(Node *np)
1027 {
1028           switch (rtok) {
1029           case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
1030                     return (concat(op2(CAT, np, primary())));
1031           case EMPTYRE:
1032                     rtok = relex();
1033                     return (concat(op2(CAT, op2(CCL, NIL, (Node *) cclenter("")),
1034                                         primary())));
1035           }
1036           return (np);
1037 }
1038 
alt(Node * np)1039 Node *alt(Node *np)
1040 {
1041           if (rtok == OR) {
1042                     rtok = relex();
1043                     return (alt(op2(OR, np, concat(primary()))));
1044           }
1045           return (np);
1046 }
1047 
unary(Node * np)1048 Node *unary(Node *np)
1049 {
1050           switch (rtok) {
1051           case STAR:
1052                     rtok = relex();
1053                     return (unary(op2(STAR, np, NIL)));
1054           case PLUS:
1055                     rtok = relex();
1056                     return (unary(op2(PLUS, np, NIL)));
1057           case QUEST:
1058                     rtok = relex();
1059                     return (unary(op2(QUEST, np, NIL)));
1060           case ZERO:
1061                     rtok = relex();
1062                     return (unary(op2(ZERO, np, NIL)));
1063           default:
1064                     return (np);
1065           }
1066 }
1067 
1068 /*
1069  * Character class definitions conformant to the POSIX locale as
1070  * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
1071  * and operating character sets are both ASCII (ISO646) or supersets
1072  * thereof.
1073  *
1074  * Note that to avoid overflowing the temporary buffer used in
1075  * relex(), the expanded character class (prior to range expansion)
1076  * must be less than twice the size of their full name.
1077  */
1078 
1079 /* Because isblank doesn't show up in any of the header files on any
1080  * system i use, it's defined here.  if some other locale has a richer
1081  * definition of "blank", define HAS_ISBLANK and provide your own
1082  * version.
1083  * the parentheses here are an attempt to find a path through the maze
1084  * of macro definition and/or function and/or version provided.  thanks
1085  * to nelson beebe for the suggestion; let's see if it works everywhere.
1086  */
1087 
1088 /* #define HAS_ISBLANK */
1089 #ifndef HAS_ISBLANK
1090 
1091 int (xisblank)(int c)
1092 {
1093           return c==' ' || c=='\t';
1094 }
1095 
1096 #endif
1097 
1098 static const struct charclass {
1099           const char *cc_name;
1100           int cc_namelen;
1101           int (*cc_func)(int);
1102 } charclasses[] = {
1103           { "alnum",          5,        isalnum },
1104           { "alpha",          5,        isalpha },
1105 #ifndef HAS_ISBLANK
1106           { "blank",          5,        xisblank },
1107 #else
1108           { "blank",          5,        isblank },
1109 #endif
1110           { "cntrl",          5,        iscntrl },
1111           { "digit",          5,        isdigit },
1112           { "graph",          5,        isgraph },
1113           { "lower",          5,        islower },
1114           { "print",          5,        isprint },
1115           { "punct",          5,        ispunct },
1116           { "space",          5,        isspace },
1117           { "upper",          5,        isupper },
1118           { "xdigit",         6,        isxdigit },
1119           { NULL,             0,        NULL },
1120 };
1121 
1122 #define REPEAT_SIMPLE                   0
1123 #define REPEAT_PLUS_APPENDED  1
1124 #define REPEAT_WITH_Q                   2
1125 #define REPEAT_ZERO           3
1126 
1127 static int
replace_repeat(const uschar * reptok,int reptoklen,const uschar * atom,int atomlen,int firstnum,int secondnum,int special_case)1128 replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom,
1129                  int atomlen, int firstnum, int secondnum, int special_case)
1130 {
1131           int i, j;
1132           uschar *buf = 0;
1133           int ret = 1;
1134           int init_q = (firstnum == 0);           /* first added char will be ? */
1135           int n_q_reps = secondnum-firstnum;      /* m>n, so reduce until {1,m-n} left  */
1136           int prefix_length = reptok - basestr;   /* prefix includes first rep  */
1137           int suffix_length = strlen((const char *) reptok) - reptoklen;        /* string after rep specifier */
1138           int size = prefix_length +  suffix_length;
1139 
1140           if (firstnum > 1) { /* add room for reps 2 through firstnum */
1141                     size += atomlen*(firstnum-1);
1142           }
1143 
1144           /* Adjust size of buffer for special cases */
1145           if (special_case == REPEAT_PLUS_APPENDED) {
1146                     size++;             /* for the final + */
1147           } else if (special_case == REPEAT_WITH_Q) {
1148                     size += init_q + (atomlen+1)* (n_q_reps-init_q);
1149           } else if (special_case == REPEAT_ZERO) {
1150                     size += 2;          /* just a null ERE: () */
1151           }
1152           if ((buf = (uschar *) malloc(size + 1)) == NULL)
1153                     FATAL("out of space in reg expr %.10s..", lastre);
1154           memcpy(buf, basestr, prefix_length);    /* copy prefix      */
1155           j = prefix_length;
1156           if (special_case == REPEAT_ZERO) {
1157                     j -= atomlen;
1158                     buf[j++] = '(';
1159                     buf[j++] = ')';
1160           }
1161           for (i = 1; i < firstnum; i++) {        /* copy x reps      */
1162                     memcpy(&buf[j], atom, atomlen);
1163                     j += atomlen;
1164           }
1165           if (special_case == REPEAT_PLUS_APPENDED) {
1166                     buf[j++] = '+';
1167           } else if (special_case == REPEAT_WITH_Q) {
1168                     if (init_q)
1169                               buf[j++] = '?';
1170                     for (i = init_q; i < n_q_reps; i++) {   /* copy x? reps */
1171                               memcpy(&buf[j], atom, atomlen);
1172                               j += atomlen;
1173                               buf[j++] = '?';
1174                     }
1175           }
1176           memcpy(&buf[j], reptok+reptoklen, suffix_length);
1177           j += suffix_length;
1178           buf[j] = '\0';
1179           /* free old basestr */
1180           if (firstbasestr != basestr) {
1181                     if (basestr)
1182                               xfree(basestr);
1183           }
1184           basestr = buf;
1185           prestr  = buf + prefix_length;
1186           if (special_case == REPEAT_ZERO) {
1187                     prestr  -= atomlen;
1188                     ret++;
1189           }
1190           return ret;
1191 }
1192 
repeat(const uschar * reptok,int reptoklen,const uschar * atom,int atomlen,int firstnum,int secondnum)1193 static int repeat(const uschar *reptok, int reptoklen, const uschar *atom,
1194                       int atomlen, int firstnum, int secondnum)
1195 {
1196           /*
1197              In general, the repetition specifier or "bound" is replaced here
1198              by an equivalent ERE string, repeating the immediately previous atom
1199              and appending ? and + as needed. Note that the first copy of the
1200              atom is left in place, except in the special_case of a zero-repeat
1201              (i.e., {0}).
1202            */
1203           if (secondnum < 0) {          /* means {n,} -> repeat n-1 times followed by PLUS */
1204                     if (firstnum < 2) {
1205                               /* 0 or 1: should be handled before you get here */
1206                               FATAL("internal error");
1207                     } else {
1208                               return replace_repeat(reptok, reptoklen, atom, atomlen,
1209                                         firstnum, secondnum, REPEAT_PLUS_APPENDED);
1210                     }
1211           } else if (firstnum == secondnum) {     /* {n} or {n,n} -> simply repeat n-1 times */
1212                     if (firstnum == 0) {          /* {0} or {0,0} */
1213                               /* This case is unusual because the resulting
1214                                  replacement string might actually be SMALLER than
1215                                  the original ERE */
1216                               return replace_repeat(reptok, reptoklen, atom, atomlen,
1217                                                   firstnum, secondnum, REPEAT_ZERO);
1218                     } else {            /* (firstnum >= 1) */
1219                               return replace_repeat(reptok, reptoklen, atom, atomlen,
1220                                                   firstnum, secondnum, REPEAT_SIMPLE);
1221                     }
1222           } else if (firstnum < secondnum) {      /* {n,m} -> repeat n-1 times then alternate  */
1223                     /*  x{n,m}  =>  xx...x{1, m-n+1}  =>  xx...x?x?x?..x?       */
1224                     return replace_repeat(reptok, reptoklen, atom, atomlen,
1225                                                   firstnum, secondnum, REPEAT_WITH_Q);
1226           } else {  /* Error - shouldn't be here (n>m) */
1227                     FATAL("internal error");
1228           }
1229           return 0;
1230 }
1231 
relex(void)1232 int relex(void)               /* lexical analyzer for reparse */
1233 {
1234           int c, n;
1235           int cflag;
1236           static uschar *buf = NULL;
1237           static int bufsz = 100;
1238           uschar *bp;
1239           const struct charclass *cc;
1240           int i;
1241           int num, m;
1242           bool commafound, digitfound;
1243           const uschar *startreptok;
1244           static int parens = 0;
1245 
1246 rescan:
1247           starttok = prestr;
1248 
1249           if ((n = u8_rune(&rlxval, (const char *) prestr)) > 1) {
1250                     prestr += n;
1251                     starttok = prestr;
1252                     return CHAR;
1253           }
1254 
1255           switch (c = *prestr++) {
1256           case '|': return OR;
1257           case '*': return STAR;
1258           case '+': return PLUS;
1259           case '?': return QUEST;
1260           case '.': return DOT;
1261           case '\0': prestr--; return '\0';
1262           case '^':
1263           case '$':
1264                     return c;
1265           case '(':
1266                     parens++;
1267                     return c;
1268           case ')':
1269                     if (parens) {
1270                               parens--;
1271                               return c;
1272                     }
1273                     /* unmatched close parenthesis; per POSIX, treat as literal */
1274                     rlxval = c;
1275                     return CHAR;
1276           case '\\':
1277                     rlxval = quoted(&prestr);
1278                     return CHAR;
1279           default:
1280                     rlxval = c;
1281                     return CHAR;
1282           case '[':
1283                     if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL)
1284                               FATAL("out of space in reg expr %.10s..", lastre);
1285                     bp = buf;
1286                     if (*prestr == '^') {
1287                               cflag = 1;
1288                               prestr++;
1289                     }
1290                     else
1291                               cflag = 0;
1292                     n = 5 * strlen((const char *) prestr)+1; /* BUG: was 2.  what value? */
1293                     if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1"))
1294                               FATAL("out of space for reg expr %.10s...", lastre);
1295                     for (; ; ) {
1296                               if ((n = u8_rune(&rlxval, (const char *) prestr)) > 1) {
1297                                         for (i = 0; i < n; i++)
1298                                                   *bp++ = *prestr++;
1299                                         continue;
1300                               }
1301                               if ((c = *prestr++) == '\\') {
1302                                         *bp++ = '\\';
1303                                         if ((c = *prestr++) == '\0')
1304                                                   FATAL("nonterminated character class %.20s...", lastre);
1305                                         *bp++ = c;
1306                               /* } else if (c == '\n') { */
1307                               /*        FATAL("newline in character class %.20s...", lastre); */
1308                               } else if (c == '[' && *prestr == ':') {
1309                                         /* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
1310                                         for (cc = charclasses; cc->cc_name; cc++)
1311                                                   if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0)
1312                                                             break;
1313                                         if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
1314                                             prestr[2 + cc->cc_namelen] == ']') {
1315                                                   prestr += cc->cc_namelen + 3;
1316                                                   /*
1317                                                    * BUG: We begin at 1, instead of 0, since we
1318                                                    * would otherwise prematurely terminate the
1319                                                    * string for classes like [[:cntrl:]]. This
1320                                                    * means that we can't match the NUL character,
1321                                                    * not without first adapting the entire
1322                                                    * program to track each string's length.
1323                                                    */
1324                                                   for (i = 1; i <= UCHAR_MAX; i++) {
1325                                                             if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "relex2"))
1326                                                                 FATAL("out of space for reg expr %.10s...", lastre);
1327                                                             if (cc->cc_func(i)) {
1328                                                                       /* escape backslash */
1329                                                                       if (i == '\\') {
1330                                                                                 *bp++ = '\\';
1331                                                                                 n++;
1332                                                                       }
1333 
1334                                                                       *bp++ = i;
1335                                                                       n++;
1336                                                             }
1337                                                   }
1338                                         } else
1339                                                   *bp++ = c;
1340                               } else if (c == '[' && *prestr == '.') {
1341                                         char collate_char;
1342                                         prestr++;
1343                                         collate_char = *prestr++;
1344                                         if (*prestr == '.' && prestr[1] == ']') {
1345                                                   prestr += 2;
1346                                                   /* Found it: map via locale TBD: for
1347                                                      now, simply return this char.  This
1348                                                      is sufficient to pass conformance
1349                                                      test awk.ex 156
1350                                                    */
1351                                                   if (*prestr == ']') {
1352                                                             prestr++;
1353                                                             rlxval = collate_char;
1354                                                             return CHAR;
1355                                                   }
1356                                         }
1357                               } else if (c == '[' && *prestr == '=') {
1358                                         char equiv_char;
1359                                         prestr++;
1360                                         equiv_char = *prestr++;
1361                                         if (*prestr == '=' && prestr[1] == ']') {
1362                                                   prestr += 2;
1363                                                   /* Found it: map via locale TBD: for now
1364                                                      simply return this char. This is
1365                                                      sufficient to pass conformance test
1366                                                      awk.ex 156
1367                                                    */
1368                                                   if (*prestr == ']') {
1369                                                             prestr++;
1370                                                             rlxval = equiv_char;
1371                                                             return CHAR;
1372                                                   }
1373                                         }
1374                               } else if (c == '\0') {
1375                                         FATAL("nonterminated character class %.20s", lastre);
1376                               } else if (bp == buf) {       /* 1st char is special */
1377                                         *bp++ = c;
1378                               } else if (c == ']') {
1379                                         *bp++ = 0;
1380                                         rlxstr = (uschar *) tostring((char *) buf);
1381                                         if (cflag == 0)
1382                                                   return CCL;
1383                                         else
1384                                                   return NCCL;
1385                               } else
1386                                         *bp++ = c;
1387                     }
1388                     break;
1389           case '{':
1390                     if (isdigit((int) *(prestr))) {
1391                               num = 0;  /* Process as a repetition */
1392                               n = -1; m = -1;
1393                               commafound = false;
1394                               digitfound = false;
1395                               startreptok = prestr-1;
1396                               /* Remember start of previous atom here ? */
1397                     } else {            /* just a { char, not a repetition */
1398                               rlxval = c;
1399                               return CHAR;
1400                 }
1401                     for (; ; ) {
1402                               if ((c = *prestr++) == '}') {
1403                                         if (commafound) {
1404                                                   if (digitfound) { /* {n,m} */
1405                                                             m = num;
1406                                                             if (m < n)
1407                                                                       FATAL("illegal repetition expression: class %.20s",
1408                                                                                 lastre);
1409                                                             if (n == 0 && m == 1) {
1410                                                                       return QUEST;
1411                                                             }
1412                                                   } else {  /* {n,} */
1413                                                             if (n == 0)
1414                                                                       return STAR;
1415                                                             else if (n == 1)
1416                                                                       return PLUS;
1417                                                   }
1418                                         } else {
1419                                                   if (digitfound) { /* {n} same as {n,n} */
1420                                                             n = num;
1421                                                             m = num;
1422                                                   } else {  /* {} */
1423                                                             FATAL("illegal repetition expression: class %.20s",
1424                                                                       lastre);
1425                                                   }
1426                                         }
1427                                         if (repeat(starttok, prestr-starttok, lastatom,
1428                                                      startreptok - lastatom, n, m) > 0) {
1429                                                   if (n == 0 && m == 0) {
1430                                                             return ZERO;
1431                                                   }
1432                                                   /* must rescan input for next token */
1433                                                   goto rescan;
1434                                         }
1435                                         /* Failed to replace: eat up {...} characters
1436                                            and treat like just PLUS */
1437                                         return PLUS;
1438                               } else if (c == '\0') {
1439                                         FATAL("nonterminated character class %.20s",
1440                                                   lastre);
1441                               } else if (isdigit(c)) {
1442                                         num = 10 * num + c - '0';
1443                                         digitfound = true;
1444                               } else if (c == ',') {
1445                                         if (commafound)
1446                                                   FATAL("illegal repetition expression: class %.20s",
1447                                                             lastre);
1448                                         /* looking for {n,} or {n,m} */
1449                                         commafound = true;
1450                                         n = num;
1451                                         digitfound = false; /* reset */
1452                                         num = 0;
1453                               } else {
1454                                         FATAL("illegal repetition expression: class %.20s",
1455                                                   lastre);
1456                               }
1457                     }
1458                     break;
1459           }
1460 }
1461 
cgoto(fa * f,int s,int c)1462 int cgoto(fa *f, int s, int c)
1463 {
1464           int *p, *q;
1465           int i, j, k;
1466 
1467           /* assert(c == HAT || c < NCHARS);  BUG: seg fault if disable test */
1468           while (f->accept >= maxsetvec) {        /* guessing here! */
1469                     resizesetvec(__func__);
1470           }
1471           for (i = 0; i <= f->accept; i++)
1472                     setvec[i] = 0;
1473           setcnt = 0;
1474           resize_state(f, s);
1475           /* compute positions of gototab[s,c] into setvec */
1476           p = f->posns[s];
1477           for (i = 1; i <= *p; i++) {
1478                     if ((k = f->re[p[i]].ltype) != FINAL) {
1479                               if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np))
1480                                || (k == DOT && c != 0 && c != HAT)
1481                                || (k == ALL && c != 0)
1482                                || (k == EMPTYRE && c != 0)
1483                                || (k == CCL && member(c, (int *) f->re[p[i]].lval.rp))
1484                                || (k == NCCL && !member(c, (int *) f->re[p[i]].lval.rp) && c != 0 && c != HAT)) {
1485                                         q = f->re[p[i]].lfollow;
1486                                         for (j = 1; j <= *q; j++) {
1487                                                   if (q[j] >= maxsetvec) {
1488                                                             resizesetvec(__func__);
1489                                                   }
1490                                                   if (setvec[q[j]] == 0) {
1491                                                             setcnt++;
1492                                                             setvec[q[j]] = 1;
1493                                                   }
1494                                         }
1495                               }
1496                     }
1497           }
1498           /* determine if setvec is a previous state */
1499           tmpset[0] = setcnt;
1500           j = 1;
1501           for (i = f->accept; i >= 0; i--)
1502                     if (setvec[i]) {
1503                               tmpset[j++] = i;
1504                     }
1505           resize_state(f, f->curstat > s ? f->curstat : s);
1506           /* tmpset == previous state? */
1507           for (i = 1; i <= f->curstat; i++) {
1508                     p = f->posns[i];
1509                     if ((k = tmpset[0]) != p[0])
1510                               goto different;
1511                     for (j = 1; j <= k; j++)
1512                               if (tmpset[j] != p[j])
1513                                         goto different;
1514                     /* setvec is state i */
1515                     if (c != HAT)
1516                               set_gototab(f, s, c, i);
1517                     return i;
1518             different:;
1519           }
1520 
1521           /* add tmpset to current set of states */
1522           ++(f->curstat);
1523           resize_state(f, f->curstat);
1524           clear_gototab(f, f->curstat);
1525           xfree(f->posns[f->curstat]);
1526           p = intalloc(setcnt + 1, __func__);
1527 
1528           f->posns[f->curstat] = p;
1529           if (c != HAT)
1530                     set_gototab(f, s, c, f->curstat);
1531           for (i = 0; i <= setcnt; i++)
1532                     p[i] = tmpset[i];
1533           if (setvec[f->accept])
1534                     f->out[f->curstat] = 1;
1535           else
1536                     f->out[f->curstat] = 0;
1537           return f->curstat;
1538 }
1539 
1540 
freefa(fa * f)1541 void freefa(fa *f)  /* free a finite automaton */
1542 {
1543           int i;
1544 
1545           if (f == NULL)
1546                     return;
1547           for (i = 0; i < f->state_count; i++)
1548                     xfree(f->gototab[i].entries);
1549           xfree(f->gototab);
1550           for (i = 0; i <= f->curstat; i++)
1551                     xfree(f->posns[i]);
1552           for (i = 0; i <= f->accept; i++) {
1553                     xfree(f->re[i].lfollow);
1554                     if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
1555                               xfree(f->re[i].lval.np);
1556           }
1557           xfree(f->restr);
1558           xfree(f->out);
1559           xfree(f->posns);
1560           xfree(f->gototab);
1561           xfree(f);
1562 }
1563