1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
3 All Rights Reserved
4 
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
13 permission.
14 
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 THIS SOFTWARE.
23 ****************************************************************/
24 
25 /* lasciate ogne speranza, voi ch'intrate. */
26 
27 #if HAVE_NBTOOL_CONFIG_H
28 #include "nbtool_config.h"
29 #endif
30 
31 #define   DEBUG
32 
33 #include <ctype.h>
34 #include <limits.h>
35 #include <stdio.h>
36 #include <string.h>
37 #include <stdlib.h>
38 #include "awk.h"
39 #include "awkgram.h"
40 
41 #define MAXLIN 22
42 
43 #define type(v)               (v)->nobj /* badly overloaded here */
44 #define info(v)               (v)->ntype          /* badly overloaded here */
45 #define left(v)               (v)->narg[0]
46 #define right(v)    (v)->narg[1]
47 #define parent(v)   (v)->nnext
48 
49 #define LEAF        case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
50 #define ELEAF       case EMPTYRE:                 /* empty string in regexp */
51 #define UNARY       case STAR: case PLUS: case QUEST:
52 
53 /* encoding in tree Nodes:
54           leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE):
55                     left is index, right contains value or pointer to value
56           unary (STAR, PLUS, QUEST): left is child, right is null
57           binary (CAT, OR): left and right are children
58           parent contains pointer to parent
59 */
60 
61 
62 int       *setvec;
63 int       *tmpset;
64 int       maxsetvec = 0;
65 
66 int       rtok;               /* next token in current re */
67 int       rlxval;
68 static const uschar *rlxstr;
69 static const uschar *prestr;  /* current position in current re */
70 static const uschar *lastre;  /* origin of last re */
71 static const uschar *lastatom;          /* origin of last Atom */
72 static const uschar *starttok;
73 static const uschar           *basestr; /* starts with original, replaced during
74                                            repetition processing */
75 static const uschar           *firstbasestr;
76 
77 static    int setcnt;
78 static    int poscnt;
79 
80 const char          *patbeg;
81 int       patlen;
82 
83 #define   NFA       128       /* cache this many dynamic fa's */
84 fa        *fatab[NFA];
85 int       nfatab    = 0;      /* entries in fatab */
86 
87 /* utf-8 mechanism:
88 
89    For most of Awk, utf-8 strings just "work", since they look like
90    null-terminated sequences of 8-bit bytes.
91 
92    Functions like length(), index(), and substr() have to operate
93    in units of utf-8 characters.  The u8_* functions in run.c
94    handle this.
95 
96    Regular expressions are more complicated, since the basic
97    mechanism of the goto table used 8-bit byte indices into the
98    gototab entries to compute the next state.  Unicode is a lot
99    bigger, so the gototab entries are now structs with a character
100    and a next state. These are sorted by code point and binary
101    searched.
102 
103    Throughout the RE mechanism in b.c, utf-8 characters are
104    converted to their utf-32 value.  This mostly shows up in
105    cclenter, which expands character class ranges like a-z and now
106    alpha-omega.  The size of a gototab array is still about 256.
107    This should be dynamic, but for now things work ok for a single
108    code page of Unicode, which is the most likely case.
109 
110    The code changes are localized in run.c and b.c.  I have added a
111    handful of functions to somewhat better hide the implementation,
112    but a lot more could be done.
113 
114  */
115 
116 static int entry_cmp(const void *l, const void *r);
117 static int get_gototab(fa*, int, int);
118 static int set_gototab(fa*, int, int, int);
119 static void clear_gototab(fa*, int);
120 
121 static int *
intalloc(size_t n,const char * f)122 intalloc(size_t n, const char *f)
123 {
124           int *p = (int *) calloc(n, sizeof(int));
125           if (p == NULL)
126                     overflo(f);
127           return p;
128 }
129 
130 static void
resizesetvec(const char * f)131 resizesetvec(const char *f)
132 {
133           if (maxsetvec == 0)
134                     maxsetvec = MAXLIN;
135           else
136                     maxsetvec *= 4;
137           setvec = (int *) realloc(setvec, maxsetvec * sizeof(*setvec));
138           tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(*tmpset));
139           if (setvec == NULL || tmpset == NULL)
140                     overflo(f);
141 }
142 
143 static void
resize_state(fa * f,int state)144 resize_state(fa *f, int state)
145 {
146           gtt *p;
147           uschar *p2;
148           int **p3;
149           int i, new_count;
150 
151           if (++state < f->state_count)
152                     return;
153 
154           new_count = state + 10; /* needs to be tuned */
155 
156           p = (gtt *) realloc(f->gototab, new_count * sizeof(gtt));
157           if (p == NULL)
158                     goto out;
159           f->gototab = p;
160 
161           p2 = (uschar *) realloc(f->out, new_count * sizeof(f->out[0]));
162           if (p2 == NULL)
163                     goto out;
164           f->out = p2;
165 
166           p3 = (int **) realloc(f->posns, new_count * sizeof(f->posns[0]));
167           if (p3 == NULL)
168                     goto out;
169           f->posns = p3;
170 
171           for (i = f->state_count; i < new_count; ++i) {
172                     f->gototab[i].entries = (gtte *) calloc(NCHARS, sizeof(gtte));
173                     if (f->gototab[i].entries == NULL)
174                               goto out;
175                     f->gototab[i].allocated = NCHARS;
176                     f->gototab[i].inuse = 0;
177                     f->out[i] = 0;
178                     f->posns[i] = NULL;
179           }
180           f->state_count = new_count;
181           return;
182 out:
183           overflo(__func__);
184 }
185 
makedfa(const char * s,bool anchor)186 fa *makedfa(const char *s, bool anchor) /* returns dfa for reg expr s */
187 {
188           int i, use, nuse;
189           fa *pfa;
190           static int now = 1;
191 
192           if (setvec == NULL) {         /* first time through any RE */
193                     resizesetvec(__func__);
194           }
195 
196           if (compile_time != RUNNING)  /* a constant for sure */
197                     return mkdfa(s, anchor);
198           for (i = 0; i < nfatab; i++)  /* is it there already? */
199                     if (fatab[i]->anchor == anchor
200                       && strcmp((const char *) fatab[i]->restr, s) == 0) {
201                               fatab[i]->use = now++;
202                               return fatab[i];
203                     }
204           pfa = mkdfa(s, anchor);
205           if (nfatab < NFA) { /* room for another */
206                     fatab[nfatab] = pfa;
207                     fatab[nfatab]->use = now++;
208                     nfatab++;
209                     return pfa;
210           }
211           use = fatab[0]->use;          /* replace least-recently used */
212           nuse = 0;
213           for (i = 1; i < nfatab; i++)
214                     if (fatab[i]->use < use) {
215                               use = fatab[i]->use;
216                               nuse = i;
217                     }
218           freefa(fatab[nuse]);
219           fatab[nuse] = pfa;
220           pfa->use = now++;
221           return pfa;
222 }
223 
mkdfa(const char * s,bool anchor)224 fa *mkdfa(const char *s, bool anchor)   /* does the real work of making a dfa */
225                                         /* anchor = true for anchored matches, else false */
226 {
227           Node *p, *p1;
228           fa *f;
229 
230           firstbasestr = (const uschar *) s;
231           basestr = firstbasestr;
232           p = reparse(s);
233           p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
234                     /* put ALL STAR in front of reg.  exp. */
235           p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
236                     /* put FINAL after reg.  exp. */
237 
238           poscnt = 0;
239           penter(p1);         /* enter parent pointers and leaf indices */
240           if ((f = (fa *) calloc(1, sizeof(fa) + poscnt * sizeof(rrow))) == NULL)
241                     overflo(__func__);
242           f->accept = poscnt-1;         /* penter has computed number of positions in re */
243           cfoll(f, p1);       /* set up follow sets */
244           freetr(p1);
245           resize_state(f, 1);
246           f->posns[0] = intalloc(*(f->re[0].lfollow), __func__);
247           f->posns[1] = intalloc(1, __func__);
248           *f->posns[1] = 0;
249           f->initstat = makeinit(f, anchor);
250           f->anchor = anchor;
251           f->restr = (uschar *) tostring(s);
252           if (firstbasestr != basestr) {
253                     if (basestr)
254                               xfree(basestr);
255           }
256           return f;
257 }
258 
makeinit(fa * f,bool anchor)259 int makeinit(fa *f, bool anchor)
260 {
261           int i, k;
262 
263           f->curstat = 2;
264           f->out[2] = 0;
265           k = *(f->re[0].lfollow);
266           xfree(f->posns[2]);
267           f->posns[2] = intalloc(k + 1,  __func__);
268           for (i = 0; i <= k; i++) {
269                     (f->posns[2])[i] = (f->re[0].lfollow)[i];
270           }
271           if ((f->posns[2])[1] == f->accept)
272                     f->out[2] = 1;
273           clear_gototab(f, 2);
274           f->curstat = cgoto(f, 2, HAT);
275           if (anchor) {
276                     *f->posns[2] = k-1; /* leave out position 0 */
277                     for (i = 0; i < k; i++) {
278                               (f->posns[0])[i] = (f->posns[2])[i];
279                     }
280 
281                     f->out[0] = f->out[2];
282                     if (f->curstat != 2)
283                               --(*f->posns[f->curstat]);
284           }
285           return f->curstat;
286 }
287 
penter(Node * p)288 void penter(Node *p)          /* set up parent pointers and leaf indices */
289 {
290           switch (type(p)) {
291           ELEAF
292           LEAF
293                     info(p) = poscnt;
294                     poscnt++;
295                     break;
296           UNARY
297                     penter(left(p));
298                     parent(left(p)) = p;
299                     break;
300           case CAT:
301           case OR:
302                     penter(left(p));
303                     penter(right(p));
304                     parent(left(p)) = p;
305                     parent(right(p)) = p;
306                     break;
307           case ZERO:
308                     break;
309           default:  /* can't happen */
310                     FATAL("can't happen: unknown type %d in penter", type(p));
311                     break;
312           }
313 }
314 
freetr(Node * p)315 void freetr(Node *p)          /* free parse tree */
316 {
317           switch (type(p)) {
318           ELEAF
319           LEAF
320                     xfree(p);
321                     break;
322           UNARY
323           case ZERO:
324                     freetr(left(p));
325                     xfree(p);
326                     break;
327           case CAT:
328           case OR:
329                     freetr(left(p));
330                     freetr(right(p));
331                     xfree(p);
332                     break;
333           default:  /* can't happen */
334                     FATAL("can't happen: unknown type %d in freetr", type(p));
335                     break;
336           }
337 }
338 
339 /* in the parsing of regular expressions, metacharacters like . have */
340 /* to be seen literally;  \056 is not a metacharacter. */
341 
hexstr(const uschar ** pp,int max)342 static int hexstr(const uschar **pp, int max)     /* find and eval hex string at pp, return new p */
343 {                             /* only pick up one 8-bit byte (2 chars) */
344           const uschar *p;
345           int n = 0;
346           int i;
347 
348           for (i = 0, p = *pp; i < max && isxdigit(*p); i++, p++) {
349                     if (isdigit((int) *p))
350                               n = 16 * n + *p - '0';
351                     else if (*p >= 'a' && *p <= 'f')
352                               n = 16 * n + *p - 'a' + 10;
353                     else if (*p >= 'A' && *p <= 'F')
354                               n = 16 * n + *p - 'A' + 10;
355           }
356           *pp = p;
357           return n;
358 }
359 
360 
361 
362 #define isoctdigit(c) ((c) >= '0' && (c) <= '7')  /* multiple use of arg */
363 
quoted(const uschar ** pp)364 int quoted(const uschar **pp) /* pick up next thing after a \\ */
365                               /* and increment *pp */
366 {
367           const uschar *p = *pp;
368           int c;
369 
370 /* BUG: should advance by utf-8 char even if makes no sense */
371 
372           switch ((c = *p++)) {
373           case 't':
374                     c = '\t';
375                     break;
376           case 'n':
377                     c = '\n';
378                     break;
379           case 'f':
380                     c = '\f';
381                     break;
382           case 'r':
383                     c = '\r';
384                     break;
385           case 'b':
386                     c = '\b';
387                     break;
388           case 'v':
389                     c = '\v';
390                     break;
391           case 'a':
392                     c = '\a';
393                     break;
394           case '\\':
395                     c = '\\';
396                     break;
397           case 'x': /* 2 hex digits follow */
398                     c = hexstr(&p, 2); /* this adds a null if number is invalid */
399                     break;
400           case 'u': /* unicode char number up to 8 hex digits */
401                     c = hexstr(&p, 8);
402                     break;
403           default:
404                     if (isoctdigit(c)) { /* \d \dd \ddd */
405                               int n = c - '0';
406                               if (isoctdigit(*p)) {
407                                         n = 8 * n + *p++ - '0';
408                                         if (isoctdigit(*p))
409                                                   n = 8 * n + *p++ - '0';
410                               }
411                               c = n;
412                     }
413           }
414 
415           *pp = p;
416           return c;
417 }
418 
cclenter(const char * argp)419 int *cclenter(const char *argp)         /* add a character class */
420 {
421           int i, c, c2;
422           int n;
423           const uschar *p = (const uschar *) argp;
424           int *bp, *retp;
425           static int *buf = NULL;
426           static int bufsz = 100;
427 
428           if (buf == NULL && (buf = (int *) calloc(bufsz, sizeof(int))) == NULL)
429                     FATAL("out of space for character class [%.10s...] 1", p);
430           bp = buf;
431           for (i = 0; *p != 0; ) {
432                     n = u8_rune(&c, (const char *) p);
433                     p += n;
434                     if (c == '\\') {
435                               c = quoted(&p);
436                     } else if (c == '-' && i > 0 && bp[-1] != 0) {
437                               if (*p != 0) {
438                                         c = bp[-1];
439                                         /* c2 = *p++; */
440                                         n = u8_rune(&c2, (const char *) p);
441                                         p += n;
442                                         if (c2 == '\\')
443                                                   c2 = quoted(&p); /* BUG: sets p, has to be u8 size */
444                                         if (c > c2) {       /* empty; ignore */
445                                                   bp--;
446                                                   i--;
447                                                   continue;
448                                         }
449                                         while (c < c2) {
450                                                   if (i >= bufsz) {
451                                                             bufsz *= 2;
452                                                             buf = (int *) realloc(buf, bufsz * sizeof(int));
453                                                             if (buf == NULL)
454                                                                       FATAL("out of space for character class [%.10s...] 2", p);
455                                                             bp = buf + i;
456                                                   }
457                                                   *bp++ = ++c;
458                                                   i++;
459                                         }
460                                         continue;
461                               }
462                     }
463                     if (i >= bufsz) {
464                               bufsz *= 2;
465                               buf = (int *) realloc(buf, bufsz * sizeof(int));
466                               if (buf == NULL)
467                                         FATAL("out of space for character class [%.10s...] 2", p);
468                               bp = buf + i;
469                     }
470                     *bp++ = c;
471                     i++;
472           }
473           *bp = 0;
474           /* DPRINTF("cclenter: in = |%s|, out = |%s|\n", op, buf); BUG: can't print array of int */
475           /* xfree(op);  BUG: what are we freeing here? */
476           retp = (int *) calloc(bp-buf+1, sizeof(int));
477           for (i = 0; i < bp-buf+1; i++)
478                     retp[i] = buf[i];
479           return retp;
480 }
481 
overflo(const char * s)482 void overflo(const char *s)
483 {
484           FATAL("regular expression too big: out of space in %.30s...", s);
485 }
486 
cfoll(fa * f,Node * v)487 void cfoll(fa *f, Node *v)    /* enter follow set of each leaf of vertex v into lfollow[leaf] */
488 {
489           int i;
490           int *p;
491 
492           switch (type(v)) {
493           ELEAF
494           LEAF
495                     f->re[info(v)].ltype = type(v);
496                     f->re[info(v)].lval.np = right(v);
497                     while (f->accept >= maxsetvec) {        /* guessing here! */
498                               resizesetvec(__func__);
499                     }
500                     for (i = 0; i <= f->accept; i++)
501                               setvec[i] = 0;
502                     setcnt = 0;
503                     follow(v);          /* computes setvec and setcnt */
504                     p = intalloc(setcnt + 1, __func__);
505                     f->re[info(v)].lfollow = p;
506                     *p = setcnt;
507                     for (i = f->accept; i >= 0; i--)
508                               if (setvec[i] == 1)
509                                         *++p = i;
510                     break;
511           UNARY
512                     cfoll(f,left(v));
513                     break;
514           case CAT:
515           case OR:
516                     cfoll(f,left(v));
517                     cfoll(f,right(v));
518                     break;
519           case ZERO:
520                     break;
521           default:  /* can't happen */
522                     FATAL("can't happen: unknown type %d in cfoll", type(v));
523           }
524 }
525 
first(Node * p)526 int first(Node *p)  /* collects initially active leaves of p into setvec */
527                               /* returns 0 if p matches empty string */
528 {
529           int b, lp;
530 
531           switch (type(p)) {
532           ELEAF
533           LEAF
534                     lp = info(p);       /* look for high-water mark of subscripts */
535                     while (setcnt >= maxsetvec || lp >= maxsetvec) {  /* guessing here! */
536                               resizesetvec(__func__);
537                     }
538                     if (type(p) == EMPTYRE) {
539                               setvec[lp] = 0;
540                               return(0);
541                     }
542                     if (setvec[lp] != 1) {
543                               setvec[lp] = 1;
544                               setcnt++;
545                     }
546                     if (type(p) == CCL && (*(int *) right(p)) == 0)
547                               return(0);                    /* empty CCL */
548                     return(1);
549           case PLUS:
550                     if (first(left(p)) == 0)
551                               return(0);
552                     return(1);
553           case STAR:
554           case QUEST:
555                     first(left(p));
556                     return(0);
557           case CAT:
558                     if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
559                     return(1);
560           case OR:
561                     b = first(right(p));
562                     if (first(left(p)) == 0 || b == 0) return(0);
563                     return(1);
564           case ZERO:
565                     return 0;
566           }
567           FATAL("can't happen: unknown type %d in first", type(p));   /* can't happen */
568           return(-1);
569 }
570 
follow(Node * v)571 void follow(Node *v)          /* collects leaves that can follow v into setvec */
572 {
573           Node *p;
574 
575           if (type(v) == FINAL)
576                     return;
577           p = parent(v);
578           switch (type(p)) {
579           case STAR:
580           case PLUS:
581                     first(v);
582                     follow(p);
583                     return;
584 
585           case OR:
586           case QUEST:
587                     follow(p);
588                     return;
589 
590           case CAT:
591                     if (v == left(p)) { /* v is left child of p */
592                               if (first(right(p)) == 0) {
593                                         follow(p);
594                                         return;
595                               }
596                     } else              /* v is right child */
597                               follow(p);
598                     return;
599           }
600 }
601 
member(int c,int * sarg)602 int member(int c, int *sarg)  /* is c in s? */
603 {
604           int *s = (int *) sarg;
605 
606           while (*s)
607                     if (c == *s++)
608                               return(1);
609           return(0);
610 }
611 
resize_gototab(fa * f,int state)612 static void resize_gototab(fa *f, int state)
613 {
614           size_t new_size = f->gototab[state].allocated * 2;
615           gtte *p = (gtte *) realloc(f->gototab[state].entries, new_size * sizeof(gtte));
616           if (p == NULL)
617                     overflo(__func__);
618 
619           // need to initialized the new memory to zero
620           size_t orig_size = f->gototab[state].allocated;             // 2nd half of new mem is this size
621           memset(p + orig_size, 0, orig_size * sizeof(gtte));         // clean it out
622 
623           f->gototab[state].allocated = new_size;                     // update gototab info
624           f->gototab[state].entries = p;
625 }
626 
get_gototab(fa * f,int state,int ch)627 static int get_gototab(fa *f, int state, int ch) /* hide gototab implementation */
628 {
629           gtte key;
630           gtte *item;
631 
632           key.ch = ch;
633           key.state = 0;      /* irrelevant */
634           item = (gtte *) bsearch(& key, f->gototab[state].entries,
635                               f->gototab[state].inuse, sizeof(gtte),
636                               entry_cmp);
637 
638           if (item == NULL)
639                     return 0;
640           else
641                     return item->state;
642 }
643 
entry_cmp(const void * l,const void * r)644 static int entry_cmp(const void *l, const void *r)
645 {
646           const gtte *left, *right;
647 
648           left = (const gtte *) l;
649           right = (const gtte *) r;
650 
651           return left->ch - right->ch;
652 }
653 
set_gototab(fa * f,int state,int ch,int val)654 static int set_gototab(fa *f, int state, int ch, int val) /* hide gototab implementation */
655 {
656           if (f->gototab[state].inuse == 0) {
657                     f->gototab[state].entries[0].ch = ch;
658                     f->gototab[state].entries[0].state = val;
659                     f->gototab[state].inuse++;
660                     return val;
661           } else if ((unsigned)ch > f->gototab[state].entries[f->gototab[state].inuse-1].ch) {
662                     // not seen yet, insert and return
663                     gtt *tab = & f->gototab[state];
664                     if (tab->inuse + 1 >= tab->allocated)
665                               resize_gototab(f, state);
666 
667                     f->gototab[state].entries[f->gototab[state].inuse].ch = ch;
668                     f->gototab[state].entries[f->gototab[state].inuse].state = val;
669                     f->gototab[state].inuse++;
670                     return val;
671           } else {
672                     // maybe we have it, maybe we don't
673                     gtte key;
674                     gtte *item;
675 
676                     key.ch = ch;
677                     key.state = 0;      /* irrelevant */
678                     item = (gtte *) bsearch(& key, f->gototab[state].entries,
679                                         f->gototab[state].inuse, sizeof(gtte),
680                                         entry_cmp);
681 
682                     if (item != NULL) {
683                               // we have it, update state and return
684                               item->state = val;
685                               return item->state;
686                     }
687                     // otherwise, fall through to insert and reallocate.
688           }
689 
690           gtt *tab = & f->gototab[state];
691           if (tab->inuse + 1 >= tab->allocated)
692                     resize_gototab(f, state);
693           f->gototab[state].entries[tab->inuse].ch = ch;
694           f->gototab[state].entries[tab->inuse].state = val;
695           ++tab->inuse;
696 
697           qsort(f->gototab[state].entries,
698                     f->gototab[state].inuse, sizeof(gtte), entry_cmp);
699 
700           return val; /* not used anywhere at the moment */
701 }
702 
clear_gototab(fa * f,int state)703 static void clear_gototab(fa *f, int state)
704 {
705           memset(f->gototab[state].entries, 0,
706                     f->gototab[state].allocated * sizeof(gtte));
707           f->gototab[state].inuse = 0;
708 }
709 
match(fa * f,const char * p0)710 int match(fa *f, const char *p0)        /* shortest match ? */
711 {
712           int s, ns;
713           int n;
714           int rune;
715           const uschar *p = (const uschar *) p0;
716 
717           /* return pmatch(f, p0); does it matter whether longest or shortest? */
718 
719           s = f->initstat;
720           assert (s < f->state_count);
721 
722           if (f->out[s])
723                     return(1);
724           do {
725                     /* assert(*p < NCHARS); */
726                     n = u8_rune(&rune, (const char *) p);
727                     if ((ns = get_gototab(f, s, rune)) != 0)
728                               s = ns;
729                     else
730                               s = cgoto(f, s, rune);
731                     if (f->out[s])
732                               return(1);
733                     if (*p == 0)
734                               break;
735                     p += n;
736           } while (1);  /* was *p++ != 0 */
737           return(0);
738 }
739 
pmatch(fa * f,const char * p0)740 int pmatch(fa *f, const char *p0)       /* longest match, for sub */
741 {
742           int s, ns;
743           int n;
744           int rune;
745           const uschar *p = (const uschar *) p0;
746           const uschar *q;
747 
748           s = f->initstat;
749           assert(s < f->state_count);
750 
751           patbeg = (const char *)p;
752           patlen = -1;
753           do {
754                     q = p;
755                     do {
756                               if (f->out[s])                /* final state */
757                                         patlen = q-p;
758                               /* assert(*q < NCHARS); */
759                               n = u8_rune(&rune, (const char *) q);
760                               if ((ns = get_gototab(f, s, rune)) != 0)
761                                         s = ns;
762                               else
763                                         s = cgoto(f, s, rune);
764 
765                               assert(s < f->state_count);
766 
767                               if (s == 1) {       /* no transition */
768                                         if (patlen >= 0) {
769                                                   patbeg = (const char *) p;
770                                                   return(1);
771                                         }
772                                         else
773                                                   goto nextin;        /* no match */
774                               }
775                               if (*q == 0)
776                                         break;
777                               q += n;
778                     } while (1);
779                     q++;  /* was *q++ */
780                     if (f->out[s])
781                               patlen = q-p-1;     /* don't count $ */
782                     if (patlen >= 0) {
783                               patbeg = (const char *) p;
784                               return(1);
785                     }
786           nextin:
787                     s = 2;
788                     if (*p == 0)
789                               break;
790                     n = u8_rune(&rune, (const char *) p);
791                     p += n;
792           } while (1); /* was *p++ */
793           return (0);
794 }
795 
nematch(fa * f,const char * p0)796 int nematch(fa *f, const char *p0)      /* non-empty match, for sub */
797 {
798           int s, ns;
799         int n;
800         int rune;
801           const uschar *p = (const uschar *) p0;
802           const uschar *q;
803 
804           s = f->initstat;
805           assert(s < f->state_count);
806 
807           patbeg = (const char *)p;
808           patlen = -1;
809           while (*p) {
810                     q = p;
811                     do {
812                               if (f->out[s])                /* final state */
813                                         patlen = q-p;
814                               /* assert(*q < NCHARS); */
815                               n = u8_rune(&rune, (const char *) q);
816                               if ((ns = get_gototab(f, s, rune)) != 0)
817                                         s = ns;
818                               else
819                                         s = cgoto(f, s, rune);
820                               if (s == 1) {       /* no transition */
821                                         if (patlen > 0) {
822                                                   patbeg = (const char *) p;
823                                                   return(1);
824                                         } else
825                                                   goto nnextin;       /* no nonempty match */
826                               }
827                               if (*q == 0)
828                                         break;
829                               q += n;
830                     } while (1);
831                     q++;
832                     if (f->out[s])
833                               patlen = q-p-1;     /* don't count $ */
834                     if (patlen > 0 ) {
835                               patbeg = (const char *) p;
836                               return(1);
837                     }
838           nnextin:
839                     s = 2;
840                     p++;
841           }
842           return (0);
843 }
844 
845 
846 /*
847  * NAME
848  *     fnematch
849  *
850  * DESCRIPTION
851  *     A stream-fed version of nematch which transfers characters to a
852  *     null-terminated buffer. All characters up to and including the last
853  *     character of the matching text or EOF are placed in the buffer. If
854  *     a match is found, patbeg and patlen are set appropriately.
855  *
856  * RETURN VALUES
857  *     false    No match found.
858  *     true     Match found.
859  */
860 
fnematch(fa * pfa,FILE * f,char ** pbuf,int * pbufsize,int quantum)861 bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
862 {
863           char *i, *j, *k, *buf = *pbuf;
864           int bufsize = *pbufsize;
865           int c, n, ns, s;
866 
867           s = pfa->initstat;
868           patlen = 0;
869 
870           /*
871            * buf <= i <= j <= k <= buf+bufsize
872            *
873            * i: origin of active substring
874            * j: current character
875            * k: destination of the next getc
876            */
877 
878           i = j = k = buf;
879 
880           do {
881                     /*
882                      * Call u8_rune with at least awk_mb_cur_max ahead in
883                      * the buffer until EOF interferes.
884                      */
885                     if (k - j < (int)awk_mb_cur_max) {
886                               if (k + awk_mb_cur_max > buf + bufsize) {
887                                         char *obuf = buf;
888                                         adjbuf((char **) &buf, &bufsize,
889                                             bufsize + awk_mb_cur_max,
890                                             quantum, 0, "fnematch");
891 
892                                         /* buf resized, maybe moved. update pointers */
893                                         *pbufsize = bufsize;
894                                         if (obuf != buf) {
895                                                   i = buf + (i - obuf);
896                                                   j = buf + (j - obuf);
897                                                   k = buf + (k - obuf);
898                                                   *pbuf = buf;
899                                                   if (patlen)
900                                                             patbeg = buf + (patbeg - obuf);
901                                         }
902                               }
903                               for (n = awk_mb_cur_max ; n > 0; n--) {
904                                         *k++ = (c = getc(f)) != EOF ? c : 0;
905                                         if (c == EOF) {
906                                                   if (ferror(f))
907                                                             FATAL("fnematch: getc error");
908                                                   break;
909                                         }
910                               }
911                     }
912 
913                     j += u8_rune(&c, j);
914 
915                     if ((ns = get_gototab(pfa, s, c)) != 0)
916                               s = ns;
917                     else
918                               s = cgoto(pfa, s, c);
919 
920                     if (pfa->out[s]) {  /* final state */
921                               patbeg = i;
922                               patlen = j - i;
923                               if (c == 0)         /* don't count $ */
924                                         patlen--;
925                     }
926 
927                     if (c && s != 1)
928                               continue;  /* origin i still viable, next j */
929                     if (patlen)
930                               break;     /* best match found */
931 
932                     /* no match at origin i, next i and start over */
933                     i += u8_rune(&c, i);
934                     if (c == 0)
935                               break;    /* no match */
936                     j = i;
937                     s = 2;
938           } while (1);
939 
940           if (patlen) {
941                     /*
942                      * Under no circumstances is the last character fed to
943                      * the automaton part of the match. It is EOF's nullbyte,
944                      * or it sent the automaton into a state with no further
945                      * transitions available (s==1), or both. Room for a
946                      * terminating nullbyte is guaranteed.
947                      *
948                      * ungetc any chars after the end of matching text
949                      * (except for EOF's nullbyte, if present) and null
950                      * terminate the buffer.
951                      */
952                     do
953                               if (*--k && ungetc(*k, f) == EOF)
954                                         FATAL("unable to ungetc '%c'", *k);
955                     while (k > patbeg + patlen);
956                     *k = '\0';
957                     return true;
958           }
959           else
960                     return false;
961 }
962 
reparse(const char * p)963 Node *reparse(const char *p)  /* parses regular expression pointed to by p */
964 {                             /* uses relex() to scan regular expression */
965           Node *np;
966 
967           DPRINTF("reparse <%s>\n", p);
968           lastre = prestr = (const uschar *) p;   /* prestr points to string to be parsed */
969           rtok = relex();
970           /* GNU compatibility: an empty regexp matches anything */
971           if (rtok == '\0') {
972                     /* FATAL("empty regular expression"); previous */
973                     return(op2(EMPTYRE, NIL, NIL));
974           }
975           np = regexp();
976           if (rtok != '\0')
977                     FATAL("syntax error in regular expression %s at %s", lastre, prestr);
978           return(np);
979 }
980 
regexp(void)981 Node *regexp(void)  /* top-level parse of reg expr */
982 {
983           return (alt(concat(primary())));
984 }
985 
primary(void)986 Node *primary(void)
987 {
988           Node *np;
989           int savelastatom;
990 
991           switch (rtok) {
992           case CHAR:
993                     lastatom = starttok;
994                     np = op2(CHAR, NIL, itonp(rlxval));
995                     rtok = relex();
996                     return (unary(np));
997           case ALL:
998                     rtok = relex();
999                     return (unary(op2(ALL, NIL, NIL)));
1000           case EMPTYRE:
1001                     rtok = relex();
1002                     return (unary(op2(EMPTYRE, NIL, NIL)));
1003           case DOT:
1004                     lastatom = starttok;
1005                     rtok = relex();
1006                     return (unary(op2(DOT, NIL, NIL)));
1007           case CCL:
1008                     np = op2(CCL, NIL, (Node*) cclenter((const char *) rlxstr));
1009                     lastatom = starttok;
1010                     rtok = relex();
1011                     return (unary(np));
1012           case NCCL:
1013                     np = op2(NCCL, NIL, (Node *) cclenter((const char *) rlxstr));
1014                     lastatom = starttok;
1015                     rtok = relex();
1016                     return (unary(np));
1017           case '^':
1018                     rtok = relex();
1019                     return (unary(op2(CHAR, NIL, itonp(HAT))));
1020           case '$':
1021                     rtok = relex();
1022                     return (unary(op2(CHAR, NIL, NIL)));
1023           case '(':
1024                     lastatom = starttok;
1025                     savelastatom = starttok - basestr; /* Retain over recursion */
1026                     rtok = relex();
1027                     if (rtok == ')') {  /* special pleading for () */
1028                               rtok = relex();
1029                               return unary(op2(CCL, NIL, (Node *) cclenter("")));
1030                     }
1031                     np = regexp();
1032                     if (rtok == ')') {
1033                               lastatom = basestr + savelastatom; /* Restore */
1034                               rtok = relex();
1035                               return (unary(np));
1036                     }
1037                     else
1038                               FATAL("syntax error in regular expression %s at %s", lastre, prestr);
1039           default:
1040                     FATAL("illegal primary in regular expression %s at %s", lastre, prestr);
1041           }
1042           return 0; /*NOTREACHED*/
1043 }
1044 
concat(Node * np)1045 Node *concat(Node *np)
1046 {
1047           switch (rtok) {
1048           case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
1049                     return (concat(op2(CAT, np, primary())));
1050           case EMPTYRE:
1051                     rtok = relex();
1052                     return (concat(op2(CAT, op2(CCL, NIL, (Node *) cclenter("")),
1053                                         primary())));
1054           }
1055           return (np);
1056 }
1057 
alt(Node * np)1058 Node *alt(Node *np)
1059 {
1060           if (rtok == OR) {
1061                     rtok = relex();
1062                     return (alt(op2(OR, np, concat(primary()))));
1063           }
1064           return (np);
1065 }
1066 
unary(Node * np)1067 Node *unary(Node *np)
1068 {
1069           switch (rtok) {
1070           case STAR:
1071                     rtok = relex();
1072                     return (unary(op2(STAR, np, NIL)));
1073           case PLUS:
1074                     rtok = relex();
1075                     return (unary(op2(PLUS, np, NIL)));
1076           case QUEST:
1077                     rtok = relex();
1078                     return (unary(op2(QUEST, np, NIL)));
1079           case ZERO:
1080                     rtok = relex();
1081                     return (unary(op2(ZERO, np, NIL)));
1082           default:
1083                     return (np);
1084           }
1085 }
1086 
1087 /*
1088  * Character class definitions conformant to the POSIX locale as
1089  * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
1090  * and operating character sets are both ASCII (ISO646) or supersets
1091  * thereof.
1092  *
1093  * Note that to avoid overflowing the temporary buffer used in
1094  * relex(), the expanded character class (prior to range expansion)
1095  * must be less than twice the size of their full name.
1096  */
1097 
1098 /* Because isblank doesn't show up in any of the header files on any
1099  * system i use, it's defined here.  if some other locale has a richer
1100  * definition of "blank", define HAS_ISBLANK and provide your own
1101  * version.
1102  * the parentheses here are an attempt to find a path through the maze
1103  * of macro definition and/or function and/or version provided.  thanks
1104  * to nelson beebe for the suggestion; let's see if it works everywhere.
1105  */
1106 
1107 /* #define HAS_ISBLANK */
1108 #ifndef HAS_ISBLANK
1109 
1110 int (xisblank)(int c)
1111 {
1112           return c==' ' || c=='\t';
1113 }
1114 
1115 #endif
1116 
1117 static const struct charclass {
1118           const char *cc_name;
1119           int cc_namelen;
1120           int (*cc_func)(int);
1121 } charclasses[] = {
1122           { "alnum",          5,        isalnum },
1123           { "alpha",          5,        isalpha },
1124 #ifndef HAS_ISBLANK
1125           { "blank",          5,        xisblank },
1126 #else
1127           { "blank",          5,        isblank },
1128 #endif
1129           { "cntrl",          5,        iscntrl },
1130           { "digit",          5,        isdigit },
1131           { "graph",          5,        isgraph },
1132           { "lower",          5,        islower },
1133           { "print",          5,        isprint },
1134           { "punct",          5,        ispunct },
1135           { "space",          5,        isspace },
1136           { "upper",          5,        isupper },
1137           { "xdigit",         6,        isxdigit },
1138           { NULL,             0,        NULL },
1139 };
1140 
1141 #define REPEAT_SIMPLE                   0
1142 #define REPEAT_PLUS_APPENDED  1
1143 #define REPEAT_WITH_Q                   2
1144 #define REPEAT_ZERO           3
1145 
1146 static int
replace_repeat(const uschar * reptok,int reptoklen,const uschar * atom,int atomlen,int firstnum,int secondnum,int special_case)1147 replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom,
1148                  int atomlen, int firstnum, int secondnum, int special_case)
1149 {
1150           int i, j;
1151           uschar *buf = 0;
1152           int ret = 1;
1153           int init_q = (firstnum == 0);           /* first added char will be ? */
1154           int n_q_reps = secondnum-firstnum;      /* m>n, so reduce until {1,m-n} left  */
1155           int prefix_length = reptok - basestr;   /* prefix includes first rep  */
1156           int suffix_length = strlen((const char *) reptok) - reptoklen;        /* string after rep specifier */
1157           int size = prefix_length +  suffix_length;
1158 
1159           if (firstnum > 1) { /* add room for reps 2 through firstnum */
1160                     size += atomlen*(firstnum-1);
1161           }
1162 
1163           /* Adjust size of buffer for special cases */
1164           if (special_case == REPEAT_PLUS_APPENDED) {
1165                     size++;             /* for the final + */
1166           } else if (special_case == REPEAT_WITH_Q) {
1167                     size += init_q + (atomlen+1)* (n_q_reps-init_q);
1168           } else if (special_case == REPEAT_ZERO) {
1169                     size += 2;          /* just a null ERE: () */
1170           }
1171           if ((buf = (uschar *) malloc(size + 1)) == NULL)
1172                     FATAL("out of space in reg expr %.10s..", lastre);
1173           memcpy(buf, basestr, prefix_length);    /* copy prefix      */
1174           j = prefix_length;
1175           if (special_case == REPEAT_ZERO) {
1176                     j -= atomlen;
1177                     buf[j++] = '(';
1178                     buf[j++] = ')';
1179           }
1180           for (i = 1; i < firstnum; i++) {        /* copy x reps      */
1181                     memcpy(&buf[j], atom, atomlen);
1182                     j += atomlen;
1183           }
1184           if (special_case == REPEAT_PLUS_APPENDED) {
1185                     buf[j++] = '+';
1186           } else if (special_case == REPEAT_WITH_Q) {
1187                     if (init_q)
1188                               buf[j++] = '?';
1189                     for (i = init_q; i < n_q_reps; i++) {   /* copy x? reps */
1190                               memcpy(&buf[j], atom, atomlen);
1191                               j += atomlen;
1192                               buf[j++] = '?';
1193                     }
1194           }
1195           memcpy(&buf[j], reptok+reptoklen, suffix_length);
1196           j += suffix_length;
1197           buf[j] = '\0';
1198           /* free old basestr */
1199           if (firstbasestr != basestr) {
1200                     if (basestr)
1201                               xfree(basestr);
1202           }
1203           basestr = buf;
1204           prestr  = buf + prefix_length;
1205           if (special_case == REPEAT_ZERO) {
1206                     prestr  -= atomlen;
1207                     ret++;
1208           }
1209           return ret;
1210 }
1211 
repeat(const uschar * reptok,int reptoklen,const uschar * atom,int atomlen,int firstnum,int secondnum)1212 static int repeat(const uschar *reptok, int reptoklen, const uschar *atom,
1213                       int atomlen, int firstnum, int secondnum)
1214 {
1215           /*
1216              In general, the repetition specifier or "bound" is replaced here
1217              by an equivalent ERE string, repeating the immediately previous atom
1218              and appending ? and + as needed. Note that the first copy of the
1219              atom is left in place, except in the special_case of a zero-repeat
1220              (i.e., {0}).
1221            */
1222           if (secondnum < 0) {          /* means {n,} -> repeat n-1 times followed by PLUS */
1223                     if (firstnum < 2) {
1224                               /* 0 or 1: should be handled before you get here */
1225                               FATAL("internal error");
1226                     } else {
1227                               return replace_repeat(reptok, reptoklen, atom, atomlen,
1228                                         firstnum, secondnum, REPEAT_PLUS_APPENDED);
1229                     }
1230           } else if (firstnum == secondnum) {     /* {n} or {n,n} -> simply repeat n-1 times */
1231                     if (firstnum == 0) {          /* {0} or {0,0} */
1232                               /* This case is unusual because the resulting
1233                                  replacement string might actually be SMALLER than
1234                                  the original ERE */
1235                               return replace_repeat(reptok, reptoklen, atom, atomlen,
1236                                                   firstnum, secondnum, REPEAT_ZERO);
1237                     } else {            /* (firstnum >= 1) */
1238                               return replace_repeat(reptok, reptoklen, atom, atomlen,
1239                                                   firstnum, secondnum, REPEAT_SIMPLE);
1240                     }
1241           } else if (firstnum < secondnum) {      /* {n,m} -> repeat n-1 times then alternate  */
1242                     /*  x{n,m}  =>  xx...x{1, m-n+1}  =>  xx...x?x?x?..x?       */
1243                     return replace_repeat(reptok, reptoklen, atom, atomlen,
1244                                                   firstnum, secondnum, REPEAT_WITH_Q);
1245           } else {  /* Error - shouldn't be here (n>m) */
1246                     FATAL("internal error");
1247           }
1248           return 0;
1249 }
1250 
relex(void)1251 int relex(void)               /* lexical analyzer for reparse */
1252 {
1253           int c, n;
1254           int cflag;
1255           static uschar *buf = NULL;
1256           static int bufsz = 100;
1257           uschar *bp;
1258           const struct charclass *cc;
1259           int i;
1260           int num, m;
1261           bool commafound, digitfound;
1262           const uschar *startreptok;
1263           static int parens = 0;
1264 
1265 rescan:
1266           starttok = prestr;
1267 
1268           if ((n = u8_rune(&rlxval, (const char *) prestr)) > 1) {
1269                     prestr += n;
1270                     starttok = prestr;
1271                     return CHAR;
1272           }
1273 
1274           switch (c = *prestr++) {
1275           case '|': return OR;
1276           case '*': return STAR;
1277           case '+': return PLUS;
1278           case '?': return QUEST;
1279           case '.': return DOT;
1280           case '\0': prestr--; return '\0';
1281           case '^':
1282           case '$':
1283                     return c;
1284           case '(':
1285                     parens++;
1286                     return c;
1287           case ')':
1288                     if (parens) {
1289                               parens--;
1290                               return c;
1291                     }
1292                     /* unmatched close parenthesis; per POSIX, treat as literal */
1293                     rlxval = c;
1294                     return CHAR;
1295           case '\\':
1296                     rlxval = quoted(&prestr);
1297                     return CHAR;
1298           default:
1299                     rlxval = c;
1300                     return CHAR;
1301           case '[':
1302                     if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL)
1303                               FATAL("out of space in reg expr %.10s..", lastre);
1304                     bp = buf;
1305                     if (*prestr == '^') {
1306                               cflag = 1;
1307                               prestr++;
1308                     }
1309                     else
1310                               cflag = 0;
1311                     n = 5 * strlen((const char *) prestr)+1; /* BUG: was 2.  what value? */
1312                     if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1"))
1313                               FATAL("out of space for reg expr %.10s...", lastre);
1314                     for (; ; ) {
1315                               if ((n = u8_rune(&rlxval, (const char *) prestr)) > 1) {
1316                                         for (i = 0; i < n; i++)
1317                                                   *bp++ = *prestr++;
1318                                         continue;
1319                               }
1320                               if ((c = *prestr++) == '\\') {
1321                                         *bp++ = '\\';
1322                                         if ((c = *prestr++) == '\0')
1323                                                   FATAL("nonterminated character class %.20s...", lastre);
1324                                         *bp++ = c;
1325                               /* } else if (c == '\n') { */
1326                               /*        FATAL("newline in character class %.20s...", lastre); */
1327                               } else if (c == '[' && *prestr == ':') {
1328                                         /* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
1329                                         for (cc = charclasses; cc->cc_name; cc++)
1330                                                   if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0)
1331                                                             break;
1332                                         if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
1333                                             prestr[2 + cc->cc_namelen] == ']') {
1334                                                   prestr += cc->cc_namelen + 3;
1335                                                   /*
1336                                                    * BUG: We begin at 1, instead of 0, since we
1337                                                    * would otherwise prematurely terminate the
1338                                                    * string for classes like [[:cntrl:]]. This
1339                                                    * means that we can't match the NUL character,
1340                                                    * not without first adapting the entire
1341                                                    * program to track each string's length.
1342                                                    */
1343                                                   for (i = 1; i <= UCHAR_MAX; i++) {
1344                                                             if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "relex2"))
1345                                                                 FATAL("out of space for reg expr %.10s...", lastre);
1346                                                             if (cc->cc_func(i)) {
1347                                                                       /* escape backslash */
1348                                                                       if (i == '\\') {
1349                                                                                 *bp++ = '\\';
1350                                                                                 n++;
1351                                                                       }
1352 
1353                                                                       *bp++ = i;
1354                                                                       n++;
1355                                                             }
1356                                                   }
1357                                         } else
1358                                                   *bp++ = c;
1359                               } else if (c == '[' && *prestr == '.') {
1360                                         char collate_char;
1361                                         prestr++;
1362                                         collate_char = *prestr++;
1363                                         if (*prestr == '.' && prestr[1] == ']') {
1364                                                   prestr += 2;
1365                                                   /* Found it: map via locale TBD: for
1366                                                      now, simply return this char.  This
1367                                                      is sufficient to pass conformance
1368                                                      test awk.ex 156
1369                                                    */
1370                                                   if (*prestr == ']') {
1371                                                             prestr++;
1372                                                             rlxval = collate_char;
1373                                                             return CHAR;
1374                                                   }
1375                                         }
1376                               } else if (c == '[' && *prestr == '=') {
1377                                         char equiv_char;
1378                                         prestr++;
1379                                         equiv_char = *prestr++;
1380                                         if (*prestr == '=' && prestr[1] == ']') {
1381                                                   prestr += 2;
1382                                                   /* Found it: map via locale TBD: for now
1383                                                      simply return this char. This is
1384                                                      sufficient to pass conformance test
1385                                                      awk.ex 156
1386                                                    */
1387                                                   if (*prestr == ']') {
1388                                                             prestr++;
1389                                                             rlxval = equiv_char;
1390                                                             return CHAR;
1391                                                   }
1392                                         }
1393                               } else if (c == '\0') {
1394                                         FATAL("nonterminated character class %.20s", lastre);
1395                               } else if (bp == buf) {       /* 1st char is special */
1396                                         *bp++ = c;
1397                               } else if (c == ']') {
1398                                         *bp++ = 0;
1399                                         rlxstr = (uschar *) tostring((char *) buf);
1400                                         if (cflag == 0)
1401                                                   return CCL;
1402                                         else
1403                                                   return NCCL;
1404                               } else
1405                                         *bp++ = c;
1406                     }
1407                     break;
1408           case '{':
1409                     if (isdigit((int) *(prestr))) {
1410                               num = 0;  /* Process as a repetition */
1411                               n = -1; m = -1;
1412                               commafound = false;
1413                               digitfound = false;
1414                               startreptok = prestr-1;
1415                               /* Remember start of previous atom here ? */
1416                     } else {            /* just a { char, not a repetition */
1417                               rlxval = c;
1418                               return CHAR;
1419                 }
1420                     for (; ; ) {
1421                               if ((c = *prestr++) == '}') {
1422                                         if (commafound) {
1423                                                   if (digitfound) { /* {n,m} */
1424                                                             m = num;
1425                                                             if (m < n)
1426                                                                       FATAL("illegal repetition expression: class %.20s",
1427                                                                                 lastre);
1428                                                             if (n == 0 && m == 1) {
1429                                                                       return QUEST;
1430                                                             }
1431                                                   } else {  /* {n,} */
1432                                                             if (n == 0)
1433                                                                       return STAR;
1434                                                             else if (n == 1)
1435                                                                       return PLUS;
1436                                                   }
1437                                         } else {
1438                                                   if (digitfound) { /* {n} same as {n,n} */
1439                                                             n = num;
1440                                                             m = num;
1441                                                   } else {  /* {} */
1442                                                             FATAL("illegal repetition expression: class %.20s",
1443                                                                       lastre);
1444                                                   }
1445                                         }
1446                                         if (repeat(starttok, prestr-starttok, lastatom,
1447                                                      startreptok - lastatom, n, m) > 0) {
1448                                                   if (n == 0 && m == 0) {
1449                                                             return ZERO;
1450                                                   }
1451                                                   /* must rescan input for next token */
1452                                                   goto rescan;
1453                                         }
1454                                         /* Failed to replace: eat up {...} characters
1455                                            and treat like just PLUS */
1456                                         return PLUS;
1457                               } else if (c == '\0') {
1458                                         FATAL("nonterminated character class %.20s",
1459                                                   lastre);
1460                               } else if (isdigit(c)) {
1461                                         num = 10 * num + c - '0';
1462                                         digitfound = true;
1463                               } else if (c == ',') {
1464                                         if (commafound)
1465                                                   FATAL("illegal repetition expression: class %.20s",
1466                                                             lastre);
1467                                         /* looking for {n,} or {n,m} */
1468                                         commafound = true;
1469                                         n = num;
1470                                         digitfound = false; /* reset */
1471                                         num = 0;
1472                               } else {
1473                                         FATAL("illegal repetition expression: class %.20s",
1474                                                   lastre);
1475                               }
1476                     }
1477                     break;
1478           }
1479 }
1480 
cgoto(fa * f,int s,int c)1481 int cgoto(fa *f, int s, int c)
1482 {
1483           int *p, *q;
1484           int i, j, k;
1485 
1486           /* assert(c == HAT || c < NCHARS);  BUG: seg fault if disable test */
1487           while (f->accept >= maxsetvec) {        /* guessing here! */
1488                     resizesetvec(__func__);
1489           }
1490           for (i = 0; i <= f->accept; i++)
1491                     setvec[i] = 0;
1492           setcnt = 0;
1493           resize_state(f, s);
1494           /* compute positions of gototab[s,c] into setvec */
1495           p = f->posns[s];
1496           for (i = 1; i <= *p; i++) {
1497                     if ((k = f->re[p[i]].ltype) != FINAL) {
1498                               if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np))
1499                                || (k == DOT && c != 0 && c != HAT)
1500                                || (k == ALL && c != 0)
1501                                || (k == EMPTYRE && c != 0)
1502                                || (k == CCL && member(c, (int *) f->re[p[i]].lval.rp))
1503                                || (k == NCCL && !member(c, (int *) f->re[p[i]].lval.rp) && c != 0 && c != HAT)) {
1504                                         q = f->re[p[i]].lfollow;
1505                                         for (j = 1; j <= *q; j++) {
1506                                                   if (q[j] >= maxsetvec) {
1507                                                             resizesetvec(__func__);
1508                                                   }
1509                                                   if (setvec[q[j]] == 0) {
1510                                                             setcnt++;
1511                                                             setvec[q[j]] = 1;
1512                                                   }
1513                                         }
1514                               }
1515                     }
1516           }
1517           /* determine if setvec is a previous state */
1518           tmpset[0] = setcnt;
1519           j = 1;
1520           for (i = f->accept; i >= 0; i--)
1521                     if (setvec[i]) {
1522                               tmpset[j++] = i;
1523                     }
1524           resize_state(f, f->curstat > s ? f->curstat : s);
1525           /* tmpset == previous state? */
1526           for (i = 1; i <= f->curstat; i++) {
1527                     p = f->posns[i];
1528                     if ((k = tmpset[0]) != p[0])
1529                               goto different;
1530                     for (j = 1; j <= k; j++)
1531                               if (tmpset[j] != p[j])
1532                                         goto different;
1533                     /* setvec is state i */
1534                     if (c != HAT)
1535                               set_gototab(f, s, c, i);
1536                     return i;
1537             different:;
1538           }
1539 
1540           /* add tmpset to current set of states */
1541           ++(f->curstat);
1542           resize_state(f, f->curstat);
1543           clear_gototab(f, f->curstat);
1544           xfree(f->posns[f->curstat]);
1545           p = intalloc(setcnt + 1, __func__);
1546 
1547           f->posns[f->curstat] = p;
1548           if (c != HAT)
1549                     set_gototab(f, s, c, f->curstat);
1550           for (i = 0; i <= setcnt; i++)
1551                     p[i] = tmpset[i];
1552           if (setvec[f->accept])
1553                     f->out[f->curstat] = 1;
1554           else
1555                     f->out[f->curstat] = 0;
1556           return f->curstat;
1557 }
1558 
1559 
freefa(fa * f)1560 void freefa(fa *f)  /* free a finite automaton */
1561 {
1562           int i;
1563 
1564           if (f == NULL)
1565                     return;
1566           for (i = 0; i < f->state_count; i++)
1567                     xfree(f->gototab[i].entries);
1568           xfree(f->gototab);
1569           for (i = 0; i <= f->curstat; i++)
1570                     xfree(f->posns[i]);
1571           for (i = 0; i <= f->accept; i++) {
1572                     xfree(f->re[i].lfollow);
1573                     if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
1574                               xfree(f->re[i].lval.np);
1575           }
1576           xfree(f->restr);
1577           xfree(f->out);
1578           xfree(f->posns);
1579           xfree(f->gototab);
1580           xfree(f);
1581 }
1582