1 /* $NetBSD: regcomp.c,v 1.7 2011/11/19 17:45:11 tnozaki Exp $ */
2
3 /*-
4 * Copyright (c) 1992, 1993, 1994 Henry Spencer.
5 * Copyright (c) 1992, 1993, 1994
6 * The Regents of the University of California. All rights reserved.
7 *
8 * This code is derived from software contributed to Berkeley by
9 * Henry Spencer of the University of Toronto.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution.
19 * 3. All advertising materials mentioning features or use of this software
20 * must display the following acknowledgement:
21 * This product includes software developed by the University of
22 * California, Berkeley and its contributors.
23 * 4. Neither the name of the University nor the names of its contributors
24 * may be used to endorse or promote products derived from this software
25 * without specific prior written permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
28 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
31 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
32 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
33 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
34 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
35 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
36 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
37 * SUCH DAMAGE.
38 *
39 * @(#)regcomp.c 8.4 (Berkeley) 3/19/94
40 */
41
42 #if defined(LIBC_SCCS) && !defined(lint)
43 static char sccsid[] = "@(#)regcomp.c 8.4 (Berkeley) 3/19/94";
44 #endif /* LIBC_SCCS and not lint */
45
46 #include <sys/types.h>
47 #include <stdio.h>
48 #include <string.h>
49 #include <ctype.h>
50 #include <limits.h>
51 #include <stdlib.h>
52 #include <regex.h>
53
54 #include "utils.h"
55 #include "regex2.h"
56
57 #include "cclass.h"
58 #include "cname.h"
59
60 /*
61 * parse structure, passed up and down to avoid global variables and
62 * other clumsinesses
63 */
64 struct parse {
65 RCHAR_T *next; /* next character in RE */
66 RCHAR_T *end; /* end of string (-> NUL normally) */
67 int error; /* has an error been seen? */
68 sop *strip; /* malloced strip */
69 RCHAR_T *stripdata; /* malloced stripdata */
70 sopno ssize; /* malloced strip size (allocated) */
71 sopno slen; /* malloced strip length (used) */
72 int ncsalloc; /* number of csets allocated */
73 struct re_guts *g;
74 # define NPAREN 10 /* we need to remember () 1-9 for back refs */
75 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */
76 sopno pend[NPAREN]; /* -> ) ([0] unused) */
77 };
78
79 /* ========= begin header generated by ./mkh ========= */
80 #ifdef __cplusplus
81 extern "C" {
82 #endif
83
84 /* === regcomp.c === */
85 static void p_ere __P((struct parse *p, int stop, size_t reclimit));
86 static void p_ere_exp __P((struct parse *p, size_t reclimit));
87 static void p_str __P((struct parse *p));
88 static void p_bre __P((struct parse *p, int end1, int end2, size_t reclimit));
89 static int p_simp_re __P((struct parse *p, int starordinary, size_t reclimit));
90 static int p_count __P((struct parse *p));
91 static void p_bracket __P((struct parse *p));
92 static void p_b_term __P((struct parse *p, cset *cs));
93 static void p_b_cclass __P((struct parse *p, cset *cs));
94 static void p_b_eclass __P((struct parse *p, cset *cs));
95 static char p_b_symbol __P((struct parse *p));
96 static char p_b_coll_elem __P((struct parse *p, int endc));
97 static char othercase __P((int ch));
98 static void bothcases __P((struct parse *p, int ch));
99 static void ordinary __P((struct parse *p, int ch));
100 static void nonnewline __P((struct parse *p));
101 static void repeat __P((struct parse *p, sopno start, int from, int to, size_t reclimit));
102 static int seterr __P((struct parse *p, int e));
103 static cset *allocset __P((struct parse *p));
104 static void freeset __P((struct parse *p, cset *cs));
105 static int freezeset __P((struct parse *p, cset *cs));
106 static int firstch __P((struct parse *p, cset *cs));
107 static int nch __P((struct parse *p, cset *cs));
108 static void mcadd __P((struct parse *p, cset *cs, const char *cp));
109 #ifdef notdef
110 static void mcsub __P((cset *cs, char *cp));
111 static int mcin __P((cset *cs, char *cp));
112 static char *mcfind __P((cset *cs, char *cp));
113 #endif
114 static void mcinvert __P((struct parse *p, cset *cs));
115 static void mccase __P((struct parse *p, cset *cs));
116 #ifdef notdef
117 static int isinsets __P((struct re_guts *g, int c));
118 static int samesets __P((struct re_guts *g, int c1, int c2));
119 #endif
120 static void categorize __P((struct parse *p, struct re_guts *g));
121 static sopno dupl __P((struct parse *p, sopno start, sopno finish));
122 static void doemit __P((struct parse *p, sop op, size_t opnd));
123 static void doinsert __P((struct parse *p, sop op, size_t opnd, sopno pos));
124 static void dofwd __P((struct parse *p, sopno pos, sop value));
125 static int enlarge __P((struct parse *p, sopno size));
126 static void stripsnug __P((struct parse *p, struct re_guts *g));
127 static void findmust __P((struct parse *p, struct re_guts *g));
128 static sopno pluscount __P((struct parse *p, struct re_guts *g));
129
130 #ifdef __cplusplus
131 }
132 #endif
133 /* ========= end header generated by ./mkh ========= */
134
135 static RCHAR_T nuls[10]; /* place to point scanner in event of error */
136
137 /*
138 * macros for use with parse structure
139 * BEWARE: these know that the parse structure is named `p' !!!
140 */
141 #define PEEK() (*p->next)
142 #define PEEK2() (*(p->next+1))
143 #define MORE() (p->next < p->end)
144 #define MORE2() (p->next+1 < p->end)
145 #define SEE(c) (MORE() && PEEK() == (c))
146 #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
147 #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0)
148 #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
149 #define NEXT() (p->next++)
150 #define NEXT2() (p->next += 2)
151 #define NEXTn(n) (p->next += (n))
152 #define GETNEXT() (*p->next++)
153 #define SETERROR(e) seterr(p, (e))
154 #define REQUIRE(co, e) ((co) || SETERROR(e))
155 #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e))
156 #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e))
157 #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e))
158 #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd))
159 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
160 #define AHEAD(pos) dofwd(p, pos, HERE()-(pos))
161 #define ASTERN(sop, pos) EMIT(sop, HERE()-pos)
162 #define HERE() (p->slen)
163 #define THERE() (p->slen - 1)
164 #define THERETHERE() (p->slen - 2)
165 #define DROP(n) (p->slen -= (n))
166
167 #ifndef NDEBUG
168 static int never = 0; /* for use in asserts; shuts lint up */
169 #else
170 #define never 0 /* some <assert.h>s have bugs too */
171 #endif
172
173 #define MEMLIMIT 0x8000000
174 #define MEMSIZE(p) \
175 ((p)->ncsalloc / CHAR_BIT * (p)->g->csetsize + \
176 (p)->ncsalloc * sizeof(cset) + \
177 (p)->ssize * sizeof(sop))
178 #define RECLIMIT 256
179
180 /*
181 - regcomp - interface for parser and compilation
182 = extern int regcomp(regex_t *, const RCHAR_T *, int);
183 = #define REG_BASIC 0000
184 = #define REG_EXTENDED 0001
185 = #define REG_ICASE 0002
186 = #define REG_NOSUB 0004
187 = #define REG_NEWLINE 0010
188 = #define REG_NOSPEC 0020
189 = #define REG_PEND 0040
190 = #define REG_DUMP 0200
191 */
192 int /* 0 success, otherwise REG_something */
regcomp(regex_t * preg,const RCHAR_T * pattern,int cflags)193 regcomp(regex_t *preg, const RCHAR_T *pattern, int cflags)
194 {
195 struct parse pa;
196 register struct re_guts *g;
197 register struct parse *p = &pa;
198 register int i;
199 register size_t len;
200 #ifdef REDEBUG
201 # define GOODFLAGS(f) (f)
202 #else
203 # define GOODFLAGS(f) ((f)&~REG_DUMP)
204 #endif
205
206 cflags = GOODFLAGS(cflags);
207 if ((cflags®_EXTENDED) && (cflags®_NOSPEC))
208 return(REG_INVARG);
209
210 if (cflags®_PEND) {
211 if (preg->re_endp < pattern)
212 return(REG_INVARG);
213 len = preg->re_endp - pattern;
214 } else
215 len = STRLEN(pattern);
216
217 /* do the mallocs early so failure handling is easy */
218 g = (struct re_guts *)malloc(sizeof(struct re_guts) +
219 (NC-1)*sizeof(cat_t));
220 if (g == NULL)
221 return(REG_ESPACE);
222 p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */
223 p->strip = (sop *)malloc(p->ssize * sizeof(sop));
224 if (p->strip == NULL) {
225 free((char *)g);
226 return(REG_ESPACE);
227 }
228 p->stripdata = (RCHAR_T *)malloc(p->ssize * sizeof(RCHAR_T));
229 if (p->stripdata == NULL) {
230 free((char *)p->strip);
231 free((char *)g);
232 return(REG_ESPACE);
233 }
234 p->slen = 0;
235
236 /* set things up */
237 p->g = g;
238 p->next = (RCHAR_T *)pattern; /* convenience; we do not modify it */
239 p->end = p->next + len;
240 p->error = 0;
241 p->ncsalloc = 0;
242 for (i = 0; i < NPAREN; i++) {
243 p->pbegin[i] = 0;
244 p->pend[i] = 0;
245 }
246 g->csetsize = NC;
247 g->sets = NULL;
248 g->setbits = NULL;
249 g->ncsets = 0;
250 g->cflags = cflags;
251 g->iflags = 0;
252 g->nbol = 0;
253 g->neol = 0;
254 g->must = NULL;
255 g->mlen = 0;
256 g->nsub = 0;
257 #if 0
258 g->ncategories = 1; /* category 0 is "everything else" */
259 g->categories = &g->catspace[-(CHAR_MIN)];
260 (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t));
261 #endif
262 g->backrefs = 0;
263
264 /* do it */
265 EMIT(OEND, 0);
266 g->firststate = THERE();
267 if (cflags®_EXTENDED)
268 p_ere(p, OUT, 0);
269 else if (cflags®_NOSPEC)
270 p_str(p);
271 else
272 p_bre(p, OUT, OUT, 0);
273 EMIT(OEND, 0);
274 g->laststate = THERE();
275
276 /* tidy up loose ends and fill things in */
277 categorize(p, g);
278 stripsnug(p, g);
279 findmust(p, g);
280 g->nplus = pluscount(p, g);
281 g->magic = MAGIC2;
282 preg->re_nsub = g->nsub;
283 preg->re_g = g;
284 preg->re_magic = MAGIC1;
285 #ifndef REDEBUG
286 /* not debugging, so can't rely on the assert() in regexec() */
287 if (g->iflags&BAD)
288 SETERROR(REG_ASSERT);
289 #endif
290
291 /* win or lose, we're done */
292 if (p->error != 0) /* lose */
293 regfree(preg);
294 return(p->error);
295 }
296
297 /*
298 - p_ere - ERE parser top level, concatenation and alternation
299 == static void p_ere(register struct parse *p, int stop, size_t reclimit);
300 */
301 static void
p_ere(register struct parse * p,int stop,size_t reclimit)302 p_ere(register struct parse *p, int stop, size_t reclimit)
303
304 /* character this ERE should end at */
305 {
306 register char c;
307 register sopno prevback = 0;
308 register sopno prevfwd = 0;
309 register sopno conc;
310 register int first = 1; /* is this the first alternative? */
311
312 if (reclimit++ > RECLIMIT || p->error == REG_ESPACE) {
313 p->error = REG_ESPACE;
314 return;
315 }
316
317 for (;;) {
318 /* do a bunch of concatenated expressions */
319 conc = HERE();
320 while (MORE() && (c = PEEK()) != '|' && c != stop)
321 p_ere_exp(p, reclimit);
322 (void)REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */
323
324 if (!EAT('|'))
325 break; /* NOTE BREAK OUT */
326
327 if (first) {
328 INSERT(OCH_, conc); /* offset is wrong */
329 prevfwd = conc;
330 prevback = conc;
331 first = 0;
332 }
333 ASTERN(OOR1, prevback);
334 prevback = THERE();
335 AHEAD(prevfwd); /* fix previous offset */
336 prevfwd = HERE();
337 EMIT(OOR2, 0); /* offset is very wrong */
338 }
339
340 if (!first) { /* tail-end fixups */
341 AHEAD(prevfwd);
342 ASTERN(O_CH, prevback);
343 }
344
345 assert(!MORE() || SEE(stop));
346 }
347
348 /*
349 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
350 == static void p_ere_exp(register struct parse *p);
351 */
352 static void
p_ere_exp(register struct parse * p,size_t reclimit)353 p_ere_exp(register struct parse *p, size_t reclimit)
354 {
355 register char c;
356 register sopno pos;
357 register int count;
358 register int count2;
359 register sopno subno;
360 int wascaret = 0;
361
362 assert(MORE()); /* caller should have ensured this */
363 c = GETNEXT();
364
365 pos = HERE();
366 switch (c) {
367 case '(':
368 (void)REQUIRE(MORE(), REG_EPAREN);
369 p->g->nsub++;
370 subno = p->g->nsub;
371 if (subno < NPAREN)
372 p->pbegin[subno] = HERE();
373 EMIT(OLPAREN, subno);
374 if (!SEE(')'))
375 p_ere(p, ')', reclimit);
376 if (subno < NPAREN) {
377 p->pend[subno] = HERE();
378 assert(p->pend[subno] != 0);
379 }
380 EMIT(ORPAREN, subno);
381 (void)MUSTEAT(')', REG_EPAREN);
382 break;
383 #ifndef POSIX_MISTAKE
384 case ')': /* happens only if no current unmatched ( */
385 /*
386 * You may ask, why the ifndef? Because I didn't notice
387 * this until slightly too late for 1003.2, and none of the
388 * other 1003.2 regular-expression reviewers noticed it at
389 * all. So an unmatched ) is legal POSIX, at least until
390 * we can get it fixed.
391 */
392 SETERROR(REG_EPAREN);
393 break;
394 #endif
395 case '^':
396 EMIT(OBOL, 0);
397 p->g->iflags |= USEBOL;
398 p->g->nbol++;
399 wascaret = 1;
400 break;
401 case '$':
402 EMIT(OEOL, 0);
403 p->g->iflags |= USEEOL;
404 p->g->neol++;
405 break;
406 case '|':
407 SETERROR(REG_EMPTY);
408 break;
409 case '*':
410 case '+':
411 case '?':
412 SETERROR(REG_BADRPT);
413 break;
414 case '.':
415 if (p->g->cflags®_NEWLINE)
416 nonnewline(p);
417 else
418 EMIT(OANY, 0);
419 break;
420 case '[':
421 p_bracket(p);
422 break;
423 case '\\':
424 (void)REQUIRE(MORE(), REG_EESCAPE);
425 c = GETNEXT();
426 ordinary(p, c);
427 break;
428 case '{': /* okay as ordinary except if digit follows */
429 (void)REQUIRE(!MORE() || !ISDIGIT((UCHAR_T)PEEK()), REG_BADRPT);
430 /* FALLTHROUGH */
431 default:
432 ordinary(p, c);
433 break;
434 }
435
436 if (!MORE())
437 return;
438 c = PEEK();
439 /* we call { a repetition if followed by a digit */
440 if (!( c == '*' || c == '+' || c == '?' ||
441 (c == '{' && MORE2() && ISDIGIT((UCHAR_T)PEEK2())) ))
442 return; /* no repetition, we're done */
443 NEXT();
444
445 (void)REQUIRE(!wascaret, REG_BADRPT);
446 switch (c) {
447 case '*': /* implemented as +? */
448 /* this case does not require the (y|) trick, noKLUDGE */
449 INSERT(OPLUS_, pos);
450 ASTERN(O_PLUS, pos);
451 INSERT(OQUEST_, pos);
452 ASTERN(O_QUEST, pos);
453 break;
454 case '+':
455 INSERT(OPLUS_, pos);
456 ASTERN(O_PLUS, pos);
457 break;
458 case '?':
459 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
460 INSERT(OCH_, pos); /* offset slightly wrong */
461 ASTERN(OOR1, pos); /* this one's right */
462 AHEAD(pos); /* fix the OCH_ */
463 EMIT(OOR2, 0); /* offset very wrong... */
464 AHEAD(THERE()); /* ...so fix it */
465 ASTERN(O_CH, THERETHERE());
466 break;
467 case '{':
468 count = p_count(p);
469 if (EAT(',')) {
470 if (ISDIGIT((UCHAR_T)PEEK())) {
471 count2 = p_count(p);
472 (void)REQUIRE(count <= count2, REG_BADBR);
473 } else /* single number with comma */
474 count2 = INFINITY;
475 } else /* just a single number */
476 count2 = count;
477 repeat(p, pos, count, count2, 0);
478 if (!EAT('}')) { /* error heuristics */
479 while (MORE() && PEEK() != '}')
480 NEXT();
481 (void)REQUIRE(MORE(), REG_EBRACE);
482 SETERROR(REG_BADBR);
483 }
484 break;
485 }
486
487 if (!MORE())
488 return;
489 c = PEEK();
490 if (!( c == '*' || c == '+' || c == '?' ||
491 (c == '{' && MORE2() && ISDIGIT((UCHAR_T)PEEK2())) ) )
492 return;
493 SETERROR(REG_BADRPT);
494 }
495
496 /*
497 - p_str - string (no metacharacters) "parser"
498 == static void p_str(register struct parse *p);
499 */
500 static void
p_str(register struct parse * p)501 p_str(register struct parse *p)
502 {
503 (void)REQUIRE(MORE(), REG_EMPTY);
504 while (MORE())
505 ordinary(p, GETNEXT());
506 }
507
508 /*
509 - p_bre - BRE parser top level, anchoring and concatenation
510 == static void p_bre(register struct parse *p, register int end1, \
511 == register int end2, size_t reclimit);
512 * Giving end1 as OUT essentially eliminates the end1/end2 check.
513 *
514 * This implementation is a bit of a kludge, in that a trailing $ is first
515 * taken as an ordinary character and then revised to be an anchor. The
516 * only undesirable side effect is that '$' gets included as a character
517 * category in such cases. This is fairly harmless; not worth fixing.
518 * The amount of lookahead needed to avoid this kludge is excessive.
519 */
520 static void
p_bre(register struct parse * p,register int end1,register int end2,size_t reclimit)521 p_bre(register struct parse *p, register int end1, register int end2, size_t reclimit)
522
523 /* first terminating character */
524 /* second terminating character */
525 {
526 register sopno start;
527 register int first = 1; /* first subexpression? */
528 register int wasdollar = 0;
529
530 if (reclimit++ > RECLIMIT || p->error == REG_ESPACE) {
531 p->error = REG_ESPACE;
532 return;
533 }
534
535 start = HERE();
536
537 if (EAT('^')) {
538 EMIT(OBOL, 0);
539 p->g->iflags |= USEBOL;
540 p->g->nbol++;
541 }
542 while (MORE() && !SEETWO(end1, end2)) {
543 wasdollar = p_simp_re(p, first, reclimit);
544 first = 0;
545 }
546 if (wasdollar) { /* oops, that was a trailing anchor */
547 DROP(1);
548 EMIT(OEOL, 0);
549 p->g->iflags |= USEEOL;
550 p->g->neol++;
551 }
552
553 (void)REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */
554 }
555
556 /*
557 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
558 == static int p_simp_re(register struct parse *p, int starordinary, size_t reclimit);
559 */
560 static int /* was the simple RE an unbackslashed $? */
p_simp_re(register struct parse * p,int starordinary,size_t reclimit)561 p_simp_re(register struct parse *p, int starordinary, size_t reclimit)
562
563 /* is a leading * an ordinary character? */
564 {
565 register int c;
566 register int count;
567 register int count2;
568 register sopno pos;
569 register int i;
570 register sopno subno;
571 int backsl;
572
573 pos = HERE(); /* repetion op, if any, covers from here */
574
575 assert(MORE()); /* caller should have ensured this */
576 c = GETNEXT();
577 backsl = c == '\\';
578 if (backsl) {
579 (void)REQUIRE(MORE(), REG_EESCAPE);
580 c = (unsigned char)GETNEXT();
581 switch (c) {
582 case '{':
583 SETERROR(REG_BADRPT);
584 break;
585 case '(':
586 p->g->nsub++;
587 subno = p->g->nsub;
588 if (subno < NPAREN)
589 p->pbegin[subno] = HERE();
590 EMIT(OLPAREN, subno);
591 /* the MORE here is an error heuristic */
592 if (MORE() && !SEETWO('\\', ')'))
593 p_bre(p, '\\', ')', reclimit);
594 if (subno < NPAREN) {
595 p->pend[subno] = HERE();
596 assert(p->pend[subno] != 0);
597 }
598 EMIT(ORPAREN, subno);
599 (void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
600 break;
601 case ')': /* should not get here -- must be user */
602 case '}':
603 SETERROR(REG_EPAREN);
604 break;
605 case '1':
606 case '2':
607 case '3':
608 case '4':
609 case '5':
610 case '6':
611 case '7':
612 case '8':
613 case '9':
614 i = c - '0';
615 assert(i < NPAREN);
616 if (p->pend[i] != 0) {
617 assert(i <= p->g->nsub);
618 EMIT(OBACK_, i);
619 assert(p->pbegin[i] != 0);
620 assert(p->strip[p->pbegin[i]] == OLPAREN);
621 assert(p->strip[p->pend[i]] == ORPAREN);
622 (void) dupl(p, p->pbegin[i]+1, p->pend[i]);
623 EMIT(O_BACK, i);
624 } else
625 SETERROR(REG_ESUBREG);
626 p->g->backrefs = 1;
627 break;
628 default:
629 ordinary(p, c);
630 break;
631 }
632 } else {
633 switch (c) {
634 case '.':
635 if (p->g->cflags®_NEWLINE)
636 nonnewline(p);
637 else
638 EMIT(OANY, 0);
639 break;
640 case '[':
641 p_bracket(p);
642 break;
643 case '*':
644 (void)REQUIRE(starordinary, REG_BADRPT);
645 /* FALLTHROUGH */
646 default:
647 ordinary(p, c);
648 break;
649 }
650 }
651
652 if (EAT('*')) { /* implemented as +? */
653 /* this case does not require the (y|) trick, noKLUDGE */
654 INSERT(OPLUS_, pos);
655 ASTERN(O_PLUS, pos);
656 INSERT(OQUEST_, pos);
657 ASTERN(O_QUEST, pos);
658 } else if (EATTWO('\\', '{')) {
659 count = p_count(p);
660 if (EAT(',')) {
661 if (MORE() && ISDIGIT((UCHAR_T)PEEK())) {
662 count2 = p_count(p);
663 (void)REQUIRE(count <= count2, REG_BADBR);
664 } else /* single number with comma */
665 count2 = INFINITY;
666 } else /* just a single number */
667 count2 = count;
668 repeat(p, pos, count, count2, reclimit);
669 if (!EATTWO('\\', '}')) { /* error heuristics */
670 while (MORE() && !SEETWO('\\', '}'))
671 NEXT();
672 (void)REQUIRE(MORE(), REG_EBRACE);
673 SETERROR(REG_BADBR);
674 }
675 } else if (!backsl && c == (unsigned char)'$') /* $ (but not \$) ends it */
676 return(1);
677
678 return(0);
679 }
680
681 /*
682 - p_count - parse a repetition count
683 == static int p_count(register struct parse *p);
684 */
685 static int /* the value */
p_count(register struct parse * p)686 p_count(register struct parse *p)
687 {
688 register int count = 0;
689 register int ndigits = 0;
690
691 while (MORE() && ISDIGIT((UCHAR_T)PEEK()) && count <= DUPMAX) {
692 count = count*10 + (GETNEXT() - '0');
693 ndigits++;
694 }
695
696 (void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
697 return(count);
698 }
699
700 /*
701 - p_bracket - parse a bracketed character list
702 == static void p_bracket(register struct parse *p);
703 *
704 * Note a significant property of this code: if the allocset() did SETERROR,
705 * no set operations are done.
706 */
707 static void
p_bracket(register struct parse * p)708 p_bracket(register struct parse *p)
709 {
710 register cset *cs;
711 register int invert = 0;
712 static RCHAR_T bow[] = { '[', ':', '<', ':', ']', ']' };
713 static RCHAR_T eow[] = { '[', ':', '>', ':', ']', ']' };
714
715 cs = allocset(p);
716 if (cs == NULL)
717 return;
718
719 /* Dept of Truly Sickening Special-Case Kludges */
720 if (p->next + 5 < p->end && MEMCMP(p->next, bow, 6) == 0) {
721 EMIT(OBOW, 0);
722 NEXTn(6);
723 return;
724 }
725 if (p->next + 5 < p->end && MEMCMP(p->next, eow, 6) == 0) {
726 EMIT(OEOW, 0);
727 NEXTn(6);
728 return;
729 }
730
731 if (EAT('^'))
732 invert++; /* make note to invert set at end */
733 if (EAT(']'))
734 CHadd(cs, ']');
735 else if (EAT('-'))
736 CHadd(cs, '-');
737 while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
738 p_b_term(p, cs);
739 if (EAT('-'))
740 CHadd(cs, '-');
741 (void)MUSTEAT(']', REG_EBRACK);
742
743 if (p->error != 0) /* don't mess things up further */
744 return;
745
746 if (p->g->cflags®_ICASE) {
747 register int i;
748 register int ci;
749
750 for (i = p->g->csetsize - 1; i >= 0; i--)
751 if (CHIN(cs, i) && isalpha(i)) {
752 ci = othercase(i);
753 if (ci != i)
754 CHadd(cs, ci);
755 }
756 if (cs->multis != NULL)
757 mccase(p, cs);
758 }
759 if (invert) {
760 register int i;
761
762 for (i = p->g->csetsize - 1; i >= 0; i--)
763 if (CHIN(cs, i))
764 CHsub(cs, i);
765 else
766 CHadd(cs, i);
767 if (p->g->cflags®_NEWLINE)
768 CHsub(cs, '\n');
769 if (cs->multis != NULL)
770 mcinvert(p, cs);
771 }
772
773 assert(cs->multis == NULL); /* xxx */
774
775 if (nch(p, cs) == 1) { /* optimize singleton sets */
776 ordinary(p, firstch(p, cs));
777 freeset(p, cs);
778 } else
779 EMIT(OANYOF, freezeset(p, cs));
780 }
781
782 /*
783 - p_b_term - parse one term of a bracketed character list
784 == static void p_b_term(register struct parse *p, register cset *cs);
785 */
786 static void
p_b_term(register struct parse * p,register cset * cs)787 p_b_term(register struct parse *p, register cset *cs)
788 {
789 register char c;
790 register char start, finish;
791 register int i;
792
793 /* classify what we've got */
794 switch ((MORE()) ? PEEK() : '\0') {
795 case '[':
796 c = (MORE2()) ? PEEK2() : '\0';
797 break;
798 case '-':
799 SETERROR(REG_ERANGE);
800 return; /* NOTE RETURN */
801 break;
802 default:
803 c = '\0';
804 break;
805 }
806
807 switch (c) {
808 case ':': /* character class */
809 NEXT2();
810 (void)REQUIRE(MORE(), REG_EBRACK);
811 c = PEEK();
812 (void)REQUIRE(c != '-' && c != ']', REG_ECTYPE);
813 p_b_cclass(p, cs);
814 (void)REQUIRE(MORE(), REG_EBRACK);
815 (void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
816 break;
817 case '=': /* equivalence class */
818 NEXT2();
819 (void)REQUIRE(MORE(), REG_EBRACK);
820 c = PEEK();
821 (void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
822 p_b_eclass(p, cs);
823 (void)REQUIRE(MORE(), REG_EBRACK);
824 (void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
825 break;
826 default: /* symbol, ordinary character, or range */
827 /* xxx revision needed for multichar stuff */
828 start = p_b_symbol(p);
829 if (SEE('-') && MORE2() && PEEK2() != ']') {
830 /* range */
831 NEXT();
832 if (EAT('-'))
833 finish = '-';
834 else
835 finish = p_b_symbol(p);
836 } else
837 finish = start;
838 /* xxx what about signed chars here... */
839 (void)REQUIRE(start <= finish, REG_ERANGE);
840 for (i = start; i <= finish; i++)
841 CHadd(cs, i);
842 break;
843 }
844 }
845
846 /*
847 - p_b_cclass - parse a character-class name and deal with it
848 == static void p_b_cclass(register struct parse *p, register cset *cs);
849 */
850 static void
p_b_cclass(register struct parse * p,register cset * cs)851 p_b_cclass(register struct parse *p, register cset *cs)
852 {
853 register RCHAR_T *sp = p->next;
854 register struct cclass *cp;
855 register size_t len;
856 register const char *u;
857 register char c;
858
859 while (MORE() && isalpha(PEEK()))
860 NEXT();
861 len = p->next - sp;
862 for (cp = cclasses; cp->name != NULL; cp++)
863 if (STRLEN(cp->name) == len && !MEMCMP(cp->name, sp, len))
864 break;
865 if (cp->name == NULL) {
866 /* oops, didn't find it */
867 SETERROR(REG_ECTYPE);
868 return;
869 }
870
871 u = cp->chars;
872 while ((c = *u++) != '\0')
873 CHadd(cs, c);
874 for (u = cp->multis; *u != '\0'; u += strlen(u) + 1)
875 MCadd(p, cs, u);
876 }
877
878 /*
879 - p_b_eclass - parse an equivalence-class name and deal with it
880 == static void p_b_eclass(register struct parse *p, register cset *cs);
881 *
882 * This implementation is incomplete. xxx
883 */
884 static void
p_b_eclass(register struct parse * p,register cset * cs)885 p_b_eclass(register struct parse *p, register cset *cs)
886 {
887 register char c;
888
889 c = p_b_coll_elem(p, '=');
890 CHadd(cs, c);
891 }
892
893 /*
894 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
895 == static char p_b_symbol(register struct parse *p);
896 */
897 static char /* value of symbol */
p_b_symbol(register struct parse * p)898 p_b_symbol(register struct parse *p)
899 {
900 register char value;
901
902 (void)REQUIRE(MORE(), REG_EBRACK);
903 if (!EATTWO('[', '.'))
904 return(GETNEXT());
905
906 /* collating symbol */
907 value = p_b_coll_elem(p, '.');
908 (void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
909 return(value);
910 }
911
912 /*
913 - p_b_coll_elem - parse a collating-element name and look it up
914 == static char p_b_coll_elem(register struct parse *p, int endc);
915 */
916 static char /* value of collating element */
p_b_coll_elem(register struct parse * p,int endc)917 p_b_coll_elem(register struct parse *p, int endc)
918
919 /* name ended by endc,']' */
920 {
921 register RCHAR_T *sp = p->next;
922 register struct cname *cp;
923 register size_t len;
924
925 while (MORE() && !SEETWO(endc, ']'))
926 NEXT();
927 if (!MORE()) {
928 SETERROR(REG_EBRACK);
929 return(0);
930 }
931 len = p->next - sp;
932 for (cp = cnames; cp->name != NULL; cp++)
933 if (STRLEN(cp->name) == len && MEMCMP(cp->name, sp, len))
934 return(cp->code); /* known name */
935 if (len == 1)
936 return(*sp); /* single character */
937 SETERROR(REG_ECOLLATE); /* neither */
938 return(0);
939 }
940
941 /*
942 - othercase - return the case counterpart of an alphabetic
943 == static char othercase(int ch);
944 */
945 static char /* if no counterpart, return ch */
othercase(int ch)946 othercase(int ch)
947 {
948 assert(isalpha(ch));
949 if (isupper(ch))
950 return(tolower(ch));
951 else if (islower(ch))
952 return(toupper(ch));
953 else /* peculiar, but could happen */
954 return(ch);
955 }
956
957 /*
958 - bothcases - emit a dualcase version of a two-case character
959 == static void bothcases(register struct parse *p, int ch);
960 *
961 * Boy, is this implementation ever a kludge...
962 */
963 static void
bothcases(register struct parse * p,int ch)964 bothcases(register struct parse *p, int ch)
965 {
966 register RCHAR_T *oldnext = p->next;
967 register RCHAR_T *oldend = p->end;
968 RCHAR_T bracket[3];
969
970 assert(othercase(ch) != ch); /* p_bracket() would recurse */
971 p->next = bracket;
972 p->end = bracket+2;
973 bracket[0] = ch;
974 bracket[1] = ']';
975 bracket[2] = '\0';
976 p_bracket(p);
977 assert(p->next == bracket+2);
978 p->next = oldnext;
979 p->end = oldend;
980 }
981
982 /*
983 - ordinary - emit an ordinary character
984 == static void ordinary(register struct parse *p, register int ch);
985 */
986 static void
ordinary(register struct parse * p,register int ch)987 ordinary(register struct parse *p, register int ch)
988 {
989 /*
990 register cat_t *cap = p->g->categories;
991 */
992
993 if ((p->g->cflags®_ICASE) && isalpha(ch) && othercase(ch) != ch)
994 bothcases(p, ch);
995 else {
996 EMIT(OCHAR, (UCHAR_T)ch);
997 /*
998 if (cap[ch] == 0)
999 cap[ch] = p->g->ncategories++;
1000 */
1001 }
1002 }
1003
1004 /*
1005 - nonnewline - emit REG_NEWLINE version of OANY
1006 == static void nonnewline(register struct parse *p);
1007 *
1008 * Boy, is this implementation ever a kludge...
1009 */
1010 static void
nonnewline(register struct parse * p)1011 nonnewline(register struct parse *p)
1012 {
1013 register RCHAR_T *oldnext = p->next;
1014 register RCHAR_T *oldend = p->end;
1015 RCHAR_T bracket[4];
1016
1017 p->next = bracket;
1018 p->end = bracket+3;
1019 bracket[0] = '^';
1020 bracket[1] = '\n';
1021 bracket[2] = ']';
1022 bracket[3] = '\0';
1023 p_bracket(p);
1024 assert(p->next == bracket+3);
1025 p->next = oldnext;
1026 p->end = oldend;
1027 }
1028
1029 /*
1030 - repeat - generate code for a bounded repetition, recursively if needed
1031 == static void repeat(register struct parse *p, sopno start, int from, int to, size_t reclimit);
1032 */
1033 static void
repeat(register struct parse * p,sopno start,int from,int to,size_t reclimit)1034 repeat(register struct parse *p, sopno start, int from, int to, size_t reclimit)
1035
1036 /* operand from here to end of strip */
1037 /* repeated from this number */
1038 /* to this number of times (maybe INFINITY) */
1039 {
1040 register sopno finish;
1041 # define N 2
1042 # define INF 3
1043 # define REP(f, t) ((f)*8 + (t))
1044 # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
1045 register sopno copy;
1046
1047 if (reclimit++ > RECLIMIT)
1048 p->error = REG_ESPACE;
1049 if (p->error)
1050 return;
1051
1052 finish = HERE();
1053
1054 assert(from <= to);
1055
1056 switch (REP(MAP(from), MAP(to))) {
1057 case REP(0, 0): /* must be user doing this */
1058 DROP(finish-start); /* drop the operand */
1059 break;
1060 case REP(0, 1): /* as x{1,1}? */
1061 case REP(0, N): /* as x{1,n}? */
1062 case REP(0, INF): /* as x{1,}? */
1063 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1064 INSERT(OCH_, start); /* offset is wrong... */
1065 repeat(p, start+1, 1, to, reclimit);
1066 ASTERN(OOR1, start);
1067 AHEAD(start); /* ... fix it */
1068 EMIT(OOR2, 0);
1069 AHEAD(THERE());
1070 ASTERN(O_CH, THERETHERE());
1071 break;
1072 case REP(1, 1): /* trivial case */
1073 /* done */
1074 break;
1075 case REP(1, N): /* as x?x{1,n-1} */
1076 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1077 INSERT(OCH_, start);
1078 ASTERN(OOR1, start);
1079 AHEAD(start);
1080 EMIT(OOR2, 0); /* offset very wrong... */
1081 AHEAD(THERE()); /* ...so fix it */
1082 ASTERN(O_CH, THERETHERE());
1083 copy = dupl(p, start+1, finish+1);
1084 assert(copy == finish+4);
1085 repeat(p, copy, 1, to-1, reclimit);
1086 break;
1087 case REP(1, INF): /* as x+ */
1088 INSERT(OPLUS_, start);
1089 ASTERN(O_PLUS, start);
1090 break;
1091 case REP(N, N): /* as xx{m-1,n-1} */
1092 copy = dupl(p, start, finish);
1093 repeat(p, copy, from-1, to-1, reclimit);
1094 break;
1095 case REP(N, INF): /* as xx{n-1,INF} */
1096 copy = dupl(p, start, finish);
1097 repeat(p, copy, from-1, to, reclimit);
1098 break;
1099 default: /* "can't happen" */
1100 SETERROR(REG_ASSERT); /* just in case */
1101 break;
1102 }
1103 }
1104
1105 /*
1106 - seterr - set an error condition
1107 == static int seterr(register struct parse *p, int e);
1108 */
1109 static int /* useless but makes type checking happy */
seterr(register struct parse * p,int e)1110 seterr(register struct parse *p, int e)
1111 {
1112 if (p->error == 0) /* keep earliest error condition */
1113 p->error = e;
1114 p->next = nuls; /* try to bring things to a halt */
1115 p->end = nuls;
1116 return(0); /* make the return value well-defined */
1117 }
1118
1119 /*
1120 - allocset - allocate a set of characters for []
1121 == static cset *allocset(register struct parse *p);
1122 */
1123 static cset *
allocset(register struct parse * p)1124 allocset(register struct parse *p)
1125 {
1126 register int no = p->g->ncsets++;
1127 register size_t nc;
1128 register size_t nbytes;
1129 register cset *cs;
1130 register size_t css = (size_t)p->g->csetsize;
1131 register int i;
1132
1133 if (no >= p->ncsalloc) { /* need another column of space */
1134 p->ncsalloc += CHAR_BIT;
1135 nc = p->ncsalloc;
1136 assert(nc % CHAR_BIT == 0);
1137 nbytes = nc / CHAR_BIT * css;
1138 if (MEMSIZE(p) > MEMLIMIT)
1139 goto oomem;
1140 if (p->g->sets == NULL)
1141 p->g->sets = (cset *)malloc(nc * sizeof(cset));
1142 else
1143 p->g->sets = (cset *)realloc((char *)p->g->sets,
1144 nc * sizeof(cset));
1145 if (p->g->setbits == NULL)
1146 p->g->setbits = (uch *)malloc(nbytes);
1147 else {
1148 p->g->setbits = (uch *)realloc((char *)p->g->setbits,
1149 nbytes);
1150 /* xxx this isn't right if setbits is now NULL */
1151 for (i = 0; i < no; i++)
1152 p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT);
1153 }
1154 if (p->g->sets != NULL && p->g->setbits != NULL)
1155 (void) memset((char *)p->g->setbits + (nbytes - css),
1156 0, css);
1157 else {
1158 oomem:
1159 no = 0;
1160 SETERROR(REG_ESPACE);
1161 /* caller's responsibility not to do set ops */
1162 return NULL;
1163 }
1164 }
1165
1166 cs = &p->g->sets[no];
1167 cs->ptr = p->g->setbits + css*((no)/CHAR_BIT);
1168 cs->mask = 1 << ((no) % CHAR_BIT);
1169 cs->hash = 0;
1170 cs->smultis = 0;
1171 cs->multis = NULL;
1172
1173 return(cs);
1174 }
1175
1176 /*
1177 - freeset - free a now-unused set
1178 == static void freeset(register struct parse *p, register cset *cs);
1179 */
1180 static void
freeset(register struct parse * p,register cset * cs)1181 freeset(register struct parse *p, register cset *cs)
1182 {
1183 register size_t i;
1184 register cset *top = &p->g->sets[p->g->ncsets];
1185 register size_t css = (size_t)p->g->csetsize;
1186
1187 for (i = 0; i < css; i++)
1188 CHsub(cs, i);
1189 if (cs == top-1) /* recover only the easy case */
1190 p->g->ncsets--;
1191 }
1192
1193 /*
1194 - freezeset - final processing on a set of characters
1195 == static int freezeset(register struct parse *p, register cset *cs);
1196 *
1197 * The main task here is merging identical sets. This is usually a waste
1198 * of time (although the hash code minimizes the overhead), but can win
1199 * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash
1200 * is done using addition rather than xor -- all ASCII [aA] sets xor to
1201 * the same value!
1202 */
1203 static int /* set number */
freezeset(register struct parse * p,register cset * cs)1204 freezeset(register struct parse *p, register cset *cs)
1205 {
1206 register uch h = cs->hash;
1207 register size_t i;
1208 register cset *top = &p->g->sets[p->g->ncsets];
1209 register cset *cs2;
1210 register size_t css = (size_t)p->g->csetsize;
1211
1212 /* look for an earlier one which is the same */
1213 for (cs2 = &p->g->sets[0]; cs2 < top; cs2++)
1214 if (cs2->hash == h && cs2 != cs) {
1215 /* maybe */
1216 for (i = 0; i < css; i++)
1217 if (!!CHIN(cs2, i) != !!CHIN(cs, i))
1218 break; /* no */
1219 if (i == css)
1220 break; /* yes */
1221 }
1222
1223 if (cs2 < top) { /* found one */
1224 freeset(p, cs);
1225 cs = cs2;
1226 }
1227
1228 return((int)(cs - p->g->sets));
1229 }
1230
1231 /*
1232 - firstch - return first character in a set (which must have at least one)
1233 == static int firstch(register struct parse *p, register cset *cs);
1234 */
1235 static int /* character; there is no "none" value */
firstch(register struct parse * p,register cset * cs)1236 firstch(register struct parse *p, register cset *cs)
1237 {
1238 register size_t i;
1239 register size_t css = (size_t)p->g->csetsize;
1240
1241 for (i = 0; i < css; i++)
1242 if (CHIN(cs, i))
1243 return((char)i);
1244 assert(never);
1245 return(0); /* arbitrary */
1246 }
1247
1248 /*
1249 - nch - number of characters in a set
1250 == static int nch(register struct parse *p, register cset *cs);
1251 */
1252 static int
nch(register struct parse * p,register cset * cs)1253 nch(register struct parse *p, register cset *cs)
1254 {
1255 register size_t i;
1256 register size_t css = (size_t)p->g->csetsize;
1257 register int n = 0;
1258
1259 for (i = 0; i < css; i++)
1260 if (CHIN(cs, i))
1261 n++;
1262 return(n);
1263 }
1264
1265 /*
1266 - mcadd - add a collating element to a cset
1267 == static void mcadd(register struct parse *p, register cset *cs, \
1268 == register char *cp);
1269 */
1270 static void
mcadd(register struct parse * p,register cset * cs,register const char * cp)1271 mcadd(register struct parse *p, register cset *cs, register const char *cp)
1272 {
1273 register size_t oldend = cs->smultis;
1274 void *np;
1275
1276 cs->smultis += strlen(cp) + 1;
1277 np = realloc(cs->multis, cs->smultis);
1278 if (np == NULL) {
1279 if (cs->multis)
1280 free(cs->multis);
1281 cs->multis = NULL;
1282 SETERROR(REG_ESPACE);
1283 return;
1284 }
1285 cs->multis = np;
1286
1287 (void) strlcpy(cs->multis + oldend - 1, cp, cs->smultis - oldend + 1);
1288 }
1289
1290 #ifdef notdef
1291 /*
1292 - mcsub - subtract a collating element from a cset
1293 == static void mcsub(register cset *cs, register char *cp);
1294 */
1295 static void
mcsub(register cset * cs,register char * cp)1296 mcsub(register cset *cs, register char *cp)
1297 {
1298 register char *fp = mcfind(cs, cp);
1299 register size_t len = strlen(fp);
1300
1301 assert(fp != NULL);
1302 (void) memmove(fp, fp + len + 1,
1303 cs->smultis - (fp + len + 1 - cs->multis));
1304 cs->smultis -= len;
1305
1306 if (cs->smultis == 0) {
1307 free(cs->multis);
1308 cs->multis = NULL;
1309 return;
1310 }
1311
1312 cs->multis = realloc(cs->multis, cs->smultis);
1313 assert(cs->multis != NULL);
1314 }
1315
1316 /*
1317 - mcin - is a collating element in a cset?
1318 == static int mcin(register cset *cs, register char *cp);
1319 */
1320 static int
mcin(register cset * cs,register char * cp)1321 mcin(register cset *cs, register char *cp)
1322 {
1323 return(mcfind(cs, cp) != NULL);
1324 }
1325
1326 /*
1327 - mcfind - find a collating element in a cset
1328 == static char *mcfind(register cset *cs, register char *cp);
1329 */
1330 static char *
mcfind(register cset * cs,register char * cp)1331 mcfind(register cset *cs, register char *cp)
1332 {
1333 register char *p;
1334
1335 if (cs->multis == NULL)
1336 return(NULL);
1337 for (p = cs->multis; *p != '\0'; p += strlen(p) + 1)
1338 if (strcmp(cp, p) == 0)
1339 return(p);
1340 return(NULL);
1341 }
1342 #endif
1343
1344 /*
1345 - mcinvert - invert the list of collating elements in a cset
1346 == static void mcinvert(register struct parse *p, register cset *cs);
1347 *
1348 * This would have to know the set of possibilities. Implementation
1349 * is deferred.
1350 */
1351 static void
mcinvert(register struct parse * p,register cset * cs)1352 mcinvert(register struct parse *p, register cset *cs)
1353 {
1354 assert(cs->multis == NULL); /* xxx */
1355 }
1356
1357 /*
1358 - mccase - add case counterparts of the list of collating elements in a cset
1359 == static void mccase(register struct parse *p, register cset *cs);
1360 *
1361 * This would have to know the set of possibilities. Implementation
1362 * is deferred.
1363 */
1364 static void
mccase(register struct parse * p,register cset * cs)1365 mccase(register struct parse *p, register cset *cs)
1366 {
1367 assert(cs->multis == NULL); /* xxx */
1368 }
1369
1370 #ifdef notdef
1371 /*
1372 - isinsets - is this character in any sets?
1373 == static int isinsets(register struct re_guts *g, int c);
1374 */
1375 static int /* predicate */
isinsets(register struct re_guts * g,int c)1376 isinsets(register struct re_guts *g, int c)
1377 {
1378 register uch *col;
1379 register int i;
1380 register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1381 register unsigned uc = (unsigned char)c;
1382
1383 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1384 if (col[uc] != 0)
1385 return(1);
1386 return(0);
1387 }
1388
1389 /*
1390 - samesets - are these two characters in exactly the same sets?
1391 == static int samesets(register struct re_guts *g, int c1, int c2);
1392 */
1393 static int /* predicate */
samesets(register struct re_guts * g,int c1,int c2)1394 samesets(register struct re_guts *g, int c1, int c2)
1395 {
1396 register uch *col;
1397 register int i;
1398 register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1399 register unsigned uc1 = (unsigned char)c1;
1400 register unsigned uc2 = (unsigned char)c2;
1401
1402 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1403 if (col[uc1] != col[uc2])
1404 return(0);
1405 return(1);
1406 }
1407 #endif
1408
1409 /*
1410 - categorize - sort out character categories
1411 == static void categorize(struct parse *p, register struct re_guts *g);
1412 */
1413 static void
categorize(struct parse * p,register struct re_guts * g)1414 categorize(struct parse *p, register struct re_guts *g)
1415 {
1416 #ifdef notdef
1417 register cat_t *cats = g->categories;
1418 register int c;
1419 register int c2;
1420 register cat_t cat;
1421
1422 /* avoid making error situations worse */
1423 if (p->error != 0)
1424 return;
1425
1426 for (c = CHAR_MIN; c <= CHAR_MAX; c++)
1427 if (cats[c] == 0 && isinsets(g, c)) {
1428 cat = g->ncategories++;
1429 cats[c] = cat;
1430 for (c2 = c+1; c2 <= CHAR_MAX; c2++)
1431 if (cats[c2] == 0 && samesets(g, c, c2))
1432 cats[c2] = cat;
1433 }
1434 #endif
1435 }
1436
1437 /*
1438 - dupl - emit a duplicate of a bunch of sops
1439 == static sopno dupl(register struct parse *p, sopno start, sopno finish);
1440 */
1441 static sopno /* start of duplicate */
dupl(register struct parse * p,sopno start,sopno finish)1442 dupl(register struct parse *p, sopno start, sopno finish)
1443
1444 /* from here */
1445 /* to this less one */
1446 {
1447 register sopno ret = HERE();
1448 register sopno len = finish - start;
1449
1450 assert(finish >= start);
1451 if (len == 0)
1452 return(ret);
1453 if (!enlarge(p, p->ssize + len)) /* this many unexpected additions */
1454 return ret;
1455 assert(p->ssize >= p->slen + len);
1456 (void) memcpy((char *)(p->strip + p->slen),
1457 (char *)(p->strip + start), (size_t)len*sizeof(sop));
1458 (void) memcpy((char *)(p->stripdata + p->slen),
1459 (char *)(p->stripdata + start), (size_t)len*sizeof(RCHAR_T));
1460 p->slen += len;
1461 return(ret);
1462 }
1463
1464 /*
1465 - doemit - emit a strip operator
1466 == static void doemit(register struct parse *p, sop op, size_t opnd);
1467 *
1468 * It might seem better to implement this as a macro with a function as
1469 * hard-case backup, but it's just too big and messy unless there are
1470 * some changes to the data structures. Maybe later.
1471 */
1472 static void
doemit(register struct parse * p,sop op,size_t opnd)1473 doemit(register struct parse *p, sop op, size_t opnd)
1474 {
1475 /* avoid making error situations worse */
1476 if (p->error != 0)
1477 return;
1478
1479 /* deal with oversize operands ("can't happen", more or less) */
1480 assert(opnd < 1);
1481
1482 /* deal with undersized strip */
1483 if (p->slen >= p->ssize)
1484 if (!enlarge(p, (p->ssize+1) / 2 * 3)) /* +50% */
1485 return;
1486
1487 /* finally, it's all reduced to the easy case */
1488 p->strip[p->slen] = op;
1489 p->stripdata[p->slen] = opnd;
1490 p->slen++;
1491 }
1492
1493 /*
1494 - doinsert - insert a sop into the strip
1495 == static void doinsert(register struct parse *p, sop op, size_t opnd, sopno pos);
1496 */
1497 static void
doinsert(register struct parse * p,sop op,size_t opnd,sopno pos)1498 doinsert(register struct parse *p, sop op, size_t opnd, sopno pos)
1499 {
1500 register sopno sn;
1501 register sop s;
1502 register RCHAR_T d;
1503 register int i;
1504
1505 /* avoid making error situations worse */
1506 if (p->error != 0)
1507 return;
1508
1509 sn = HERE();
1510 EMIT(op, opnd); /* do checks, ensure space */
1511 assert(HERE() == sn+1);
1512 s = p->strip[sn];
1513 d = p->stripdata[sn];
1514
1515 /* adjust paren pointers */
1516 assert(pos > 0);
1517 for (i = 1; i < NPAREN; i++) {
1518 if (p->pbegin[i] >= pos) {
1519 p->pbegin[i]++;
1520 }
1521 if (p->pend[i] >= pos) {
1522 p->pend[i]++;
1523 }
1524 }
1525
1526 memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
1527 (HERE()-pos-1)*sizeof(sop));
1528 memmove((char *)&p->stripdata[pos+1], (char *)&p->stripdata[pos],
1529 (HERE()-pos-1)*sizeof(RCHAR_T));
1530 p->strip[pos] = s;
1531 p->stripdata[pos] = d;
1532 }
1533
1534 /*
1535 - dofwd - complete a forward reference
1536 == static void dofwd(register struct parse *p, sopno pos, sop value);
1537 */
1538 static void
dofwd(register struct parse * p,register sopno pos,sop value)1539 dofwd(register struct parse *p, register sopno pos, sop value)
1540 {
1541 /* avoid making error situations worse */
1542 if (p->error != 0)
1543 return;
1544
1545 assert(value < 1);
1546 p->stripdata[pos] = value;
1547 }
1548
1549 /*
1550 - enlarge - enlarge the strip
1551 == static int enlarge(register struct parse *p, sopno size);
1552 */
1553 static int
enlarge(register struct parse * p,register sopno size)1554 enlarge(register struct parse *p, register sopno size)
1555 {
1556 register sop *sp;
1557 register RCHAR_T *dp;
1558 sopno osize;
1559
1560 if (p->ssize >= size)
1561 return 1;
1562
1563 osize = p->ssize;
1564 p->ssize = size;
1565 if (MEMSIZE(p) > MEMLIMIT)
1566 goto oomem;
1567 sp = realloc(p->strip, p->ssize * sizeof(sop));
1568 if (sp == NULL)
1569 goto oomem;
1570 p->strip = sp;
1571 dp = realloc(p->stripdata, p->ssize * sizeof(RCHAR_T));
1572 if (dp == NULL) {
1573 oomem:
1574 p->ssize = osize;
1575 SETERROR(REG_ESPACE);
1576 return 0;
1577 }
1578 p->stripdata = dp;
1579 return 1;
1580 }
1581
1582 /*
1583 - stripsnug - compact the strip
1584 == static void stripsnug(register struct parse *p, register struct re_guts *g);
1585 */
1586 static void
stripsnug(register struct parse * p,register struct re_guts * g)1587 stripsnug(register struct parse *p, register struct re_guts *g)
1588 {
1589 g->nstates = p->slen;
1590 g->strip = (sop *)realloc((char *)p->strip,
1591 p->slen * sizeof(sop));
1592 if (g->strip == NULL) {
1593 SETERROR(REG_ESPACE);
1594 g->strip = p->strip;
1595 }
1596 g->stripdata = (RCHAR_T *)realloc((char *)p->stripdata,
1597 p->slen * sizeof(RCHAR_T));
1598 if (g->stripdata == NULL) {
1599 SETERROR(REG_ESPACE);
1600 g->stripdata = p->stripdata;
1601 }
1602 }
1603
1604 /*
1605 - findmust - fill in must and mlen with longest mandatory literal string
1606 == static void findmust(register struct parse *p, register struct re_guts *g);
1607 *
1608 * This algorithm could do fancy things like analyzing the operands of |
1609 * for common subsequences. Someday. This code is simple and finds most
1610 * of the interesting cases.
1611 *
1612 * Note that must and mlen got initialized during setup.
1613 */
1614 static void
findmust(struct parse * p,register struct re_guts * g)1615 findmust(struct parse *p, register struct re_guts *g)
1616 {
1617 register sop *scans;
1618 register RCHAR_T *scand;
1619 sop *starts = 0;
1620 RCHAR_T *startd = NULL;
1621 register sop *newstarts = 0;
1622 register RCHAR_T *newstartd = NULL;
1623 register sopno newlen;
1624 register sop s;
1625 register RCHAR_T d;
1626 register RCHAR_T *cp;
1627 register sopno i;
1628
1629 /* avoid making error situations worse */
1630 if (p->error != 0)
1631 return;
1632
1633 /* find the longest OCHAR sequence in strip */
1634 newlen = 0;
1635 scans = g->strip + 1;
1636 scand = g->stripdata + 1;
1637 do {
1638 s = *scans++;
1639 d = *scand++;
1640 switch (s) {
1641 case OCHAR: /* sequence member */
1642 if (newlen == 0) { /* new sequence */
1643 newstarts = scans - 1;
1644 newstartd = scand - 1;
1645 }
1646 newlen++;
1647 break;
1648 case OPLUS_: /* things that don't break one */
1649 case OLPAREN:
1650 case ORPAREN:
1651 break;
1652 case OQUEST_: /* things that must be skipped */
1653 case OCH_:
1654 scans--;
1655 scand--;
1656 do {
1657 scans += d;
1658 scand += d;
1659 s = *scans;
1660 d = *scand;
1661 /* assert() interferes w debug printouts */
1662 if (s != O_QUEST && s != O_CH && s != OOR2) {
1663 g->iflags |= BAD;
1664 return;
1665 }
1666 } while (s != O_QUEST && s != O_CH);
1667 /* fallthrough */
1668 default: /* things that break a sequence */
1669 if (newlen > g->mlen) { /* ends one */
1670 starts = newstarts;
1671 startd = newstartd;
1672 g->mlen = newlen;
1673 }
1674 newlen = 0;
1675 break;
1676 }
1677 } while (s != OEND);
1678
1679 if (g->mlen == 0) /* there isn't one */
1680 return;
1681
1682 /* turn it into a character string */
1683 g->must = malloc(((size_t)g->mlen + 1) * sizeof(RCHAR_T));
1684 if (g->must == NULL) { /* argh; just forget it */
1685 g->mlen = 0;
1686 return;
1687 }
1688 cp = g->must;
1689 scans = starts;
1690 scand = startd;
1691 for (i = g->mlen; i > 0; i--) {
1692 for (;;) {
1693 s = *scans++;
1694 d = *scand++;
1695 if (s == OCHAR)
1696 break;
1697 }
1698 assert(cp < g->must + g->mlen);
1699 *cp++ = d;
1700 }
1701 assert(cp == g->must + g->mlen);
1702 *cp++ = '\0'; /* just on general principles */
1703 }
1704
1705 /*
1706 - pluscount - count + nesting
1707 == static sopno pluscount(register struct parse *p, register struct re_guts *g);
1708 */
1709 static sopno /* nesting depth */
pluscount(struct parse * p,register struct re_guts * g)1710 pluscount(struct parse *p, register struct re_guts *g)
1711 {
1712 register sop *scan;
1713 register sop s;
1714 register sopno plusnest = 0;
1715 register sopno maxnest = 0;
1716
1717 if (p->error != 0)
1718 return(0); /* there may not be an OEND */
1719
1720 scan = g->strip + 1;
1721 do {
1722 s = *scan++;
1723 switch (s) {
1724 case OPLUS_:
1725 plusnest++;
1726 break;
1727 case O_PLUS:
1728 if (plusnest > maxnest)
1729 maxnest = plusnest;
1730 plusnest--;
1731 break;
1732 }
1733 } while (s != OEND);
1734 if (plusnest != 0)
1735 g->iflags |= BAD;
1736 return(maxnest);
1737 }
1738