1 /*        $NetBSD: regexp.c,v 1.16 2024/12/08 07:53:18 andvar Exp $   */
2 
3 /*
4  * Copyright (c) 1980, 1993
5  *        The Regents of the University of California.  All rights reserved.
6  *
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  */
32 
33 #if HAVE_NBTOOL_CONFIG_H
34 #include "nbtool_config.h"
35 #endif
36 
37 #include <sys/cdefs.h>
38 #ifndef lint
39 __COPYRIGHT("@(#) Copyright (c) 1980, 1993\
40  The Regents of the University of California.  All rights reserved.");
41 #endif /* not lint */
42 
43 #ifndef lint
44 #if 0
45 static char sccsid[] = "@(#)regexp.c    8.1 (Berkeley) 6/6/93";
46 #endif
47 __RCSID("$NetBSD: regexp.c,v 1.16 2024/12/08 07:53:18 andvar Exp $");
48 #endif /* not lint */
49 
50 #include <assert.h>
51 #include <ctype.h>
52 #include <stdlib.h>
53 #include <stdbool.h>
54 #include <string.h>
55 #include "extern.h"
56 
57 static void         expconv (void);
58 
59 bool       x_escaped;         /* true if we are currently x_escaped */
60 char      *x_start; /* start of string */
61 bool       l_onecase;         /* true if upper and lower equivalent */
62 
63 #define makelower(c) (isupper((unsigned char)(c)) ? tolower((unsigned char)(c)) : (c))
64 
65 static char
tocc(ptrdiff_t x)66 tocc(ptrdiff_t x) {
67           if (x & ~0xff)
68                     abort();
69           return (char)x;
70 }
71 
72 /*  STRNCMP -       like strncmp except that we convert the
73  *                  first string to lower case before comparing
74  *                  if l_onecase is set.
75  */
76 
77 int
STRNCMP(char * s1,char * s2,int len)78 STRNCMP(char *s1, char *s2, int len)
79 {
80           if (l_onecase) {
81               do
82                     if (*s2 - makelower(*s1))
83                               return *s2 - makelower(*s1);
84                     else {
85                               s2++;
86                               s1++;
87                     }
88               while (--len);
89           } else {
90               do
91                     if (*s2 - *s1)
92                               return *s2 - *s1;
93                     else {
94                               s2++;
95                               s1++;
96                     }
97               while (--len);
98           }
99           return(0);
100 }
101 
102 /*        The following routine converts an irregular expression to
103  *        internal format.
104  *
105  *        Either meta symbols (\a \d or \p) or character strings or
106  *        operations ( alternation or perenthesizing ) can be
107  *        specified.  Each starts with a descriptor byte.  The descriptor
108  *        byte has STR set for strings, META set for meta symbols
109  *        and OPER set for operations.
110  *        The descriptor byte can also have the OPT bit set if the object
111  *        defined is optional.  Also ALT can be set to indicate an alternation.
112  *
113  *        For metasymbols the byte following the descriptor byte identities
114  *        the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '(').  For
115  *        strings the byte after the descriptor is a character count for
116  *        the string:
117  *
118  *                  meta symbols := descriptor
119  *                                      symbol
120  *
121  *                  strings :=          descriptor
122  *                                      character count
123  *                                      the string
124  *
125  *                  operations :=       descriptor
126  *                                      symbol
127  *                                      character count
128  */
129 
130 /*
131  *  handy macros for accessing parts of match blocks
132  */
133 #define MSYM(A) (*(A+1))      /* symbol in a meta symbol block */
134 #define MNEXT(A) (A+2)                  /* character following a metasymbol block */
135 
136 #define OSYM(A) (*(A+1))      /* symbol in an operation block */
137 #define OCNT(A) (*(A+2))      /* character count */
138 #define ONEXT(A) (A+3)                  /* next character after the operation */
139 #define OPTR(A) (A+*(A+2))    /* place pointed to by the operator */
140 
141 #define SCNT(A) (*(A+1))      /* byte count of a string */
142 #define SSTR(A) (A+2)                   /* address of the string */
143 #define SNEXT(A) (A+2+*(A+1)) /* character following the string */
144 
145 /*
146  *  bit flags in the descriptor
147  */
148 #define OPT 1
149 #define STR 2
150 #define META 4
151 #define ALT 8
152 #define OPER 16
153 
154 static char *ccre;  /* pointer to current position in converted exp*/
155 static char *ure;   /* pointer current position in unconverted exp */
156 
157 char *
convexp(char * re)158 convexp(char *re)   /* unconverted irregular expression */
159 {
160     char *cre;                /* pointer to converted regular expression */
161 
162     /* allocate room for the converted expression */
163     if (re == NULL)
164           return NULL;
165     if (*re == '\0')
166           return NULL;
167     cre = malloc(4 * strlen(re) + 3);
168     ccre = cre;
169     ure = re;
170 
171     /* start the conversion with a \a */
172     *cre = META | OPT;
173     MSYM(cre) = 'a';
174     ccre = MNEXT(cre);
175 
176     /* start the conversion (its recursive) */
177     expconv();
178     *ccre = 0;
179     return cre;
180 }
181 
182 static void
expconv(void)183 expconv(void)
184 {
185     char *cs;                 /* pointer to current symbol in converted exp */
186     char c;                   /* character being processed */
187     char *acs;                /* pinter to last alternate */
188     int temp;
189 
190     /* let the conversion begin */
191     acs = NULL;
192     cs = NULL;
193     while (*ure) {
194           switch (c = *ure++) {
195 
196           case '\\':
197               switch (c = *ure++) {
198 
199               /* escaped characters are just characters */
200               default:
201                     if (cs == NULL || (*cs & STR) == 0) {
202                         cs = ccre;
203                         *cs = STR;
204                         SCNT(cs) = 1;
205                         ccre += 2;
206                     } else
207                         SCNT(cs)++;
208                     *ccre++ = c;
209                     break;
210 
211               /* normal(?) metacharacters */
212               case 'a':
213               case 'd':
214               case 'e':
215               case 'p':
216                     if (acs != NULL && acs != cs) {
217                         do {
218                               temp = OCNT(acs);
219                               OCNT(acs) = tocc(ccre - acs);
220                               acs -= temp;
221                         } while (temp != 0);
222                         acs = NULL;
223                     }
224                     cs = ccre;
225                     *cs = META;
226                     MSYM(cs) = c;
227                     ccre = MNEXT(cs);
228                     break;
229               }
230               break;
231 
232           /* just put the symbol in */
233           case '^':
234           case '$':
235               if (acs != NULL && acs != cs) {
236                     do {
237                         temp = OCNT(acs);
238                         OCNT(acs) = tocc(ccre - acs);
239                         acs -= temp;
240                     } while (temp != 0);
241                     acs = NULL;
242               }
243               cs = ccre;
244               *cs = META;
245               MSYM(cs) = c;
246               ccre = MNEXT(cs);
247               break;
248 
249           /* mark the last match sequence as optional */
250           case '?':
251               if (cs)
252                     *cs = *cs | OPT;
253               break;
254 
255           /* recurse and define a subexpression */
256           case '(':
257               if (acs != NULL && acs != cs) {
258                     do {
259                         temp = OCNT(acs);
260                         OCNT(acs) = tocc(ccre - acs);
261                         acs -= temp;
262                     } while (temp != 0);
263                     acs = NULL;
264               }
265               cs = ccre;
266               *cs = OPER;
267               OSYM(cs) = '(';
268               ccre = ONEXT(cs);
269               expconv();
270               OCNT(cs) = tocc(ccre - cs);                   /* offset to next symbol */
271               break;
272 
273           /* return from a recursion */
274           case ')':
275               if (acs != NULL) {
276                     do {
277                         temp = OCNT(acs);
278                         OCNT(acs) = tocc(ccre - acs);
279                         acs -= temp;
280                     } while (temp != 0);
281                     acs = NULL;
282               }
283               cs = ccre;
284               *cs = META;
285               MSYM(cs) = c;
286               ccre = MNEXT(cs);
287               return;
288 
289           /* mark the last match sequence as having an alternate */
290           /* the third byte will contain an offset to jump over the */
291           /* alternate match in case the first did not fail */
292           case '|':
293               if (acs != NULL && acs != cs)
294                     OCNT(ccre) = tocc(ccre - acs);          /* make a back pointer */
295               else
296                     OCNT(ccre) = 0;
297               assert(cs != NULL);
298               *cs |= ALT;
299               cs = ccre;
300               *cs = OPER;
301               OSYM(cs) = '|';
302               ccre = ONEXT(cs);
303               acs = cs;       /* remember that the pointer is to be filles */
304               break;
305 
306           /* if its not a metasymbol just build a scharacter string */
307           default:
308               if (cs == NULL || (*cs & STR) == 0) {
309                     cs = ccre;
310                     *cs = STR;
311                     SCNT(cs) = 1;
312                     ccre = SSTR(cs);
313               } else
314                     SCNT(cs)++;
315               *ccre++ = c;
316               break;
317           }
318     }
319     if (acs != NULL) {
320           do {
321               temp = OCNT(acs);
322               OCNT(acs) = tocc(ccre - acs);
323               acs -= temp;
324           } while (temp != 0);
325           acs = NULL;
326     }
327     return;
328 }
329 /* end of convertre */
330 
331 
332 /*
333  *        The following routine recognises an irregular expression
334  *        with the following special characters:
335  *
336  *                  \?        -         means last match was optional
337  *                  \a        -         matches any number of characters
338  *                  \d        -         matches any number of spaces and tabs
339  *                  \p        -         matches any number of alphanumeric
340  *                                      characters. The
341  *                                      characters matched will be copied into
342  *                                      the area pointed to by 'name'.
343  *                  \|        -         alternation
344  *                  \( \)     -         grouping used mostly for alternation and
345  *                                      optionality
346  *
347  *        The irregular expression must be translated to internal form
348  *        prior to calling this routine
349  *
350  *        The value returned is the pointer to the first non \a
351  *        character matched.
352  */
353 
354 char *
expmatch(char * s,char * re,char * mstring)355 expmatch(
356     char *s,                  /* string to check for a match in */
357     char *re,                 /* a converted irregular expression */
358     char *mstring)  /* where to put whatever matches a \p */
359 {
360     char *cs;                 /* the current symbol */
361     char *ptr,*s1;  /* temporary pointer */
362     bool matched;   /* a temporary bool */
363 
364     /* initial conditions */
365     if (re == NULL)
366           return NULL;
367     cs = re;
368     matched = false;
369 
370     /* loop till expression string is exhausted (or at least pretty tired) */
371     while (*cs) {
372           switch (*cs & (OPER | STR | META)) {
373 
374           /* try to match a string */
375           case STR:
376               matched = !STRNCMP (s, SSTR(cs), SCNT(cs));
377               if (matched) {
378 
379                     /* hoorah it matches */
380                     s += SCNT(cs);
381                     cs = SNEXT(cs);
382               } else if (*cs & ALT) {
383 
384                     /* alternation, skip to next expression */
385                     cs = SNEXT(cs);
386               } else if (*cs & OPT) {
387 
388                     /* the match is optional */
389                     cs = SNEXT(cs);
390                     matched = 1;                  /* indicate a successful match */
391               } else {
392 
393                     /* no match, error return */
394                     return NULL;
395               }
396               break;
397 
398           /* an operator, do something fancy */
399           case OPER:
400               switch (OSYM(cs)) {
401 
402               /* this is an alternation */
403               case '|':
404                     if (matched)
405 
406                         /* last thing in the alternation was a match, skip ahead */
407                         cs = OPTR(cs);
408                     else
409 
410                         /* no match, keep trying */
411                         cs = ONEXT(cs);
412                     break;
413 
414               /* this is a grouping, recurse */
415               case '(':
416                     ptr = expmatch(s, ONEXT(cs), mstring);
417                     if (ptr != NULL) {
418 
419                         /* the subexpression matched */
420                         matched = 1;
421                         s = ptr;
422                     } else if (*cs & ALT) {
423 
424                         /* alternation, skip to next expression */
425                         matched = 0;
426                     } else if (*cs & OPT) {
427 
428                         /* the match is optional */
429                         matched = 1;    /* indicate a successful match */
430                     } else {
431 
432                         /* no match, error return */
433                         return NULL;
434                     }
435                     cs = OPTR(cs);
436                     break;
437               }
438               break;
439 
440           /* try to match a metasymbol */
441           case META:
442               switch (MSYM(cs)) {
443 
444               /* try to match anything and remember what was matched */
445               case 'p':
446                     /*
447                      *  This is really the same as trying the match the
448                      *  remaining parts of the expression to any subset
449                      *  of the string.
450                      */
451                     s1 = s;
452                     do {
453                         ptr = expmatch(s1, MNEXT(cs), mstring);
454                         if (ptr != NULL && s1 != s) {
455 
456                               /* we have a match, remember the match */
457                               strncpy(mstring, s, (size_t)(s1 - s));
458                               mstring[s1 - s] = '\0';
459                               return ptr;
460                         } else if (ptr != NULL && (*cs & OPT)) {
461 
462                               /* it was aoptional so no match is ok */
463                               return ptr;
464                         } else if (ptr != NULL) {
465 
466                               /* not optional and we still matched */
467                               return NULL;
468                         }
469                         if (!isalnum((unsigned char)*s1) && *s1 != '_')
470                               return NULL;
471                         if (*s1 == '\\')
472                               x_escaped = x_escaped ? false : true;
473                         else
474                               x_escaped = false;
475                     } while (*s1++);
476                     return NULL;
477 
478               /* try to match anything */
479               case 'a':
480                     /*
481                      *  This is really the same as trying the match the
482                      *  remaining parts of the expression to any subset
483                      *  of the string.
484                      */
485                     s1 = s;
486                     do {
487                         ptr = expmatch(s1, MNEXT(cs), mstring);
488                         if (ptr != NULL && s1 != s) {
489 
490                               /* we have a match */
491                               return ptr;
492                         } else if (ptr != NULL && (*cs & OPT)) {
493 
494                               /* it was aoptional so no match is ok */
495                               return ptr;
496                         } else if (ptr != NULL) {
497 
498                               /* not optional and we still matched */
499                               return NULL;
500                         }
501                         if (*s1 == '\\')
502                               x_escaped = x_escaped ? false : true;
503                         else
504                               x_escaped = false;
505                     } while (*s1++);
506                     return NULL;
507 
508               /* fail if we are currently x_escaped */
509               case 'e':
510                     if (x_escaped)
511                         return NULL;
512                     cs = MNEXT(cs);
513                     break;
514 
515               /* match any number of tabs and spaces */
516               case 'd':
517                     ptr = s;
518                     while (*s == ' ' || *s == '\t')
519                         s++;
520                     if (s != ptr || s == x_start) {
521 
522                         /* match, be happy */
523                         matched = 1;
524                         cs = MNEXT(cs);
525                     } else if (*s == '\n' || *s == '\0') {
526 
527                         /* match, be happy */
528                         matched = 1;
529                         cs = MNEXT(cs);
530                     } else if (*cs & ALT) {
531 
532                         /* try the next part */
533                         matched = 0;
534                         cs = MNEXT(cs);
535                     } else if (*cs & OPT) {
536 
537                         /* doesn't matter */
538                         matched = 1;
539                         cs = MNEXT(cs);
540                     } else
541 
542                         /* no match, error return */
543                         return NULL;
544                     break;
545 
546               /* check for end of line */
547               case '$':
548                     if (*s == '\0' || *s == '\n') {
549 
550                         /* match, be happy */
551                         s++;
552                         matched = 1;
553                         cs = MNEXT(cs);
554                     } else if (*cs & ALT) {
555 
556                         /* try the next part */
557                         matched = 0;
558                         cs = MNEXT(cs);
559                     } else if (*cs & OPT) {
560 
561                         /* doesn't matter */
562                         matched = 1;
563                         cs = MNEXT(cs);
564                     } else
565 
566                         /* no match, error return */
567                         return NULL;
568                     break;
569 
570               /* check for start of line */
571               case '^':
572                     if (s == x_start) {
573 
574                         /* match, be happy */
575                         matched = 1;
576                         cs = MNEXT(cs);
577                     } else if (*cs & ALT) {
578 
579                         /* try the next part */
580                         matched = 0;
581                         cs = MNEXT(cs);
582                     } else if (*cs & OPT) {
583 
584                         /* doesn't matter */
585                         matched = 1;
586                         cs = MNEXT(cs);
587                     } else
588 
589                         /* no match, error return */
590                         return NULL;
591                     break;
592 
593               /* end of a subexpression, return success */
594               case ')':
595                     return s;
596               }
597               break;
598           }
599     }
600     return s;
601 }
602