xref: /NextBSD/usr.bin/grep/regex/tre-fastmatch.c (revision b137080f19736ee33fede2e88bb54438604cf86b)
1 /* $FreeBSD$ */
2 
3 /*-
4  * Copyright (c) 1999 James Howard and Dag-Erling Coïdan Smørgrav
5  * Copyright (C) 2008-2011 Gabor Kovesdan <gabor@FreeBSD.org>
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 #include "glue.h"
31 
32 #include <ctype.h>
33 #include <limits.h>
34 #include <regex.h>
35 #include <stdbool.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #ifdef TRE_WCHAR
39 #include <wchar.h>
40 #include <wctype.h>
41 #endif
42 
43 #include "hashtable.h"
44 #include "tre-fastmatch.h"
45 #include "xmalloc.h"
46 
47 static int	fastcmp(const fastmatch_t *fg, const void *data,
48 			tre_str_type_t type);
49 
50 /*
51  * Clean up if pattern compilation fails.
52  */
53 #define FAIL_COMP(errcode)						\
54   {									\
55     if (fg->pattern)							\
56       xfree(fg->pattern);						\
57     if (fg->wpattern)							\
58       xfree(fg->wpattern);						\
59     if (fg->qsBc_table)							\
60       hashtable_free(fg->qsBc_table);					\
61     fg = NULL;								\
62     return errcode;							\
63   }
64 
65 /*
66  * Skips n characters in the input string and assigns the start
67  * address to startptr. Note: as per IEEE Std 1003.1-2008
68  * matching is based on bit pattern not character representations
69  * so we can handle MB strings as byte sequences just like
70  * SB strings.
71  */
72 #define SKIP_CHARS(n)							\
73   switch (type)								\
74     {									\
75       case STR_WIDE:							\
76 	startptr = str_wide + n;					\
77 	break;								\
78       default:								\
79 	startptr = str_byte + n;					\
80     }
81 
82 /*
83  * Converts the wide string pattern to SB/MB string and stores
84  * it in fg->pattern. Sets fg->len to the byte length of the
85  * converted string.
86  */
87 #define STORE_MBS_PAT							\
88   {									\
89     size_t siz;								\
90 									\
91     siz = wcstombs(NULL, fg->wpattern, 0);				\
92     if (siz == (size_t)-1)						\
93       return REG_BADPAT;						\
94     fg->len = siz;							\
95     fg->pattern = xmalloc(siz + 1);					\
96     if (fg->pattern == NULL)						\
97       return REG_ESPACE;						\
98     wcstombs(fg->pattern, fg->wpattern, siz);				\
99     fg->pattern[siz] = '\0';						\
100   }									\
101 
102 #define IS_OUT_OF_BOUNDS						\
103   ((!fg->reversed							\
104     ? ((type == STR_WIDE) ? ((j + fg->wlen) > len)			\
105 			  : ((j + fg->len) > len))			\
106     : (j < 0)))
107 
108 /*
109  * Checks whether the new position after shifting in the input string
110  * is out of the bounds and break out from the loop if so.
111  */
112 #define CHECKBOUNDS							\
113   if (IS_OUT_OF_BOUNDS)							\
114     break;								\
115 
116 /*
117  * Shifts in the input string after a mismatch. The position of the
118  * mismatch is stored in the mismatch variable.
119  */
120 #define SHIFT								\
121   CHECKBOUNDS;								\
122 									\
123   {									\
124     int r = -1;								\
125     unsigned int bc = 0, gs = 0, ts;					\
126 									\
127     switch (type)							\
128       {									\
129 	case STR_WIDE:							\
130 	  if (!fg->hasdot)						\
131 	    {								\
132 	      if (u != 0 && (unsigned)mismatch == fg->wlen - 1 - shift)	\
133 		mismatch -= u;						\
134 	      v = fg->wlen - 1 - mismatch;				\
135 	      r = hashtable_get(fg->qsBc_table,				\
136 		&str_wide[!fg->reversed ? (size_t)j + fg->wlen		\
137 			  : (size_t)j - 1], &bc);			\
138 	      gs = fg->bmGs[mismatch];					\
139 	    }								\
140 	    bc = (r == HASH_OK) ? bc : fg->defBc;			\
141 	    DPRINT(("tre_fast_match: mismatch on character" CHF ", "	\
142 		    "BC %d, GS %d\n",					\
143 		    ((const tre_char_t *)startptr)[mismatch + 1],	\
144 		    bc, gs));						\
145             break;							\
146 	default:							\
147 	  if (!fg->hasdot)						\
148 	    {								\
149 	      if (u != 0 && (unsigned)mismatch == fg->len - 1 - shift)	\
150 		mismatch -= u;						\
151 	      v = fg->len - 1 - mismatch;				\
152 	      gs = fg->sbmGs[mismatch];					\
153 	    }								\
154 	  bc = fg->qsBc[((const unsigned char *)str_byte)		\
155 			[!fg->reversed ? (size_t)j + fg->len		\
156 			 : (size_t)j - 1]];				\
157 	  DPRINT(("tre_fast_match: mismatch on character %c, "		\
158 		 "BC %d, GS %d\n",					\
159 		 ((const unsigned char *)startptr)[mismatch + 1],	\
160 		 bc, gs));						\
161       }									\
162     if (fg->hasdot)							\
163       shift = bc;							\
164     else								\
165       {									\
166 	ts = (u >= v) ? (u - v) : 0;					\
167 	shift = MAX(ts, bc);						\
168 	shift = MAX(shift, gs);						\
169 	if (shift == gs)						\
170 	  u = MIN((type == STR_WIDE ? fg->wlen : fg->len) - shift, v);	\
171 	else								\
172 	  {								\
173 	    if (ts < bc)						\
174 	      shift = MAX(shift, u + 1);				\
175 	    u = 0;							\
176 	  }								\
177       }									\
178       DPRINT(("tre_fast_match: shifting %u characters\n", shift));	\
179       j = !fg->reversed ? j + shift : j - shift;			\
180   }
181 
182 /*
183  * Normal Quick Search would require a shift based on the position the
184  * next character after the comparison is within the pattern.  With
185  * wildcards, the position of the last dot effects the maximum shift
186  * distance.
187  * The closer to the end the wild card is the slower the search.
188  *
189  * Examples:
190  * Pattern    Max shift
191  * -------    ---------
192  * this               5
193  * .his               4
194  * t.is               3
195  * th.s               2
196  * thi.               1
197  */
198 
199 /*
200  * Fills in the bad character shift array for SB/MB strings.
201  */
202 #define FILL_QSBC							\
203   if (fg->reversed)							\
204     {									\
205       _FILL_QSBC_REVERSED						\
206     }									\
207   else									\
208     {									\
209       _FILL_QSBC							\
210     }
211 
212 #define _FILL_QSBC							\
213   for (unsigned int i = 0; i <= UCHAR_MAX; i++)				\
214     fg->qsBc[i] = fg->len - hasdot;					\
215   for (unsigned int i = hasdot + 1; i < fg->len; i++)			\
216     {									\
217       fg->qsBc[(unsigned char)fg->pattern[i]] = fg->len - i;		\
218       DPRINT(("BC shift for char %c is %zu\n", fg->pattern[i],		\
219 	     fg->len - i));						\
220       if (fg->icase)							\
221 	{								\
222 	  char c = islower((unsigned char)fg->pattern[i]) ?		\
223 		   toupper((unsigned char)fg->pattern[i]) :		\
224 		   tolower((unsigned char)fg->pattern[i]);		\
225 	  fg->qsBc[(unsigned char)c] = fg->len - i;			\
226 	  DPRINT(("BC shift for char %c is %zu\n", c, fg->len - i));	\
227 	}								\
228     }
229 
230 #define _FILL_QSBC_REVERSED						\
231   for (unsigned int i = 0; i <= UCHAR_MAX; i++)				\
232     fg->qsBc[i] = firstdot + 1;						\
233   for (int i = firstdot - 1; i >= 0; i--)				\
234     {									\
235       fg->qsBc[(unsigned char)fg->pattern[i]] = i + 1;			\
236       DPRINT(("Reverse BC shift for char %c is %d\n", fg->pattern[i],	\
237 	     i + 1));							\
238       if (fg->icase)							\
239         {								\
240           char c = islower((unsigned char)fg->pattern[i]) ?		\
241 		   toupper((unsigned char)fg->pattern[i]) :		\
242 		   tolower((unsigned char)fg->pattern[i]);		\
243           fg->qsBc[(unsigned char)c] = i + 1;				\
244 	  DPRINT(("Reverse BC shift for char %c is %d\n", c, i + 1));	\
245         }								\
246     }
247 
248 /*
249  * Fills in the bad character shifts into a hastable for wide strings.
250  * With wide characters it is not possible any more to use a normal
251  * array because there are too many characters and we could not
252  * provide enough memory. Fortunately, we only have to store distinct
253  * values for so many characters as the number of distinct characters
254  * in the pattern, so we can store them in a hashtable and store a
255  * default shift value for the rest.
256  */
257 #define FILL_QSBC_WIDE							\
258   if (fg->reversed)							\
259     {									\
260       _FILL_QSBC_WIDE_REVERSED						\
261     }									\
262   else									\
263     {									\
264       _FILL_QSBC_WIDE							\
265     }
266 
267 #define _FILL_QSBC_WIDE							\
268   /* Adjust the shift based on location of the last dot ('.'). */	\
269   fg->defBc = fg->wlen - whasdot;					\
270 									\
271   /* Preprocess pattern. */						\
272   fg->qsBc_table = hashtable_init(fg->wlen * (fg->icase ? 8 : 4),	\
273 				  sizeof(tre_char_t), sizeof(int));	\
274   if (!fg->qsBc_table)							\
275     FAIL_COMP(REG_ESPACE);						\
276   for (unsigned int i = whasdot + 1; i < fg->wlen; i++)			\
277     {									\
278       int k = fg->wlen - i;						\
279       int r;								\
280 									\
281       r = hashtable_put(fg->qsBc_table, &fg->wpattern[i], &k);		\
282       if ((r == HASH_FAIL) || (r == HASH_FULL))				\
283 	FAIL_COMP(REG_ESPACE);						\
284       DPRINT(("BC shift for wide char " CHF " is %d\n", fg->wpattern[i],\
285 	     k));							\
286       if (fg->icase)							\
287 	{								\
288 	  tre_char_t wc = iswlower(fg->wpattern[i]) ?			\
289 	    towupper(fg->wpattern[i]) : towlower(fg->wpattern[i]);	\
290 	  r = hashtable_put(fg->qsBc_table, &wc, &k);			\
291 	  if ((r == HASH_FAIL) || (r == HASH_FULL))			\
292 	    FAIL_COMP(REG_ESPACE);					\
293 	  DPRINT(("BC shift for wide char " CHF " is %d\n", wc, k));	\
294 	}								\
295     }
296 
297 #define _FILL_QSBC_WIDE_REVERSED					\
298   /* Adjust the shift based on location of the last dot ('.'). */	\
299   fg->defBc = (size_t)wfirstdot;					\
300 									\
301   /* Preprocess pattern. */						\
302   fg->qsBc_table = hashtable_init(fg->wlen * (fg->icase ? 8 : 4),	\
303 				  sizeof(tre_char_t), sizeof(int));	\
304   if (!fg->qsBc_table)							\
305     FAIL_COMP(REG_ESPACE);						\
306   for (int i = wfirstdot - 1; i >= 0; i--)				\
307     {									\
308       int k = i + 1;							\
309       int r;								\
310 									\
311       r = hashtable_put(fg->qsBc_table, &fg->wpattern[i], &k);		\
312       if ((r == HASH_FAIL) || (r == HASH_FULL))				\
313 	FAIL_COMP(REG_ESPACE);						\
314       DPRINT(("Reverse BC shift for wide char " CHF " is %d\n",		\
315 	     fg->wpattern[i], k));					\
316       if (fg->icase)							\
317 	{								\
318 	  tre_char_t wc = iswlower(fg->wpattern[i]) ?			\
319 	    towupper(fg->wpattern[i]) : towlower(fg->wpattern[i]);	\
320 	  r = hashtable_put(fg->qsBc_table, &wc, &k);			\
321 	  if ((r == HASH_FAIL) || (r == HASH_FULL))			\
322 	    FAIL_COMP(REG_ESPACE);					\
323 	  DPRINT(("Reverse BC shift for wide char " CHF " is %d\n", wc,	\
324 		 k));							\
325 	}								\
326     }
327 
328 #ifdef _GREP_DEBUG
329 #define DPRINT_BMGS(len, fmt_str, sh)					\
330   for (unsigned int i = 0; i < len; i++)				\
331     DPRINT((fmt_str, i, sh[i]));
332 #else
333 #define DPRINT_BMGS(len, fmt_str, sh)					\
334   do { } while(/*CONSTCOND*/0)
335 #endif
336 
337 /*
338  * Fills in the good suffix table for SB/MB strings.
339  */
340 #define FILL_BMGS							\
341   if (!fg->hasdot)							\
342     {									\
343       fg->sbmGs = xmalloc(fg->len * sizeof(int));			\
344       if (!fg->sbmGs)							\
345 	return REG_ESPACE;						\
346       if (fg->len == 1)							\
347 	fg->sbmGs[0] = 1;						\
348       else								\
349 	_FILL_BMGS(fg->sbmGs, fg->pattern, fg->len, false);		\
350       DPRINT_BMGS(fg->len, "GS shift for pos %d is %d\n", fg->sbmGs);	\
351     }
352 
353 /*
354  * Fills in the good suffix table for wide strings.
355  */
356 #define FILL_BMGS_WIDE							\
357   if (!fg->hasdot)							\
358     {									\
359       fg->bmGs = xmalloc(fg->wlen * sizeof(int));			\
360       if (!fg->bmGs)							\
361 	return REG_ESPACE;						\
362       if (fg->wlen == 1)						\
363 	fg->bmGs[0] = 1;						\
364       else								\
365 	_FILL_BMGS(fg->bmGs, fg->wpattern, fg->wlen, true);		\
366       DPRINT_BMGS(fg->wlen, "GS shift (wide) for pos %d is %d\n",	\
367 		  fg->bmGs);						\
368     }
369 
370 #define _FILL_BMGS(arr, pat, plen, wide)				\
371   {									\
372     char *p;								\
373     tre_char_t *wp;							\
374 									\
375     if (wide)								\
376       {									\
377 	if (fg->icase)							\
378 	  {								\
379 	    wp = xmalloc(plen * sizeof(tre_char_t));			\
380 	    if (wp == NULL)						\
381 	      return REG_ESPACE;					\
382 	    for (unsigned int i = 0; i < plen; i++)			\
383 	      wp[i] = towlower(pat[i]);					\
384 	    _CALC_BMGS(arr, wp, plen);					\
385 	    xfree(wp);							\
386 	  }								\
387 	else								\
388 	  _CALC_BMGS(arr, pat, plen);					\
389       }									\
390     else								\
391       {									\
392 	if (fg->icase)							\
393 	  {								\
394 	    p = xmalloc(plen);						\
395 	    if (p == NULL)						\
396 	      return REG_ESPACE;					\
397 	    for (unsigned int i = 0; i < plen; i++)			\
398 	      p[i] = tolower((unsigned char)pat[i]);                    \
399 	    _CALC_BMGS(arr, p, plen);					\
400 	    xfree(p);							\
401 	  }								\
402 	else								\
403 	  _CALC_BMGS(arr, pat, plen);					\
404       }									\
405   }
406 
407 #define _CALC_BMGS(arr, pat, plen)					\
408   {									\
409     int f = 0, g;							\
410 									\
411     int *suff = xmalloc(plen * sizeof(int));				\
412     if (suff == NULL)							\
413       return REG_ESPACE;						\
414 									\
415     suff[plen - 1] = plen;						\
416     g = plen - 1;							\
417     for (int i = plen - 2; i >= 0; i--)					\
418       {									\
419 	if (i > g && suff[i + plen - 1 - f] < i - g)			\
420 	  suff[i] = suff[i + plen - 1 - f];				\
421 	else								\
422 	  {								\
423 	    if (i < g)							\
424 	      g = i;							\
425 	    f = i;							\
426 	    while (g >= 0 && pat[g] == pat[g + plen - 1 - f])		\
427 	      g--;							\
428 	    suff[i] = f - g;						\
429 	  }								\
430       }									\
431 									\
432     for (unsigned int i = 0; i < plen; i++)				\
433       arr[i] = plen;							\
434     g = 0;								\
435     for (int i = plen - 1; i >= 0; i--)					\
436       if (suff[i] == i + 1)						\
437 	for(; (unsigned long)g < plen - 1 - i; g++)			\
438 	  if (arr[g] == plen)						\
439 	    arr[g] = plen - 1 - i;					\
440     for (unsigned int i = 0; i <= plen - 2; i++)			\
441       arr[plen - 1 - suff[i]] = plen - 1 - i;				\
442 									\
443     xfree(suff);							\
444   }
445 
446 /*
447  * Copies the pattern pat having length n to p and stores
448  * the size in l.
449  */
450 #define SAVE_PATTERN(src, srclen, dst, dstlen)				\
451   dstlen = srclen;							\
452   dst = xmalloc((dstlen + 1) * sizeof(tre_char_t));			\
453   if (dst == NULL)							\
454     return REG_ESPACE;							\
455   if (dstlen > 0)							\
456     memcpy(dst, src, dstlen * sizeof(tre_char_t));			\
457   dst[dstlen] = TRE_CHAR('\0');
458 
459 /*
460  * Initializes pattern compiling.
461  */
462 #define INIT_COMP							\
463   /* Initialize. */							\
464   memset(fg, 0, sizeof(*fg));						\
465   fg->icase = (cflags & REG_ICASE);					\
466   fg->word = (cflags & REG_WORD);					\
467   fg->newline = (cflags & REG_NEWLINE);					\
468   fg->nosub = (cflags & REG_NOSUB);					\
469 									\
470   /* Cannot handle REG_ICASE with MB string */				\
471   if (fg->icase && (TRE_MB_CUR_MAX > 1) && n > 0)			\
472     {									\
473       DPRINT(("Cannot use fast matcher for MBS with REG_ICASE\n"));	\
474       return REG_BADPAT;						\
475     }
476 
477 /*
478  * Checks whether we have a 0-length pattern that will match
479  * anything. If literal is set to false, the EOL anchor is also
480  * taken into account.
481  */
482 #define CHECK_MATCHALL(literal)						\
483   if (!literal && n == 1 && pat[0] == TRE_CHAR('$'))			\
484     {									\
485       n--;								\
486       fg->eol = true;							\
487     }									\
488 									\
489   if (n == 0)								\
490     {									\
491       fg->matchall = true;						\
492       fg->pattern = xmalloc(sizeof(char));				\
493       if (!fg->pattern)							\
494 	FAIL_COMP(REG_ESPACE);						\
495       fg->pattern[0] = '\0';						\
496       fg->wpattern = xmalloc(sizeof(tre_char_t));			\
497       if (!fg->wpattern)						\
498 	FAIL_COMP(REG_ESPACE);						\
499       fg->wpattern[0] = TRE_CHAR('\0');					\
500       DPRINT(("Matching every input\n"));				\
501       return REG_OK;							\
502     }
503 
504 /*
505  * Returns: REG_OK on success, error code otherwise
506  */
507 int
tre_compile_literal(fastmatch_t * fg,const tre_char_t * pat,size_t n,int cflags)508 tre_compile_literal(fastmatch_t *fg, const tre_char_t *pat, size_t n,
509 		    int cflags)
510 {
511   size_t hasdot = 0, whasdot = 0;
512   ssize_t firstdot = -1, wfirstdot = -1;
513 
514   INIT_COMP;
515 
516   CHECK_MATCHALL(true);
517 
518   /* Cannot handle word boundaries with MB string */
519   if (fg->word && (TRE_MB_CUR_MAX > 1))
520     return REG_BADPAT;
521 
522 #ifdef TRE_WCHAR
523   SAVE_PATTERN(pat, n, fg->wpattern, fg->wlen);
524   STORE_MBS_PAT;
525 #else
526   SAVE_PATTERN(pat, n, fg->pattern, fg->len);
527 #endif
528 
529   DPRINT(("tre_compile_literal: pattern: %s, len %zu, icase: %c, word: %c, "
530 	 "newline %c\n", fg->pattern, fg->len, fg->icase ? 'y' : 'n',
531 	 fg->word ? 'y' : 'n', fg->newline ? 'y' : 'n'));
532 
533   FILL_QSBC;
534   FILL_BMGS;
535 #ifdef TRE_WCHAR
536   FILL_QSBC_WIDE;
537   FILL_BMGS_WIDE;
538 #endif
539 
540   return REG_OK;
541 }
542 
543 /*
544  * Returns: REG_OK on success, error code otherwise
545  */
546 int
tre_compile_fast(fastmatch_t * fg,const tre_char_t * pat,size_t n,int cflags)547 tre_compile_fast(fastmatch_t *fg, const tre_char_t *pat, size_t n,
548 		 int cflags)
549 {
550   tre_char_t *tmp;
551   size_t pos = 0, hasdot = 0, whasdot = 0;
552   ssize_t firstdot = -1, wfirstdot = -1;
553   bool escaped = false;
554   bool *_escmap = NULL;
555 
556   INIT_COMP;
557 
558   /* Remove beginning-of-line character ('^'). */
559   if (pat[0] == TRE_CHAR('^'))
560     {
561       fg->bol = true;
562       n--;
563       pat++;
564     }
565 
566   CHECK_MATCHALL(false);
567 
568   /* Handle word-boundary matching when GNU extensions are enabled */
569   if ((cflags & REG_GNU) && (n >= 14) &&
570       (memcmp(pat, TRE_CHAR("[[:<:]]"), 7 * sizeof(tre_char_t)) == 0) &&
571       (memcmp(pat + n - 7, TRE_CHAR("[[:>:]]"),
572 	      7 * sizeof(tre_char_t)) == 0))
573     {
574       n -= 14;
575       pat += 7;
576       fg->word = true;
577     }
578 
579   /* Cannot handle word boundaries with MB string */
580   if (fg->word && (TRE_MB_CUR_MAX > 1))
581     return REG_BADPAT;
582 
583   tmp = xmalloc((n + 1) * sizeof(tre_char_t));
584   if (tmp == NULL)
585     return REG_ESPACE;
586 
587 /* Copies the char into the stored pattern and skips to the next char. */
588 #define STORE_CHAR							\
589   do									\
590     {									\
591       tmp[pos++] = pat[i];						\
592       escaped = false;							\
593       continue;								\
594     } while (0)
595 
596   /* Traverse the input pattern for processing */
597   for (unsigned int i = 0; i < n; i++)
598     {
599       switch (pat[i])
600 	{
601 	  case TRE_CHAR('\\'):
602 	    if (escaped)
603 	      STORE_CHAR;
604 	    else if (i == n - 1)
605 	      goto badpat;
606 	    else
607 	      escaped = true;
608 	    continue;
609 	  case TRE_CHAR('['):
610 	    if (escaped)
611 	      STORE_CHAR;
612 	    else
613 	      goto badpat;
614 	    continue;
615 	  case TRE_CHAR('*'):
616 	    if (escaped || (!(cflags & REG_EXTENDED) && (i == 0)))
617 	      STORE_CHAR;
618 	    else
619 	      goto badpat;
620 	    continue;
621 	  case TRE_CHAR('+'):
622 	  case TRE_CHAR('?'):
623 	    if ((cflags & REG_EXTENDED) && (i == 0))
624 	      continue;
625 	    else if ((cflags & REG_EXTENDED) ^ !escaped)
626 	      STORE_CHAR;
627 	    else
628 	      goto badpat;
629 	    continue;
630 	  case TRE_CHAR('.'):
631 	    if (escaped)
632 	      {
633 		if (!_escmap)
634 		  _escmap = xmalloc(n * sizeof(bool));
635 		if (!_escmap)
636 		  {
637 		    xfree(tmp);
638 		    return REG_ESPACE;
639 		  }
640 		_escmap[i] = true;
641 		STORE_CHAR;
642 	      }
643 	    else
644 	      {
645 		whasdot = i;
646 		if (wfirstdot == -1)
647 			wfirstdot = i;
648 		STORE_CHAR;
649 	      }
650 	    continue;
651 	  case TRE_CHAR('^'):
652 	    STORE_CHAR;
653 	    continue;
654 	  case TRE_CHAR('$'):
655 	    if (!escaped && (i == n - 1))
656 	      fg->eol = true;
657 	    else
658 	      STORE_CHAR;
659 	    continue;
660 	  case TRE_CHAR('('):
661 	    if ((cflags & REG_EXTENDED) ^ escaped)
662 	      goto badpat;
663 	    else
664 	      STORE_CHAR;
665 	    continue;
666 	  case TRE_CHAR('{'):
667 	    if (!(cflags & REG_EXTENDED) ^ escaped)
668 	      STORE_CHAR;
669 	    else if (!(cflags & REG_EXTENDED) && (i == 0))
670 	      STORE_CHAR;
671 	    else if ((cflags & REG_EXTENDED) && (i == 0))
672 	      continue;
673 	    else
674 	      goto badpat;
675 	    continue;
676 	  case TRE_CHAR('|'):
677 	    if ((cflags & REG_EXTENDED) ^ escaped)
678 	      goto badpat;
679 	    else
680 	      STORE_CHAR;
681 	    continue;
682 	  default:
683 	    if (escaped)
684 	      goto badpat;
685 	    else
686 	      STORE_CHAR;
687 	    continue;
688 	}
689       continue;
690 badpat:
691       xfree(tmp);
692       DPRINT(("tre_compile_fast: compilation of pattern failed, falling"
693 	      "back to NFA\n"));
694       return REG_BADPAT;
695     }
696 
697   fg->hasdot = wfirstdot > -1;
698 
699   /*
700    * The pattern has been processed and copied to tmp as a literal string
701    * with escapes, anchors (^$) and the word boundary match character
702    * classes stripped out.
703    */
704 #ifdef TRE_WCHAR
705   SAVE_PATTERN(tmp, pos, fg->wpattern, fg->wlen);
706   fg->wescmap = _escmap;
707   STORE_MBS_PAT;
708 
709   /*
710    * The position of dots and escaped dots is different in the MB string
711    * than in to the wide string so traverse the converted string, as well,
712    * to store these positions.
713    */
714   if (fg->hasdot || (fg->wescmap != NULL))
715     {
716       if (fg->wescmap != NULL)
717 	{
718 	  fg->escmap = xmalloc(fg->len * sizeof(bool));
719 	  if (!fg->escmap)
720 	    {
721 	      tre_free_fast(fg);
722 	      return REG_ESPACE;
723 	    }
724 	}
725 
726       escaped = false;
727       for (unsigned int i = 0; i < fg->len; i++)
728 	if (fg->pattern[i] == '\\')
729 	  escaped = !escaped;
730 	else if (fg->pattern[i] == '.' && fg->escmap && escaped)
731 	  {
732 	    fg->escmap[i] = true;
733 	    escaped = false;
734 	  }
735 	else if (fg->pattern[i] == '.' && !escaped)
736 	  {
737 	    hasdot = i;
738 	    if (firstdot == -1)
739 	      firstdot = i;
740 	  }
741 	else
742 	  escaped = false;
743     }
744 #else
745   SAVE_PATTERN(tmp, pos, fg->pattern, fg->len);
746   fg->escmap = _escmap;
747 #endif
748 
749   xfree(tmp);
750 
751   DPRINT(("tre_compile_fast: pattern: %s, len %zu, bol %c, eol %c, "
752 	 "icase: %c, word: %c, newline %c\n", fg->pattern, fg->len,
753 	 fg->bol ? 'y' : 'n', fg->eol ? 'y' : 'n',
754 	 fg->icase ? 'y' : 'n', fg->word ? 'y' : 'n',
755 	 fg->newline ? 'y' : 'n'));
756 
757   /* Check whether reverse QS algorithm is more efficient */
758   if ((wfirstdot > -1) && (fg->wlen - whasdot + 1 < (size_t)wfirstdot) &&
759       fg->nosub)
760     {
761       fg->reversed = true;
762       DPRINT(("tre_compile_fast: using reverse QS algorithm\n"));
763     }
764 
765   FILL_QSBC;
766   FILL_BMGS;
767 #ifdef TRE_WCHAR
768   FILL_QSBC_WIDE;
769   FILL_BMGS_WIDE;
770 #endif
771 
772   return REG_OK;
773 }
774 
775 #define _SHIFT_ONE							\
776   {									\
777     shift = 1;								\
778     j = !fg->reversed ? j + shift : j - shift;				\
779     continue;								\
780   }
781 
782 #define _BBOUND_COND							\
783   ((type == STR_WIDE) ?							\
784     ((j == 0) || !(tre_isalnum(str_wide[j - 1]) ||			\
785       (str_wide[j - 1] == TRE_CHAR('_')))) :				\
786     ((j == 0) || !(tre_isalnum(str_byte[j - 1]) ||			\
787       (str_byte[j - 1] == '_'))))
788 
789 #define _EBOUND_COND							\
790   ((type == STR_WIDE) ?							\
791     ((j + fg->wlen == len) || !(tre_isalnum(str_wide[j + fg->wlen]) ||	\
792       (str_wide[j + fg->wlen] == TRE_CHAR('_')))) :			\
793     ((j + fg->len == len) || !(tre_isalnum(str_byte[j + fg->len]) ||	\
794       (str_byte[j + fg->len] == '_'))))
795 
796 /*
797  * Condition to check whether the match on position j is on a
798  * word boundary.
799  */
800 #define IS_ON_WORD_BOUNDARY						\
801   (_BBOUND_COND && _EBOUND_COND)
802 
803 /*
804  * Checks word boundary and shifts one if match is not on a
805  * boundary.
806  */
807 #define CHECK_WORD_BOUNDARY						\
808     if (!IS_ON_WORD_BOUNDARY)						\
809       _SHIFT_ONE;
810 
811 #define _BOL_COND							\
812   ((j == 0) || ((type == STR_WIDE) ? (str_wide[j - 1] == TRE_CHAR('\n'))\
813 				   : (str_byte[j - 1] == '\n')))
814 
815 /*
816  * Checks BOL anchor and shifts one if match is not on a
817  * boundary.
818  */
819 #define CHECK_BOL_ANCHOR						\
820     if (!_BOL_COND)							\
821       _SHIFT_ONE;
822 
823 #define _EOL_COND							\
824   ((type == STR_WIDE)							\
825     ? ((j + fg->wlen == len) ||						\
826 		(str_wide[j + fg->wlen] == TRE_CHAR('\n')))		\
827     : ((j + fg->len == len) || (str_byte[j + fg->wlen] == '\n')))
828 
829 /*
830  * Checks EOL anchor and shifts one if match is not on a
831  * boundary.
832  */
833 #define CHECK_EOL_ANCHOR						\
834     if (!_EOL_COND)							\
835       _SHIFT_ONE;
836 
837 /*
838  * Executes matching of the precompiled pattern on the input string.
839  * Returns REG_OK or REG_NOMATCH depending on if we find a match or not.
840  */
841 int
tre_match_fast(const fastmatch_t * fg,const void * data,size_t len,tre_str_type_t type,int nmatch,regmatch_t pmatch[],int eflags)842 tre_match_fast(const fastmatch_t *fg, const void *data, size_t len,
843     tre_str_type_t type, int nmatch, regmatch_t pmatch[], int eflags)
844 {
845   unsigned int shift, u = 0, v = 0;
846   ssize_t j = 0;
847   int ret = REG_NOMATCH;
848   int mismatch;
849   const char *str_byte = data;
850   const void *startptr = NULL;
851   const tre_char_t *str_wide = data;
852 
853   /* Calculate length if unspecified. */
854   if (len == (size_t)-1)
855     switch (type)
856       {
857 	case STR_WIDE:
858 	  len = tre_strlen(str_wide);
859 	  break;
860 	default:
861 	  len = strlen(str_byte);
862 	  break;
863       }
864 
865   /* Shortcut for empty pattern */
866   if (fg->matchall)
867     {
868       if (!fg->nosub && nmatch >= 1)
869 	{
870 	  pmatch[0].rm_so = 0;
871 	  pmatch[0].rm_eo = len;
872 	}
873       if (fg->bol && fg->eol)
874 	return (len == 0) ? REG_OK : REG_NOMATCH;
875       else
876 	return REG_OK;
877     }
878 
879   /* No point in going farther if we do not have enough data. */
880   switch (type)
881     {
882       case STR_WIDE:
883 	if (len < fg->wlen)
884 	  return ret;
885 	shift = fg->wlen;
886 	break;
887       default:
888 	if (len < fg->len)
889 	  return ret;
890 	shift = fg->len;
891     }
892 
893   /*
894    * REG_NOTBOL means not anchoring ^ to the beginning of the line, so we
895    * can shift one because there can't be a match at the beginning.
896    */
897   if (fg->bol && (eflags & REG_NOTBOL))
898     j = 1;
899 
900   /*
901    * Like above, we cannot have a match at the very end when anchoring to
902    * the end and REG_NOTEOL is specified.
903    */
904   if (fg->eol && (eflags & REG_NOTEOL))
905     len--;
906 
907   if (fg->reversed)
908     j = len - (type == STR_WIDE ? fg->wlen : fg->len);
909 
910 
911   /* Only try once at the beginning or ending of the line. */
912   if ((fg->bol || fg->eol) && !fg->newline && !(eflags & REG_NOTBOL) &&
913       !(eflags & REG_NOTEOL))
914     {
915       /* Simple text comparison. */
916       if (!((fg->bol && fg->eol) &&
917 	  (type == STR_WIDE ? (len != fg->wlen) : (len != fg->len))))
918 	{
919 	  /* Determine where in data to start search at. */
920 	  j = fg->eol ? len - (type == STR_WIDE ? fg->wlen : fg->len) : 0;
921 	  SKIP_CHARS(j);
922 	  mismatch = fastcmp(fg, startptr, type);
923 	  if (mismatch == REG_OK)
924 	    {
925 	      if (fg->word && !IS_ON_WORD_BOUNDARY)
926 		return ret;
927 	      if (!fg->nosub && nmatch >= 1)
928 		{
929 		  pmatch[0].rm_so = j;
930 		  pmatch[0].rm_eo = j + (type == STR_WIDE ? fg->wlen : fg->len);
931 		}
932 	      return REG_OK;
933             }
934         }
935     }
936   else
937     {
938       /* Quick Search / Turbo Boyer-Moore algorithm. */
939       do
940 	{
941 	  SKIP_CHARS(j);
942 	  mismatch = fastcmp(fg, startptr, type);
943 	  if (mismatch == REG_OK)
944 	    {
945 	      if (fg->word)
946 		CHECK_WORD_BOUNDARY;
947 	      if (fg->bol)
948 		CHECK_BOL_ANCHOR;
949 	      if (fg->eol)
950 		CHECK_EOL_ANCHOR;
951 	      if (!fg->nosub && nmatch >= 1)
952 		{
953 		  pmatch[0].rm_so = j;
954 		  pmatch[0].rm_eo = j + ((type == STR_WIDE) ? fg->wlen : fg->len);
955 		}
956 	      return REG_OK;
957 	    }
958 	  else if (mismatch > 0)
959 	    return mismatch;
960 	  mismatch = -mismatch - 1;
961 	  SHIFT;
962         } while (!IS_OUT_OF_BOUNDS);
963     }
964     return ret;
965 }
966 
967 /*
968  * Frees the resources that were allocated when the pattern was compiled.
969  */
970 void
tre_free_fast(fastmatch_t * fg)971 tre_free_fast(fastmatch_t *fg)
972 {
973 
974   DPRINT(("tre_fast_free: freeing structures for pattern %s\n",
975 	 fg->pattern));
976 
977 #ifdef TRE_WCHAR
978   hashtable_free(fg->qsBc_table);
979   if (!fg->hasdot)
980     xfree(fg->bmGs);
981   if (fg->wescmap)
982     xfree(fg->wescmap);
983   xfree(fg->wpattern);
984 #endif
985   if (!fg->hasdot)
986     xfree(fg->sbmGs);
987   if (fg->escmap)
988     xfree(fg->escmap);
989   xfree(fg->pattern);
990 }
991 
992 /*
993  * Returns:	-(i + 1) on failure (position that it failed with minus sign)
994  *		error code on error
995  *		REG_OK on success
996  */
997 static inline int
fastcmp(const fastmatch_t * fg,const void * data,tre_str_type_t type)998 fastcmp(const fastmatch_t *fg, const void *data, tre_str_type_t type)
999 {
1000   const char *str_byte = data;
1001   const char *pat_byte = fg->pattern;
1002   const tre_char_t *str_wide = data;
1003   const tre_char_t *pat_wide = fg->wpattern;
1004   const bool *escmap = (type == STR_WIDE) ? fg->wescmap : fg->escmap;
1005   size_t len = (type == STR_WIDE) ? fg->wlen : fg->len;
1006   int ret = REG_OK;
1007 
1008   /* Compare the pattern and the input char-by-char from the last position. */
1009   for (int i = len - 1; i >= 0; i--) {
1010     switch (type)
1011       {
1012 	case STR_WIDE:
1013 
1014 	  /* Check dot */
1015 	  if (fg->hasdot && pat_wide[i] == TRE_CHAR('.') &&
1016 	      (!escmap || !escmap[i]) &&
1017 	      (!fg->newline || (str_wide[i] != TRE_CHAR('\n'))))
1018 	    continue;
1019 
1020 	  /* Compare */
1021 	  if (fg->icase ? (towlower(pat_wide[i]) == towlower(str_wide[i]))
1022 		    : (pat_wide[i] == str_wide[i]))
1023 	    continue;
1024 	  break;
1025 	default:
1026 	  /* Check dot */
1027 	  if (fg->hasdot && pat_byte[i] == '.' &&
1028 	      (!escmap || !escmap[i]) &&
1029 	      (!fg->newline || (str_byte[i] != '\n')))
1030 	    continue;
1031 
1032 	  /* Compare */
1033 	  if (fg->icase ? (tolower((unsigned char)pat_byte[i]) == tolower((unsigned char)str_byte[i]))
1034 		    : (pat_byte[i] == str_byte[i]))
1035 	  continue;
1036       }
1037     DPRINT(("fastcmp: mismatch at position %d\n", i));
1038     ret = -(i + 1);
1039     break;
1040   }
1041   return ret;
1042 }
1043