1 /* $FreeBSD$ */
2
3 /*-
4 * Copyright (c) 1999 James Howard and Dag-Erling Coïdan Smørgrav
5 * Copyright (C) 2008-2011 Gabor Kovesdan <gabor@FreeBSD.org>
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29
30 #include "glue.h"
31
32 #include <ctype.h>
33 #include <limits.h>
34 #include <regex.h>
35 #include <stdbool.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #ifdef TRE_WCHAR
39 #include <wchar.h>
40 #include <wctype.h>
41 #endif
42
43 #include "hashtable.h"
44 #include "tre-fastmatch.h"
45 #include "xmalloc.h"
46
47 static int fastcmp(const fastmatch_t *fg, const void *data,
48 tre_str_type_t type);
49
50 /*
51 * Clean up if pattern compilation fails.
52 */
53 #define FAIL_COMP(errcode) \
54 { \
55 if (fg->pattern) \
56 xfree(fg->pattern); \
57 if (fg->wpattern) \
58 xfree(fg->wpattern); \
59 if (fg->qsBc_table) \
60 hashtable_free(fg->qsBc_table); \
61 fg = NULL; \
62 return errcode; \
63 }
64
65 /*
66 * Skips n characters in the input string and assigns the start
67 * address to startptr. Note: as per IEEE Std 1003.1-2008
68 * matching is based on bit pattern not character representations
69 * so we can handle MB strings as byte sequences just like
70 * SB strings.
71 */
72 #define SKIP_CHARS(n) \
73 switch (type) \
74 { \
75 case STR_WIDE: \
76 startptr = str_wide + n; \
77 break; \
78 default: \
79 startptr = str_byte + n; \
80 }
81
82 /*
83 * Converts the wide string pattern to SB/MB string and stores
84 * it in fg->pattern. Sets fg->len to the byte length of the
85 * converted string.
86 */
87 #define STORE_MBS_PAT \
88 { \
89 size_t siz; \
90 \
91 siz = wcstombs(NULL, fg->wpattern, 0); \
92 if (siz == (size_t)-1) \
93 return REG_BADPAT; \
94 fg->len = siz; \
95 fg->pattern = xmalloc(siz + 1); \
96 if (fg->pattern == NULL) \
97 return REG_ESPACE; \
98 wcstombs(fg->pattern, fg->wpattern, siz); \
99 fg->pattern[siz] = '\0'; \
100 } \
101
102 #define IS_OUT_OF_BOUNDS \
103 ((!fg->reversed \
104 ? ((type == STR_WIDE) ? ((j + fg->wlen) > len) \
105 : ((j + fg->len) > len)) \
106 : (j < 0)))
107
108 /*
109 * Checks whether the new position after shifting in the input string
110 * is out of the bounds and break out from the loop if so.
111 */
112 #define CHECKBOUNDS \
113 if (IS_OUT_OF_BOUNDS) \
114 break; \
115
116 /*
117 * Shifts in the input string after a mismatch. The position of the
118 * mismatch is stored in the mismatch variable.
119 */
120 #define SHIFT \
121 CHECKBOUNDS; \
122 \
123 { \
124 int r = -1; \
125 unsigned int bc = 0, gs = 0, ts; \
126 \
127 switch (type) \
128 { \
129 case STR_WIDE: \
130 if (!fg->hasdot) \
131 { \
132 if (u != 0 && (unsigned)mismatch == fg->wlen - 1 - shift) \
133 mismatch -= u; \
134 v = fg->wlen - 1 - mismatch; \
135 r = hashtable_get(fg->qsBc_table, \
136 &str_wide[!fg->reversed ? (size_t)j + fg->wlen \
137 : (size_t)j - 1], &bc); \
138 gs = fg->bmGs[mismatch]; \
139 } \
140 bc = (r == HASH_OK) ? bc : fg->defBc; \
141 DPRINT(("tre_fast_match: mismatch on character" CHF ", " \
142 "BC %d, GS %d\n", \
143 ((const tre_char_t *)startptr)[mismatch + 1], \
144 bc, gs)); \
145 break; \
146 default: \
147 if (!fg->hasdot) \
148 { \
149 if (u != 0 && (unsigned)mismatch == fg->len - 1 - shift) \
150 mismatch -= u; \
151 v = fg->len - 1 - mismatch; \
152 gs = fg->sbmGs[mismatch]; \
153 } \
154 bc = fg->qsBc[((const unsigned char *)str_byte) \
155 [!fg->reversed ? (size_t)j + fg->len \
156 : (size_t)j - 1]]; \
157 DPRINT(("tre_fast_match: mismatch on character %c, " \
158 "BC %d, GS %d\n", \
159 ((const unsigned char *)startptr)[mismatch + 1], \
160 bc, gs)); \
161 } \
162 if (fg->hasdot) \
163 shift = bc; \
164 else \
165 { \
166 ts = (u >= v) ? (u - v) : 0; \
167 shift = MAX(ts, bc); \
168 shift = MAX(shift, gs); \
169 if (shift == gs) \
170 u = MIN((type == STR_WIDE ? fg->wlen : fg->len) - shift, v); \
171 else \
172 { \
173 if (ts < bc) \
174 shift = MAX(shift, u + 1); \
175 u = 0; \
176 } \
177 } \
178 DPRINT(("tre_fast_match: shifting %u characters\n", shift)); \
179 j = !fg->reversed ? j + shift : j - shift; \
180 }
181
182 /*
183 * Normal Quick Search would require a shift based on the position the
184 * next character after the comparison is within the pattern. With
185 * wildcards, the position of the last dot effects the maximum shift
186 * distance.
187 * The closer to the end the wild card is the slower the search.
188 *
189 * Examples:
190 * Pattern Max shift
191 * ------- ---------
192 * this 5
193 * .his 4
194 * t.is 3
195 * th.s 2
196 * thi. 1
197 */
198
199 /*
200 * Fills in the bad character shift array for SB/MB strings.
201 */
202 #define FILL_QSBC \
203 if (fg->reversed) \
204 { \
205 _FILL_QSBC_REVERSED \
206 } \
207 else \
208 { \
209 _FILL_QSBC \
210 }
211
212 #define _FILL_QSBC \
213 for (unsigned int i = 0; i <= UCHAR_MAX; i++) \
214 fg->qsBc[i] = fg->len - hasdot; \
215 for (unsigned int i = hasdot + 1; i < fg->len; i++) \
216 { \
217 fg->qsBc[(unsigned char)fg->pattern[i]] = fg->len - i; \
218 DPRINT(("BC shift for char %c is %zu\n", fg->pattern[i], \
219 fg->len - i)); \
220 if (fg->icase) \
221 { \
222 char c = islower((unsigned char)fg->pattern[i]) ? \
223 toupper((unsigned char)fg->pattern[i]) : \
224 tolower((unsigned char)fg->pattern[i]); \
225 fg->qsBc[(unsigned char)c] = fg->len - i; \
226 DPRINT(("BC shift for char %c is %zu\n", c, fg->len - i)); \
227 } \
228 }
229
230 #define _FILL_QSBC_REVERSED \
231 for (unsigned int i = 0; i <= UCHAR_MAX; i++) \
232 fg->qsBc[i] = firstdot + 1; \
233 for (int i = firstdot - 1; i >= 0; i--) \
234 { \
235 fg->qsBc[(unsigned char)fg->pattern[i]] = i + 1; \
236 DPRINT(("Reverse BC shift for char %c is %d\n", fg->pattern[i], \
237 i + 1)); \
238 if (fg->icase) \
239 { \
240 char c = islower((unsigned char)fg->pattern[i]) ? \
241 toupper((unsigned char)fg->pattern[i]) : \
242 tolower((unsigned char)fg->pattern[i]); \
243 fg->qsBc[(unsigned char)c] = i + 1; \
244 DPRINT(("Reverse BC shift for char %c is %d\n", c, i + 1)); \
245 } \
246 }
247
248 /*
249 * Fills in the bad character shifts into a hastable for wide strings.
250 * With wide characters it is not possible any more to use a normal
251 * array because there are too many characters and we could not
252 * provide enough memory. Fortunately, we only have to store distinct
253 * values for so many characters as the number of distinct characters
254 * in the pattern, so we can store them in a hashtable and store a
255 * default shift value for the rest.
256 */
257 #define FILL_QSBC_WIDE \
258 if (fg->reversed) \
259 { \
260 _FILL_QSBC_WIDE_REVERSED \
261 } \
262 else \
263 { \
264 _FILL_QSBC_WIDE \
265 }
266
267 #define _FILL_QSBC_WIDE \
268 /* Adjust the shift based on location of the last dot ('.'). */ \
269 fg->defBc = fg->wlen - whasdot; \
270 \
271 /* Preprocess pattern. */ \
272 fg->qsBc_table = hashtable_init(fg->wlen * (fg->icase ? 8 : 4), \
273 sizeof(tre_char_t), sizeof(int)); \
274 if (!fg->qsBc_table) \
275 FAIL_COMP(REG_ESPACE); \
276 for (unsigned int i = whasdot + 1; i < fg->wlen; i++) \
277 { \
278 int k = fg->wlen - i; \
279 int r; \
280 \
281 r = hashtable_put(fg->qsBc_table, &fg->wpattern[i], &k); \
282 if ((r == HASH_FAIL) || (r == HASH_FULL)) \
283 FAIL_COMP(REG_ESPACE); \
284 DPRINT(("BC shift for wide char " CHF " is %d\n", fg->wpattern[i],\
285 k)); \
286 if (fg->icase) \
287 { \
288 tre_char_t wc = iswlower(fg->wpattern[i]) ? \
289 towupper(fg->wpattern[i]) : towlower(fg->wpattern[i]); \
290 r = hashtable_put(fg->qsBc_table, &wc, &k); \
291 if ((r == HASH_FAIL) || (r == HASH_FULL)) \
292 FAIL_COMP(REG_ESPACE); \
293 DPRINT(("BC shift for wide char " CHF " is %d\n", wc, k)); \
294 } \
295 }
296
297 #define _FILL_QSBC_WIDE_REVERSED \
298 /* Adjust the shift based on location of the last dot ('.'). */ \
299 fg->defBc = (size_t)wfirstdot; \
300 \
301 /* Preprocess pattern. */ \
302 fg->qsBc_table = hashtable_init(fg->wlen * (fg->icase ? 8 : 4), \
303 sizeof(tre_char_t), sizeof(int)); \
304 if (!fg->qsBc_table) \
305 FAIL_COMP(REG_ESPACE); \
306 for (int i = wfirstdot - 1; i >= 0; i--) \
307 { \
308 int k = i + 1; \
309 int r; \
310 \
311 r = hashtable_put(fg->qsBc_table, &fg->wpattern[i], &k); \
312 if ((r == HASH_FAIL) || (r == HASH_FULL)) \
313 FAIL_COMP(REG_ESPACE); \
314 DPRINT(("Reverse BC shift for wide char " CHF " is %d\n", \
315 fg->wpattern[i], k)); \
316 if (fg->icase) \
317 { \
318 tre_char_t wc = iswlower(fg->wpattern[i]) ? \
319 towupper(fg->wpattern[i]) : towlower(fg->wpattern[i]); \
320 r = hashtable_put(fg->qsBc_table, &wc, &k); \
321 if ((r == HASH_FAIL) || (r == HASH_FULL)) \
322 FAIL_COMP(REG_ESPACE); \
323 DPRINT(("Reverse BC shift for wide char " CHF " is %d\n", wc, \
324 k)); \
325 } \
326 }
327
328 #ifdef _GREP_DEBUG
329 #define DPRINT_BMGS(len, fmt_str, sh) \
330 for (unsigned int i = 0; i < len; i++) \
331 DPRINT((fmt_str, i, sh[i]));
332 #else
333 #define DPRINT_BMGS(len, fmt_str, sh) \
334 do { } while(/*CONSTCOND*/0)
335 #endif
336
337 /*
338 * Fills in the good suffix table for SB/MB strings.
339 */
340 #define FILL_BMGS \
341 if (!fg->hasdot) \
342 { \
343 fg->sbmGs = xmalloc(fg->len * sizeof(int)); \
344 if (!fg->sbmGs) \
345 return REG_ESPACE; \
346 if (fg->len == 1) \
347 fg->sbmGs[0] = 1; \
348 else \
349 _FILL_BMGS(fg->sbmGs, fg->pattern, fg->len, false); \
350 DPRINT_BMGS(fg->len, "GS shift for pos %d is %d\n", fg->sbmGs); \
351 }
352
353 /*
354 * Fills in the good suffix table for wide strings.
355 */
356 #define FILL_BMGS_WIDE \
357 if (!fg->hasdot) \
358 { \
359 fg->bmGs = xmalloc(fg->wlen * sizeof(int)); \
360 if (!fg->bmGs) \
361 return REG_ESPACE; \
362 if (fg->wlen == 1) \
363 fg->bmGs[0] = 1; \
364 else \
365 _FILL_BMGS(fg->bmGs, fg->wpattern, fg->wlen, true); \
366 DPRINT_BMGS(fg->wlen, "GS shift (wide) for pos %d is %d\n", \
367 fg->bmGs); \
368 }
369
370 #define _FILL_BMGS(arr, pat, plen, wide) \
371 { \
372 char *p; \
373 tre_char_t *wp; \
374 \
375 if (wide) \
376 { \
377 if (fg->icase) \
378 { \
379 wp = xmalloc(plen * sizeof(tre_char_t)); \
380 if (wp == NULL) \
381 return REG_ESPACE; \
382 for (unsigned int i = 0; i < plen; i++) \
383 wp[i] = towlower(pat[i]); \
384 _CALC_BMGS(arr, wp, plen); \
385 xfree(wp); \
386 } \
387 else \
388 _CALC_BMGS(arr, pat, plen); \
389 } \
390 else \
391 { \
392 if (fg->icase) \
393 { \
394 p = xmalloc(plen); \
395 if (p == NULL) \
396 return REG_ESPACE; \
397 for (unsigned int i = 0; i < plen; i++) \
398 p[i] = tolower((unsigned char)pat[i]); \
399 _CALC_BMGS(arr, p, plen); \
400 xfree(p); \
401 } \
402 else \
403 _CALC_BMGS(arr, pat, plen); \
404 } \
405 }
406
407 #define _CALC_BMGS(arr, pat, plen) \
408 { \
409 int f = 0, g; \
410 \
411 int *suff = xmalloc(plen * sizeof(int)); \
412 if (suff == NULL) \
413 return REG_ESPACE; \
414 \
415 suff[plen - 1] = plen; \
416 g = plen - 1; \
417 for (int i = plen - 2; i >= 0; i--) \
418 { \
419 if (i > g && suff[i + plen - 1 - f] < i - g) \
420 suff[i] = suff[i + plen - 1 - f]; \
421 else \
422 { \
423 if (i < g) \
424 g = i; \
425 f = i; \
426 while (g >= 0 && pat[g] == pat[g + plen - 1 - f]) \
427 g--; \
428 suff[i] = f - g; \
429 } \
430 } \
431 \
432 for (unsigned int i = 0; i < plen; i++) \
433 arr[i] = plen; \
434 g = 0; \
435 for (int i = plen - 1; i >= 0; i--) \
436 if (suff[i] == i + 1) \
437 for(; (unsigned long)g < plen - 1 - i; g++) \
438 if (arr[g] == plen) \
439 arr[g] = plen - 1 - i; \
440 for (unsigned int i = 0; i <= plen - 2; i++) \
441 arr[plen - 1 - suff[i]] = plen - 1 - i; \
442 \
443 xfree(suff); \
444 }
445
446 /*
447 * Copies the pattern pat having length n to p and stores
448 * the size in l.
449 */
450 #define SAVE_PATTERN(src, srclen, dst, dstlen) \
451 dstlen = srclen; \
452 dst = xmalloc((dstlen + 1) * sizeof(tre_char_t)); \
453 if (dst == NULL) \
454 return REG_ESPACE; \
455 if (dstlen > 0) \
456 memcpy(dst, src, dstlen * sizeof(tre_char_t)); \
457 dst[dstlen] = TRE_CHAR('\0');
458
459 /*
460 * Initializes pattern compiling.
461 */
462 #define INIT_COMP \
463 /* Initialize. */ \
464 memset(fg, 0, sizeof(*fg)); \
465 fg->icase = (cflags & REG_ICASE); \
466 fg->word = (cflags & REG_WORD); \
467 fg->newline = (cflags & REG_NEWLINE); \
468 fg->nosub = (cflags & REG_NOSUB); \
469 \
470 /* Cannot handle REG_ICASE with MB string */ \
471 if (fg->icase && (TRE_MB_CUR_MAX > 1) && n > 0) \
472 { \
473 DPRINT(("Cannot use fast matcher for MBS with REG_ICASE\n")); \
474 return REG_BADPAT; \
475 }
476
477 /*
478 * Checks whether we have a 0-length pattern that will match
479 * anything. If literal is set to false, the EOL anchor is also
480 * taken into account.
481 */
482 #define CHECK_MATCHALL(literal) \
483 if (!literal && n == 1 && pat[0] == TRE_CHAR('$')) \
484 { \
485 n--; \
486 fg->eol = true; \
487 } \
488 \
489 if (n == 0) \
490 { \
491 fg->matchall = true; \
492 fg->pattern = xmalloc(sizeof(char)); \
493 if (!fg->pattern) \
494 FAIL_COMP(REG_ESPACE); \
495 fg->pattern[0] = '\0'; \
496 fg->wpattern = xmalloc(sizeof(tre_char_t)); \
497 if (!fg->wpattern) \
498 FAIL_COMP(REG_ESPACE); \
499 fg->wpattern[0] = TRE_CHAR('\0'); \
500 DPRINT(("Matching every input\n")); \
501 return REG_OK; \
502 }
503
504 /*
505 * Returns: REG_OK on success, error code otherwise
506 */
507 int
tre_compile_literal(fastmatch_t * fg,const tre_char_t * pat,size_t n,int cflags)508 tre_compile_literal(fastmatch_t *fg, const tre_char_t *pat, size_t n,
509 int cflags)
510 {
511 size_t hasdot = 0, whasdot = 0;
512 ssize_t firstdot = -1, wfirstdot = -1;
513
514 INIT_COMP;
515
516 CHECK_MATCHALL(true);
517
518 /* Cannot handle word boundaries with MB string */
519 if (fg->word && (TRE_MB_CUR_MAX > 1))
520 return REG_BADPAT;
521
522 #ifdef TRE_WCHAR
523 SAVE_PATTERN(pat, n, fg->wpattern, fg->wlen);
524 STORE_MBS_PAT;
525 #else
526 SAVE_PATTERN(pat, n, fg->pattern, fg->len);
527 #endif
528
529 DPRINT(("tre_compile_literal: pattern: %s, len %zu, icase: %c, word: %c, "
530 "newline %c\n", fg->pattern, fg->len, fg->icase ? 'y' : 'n',
531 fg->word ? 'y' : 'n', fg->newline ? 'y' : 'n'));
532
533 FILL_QSBC;
534 FILL_BMGS;
535 #ifdef TRE_WCHAR
536 FILL_QSBC_WIDE;
537 FILL_BMGS_WIDE;
538 #endif
539
540 return REG_OK;
541 }
542
543 /*
544 * Returns: REG_OK on success, error code otherwise
545 */
546 int
tre_compile_fast(fastmatch_t * fg,const tre_char_t * pat,size_t n,int cflags)547 tre_compile_fast(fastmatch_t *fg, const tre_char_t *pat, size_t n,
548 int cflags)
549 {
550 tre_char_t *tmp;
551 size_t pos = 0, hasdot = 0, whasdot = 0;
552 ssize_t firstdot = -1, wfirstdot = -1;
553 bool escaped = false;
554 bool *_escmap = NULL;
555
556 INIT_COMP;
557
558 /* Remove beginning-of-line character ('^'). */
559 if (pat[0] == TRE_CHAR('^'))
560 {
561 fg->bol = true;
562 n--;
563 pat++;
564 }
565
566 CHECK_MATCHALL(false);
567
568 /* Handle word-boundary matching when GNU extensions are enabled */
569 if ((cflags & REG_GNU) && (n >= 14) &&
570 (memcmp(pat, TRE_CHAR("[[:<:]]"), 7 * sizeof(tre_char_t)) == 0) &&
571 (memcmp(pat + n - 7, TRE_CHAR("[[:>:]]"),
572 7 * sizeof(tre_char_t)) == 0))
573 {
574 n -= 14;
575 pat += 7;
576 fg->word = true;
577 }
578
579 /* Cannot handle word boundaries with MB string */
580 if (fg->word && (TRE_MB_CUR_MAX > 1))
581 return REG_BADPAT;
582
583 tmp = xmalloc((n + 1) * sizeof(tre_char_t));
584 if (tmp == NULL)
585 return REG_ESPACE;
586
587 /* Copies the char into the stored pattern and skips to the next char. */
588 #define STORE_CHAR \
589 do \
590 { \
591 tmp[pos++] = pat[i]; \
592 escaped = false; \
593 continue; \
594 } while (0)
595
596 /* Traverse the input pattern for processing */
597 for (unsigned int i = 0; i < n; i++)
598 {
599 switch (pat[i])
600 {
601 case TRE_CHAR('\\'):
602 if (escaped)
603 STORE_CHAR;
604 else if (i == n - 1)
605 goto badpat;
606 else
607 escaped = true;
608 continue;
609 case TRE_CHAR('['):
610 if (escaped)
611 STORE_CHAR;
612 else
613 goto badpat;
614 continue;
615 case TRE_CHAR('*'):
616 if (escaped || (!(cflags & REG_EXTENDED) && (i == 0)))
617 STORE_CHAR;
618 else
619 goto badpat;
620 continue;
621 case TRE_CHAR('+'):
622 case TRE_CHAR('?'):
623 if ((cflags & REG_EXTENDED) && (i == 0))
624 continue;
625 else if ((cflags & REG_EXTENDED) ^ !escaped)
626 STORE_CHAR;
627 else
628 goto badpat;
629 continue;
630 case TRE_CHAR('.'):
631 if (escaped)
632 {
633 if (!_escmap)
634 _escmap = xmalloc(n * sizeof(bool));
635 if (!_escmap)
636 {
637 xfree(tmp);
638 return REG_ESPACE;
639 }
640 _escmap[i] = true;
641 STORE_CHAR;
642 }
643 else
644 {
645 whasdot = i;
646 if (wfirstdot == -1)
647 wfirstdot = i;
648 STORE_CHAR;
649 }
650 continue;
651 case TRE_CHAR('^'):
652 STORE_CHAR;
653 continue;
654 case TRE_CHAR('$'):
655 if (!escaped && (i == n - 1))
656 fg->eol = true;
657 else
658 STORE_CHAR;
659 continue;
660 case TRE_CHAR('('):
661 if ((cflags & REG_EXTENDED) ^ escaped)
662 goto badpat;
663 else
664 STORE_CHAR;
665 continue;
666 case TRE_CHAR('{'):
667 if (!(cflags & REG_EXTENDED) ^ escaped)
668 STORE_CHAR;
669 else if (!(cflags & REG_EXTENDED) && (i == 0))
670 STORE_CHAR;
671 else if ((cflags & REG_EXTENDED) && (i == 0))
672 continue;
673 else
674 goto badpat;
675 continue;
676 case TRE_CHAR('|'):
677 if ((cflags & REG_EXTENDED) ^ escaped)
678 goto badpat;
679 else
680 STORE_CHAR;
681 continue;
682 default:
683 if (escaped)
684 goto badpat;
685 else
686 STORE_CHAR;
687 continue;
688 }
689 continue;
690 badpat:
691 xfree(tmp);
692 DPRINT(("tre_compile_fast: compilation of pattern failed, falling"
693 "back to NFA\n"));
694 return REG_BADPAT;
695 }
696
697 fg->hasdot = wfirstdot > -1;
698
699 /*
700 * The pattern has been processed and copied to tmp as a literal string
701 * with escapes, anchors (^$) and the word boundary match character
702 * classes stripped out.
703 */
704 #ifdef TRE_WCHAR
705 SAVE_PATTERN(tmp, pos, fg->wpattern, fg->wlen);
706 fg->wescmap = _escmap;
707 STORE_MBS_PAT;
708
709 /*
710 * The position of dots and escaped dots is different in the MB string
711 * than in to the wide string so traverse the converted string, as well,
712 * to store these positions.
713 */
714 if (fg->hasdot || (fg->wescmap != NULL))
715 {
716 if (fg->wescmap != NULL)
717 {
718 fg->escmap = xmalloc(fg->len * sizeof(bool));
719 if (!fg->escmap)
720 {
721 tre_free_fast(fg);
722 return REG_ESPACE;
723 }
724 }
725
726 escaped = false;
727 for (unsigned int i = 0; i < fg->len; i++)
728 if (fg->pattern[i] == '\\')
729 escaped = !escaped;
730 else if (fg->pattern[i] == '.' && fg->escmap && escaped)
731 {
732 fg->escmap[i] = true;
733 escaped = false;
734 }
735 else if (fg->pattern[i] == '.' && !escaped)
736 {
737 hasdot = i;
738 if (firstdot == -1)
739 firstdot = i;
740 }
741 else
742 escaped = false;
743 }
744 #else
745 SAVE_PATTERN(tmp, pos, fg->pattern, fg->len);
746 fg->escmap = _escmap;
747 #endif
748
749 xfree(tmp);
750
751 DPRINT(("tre_compile_fast: pattern: %s, len %zu, bol %c, eol %c, "
752 "icase: %c, word: %c, newline %c\n", fg->pattern, fg->len,
753 fg->bol ? 'y' : 'n', fg->eol ? 'y' : 'n',
754 fg->icase ? 'y' : 'n', fg->word ? 'y' : 'n',
755 fg->newline ? 'y' : 'n'));
756
757 /* Check whether reverse QS algorithm is more efficient */
758 if ((wfirstdot > -1) && (fg->wlen - whasdot + 1 < (size_t)wfirstdot) &&
759 fg->nosub)
760 {
761 fg->reversed = true;
762 DPRINT(("tre_compile_fast: using reverse QS algorithm\n"));
763 }
764
765 FILL_QSBC;
766 FILL_BMGS;
767 #ifdef TRE_WCHAR
768 FILL_QSBC_WIDE;
769 FILL_BMGS_WIDE;
770 #endif
771
772 return REG_OK;
773 }
774
775 #define _SHIFT_ONE \
776 { \
777 shift = 1; \
778 j = !fg->reversed ? j + shift : j - shift; \
779 continue; \
780 }
781
782 #define _BBOUND_COND \
783 ((type == STR_WIDE) ? \
784 ((j == 0) || !(tre_isalnum(str_wide[j - 1]) || \
785 (str_wide[j - 1] == TRE_CHAR('_')))) : \
786 ((j == 0) || !(tre_isalnum(str_byte[j - 1]) || \
787 (str_byte[j - 1] == '_'))))
788
789 #define _EBOUND_COND \
790 ((type == STR_WIDE) ? \
791 ((j + fg->wlen == len) || !(tre_isalnum(str_wide[j + fg->wlen]) || \
792 (str_wide[j + fg->wlen] == TRE_CHAR('_')))) : \
793 ((j + fg->len == len) || !(tre_isalnum(str_byte[j + fg->len]) || \
794 (str_byte[j + fg->len] == '_'))))
795
796 /*
797 * Condition to check whether the match on position j is on a
798 * word boundary.
799 */
800 #define IS_ON_WORD_BOUNDARY \
801 (_BBOUND_COND && _EBOUND_COND)
802
803 /*
804 * Checks word boundary and shifts one if match is not on a
805 * boundary.
806 */
807 #define CHECK_WORD_BOUNDARY \
808 if (!IS_ON_WORD_BOUNDARY) \
809 _SHIFT_ONE;
810
811 #define _BOL_COND \
812 ((j == 0) || ((type == STR_WIDE) ? (str_wide[j - 1] == TRE_CHAR('\n'))\
813 : (str_byte[j - 1] == '\n')))
814
815 /*
816 * Checks BOL anchor and shifts one if match is not on a
817 * boundary.
818 */
819 #define CHECK_BOL_ANCHOR \
820 if (!_BOL_COND) \
821 _SHIFT_ONE;
822
823 #define _EOL_COND \
824 ((type == STR_WIDE) \
825 ? ((j + fg->wlen == len) || \
826 (str_wide[j + fg->wlen] == TRE_CHAR('\n'))) \
827 : ((j + fg->len == len) || (str_byte[j + fg->wlen] == '\n')))
828
829 /*
830 * Checks EOL anchor and shifts one if match is not on a
831 * boundary.
832 */
833 #define CHECK_EOL_ANCHOR \
834 if (!_EOL_COND) \
835 _SHIFT_ONE;
836
837 /*
838 * Executes matching of the precompiled pattern on the input string.
839 * Returns REG_OK or REG_NOMATCH depending on if we find a match or not.
840 */
841 int
tre_match_fast(const fastmatch_t * fg,const void * data,size_t len,tre_str_type_t type,int nmatch,regmatch_t pmatch[],int eflags)842 tre_match_fast(const fastmatch_t *fg, const void *data, size_t len,
843 tre_str_type_t type, int nmatch, regmatch_t pmatch[], int eflags)
844 {
845 unsigned int shift, u = 0, v = 0;
846 ssize_t j = 0;
847 int ret = REG_NOMATCH;
848 int mismatch;
849 const char *str_byte = data;
850 const void *startptr = NULL;
851 const tre_char_t *str_wide = data;
852
853 /* Calculate length if unspecified. */
854 if (len == (size_t)-1)
855 switch (type)
856 {
857 case STR_WIDE:
858 len = tre_strlen(str_wide);
859 break;
860 default:
861 len = strlen(str_byte);
862 break;
863 }
864
865 /* Shortcut for empty pattern */
866 if (fg->matchall)
867 {
868 if (!fg->nosub && nmatch >= 1)
869 {
870 pmatch[0].rm_so = 0;
871 pmatch[0].rm_eo = len;
872 }
873 if (fg->bol && fg->eol)
874 return (len == 0) ? REG_OK : REG_NOMATCH;
875 else
876 return REG_OK;
877 }
878
879 /* No point in going farther if we do not have enough data. */
880 switch (type)
881 {
882 case STR_WIDE:
883 if (len < fg->wlen)
884 return ret;
885 shift = fg->wlen;
886 break;
887 default:
888 if (len < fg->len)
889 return ret;
890 shift = fg->len;
891 }
892
893 /*
894 * REG_NOTBOL means not anchoring ^ to the beginning of the line, so we
895 * can shift one because there can't be a match at the beginning.
896 */
897 if (fg->bol && (eflags & REG_NOTBOL))
898 j = 1;
899
900 /*
901 * Like above, we cannot have a match at the very end when anchoring to
902 * the end and REG_NOTEOL is specified.
903 */
904 if (fg->eol && (eflags & REG_NOTEOL))
905 len--;
906
907 if (fg->reversed)
908 j = len - (type == STR_WIDE ? fg->wlen : fg->len);
909
910
911 /* Only try once at the beginning or ending of the line. */
912 if ((fg->bol || fg->eol) && !fg->newline && !(eflags & REG_NOTBOL) &&
913 !(eflags & REG_NOTEOL))
914 {
915 /* Simple text comparison. */
916 if (!((fg->bol && fg->eol) &&
917 (type == STR_WIDE ? (len != fg->wlen) : (len != fg->len))))
918 {
919 /* Determine where in data to start search at. */
920 j = fg->eol ? len - (type == STR_WIDE ? fg->wlen : fg->len) : 0;
921 SKIP_CHARS(j);
922 mismatch = fastcmp(fg, startptr, type);
923 if (mismatch == REG_OK)
924 {
925 if (fg->word && !IS_ON_WORD_BOUNDARY)
926 return ret;
927 if (!fg->nosub && nmatch >= 1)
928 {
929 pmatch[0].rm_so = j;
930 pmatch[0].rm_eo = j + (type == STR_WIDE ? fg->wlen : fg->len);
931 }
932 return REG_OK;
933 }
934 }
935 }
936 else
937 {
938 /* Quick Search / Turbo Boyer-Moore algorithm. */
939 do
940 {
941 SKIP_CHARS(j);
942 mismatch = fastcmp(fg, startptr, type);
943 if (mismatch == REG_OK)
944 {
945 if (fg->word)
946 CHECK_WORD_BOUNDARY;
947 if (fg->bol)
948 CHECK_BOL_ANCHOR;
949 if (fg->eol)
950 CHECK_EOL_ANCHOR;
951 if (!fg->nosub && nmatch >= 1)
952 {
953 pmatch[0].rm_so = j;
954 pmatch[0].rm_eo = j + ((type == STR_WIDE) ? fg->wlen : fg->len);
955 }
956 return REG_OK;
957 }
958 else if (mismatch > 0)
959 return mismatch;
960 mismatch = -mismatch - 1;
961 SHIFT;
962 } while (!IS_OUT_OF_BOUNDS);
963 }
964 return ret;
965 }
966
967 /*
968 * Frees the resources that were allocated when the pattern was compiled.
969 */
970 void
tre_free_fast(fastmatch_t * fg)971 tre_free_fast(fastmatch_t *fg)
972 {
973
974 DPRINT(("tre_fast_free: freeing structures for pattern %s\n",
975 fg->pattern));
976
977 #ifdef TRE_WCHAR
978 hashtable_free(fg->qsBc_table);
979 if (!fg->hasdot)
980 xfree(fg->bmGs);
981 if (fg->wescmap)
982 xfree(fg->wescmap);
983 xfree(fg->wpattern);
984 #endif
985 if (!fg->hasdot)
986 xfree(fg->sbmGs);
987 if (fg->escmap)
988 xfree(fg->escmap);
989 xfree(fg->pattern);
990 }
991
992 /*
993 * Returns: -(i + 1) on failure (position that it failed with minus sign)
994 * error code on error
995 * REG_OK on success
996 */
997 static inline int
fastcmp(const fastmatch_t * fg,const void * data,tre_str_type_t type)998 fastcmp(const fastmatch_t *fg, const void *data, tre_str_type_t type)
999 {
1000 const char *str_byte = data;
1001 const char *pat_byte = fg->pattern;
1002 const tre_char_t *str_wide = data;
1003 const tre_char_t *pat_wide = fg->wpattern;
1004 const bool *escmap = (type == STR_WIDE) ? fg->wescmap : fg->escmap;
1005 size_t len = (type == STR_WIDE) ? fg->wlen : fg->len;
1006 int ret = REG_OK;
1007
1008 /* Compare the pattern and the input char-by-char from the last position. */
1009 for (int i = len - 1; i >= 0; i--) {
1010 switch (type)
1011 {
1012 case STR_WIDE:
1013
1014 /* Check dot */
1015 if (fg->hasdot && pat_wide[i] == TRE_CHAR('.') &&
1016 (!escmap || !escmap[i]) &&
1017 (!fg->newline || (str_wide[i] != TRE_CHAR('\n'))))
1018 continue;
1019
1020 /* Compare */
1021 if (fg->icase ? (towlower(pat_wide[i]) == towlower(str_wide[i]))
1022 : (pat_wide[i] == str_wide[i]))
1023 continue;
1024 break;
1025 default:
1026 /* Check dot */
1027 if (fg->hasdot && pat_byte[i] == '.' &&
1028 (!escmap || !escmap[i]) &&
1029 (!fg->newline || (str_byte[i] != '\n')))
1030 continue;
1031
1032 /* Compare */
1033 if (fg->icase ? (tolower((unsigned char)pat_byte[i]) == tolower((unsigned char)str_byte[i]))
1034 : (pat_byte[i] == str_byte[i]))
1035 continue;
1036 }
1037 DPRINT(("fastcmp: mismatch at position %d\n", i));
1038 ret = -(i + 1);
1039 break;
1040 }
1041 return ret;
1042 }
1043