1 /*
2   tre-match-backtrack.c - TRE backtracking regex matching engine
3 
4   This software is released under a BSD-style license.
5   See the file LICENSE for details and copyright.
6 
7 */
8 
9 /*
10   This matcher is for regexps that use back referencing.  Regexp matching
11   with back referencing is an NP-complete problem on the number of back
12   references.  The easiest way to match them is to use a backtracking
13   routine which basically goes through all possible paths in the TNFA
14   and chooses the one which results in the best (leftmost and longest)
15   match.  This can be spectacularly expensive and may run out of stack
16   space, but there really is no better known generic algorithm.        Quoting
17   Henry Spencer from comp.compilers:
18   <URL: http://compilers.iecc.com/comparch/article/93-03-102>
19 
20     POSIX.2 REs require longest match, which is really exciting to
21     implement since the obsolete ("basic") variant also includes
22     \<digit>.  I haven't found a better way of tackling this than doing
23     a preliminary match using a DFA (or simulation) on a modified RE
24     that just replicates subREs for \<digit>, and then doing a
25     backtracking match to determine whether the subRE matches were
26     right.  This can be rather slow, but I console myself with the
27     thought that people who use \<digit> deserve very slow execution.
28     (Pun unintentional but very appropriate.)
29 
30 */
31 
32 
33 #ifdef HAVE_CONFIG_H
34 #include <config.h>
35 #endif /* HAVE_CONFIG_H */
36 
37 #include "tre-alloca.h"
38 
39 #include <assert.h>
40 #include <stdlib.h>
41 #include <string.h>
42 #ifdef HAVE_WCHAR_H
43 #include <wchar.h>
44 #endif /* HAVE_WCHAR_H */
45 #ifdef HAVE_WCTYPE_H
46 #include <wctype.h>
47 #endif /* HAVE_WCTYPE_H */
48 #ifndef TRE_WCHAR
49 #include <ctype.h>
50 #endif /* !TRE_WCHAR */
51 #ifdef HAVE_MALLOC_H
52 #include <malloc.h>
53 #endif /* HAVE_MALLOC_H */
54 
55 #include "tre-internal.h"
56 #include "tre-mem.h"
57 #include "tre-match-utils.h"
58 #include "tre.h"
59 #include "xmalloc.h"
60 
61 typedef struct {
62   int pos;
63   const char *str_byte;
64 #ifdef TRE_WCHAR
65   const wchar_t *str_wide;
66 #endif /* TRE_WCHAR */
67   tre_tnfa_transition_t *state;
68   int state_id;
69   int next_c;
70   int *tags;
71 #ifdef TRE_MBSTATE
72   mbstate_t mbstate;
73 #endif /* TRE_MBSTATE */
74 } tre_backtrack_item_t;
75 
76 typedef struct tre_backtrack_struct {
77   tre_backtrack_item_t item;
78   struct tre_backtrack_struct *prev;
79   struct tre_backtrack_struct *next;
80 } *tre_backtrack_t;
81 
82 #ifdef TRE_WCHAR
83 #define BT_STACK_WIDE_IN(_str_wide)     stack->item.str_wide = (_str_wide)
84 #define BT_STACK_WIDE_OUT               (str_wide) = stack->item.str_wide
85 #else /* !TRE_WCHAR */
86 #define BT_STACK_WIDE_IN(_str_wide)
87 #define BT_STACK_WIDE_OUT
88 #endif /* !TRE_WCHAR */
89 
90 #ifdef TRE_MBSTATE
91 #define BT_STACK_MBSTATE_IN  stack->item.mbstate = (mbstate)
92 #define BT_STACK_MBSTATE_OUT (mbstate) = stack->item.mbstate
93 #else /* !TRE_MBSTATE */
94 #define BT_STACK_MBSTATE_IN
95 #define BT_STACK_MBSTATE_OUT
96 #endif /* !TRE_MBSTATE */
97 
98 
99 #ifdef TRE_USE_ALLOCA
100 #define tre_bt_mem_new                    tre_mem_newa
101 #define tre_bt_mem_alloc        tre_mem_alloca
102 #define tre_bt_mem_destroy(obj)           do { } while (/*CONSTCOND*/(void)0,0)
103 #else /* !TRE_USE_ALLOCA */
104 #define tre_bt_mem_new                    tre_mem_new
105 #define tre_bt_mem_alloc        tre_mem_alloc
106 #define tre_bt_mem_destroy      tre_mem_destroy
107 #endif /* !TRE_USE_ALLOCA */
108 
109 
110 #define BT_STACK_PUSH(_pos, _str_byte, _str_wide, _state, _state_id, _next_c, _tags, _mbstate) \
111   do                                                                                            \
112     {                                                                                           \
113       size_t i;                                                                                 \
114       if (!stack->next)                                                                         \
115           {                                                                                     \
116             tre_backtrack_t s;                                                                  \
117             s = tre_bt_mem_alloc(mem, sizeof(*s));                                    \
118             if (!s)                                                                   \
119               {                                                                                 \
120                 tre_bt_mem_destroy(mem);                                                        \
121                 if (tags)                                                                       \
122                     xfree(tags);                                                                \
123                 if (pmatch)                                                           \
124                     xfree(pmatch);                                                              \
125                 if (states_seen)                                                                \
126                     xfree(states_seen);                                               \
127                 return REG_ESPACE;                                                    \
128               }                                                                                 \
129             s->prev = stack;                                                          \
130             s->next = NULL;                                                           \
131             s->item.tags = tre_bt_mem_alloc(mem,                                      \
132                                                     sizeof(*tags) * tnfa->num_tags);    \
133             if (!s->item.tags)                                                                  \
134               {                                                                                 \
135                 tre_bt_mem_destroy(mem);                                                        \
136                 if (tags)                                                                       \
137                     xfree(tags);                                                                \
138                 if (pmatch)                                                           \
139                     xfree(pmatch);                                                              \
140                 if (states_seen)                                                                \
141                     xfree(states_seen);                                               \
142                 return REG_ESPACE;                                                    \
143               }                                                                                 \
144             stack->next = s;                                                          \
145             stack = s;                                                                          \
146           }                                                                                     \
147       else                                                                                      \
148           stack = stack->next;                                                                  \
149       stack->item.pos = (_pos);                                                                 \
150       stack->item.str_byte = (_str_byte);                                             \
151       BT_STACK_WIDE_IN(_str_wide);                                                    \
152       stack->item.state = (_state);                                                   \
153       stack->item.state_id = (_state_id);                                             \
154       stack->item.next_c = (_next_c);                                                 \
155       for (i = 0; i < tnfa->num_tags; i++)                                            \
156           stack->item.tags[i] = (_tags)[i];                                           \
157       BT_STACK_MBSTATE_IN;                                                            \
158     }                                                                                           \
159   while (/*CONSTCOND*/(void)0,0)
160 
161 #define BT_STACK_POP()                                                                          \
162   do                                                                                            \
163     {                                                                                           \
164       size_t i;                                                                                 \
165       assert(stack->prev);                                                            \
166       pos = stack->item.pos;                                                          \
167       if (type == STR_USER)                                                   \
168         str_source->rewind(pos + pos_add_next, str_source->context);          \
169       str_byte = stack->item.str_byte;                                                \
170       BT_STACK_WIDE_OUT;                                                              \
171       state = stack->item.state;                                                      \
172       next_c = (tre_char_t) stack->item.next_c;                                                 \
173       for (i = 0; i < tnfa->num_tags; i++)                                            \
174           tags[i] = stack->item.tags[i];                                                        \
175       BT_STACK_MBSTATE_OUT;                                                           \
176       stack = stack->prev;                                                            \
177     }                                                                                           \
178   while (/*CONSTCOND*/(void)0,0)
179 
180 #undef MIN
181 #define MIN(a, b) ((a) <= (b) ? (a) : (b))
182 
183 reg_errcode_t
tre_tnfa_run_backtrack(const tre_tnfa_t * tnfa,const void * string,int len,tre_str_type_t type,int * match_tags,int eflags,int * match_end_ofs)184 tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
185                            int len, tre_str_type_t type, int *match_tags,
186                            int eflags, int *match_end_ofs)
187 {
188   /* State variables required by GET_NEXT_WCHAR. */
189   tre_char_t prev_c = 0, next_c = 0;
190   const char *str_byte = string;
191   int pos = 0;
192   unsigned int pos_add_next = 1;
193 #ifdef TRE_WCHAR
194   const wchar_t *str_wide = string;
195 #ifdef TRE_MBSTATE
196   mbstate_t mbstate;
197 #endif /* TRE_MBSTATE */
198 #endif /* TRE_WCHAR */
199   int reg_notbol = eflags & REG_NOTBOL;
200   int reg_noteol = eflags & REG_NOTEOL;
201   int reg_newline = tnfa->cflags & REG_NEWLINE;
202   size_t str_user_end = 0;
203 
204   /* These are used to remember the necessary values of the above
205      variables to return to the position where the current search
206      started from. */
207   int next_c_start;
208   const char *str_byte_start;
209   int pos_start = -1;
210 #ifdef TRE_WCHAR
211   const wchar_t *str_wide_start;
212 #endif /* TRE_WCHAR */
213 #ifdef TRE_MBSTATE
214   mbstate_t mbstate_start;
215 #endif /* TRE_MBSTATE */
216 
217   /* End offset of best match so far, or -1 if no match found yet. */
218   int match_eo = -1;
219   /* Tag arrays. */
220   int *next_tags, *tags = NULL;
221   /* Current TNFA state. */
222   tre_tnfa_transition_t *state;
223   int *states_seen = NULL;
224 
225   /* Memory allocator to for allocating the backtracking stack. */
226   tre_mem_t mem = tre_bt_mem_new();
227 
228   /* The backtracking stack. */
229   tre_backtrack_t stack;
230 
231   tre_tnfa_transition_t *trans_i;
232   regmatch_t *pmatch = NULL;
233   reg_errcode_t ret;
234 
235 #ifdef TRE_MBSTATE
236   memset(&mbstate, '\0', sizeof(mbstate));
237 #endif /* TRE_MBSTATE */
238 
239   if (!mem)
240     return REG_ESPACE;
241   stack = tre_bt_mem_alloc(mem, sizeof(*stack));
242   if (!stack)
243     {
244       ret = REG_ESPACE;
245       goto error_exit;
246     }
247   stack->prev = NULL;
248   stack->next = NULL;
249 
250   DPRINT(("tnfa_execute_backtrack, input type %d\n", type));
251   DPRINT(("len = %d\n", len));
252 
253 #ifdef TRE_USE_ALLOCA
254   tags = alloca(sizeof(*tags) * tnfa->num_tags);
255   pmatch = alloca(sizeof(*pmatch) * tnfa->num_submatches);
256   states_seen = alloca(sizeof(*states_seen) * tnfa->num_states);
257 #else /* !TRE_USE_ALLOCA */
258   if (tnfa->num_tags)
259     {
260       tags = xmalloc(sizeof(*tags) * tnfa->num_tags);
261       if (!tags)
262           {
263             ret = REG_ESPACE;
264             goto error_exit;
265           }
266     }
267   if (tnfa->num_submatches)
268     {
269       pmatch = xmalloc(sizeof(*pmatch) * tnfa->num_submatches);
270       if (!pmatch)
271           {
272             ret = REG_ESPACE;
273             goto error_exit;
274           }
275     }
276   if (tnfa->num_states)
277     {
278       states_seen = xmalloc(sizeof(*states_seen) * tnfa->num_states);
279       if (!states_seen)
280           {
281             ret = REG_ESPACE;
282             goto error_exit;
283           }
284     }
285 #endif /* !TRE_USE_ALLOCA */
286 
287  retry:
288   {
289     size_t i;
290     for (i = 0; i < tnfa->num_tags; i++)
291       {
292           tags[i] = -1;
293           if (match_tags)
294             match_tags[i] = -1;
295       }
296     for (i = 0; i < tnfa->num_states; i++)
297       states_seen[i] = 0;
298   }
299 
300   state = NULL;
301   pos = pos_start;
302   if (type == STR_USER)
303     str_source->rewind(pos + pos_add_next, str_source->context);
304   GET_NEXT_WCHAR();
305   pos_start = pos;
306   next_c_start = next_c;
307   str_byte_start = str_byte;
308 #ifdef TRE_WCHAR
309   str_wide_start = str_wide;
310 #endif /* TRE_WCHAR */
311 #ifdef TRE_MBSTATE
312   mbstate_start = mbstate;
313 #endif /* TRE_MBSTATE */
314 
315   /* Handle initial states. */
316   next_tags = NULL;
317   for (trans_i = tnfa->initial; trans_i->state; trans_i++)
318     {
319       DPRINT(("> init %p, prev_c %lc\n", trans_i->state, (tre_cint_t)prev_c));
320       if (trans_i->assertions && CHECK_ASSERTIONS(trans_i->assertions))
321           {
322             DPRINT(("assert failed\n"));
323             continue;
324           }
325       if (state == NULL)
326           {
327             /* Start from this state. */
328             state = trans_i->state;
329             next_tags = trans_i->tags;
330           }
331       else
332           {
333             /* Backtrack to this state. */
334             DPRINT(("saving state %d for backtracking\n", trans_i->state_id));
335             BT_STACK_PUSH(pos, str_byte, str_wide, trans_i->state,
336                               trans_i->state_id, next_c, tags, mbstate);
337             {
338               int *tmp = trans_i->tags;
339               if (tmp)
340                 while (*tmp >= 0)
341                     stack->item.tags[*tmp++] = pos;
342             }
343           }
344     }
345 
346   if (next_tags)
347     for (; *next_tags >= 0; next_tags++)
348       tags[*next_tags] = pos;
349 
350 
351   DPRINT(("entering match loop, pos %d, str_byte %p\n", pos, str_byte));
352   DPRINT(("pos:chr/code | state and tags\n"));
353   DPRINT(("-------------+------------------------------------------------\n"));
354 
355   if (state == NULL)
356     goto backtrack;
357 
358   while (/*CONSTCOND*/(void)1,1)
359     {
360       tre_tnfa_transition_t *next_state;
361       int empty_br_match;
362 
363       DPRINT(("start loop\n"));
364       if (state == tnfa->final)
365           {
366             DPRINT(("  match found, %d %d\n", match_eo, pos));
367             if (match_eo < pos
368                 || (match_eo == pos
369                       && match_tags
370                       && tre_tag_order(tnfa->num_tags, tnfa->tag_directions,
371                                            tags, match_tags)))
372               {
373                 size_t i;
374                 /* This match wins the previous match. */
375                 DPRINT(("      win previous\n"));
376                 match_eo = pos;
377                 if (match_tags)
378                     for (i = 0; i < tnfa->num_tags; i++)
379                       match_tags[i] = tags[i];
380               }
381             /* Our TNFAs never have transitions leaving from the final state,
382                so we jump right to backtracking. */
383             goto backtrack;
384           }
385 
386 #ifdef TRE_DEBUG
387       DPRINT(("%3d:%2lc/%05d | %p ", pos, (tre_cint_t)next_c, (int)next_c,
388                 state));
389       {
390           int i;
391           for (i = 0; i < tnfa->num_tags; i++)
392             DPRINT(("%d%s", tags[i], i < tnfa->num_tags - 1 ? ", " : ""));
393           DPRINT(("\n"));
394       }
395 #endif /* TRE_DEBUG */
396 
397       /* Go to the next character in the input string. */
398       empty_br_match = 0;
399       trans_i = state;
400       if (trans_i->state && trans_i->assertions & ASSERT_BACKREF)
401           {
402             /* This is a back reference state.  All transitions leaving from
403                this state have the same back reference "assertion".  Instead
404                of reading the next character, we match the back reference. */
405             regoff_t so, eo, bt = trans_i->u.backref;
406             regoff_t bt_len;
407             regoff_t result;
408 
409             DPRINT(("  should match back reference %d\n", bt));
410             /* Get the substring we need to match against.  Remember to
411                turn off REG_NOSUB temporarily. */
412             tre_fill_pmatch(bt + 1, pmatch, tnfa->cflags & ~REG_NOSUB,
413                                 tnfa, tags, pos);
414             /* LINTED */so = pmatch[bt].rm_so;
415             /* LINTED */eo = pmatch[bt].rm_eo;
416             bt_len = eo - so;
417 
418 #ifdef TRE_DEBUG
419             {
420               int slen;
421               if (len < 0)
422                 slen = bt_len;
423               else
424                 slen = MIN(bt_len, len - pos);
425 
426               if (type == STR_BYTE)
427                 {
428                     DPRINT(("  substring (len %d) is [%d, %d[: '%.*s'\n",
429                               bt_len, so, eo, bt_len, (char*)string + so));
430                     DPRINT(("  current string is '%.*s'\n", slen, str_byte - 1));
431                 }
432 #ifdef TRE_WCHAR
433               else if (type == STR_WIDE)
434                 {
435                     DPRINT(("  substring (len %d) is [%d, %d[: '%.*" STRF "'\n",
436                               bt_len, so, eo, bt_len, (wchar_t*)string + so));
437                     DPRINT(("  current string is '%.*" STRF "'\n",
438                               slen, str_wide - 1));
439                 }
440 #endif /* TRE_WCHAR */
441             }
442 #endif
443 
444             if (len < 0)
445               {
446                 if (type == STR_USER)
447                     result = str_source->compare((unsigned)so, (unsigned)pos,
448                                                        (unsigned)bt_len,
449                                                        str_source->context);
450 #ifdef TRE_WCHAR
451                 else if (type == STR_WIDE)
452                     result = wcsncmp((const wchar_t*)string + so, str_wide - 1,
453                                          (size_t)bt_len);
454 #endif /* TRE_WCHAR */
455                 else
456                     /* LINTED */result = strncmp((const char*)string + so, str_byte - 1,
457                                          (size_t)bt_len);
458               }
459             else if (len - pos < bt_len)
460               result = 1;
461 #ifdef TRE_WCHAR
462             else if (type == STR_WIDE)
463               result = wmemcmp((const wchar_t*)string + so, str_wide - 1,
464                                    (size_t)bt_len);
465 #endif /* TRE_WCHAR */
466             else
467               /* LINTED */result = memcmp((const char*)string + so, str_byte - 1,
468                                   (size_t)bt_len);
469 
470             if (result == 0)
471               {
472                 /* Back reference matched.  Check for infinite loop. */
473                 if (bt_len == 0)
474                     empty_br_match = 1;
475                 if (empty_br_match && states_seen[trans_i->state_id])
476                     {
477                       DPRINT(("  avoid loop\n"));
478                       goto backtrack;
479                     }
480 
481                 states_seen[trans_i->state_id] = empty_br_match;
482 
483                 /* Advance in input string and resync `prev_c', `next_c'
484                      and pos. */
485                 DPRINT(("      back reference matched\n"));
486                 /* LINTED */str_byte += bt_len - 1;
487 #ifdef TRE_WCHAR
488                 /* LINTED */str_wide += bt_len - 1;
489 #endif /* TRE_WCHAR */
490                 /* LINTED */pos += bt_len - 1;
491                 GET_NEXT_WCHAR();
492                 DPRINT(("      pos now %d\n", pos));
493               }
494             else
495               {
496                 DPRINT(("      back reference did not match\n"));
497                 goto backtrack;
498               }
499           }
500       else
501           {
502             /* Check for end of string. */
503             if (len < 0)
504               {
505                 if (type == STR_USER)
506                     {
507                       if (str_user_end)
508                         goto backtrack;
509                     }
510                 else if (next_c == L'\0')
511                     goto backtrack;
512               }
513             else
514               {
515                 if (pos >= len)
516                     goto backtrack;
517               }
518 
519             /* Read the next character. */
520             GET_NEXT_WCHAR();
521           }
522 
523       next_state = NULL;
524       for (trans_i = state; trans_i->state; trans_i++)
525           {
526             DPRINT(("  transition %d-%d (%c-%c) %d to %d\n",
527                       trans_i->code_min, trans_i->code_max,
528                       trans_i->code_min, trans_i->code_max,
529                       trans_i->assertions, trans_i->state_id));
530             if (trans_i->code_min <= (tre_cint_t)prev_c
531                 && trans_i->code_max >= (tre_cint_t)prev_c)
532               {
533                 if (trans_i->assertions
534                       && (CHECK_ASSERTIONS(trans_i->assertions)
535                           || CHECK_CHAR_CLASSES(trans_i, tnfa, eflags)))
536                     {
537                       DPRINT(("  assertion failed\n"));
538                       continue;
539                     }
540 
541                 if (next_state == NULL)
542                     {
543                       /* First matching transition. */
544                       DPRINT(("  Next state is %d\n", trans_i->state_id));
545                       next_state = trans_i->state;
546                       next_tags = trans_i->tags;
547                     }
548                 else
549                     {
550                       /* Second matching transition.  We may need to backtrack here
551                          to take this transition instead of the first one, so we
552                          push this transition in the backtracking stack so we can
553                          jump back here if needed. */
554                       DPRINT(("  saving state %d for backtracking\n",
555                                 trans_i->state_id));
556                       BT_STACK_PUSH(pos, str_byte, str_wide, trans_i->state,
557                                         trans_i->state_id, next_c, tags, mbstate);
558                       {
559                         int *tmp;
560                         for (tmp = trans_i->tags; tmp && *tmp >= 0; tmp++)
561                           stack->item.tags[*tmp] = pos;
562                       }
563 #if 0 /* XXX - it's important not to look at all transitions here to keep
564            the stack small! */
565                       break;
566 #endif
567                     }
568               }
569           }
570 
571       if (next_state != NULL)
572           {
573             /* Matching transitions were found.  Take the first one. */
574             state = next_state;
575 
576             /* Update the tag values. */
577             if (next_tags)
578               while (*next_tags >= 0)
579                 tags[*next_tags++] = pos;
580           }
581       else
582           {
583           backtrack:
584             /* A matching transition was not found.  Try to backtrack. */
585             if (stack->prev)
586               {
587                 DPRINT(("      backtracking\n"));
588 #if ASSERT_BACKREF
589                 if (stack->item.state->assertions)
590                     {
591                       DPRINT(("  states_seen[%d] = 0\n",
592                                 stack->item.state_id));
593                       states_seen[stack->item.state_id] = 0;
594                     }
595 #endif
596 
597                 BT_STACK_POP();
598               }
599             else if (match_eo < 0)
600               {
601                 /* Try starting from a later position in the input string. */
602                 /* Check for end of string. */
603                 if (len < 0)
604                     {
605                       if (next_c == L'\0')
606                         {
607                           DPRINT(("end of string.\n"));
608                           break;
609                         }
610                     }
611                 else
612                     {
613                       if (pos >= len)
614                         {
615                           DPRINT(("end of string.\n"));
616                           break;
617                         }
618                     }
619                 DPRINT(("restarting from next start position\n"));
620                 next_c = (tre_char_t) next_c_start;
621 #ifdef TRE_MBSTATE
622                 mbstate = mbstate_start;
623 #endif /* TRE_MBSTATE */
624                 str_byte = str_byte_start;
625 #ifdef TRE_WCHAR
626                 str_wide = str_wide_start;
627 #endif /* TRE_WCHAR */
628                 goto retry;
629               }
630             else
631               {
632                 DPRINT(("finished\n"));
633                 break;
634               }
635           }
636     }
637 
638   ret = match_eo >= 0 ? REG_OK : REG_NOMATCH;
639   *match_end_ofs = match_eo;
640 
641  error_exit:
642   tre_bt_mem_destroy(mem);
643 #ifndef TRE_USE_ALLOCA
644   if (tags)
645     xfree(tags);
646   if (pmatch)
647     xfree(pmatch);
648   if (states_seen)
649     xfree(states_seen);
650 #endif /* !TRE_USE_ALLOCA */
651 
652   return ret;
653 }
654