1 /*
2  *    Stack-less Just-In-Time compiler
3  *
4  *    Copyright Zoltan Herczeg (hzmester@freemail.hu). All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without modification, are
7  * permitted provided that the following conditions are met:
8  *
9  *   1. Redistributions of source code must retain the above copyright notice, this list of
10  *      conditions and the following disclaimer.
11  *
12  *   2. Redistributions in binary form must reproduce the above copyright notice, this list
13  *      of conditions and the following disclaimer in the documentation and/or other materials
14  *      provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) AND CONTRIBUTORS ``AS IS'' AND ANY
17  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
19  * SHALL THE COPYRIGHT HOLDER(S) OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
20  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
21  * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
22  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
23  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
24  * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  */
26 
27 #include "sljitLir.h"
28 #include "regexJIT.h"
29 
30 #include <stdlib.h>
31 
32 #ifdef REGEX_MATCH_VERBOSE
33 #include <stdio.h>
34 #endif
35 
36 /* Extra, hidden flags:
37    {id!} where id > 0 found in the code. */
38 #define REGEX_ID_CHECK                  0x100
39 /* When REGEX_NEWLINE && REGEX_MATCH_BEGIN defined, the pattern turn to a normal search,
40    which starts with [\r\n] character range. */
41 #define REGEX_FAKE_MATCH_BEGIN          0x200
42 /* When REGEX_NEWLINE && REGEX_MATCH_END defined, the pattern turn to a normal search,
43    which ends with [\r\n] character range. */
44 #define REGEX_FAKE_MATCH_END  0x400
45 
46 /* Check match completition after every (FINISH_TEST + 1) steps. */
47 #define FINISH_TEST 0x7
48 
49 /* --------------------------------------------------------------------- */
50 /*  Structures for JIT-ed pattern matching                               */
51 /* --------------------------------------------------------------------- */
52 
53 struct regex_machine
54 {
55           /* flags. */
56           int flags;
57           /* Number of state descriptors for one term. */
58           sljit_sw no_states;
59           /* Total size. */
60           sljit_sw size;
61 
62           union {
63                     void *init_match;
64                     sljit_sw (SLJIT_CALL *call_init)(void *next, void* match);
65           } u;
66 #if (defined SLJIT_INDIRECT_CALL && SLJIT_INDIRECT_CALL)
67           struct sljit_function_context context;
68 #endif
69 
70           void *continue_match;
71 
72           /* Variable sized array to contain the handler addresses. */
73           sljit_uw entry_addrs[1];
74 };
75 
76 struct regex_match
77 {
78           /* Current and next state array. */
79           sljit_sw *current;
80           sljit_sw *next;
81           /* Starting. */
82           sljit_sw head;
83           /* String character index (ever increasing). */
84           sljit_sw index;
85           /* Best match found so far (members in priority order). */
86           sljit_sw best_begin;
87           sljit_sw best_end;
88           sljit_sw best_id;
89           /* Bool flags (encoded as word). */
90           sljit_sw fast_quit;
91           sljit_sw fast_forward;
92           /* Machine. */
93           struct regex_machine *machine;
94 
95           union {
96                     void *continue_match;
97                     void (SLJIT_CALL *call_continue)(struct regex_match *match, const regex_char_t *input_string, int length);
98           } u;
99 
100           /* Variable sized array to contain the state arrays. */
101           sljit_sw states[1];
102 };
103 
104 /* State vector
105     ITEM[0] - pointer to the address inside the machine code
106     ITEM[1] - next pointer
107     ITEM[2] - string started from (optional)
108     ITEM[3] - max ID (optional) */
109 
110 /* Register allocation. */
111 /* Current state array (loaded & stored: regex_match->current). */
112 #define R_CURR_STATE          SLJIT_S0
113 /* Next state array (loaded & stored: regex_match->next). */
114 #define R_NEXT_STATE          SLJIT_S1
115 /* Head (loaded & stored: regex_match->head). */
116 #define R_NEXT_HEAD SLJIT_S2
117 /* String fragment pointer. */
118 #define R_STRING    SLJIT_S3
119 /* String fragment length. */
120 #define R_LENGTH    SLJIT_S4
121 /* 'struct regex_match*' */
122 #define R_REGEX_MATCH         SLJIT_R0
123 /* Current character. */
124 #define R_CURR_CHAR SLJIT_R1
125 /* Temporary register. */
126 #define R_TEMP                SLJIT_R2
127 /* Caches the regex_match->best_begin. */
128 #define R_BEST_BEGIN          SLJIT_R3
129 /* Current character index. */
130 #define R_CURR_INDEX          SLJIT_R4
131 
132 /* --------------------------------------------------------------------- */
133 /*  Stack management                                                     */
134 /* --------------------------------------------------------------------- */
135 
136 /* Try to allocate 2^n blocks. */
137 #define STACK_FRAGMENT_SIZE (((64 * sizeof(struct stack_item)) - (sizeof(struct stack_fragment_data))) / (sizeof(struct stack_item)))
138 
139 struct stack_item {
140           int type;
141           int value;
142 };
143 
144 struct stack_fragment_data {
145           struct stack_fragment *next;
146           struct stack_fragment *prev;
147 };
148 
149 struct stack_fragment {
150           struct stack_fragment_data data;
151           struct stack_item items[STACK_FRAGMENT_SIZE];
152 };
153 
154 struct stack {
155           struct stack_fragment *first;
156           struct stack_fragment *last;
157           int index;
158           int count;
159 };
160 
161 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
162 
stack_check(struct stack * stack)163 static void stack_check(struct stack *stack)
164 {
165           struct stack_fragment *curr;
166           int found;
167 
168           if (!stack)
169                     return;
170 
171           SLJIT_ASSERT(stack->index >= 0 && stack->index < STACK_FRAGMENT_SIZE);
172 
173           if (stack->first == NULL) {
174                     SLJIT_ASSERT(stack->first == NULL && stack->last == NULL);
175                     SLJIT_ASSERT(stack->index == STACK_FRAGMENT_SIZE - 1 && stack->count == 0);
176                     return;
177           }
178 
179           found = 0;
180           if (stack->last == NULL) {
181                     SLJIT_ASSERT(stack->index == STACK_FRAGMENT_SIZE - 1 && stack->count == 0);
182                     found = 1;
183           }
184           else
185                     SLJIT_ASSERT(stack->index >= 0 && stack->count >= 0);
186 
187           SLJIT_ASSERT(stack->first->data.prev == NULL);
188           curr = stack->first;
189           while (curr) {
190                     if (curr == stack->last)
191                               found = 1;
192                     if (curr->data.next)
193                               SLJIT_ASSERT(curr->data.next->data.prev == curr);
194                     curr = curr->data.next;
195           }
196           SLJIT_ASSERT(found);
197 }
198 
199 #endif
200 
stack_init(struct stack * stack)201 static void stack_init(struct stack *stack)
202 {
203           stack->first = NULL;
204           stack->last = NULL;
205           stack->index = STACK_FRAGMENT_SIZE - 1;
206           stack->count = 0;
207 }
208 
stack_destroy(struct stack * stack)209 static void stack_destroy(struct stack *stack)
210 {
211           struct stack_fragment *curr = stack->first;
212           struct stack_fragment *prev;
213 
214 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
215           stack_check(stack);
216 #endif
217 
218           while (curr) {
219                     prev = curr;
220                     curr = curr->data.next;
221                     SLJIT_FREE(prev, NULL);
222           }
223 }
224 
stack_top(struct stack * stack)225 static SLJIT_INLINE struct stack_item* stack_top(struct stack *stack)
226 {
227           SLJIT_ASSERT(stack->last);
228           return stack->last->items + stack->index;
229 }
230 
stack_push(struct stack * stack,int type,int value)231 static int stack_push(struct stack *stack, int type, int value)
232 {
233           if (stack->last) {
234                     stack->index++;
235                     if (stack->index >= STACK_FRAGMENT_SIZE) {
236                               stack->index = 0;
237                               if (!stack->last->data.next) {
238                                         stack->last->data.next = (struct stack_fragment*)SLJIT_MALLOC(sizeof(struct stack_fragment), NULL);
239                                         if (!stack->last->data.next)
240                                                   return 1;
241                                         stack->last->data.next->data.next = NULL;
242                                         stack->last->data.next->data.prev = stack->last;
243                               }
244                               stack->last = stack->last->data.next;
245                     }
246           }
247           else if (!stack->first) {
248                     stack->last = (struct stack_fragment*)SLJIT_MALLOC(sizeof(struct stack_fragment), NULL);
249                     if (!stack->last)
250                               return 1;
251                     stack->last->data.prev = NULL;
252                     stack->last->data.next = NULL;
253                     stack->first = stack->last;
254                     stack->index = 0;
255           }
256           else {
257                     stack->last = stack->first;
258                     stack->index = 0;
259           }
260           stack->last->items[stack->index].type = type;
261           stack->last->items[stack->index].value = value;
262           stack->count++;
263 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
264           stack_check(stack);
265 #endif
266           return 0;
267 }
268 
stack_pop(struct stack * stack)269 static struct stack_item* stack_pop(struct stack *stack)
270 {
271           struct stack_item *ret = stack_top(stack);
272 
273           if (stack->index > 0)
274                     stack->index--;
275           else {
276                     stack->last = stack->last->data.prev;
277                     stack->index = STACK_FRAGMENT_SIZE - 1;
278           }
279 
280           stack->count--;
281 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
282           stack_check(stack);
283 #endif
284           return ret;
285 }
286 
stack_clone(struct stack * src,struct stack * dst)287 static SLJIT_INLINE void stack_clone(struct stack *src, struct stack *dst)
288 {
289           *dst = *src;
290 }
291 
stack_push_copy(struct stack * stack,int items,int length)292 static int stack_push_copy(struct stack *stack, int items, int length)
293 {
294           struct stack_fragment *frag1;
295           int ind1;
296           struct stack_fragment *frag2;
297           int ind2;
298           int counter;
299 
300           SLJIT_ASSERT(stack->count >= length && items <= length && items > 0);
301 
302           /* Allocate the necessary elements. */
303           counter = items;
304           frag1 = stack->last;
305           ind1 = stack->index;
306           while (counter > 0) {
307                     if (stack->index + counter >= STACK_FRAGMENT_SIZE) {
308                               counter -= STACK_FRAGMENT_SIZE - stack->index - 1 + 1;
309                               stack->index = 0;
310                               if (!stack->last->data.next) {
311                                         stack->last->data.next = (struct stack_fragment*)SLJIT_MALLOC(sizeof(struct stack_fragment), NULL);
312                                         if (!stack->last->data.next)
313                                                   return 1;
314                                         stack->last->data.next->data.next = NULL;
315                                         stack->last->data.next->data.prev = stack->last;
316                               }
317                               stack->last = stack->last->data.next;
318                     }
319                     else {
320                               stack->index += counter;
321                               counter = 0;
322                     }
323           }
324 
325           frag2 = stack->last;
326           ind2 = stack->index;
327           while (length > 0) {
328                     frag2->items[ind2--] = frag1->items[ind1--];
329                     if (ind1 < 0) {
330                               ind1 = STACK_FRAGMENT_SIZE - 1;
331                               frag1 = frag1->data.prev;
332                     }
333                     if (ind2 < 0) {
334                               ind2 = STACK_FRAGMENT_SIZE - 1;
335                               frag2 = frag2->data.prev;
336                     }
337                     length--;
338           }
339 
340 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
341           stack_check(stack);
342 #endif
343           stack->count += items;
344           return 0;
345 }
346 
347 /* --------------------------------------------------------------------- */
348 /*  Parser                                                               */
349 /* --------------------------------------------------------------------- */
350 
351 enum {
352           /* Common. */
353           type_begin,
354           type_end,
355           type_char,
356           type_newline,
357           type_id,
358           type_rng_start,
359           type_rng_end,
360           type_rng_char,
361           type_rng_left,
362           type_rng_right,
363 
364           /* generator only. */
365           type_branch,
366           type_jump,
367 
368           /* Parser only. */
369           type_open_br,
370           type_close_br,
371           type_select,
372           type_asterisk,
373           type_plus_sign,
374           type_qestion_mark
375 };
376 
377 struct compiler_common {
378           /* Temporary stacks. */
379           struct stack stack;
380           struct stack depth;
381           /* REGEX_ flags. */
382           int flags;
383           /* Encoded size of the dfa representation. */
384           sljit_sw dfa_size;
385           /* Number of terms. */
386           sljit_sw terms_size;
387           /* Number of state descriptors for one term (same as machine->no_states). */
388           sljit_sw no_states;
389           /* Number of type_rng_(char|left)-s in the longest character range. */
390           sljit_sw longest_range_size;
391 
392           /* DFA linear representation (size: dfa_size). */
393           struct stack_item *dfa_transitions;
394           /* Term id and search state pairs (size: dfa_size). */
395           struct stack_item *search_states;
396 
397           /* sljit compiler */
398           struct sljit_compiler *compiler;
399           /* Machine data, which must be kept for later use. */
400           struct regex_machine *machine;
401           /* Temporary space for jumps (size: longest_range_size). */
402           struct sljit_jump **range_jump_list;
403 };
404 
decode_number(const regex_char_t * regex_string,int length,int * result)405 static const regex_char_t* decode_number(const regex_char_t *regex_string, int length, int *result)
406 {
407           int value = 0;
408 
409           SLJIT_ASSERT(length > 0);
410           if (*regex_string < '0' || *regex_string > '9') {
411                     *result = -1;
412                     return regex_string;
413           }
414 
415           while (length > 0 && *regex_string >= '0' && *regex_string <= '9') {
416                     value = value * 10 + (*regex_string - '0');
417                     length--;
418                     regex_string++;
419           }
420 
421           *result = value;
422           return regex_string;
423 }
424 
iterate(struct stack * stack,int min,int max)425 static int iterate(struct stack *stack, int min, int max)
426 {
427           struct stack it;
428           struct stack_item *item;
429           int count = -1;
430           int len = 0;
431           int depth = 0;
432 
433           stack_clone(stack, &it);
434 
435           /* Calculate size. */
436           while (count < 0) {
437                     item = stack_pop(&it);
438                     switch (item->type) {
439                     case type_id:
440                     case type_rng_end:
441                     case type_rng_char:
442                     case type_rng_left:
443                     case type_rng_right:
444                     case type_plus_sign:
445                     case type_qestion_mark:
446                               len++;
447                               break;
448 
449                     case type_asterisk:
450                               len += 2;
451                               break;
452 
453                     case type_close_br:
454                               depth++;
455                               break;
456 
457                     case type_open_br:
458                               SLJIT_ASSERT(depth > 0);
459                               depth--;
460                               if (depth == 0)
461                                         count = it.count;
462                               break;
463 
464                     case type_select:
465                               SLJIT_ASSERT(depth > 0);
466                               len += 2;
467                               break;
468 
469                     default:
470                               SLJIT_ASSERT(item->type != type_begin && item->type != type_end);
471                               if (depth == 0)
472                                         count = it.count;
473                               len++;
474                               break;
475                     }
476           }
477 
478           if (min == 0 && max == 0) {
479                     /* {0,0} case, not {0,} case: delete subtree. */
480                     stack_clone(&it, stack);
481                     /* and put an empty bracket expression instead of it. */
482                     if (stack_push(stack, type_open_br, 0))
483                               return REGEX_MEMORY_ERROR;
484                     if (stack_push(stack, type_close_br, 0))
485                               return REGEX_MEMORY_ERROR;
486                     return len;
487           }
488 
489           count = stack->count - count;
490 
491           /* Put an open bracket before the sequence. */
492           if (stack_push_copy(stack, 1, count))
493                     return -1;
494 
495 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
496           SLJIT_ASSERT(stack_push(&it, type_open_br, 0) == 0);
497 #else
498           stack_push(&it, type_open_br, 0);
499 #endif
500 
501           /* Copy the data. */
502           if (max > 0) {
503                     len = len * (max - 1);
504                     max -= min;
505                     /* Insert ? operators. */
506                     len += max;
507 
508                     if (min > 0) {
509                               min--;
510                               while (min > 0) {
511                                         if (stack_push_copy(stack, count, count))
512                                                   return -1;
513                                         min--;
514                               }
515                               if (max > 0) {
516                                         if (stack_push_copy(stack, count, count))
517                                                   return -1;
518                                         if (stack_push(stack, type_qestion_mark, 0))
519                                                   return REGEX_MEMORY_ERROR;
520                                         count++;
521                                         max--;
522                               }
523                     }
524                     else {
525                               SLJIT_ASSERT(max > 0);
526                               max--;
527                               count++;
528                               if (stack_push(stack, type_qestion_mark, 0))
529                                         return REGEX_MEMORY_ERROR;
530                     }
531 
532                     while (max > 0) {
533                               if (stack_push_copy(stack, count, count))
534                                         return -1;
535                               max--;
536                     }
537           }
538           else {
539                     SLJIT_ASSERT(min > 0);
540                     min--;
541                     /* Insert + operator. */
542                     len = len * min + 1;
543                     while (min > 0) {
544                               if (stack_push_copy(stack, count, count))
545                                         return -1;
546                               min--;
547                     }
548 
549                     if (stack_push(stack, type_plus_sign, 0))
550                               return REGEX_MEMORY_ERROR;
551           }
552 
553           /* Close the opened bracket. */
554           if (stack_push(stack, type_close_br, 0))
555                     return REGEX_MEMORY_ERROR;
556 
557           return len;
558 }
559 
parse_iterator(const regex_char_t * regex_string,int length,struct stack * stack,sljit_sw * dfa_size,int begin)560 static int parse_iterator(const regex_char_t *regex_string, int length, struct stack *stack, sljit_sw *dfa_size, int begin)
561 {
562           /* We only know that *regex_string == { . */
563           int val1, val2;
564           const regex_char_t *base_from = regex_string;
565           const regex_char_t *from;
566 
567           length--;
568           regex_string++;
569 
570           /* Decode left value. */
571           val2 = -1;
572           if (length == 0)
573                     return -2;
574           if (*regex_string == ',') {
575                     val1 = 0;
576                     length--;
577                     regex_string++;
578           }
579           else {
580                     from = regex_string;
581                     regex_string = decode_number(regex_string, length, &val1);
582                     if (val1 < 0)
583                               return -2;
584                     length -= regex_string - from;
585 
586                     if (length == 0)
587                               return -2;
588                     if (*regex_string == '}') {
589                               val2 = val1;
590                               if (val1 == 0)
591                                         val1 = -1;
592                     }
593                     else if (length >= 2 && *regex_string == '!' && regex_string[1] == '}') {
594                               /* Non posix extension. */
595                               if (stack_push(stack, type_id, val1))
596                                         return -1;
597                               (*dfa_size)++;
598                               return (regex_string - base_from) + 1;
599                     }
600                     else {
601                               if (*regex_string != ',')
602                                         return -2;
603                               length--;
604                               regex_string++;
605                     }
606           }
607 
608           if (begin)
609                     return -2;
610 
611           /* Decode right value. */
612           if (val2 == -1) {
613                     if (length == 0)
614                               return -2;
615                     if (*regex_string == '}')
616                               val2 = 0;
617                     else {
618                               from = regex_string;
619                               regex_string = decode_number(regex_string, length, &val2);
620                               length -= regex_string - from;
621                               if (val2 < 0 || length == 0 || *regex_string != '}' || val2 < val1)
622                                         return -2;
623                               if (val2 == 0) {
624                                         SLJIT_ASSERT(val1 == 0);
625                                         val1 = -1;
626                               }
627                     }
628           }
629 
630           /* Fast cases. */
631           if (val1 > 1 || val2 > 1) {
632                     val1 = iterate(stack, val1, val2);
633                     if (val1 < 0)
634                               return -1;
635                     *dfa_size += val1;
636           }
637           else if (val1 == 0 && val2 == 0) {
638                     if (stack_push(stack, type_asterisk, 0))
639                               return -1;
640                     *dfa_size += 2;
641           }
642           else if (val1 == 1 && val2 == 0) {
643                     if (stack_push(stack, type_plus_sign, 0))
644                               return -1;
645                     (*dfa_size)++;
646           }
647           else if (val1 == 0 && val2 == 1) {
648                     if (stack_push(stack, type_qestion_mark, 0))
649                               return -1;
650                     (*dfa_size)++;
651           }
652           else if (val1 == -1) {
653                     val1 = iterate(stack, 0, 0);
654                     if (val1 < 0)
655                               return -1;
656                     *dfa_size -= val1;
657                     SLJIT_ASSERT(*dfa_size >= 2);
658           }
659           else {
660                     /* Ignore. */
661                     SLJIT_ASSERT(val1 == 1 && val2 == 1);
662           }
663           return regex_string - base_from;
664 }
665 
parse_char_range(const regex_char_t * regex_string,int length,struct compiler_common * compiler_common)666 static int parse_char_range(const regex_char_t *regex_string, int length, struct compiler_common *compiler_common)
667 {
668           struct stack* stack = &compiler_common->stack;
669           const regex_char_t *base_from = regex_string;
670           regex_char_t left_char, right_char, tmp_char;
671 
672           length--;
673           regex_string++;
674 
675           if (length == 0)
676                     return -2;
677 
678           if (*regex_string != '^') {
679                     if (stack_push(stack, type_rng_start, 0))
680                               return -1;
681           }
682           else {
683                     length--;
684                     regex_string++;
685 
686                     if (length == 0)
687                               return -2;
688 
689                     if (stack_push(stack, type_rng_start, 1))
690                               return -1;
691           }
692           /* For both the type_rng_start & type_rng_end. */
693           compiler_common->dfa_size += 2;
694 
695           /* Range must be at least 1 character. */
696           if (*regex_string == ']') {
697                     length--;
698                     regex_string++;
699                     if (stack_push(stack, type_rng_char, ']'))
700                               return -1;
701                     compiler_common->dfa_size++;
702           }
703 
704           while (1) {
705                     if (length == 0)
706                               return -2;
707 
708                     if (*regex_string == ']')
709                               break;
710 
711                     if (*regex_string != '\\')
712                               left_char = *regex_string;
713                     else {
714                               regex_string++;
715                               length--;
716                               if (length == 0)
717                                         return -2;
718                               left_char = *regex_string;
719                     }
720                     regex_string++;
721                     length--;
722 
723                     /* Is a range here? */
724                     if (length >= 3 && *regex_string == '-' && *(regex_string + 1) != ']') {
725                               regex_string++;
726                               length--;
727 
728                               if (*regex_string != '\\')
729                                         right_char = *regex_string;
730                               else {
731                                         regex_string++;
732                                         length--;
733                                         if (length == 0)
734                                                   return -2;
735                                         right_char = *regex_string;
736                               }
737                               regex_string++;
738                               length--;
739 
740                               if (left_char > right_char) {
741                                         /* Swap if necessary. */
742                                         tmp_char = left_char;
743                                         left_char = right_char;
744                                         right_char = tmp_char;
745                               }
746 
747                               if (stack_push(stack, type_rng_left, left_char))
748                                         return -1;
749                               if (stack_push(stack, type_rng_right, right_char))
750                                         return -1;
751                               compiler_common->dfa_size += 2;
752                     }
753                     else {
754                               if (stack_push(stack, type_rng_char, left_char))
755                                         return -1;
756                               compiler_common->dfa_size++;
757                     }
758           }
759 
760           if (stack_push(stack, type_rng_end, 0))
761                     return -1;
762           return regex_string - base_from;
763 }
764 
parse(const regex_char_t * regex_string,int length,struct compiler_common * compiler_common)765 static int parse(const regex_char_t *regex_string, int length, struct compiler_common *compiler_common)
766 {
767           /* Depth of bracketed expressions. */
768           int depth = 0;
769           /* Have we already found a term? '1' if not yet. */
770           int begin = 1;
771           /* Cache stack pointer. */
772           struct stack* stack = &compiler_common->stack;
773           int tmp;
774 
775           /* Type_begin and type_end. */
776           compiler_common->dfa_size = 2;
777           stack_init(stack);
778           if (stack_push(stack, type_begin, 0))
779                     return REGEX_MEMORY_ERROR;
780 
781           if (length > 0 && *regex_string == '^') {
782                     compiler_common->flags |= REGEX_MATCH_BEGIN;
783                     length--;
784                     regex_string++;
785           }
786 
787           if ((compiler_common->flags & (REGEX_MATCH_BEGIN | REGEX_NEWLINE)) == (REGEX_MATCH_BEGIN | REGEX_NEWLINE)) {
788                     /* Replace REGEX_MATCH_BEGIN flag to REGEX_FAKE_MATCH_BEGIN */
789                     compiler_common->flags &= ~REGEX_MATCH_BEGIN;
790                     compiler_common->flags |= REGEX_FAKE_MATCH_BEGIN;
791                     /* and append a new-line search. */
792                     if (stack_push(stack, type_newline, 0))
793                               return REGEX_MEMORY_ERROR;
794                     compiler_common->dfa_size++;
795                     /* Begin intentionally kept as 1. */
796           }
797 
798           while (length > 0) {
799                     switch (*regex_string) {
800                     case '\\' :
801                               length--;
802                               regex_string++;
803                               if (length == 0)
804                                         return REGEX_INVALID_REGEX;
805                               if (stack_push(stack, type_char, *regex_string))
806                                         return REGEX_MEMORY_ERROR;
807                               begin = 0;
808                               compiler_common->dfa_size++;
809                               break;
810 
811                     case '.' :
812                               if (stack_push(stack, type_rng_start, 1))
813                                         return REGEX_MEMORY_ERROR;
814                               if (compiler_common->flags & REGEX_NEWLINE) {
815                                         if (stack_push(stack, type_rng_char, '\n'))
816                                                   return REGEX_MEMORY_ERROR;
817                                         if (stack_push(stack, type_rng_char, '\r'))
818                                                   return REGEX_MEMORY_ERROR;
819                                         compiler_common->dfa_size += 2;
820                               }
821                               if (stack_push(stack, type_rng_end, 1))
822                                         return REGEX_MEMORY_ERROR;
823                               begin = 0;
824                               compiler_common->dfa_size += 2;
825                               break;
826 
827                     case '(' :
828                               depth++;
829                               if (stack_push(stack, type_open_br, 0))
830                                         return REGEX_MEMORY_ERROR;
831                               begin = 1;
832                               break;
833 
834                     case ')' :
835                               if (depth == 0)
836                                         return REGEX_INVALID_REGEX;
837                               depth--;
838                               if (stack_push(stack, type_close_br, 0))
839                                         return REGEX_MEMORY_ERROR;
840                               begin = 0;
841                               break;
842 
843                     case '|' :
844                               if (stack_push(stack, type_select, 0))
845                                         return REGEX_MEMORY_ERROR;
846                               begin = 1;
847                               compiler_common->dfa_size += 2;
848                               break;
849 
850                     case '*' :
851                               if (begin)
852                                         return REGEX_INVALID_REGEX;
853                               if (stack_push(stack, type_asterisk, 0))
854                                         return REGEX_MEMORY_ERROR;
855                               compiler_common->dfa_size += 2;
856                               break;
857 
858                     case '?' :
859                     case '+' :
860                               if (begin)
861                                         return REGEX_INVALID_REGEX;
862                               if (stack_push(stack, (*regex_string == '+') ? type_plus_sign : type_qestion_mark, 0))
863                                         return REGEX_MEMORY_ERROR;
864                               compiler_common->dfa_size++;
865                               break;
866 
867                     case '{' :
868                               tmp = parse_iterator(regex_string, length, stack, &compiler_common->dfa_size, begin);
869 
870                               if (tmp >= 0) {
871                                         length -= tmp;
872                                         regex_string += tmp;
873                               }
874                               else if (tmp == -1)
875                                         return REGEX_MEMORY_ERROR;
876                               else {
877                                         /* Not a valid range expression. */
878                                         SLJIT_ASSERT(tmp == -2);
879                                         if (stack_push(stack, type_char, '{'))
880                                                   return REGEX_MEMORY_ERROR;
881                                         compiler_common->dfa_size++;
882                               }
883                               break;
884 
885                     case '[' :
886                               tmp = parse_char_range(regex_string, length, compiler_common);
887                               if (tmp >= 0) {
888                                         length -= tmp;
889                                         regex_string += tmp;
890                               }
891                               else if (tmp == -1)
892                                         return REGEX_MEMORY_ERROR;
893                               else {
894                                         SLJIT_ASSERT(tmp == -2);
895                                         return REGEX_INVALID_REGEX;
896                               }
897                               begin = 0;
898                               break;
899 
900                     default:
901                               if (length == 1 && *regex_string == '$') {
902                                         compiler_common->flags |= REGEX_MATCH_END;
903                                         break;
904                               }
905                               if (stack_push(stack, type_char, *regex_string))
906                                         return REGEX_MEMORY_ERROR;
907                               begin = 0;
908                               compiler_common->dfa_size++;
909                               break;
910                     }
911                     length--;
912                     regex_string++;
913           }
914 
915           if (depth != 0)
916                     return REGEX_INVALID_REGEX;
917 
918           if ((compiler_common->flags & (REGEX_MATCH_END | REGEX_NEWLINE)) == (REGEX_MATCH_END | REGEX_NEWLINE)) {
919                     /* Replace REGEX_MATCH_END flag to REGEX_FAKE_MATCH_END */
920                     compiler_common->flags &= ~REGEX_MATCH_END;
921                     compiler_common->flags |= REGEX_FAKE_MATCH_END;
922                     /* and append a new-line search. */
923                     if (stack_push(stack, type_newline, 1))
924                               return REGEX_MEMORY_ERROR;
925                     compiler_common->dfa_size++;
926                     /* Begin intentionally kept as 1. */
927           }
928 
929           if (stack_push(stack, type_end, 0))
930                     return REGEX_MEMORY_ERROR;
931 
932           return REGEX_NO_ERROR;
933 }
934 
935 /* --------------------------------------------------------------------- */
936 /*  Generating machine state transitions                                 */
937 /* --------------------------------------------------------------------- */
938 
939 #define PUT_TRANSITION(typ, val) \
940           do { \
941                     --transitions_ptr; \
942                     transitions_ptr->type = typ; \
943                     transitions_ptr->value = val; \
944           } while (0)
945 
handle_iteratives(struct stack_item * transitions_ptr,struct stack_item * transitions,struct stack * depth)946 static struct stack_item* handle_iteratives(struct stack_item *transitions_ptr, struct stack_item *transitions, struct stack *depth)
947 {
948           struct stack_item *item;
949 
950           while (1) {
951                     item = stack_top(depth);
952 
953                     switch (item->type) {
954                     case type_asterisk:
955                               SLJIT_ASSERT(transitions[item->value].type == type_branch);
956                               transitions[item->value].value = transitions_ptr - transitions;
957                               PUT_TRANSITION(type_branch, item->value + 1);
958                               break;
959 
960                     case type_plus_sign:
961                               SLJIT_ASSERT(transitions[item->value].type == type_branch);
962                               transitions[item->value].value = transitions_ptr - transitions;
963                               break;
964 
965                     case type_qestion_mark:
966                               PUT_TRANSITION(type_branch, item->value);
967                               break;
968 
969                     default:
970                               return transitions_ptr;
971                     }
972                     stack_pop(depth);
973           }
974 }
975 
generate_transitions(struct compiler_common * compiler_common)976 static int generate_transitions(struct compiler_common *compiler_common)
977 {
978           struct stack *stack = &compiler_common->stack;
979           struct stack *depth = &compiler_common->depth;
980           struct stack_item *transitions_ptr;
981           struct stack_item *item;
982 
983           stack_init(depth);
984           compiler_common->dfa_transitions = SLJIT_MALLOC(sizeof(struct stack_item) * compiler_common->dfa_size, NULL);
985           if (!compiler_common->dfa_transitions)
986                     return REGEX_MEMORY_ERROR;
987 
988           /* Go through the items of the stack and generate the necessary branches and jumps (edges of DFA). */
989           transitions_ptr = compiler_common->dfa_transitions + compiler_common->dfa_size;
990           while (stack->count > 0) {
991                     item = stack_pop(stack);
992                     switch (item->type) {
993                     case type_begin:
994                     case type_open_br:
995                               item = stack_pop(depth);
996                               if (item->type == type_select)
997                                         PUT_TRANSITION(type_branch, item->value + 1);
998                               else
999                                         SLJIT_ASSERT(item->type == type_close_br);
1000                               if (stack->count == 0)
1001                                         PUT_TRANSITION(type_begin, 0);
1002                               else
1003                                         transitions_ptr = handle_iteratives(transitions_ptr, compiler_common->dfa_transitions, depth);
1004                               break;
1005 
1006                     case type_end:
1007                     case type_close_br:
1008                               if (item->type == type_end)
1009                                         *--transitions_ptr = *item;
1010                               if (stack_push(depth, type_close_br, transitions_ptr - compiler_common->dfa_transitions))
1011                                         return REGEX_MEMORY_ERROR;
1012                               break;
1013 
1014                     case type_select:
1015                               item = stack_top(depth);
1016                               if (item->type == type_select) {
1017                                         SLJIT_ASSERT(compiler_common->dfa_transitions[item->value].type == type_jump);
1018                                         PUT_TRANSITION(type_branch, item->value + 1);
1019                                         PUT_TRANSITION(type_jump, item->value);
1020                                         item->value = transitions_ptr - compiler_common->dfa_transitions;
1021                               }
1022                               else {
1023                                         SLJIT_ASSERT(item->type == type_close_br);
1024                                         item->type = type_select;
1025                                         PUT_TRANSITION(type_jump, item->value);
1026                                         item->value = transitions_ptr - compiler_common->dfa_transitions;
1027                               }
1028                               break;
1029 
1030                     case type_asterisk:
1031                     case type_plus_sign:
1032                     case type_qestion_mark:
1033                               if (item->type != type_qestion_mark)
1034                                         PUT_TRANSITION(type_branch, 0);
1035                               if (stack_push(depth, item->type, transitions_ptr - compiler_common->dfa_transitions))
1036                                         return REGEX_MEMORY_ERROR;
1037                               break;
1038 
1039                     case type_char:
1040                     case type_newline:
1041                     case type_rng_start:
1042                               /* Requires handle_iteratives. */
1043                               *--transitions_ptr = *item;
1044                               transitions_ptr = handle_iteratives(transitions_ptr, compiler_common->dfa_transitions, depth);
1045                               break;
1046 
1047                     default:
1048                               *--transitions_ptr = *item;
1049                               break;
1050                     }
1051           }
1052 
1053           SLJIT_ASSERT(compiler_common->dfa_transitions == transitions_ptr);
1054           SLJIT_ASSERT(depth->count == 0);
1055           return REGEX_NO_ERROR;
1056 }
1057 
1058 #undef PUT_TRANSITION
1059 
1060 #ifdef REGEX_MATCH_VERBOSE
1061 
verbose_transitions(struct compiler_common * compiler_common)1062 static void verbose_transitions(struct compiler_common *compiler_common)
1063 {
1064           struct stack_item *transitions_ptr = compiler_common->dfa_transitions;
1065           struct stack_item *transitions_end = transitions_ptr + compiler_common->dfa_size;
1066           struct stack_item *search_states_ptr = compiler_common->search_states;
1067           int pos;
1068 
1069           printf("-----------------\nTransitions\n-----------------\n");
1070           pos = 0;
1071           while (transitions_ptr < transitions_end) {
1072                     printf("[%3d] ", pos++);
1073                     if (search_states_ptr->type >= 0)
1074                               printf("(%3d) ", search_states_ptr->type);
1075                     switch (transitions_ptr->type) {
1076                     case type_begin:
1077                               printf("type_begin\n");
1078                               break;
1079 
1080                     case type_end:
1081                               printf("type_end\n");
1082                               break;
1083 
1084                     case type_char:
1085                               if (transitions_ptr->value >= ' ')
1086                                         printf("type_char '%c'\n", transitions_ptr->value);
1087                               else
1088                                         printf("type_char 0x%x\n", transitions_ptr->value);
1089                               break;
1090 
1091                     case type_newline:
1092                               printf("type_newline %s\n", transitions_ptr->value ? "(end)" : "(begin)");
1093                               break;
1094 
1095                     case type_id:
1096                               printf("type_id %d\n", transitions_ptr->value);
1097                               break;
1098 
1099                     case type_rng_start:
1100                               printf("type_rng_start %s\n", transitions_ptr->value ? "(invert)" : "(normal)");
1101                               break;
1102 
1103                     case type_rng_end:
1104                               printf("type_rng_end\n");
1105                               break;
1106 
1107                     case type_rng_char:
1108                               if (transitions_ptr->value >= ' ')
1109                                         printf("type_rng_char '%c'\n", transitions_ptr->value);
1110                               else
1111                                         printf("type_rng_char 0x%x\n", transitions_ptr->value);
1112                               break;
1113 
1114                     case type_rng_left:
1115                               if (transitions_ptr->value >= ' ')
1116                                         printf("type_rng_left '%c'\n", transitions_ptr->value);
1117                               else
1118                                         printf("type_rng_left 0x%x\n", transitions_ptr->value);
1119                               break;
1120 
1121                     case type_rng_right:
1122                               if (transitions_ptr->value >= ' ')
1123                                         printf("type_rng_right '%c'\n", transitions_ptr->value);
1124                               else
1125                                         printf("type_rng_right 0x%x\n", transitions_ptr->value);
1126                               break;
1127 
1128                     case type_branch:
1129                               printf("type_branch -> %d\n", transitions_ptr->value);
1130                               break;
1131 
1132                     case type_jump:
1133                               printf("type_jump -> %d\n", transitions_ptr->value);
1134                               break;
1135 
1136                     default:
1137                               printf("UNEXPECTED TYPE\n");
1138                               break;
1139                     }
1140                     transitions_ptr++;
1141                     search_states_ptr++;
1142           }
1143           printf("flags: ");
1144           if (!(compiler_common->flags & (REGEX_MATCH_BEGIN | REGEX_MATCH_END | REGEX_NEWLINE | REGEX_ID_CHECK | REGEX_FAKE_MATCH_BEGIN | REGEX_FAKE_MATCH_END)))
1145                     printf("none ");
1146           if (compiler_common->flags & REGEX_MATCH_BEGIN)
1147                     printf("REGEX_MATCH_BEGIN ");
1148           if (compiler_common->flags & REGEX_MATCH_END)
1149                     printf("REGEX_MATCH_END ");
1150           if (compiler_common->flags & REGEX_NEWLINE)
1151                     printf("REGEX_NEWLINE ");
1152           if (compiler_common->flags & REGEX_ID_CHECK)
1153                     printf("REGEX_ID_CHECK ");
1154           if (compiler_common->flags & REGEX_FAKE_MATCH_BEGIN)
1155                     printf("REGEX_FAKE_MATCH_BEGIN ");
1156           if (compiler_common->flags & REGEX_FAKE_MATCH_END)
1157                     printf("REGEX_FAKE_MATCH_END ");
1158           if (compiler_common->longest_range_size > 0)
1159                     printf("(longest range: %ld) ", (long)compiler_common->longest_range_size);
1160           printf("\n");
1161 }
1162 
1163 #endif
1164 
1165 /* --------------------------------------------------------------------- */
1166 /*  Utilities                                                            */
1167 /* --------------------------------------------------------------------- */
1168 
generate_search_states(struct compiler_common * compiler_common)1169 static int generate_search_states(struct compiler_common *compiler_common)
1170 {
1171           struct stack_item *transitions_ptr = compiler_common->dfa_transitions;
1172           struct stack_item *transitions_end = transitions_ptr + compiler_common->dfa_size;
1173           struct stack_item *search_states_ptr;
1174           struct stack_item *rng_start = NULL;
1175 
1176           compiler_common->terms_size = !(compiler_common->flags & REGEX_FAKE_MATCH_END) ? 1 : 2;
1177           compiler_common->longest_range_size = 0;
1178           compiler_common->search_states = SLJIT_MALLOC(sizeof(struct stack_item) * compiler_common->dfa_size, NULL);
1179           if (!compiler_common->search_states)
1180                     return REGEX_MEMORY_ERROR;
1181 
1182           search_states_ptr = compiler_common->search_states;
1183           while (transitions_ptr < transitions_end) {
1184                     switch (transitions_ptr->type) {
1185                     case type_begin:
1186                     case type_end:
1187                               search_states_ptr->type = 0;
1188                               break;
1189 
1190                     case type_char:
1191                               search_states_ptr->type = compiler_common->terms_size++;
1192                               break;
1193 
1194                     case type_newline:
1195                               if (transitions_ptr->value)
1196                                         search_states_ptr->type = 1;
1197                               else
1198                                         search_states_ptr->type = compiler_common->terms_size++;
1199                               SLJIT_ASSERT(search_states_ptr->type == 1 || search_states_ptr->type == 2);
1200                               break;
1201 
1202                     case type_id:
1203                               if (transitions_ptr->value > 0)
1204                                         compiler_common->flags |= REGEX_ID_CHECK;
1205                               search_states_ptr->type = -1;
1206                               break;
1207 
1208                     case type_rng_start:
1209                               search_states_ptr->type = compiler_common->terms_size;
1210                               rng_start = search_states_ptr;
1211                               break;
1212 
1213                     case type_rng_end:
1214                               search_states_ptr->type = compiler_common->terms_size++;
1215                               /* Ok, this is a blunt over estimation :) */
1216                               if (compiler_common->longest_range_size < search_states_ptr - rng_start)
1217                                         compiler_common->longest_range_size = search_states_ptr - rng_start;
1218                               break;
1219 
1220                     default:
1221                               search_states_ptr->type = -1;
1222                               break;
1223                     }
1224                     search_states_ptr->value = -1;
1225                     search_states_ptr++;
1226                     transitions_ptr++;
1227           }
1228           return REGEX_NO_ERROR;
1229 }
1230 
trace_transitions(int from,struct compiler_common * compiler_common)1231 static int trace_transitions(int from, struct compiler_common *compiler_common)
1232 {
1233           int id = 0;
1234           struct stack *stack = &compiler_common->stack;
1235           struct stack *depth = &compiler_common->depth;
1236           struct stack_item *dfa_transitions = compiler_common->dfa_transitions;
1237           struct stack_item *search_states = compiler_common->search_states;
1238 
1239           SLJIT_ASSERT(search_states[from].type >= 0);
1240 
1241           from++;
1242 
1243           /* Be prepared for any paths (loops, etc). */
1244           while (1) {
1245                     if (dfa_transitions[from].type == type_id)
1246                               if (id < dfa_transitions[from].value)
1247                                         id = dfa_transitions[from].value;
1248 
1249                     if (search_states[from].value < id) {
1250                               /* Forward step. */
1251                               if (search_states[from].value == -1)
1252                                         if (stack_push(stack, 0, from))
1253                                                   return REGEX_MEMORY_ERROR;
1254                               search_states[from].value = id;
1255 
1256                               if (dfa_transitions[from].type == type_branch) {
1257                                         if (stack_push(depth, id, from))
1258                                                   return REGEX_MEMORY_ERROR;
1259                                         from++;
1260                                         continue;
1261                               }
1262                               else if (dfa_transitions[from].type == type_jump) {
1263                                         from = dfa_transitions[from].value;
1264                                         continue;
1265                               }
1266                               else if (search_states[from].type < 0) {
1267                                         from++;
1268                                         continue;
1269                               }
1270                     }
1271 
1272                     /* Back tracking. */
1273                     if (depth->count > 0) {
1274                               id = stack_top(depth)->type;
1275                               from = dfa_transitions[stack_pop(depth)->value].value;
1276                               continue;
1277                     }
1278                     return 0;
1279           }
1280 }
1281 
1282 /* --------------------------------------------------------------------- */
1283 /*  Code generator                                                       */
1284 /* --------------------------------------------------------------------- */
1285 
1286 #define TERM_OFFSET_OF(index, offs)     (((index) * no_states + (offs)) * sizeof(sljit_sw))
1287 #define TERM_REL_OFFSET_OF(base, offs)  ((base) + ((offs) * sizeof(sljit_sw)))
1288 
1289 #define EMIT_OP1(type, arg1, arg2, arg3, arg4) \
1290           CHECK(sljit_emit_op1(compiler, type, arg1, arg2, arg3, arg4))
1291 
1292 #define EMIT_OP2(type, arg1, arg2, arg3, arg4, arg5, arg6) \
1293           CHECK(sljit_emit_op2(compiler, type, arg1, arg2, arg3, arg4, arg5, arg6))
1294 
1295 #define EMIT_LABEL(label) \
1296           label = sljit_emit_label(compiler); \
1297           CHECK(!label)
1298 
1299 #define EMIT_JUMP(jump, type) \
1300           jump = sljit_emit_jump(compiler, type); \
1301           CHECK(!jump)
1302 
1303 #define EMIT_CMP(jump, type, arg1, arg2, arg3, arg4) \
1304           jump = sljit_emit_cmp(compiler, type, arg1, arg2, arg3, arg4); \
1305           CHECK(!jump)
1306 
1307 /* CHECK depends on the use case. */
1308 
1309 #define CHECK(exp) \
1310           if (SLJIT_UNLIKELY(exp)) \
1311                     return REGEX_MEMORY_ERROR
1312 
compile_uncond_tran(struct compiler_common * compiler_common,int reg)1313 static int compile_uncond_tran(struct compiler_common *compiler_common, int reg)
1314 {
1315           struct sljit_compiler *compiler = compiler_common->compiler;
1316           struct stack *stack = &compiler_common->stack;
1317           struct stack_item *search_states = compiler_common->search_states;
1318           int flags = compiler_common->flags;
1319           sljit_sw no_states = compiler_common->no_states;
1320           sljit_uw head = 0;
1321           sljit_sw offset, value;
1322 
1323           if (reg != R_CURR_STATE || !(compiler_common->flags & REGEX_FAKE_MATCH_BEGIN)) {
1324                     CHECK(trace_transitions(0, compiler_common));
1325           }
1326           else {
1327                     CHECK(trace_transitions(1, compiler_common));
1328           }
1329 
1330           while (stack->count > 0) {
1331                     value = stack_pop(stack)->value;
1332                     if (search_states[value].type >= 0) {
1333                               offset = TERM_OFFSET_OF(search_states[value].type, 0);
1334                               EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 1), SLJIT_IMM, head);
1335                               if (offset > 0)
1336                                         head = offset;
1337 
1338                               if (!(flags & REGEX_MATCH_BEGIN)) {
1339                                         EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 2), R_TEMP, 0);
1340                                         if (flags & REGEX_ID_CHECK) {
1341                                                   EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 3), SLJIT_IMM, search_states[value].value);
1342                                         }
1343                               }
1344                               else if (flags & REGEX_ID_CHECK) {
1345                                         EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 2), SLJIT_IMM, search_states[value].value);
1346                               }
1347                     }
1348                     search_states[value].value = -1;
1349           }
1350           if (reg == R_NEXT_STATE) {
1351                     EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_NEXT_HEAD, 0);
1352           }
1353           else if (flags & REGEX_FAKE_MATCH_BEGIN) {
1354                     SLJIT_ASSERT(compiler_common->dfa_transitions[1].type == type_newline && !compiler_common->dfa_transitions[1].value);
1355                     offset = TERM_OFFSET_OF(search_states[1].type, 0);
1356 
1357                     SLJIT_ASSERT(!(flags & REGEX_MATCH_BEGIN));
1358 
1359                     EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 1), SLJIT_IMM, head);
1360                     head = offset;
1361 
1362                     EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 2), SLJIT_IMM, 1);
1363                     if (flags & REGEX_ID_CHECK) {
1364                               EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 3), SLJIT_IMM, 0);
1365                     }
1366           }
1367           EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, head);
1368           return REGEX_NO_ERROR;
1369 }
1370 
compile_cond_tran(struct compiler_common * compiler_common,sljit_sw curr_index)1371 static int compile_cond_tran(struct compiler_common *compiler_common, sljit_sw curr_index)
1372 {
1373           struct sljit_compiler *compiler = compiler_common->compiler;
1374           struct stack *stack = &compiler_common->stack;
1375           struct stack_item *search_states = compiler_common->search_states;
1376           sljit_sw offset;
1377           int flags;
1378           sljit_sw no_states;
1379           sljit_sw value;
1380           struct sljit_jump *jump1;
1381           struct sljit_jump *jump2;
1382           struct sljit_jump *jump3;
1383           struct sljit_jump *jump4;
1384           struct sljit_jump *jump5;
1385           struct sljit_label *label1;
1386 
1387           flags = compiler_common->flags;
1388           no_states = compiler_common->no_states;
1389 
1390           EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_forward), SLJIT_IMM, 0);
1391           if (!(flags & (REGEX_ID_CHECK | REGEX_MATCH_BEGIN))) {
1392                     EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(curr_index, 2));
1393           }
1394 
1395           while (stack->count > 0) {
1396                     value = stack_pop(stack)->value;
1397                     if (search_states[value].type >= 0) {
1398 #ifdef REGEX_MATCH_VERBOSE
1399                               if (flags & REGEX_MATCH_VERBOSE)
1400                                         printf("-> (%3d:%3d) ", search_states[value].type, search_states[value].value);
1401 #endif
1402                               offset = TERM_OFFSET_OF(search_states[value].type, 0);
1403 
1404                               if (!(flags & REGEX_ID_CHECK)) {
1405                                         if (!(flags & REGEX_MATCH_BEGIN)) {
1406                                                   /* Check whether item is inserted. */
1407                                                   EMIT_CMP(jump1, SLJIT_NOT_EQUAL, SLJIT_MEM1(R_NEXT_STATE), offset + sizeof(sljit_sw), SLJIT_IMM, -1);
1408                                                   EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + sizeof(sljit_sw), R_NEXT_HEAD, 0);
1409                                                   if (offset > 0) {
1410                                                             EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, offset);
1411                                                   }
1412                                                   EMIT_JUMP(jump2, SLJIT_JUMP);
1413 
1414                                                   /* Check whether old index <= index. */
1415                                                   EMIT_LABEL(label1);
1416                                                   sljit_set_label(jump1, label1);
1417 
1418                                                   EMIT_CMP(jump1, SLJIT_LESS_EQUAL, SLJIT_MEM1(R_NEXT_STATE), offset + 2 * sizeof(sljit_sw), R_TEMP, 0);
1419 
1420                                                   EMIT_LABEL(label1);
1421                                                   sljit_set_label(jump2, label1);
1422                                                   EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + 2 * sizeof(sljit_sw), R_TEMP, 0);
1423 
1424                                                   EMIT_LABEL(label1);
1425                                                   sljit_set_label(jump1, label1);
1426                                         }
1427                                         else {
1428                                                   /* Check whether item is inserted. */
1429                                                   EMIT_CMP(jump1, SLJIT_NOT_EQUAL, SLJIT_MEM1(R_NEXT_STATE), offset + sizeof(sljit_sw), SLJIT_IMM, -1);
1430                                                   EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + sizeof(sljit_sw), R_NEXT_HEAD, 0);
1431                                                   if (offset > 0) {
1432                                                             EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, offset);
1433                                                   }
1434                                                   EMIT_LABEL(label1);
1435                                                   sljit_set_label(jump1, label1);
1436                                         }
1437                               }
1438                               else {
1439                                         if (!(flags & REGEX_MATCH_BEGIN)) {
1440                                                   EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(curr_index, 2));
1441 
1442                                                   /* Check whether item is inserted. */
1443                                                   EMIT_CMP(jump1, SLJIT_NOT_EQUAL, SLJIT_MEM1(R_NEXT_STATE), offset + sizeof(sljit_sw), SLJIT_IMM, -1);
1444                                                   EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + sizeof(sljit_sw), R_NEXT_HEAD, 0);
1445                                                   if (offset > 0) {
1446                                                             EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, offset);
1447                                                   }
1448                                                   EMIT_JUMP(jump2, SLJIT_JUMP);
1449 
1450                                                   /* Check whether old index != index. */
1451                                                   EMIT_LABEL(label1);
1452                                                   sljit_set_label(jump1, label1);
1453 
1454                                                   EMIT_OP2(SLJIT_SUB | SLJIT_SET_Z | SLJIT_SET_LESS, SLJIT_UNUSED, 0, SLJIT_MEM1(R_NEXT_STATE), offset + 2 * sizeof(sljit_sw), R_TEMP, 0);
1455                                                   EMIT_JUMP(jump1, SLJIT_LESS);
1456                                                   EMIT_JUMP(jump3, SLJIT_NOT_EQUAL); /* Greater. */
1457 
1458                                                   /* Old index == index. */
1459                                                   EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(curr_index, 3));
1460                                                   if (search_states[value].value > 0) {
1461                                                             EMIT_CMP(jump4, SLJIT_GREATER, R_TEMP, 0, SLJIT_IMM, search_states[value].value);
1462 
1463                                                             EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_IMM, search_states[value].value);
1464                                                             EMIT_LABEL(label1);
1465                                                             sljit_set_label(jump4, label1);
1466                                                   }
1467 
1468                                                   EMIT_OP2(SLJIT_SUB | SLJIT_SET_GREATER_EQUAL, SLJIT_UNUSED, 0, SLJIT_MEM1(R_NEXT_STATE), offset + 3 * sizeof(sljit_sw), R_TEMP, 0);
1469                                                   EMIT_JUMP(jump4, SLJIT_GREATER_EQUAL);
1470                                                   EMIT_JUMP(jump5, SLJIT_JUMP);
1471 
1472                                                   /* Overwrite index & id. */
1473                                                   EMIT_LABEL(label1);
1474                                                   sljit_set_label(jump3, label1);
1475                                                   sljit_set_label(jump2, label1);
1476                                                   EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + 2 * sizeof(sljit_sw), R_TEMP, 0);
1477 
1478                                                   EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(curr_index, 3));
1479                                                   if (search_states[value].value > 0) {
1480                                                             EMIT_CMP(jump3, SLJIT_GREATER, R_TEMP, 0, SLJIT_IMM, search_states[value].value);
1481 
1482                                                             EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_IMM, search_states[value].value);
1483                                                             EMIT_LABEL(label1);
1484                                                             sljit_set_label(jump3, label1);
1485                                                   }
1486 
1487                                                   EMIT_LABEL(label1);
1488                                                   sljit_set_label(jump5, label1);
1489                                                   EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + 3 * sizeof(sljit_sw), R_TEMP, 0);
1490 
1491                                                   /* Exit. */
1492                                                   EMIT_LABEL(label1);
1493                                                   sljit_set_label(jump1, label1);
1494                                                   sljit_set_label(jump4, label1);
1495                                         }
1496                                         else {
1497                                                   EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(curr_index, 2));
1498 
1499                                                   if (search_states[value].value > 0) {
1500                                                             EMIT_CMP(jump1, SLJIT_GREATER, R_TEMP, 0, SLJIT_IMM, search_states[value].value);
1501 
1502                                                             EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_IMM, search_states[value].value);
1503                                                             EMIT_LABEL(label1);
1504                                                             sljit_set_label(jump1, label1);
1505                                                   }
1506 
1507                                                   /* Check whether item is inserted. */
1508                                                   EMIT_CMP(jump1, SLJIT_NOT_EQUAL, SLJIT_MEM1(R_NEXT_STATE), offset + sizeof(sljit_sw), SLJIT_IMM, -1);
1509                                                   EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + sizeof(sljit_sw), R_NEXT_HEAD, 0);
1510                                                   if (offset > 0) {
1511                                                             EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, offset);
1512                                                   }
1513                                                   EMIT_JUMP(jump2, SLJIT_JUMP);
1514 
1515                                                   /* Check whether old id >= id. */
1516                                                   EMIT_LABEL(label1);
1517                                                   sljit_set_label(jump1, label1);
1518 
1519                                                   EMIT_CMP(jump1, SLJIT_GREATER_EQUAL, SLJIT_MEM1(R_NEXT_STATE), offset + 2 * sizeof(sljit_sw), R_TEMP, 0);
1520 
1521                                                   EMIT_LABEL(label1);
1522                                                   sljit_set_label(jump2, label1);
1523                                                   EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + 2 * sizeof(sljit_sw), R_TEMP, 0);
1524 
1525                                                   EMIT_LABEL(label1);
1526                                                   sljit_set_label(jump1, label1);
1527                                         }
1528                               }
1529                     }
1530                     search_states[value].value = -1;
1531           }
1532 
1533 #ifdef REGEX_MATCH_VERBOSE
1534           if (flags & REGEX_MATCH_VERBOSE)
1535                     printf("\n");
1536 #endif
1537           return REGEX_NO_ERROR;
1538 }
1539 
compile_end_check(struct compiler_common * compiler_common,struct sljit_label * end_check_label)1540 static int compile_end_check(struct compiler_common *compiler_common, struct sljit_label *end_check_label)
1541 {
1542           struct sljit_compiler *compiler = compiler_common->compiler;
1543           struct sljit_jump *jump;
1544           struct sljit_jump *clear_states_jump;
1545           struct sljit_label *label;
1546           struct sljit_label *leave_label;
1547           struct sljit_label *begin_loop_label;
1548 
1549           /* Priority order: best_begin > best_end > best_id.
1550              In other words:
1551                  if (new best_begin > old test_begin) do nothing
1552                  otherwise we know that new_end > old_end, since R_CURR_INDEX ever increasing
1553                  therefore we must overwrite all best_* variables (new_id also contains the highest id for this turn). */
1554 
1555           /* Both R_CURR_CHAR and R_BEST_BEGIN used as temporary registers. */
1556 
1557           if (!(compiler_common->flags & REGEX_MATCH_BEGIN)) {
1558                     EMIT_OP1(SLJIT_MOV, R_CURR_CHAR, 0, SLJIT_MEM1(R_CURR_STATE), TERM_REL_OFFSET_OF(0, 2));
1559                     EMIT_CMP(jump, !(compiler_common->flags & REGEX_MATCH_NON_GREEDY) ? SLJIT_LESS : SLJIT_LESS_EQUAL, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_begin), R_CURR_CHAR, 0);
1560                     sljit_set_label(jump, end_check_label);
1561 
1562                     EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_begin), R_CURR_CHAR, 0);
1563                     if (!(compiler_common->flags & (REGEX_FAKE_MATCH_BEGIN | REGEX_FAKE_MATCH_END))) {
1564                               EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_end), R_CURR_INDEX, 0);
1565                     }
1566                     else {
1567                               if ((compiler_common->flags & (REGEX_FAKE_MATCH_BEGIN | REGEX_FAKE_MATCH_END)) == (REGEX_FAKE_MATCH_BEGIN | REGEX_FAKE_MATCH_END)) {
1568                                         EMIT_OP2(SLJIT_SUB, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_end), R_CURR_INDEX, 0, SLJIT_IMM, 2);
1569                               }
1570                               else {
1571                                         EMIT_OP2(SLJIT_SUB, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_end), R_CURR_INDEX, 0, SLJIT_IMM, 1);
1572                               }
1573                     }
1574                     if (compiler_common->flags & REGEX_ID_CHECK) {
1575                               EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_id), SLJIT_MEM1(R_CURR_STATE), TERM_REL_OFFSET_OF(0, 3));
1576                     }
1577 
1578                     EMIT_CMP(clear_states_jump, SLJIT_LESS, R_CURR_CHAR, 0, R_BEST_BEGIN, 0);
1579 
1580                     EMIT_LABEL(leave_label);
1581                     EMIT_OP1(SLJIT_MOV, R_BEST_BEGIN, 0, R_CURR_CHAR, 0);
1582                     EMIT_JUMP(jump, SLJIT_JUMP);
1583                     sljit_set_label(jump, end_check_label);
1584 
1585                     /* A loop to clear all states, which are > (or >=) than R_CURR_CHAR. */
1586                     EMIT_LABEL(label);
1587                     sljit_set_label(clear_states_jump, label);
1588 
1589                     EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_NEXT_HEAD, 0);
1590                     EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, 0);
1591 
1592                     /* Begin of the loop. */
1593                     EMIT_LABEL(begin_loop_label);
1594                     EMIT_CMP(jump, SLJIT_EQUAL, R_TEMP, 0, SLJIT_IMM, 0);
1595                     sljit_set_label(jump, leave_label);
1596 
1597                     EMIT_OP2(SLJIT_ADD, R_TEMP, 0, R_TEMP, 0, R_CURR_STATE, 0);
1598                     EMIT_OP1(SLJIT_MOV, R_BEST_BEGIN, 0, SLJIT_MEM1(R_TEMP), sizeof(sljit_sw));
1599                     EMIT_CMP(clear_states_jump, !(compiler_common->flags & REGEX_MATCH_NON_GREEDY) ? SLJIT_GREATER : SLJIT_GREATER_EQUAL, SLJIT_MEM1(R_TEMP), 2 * sizeof(sljit_sw), R_CURR_CHAR, 0);
1600 
1601                     /* Case 1: keep this case. */
1602                     EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_TEMP), sizeof(sljit_sw), R_NEXT_HEAD, 0);
1603                     EMIT_OP2(SLJIT_SUB, R_NEXT_HEAD, 0, R_TEMP, 0, R_CURR_STATE, 0);
1604 
1605                     EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_BEST_BEGIN, 0);
1606                     EMIT_JUMP(jump, SLJIT_JUMP);
1607                     sljit_set_label(jump, begin_loop_label);
1608 
1609                     /* Case 2: remove this case. */
1610                     EMIT_LABEL(label);
1611                     sljit_set_label(clear_states_jump, label);
1612 
1613                     EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_TEMP), sizeof(sljit_sw), SLJIT_IMM, -1);
1614 
1615                     EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_BEST_BEGIN, 0);
1616                     EMIT_JUMP(jump, SLJIT_JUMP);
1617                     sljit_set_label(jump, begin_loop_label);
1618           }
1619           else {
1620                     EMIT_OP1(SLJIT_MOV, R_BEST_BEGIN, 0, SLJIT_IMM, 0);
1621                     EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_begin), SLJIT_IMM, 0);
1622                     EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_end), R_CURR_INDEX, 0);
1623                     if (compiler_common->flags & REGEX_ID_CHECK) {
1624                               EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_id), SLJIT_MEM1(R_CURR_STATE), TERM_REL_OFFSET_OF(0, 2));
1625                     }
1626                     EMIT_JUMP(jump, SLJIT_JUMP);
1627                     sljit_set_label(jump, end_check_label);
1628           }
1629           return REGEX_NO_ERROR;
1630 }
1631 
compile_leave_fast_forward(struct compiler_common * compiler_common,struct sljit_label * fast_forward_label)1632 static int compile_leave_fast_forward(struct compiler_common *compiler_common, struct sljit_label *fast_forward_label)
1633 {
1634           struct sljit_compiler *compiler = compiler_common->compiler;
1635           struct stack *stack = &compiler_common->stack;
1636           struct stack_item *dfa_transitions = compiler_common->dfa_transitions;
1637           struct stack_item *search_states = compiler_common->search_states;
1638           int ind;
1639           struct sljit_jump *jump;
1640           int init_range = 1, prev_value = 0;
1641 
1642           while (stack->count > 0) {
1643                     ind = stack_pop(stack)->value;
1644                     search_states[ind].value = -1;
1645                     if (search_states[ind].type >= 0) {
1646                               if (dfa_transitions[ind].type == type_char) {
1647                                         EMIT_CMP(jump, SLJIT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, dfa_transitions[ind].value);
1648                                         sljit_set_label(jump, fast_forward_label);
1649                               }
1650                               else if (dfa_transitions[ind].type == type_rng_start) {
1651                                         SLJIT_ASSERT(!dfa_transitions[ind].value);
1652                                         ind++;
1653                                         while (dfa_transitions[ind].type != type_rng_end) {
1654                                                   if (dfa_transitions[ind].type == type_rng_char) {
1655                                                             EMIT_CMP(jump, SLJIT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, dfa_transitions[ind].value);
1656                                                             sljit_set_label(jump, fast_forward_label);
1657                                                   }
1658                                                   else {
1659                                                             SLJIT_ASSERT(dfa_transitions[ind].type == type_rng_left);
1660                                                             if (init_range) {
1661                                                                       EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_CURR_CHAR, 0);
1662                                                                       init_range = 0;
1663                                                             }
1664                                                             if (dfa_transitions[ind].value != prev_value) {
1665                                                                       /* Best compatibility to all archs. */
1666                                                                       prev_value -= dfa_transitions[ind].value;
1667                                                                       if (prev_value < 0) {
1668                                                                                 EMIT_OP2(SLJIT_SUB, R_TEMP, 0, R_TEMP, 0, SLJIT_IMM, -prev_value);
1669                                                                       }
1670                                                                       else {
1671                                                                                 EMIT_OP2(SLJIT_ADD, R_TEMP, 0, R_TEMP, 0, SLJIT_IMM, prev_value);
1672                                                                       }
1673                                                                       prev_value = dfa_transitions[ind].value;
1674                                                             }
1675                                                             EMIT_CMP(jump, SLJIT_LESS_EQUAL, R_TEMP, 0, SLJIT_IMM, dfa_transitions[ind + 1].value - dfa_transitions[ind].value);
1676                                                             sljit_set_label(jump, fast_forward_label);
1677                                                             ind++;
1678                                                   }
1679                                                   ind++;
1680                                         }
1681                               }
1682                               else {
1683                                         SLJIT_ASSERT(dfa_transitions[ind].type == type_newline);
1684                                         EMIT_CMP(jump, SLJIT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, '\n');
1685                                         sljit_set_label(jump, fast_forward_label);
1686                                         EMIT_CMP(jump, SLJIT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, '\r');
1687                                         sljit_set_label(jump, fast_forward_label);
1688                               }
1689                     }
1690           }
1691           return REGEX_NO_ERROR;
1692 }
1693 
compile_newline_check(struct compiler_common * compiler_common,sljit_sw ind)1694 static int compile_newline_check(struct compiler_common *compiler_common, sljit_sw ind)
1695 {
1696           struct sljit_compiler *compiler = compiler_common->compiler;
1697           struct sljit_jump *jump1;
1698           struct sljit_jump *jump2;
1699           struct sljit_label *label;
1700           sljit_sw no_states;
1701           sljit_sw offset;
1702 
1703           /* Check whether a new-line character is found. */
1704           EMIT_CMP(jump1, SLJIT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, '\n');
1705           EMIT_CMP(jump2, SLJIT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, '\r');
1706 
1707           no_states = compiler_common->no_states;
1708           offset = TERM_OFFSET_OF(compiler_common->search_states[ind].type, 1);
1709           EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), offset);
1710           EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_CURR_STATE), offset, SLJIT_IMM, -1);
1711           CHECK(sljit_emit_ijump(compiler, SLJIT_JUMP, SLJIT_MEM2(R_CURR_STATE, R_TEMP), 0));
1712 
1713           EMIT_LABEL(label);
1714           sljit_set_label(jump1, label);
1715           sljit_set_label(jump2, label);
1716           return REGEX_NO_ERROR;
1717 }
1718 
1719 #undef CHECK
1720 
1721 #define CHECK(exp) \
1722           if (SLJIT_UNLIKELY(exp)) \
1723                     return 0
1724 
range_set_label(struct sljit_jump ** range_jump_list,struct sljit_label * label)1725 static SLJIT_INLINE void range_set_label(struct sljit_jump **range_jump_list, struct sljit_label *label)
1726 {
1727           while (*range_jump_list) {
1728                     sljit_set_label(*range_jump_list, label);
1729                     range_jump_list++;
1730           }
1731 }
1732 
compile_range_check(struct compiler_common * compiler_common,sljit_sw ind)1733 static sljit_sw compile_range_check(struct compiler_common *compiler_common, sljit_sw ind)
1734 {
1735           struct sljit_compiler *compiler = compiler_common->compiler;
1736           struct stack_item *dfa_transitions = compiler_common->dfa_transitions;
1737           struct sljit_jump **range_jump_list = compiler_common->range_jump_list;
1738           int invert = dfa_transitions[ind].value;
1739           struct sljit_label *label;
1740           sljit_sw no_states;
1741           sljit_sw offset;
1742           int init_range = 1, prev_value = 0;
1743 
1744           ind++;
1745 
1746           while (dfa_transitions[ind].type != type_rng_end) {
1747                     if (dfa_transitions[ind].type == type_rng_char) {
1748                               EMIT_CMP(*range_jump_list, SLJIT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, dfa_transitions[ind].value);
1749                               range_jump_list++;
1750                     }
1751                     else {
1752                               SLJIT_ASSERT(dfa_transitions[ind].type == type_rng_left);
1753                               if (init_range) {
1754                                         EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_CURR_CHAR, 0);
1755                                         init_range = 0;
1756                               }
1757                               if (dfa_transitions[ind].value != prev_value) {
1758                                         /* Best compatibility to all archs. */
1759                                         prev_value -= dfa_transitions[ind].value;
1760                                         if (prev_value < 0) {
1761                                                   EMIT_OP2(SLJIT_SUB, R_TEMP, 0, R_TEMP, 0, SLJIT_IMM, -prev_value);
1762                                         }
1763                                         else {
1764                                                   EMIT_OP2(SLJIT_ADD, R_TEMP, 0, R_TEMP, 0, SLJIT_IMM, prev_value);
1765                                         }
1766                                         prev_value = dfa_transitions[ind].value;
1767                               }
1768                               EMIT_CMP(*range_jump_list, SLJIT_LESS_EQUAL, R_TEMP, 0, SLJIT_IMM, dfa_transitions[ind + 1].value - dfa_transitions[ind].value);
1769                               range_jump_list++;
1770                               ind++;
1771                     }
1772                     ind++;
1773           }
1774 
1775           *range_jump_list = NULL;
1776 
1777           if (!invert) {
1778                     no_states = compiler_common->no_states;
1779                     offset = TERM_OFFSET_OF(compiler_common->search_states[ind].type, 1);
1780                     EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), offset);
1781                     EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_CURR_STATE), offset, SLJIT_IMM, -1);
1782                     CHECK(sljit_emit_ijump(compiler, SLJIT_JUMP, SLJIT_MEM2(R_CURR_STATE, R_TEMP), 0));
1783 
1784                     EMIT_LABEL(label);
1785                     range_set_label(compiler_common->range_jump_list, label);
1786                     /* Clears the jump list. */
1787                     *compiler_common->range_jump_list = NULL;
1788           }
1789           return ind;
1790 }
1791 
1792 #undef TERM_OFFSET_OF
1793 #undef EMIT_OP1
1794 #undef EMIT_OP2
1795 #undef EMIT_LABEL
1796 #undef EMIT_JUMP
1797 #undef EMIT_CMP
1798 #undef CHECK
1799 
1800 /* --------------------------------------------------------------------- */
1801 /*  Main compiler                                                        */
1802 /* --------------------------------------------------------------------- */
1803 
1804 #define TERM_OFFSET_OF(ind, offs) (((ind) * compiler_common.no_states + (offs)) * sizeof(sljit_sw))
1805 
1806 #define EMIT_OP1(type, arg1, arg2, arg3, arg4) \
1807           CHECK(sljit_emit_op1(compiler_common.compiler, type, arg1, arg2, arg3, arg4))
1808 
1809 #define EMIT_OP2(type, arg1, arg2, arg3, arg4, arg5, arg6) \
1810           CHECK(sljit_emit_op2(compiler_common.compiler, type, arg1, arg2, arg3, arg4, arg5, arg6))
1811 
1812 #define EMIT_LABEL(label) \
1813           label = sljit_emit_label(compiler_common.compiler); \
1814           CHECK(!label)
1815 
1816 #define EMIT_JUMP(jump, type) \
1817           jump = sljit_emit_jump(compiler_common.compiler, type); \
1818           CHECK(!jump)
1819 
1820 #define EMIT_CMP(jump, type, arg1, arg2, arg3, arg4) \
1821           jump = sljit_emit_cmp(compiler_common.compiler, type, arg1, arg2, arg3, arg4); \
1822           CHECK(!jump)
1823 
1824 /* A do {} while(0) expression helps to avoid goto statements. */
1825 #define BEGIN_GUARD \
1826           do {
1827 
1828 #define END_GUARD \
1829           } while(0);
1830 
1831 #define CHECK(exp) \
1832           if (SLJIT_UNLIKELY(exp)) \
1833                     break;
1834 
regex_compile(const regex_char_t * regex_string,int length,int re_flags,int * error)1835 struct regex_machine* regex_compile(const regex_char_t *regex_string, int length, int re_flags, int *error)
1836 {
1837           struct compiler_common compiler_common;
1838           sljit_sw ind;
1839           int error_code, done, suggest_fast_forward;
1840           /* ID of an empty match (-1 if not reachable). */
1841           int empty_match_id;
1842 
1843           struct sljit_jump *jump;
1844           struct sljit_jump *best_match_found_jump;
1845           struct sljit_jump *fast_forward_jump = NULL;
1846           struct sljit_jump *length_is_zero_jump;
1847           struct sljit_jump *end_check_jump = NULL;
1848           struct sljit_jump *best_match_check_jump = NULL;
1849           struct sljit_jump *non_greedy_end_jump = NULL;
1850           struct sljit_label *label;
1851           struct sljit_label *end_check_label = NULL;
1852           struct sljit_label *start_label;
1853           struct sljit_label *fast_forward_label;
1854           struct sljit_label *fast_forward_return_label;
1855 
1856           if (error)
1857                     *error = REGEX_NO_ERROR;
1858 #ifdef REGEX_MATCH_VERBOSE
1859           compiler_common.flags = re_flags & (REGEX_MATCH_BEGIN | REGEX_MATCH_END | REGEX_MATCH_NON_GREEDY | REGEX_NEWLINE | REGEX_MATCH_VERBOSE);
1860 #else
1861           compiler_common.flags = re_flags & (REGEX_MATCH_BEGIN | REGEX_MATCH_END | REGEX_MATCH_NON_GREEDY | REGEX_NEWLINE);
1862 #endif
1863 
1864           /* Step 1: parsing (Left->Right).
1865              Syntax check and AST generator. */
1866           error_code = parse(regex_string, length, &compiler_common);
1867           if (error_code) {
1868                     stack_destroy(&compiler_common.stack);
1869                     if (error)
1870                               *error = error_code;
1871                     return NULL;
1872           }
1873 
1874           /* Step 2: generating branches (Right->Left). */
1875           error_code = generate_transitions(&compiler_common);
1876           stack_destroy(&compiler_common.stack);
1877           stack_destroy(&compiler_common.depth);
1878           if (error_code) {
1879                     if (compiler_common.dfa_transitions)
1880                               SLJIT_FREE(compiler_common.dfa_transitions, NULL);
1881                     if (error)
1882                               *error = error_code;
1883                     return NULL;
1884           }
1885 
1886           /* Step 3: Generate necessary data for depth-first search (Left->Right). */
1887           error_code = generate_search_states(&compiler_common);
1888           if (error_code) {
1889                     SLJIT_FREE(compiler_common.dfa_transitions, NULL);
1890                     if (error)
1891                               *error = error_code;
1892                     return NULL;
1893           }
1894 
1895 #ifdef REGEX_MATCH_VERBOSE
1896           if (compiler_common.flags & REGEX_MATCH_VERBOSE)
1897                     verbose_transitions(&compiler_common);
1898 #endif
1899 
1900           /* Step 4: Left->Right generate code. */
1901           stack_init(&compiler_common.stack);
1902           stack_init(&compiler_common.depth);
1903           done = 0;
1904           compiler_common.machine = NULL;
1905           compiler_common.compiler = NULL;
1906           compiler_common.range_jump_list = NULL;
1907 
1908           BEGIN_GUARD
1909 
1910           compiler_common.machine = (struct regex_machine*)SLJIT_MALLOC(sizeof(struct regex_machine) + (compiler_common.terms_size - 1) * sizeof(sljit_uw), NULL);
1911           CHECK(!compiler_common.machine);
1912 
1913           compiler_common.compiler = sljit_create_compiler(NULL);
1914           CHECK(!compiler_common.compiler);
1915 
1916           if (compiler_common.longest_range_size > 0) {
1917                     compiler_common.range_jump_list = (struct sljit_jump**)SLJIT_MALLOC(sizeof(struct sljit_jump*) * compiler_common.longest_range_size, NULL);
1918                     CHECK(!compiler_common.range_jump_list);
1919           }
1920 
1921           if ((compiler_common.flags & REGEX_ID_CHECK) && !(compiler_common.flags & REGEX_MATCH_BEGIN))
1922                     compiler_common.no_states = 4;
1923           else if (!(compiler_common.flags & REGEX_ID_CHECK) && (compiler_common.flags & REGEX_MATCH_BEGIN))
1924                     compiler_common.no_states = 2;
1925           else
1926                     compiler_common.no_states = 3;
1927 
1928           compiler_common.machine->flags = compiler_common.flags;
1929           compiler_common.machine->no_states = compiler_common.no_states;
1930           compiler_common.machine->size = compiler_common.machine->no_states * compiler_common.terms_size;
1931 
1932           /* Study the regular expression. */
1933           empty_match_id = -1;
1934           suggest_fast_forward = 1;
1935           if (!(compiler_common.flags & REGEX_FAKE_MATCH_BEGIN)) {
1936                     CHECK(trace_transitions(0, &compiler_common));
1937                     while (compiler_common.stack.count > 0) {
1938                               ind = stack_pop(&compiler_common.stack)->value;
1939                               if (compiler_common.search_states[ind].type == 0) {
1940                                         SLJIT_ASSERT(compiler_common.dfa_transitions[ind].type == type_end);
1941                                         suggest_fast_forward = 0;
1942                                         empty_match_id = compiler_common.search_states[ind].value;
1943                               }
1944                               else if (compiler_common.search_states[ind].type > 0) {
1945                                         SLJIT_ASSERT(compiler_common.dfa_transitions[ind].type != type_end);
1946                                         if (compiler_common.dfa_transitions[ind].type == type_rng_start && compiler_common.dfa_transitions[ind].value)
1947                                                   suggest_fast_forward = 0;
1948                               }
1949                               compiler_common.search_states[ind].value = -1;
1950                     }
1951           }
1952           else {
1953                     SLJIT_ASSERT(compiler_common.dfa_transitions[1].type == type_newline);
1954                     CHECK(trace_transitions(1, &compiler_common));
1955                     while (compiler_common.stack.count > 0) {
1956                               ind = stack_pop(&compiler_common.stack)->value;
1957                               if (compiler_common.search_states[ind].type == 0) {
1958                                         SLJIT_ASSERT(compiler_common.dfa_transitions[ind].type == type_end);
1959                                         suggest_fast_forward = 0;
1960                                         empty_match_id = compiler_common.search_states[ind].value;
1961                               }
1962                               compiler_common.search_states[ind].value = -1;
1963                     }
1964           }
1965 
1966           /* Step 4.1: Generate entry. */
1967           CHECK(sljit_emit_enter(compiler_common.compiler, 0, 3, 5, 5, 0, 0, 0));
1968 
1969           /* Copy arguments to their place. */
1970           EMIT_OP1(SLJIT_MOV, R_REGEX_MATCH, 0, SLJIT_S0, 0);
1971           EMIT_OP1(SLJIT_MOV, R_STRING, 0, SLJIT_S1, 0);
1972           EMIT_OP2(SLJIT_ADD, R_LENGTH, 0, SLJIT_S2, 0, SLJIT_IMM, 1);
1973 
1974           /* Init global registers. */
1975           EMIT_OP1(SLJIT_MOV, R_CURR_STATE, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, current));
1976           EMIT_OP1(SLJIT_MOV, R_NEXT_STATE, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, next));
1977           EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, head));
1978           EMIT_OP1(SLJIT_MOV, R_BEST_BEGIN, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_begin));
1979           EMIT_OP1(SLJIT_MOV, R_CURR_INDEX, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, index));
1980 
1981           /* Check whether the best match has already found in a previous frame. */
1982           EMIT_CMP(jump, SLJIT_EQUAL, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_quit), SLJIT_IMM, 0);
1983           EMIT_JUMP(best_match_found_jump, SLJIT_JUMP);
1984 
1985 #ifdef REGEX_MATCH_VERBOSE
1986           if (compiler_common.flags & REGEX_MATCH_VERBOSE)
1987                     printf("\n-----------------\nTrace\n-----------------\n");
1988 #endif
1989 
1990           /* Step 4.2: Generate code for state 0. */
1991           EMIT_LABEL(label);
1992           compiler_common.machine->entry_addrs[0] = (sljit_uw)label;
1993 
1994           /* Swapping current and next. */
1995           EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_CURR_STATE, 0);
1996           EMIT_OP1(SLJIT_MOV, R_CURR_STATE, 0, R_NEXT_STATE, 0);
1997           EMIT_OP1(SLJIT_MOV, R_NEXT_STATE, 0, R_TEMP, 0);
1998 
1999           /* Checking whether the best case needs to be updated. */
2000           if (!(compiler_common.flags & REGEX_MATCH_END)) {
2001                     EMIT_CMP(end_check_jump, SLJIT_NOT_EQUAL, SLJIT_MEM1(R_CURR_STATE), TERM_REL_OFFSET_OF(0, 1), SLJIT_IMM, -1);
2002                     EMIT_LABEL(end_check_label);
2003           }
2004           EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), TERM_REL_OFFSET_OF(0, 1), SLJIT_IMM, -1);
2005           EMIT_OP2(SLJIT_ADD, R_CURR_INDEX, 0, R_CURR_INDEX, 0, SLJIT_IMM, 1);
2006 
2007           /* Checking whether best case has already found. */
2008           if (!(compiler_common.flags & REGEX_MATCH_END) || (compiler_common.flags & REGEX_MATCH_BEGIN)) {
2009                     if (!(compiler_common.flags & REGEX_MATCH_BEGIN)) {
2010                               /* We can bail out if no more active states remain and R_BEST_BEGIN != -1. */
2011                               EMIT_CMP(best_match_check_jump, SLJIT_NOT_EQUAL, R_BEST_BEGIN, 0, SLJIT_IMM, -1);
2012                     }
2013                     else {
2014                               /* We can bail out if no more active states remain (regardless of R_BEST_BEGIN). */
2015                               EMIT_CMP(best_match_check_jump, SLJIT_EQUAL, R_NEXT_HEAD, 0, SLJIT_IMM, 0);
2016                     }
2017           }
2018 
2019           EMIT_LABEL(start_label);
2020           sljit_set_label(jump, start_label);
2021 
2022           if (!(compiler_common.flags & REGEX_MATCH_BEGIN) && suggest_fast_forward) {
2023                     EMIT_CMP(fast_forward_jump, SLJIT_NOT_EQUAL, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_forward), SLJIT_IMM, 0);
2024           }
2025 
2026           /* Loading the next character. */
2027           EMIT_OP2(SLJIT_SUB | SLJIT_SET_Z, R_LENGTH, 0, R_LENGTH, 0, SLJIT_IMM, 1);
2028           EMIT_JUMP(length_is_zero_jump, SLJIT_EQUAL);
2029 
2030           EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_STRING, 0);
2031 #ifdef REGEX_USE_8BIT_CHARS
2032           EMIT_OP1(SLJIT_MOV_U8, R_CURR_CHAR, 0, SLJIT_MEM1(R_TEMP), 0);
2033           EMIT_OP2(SLJIT_ADD, R_TEMP, 0, R_TEMP, 0, SLJIT_IMM, 1);
2034 #else
2035           EMIT_OP1(SLJIT_MOV_UH, R_CURR_CHAR, 0, SLJIT_MEM1(R_TEMP), 0);
2036           EMIT_OP2(SLJIT_ADD, R_TEMP, 0, R_TEMP, 0, SLJIT_IMM, 2);
2037 #endif
2038           EMIT_OP1(SLJIT_MOV, R_STRING, 0, R_TEMP, 0);
2039 
2040 #ifdef REGEX_MATCH_VERBOSE
2041           if (compiler_common.flags & REGEX_MATCH_VERBOSE) {
2042                     printf("(%3d): ", 0);
2043                     CHECK(trace_transitions(0, &compiler_common));
2044                     while (compiler_common.stack.count > 0) {
2045                               ind = stack_pop(&compiler_common.stack)->value;
2046                               if (compiler_common.search_states[ind].type >= 0)
2047                                         printf("-> (%3d:%3d) ", compiler_common.search_states[ind].type, compiler_common.search_states[ind].value);
2048                               compiler_common.search_states[ind].value = -1;
2049                     }
2050                     printf("\n");
2051           }
2052 #endif
2053 
2054           EMIT_LABEL(fast_forward_return_label);
2055           if (!(compiler_common.flags & REGEX_MATCH_BEGIN)) {
2056                     EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_forward), SLJIT_IMM, 1);
2057                     if (!(compiler_common.flags & REGEX_MATCH_END)) {
2058                               EMIT_CMP(jump, SLJIT_NOT_EQUAL, R_BEST_BEGIN, 0, SLJIT_IMM, -1);
2059                     }
2060 
2061                     EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_CURR_INDEX, 0);
2062                     CHECK(compile_uncond_tran(&compiler_common, R_NEXT_STATE));
2063                     /* And branching to the first state. */
2064                     CHECK(sljit_emit_ijump(compiler_common.compiler, SLJIT_JUMP, SLJIT_MEM2(R_CURR_STATE, R_TEMP), 0));
2065 
2066                     if (!(compiler_common.flags & REGEX_MATCH_END)) {
2067                               EMIT_LABEL(label);
2068                               sljit_set_label(jump, label);
2069                     }
2070           }
2071           /* This is the case where we only have to reset the R_NEXT_HEAD. */
2072           EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_NEXT_HEAD, 0);
2073           EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, 0);
2074           CHECK(sljit_emit_ijump(compiler_common.compiler, SLJIT_JUMP, SLJIT_MEM2(R_CURR_STATE, R_TEMP), 0));
2075 
2076           /* Fast-forward loop. */
2077           if (fast_forward_jump) {
2078                     /* Quit from fast-forward loop. */
2079                     EMIT_LABEL(fast_forward_label);
2080                     EMIT_OP2(SLJIT_SUB, R_TEMP, 0, R_NEXT_HEAD, 0, SLJIT_IMM, 1);
2081                     EMIT_OP1(SLJIT_MOV, R_LENGTH, 0, R_NEXT_STATE, 0);
2082                     EMIT_OP1(SLJIT_MOV, R_STRING, 0, R_CURR_STATE, 0);
2083                     EMIT_OP1(SLJIT_MOV, R_CURR_INDEX, 0, R_NEXT_HEAD, 0);
2084                     EMIT_OP1(SLJIT_MOV, R_NEXT_STATE, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, next));
2085                     EMIT_OP1(SLJIT_MOV, R_CURR_STATE, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, current));
2086                     EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, head));
2087 
2088                     /* Update the start field of the locations. */
2089                     CHECK(trace_transitions(0, &compiler_common));
2090                     while (compiler_common.stack.count > 0) {
2091                               ind = stack_pop(&compiler_common.stack)->value;
2092                               if (compiler_common.search_states[ind].type >= 0) {
2093                                         EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(compiler_common.search_states[ind].type, 2), R_TEMP, 0);
2094                               }
2095                               compiler_common.search_states[ind].value = -1;
2096                     }
2097                     EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_forward), SLJIT_IMM, 0);
2098                     EMIT_JUMP(jump, SLJIT_JUMP);
2099                     sljit_set_label(jump, fast_forward_return_label);
2100 
2101                     /* Start fast-forward. */
2102                     EMIT_LABEL(label);
2103                     sljit_set_label(fast_forward_jump, label);
2104 
2105                     /* Moving everything to registers. */
2106                     EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, next), R_NEXT_STATE, 0);
2107                     EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, current), R_CURR_STATE, 0);
2108                     EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, head), R_NEXT_HEAD, 0);
2109                     EMIT_OP1(SLJIT_MOV, R_NEXT_STATE, 0, R_LENGTH, 0);
2110                     EMIT_OP1(SLJIT_MOV, R_CURR_STATE, 0, R_STRING, 0);
2111                     EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, R_CURR_INDEX, 0);
2112 
2113                     /* Fast forward mainloop. */
2114                     EMIT_LABEL(label);
2115                     EMIT_OP2(SLJIT_SUB | SLJIT_SET_Z, R_NEXT_STATE, 0, R_NEXT_STATE, 0, SLJIT_IMM, 1);
2116                     EMIT_JUMP(fast_forward_jump, SLJIT_EQUAL);
2117 
2118 #ifdef REGEX_USE_8BIT_CHARS
2119                     EMIT_OP1(SLJIT_MOV_U8, R_CURR_CHAR, 0, SLJIT_MEM1(R_CURR_STATE), 0);
2120                     EMIT_OP2(SLJIT_ADD, R_CURR_STATE, 0, R_CURR_STATE, 0, SLJIT_IMM, 1);
2121 #else
2122                     EMIT_OP1(SLJIT_MOV_UH, R_CURR_CHAR, 0, SLJIT_MEM1(R_CURR_STATE), 0);
2123                     EMIT_OP2(SLJIT_ADD, R_CURR_STATE, 0, R_CURR_STATE, 0, SLJIT_IMM, 2);
2124 #endif
2125 
2126                     CHECK(trace_transitions(0, &compiler_common));
2127                     CHECK(compile_leave_fast_forward(&compiler_common, fast_forward_label));
2128 
2129                     EMIT_OP2(SLJIT_ADD, R_NEXT_HEAD, 0, R_NEXT_HEAD, 0, SLJIT_IMM, 1);
2130                     EMIT_JUMP(jump, SLJIT_JUMP);
2131                     sljit_set_label(jump, label);
2132 
2133                     /* String is finished. */
2134                     EMIT_LABEL(label);
2135                     sljit_set_label(fast_forward_jump, label);
2136                     EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, index), R_NEXT_HEAD, 0);
2137                     EMIT_JUMP(fast_forward_jump, SLJIT_JUMP);
2138           }
2139 
2140           /* End check. */
2141           if (end_check_jump) {
2142                     EMIT_LABEL(label);
2143                     sljit_set_label(end_check_jump, label);
2144 
2145                     if (!(compiler_common.flags & REGEX_MATCH_NON_GREEDY) || !(compiler_common.flags & REGEX_MATCH_BEGIN)) {
2146                               CHECK(compile_end_check(&compiler_common, end_check_label));
2147                     }
2148                     else {
2149                               /* Since we leave, we do not need to update the R_BEST_BEGIN. */
2150                               EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_begin), SLJIT_IMM, 0);
2151                               EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_end), R_CURR_INDEX, 0);
2152                               if (compiler_common.flags & REGEX_ID_CHECK) {
2153                                         EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_id), SLJIT_MEM1(R_CURR_STATE), TERM_REL_OFFSET_OF(0, 2));
2154                               }
2155                               EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_quit), SLJIT_IMM, 1);
2156                               EMIT_JUMP(non_greedy_end_jump, SLJIT_JUMP);
2157                     }
2158           }
2159 
2160           /* Finish check. */
2161           if (best_match_check_jump) {
2162                     EMIT_LABEL(label);
2163                     sljit_set_label(best_match_check_jump, label);
2164 
2165                     if (!(compiler_common.flags & REGEX_MATCH_BEGIN)) {
2166                               EMIT_CMP(jump, SLJIT_NOT_EQUAL, R_NEXT_HEAD, 0, SLJIT_IMM, 0);
2167                               sljit_set_label(jump, start_label);
2168                     }
2169                     EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_quit), SLJIT_IMM, 1);
2170           }
2171 
2172           /* Leaving matching and storing the necessary values. */
2173           EMIT_LABEL(label);
2174           sljit_set_label(length_is_zero_jump, label);
2175           if (non_greedy_end_jump)
2176                     sljit_set_label(non_greedy_end_jump, label);
2177 
2178           EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, index), R_CURR_INDEX, 0);
2179           EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, head), R_NEXT_HEAD, 0);
2180           EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, next), R_NEXT_STATE, 0);
2181           EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, current), R_CURR_STATE, 0);
2182 
2183           /* Exit from JIT. */
2184           EMIT_LABEL(label);
2185           sljit_set_label(best_match_found_jump, label);
2186           if (fast_forward_jump)
2187                     sljit_set_label(fast_forward_jump, label);
2188           CHECK(sljit_emit_return(compiler_common.compiler, SLJIT_UNUSED, 0, 0));
2189 
2190           for (ind = 1; ind < compiler_common.dfa_size - 1; ind++) {
2191                     if (compiler_common.search_states[ind].type >= 0) {
2192                               SLJIT_ASSERT(compiler_common.search_states[ind].type < compiler_common.terms_size);
2193                               EMIT_LABEL(label);
2194                               compiler_common.machine->entry_addrs[compiler_common.search_states[ind].type] = (sljit_uw)label;
2195 
2196                               if (compiler_common.dfa_transitions[ind].type == type_char) {
2197                                         EMIT_CMP(jump, SLJIT_NOT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, compiler_common.dfa_transitions[ind].value);
2198                               }
2199                               else if (compiler_common.dfa_transitions[ind].type == type_rng_start) {
2200                                         ind = compile_range_check(&compiler_common, ind);
2201                                         CHECK(!ind);
2202                               }
2203                               else {
2204                                         SLJIT_ASSERT(compiler_common.dfa_transitions[ind].type == type_newline);
2205                                         CHECK(compile_newline_check(&compiler_common, ind));
2206                               }
2207 
2208                               CHECK(trace_transitions(ind, &compiler_common));
2209 #ifdef REGEX_MATCH_VERBOSE
2210                               if (compiler_common.flags & REGEX_MATCH_VERBOSE)
2211                                         printf("(%3d): ", compiler_common.search_states[ind].type);
2212 #endif
2213                               CHECK(compile_cond_tran(&compiler_common, compiler_common.search_states[ind].type));
2214 
2215                               if (compiler_common.dfa_transitions[ind].type == type_char) {
2216                                         EMIT_LABEL(label);
2217                                         sljit_set_label(jump, label);
2218                               }
2219                               else if (compiler_common.dfa_transitions[ind].type == type_rng_end) {
2220                                         EMIT_LABEL(label);
2221                                         range_set_label(compiler_common.range_jump_list, label);
2222                               }
2223                               else {
2224                                         SLJIT_ASSERT(compiler_common.dfa_transitions[ind].type == type_newline);
2225                               }
2226 
2227                               /* Branch to the next item in the list. */
2228                               EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(compiler_common.search_states[ind].type, 1));
2229                               EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(compiler_common.search_states[ind].type, 1), SLJIT_IMM, -1);
2230                               CHECK(sljit_emit_ijump(compiler_common.compiler, SLJIT_JUMP, SLJIT_MEM2(R_CURR_STATE, R_TEMP), 0));
2231                     }
2232           }
2233 
2234           if (ind == compiler_common.dfa_size - 1) {
2235                     /* Generate an init stub function. */
2236                     EMIT_LABEL(label);
2237                     CHECK(sljit_emit_enter(compiler_common.compiler, 0, 2, 3, 3, 0, 0, 0));
2238 
2239                     if (empty_match_id == -1) {
2240                               EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_S1), SLJIT_OFFSETOF(struct regex_match, best_begin), SLJIT_IMM, -1);
2241                               EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_S1), SLJIT_OFFSETOF(struct regex_match, best_id), SLJIT_IMM, 0);
2242                     }
2243                     else {
2244                               EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_S1), SLJIT_OFFSETOF(struct regex_match, best_begin), SLJIT_IMM, 0);
2245                               EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_S1), SLJIT_OFFSETOF(struct regex_match, best_id), SLJIT_IMM, empty_match_id);
2246                     }
2247 
2248                     EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_S1), SLJIT_OFFSETOF(struct regex_match, index), SLJIT_IMM, !(compiler_common.flags & REGEX_FAKE_MATCH_BEGIN) ? 1 : 2);
2249 
2250                     if (!(compiler_common.flags & REGEX_MATCH_NON_GREEDY) || empty_match_id == -1) {
2251                               /* The else is a really rare event, so we still generate an empty function instead of a runtime pointer check. */
2252                               SLJIT_ASSERT(R_CURR_STATE == SLJIT_S0);
2253                               if (!(compiler_common.flags & REGEX_MATCH_BEGIN)) {
2254                                         /* R_CURR_INDEX (put to R_TEMP) is zero. */
2255                                         EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_IMM, 0);
2256                               }
2257                               CHECK(compile_uncond_tran(&compiler_common, R_CURR_STATE));
2258                     }
2259                     else {
2260                               EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, 0);
2261                     }
2262                     CHECK(sljit_emit_return(compiler_common.compiler, SLJIT_MOV, R_NEXT_HEAD, 0));
2263 
2264                     compiler_common.machine->continue_match = sljit_generate_code(compiler_common.compiler);
2265 #ifndef SLJIT_INDIRECT_CALL
2266                     compiler_common.machine->u.init_match = (void*)(sljit_sw)sljit_get_label_addr(label);
2267 #else
2268                     sljit_set_function_context(&compiler_common.machine->u.init_match, &compiler_common.machine->context, sljit_get_label_addr(label), regex_compile);
2269 #endif
2270 #ifdef REGEX_MATCH_VERBOSE
2271                     if (compiler_common.flags & REGEX_MATCH_VERBOSE)
2272                               printf("Continue match: %p Init match: %p\n\n", compiler_common.machine->continue_match, compiler_common.machine->u.init_match);
2273 #endif
2274                     if (compiler_common.machine->continue_match) {
2275                               for (ind = 0; ind < compiler_common.terms_size; ++ind)
2276                                         compiler_common.machine->entry_addrs[ind] = sljit_get_label_addr((struct sljit_label*)compiler_common.machine->entry_addrs[ind]);
2277                               done = 1;
2278                     }
2279           }
2280           END_GUARD
2281 
2282           stack_destroy(&compiler_common.stack);
2283           stack_destroy(&compiler_common.depth);
2284           SLJIT_FREE(compiler_common.dfa_transitions, NULL);
2285           SLJIT_FREE(compiler_common.search_states, NULL);
2286           if (compiler_common.range_jump_list)
2287                     SLJIT_FREE(compiler_common.range_jump_list, NULL);
2288           if (compiler_common.compiler)
2289                     sljit_free_compiler(compiler_common.compiler);
2290           if (done)
2291                     return compiler_common.machine;
2292 
2293           if (compiler_common.machine) {
2294                     SLJIT_FREE(compiler_common.machine, NULL);
2295           }
2296           if (error)
2297                     *error = REGEX_MEMORY_ERROR;
2298           return NULL;
2299 }
2300 
2301 #undef TERM_OFFSET_OF
2302 #undef EMIT_OP1
2303 #undef EMIT_OP2
2304 #undef EMIT_LABEL
2305 #undef EMIT_JUMP
2306 #undef EMIT_CMP
2307 #undef BEGIN_GUARD
2308 #undef END_GUARD
2309 #undef CHECK
2310 
regex_free_machine(struct regex_machine * machine)2311 void regex_free_machine(struct regex_machine *machine)
2312 {
2313           sljit_free_code(machine->continue_match);
2314           SLJIT_FREE(machine, NULL);
2315 }
2316 
regex_get_platform_name(void)2317 const char* regex_get_platform_name(void)
2318 {
2319           return sljit_get_platform_name();
2320 }
2321 
2322 /* --------------------------------------------------------------------- */
2323 /*  Mathching utilities                                                  */
2324 /* --------------------------------------------------------------------- */
2325 
regex_begin_match(struct regex_machine * machine)2326 struct regex_match* regex_begin_match(struct regex_machine *machine)
2327 {
2328           sljit_sw *ptr1;
2329           sljit_sw *ptr2;
2330           sljit_sw *end;
2331           sljit_sw *entry_addrs;
2332 
2333           struct regex_match *match = (struct regex_match*)SLJIT_MALLOC(sizeof(struct regex_match) + (machine->size * 2 - 1) * sizeof(sljit_sw), NULL);
2334           if (!match)
2335                     return NULL;
2336 
2337           ptr1 = match->states;
2338           ptr2 = match->states + machine->size;
2339           end = ptr2;
2340           entry_addrs = (sljit_sw*)machine->entry_addrs;
2341 
2342           match->current = ptr1;
2343           match->next = ptr2;
2344           match->head = 0;
2345           match->machine = machine;
2346 
2347           /* Init machine states. */
2348           switch (machine->no_states) {
2349           case 2:
2350                     while (ptr1 < end) {
2351                               *ptr1++ = *entry_addrs;
2352                               *ptr2++ = *entry_addrs++;
2353                               *ptr1++ = -1;
2354                               *ptr2++ = -1;
2355                     }
2356                     break;
2357 
2358           case 3:
2359                     while (ptr1 < end) {
2360                               *ptr1++ = *entry_addrs;
2361                               *ptr2++ = *entry_addrs++;
2362                               *ptr1++ = -1;
2363                               *ptr2++ = -1;
2364                               *ptr1++ = 0;
2365                               *ptr2++ = 0;
2366                     }
2367                     break;
2368 
2369           case 4:
2370                     while (ptr1 < end) {
2371                               *ptr1++ = *entry_addrs;
2372                               *ptr2++ = *entry_addrs++;
2373                               *ptr1++ = -1;
2374                               *ptr2++ = -1;
2375                               *ptr1++ = 0;
2376                               *ptr2++ = 0;
2377                               *ptr1++ = 0;
2378                               *ptr2++ = 0;
2379                     }
2380                     break;
2381 
2382           default:
2383                     SLJIT_UNREACHABLE();
2384                     break;
2385           }
2386 
2387           SLJIT_ASSERT(ptr1 == end);
2388 
2389           match->u.continue_match = machine->continue_match;
2390 
2391           regex_reset_match(match);
2392           return match;
2393 }
2394 
regex_reset_match(struct regex_match * match)2395 void regex_reset_match(struct regex_match *match)
2396 {
2397           struct regex_machine *machine = match->machine;
2398           sljit_sw current, ind;
2399           sljit_sw *current_ptr;
2400 
2401           match->best_end = 0;
2402           match->fast_quit = 0;
2403           match->fast_forward = 0;
2404 
2405           if (match->head != 0) {
2406                     /* Clear the current state. */
2407                     current = match->head;
2408                     current_ptr = match->current;
2409                     do {
2410                               ind = (current / sizeof(sljit_sw)) + 1;
2411                               current = current_ptr[ind];
2412                               current_ptr[ind] = -1;
2413                     } while (current != 0);
2414           }
2415           match->head = machine->u.call_init(match->current, match);
2416 }
2417 
regex_free_match(struct regex_match * match)2418 void regex_free_match(struct regex_match *match)
2419 {
2420           SLJIT_FREE(match, NULL);
2421 }
2422 
regex_continue_match(struct regex_match * match,const regex_char_t * input_string,int length)2423 void regex_continue_match(struct regex_match *match, const regex_char_t *input_string, int length)
2424 {
2425           match->u.call_continue(match, input_string, length);
2426 }
2427 
regex_get_result(struct regex_match * match,int * end,int * id)2428 int regex_get_result(struct regex_match *match, int *end, int *id)
2429 {
2430           int flags = match->machine->flags;
2431           sljit_sw no_states;
2432 
2433           *end = match->best_end;
2434           *id = match->best_id;
2435           if (!(flags & (REGEX_MATCH_END | REGEX_FAKE_MATCH_END)))
2436                     return match->best_begin;
2437 
2438           if (flags & REGEX_FAKE_MATCH_END) {
2439                     SLJIT_ASSERT(!(flags & (REGEX_MATCH_BEGIN | REGEX_MATCH_END)));
2440                     if (match->best_begin != -1)
2441                               return match->best_begin;
2442 
2443                     no_states = match->machine->no_states;
2444                     if (match->current[no_states + 1] == -1)
2445                               return -1;
2446                     if (flags & REGEX_ID_CHECK)
2447                               *id = match->current[no_states + 3];
2448                     if (!(flags & REGEX_FAKE_MATCH_BEGIN))
2449                               *end = match->index - 1;
2450                     else
2451                               *end = match->index - 2;
2452                     return match->current[no_states + 2];
2453           }
2454           else {
2455                     /* Check the status of the last code. */
2456                     if (!(flags & REGEX_MATCH_BEGIN)) {
2457                               /* No shortcut in this case. */
2458                               if (!(flags & REGEX_ID_CHECK)) {
2459                                         if (match->current[1] == -1)
2460                                                   return -1;
2461                                         *end = match->index - 1;
2462                                         return match->current[2];
2463                               }
2464 
2465                               if (match->current[1] == -1)
2466                                         return -1;
2467                               *end = match->index - 1;
2468                               *id = match->current[3];
2469                               return match->current[2];
2470                     }
2471 
2472                     /* Shortcut is possible in this case. */
2473                     if (!(flags & REGEX_ID_CHECK)) {
2474                               if (match->current[1] == -1 || match->head == -1)
2475                                         return -1;
2476                               *end = match->index - 1;
2477                               return 0;
2478                     }
2479 
2480                     if (match->current[1] == -1 || match->head == -1)
2481                               return -1;
2482                     *end = match->index - 1;
2483                     *id = match->current[2];
2484                     return 0;
2485           }
2486 }
2487 
regex_is_match_finished(struct regex_match * match)2488 int regex_is_match_finished(struct regex_match *match)
2489 {
2490           return match->fast_quit;
2491 }
2492 
2493 #ifdef REGEX_MATCH_VERBOSE
regex_continue_match_debug(struct regex_match * match,const regex_char_t * input_string,int length)2494 void regex_continue_match_debug(struct regex_match *match, const regex_char_t *input_string, int length)
2495 {
2496           sljit_sw *ptr;
2497           sljit_sw *end;
2498           sljit_sw count;
2499 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
2500           sljit_sw current;
2501 #endif
2502           sljit_sw no_states = match->machine->no_states;
2503           sljit_sw len = match->machine->size;
2504 
2505           while (length > 0) {
2506                     match->u.call_continue(match, input_string, 1);
2507 
2508                     if (match->fast_forward) {
2509                               if (match->machine->flags & REGEX_MATCH_VERBOSE)
2510                                         printf("fast forward\n");
2511                     }
2512 
2513                     /* Verbose (first). */
2514                     if (match->machine->flags & REGEX_MATCH_VERBOSE) {
2515                               ptr = match->current;
2516                               end = ptr + len;
2517                               count = 0;
2518                               printf("'%c' (%3ld->%3ld [%3ld]) ", *input_string, (long)match->best_begin, (long)match->best_end, (long)match->best_id);
2519                               while (ptr < end) {
2520                                         printf("[%3ld:", (long)count++);
2521                                         switch (no_states) {
2522                                         case 2:
2523                                                   if (ptr[1] != -1)
2524                                                             printf("+] ");
2525                                                   else
2526                                                             printf(" ] ");
2527                                                   break;
2528 
2529                                         case 3:
2530                                                   if (ptr[1] != -1)
2531                                                             printf("+,%3ld] ", (long)ptr[2]);
2532                                                   else
2533                                                             printf(" ,XXX] ");
2534                                                   break;
2535 
2536                                         case 4:
2537                                                   if (ptr[1] != -1)
2538                                                             printf("+,%3ld,%3ld] ", (long)ptr[2], (long)ptr[3]);
2539                                                   else
2540                                                             printf(" ,XXX,XXX] ");
2541                                                   break;
2542                                         }
2543                                         ptr += no_states;
2544                               }
2545                               printf("\n");
2546                     }
2547 
2548 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
2549                     /* Sanity check (later). */
2550                     ptr = match->next;
2551                     end = ptr + len;
2552                     while (ptr < end) {
2553                               SLJIT_ASSERT(ptr[1] == -1);
2554                               ptr += no_states;
2555                     }
2556 
2557                     /* Check number of active elements. */
2558                     ptr = match->current + no_states;
2559                     end = ptr + len - no_states;
2560                     count = 0;
2561                     while (ptr < end) {
2562                               if (ptr[1] != -1)
2563                                         count++;
2564                               ptr += no_states;
2565                     }
2566 
2567                     /* Check chain list. */
2568                     current = match->head;
2569                     ptr = match->current;
2570                     while (current != 0) {
2571                               SLJIT_ASSERT(current >= 0 && current < len * sizeof(sljit_sw));
2572                               SLJIT_ASSERT((current % (no_states * sizeof(sljit_sw))) == 0);
2573                               SLJIT_ASSERT(count > 0);
2574                               current = ptr[(current / sizeof(sljit_sw)) + 1];
2575                               count--;
2576                     }
2577                     SLJIT_ASSERT(count == 0);
2578 #endif
2579 
2580                     if (match->fast_quit) {
2581                               /* the machine has stopped working. */
2582                               if (match->machine->flags & REGEX_MATCH_VERBOSE)
2583                                         printf("Best match has found\n");
2584                               break;
2585                     }
2586 
2587                     input_string++;
2588                     length--;
2589           }
2590 }
2591 #endif
2592