1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
9 any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License along
17 with this program; if not, write to the Free Software Foundation,
18 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
19
20 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
21 Idx length, reg_syntax_t syntax);
22 static void re_compile_fastmap_iter (regex_t *bufp,
23 const re_dfastate_t *init_state,
24 char *fastmap);
25 static reg_errcode_t init_dfa (re_dfa_t *dfa, Idx pat_len);
26 #ifdef RE_ENABLE_I18N
27 static void free_charset (re_charset_t *cset);
28 #endif /* RE_ENABLE_I18N */
29 static void free_workarea_compile (regex_t *preg);
30 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
31 #ifdef RE_ENABLE_I18N
32 static void optimize_utf8 (re_dfa_t *dfa);
33 #endif
34 static reg_errcode_t analyze (regex_t *preg);
35 static reg_errcode_t preorder (bin_tree_t *root,
36 reg_errcode_t (fn (void *, bin_tree_t *)),
37 void *extra);
38 static reg_errcode_t postorder (bin_tree_t *root,
39 reg_errcode_t (fn (void *, bin_tree_t *)),
40 void *extra);
41 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
42 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
43 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
44 bin_tree_t *node);
45 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
46 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
47 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
48 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
49 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
50 unsigned int constraint);
51 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
52 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
53 Idx node, bool root);
54 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
55 static Idx fetch_number (re_string_t *input, re_token_t *token,
56 reg_syntax_t syntax);
57 static int peek_token (re_token_t *token, re_string_t *input,
58 reg_syntax_t syntax);
59 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
60 reg_syntax_t syntax, reg_errcode_t *err);
61 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
62 re_token_t *token, reg_syntax_t syntax,
63 Idx nest, reg_errcode_t *err);
64 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
65 re_token_t *token, reg_syntax_t syntax,
66 Idx nest, reg_errcode_t *err);
67 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
68 re_token_t *token, reg_syntax_t syntax,
69 Idx nest, reg_errcode_t *err);
70 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
71 re_token_t *token, reg_syntax_t syntax,
72 Idx nest, reg_errcode_t *err);
73 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
74 re_dfa_t *dfa, re_token_t *token,
75 reg_syntax_t syntax, reg_errcode_t *err);
76 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
77 re_token_t *token, reg_syntax_t syntax,
78 reg_errcode_t *err);
79 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
80 re_string_t *regexp,
81 re_token_t *token, int token_len,
82 re_dfa_t *dfa,
83 reg_syntax_t syntax,
84 bool accept_hyphen);
85 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
86 re_string_t *regexp,
87 re_token_t *token);
88 #ifdef RE_ENABLE_I18N
89 static reg_errcode_t build_equiv_class (bitset sbcset,
90 re_charset_t *mbcset,
91 Idx *equiv_class_alloc,
92 const unsigned char *name);
93 static reg_errcode_t build_charclass (unsigned REG_TRANSLATE_TYPE trans,
94 bitset sbcset,
95 re_charset_t *mbcset,
96 Idx *char_class_alloc,
97 const unsigned char *class_name,
98 reg_syntax_t syntax);
99 #else /* not RE_ENABLE_I18N */
100 static reg_errcode_t build_equiv_class (bitset sbcset,
101 const unsigned char *name);
102 static reg_errcode_t build_charclass (unsigned REG_TRANSLATE_TYPE trans,
103 bitset sbcset,
104 const unsigned char *class_name,
105 reg_syntax_t syntax);
106 #endif /* not RE_ENABLE_I18N */
107 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
108 unsigned REG_TRANSLATE_TYPE trans,
109 const unsigned char *class_name,
110 const unsigned char *extra,
111 bool non_match, reg_errcode_t *err);
112 static bin_tree_t *create_tree (re_dfa_t *dfa,
113 bin_tree_t *left, bin_tree_t *right,
114 re_token_type_t type);
115 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
116 bin_tree_t *left, bin_tree_t *right,
117 const re_token_t *token);
118 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
119 static void free_token (re_token_t *node);
120 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
121 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
122
123 /* This table gives an error message for each of the error codes listed
124 in regex.h. Obviously the order here has to be same as there.
125 POSIX doesn't require that we do anything for REG_NOERROR,
126 but why not be nice? */
127
128 const char __re_error_msgid[] attribute_hidden =
129 {
130 #define REG_NOERROR_IDX 0
131 gettext_noop ("Success") /* REG_NOERROR */
132 "\0"
133 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
134 gettext_noop ("No match") /* REG_NOMATCH */
135 "\0"
136 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
137 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
138 "\0"
139 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
140 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
141 "\0"
142 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
143 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
144 "\0"
145 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
146 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
147 "\0"
148 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
149 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
150 "\0"
151 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
152 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
153 "\0"
154 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
155 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
156 "\0"
157 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
158 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
159 "\0"
160 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
161 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
162 "\0"
163 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
164 gettext_noop ("Invalid range end") /* REG_ERANGE */
165 "\0"
166 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
167 gettext_noop ("Memory exhausted") /* REG_ESPACE */
168 "\0"
169 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
170 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
171 "\0"
172 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
173 gettext_noop ("Premature end of regular expression") /* REG_EEND */
174 "\0"
175 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
176 gettext_noop ("Regular expression too big") /* REG_ESIZE */
177 "\0"
178 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
179 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
180 };
181
182 const size_t __re_error_msgid_idx[] attribute_hidden =
183 {
184 REG_NOERROR_IDX,
185 REG_NOMATCH_IDX,
186 REG_BADPAT_IDX,
187 REG_ECOLLATE_IDX,
188 REG_ECTYPE_IDX,
189 REG_EESCAPE_IDX,
190 REG_ESUBREG_IDX,
191 REG_EBRACK_IDX,
192 REG_EPAREN_IDX,
193 REG_EBRACE_IDX,
194 REG_BADBR_IDX,
195 REG_ERANGE_IDX,
196 REG_ESPACE_IDX,
197 REG_BADRPT_IDX,
198 REG_EEND_IDX,
199 REG_ESIZE_IDX,
200 REG_ERPAREN_IDX
201 };
202
203 /* Entry points for GNU code. */
204
205 /* re_compile_pattern is the GNU regular expression compiler: it
206 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
207 Returns 0 if the pattern was valid, otherwise an error string.
208
209 Assumes the `re_allocated' (and perhaps `re_buffer') and `translate' fields
210 are set in BUFP on entry. */
211
212 const char *
re_compile_pattern(const char * pattern,size_t length,struct re_pattern_buffer * bufp)213 re_compile_pattern (const char *pattern, size_t length,
214 struct re_pattern_buffer *bufp)
215 {
216 reg_errcode_t ret;
217
218 /* And GNU code determines whether or not to get register information
219 by passing null for the REGS argument to re_match, etc., not by
220 setting re_no_sub, unless REG_NO_SUB is set. */
221 bufp->re_no_sub = !!(re_syntax_options & REG_NO_SUB);
222
223 /* Match anchors at newline. */
224 bufp->re_newline_anchor = 1;
225
226 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
227
228 if (!ret)
229 return NULL;
230 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
231 }
232 #ifdef _LIBC
weak_alias(__re_compile_pattern,re_compile_pattern)233 weak_alias (__re_compile_pattern, re_compile_pattern)
234 #endif
235
236 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
237 also be assigned to arbitrarily: each pattern buffer stores its own
238 syntax, so it can be changed between regex compilations. */
239 /* This has no initializer because initialized variables in Emacs
240 become read-only after dumping. */
241 reg_syntax_t re_syntax_options;
242
243
244 /* Specify the precise syntax of regexps for compilation. This provides
245 for compatibility for various utilities which historically have
246 different, incompatible syntaxes.
247
248 The argument SYNTAX is a bit mask comprised of the various bits
249 defined in regex.h. We return the old syntax. */
250
251 reg_syntax_t
252 re_set_syntax (reg_syntax_t syntax)
253 {
254 reg_syntax_t ret = re_syntax_options;
255
256 re_syntax_options = syntax;
257 return ret;
258 }
259 #ifdef _LIBC
weak_alias(__re_set_syntax,re_set_syntax)260 weak_alias (__re_set_syntax, re_set_syntax)
261 #endif
262
263 int
264 re_compile_fastmap (struct re_pattern_buffer *bufp)
265 {
266 re_dfa_t *dfa = (re_dfa_t *) bufp->re_buffer;
267 char *fastmap = bufp->re_fastmap;
268
269 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
270 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
271 if (dfa->init_state != dfa->init_state_word)
272 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
273 if (dfa->init_state != dfa->init_state_nl)
274 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
275 if (dfa->init_state != dfa->init_state_begbuf)
276 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
277 bufp->re_fastmap_accurate = 1;
278 return 0;
279 }
280 #ifdef _LIBC
weak_alias(__re_compile_fastmap,re_compile_fastmap)281 weak_alias (__re_compile_fastmap, re_compile_fastmap)
282 #endif
283
284 static inline void
285 __attribute ((always_inline))
286 re_set_fastmap (char *fastmap, bool icase, int ch)
287 {
288 fastmap[ch] = 1;
289 if (icase)
290 fastmap[tolower (ch)] = 1;
291 }
292
293 /* Helper function for re_compile_fastmap.
294 Compile fastmap for the initial_state INIT_STATE. */
295
296 static void
re_compile_fastmap_iter(regex_t * bufp,const re_dfastate_t * init_state,char * fastmap)297 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
298 char *fastmap)
299 {
300 re_dfa_t *dfa = (re_dfa_t *) bufp->re_buffer;
301 Idx node_cnt;
302 bool icase = (dfa->mb_cur_max == 1 && (bufp->re_syntax & REG_IGNORE_CASE));
303 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
304 {
305 Idx node = init_state->nodes.elems[node_cnt];
306 re_token_type_t type = dfa->nodes[node].type;
307
308 if (type == CHARACTER)
309 {
310 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
311 #ifdef RE_ENABLE_I18N
312 if ((bufp->re_syntax & REG_IGNORE_CASE) && dfa->mb_cur_max > 1)
313 {
314 unsigned char buf[MB_LEN_MAX];
315 unsigned char *p;
316 wchar_t wc;
317 mbstate_t state;
318
319 p = buf;
320 *p++ = dfa->nodes[node].opr.c;
321 while (++node < dfa->nodes_len
322 && dfa->nodes[node].type == CHARACTER
323 && dfa->nodes[node].mb_partial)
324 *p++ = dfa->nodes[node].opr.c;
325 memset (&state, 0, sizeof (state));
326 if (mbrtowc (&wc, (const char *) buf, p - buf,
327 &state) == p - buf
328 && (__wcrtomb ((char *) buf, towlower (wc), &state)
329 != (size_t) -1))
330 re_set_fastmap (fastmap, false, buf[0]);
331 }
332 #endif
333 }
334 else if (type == SIMPLE_BRACKET)
335 {
336 int i, j, ch;
337 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
338 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
339 if (dfa->nodes[node].opr.sbcset[i] & ((bitset_word) 1 << j))
340 re_set_fastmap (fastmap, icase, ch);
341 }
342 #ifdef RE_ENABLE_I18N
343 else if (type == COMPLEX_BRACKET)
344 {
345 Idx i;
346 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
347 if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
348 || cset->nranges || cset->nchar_classes)
349 {
350 # ifdef _LIBC
351 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
352 {
353 /* In this case we want to catch the bytes which are
354 the first byte of any collation elements.
355 e.g. In da_DK, we want to catch 'a' since "aa"
356 is a valid collation element, and don't catch
357 'b' since 'b' is the only collation element
358 which starts from 'b'. */
359 const int32_t *table = (const int32_t *)
360 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
361 for (i = 0; i < SBC_MAX; ++i)
362 if (table[i] < 0)
363 re_set_fastmap (fastmap, icase, i);
364 }
365 # else
366 if (dfa->mb_cur_max > 1)
367 for (i = 0; i < SBC_MAX; ++i)
368 if (__btowc (i) == WEOF)
369 re_set_fastmap (fastmap, icase, i);
370 # endif /* not _LIBC */
371 }
372 for (i = 0; i < cset->nmbchars; ++i)
373 {
374 char buf[256];
375 mbstate_t state;
376 memset (&state, '\0', sizeof (state));
377 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
378 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
379 if ((bufp->re_syntax & REG_IGNORE_CASE) && dfa->mb_cur_max > 1)
380 {
381 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
382 != (size_t) -1)
383 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
384 }
385 }
386 }
387 #endif /* RE_ENABLE_I18N */
388 else if (type == OP_PERIOD
389 #ifdef RE_ENABLE_I18N
390 || type == OP_UTF8_PERIOD
391 #endif /* RE_ENABLE_I18N */
392 || type == END_OF_RE)
393 {
394 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
395 if (type == END_OF_RE)
396 bufp->re_can_be_null = 1;
397 return;
398 }
399 }
400 }
401
402 /* Entry point for POSIX code. */
403 /* regcomp takes a regular expression as a string and compiles it.
404
405 PREG is a regex_t *. We do not expect any fields to be initialized,
406 since POSIX says we shouldn't. Thus, we set
407
408 `re_buffer' to the compiled pattern;
409 `re_used' to the length of the compiled pattern;
410 `re_syntax' to REG_SYNTAX_POSIX_EXTENDED if the
411 REG_EXTENDED bit in CFLAGS is set; otherwise, to
412 REG_SYNTAX_POSIX_BASIC;
413 `re_newline_anchor' to REG_NEWLINE being set in CFLAGS;
414 `re_fastmap' to an allocated space for the fastmap;
415 `re_fastmap_accurate' to zero;
416 `re_nsub' to the number of subexpressions in PATTERN.
417
418 PATTERN is the address of the pattern string.
419
420 CFLAGS is a series of bits which affect compilation.
421
422 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
423 use POSIX basic syntax.
424
425 If REG_NEWLINE is set, then . and [^...] don't match newline.
426 Also, regexec will try a match beginning after every newline.
427
428 If REG_ICASE is set, then we considers upper- and lowercase
429 versions of letters to be equivalent when matching.
430
431 If REG_NOSUB is set, then when PREG is passed to regexec, that
432 routine will report only success or failure, and nothing about the
433 registers.
434
435 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
436 the return codes and their meanings.) */
437
438 int
regcomp(regex_t * __restrict preg,const char * __restrict pattern,int cflags)439 regcomp (regex_t *__restrict preg, const char *__restrict pattern, int cflags)
440 {
441 reg_errcode_t ret;
442 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? REG_SYNTAX_POSIX_EXTENDED
443 : REG_SYNTAX_POSIX_BASIC);
444
445 preg->re_buffer = NULL;
446 preg->re_allocated = 0;
447 preg->re_used = 0;
448
449 /* Try to allocate space for the fastmap. */
450 preg->re_fastmap = re_malloc (char, SBC_MAX);
451 if (BE (preg->re_fastmap == NULL, 0))
452 return REG_ESPACE;
453
454 syntax |= (cflags & REG_ICASE) ? REG_IGNORE_CASE : 0;
455
456 /* If REG_NEWLINE is set, newlines are treated differently. */
457 if (cflags & REG_NEWLINE)
458 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
459 syntax &= ~REG_DOT_NEWLINE;
460 syntax |= REG_HAT_LISTS_NOT_NEWLINE;
461 /* It also changes the matching behavior. */
462 preg->re_newline_anchor = 1;
463 }
464 else
465 preg->re_newline_anchor = 0;
466 preg->re_no_sub = !!(cflags & REG_NOSUB);
467 preg->re_translate = NULL;
468
469 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
470
471 /* POSIX doesn't distinguish between an unmatched open-group and an
472 unmatched close-group: both are REG_EPAREN. */
473 if (ret == REG_ERPAREN)
474 ret = REG_EPAREN;
475
476 /* We have already checked preg->re_fastmap != NULL. */
477 if (BE (ret == REG_NOERROR, 1))
478 /* Compute the fastmap now, since regexec cannot modify the pattern
479 buffer. This function never fails in this implementation. */
480 (void) re_compile_fastmap (preg);
481 else
482 {
483 /* Some error occurred while compiling the expression. */
484 re_free (preg->re_fastmap);
485 preg->re_fastmap = NULL;
486 }
487
488 return (int) ret;
489 }
490 #ifdef _LIBC
weak_alias(__regcomp,regcomp)491 weak_alias (__regcomp, regcomp)
492 #endif
493
494 /* Returns a message corresponding to an error code, ERRCODE, returned
495 from either regcomp or regexec. We don't use PREG here. */
496
497 size_t
498 regerror (int errcode, const regex_t *__restrict preg,
499 char *__restrict errbuf, size_t errbuf_size)
500 {
501 const char *msg;
502 size_t msg_size;
503
504 if (BE (errcode < 0
505 || errcode >= (int) (sizeof (__re_error_msgid_idx)
506 / sizeof (__re_error_msgid_idx[0])), 0))
507 /* Only error codes returned by the rest of the code should be passed
508 to this routine. If we are given anything else, or if other regex
509 code generates an invalid error code, then the program has a bug.
510 Dump core so we can fix it. */
511 abort ();
512
513 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
514
515 msg_size = strlen (msg) + 1; /* Includes the null. */
516
517 if (BE (errbuf_size != 0, 1))
518 {
519 if (BE (msg_size > errbuf_size, 0))
520 {
521 #if defined HAVE_MEMPCPY || defined _LIBC
522 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
523 #else
524 memcpy (errbuf, msg, errbuf_size - 1);
525 errbuf[errbuf_size - 1] = 0;
526 #endif
527 }
528 else
529 memcpy (errbuf, msg, msg_size);
530 }
531
532 return msg_size;
533 }
534 #ifdef _LIBC
535 weak_alias (__regerror, regerror)
536 #endif
537
538
539 #ifdef RE_ENABLE_I18N
540 /* This static array is used for the map to single-byte characters when
541 UTF-8 is used. Otherwise we would allocate memory just to initialize
542 it the same all the time. UTF-8 is the preferred encoding so this is
543 a worthwhile optimization. */
544 static const bitset utf8_sb_map =
545 {
546 /* Set the first 128 bits. */
547 # if 2 < BITSET_WORDS
548 BITSET_WORD_MAX,
549 # endif
550 # if 4 < BITSET_WORDS
551 BITSET_WORD_MAX,
552 # endif
553 # if 6 < BITSET_WORDS
554 BITSET_WORD_MAX,
555 # endif
556 # if 8 < BITSET_WORDS
557 # error "Invalid BITSET_WORDS"
558 # endif
559 (BITSET_WORD_MAX
560 >> (SBC_MAX % BITSET_WORD_BITS == 0
561 ? 0
562 : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
563 };
564 #endif
565
566
567 static void
free_dfa_content(re_dfa_t * dfa)568 free_dfa_content (re_dfa_t *dfa)
569 {
570 Idx i, j;
571
572 if (dfa->nodes)
573 for (i = 0; i < dfa->nodes_len; ++i)
574 free_token (dfa->nodes + i);
575 re_free (dfa->nexts);
576 for (i = 0; i < dfa->nodes_len; ++i)
577 {
578 if (dfa->eclosures != NULL)
579 re_node_set_free (dfa->eclosures + i);
580 if (dfa->inveclosures != NULL)
581 re_node_set_free (dfa->inveclosures + i);
582 if (dfa->edests != NULL)
583 re_node_set_free (dfa->edests + i);
584 }
585 re_free (dfa->edests);
586 re_free (dfa->eclosures);
587 re_free (dfa->inveclosures);
588 re_free (dfa->nodes);
589
590 if (dfa->state_table)
591 for (i = 0; i <= dfa->state_hash_mask; ++i)
592 {
593 struct re_state_table_entry *entry = dfa->state_table + i;
594 for (j = 0; j < entry->num; ++j)
595 {
596 re_dfastate_t *state = entry->array[j];
597 free_state (state);
598 }
599 re_free (entry->array);
600 }
601 re_free (dfa->state_table);
602 #ifdef RE_ENABLE_I18N
603 if (dfa->sb_char != utf8_sb_map)
604 re_free (dfa->sb_char);
605 #endif
606 re_free (dfa->subexp_map);
607 #ifdef DEBUG
608 re_free (dfa->re_str);
609 #endif
610
611 re_free (dfa);
612 }
613
614
615 /* Free dynamically allocated space used by PREG. */
616
617 void
regfree(regex_t * preg)618 regfree (regex_t *preg)
619 {
620 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
621 if (BE (dfa != NULL, 1))
622 free_dfa_content (dfa);
623 preg->re_buffer = NULL;
624 preg->re_allocated = 0;
625
626 re_free (preg->re_fastmap);
627 preg->re_fastmap = NULL;
628
629 re_free (preg->re_translate);
630 preg->re_translate = NULL;
631 }
632 #ifdef _LIBC
633 weak_alias (__regfree, regfree)
634 #endif
635
636 /* Entry points compatible with 4.2 BSD regex library. We don't define
637 them unless specifically requested. */
638
639 #if defined _REGEX_RE_COMP || defined _LIBC
640
641 /* BSD has one and only one pattern buffer. */
642 static struct re_pattern_buffer re_comp_buf;
643
644 char *
645 # ifdef _LIBC
646 /* Make these definitions weak in libc, so POSIX programs can redefine
647 these names if they don't use our functions, and still use
648 regcomp/regexec above without link errors. */
649 weak_function
650 # endif
re_comp(const char * s)651 re_comp (const char *s)
652 {
653 reg_errcode_t ret;
654 char *fastmap;
655
656 if (!s)
657 {
658 if (!re_comp_buf.re_buffer)
659 return gettext ("No previous regular expression");
660 return 0;
661 }
662
663 if (re_comp_buf.re_buffer)
664 {
665 fastmap = re_comp_buf.re_fastmap;
666 re_comp_buf.re_fastmap = NULL;
667 __regfree (&re_comp_buf);
668 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
669 re_comp_buf.re_fastmap = fastmap;
670 }
671
672 if (re_comp_buf.re_fastmap == NULL)
673 {
674 re_comp_buf.re_fastmap = (char *) malloc (SBC_MAX);
675 if (re_comp_buf.re_fastmap == NULL)
676 return (char *) gettext (__re_error_msgid
677 + __re_error_msgid_idx[(int) REG_ESPACE]);
678 }
679
680 /* Since `re_exec' always passes NULL for the `regs' argument, we
681 don't need to initialize the pattern buffer fields which affect it. */
682
683 /* Match anchors at newlines. */
684 re_comp_buf.re_newline_anchor = 1;
685
686 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
687
688 if (!ret)
689 return NULL;
690
691 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
692 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
693 }
694
695 #ifdef _LIBC
libc_freeres_fn(free_mem)696 libc_freeres_fn (free_mem)
697 {
698 __regfree (&re_comp_buf);
699 }
700 #endif
701
702 #endif /* _REGEX_RE_COMP */
703
704 /* Internal entry point.
705 Compile the regular expression PATTERN, whose length is LENGTH.
706 SYNTAX indicate regular expression's syntax. */
707
708 static reg_errcode_t
re_compile_internal(regex_t * preg,const char * pattern,Idx length,reg_syntax_t syntax)709 re_compile_internal (regex_t *preg, const char *pattern, Idx length,
710 reg_syntax_t syntax)
711 {
712 reg_errcode_t err = REG_NOERROR;
713 re_dfa_t *dfa;
714 re_string_t regexp;
715
716 /* Initialize the pattern buffer. */
717 preg->re_fastmap_accurate = 0;
718 preg->re_syntax = syntax;
719 preg->re_not_bol = preg->re_not_eol = 0;
720 preg->re_used = 0;
721 preg->re_nsub = 0;
722 preg->re_can_be_null = 0;
723 preg->re_regs_allocated = REG_UNALLOCATED;
724
725 /* Initialize the dfa. */
726 dfa = (re_dfa_t *) preg->re_buffer;
727 if (BE (preg->re_allocated < sizeof (re_dfa_t), 0))
728 {
729 /* If zero allocated, but buffer is non-null, try to realloc
730 enough space. This loses if buffer's address is bogus, but
731 that is the user's responsibility. If buffer is null this
732 is a simple allocation. */
733 dfa = re_realloc (preg->re_buffer, re_dfa_t, 1);
734 if (dfa == NULL)
735 return REG_ESPACE;
736 preg->re_allocated = sizeof (re_dfa_t);
737 preg->re_buffer = (unsigned char *) dfa;
738 }
739 preg->re_used = sizeof (re_dfa_t);
740
741 __libc_lock_init (dfa->lock);
742
743 err = init_dfa (dfa, length);
744 if (BE (err != REG_NOERROR, 0))
745 {
746 free_dfa_content (dfa);
747 preg->re_buffer = NULL;
748 preg->re_allocated = 0;
749 return err;
750 }
751 #ifdef DEBUG
752 dfa->re_str = re_malloc (char, length + 1);
753 strncpy (dfa->re_str, pattern, length + 1);
754 #endif
755
756 err = re_string_construct (®exp, pattern, length, preg->re_translate,
757 syntax & REG_IGNORE_CASE, dfa);
758 if (BE (err != REG_NOERROR, 0))
759 {
760 re_compile_internal_free_return:
761 free_workarea_compile (preg);
762 re_string_destruct (®exp);
763 free_dfa_content (dfa);
764 preg->re_buffer = NULL;
765 preg->re_allocated = 0;
766 return err;
767 }
768
769 /* Parse the regular expression, and build a structure tree. */
770 preg->re_nsub = 0;
771 dfa->str_tree = parse (®exp, preg, syntax, &err);
772 if (BE (dfa->str_tree == NULL, 0))
773 goto re_compile_internal_free_return;
774
775 /* Analyze the tree and create the nfa. */
776 err = analyze (preg);
777 if (BE (err != REG_NOERROR, 0))
778 goto re_compile_internal_free_return;
779
780 #ifdef RE_ENABLE_I18N
781 /* If possible, do searching in single byte encoding to speed things up. */
782 if (dfa->is_utf8 && !(syntax & REG_IGNORE_CASE) && preg->re_translate == NULL)
783 optimize_utf8 (dfa);
784 #endif
785
786 /* Then create the initial state of the dfa. */
787 err = create_initial_state (dfa);
788
789 /* Release work areas. */
790 free_workarea_compile (preg);
791 re_string_destruct (®exp);
792
793 if (BE (err != REG_NOERROR, 0))
794 {
795 free_dfa_content (dfa);
796 preg->re_buffer = NULL;
797 preg->re_allocated = 0;
798 }
799
800 return err;
801 }
802
803 /* Initialize DFA. We use the length of the regular expression PAT_LEN
804 as the initial length of some arrays. */
805
806 static reg_errcode_t
init_dfa(re_dfa_t * dfa,Idx pat_len)807 init_dfa (re_dfa_t *dfa, Idx pat_len)
808 {
809 __re_size_t table_size;
810 #ifndef _LIBC
811 char *codeset_name;
812 #endif
813
814 memset (dfa, '\0', sizeof (re_dfa_t));
815
816 /* Force allocation of str_tree_storage the first time. */
817 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
818
819 dfa->nodes_alloc = pat_len + 1;
820 dfa->nodes = re_xmalloc (re_token_t, dfa->nodes_alloc);
821
822 /* table_size = 2 ^ ceil(log pat_len) */
823 for (table_size = 1; table_size <= pat_len; table_size <<= 1)
824 if (0 < (Idx) -1 && table_size == 0)
825 return REG_ESPACE;
826
827 dfa->state_table = re_calloc (struct re_state_table_entry, table_size);
828 dfa->state_hash_mask = table_size - 1;
829
830 dfa->mb_cur_max = MB_CUR_MAX;
831 #ifdef _LIBC
832 if (dfa->mb_cur_max == 6
833 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
834 dfa->is_utf8 = 1;
835 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
836 != 0);
837 #else
838 # ifdef HAVE_LANGINFO_CODESET
839 codeset_name = nl_langinfo (CODESET);
840 # else
841 codeset_name = getenv ("LC_ALL");
842 if (codeset_name == NULL || codeset_name[0] == '\0')
843 codeset_name = getenv ("LC_CTYPE");
844 if (codeset_name == NULL || codeset_name[0] == '\0')
845 codeset_name = getenv ("LANG");
846 if (codeset_name == NULL)
847 codeset_name = "";
848 else if (strchr (codeset_name, '.') != NULL)
849 codeset_name = strchr (codeset_name, '.') + 1;
850 # endif
851
852 if (strcasecmp (codeset_name, "UTF-8") == 0
853 || strcasecmp (codeset_name, "UTF8") == 0)
854 dfa->is_utf8 = 1;
855
856 /* We check exhaustively in the loop below if this charset is a
857 superset of ASCII. */
858 dfa->map_notascii = 0;
859 #endif
860
861 #ifdef RE_ENABLE_I18N
862 if (dfa->mb_cur_max > 1)
863 {
864 if (dfa->is_utf8)
865 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
866 else
867 {
868 int i, j, ch;
869
870 dfa->sb_char = re_calloc (bitset_word, BITSET_WORDS);
871 if (BE (dfa->sb_char == NULL, 0))
872 return REG_ESPACE;
873
874 /* Set the bits corresponding to single byte chars. */
875 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
876 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
877 {
878 wint_t wch = __btowc (ch);
879 if (wch != WEOF)
880 dfa->sb_char[i] |= (bitset_word) 1 << j;
881 # ifndef _LIBC
882 if (isascii (ch) && wch != ch)
883 dfa->map_notascii = 1;
884 # endif
885 }
886 }
887 }
888 #endif
889
890 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
891 return REG_ESPACE;
892 return REG_NOERROR;
893 }
894
895 /* Initialize WORD_CHAR table, which indicate which character is
896 "word". In this case "word" means that it is the word construction
897 character used by some operators like "\<", "\>", etc. */
898
899 static void
init_word_char(re_dfa_t * dfa)900 init_word_char (re_dfa_t *dfa)
901 {
902 int i, j, ch;
903 dfa->word_ops_used = 1;
904 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
905 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
906 if (isalnum (ch) || ch == '_')
907 dfa->word_char[i] |= (bitset_word) 1 << j;
908 }
909
910 /* Free the work area which are only used while compiling. */
911
912 static void
free_workarea_compile(regex_t * preg)913 free_workarea_compile (regex_t *preg)
914 {
915 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
916 bin_tree_storage_t *storage, *next;
917 for (storage = dfa->str_tree_storage; storage; storage = next)
918 {
919 next = storage->next;
920 re_free (storage);
921 }
922 dfa->str_tree_storage = NULL;
923 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
924 dfa->str_tree = NULL;
925 re_free (dfa->org_indices);
926 dfa->org_indices = NULL;
927 }
928
929 /* Create initial states for all contexts. */
930
931 static reg_errcode_t
create_initial_state(re_dfa_t * dfa)932 create_initial_state (re_dfa_t *dfa)
933 {
934 Idx first, i;
935 reg_errcode_t err;
936 re_node_set init_nodes;
937
938 /* Initial states have the epsilon closure of the node which is
939 the first node of the regular expression. */
940 first = dfa->str_tree->first->node_idx;
941 dfa->init_node = first;
942 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
943 if (BE (err != REG_NOERROR, 0))
944 return err;
945
946 /* The back-references which are in initial states can epsilon transit,
947 since in this case all of the subexpressions can be null.
948 Then we add epsilon closures of the nodes which are the next nodes of
949 the back-references. */
950 if (dfa->nbackref > 0)
951 for (i = 0; i < init_nodes.nelem; ++i)
952 {
953 Idx node_idx = init_nodes.elems[i];
954 re_token_type_t type = dfa->nodes[node_idx].type;
955
956 Idx clexp_idx;
957 if (type != OP_BACK_REF)
958 continue;
959 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
960 {
961 re_token_t *clexp_node;
962 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
963 if (clexp_node->type == OP_CLOSE_SUBEXP
964 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
965 break;
966 }
967 if (clexp_idx == init_nodes.nelem)
968 continue;
969
970 if (type == OP_BACK_REF)
971 {
972 Idx dest_idx = dfa->edests[node_idx].elems[0];
973 if (!re_node_set_contains (&init_nodes, dest_idx))
974 {
975 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
976 i = 0;
977 }
978 }
979 }
980
981 /* It must be the first time to invoke acquire_state. */
982 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
983 /* We don't check ERR here, since the initial state must not be NULL. */
984 if (BE (dfa->init_state == NULL, 0))
985 return err;
986 if (dfa->init_state->has_constraint)
987 {
988 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
989 CONTEXT_WORD);
990 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
991 CONTEXT_NEWLINE);
992 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
993 &init_nodes,
994 CONTEXT_NEWLINE
995 | CONTEXT_BEGBUF);
996 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
997 || dfa->init_state_begbuf == NULL, 0))
998 return err;
999 }
1000 else
1001 dfa->init_state_word = dfa->init_state_nl
1002 = dfa->init_state_begbuf = dfa->init_state;
1003
1004 re_node_set_free (&init_nodes);
1005 return REG_NOERROR;
1006 }
1007
1008 #ifdef RE_ENABLE_I18N
1009 /* If it is possible to do searching in single byte encoding instead of UTF-8
1010 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1011 DFA nodes where needed. */
1012
1013 static void
optimize_utf8(re_dfa_t * dfa)1014 optimize_utf8 (re_dfa_t *dfa)
1015 {
1016 Idx node;
1017 int i;
1018 bool mb_chars = false;
1019 bool has_period = false;
1020
1021 for (node = 0; node < dfa->nodes_len; ++node)
1022 switch (dfa->nodes[node].type)
1023 {
1024 case CHARACTER:
1025 if (dfa->nodes[node].opr.c >= 0x80)
1026 mb_chars = true;
1027 break;
1028 case ANCHOR:
1029 switch (dfa->nodes[node].opr.idx)
1030 {
1031 case LINE_FIRST:
1032 case LINE_LAST:
1033 case BUF_FIRST:
1034 case BUF_LAST:
1035 break;
1036 default:
1037 /* Word anchors etc. cannot be handled. */
1038 return;
1039 }
1040 break;
1041 case OP_PERIOD:
1042 has_period = true;
1043 break;
1044 case OP_BACK_REF:
1045 case OP_ALT:
1046 case END_OF_RE:
1047 case OP_DUP_ASTERISK:
1048 case OP_OPEN_SUBEXP:
1049 case OP_CLOSE_SUBEXP:
1050 break;
1051 case COMPLEX_BRACKET:
1052 return;
1053 case SIMPLE_BRACKET:
1054 /* Just double check. */
1055 {
1056 int rshift =
1057 (SBC_MAX / 2 % BITSET_WORD_BITS == 0
1058 ? 0
1059 : BITSET_WORD_BITS - SBC_MAX / 2 % BITSET_WORD_BITS);
1060 for (i = SBC_MAX / 2 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1061 {
1062 if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1063 return;
1064 rshift = 0;
1065 }
1066 }
1067 break;
1068 default:
1069 abort ();
1070 }
1071
1072 if (mb_chars || has_period)
1073 for (node = 0; node < dfa->nodes_len; ++node)
1074 {
1075 if (dfa->nodes[node].type == CHARACTER
1076 && dfa->nodes[node].opr.c >= 0x80)
1077 dfa->nodes[node].mb_partial = 0;
1078 else if (dfa->nodes[node].type == OP_PERIOD)
1079 dfa->nodes[node].type = OP_UTF8_PERIOD;
1080 }
1081
1082 /* The search can be in single byte locale. */
1083 dfa->mb_cur_max = 1;
1084 dfa->is_utf8 = 0;
1085 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1086 }
1087 #endif
1088
1089 /* Analyze the structure tree, and calculate "first", "next", "edest",
1090 "eclosure", and "inveclosure". */
1091
1092 static reg_errcode_t
analyze(regex_t * preg)1093 analyze (regex_t *preg)
1094 {
1095 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
1096 reg_errcode_t ret;
1097
1098 /* Allocate arrays. */
1099 dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1100 dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1101 dfa->edests = re_xmalloc (re_node_set, dfa->nodes_alloc);
1102 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1103 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1104 || dfa->eclosures == NULL, 0))
1105 return REG_ESPACE;
1106
1107 dfa->subexp_map = re_xmalloc (Idx, preg->re_nsub);
1108 if (dfa->subexp_map != NULL)
1109 {
1110 Idx i;
1111 for (i = 0; i < preg->re_nsub; i++)
1112 dfa->subexp_map[i] = i;
1113 preorder (dfa->str_tree, optimize_subexps, dfa);
1114 for (i = 0; i < preg->re_nsub; i++)
1115 if (dfa->subexp_map[i] != i)
1116 break;
1117 if (i == preg->re_nsub)
1118 {
1119 free (dfa->subexp_map);
1120 dfa->subexp_map = NULL;
1121 }
1122 }
1123
1124 ret = postorder (dfa->str_tree, lower_subexps, preg);
1125 if (BE (ret != REG_NOERROR, 0))
1126 return ret;
1127 ret = postorder (dfa->str_tree, calc_first, dfa);
1128 if (BE (ret != REG_NOERROR, 0))
1129 return ret;
1130 preorder (dfa->str_tree, calc_next, dfa);
1131 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1132 if (BE (ret != REG_NOERROR, 0))
1133 return ret;
1134 ret = calc_eclosure (dfa);
1135 if (BE (ret != REG_NOERROR, 0))
1136 return ret;
1137
1138 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1139 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1140 if ((!preg->re_no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1141 || dfa->nbackref)
1142 {
1143 dfa->inveclosures = re_xmalloc (re_node_set, dfa->nodes_len);
1144 if (BE (dfa->inveclosures == NULL, 0))
1145 return REG_ESPACE;
1146 ret = calc_inveclosure (dfa);
1147 }
1148
1149 return ret;
1150 }
1151
1152 /* Our parse trees are very unbalanced, so we cannot use a stack to
1153 implement parse tree visits. Instead, we use parent pointers and
1154 some hairy code in these two functions. */
1155 static reg_errcode_t
postorder(bin_tree_t * root,reg_errcode_t (fn (void *,bin_tree_t *)),void * extra)1156 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1157 void *extra)
1158 {
1159 bin_tree_t *node, *prev;
1160
1161 for (node = root; ; )
1162 {
1163 /* Descend down the tree, preferably to the left (or to the right
1164 if that's the only child). */
1165 while (node->left || node->right)
1166 if (node->left)
1167 node = node->left;
1168 else
1169 node = node->right;
1170
1171 do
1172 {
1173 reg_errcode_t err = fn (extra, node);
1174 if (BE (err != REG_NOERROR, 0))
1175 return err;
1176 if (node->parent == NULL)
1177 return REG_NOERROR;
1178 prev = node;
1179 node = node->parent;
1180 }
1181 /* Go up while we have a node that is reached from the right. */
1182 while (node->right == prev || node->right == NULL);
1183 node = node->right;
1184 }
1185 }
1186
1187 static reg_errcode_t
preorder(bin_tree_t * root,reg_errcode_t (fn (void *,bin_tree_t *)),void * extra)1188 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1189 void *extra)
1190 {
1191 bin_tree_t *node;
1192
1193 for (node = root; ; )
1194 {
1195 reg_errcode_t err = fn (extra, node);
1196 if (BE (err != REG_NOERROR, 0))
1197 return err;
1198
1199 /* Go to the left node, or up and to the right. */
1200 if (node->left)
1201 node = node->left;
1202 else
1203 {
1204 bin_tree_t *prev = NULL;
1205 while (node->right == prev || node->right == NULL)
1206 {
1207 prev = node;
1208 node = node->parent;
1209 if (!node)
1210 return REG_NOERROR;
1211 }
1212 node = node->right;
1213 }
1214 }
1215 }
1216
1217 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1218 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1219 backreferences as well. Requires a preorder visit. */
1220 static reg_errcode_t
optimize_subexps(void * extra,bin_tree_t * node)1221 optimize_subexps (void *extra, bin_tree_t *node)
1222 {
1223 re_dfa_t *dfa = (re_dfa_t *) extra;
1224
1225 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1226 {
1227 int idx = node->token.opr.idx;
1228 node->token.opr.idx = dfa->subexp_map[idx];
1229 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1230 }
1231
1232 else if (node->token.type == SUBEXP
1233 && node->left && node->left->token.type == SUBEXP)
1234 {
1235 Idx other_idx = node->left->token.opr.idx;
1236
1237 node->left = node->left->left;
1238 if (node->left)
1239 node->left->parent = node;
1240
1241 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1242 if (other_idx < BITSET_WORD_BITS)
1243 dfa->used_bkref_map &= ~ ((bitset_word) 1 << other_idx);
1244 }
1245
1246 return REG_NOERROR;
1247 }
1248
1249 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1250 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1251 static reg_errcode_t
lower_subexps(void * extra,bin_tree_t * node)1252 lower_subexps (void *extra, bin_tree_t *node)
1253 {
1254 regex_t *preg = (regex_t *) extra;
1255 reg_errcode_t err = REG_NOERROR;
1256
1257 if (node->left && node->left->token.type == SUBEXP)
1258 {
1259 node->left = lower_subexp (&err, preg, node->left);
1260 if (node->left)
1261 node->left->parent = node;
1262 }
1263 if (node->right && node->right->token.type == SUBEXP)
1264 {
1265 node->right = lower_subexp (&err, preg, node->right);
1266 if (node->right)
1267 node->right->parent = node;
1268 }
1269
1270 return err;
1271 }
1272
1273 static bin_tree_t *
lower_subexp(reg_errcode_t * err,regex_t * preg,bin_tree_t * node)1274 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1275 {
1276 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
1277 bin_tree_t *body = node->left;
1278 bin_tree_t *op, *cls, *tree1, *tree;
1279
1280 if (preg->re_no_sub
1281 /* We do not optimize empty subexpressions, because otherwise we may
1282 have bad CONCAT nodes with NULL children. This is obviously not
1283 very common, so we do not lose much. An example that triggers
1284 this case is the sed "script" /\(\)/x. */
1285 && node->left != NULL
1286 && ! (node->token.opr.idx < BITSET_WORD_BITS
1287 && dfa->used_bkref_map & ((bitset_word) 1 << node->token.opr.idx)))
1288 return node->left;
1289
1290 /* Convert the SUBEXP node to the concatenation of an
1291 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1292 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1293 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1294 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1295 tree = create_tree (dfa, op, tree1, CONCAT);
1296 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1297 {
1298 *err = REG_ESPACE;
1299 return NULL;
1300 }
1301
1302 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1303 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1304 return tree;
1305 }
1306
1307 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1308 nodes. Requires a postorder visit. */
1309 static reg_errcode_t
calc_first(void * extra,bin_tree_t * node)1310 calc_first (void *extra, bin_tree_t *node)
1311 {
1312 re_dfa_t *dfa = (re_dfa_t *) extra;
1313 if (node->token.type == CONCAT)
1314 {
1315 node->first = node->left->first;
1316 node->node_idx = node->left->node_idx;
1317 }
1318 else
1319 {
1320 node->first = node;
1321 node->node_idx = re_dfa_add_node (dfa, node->token);
1322 if (BE (node->node_idx == REG_MISSING, 0))
1323 return REG_ESPACE;
1324 }
1325 return REG_NOERROR;
1326 }
1327
1328 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1329 static reg_errcode_t
calc_next(void * extra,bin_tree_t * node)1330 calc_next (void *extra, bin_tree_t *node)
1331 {
1332 switch (node->token.type)
1333 {
1334 case OP_DUP_ASTERISK:
1335 node->left->next = node;
1336 break;
1337 case CONCAT:
1338 node->left->next = node->right->first;
1339 node->right->next = node->next;
1340 break;
1341 default:
1342 if (node->left)
1343 node->left->next = node->next;
1344 if (node->right)
1345 node->right->next = node->next;
1346 break;
1347 }
1348 return REG_NOERROR;
1349 }
1350
1351 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1352 static reg_errcode_t
link_nfa_nodes(void * extra,bin_tree_t * node)1353 link_nfa_nodes (void *extra, bin_tree_t *node)
1354 {
1355 re_dfa_t *dfa = (re_dfa_t *) extra;
1356 Idx idx = node->node_idx;
1357 reg_errcode_t err = REG_NOERROR;
1358
1359 switch (node->token.type)
1360 {
1361 case CONCAT:
1362 break;
1363
1364 case END_OF_RE:
1365 assert (node->next == NULL);
1366 break;
1367
1368 case OP_DUP_ASTERISK:
1369 case OP_ALT:
1370 {
1371 Idx left, right;
1372 dfa->has_plural_match = 1;
1373 if (node->left != NULL)
1374 left = node->left->first->node_idx;
1375 else
1376 left = node->next->node_idx;
1377 if (node->right != NULL)
1378 right = node->right->first->node_idx;
1379 else
1380 right = node->next->node_idx;
1381 assert (REG_VALID_INDEX (left));
1382 assert (REG_VALID_INDEX (right));
1383 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1384 }
1385 break;
1386
1387 case ANCHOR:
1388 case OP_OPEN_SUBEXP:
1389 case OP_CLOSE_SUBEXP:
1390 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1391 break;
1392
1393 case OP_BACK_REF:
1394 dfa->nexts[idx] = node->next->node_idx;
1395 if (node->token.type == OP_BACK_REF)
1396 re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1397 break;
1398
1399 default:
1400 assert (!IS_EPSILON_NODE (node->token.type));
1401 dfa->nexts[idx] = node->next->node_idx;
1402 break;
1403 }
1404
1405 return err;
1406 }
1407
1408 /* Duplicate the epsilon closure of the node ROOT_NODE.
1409 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1410 to their own constraint. */
1411
1412 static reg_errcode_t
duplicate_node_closure(re_dfa_t * dfa,Idx top_org_node,Idx top_clone_node,Idx root_node,unsigned int init_constraint)1413 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node,
1414 Idx top_clone_node, Idx root_node,
1415 unsigned int init_constraint)
1416 {
1417 Idx org_node, clone_node;
1418 bool ok;
1419 unsigned int constraint = init_constraint;
1420 for (org_node = top_org_node, clone_node = top_clone_node;;)
1421 {
1422 Idx org_dest, clone_dest;
1423 if (dfa->nodes[org_node].type == OP_BACK_REF)
1424 {
1425 /* If the back reference epsilon-transit, its destination must
1426 also have the constraint. Then duplicate the epsilon closure
1427 of the destination of the back reference, and store it in
1428 edests of the back reference. */
1429 org_dest = dfa->nexts[org_node];
1430 re_node_set_empty (dfa->edests + clone_node);
1431 clone_dest = duplicate_node (dfa, org_dest, constraint);
1432 if (BE (clone_dest == REG_MISSING, 0))
1433 return REG_ESPACE;
1434 dfa->nexts[clone_node] = dfa->nexts[org_node];
1435 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1436 if (BE (! ok, 0))
1437 return REG_ESPACE;
1438 }
1439 else if (dfa->edests[org_node].nelem == 0)
1440 {
1441 /* In case of the node can't epsilon-transit, don't duplicate the
1442 destination and store the original destination as the
1443 destination of the node. */
1444 dfa->nexts[clone_node] = dfa->nexts[org_node];
1445 break;
1446 }
1447 else if (dfa->edests[org_node].nelem == 1)
1448 {
1449 /* In case of the node can epsilon-transit, and it has only one
1450 destination. */
1451 org_dest = dfa->edests[org_node].elems[0];
1452 re_node_set_empty (dfa->edests + clone_node);
1453 if (dfa->nodes[org_node].type == ANCHOR)
1454 {
1455 /* In case of the node has another constraint, append it. */
1456 if (org_node == root_node && clone_node != org_node)
1457 {
1458 /* ...but if the node is root_node itself, it means the
1459 epsilon closure have a loop, then tie it to the
1460 destination of the root_node. */
1461 ok = re_node_set_insert (dfa->edests + clone_node,
1462 org_dest);
1463 if (BE (! ok, 0))
1464 return REG_ESPACE;
1465 break;
1466 }
1467 constraint |= dfa->nodes[org_node].opr.ctx_type;
1468 }
1469 clone_dest = duplicate_node (dfa, org_dest, constraint);
1470 if (BE (clone_dest == REG_MISSING, 0))
1471 return REG_ESPACE;
1472 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1473 if (BE (! ok, 0))
1474 return REG_ESPACE;
1475 }
1476 else /* dfa->edests[org_node].nelem == 2 */
1477 {
1478 /* In case of the node can epsilon-transit, and it has two
1479 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1480 org_dest = dfa->edests[org_node].elems[0];
1481 re_node_set_empty (dfa->edests + clone_node);
1482 /* Search for a duplicated node which satisfies the constraint. */
1483 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1484 if (clone_dest == REG_MISSING)
1485 {
1486 /* There are no such a duplicated node, create a new one. */
1487 reg_errcode_t err;
1488 clone_dest = duplicate_node (dfa, org_dest, constraint);
1489 if (BE (clone_dest == REG_MISSING, 0))
1490 return REG_ESPACE;
1491 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1492 if (BE (! ok, 0))
1493 return REG_ESPACE;
1494 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1495 root_node, constraint);
1496 if (BE (err != REG_NOERROR, 0))
1497 return err;
1498 }
1499 else
1500 {
1501 /* There are a duplicated node which satisfy the constraint,
1502 use it to avoid infinite loop. */
1503 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1504 if (BE (! ok, 0))
1505 return REG_ESPACE;
1506 }
1507
1508 org_dest = dfa->edests[org_node].elems[1];
1509 clone_dest = duplicate_node (dfa, org_dest, constraint);
1510 if (BE (clone_dest == REG_MISSING, 0))
1511 return REG_ESPACE;
1512 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1513 if (BE (! ok, 0))
1514 return REG_ESPACE;
1515 }
1516 org_node = org_dest;
1517 clone_node = clone_dest;
1518 }
1519 return REG_NOERROR;
1520 }
1521
1522 /* Search for a node which is duplicated from the node ORG_NODE, and
1523 satisfies the constraint CONSTRAINT. */
1524
1525 static Idx
search_duplicated_node(const re_dfa_t * dfa,Idx org_node,unsigned int constraint)1526 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1527 unsigned int constraint)
1528 {
1529 Idx idx;
1530 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1531 {
1532 if (org_node == dfa->org_indices[idx]
1533 && constraint == dfa->nodes[idx].constraint)
1534 return idx; /* Found. */
1535 }
1536 return REG_MISSING; /* Not found. */
1537 }
1538
1539 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1540 Return the index of the new node, or REG_MISSING if insufficient storage is
1541 available. */
1542
1543 static Idx
duplicate_node(re_dfa_t * dfa,Idx org_idx,unsigned int constraint)1544 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1545 {
1546 Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1547 if (BE (dup_idx != REG_MISSING, 1))
1548 {
1549 dfa->nodes[dup_idx].constraint = constraint;
1550 if (dfa->nodes[org_idx].type == ANCHOR)
1551 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1552 dfa->nodes[dup_idx].duplicated = 1;
1553
1554 /* Store the index of the original node. */
1555 dfa->org_indices[dup_idx] = org_idx;
1556 }
1557 return dup_idx;
1558 }
1559
1560 static reg_errcode_t
calc_inveclosure(re_dfa_t * dfa)1561 calc_inveclosure (re_dfa_t *dfa)
1562 {
1563 Idx src, idx;
1564 bool ok;
1565 for (idx = 0; idx < dfa->nodes_len; ++idx)
1566 re_node_set_init_empty (dfa->inveclosures + idx);
1567
1568 for (src = 0; src < dfa->nodes_len; ++src)
1569 {
1570 Idx *elems = dfa->eclosures[src].elems;
1571 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1572 {
1573 ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1574 if (BE (! ok, 0))
1575 return REG_ESPACE;
1576 }
1577 }
1578
1579 return REG_NOERROR;
1580 }
1581
1582 /* Calculate "eclosure" for all the node in DFA. */
1583
1584 static reg_errcode_t
calc_eclosure(re_dfa_t * dfa)1585 calc_eclosure (re_dfa_t *dfa)
1586 {
1587 Idx node_idx;
1588 bool incomplete;
1589 #ifdef DEBUG
1590 assert (dfa->nodes_len > 0);
1591 #endif
1592 incomplete = false;
1593 /* For each nodes, calculate epsilon closure. */
1594 for (node_idx = 0; ; ++node_idx)
1595 {
1596 reg_errcode_t err;
1597 re_node_set eclosure_elem;
1598 if (node_idx == dfa->nodes_len)
1599 {
1600 if (!incomplete)
1601 break;
1602 incomplete = false;
1603 node_idx = 0;
1604 }
1605
1606 #ifdef DEBUG
1607 assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1608 #endif
1609
1610 /* If we have already calculated, skip it. */
1611 if (dfa->eclosures[node_idx].nelem != 0)
1612 continue;
1613 /* Calculate epsilon closure of `node_idx'. */
1614 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1615 if (BE (err != REG_NOERROR, 0))
1616 return err;
1617
1618 if (dfa->eclosures[node_idx].nelem == 0)
1619 {
1620 incomplete = true;
1621 re_node_set_free (&eclosure_elem);
1622 }
1623 }
1624 return REG_NOERROR;
1625 }
1626
1627 /* Calculate epsilon closure of NODE. */
1628
1629 static reg_errcode_t
calc_eclosure_iter(re_node_set * new_set,re_dfa_t * dfa,Idx node,bool root)1630 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1631 {
1632 reg_errcode_t err;
1633 unsigned int constraint;
1634 Idx i;
1635 bool incomplete;
1636 bool ok;
1637 re_node_set eclosure;
1638 incomplete = false;
1639 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1640 if (BE (err != REG_NOERROR, 0))
1641 return err;
1642
1643 /* This indicates that we are calculating this node now.
1644 We reference this value to avoid infinite loop. */
1645 dfa->eclosures[node].nelem = REG_MISSING;
1646
1647 constraint = ((dfa->nodes[node].type == ANCHOR)
1648 ? dfa->nodes[node].opr.ctx_type : 0);
1649 /* If the current node has constraints, duplicate all nodes.
1650 Since they must inherit the constraints. */
1651 if (constraint
1652 && dfa->edests[node].nelem
1653 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1654 {
1655 err = duplicate_node_closure (dfa, node, node, node, constraint);
1656 if (BE (err != REG_NOERROR, 0))
1657 return err;
1658 }
1659
1660 /* Expand each epsilon destination nodes. */
1661 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1662 for (i = 0; i < dfa->edests[node].nelem; ++i)
1663 {
1664 re_node_set eclosure_elem;
1665 Idx edest = dfa->edests[node].elems[i];
1666 /* If calculating the epsilon closure of `edest' is in progress,
1667 return intermediate result. */
1668 if (dfa->eclosures[edest].nelem == REG_MISSING)
1669 {
1670 incomplete = true;
1671 continue;
1672 }
1673 /* If we haven't calculated the epsilon closure of `edest' yet,
1674 calculate now. Otherwise use calculated epsilon closure. */
1675 if (dfa->eclosures[edest].nelem == 0)
1676 {
1677 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1678 if (BE (err != REG_NOERROR, 0))
1679 return err;
1680 }
1681 else
1682 eclosure_elem = dfa->eclosures[edest];
1683 /* Merge the epsilon closure of `edest'. */
1684 re_node_set_merge (&eclosure, &eclosure_elem);
1685 /* If the epsilon closure of `edest' is incomplete,
1686 the epsilon closure of this node is also incomplete. */
1687 if (dfa->eclosures[edest].nelem == 0)
1688 {
1689 incomplete = true;
1690 re_node_set_free (&eclosure_elem);
1691 }
1692 }
1693
1694 /* Epsilon closures include itself. */
1695 ok = re_node_set_insert (&eclosure, node);
1696 if (BE (! ok, 0))
1697 return REG_ESPACE;
1698 if (incomplete && !root)
1699 dfa->eclosures[node].nelem = 0;
1700 else
1701 dfa->eclosures[node] = eclosure;
1702 *new_set = eclosure;
1703 return REG_NOERROR;
1704 }
1705
1706 /* Functions for token which are used in the parser. */
1707
1708 /* Fetch a token from INPUT.
1709 We must not use this function inside bracket expressions. */
1710
1711 static void
fetch_token(re_token_t * result,re_string_t * input,reg_syntax_t syntax)1712 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1713 {
1714 re_string_skip_bytes (input, peek_token (result, input, syntax));
1715 }
1716
1717 /* Peek a token from INPUT, and return the length of the token.
1718 We must not use this function inside bracket expressions. */
1719
1720 static int
peek_token(re_token_t * token,re_string_t * input,reg_syntax_t syntax)1721 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1722 {
1723 unsigned char c;
1724
1725 if (re_string_eoi (input))
1726 {
1727 token->type = END_OF_RE;
1728 return 0;
1729 }
1730
1731 c = re_string_peek_byte (input, 0);
1732 token->opr.c = c;
1733
1734 token->word_char = 0;
1735 #ifdef RE_ENABLE_I18N
1736 token->mb_partial = 0;
1737 if (input->mb_cur_max > 1 &&
1738 !re_string_first_byte (input, re_string_cur_idx (input)))
1739 {
1740 token->type = CHARACTER;
1741 token->mb_partial = 1;
1742 return 1;
1743 }
1744 #endif
1745 if (c == '\\')
1746 {
1747 unsigned char c2;
1748 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1749 {
1750 token->type = BACK_SLASH;
1751 return 1;
1752 }
1753
1754 c2 = re_string_peek_byte_case (input, 1);
1755 token->opr.c = c2;
1756 token->type = CHARACTER;
1757 #ifdef RE_ENABLE_I18N
1758 if (input->mb_cur_max > 1)
1759 {
1760 wint_t wc = re_string_wchar_at (input,
1761 re_string_cur_idx (input) + 1);
1762 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1763 }
1764 else
1765 #endif
1766 token->word_char = IS_WORD_CHAR (c2) != 0;
1767
1768 switch (c2)
1769 {
1770 case '|':
1771 if (!(syntax & REG_LIMITED_OPS) && !(syntax & REG_NO_BK_VBAR))
1772 token->type = OP_ALT;
1773 break;
1774 case '1': case '2': case '3': case '4': case '5':
1775 case '6': case '7': case '8': case '9':
1776 if (!(syntax & REG_NO_BK_REFS))
1777 {
1778 token->type = OP_BACK_REF;
1779 token->opr.idx = c2 - '1';
1780 }
1781 break;
1782 case '<':
1783 if (!(syntax & REG_NO_GNU_OPS))
1784 {
1785 token->type = ANCHOR;
1786 token->opr.ctx_type = WORD_FIRST;
1787 }
1788 break;
1789 case '>':
1790 if (!(syntax & REG_NO_GNU_OPS))
1791 {
1792 token->type = ANCHOR;
1793 token->opr.ctx_type = WORD_LAST;
1794 }
1795 break;
1796 case 'b':
1797 if (!(syntax & REG_NO_GNU_OPS))
1798 {
1799 token->type = ANCHOR;
1800 token->opr.ctx_type = WORD_DELIM;
1801 }
1802 break;
1803 case 'B':
1804 if (!(syntax & REG_NO_GNU_OPS))
1805 {
1806 token->type = ANCHOR;
1807 token->opr.ctx_type = NOT_WORD_DELIM;
1808 }
1809 break;
1810 case 'w':
1811 if (!(syntax & REG_NO_GNU_OPS))
1812 token->type = OP_WORD;
1813 break;
1814 case 'W':
1815 if (!(syntax & REG_NO_GNU_OPS))
1816 token->type = OP_NOTWORD;
1817 break;
1818 case 's':
1819 if (!(syntax & REG_NO_GNU_OPS))
1820 token->type = OP_SPACE;
1821 break;
1822 case 'S':
1823 if (!(syntax & REG_NO_GNU_OPS))
1824 token->type = OP_NOTSPACE;
1825 break;
1826 case '`':
1827 if (!(syntax & REG_NO_GNU_OPS))
1828 {
1829 token->type = ANCHOR;
1830 token->opr.ctx_type = BUF_FIRST;
1831 }
1832 break;
1833 case '\'':
1834 if (!(syntax & REG_NO_GNU_OPS))
1835 {
1836 token->type = ANCHOR;
1837 token->opr.ctx_type = BUF_LAST;
1838 }
1839 break;
1840 case '(':
1841 if (!(syntax & REG_NO_BK_PARENS))
1842 token->type = OP_OPEN_SUBEXP;
1843 break;
1844 case ')':
1845 if (!(syntax & REG_NO_BK_PARENS))
1846 token->type = OP_CLOSE_SUBEXP;
1847 break;
1848 case '+':
1849 if (!(syntax & REG_LIMITED_OPS) && (syntax & REG_BK_PLUS_QM))
1850 token->type = OP_DUP_PLUS;
1851 break;
1852 case '?':
1853 if (!(syntax & REG_LIMITED_OPS) && (syntax & REG_BK_PLUS_QM))
1854 token->type = OP_DUP_QUESTION;
1855 break;
1856 case '{':
1857 if ((syntax & REG_INTERVALS) && (!(syntax & REG_NO_BK_BRACES)))
1858 token->type = OP_OPEN_DUP_NUM;
1859 break;
1860 case '}':
1861 if ((syntax & REG_INTERVALS) && (!(syntax & REG_NO_BK_BRACES)))
1862 token->type = OP_CLOSE_DUP_NUM;
1863 break;
1864 default:
1865 break;
1866 }
1867 return 2;
1868 }
1869
1870 token->type = CHARACTER;
1871 #ifdef RE_ENABLE_I18N
1872 if (input->mb_cur_max > 1)
1873 {
1874 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1875 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1876 }
1877 else
1878 #endif
1879 token->word_char = IS_WORD_CHAR (token->opr.c);
1880
1881 switch (c)
1882 {
1883 case '\n':
1884 if (syntax & REG_NEWLINE_ALT)
1885 token->type = OP_ALT;
1886 break;
1887 case '|':
1888 if (!(syntax & REG_LIMITED_OPS) && (syntax & REG_NO_BK_VBAR))
1889 token->type = OP_ALT;
1890 break;
1891 case '*':
1892 token->type = OP_DUP_ASTERISK;
1893 break;
1894 case '+':
1895 if (!(syntax & REG_LIMITED_OPS) && !(syntax & REG_BK_PLUS_QM))
1896 token->type = OP_DUP_PLUS;
1897 break;
1898 case '?':
1899 if (!(syntax & REG_LIMITED_OPS) && !(syntax & REG_BK_PLUS_QM))
1900 token->type = OP_DUP_QUESTION;
1901 break;
1902 case '{':
1903 if ((syntax & REG_INTERVALS) && (syntax & REG_NO_BK_BRACES))
1904 token->type = OP_OPEN_DUP_NUM;
1905 break;
1906 case '}':
1907 if ((syntax & REG_INTERVALS) && (syntax & REG_NO_BK_BRACES))
1908 token->type = OP_CLOSE_DUP_NUM;
1909 break;
1910 case '(':
1911 if (syntax & REG_NO_BK_PARENS)
1912 token->type = OP_OPEN_SUBEXP;
1913 break;
1914 case ')':
1915 if (syntax & REG_NO_BK_PARENS)
1916 token->type = OP_CLOSE_SUBEXP;
1917 break;
1918 case '[':
1919 token->type = OP_OPEN_BRACKET;
1920 break;
1921 case '.':
1922 token->type = OP_PERIOD;
1923 break;
1924 case '^':
1925 if (!(syntax & (REG_CONTEXT_INDEP_ANCHORS | REG_CARET_ANCHORS_HERE)) &&
1926 re_string_cur_idx (input) != 0)
1927 {
1928 char prev = re_string_peek_byte (input, -1);
1929 if (!(syntax & REG_NEWLINE_ALT) || prev != '\n')
1930 break;
1931 }
1932 token->type = ANCHOR;
1933 token->opr.ctx_type = LINE_FIRST;
1934 break;
1935 case '$':
1936 if (!(syntax & REG_CONTEXT_INDEP_ANCHORS) &&
1937 re_string_cur_idx (input) + 1 != re_string_length (input))
1938 {
1939 re_token_t next;
1940 re_string_skip_bytes (input, 1);
1941 peek_token (&next, input, syntax);
1942 re_string_skip_bytes (input, -1);
1943 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1944 break;
1945 }
1946 token->type = ANCHOR;
1947 token->opr.ctx_type = LINE_LAST;
1948 break;
1949 default:
1950 break;
1951 }
1952 return 1;
1953 }
1954
1955 /* Peek a token from INPUT, and return the length of the token.
1956 We must not use this function out of bracket expressions. */
1957
1958 static int
peek_token_bracket(re_token_t * token,re_string_t * input,reg_syntax_t syntax)1959 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1960 {
1961 unsigned char c;
1962 if (re_string_eoi (input))
1963 {
1964 token->type = END_OF_RE;
1965 return 0;
1966 }
1967 c = re_string_peek_byte (input, 0);
1968 token->opr.c = c;
1969
1970 #ifdef RE_ENABLE_I18N
1971 if (input->mb_cur_max > 1 &&
1972 !re_string_first_byte (input, re_string_cur_idx (input)))
1973 {
1974 token->type = CHARACTER;
1975 return 1;
1976 }
1977 #endif /* RE_ENABLE_I18N */
1978
1979 if (c == '\\' && (syntax & REG_BACKSLASH_ESCAPE_IN_LISTS)
1980 && re_string_cur_idx (input) + 1 < re_string_length (input))
1981 {
1982 /* In this case, '\' escape a character. */
1983 unsigned char c2;
1984 re_string_skip_bytes (input, 1);
1985 c2 = re_string_peek_byte (input, 0);
1986 token->opr.c = c2;
1987 token->type = CHARACTER;
1988 return 1;
1989 }
1990 if (c == '[') /* '[' is a special char in a bracket exps. */
1991 {
1992 unsigned char c2;
1993 int token_len;
1994 if (re_string_cur_idx (input) + 1 < re_string_length (input))
1995 c2 = re_string_peek_byte (input, 1);
1996 else
1997 c2 = 0;
1998 token->opr.c = c2;
1999 token_len = 2;
2000 switch (c2)
2001 {
2002 case '.':
2003 token->type = OP_OPEN_COLL_ELEM;
2004 break;
2005 case '=':
2006 token->type = OP_OPEN_EQUIV_CLASS;
2007 break;
2008 case ':':
2009 if (syntax & REG_CHAR_CLASSES)
2010 {
2011 token->type = OP_OPEN_CHAR_CLASS;
2012 break;
2013 }
2014 /* else fall through. */
2015 default:
2016 token->type = CHARACTER;
2017 token->opr.c = c;
2018 token_len = 1;
2019 break;
2020 }
2021 return token_len;
2022 }
2023 switch (c)
2024 {
2025 case '-':
2026 token->type = OP_CHARSET_RANGE;
2027 break;
2028 case ']':
2029 token->type = OP_CLOSE_BRACKET;
2030 break;
2031 case '^':
2032 token->type = OP_NON_MATCH_LIST;
2033 break;
2034 default:
2035 token->type = CHARACTER;
2036 }
2037 return 1;
2038 }
2039
2040 /* Functions for parser. */
2041
2042 /* Entry point of the parser.
2043 Parse the regular expression REGEXP and return the structure tree.
2044 If an error is occured, ERR is set by error code, and return NULL.
2045 This function build the following tree, from regular expression <reg_exp>:
2046 CAT
2047 / \
2048 / \
2049 <reg_exp> EOR
2050
2051 CAT means concatenation.
2052 EOR means end of regular expression. */
2053
2054 static bin_tree_t *
parse(re_string_t * regexp,regex_t * preg,reg_syntax_t syntax,reg_errcode_t * err)2055 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2056 reg_errcode_t *err)
2057 {
2058 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2059 bin_tree_t *tree, *eor, *root;
2060 re_token_t current_token;
2061 dfa->syntax = syntax;
2062 fetch_token (¤t_token, regexp, syntax | REG_CARET_ANCHORS_HERE);
2063 tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err);
2064 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2065 return NULL;
2066 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2067 if (tree != NULL)
2068 root = create_tree (dfa, tree, eor, CONCAT);
2069 else
2070 root = eor;
2071 if (BE (eor == NULL || root == NULL, 0))
2072 {
2073 *err = REG_ESPACE;
2074 return NULL;
2075 }
2076 return root;
2077 }
2078
2079 /* This function build the following tree, from regular expression
2080 <branch1>|<branch2>:
2081 ALT
2082 / \
2083 / \
2084 <branch1> <branch2>
2085
2086 ALT means alternative, which represents the operator `|'. */
2087
2088 static bin_tree_t *
parse_reg_exp(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,Idx nest,reg_errcode_t * err)2089 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2090 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2091 {
2092 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2093 bin_tree_t *tree, *branch = NULL;
2094 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2095 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2096 return NULL;
2097
2098 while (token->type == OP_ALT)
2099 {
2100 fetch_token (token, regexp, syntax | REG_CARET_ANCHORS_HERE);
2101 if (token->type != OP_ALT && token->type != END_OF_RE
2102 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2103 {
2104 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2105 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2106 return NULL;
2107 }
2108 else
2109 branch = NULL;
2110 tree = create_tree (dfa, tree, branch, OP_ALT);
2111 if (BE (tree == NULL, 0))
2112 {
2113 *err = REG_ESPACE;
2114 return NULL;
2115 }
2116 }
2117 return tree;
2118 }
2119
2120 /* This function build the following tree, from regular expression
2121 <exp1><exp2>:
2122 CAT
2123 / \
2124 / \
2125 <exp1> <exp2>
2126
2127 CAT means concatenation. */
2128
2129 static bin_tree_t *
parse_branch(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,Idx nest,reg_errcode_t * err)2130 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2131 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2132 {
2133 bin_tree_t *tree, *exp;
2134 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2135 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2136 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2137 return NULL;
2138
2139 while (token->type != OP_ALT && token->type != END_OF_RE
2140 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2141 {
2142 exp = parse_expression (regexp, preg, token, syntax, nest, err);
2143 if (BE (*err != REG_NOERROR && exp == NULL, 0))
2144 {
2145 return NULL;
2146 }
2147 if (tree != NULL && exp != NULL)
2148 {
2149 tree = create_tree (dfa, tree, exp, CONCAT);
2150 if (tree == NULL)
2151 {
2152 *err = REG_ESPACE;
2153 return NULL;
2154 }
2155 }
2156 else if (tree == NULL)
2157 tree = exp;
2158 /* Otherwise exp == NULL, we don't need to create new tree. */
2159 }
2160 return tree;
2161 }
2162
2163 /* This function build the following tree, from regular expression a*:
2164 *
2165 |
2166 a
2167 */
2168
2169 static bin_tree_t *
parse_expression(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,Idx nest,reg_errcode_t * err)2170 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2171 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2172 {
2173 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2174 bin_tree_t *tree;
2175 switch (token->type)
2176 {
2177 case CHARACTER:
2178 tree = create_token_tree (dfa, NULL, NULL, token);
2179 if (BE (tree == NULL, 0))
2180 {
2181 *err = REG_ESPACE;
2182 return NULL;
2183 }
2184 #ifdef RE_ENABLE_I18N
2185 if (dfa->mb_cur_max > 1)
2186 {
2187 while (!re_string_eoi (regexp)
2188 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2189 {
2190 bin_tree_t *mbc_remain;
2191 fetch_token (token, regexp, syntax);
2192 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2193 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2194 if (BE (mbc_remain == NULL || tree == NULL, 0))
2195 {
2196 *err = REG_ESPACE;
2197 return NULL;
2198 }
2199 }
2200 }
2201 #endif
2202 break;
2203 case OP_OPEN_SUBEXP:
2204 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2205 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2206 return NULL;
2207 break;
2208 case OP_OPEN_BRACKET:
2209 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2210 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2211 return NULL;
2212 break;
2213 case OP_BACK_REF:
2214 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2215 {
2216 *err = REG_ESUBREG;
2217 return NULL;
2218 }
2219 dfa->used_bkref_map |= 1 << token->opr.idx;
2220 tree = create_token_tree (dfa, NULL, NULL, token);
2221 if (BE (tree == NULL, 0))
2222 {
2223 *err = REG_ESPACE;
2224 return NULL;
2225 }
2226 ++dfa->nbackref;
2227 dfa->has_mb_node = 1;
2228 break;
2229 case OP_OPEN_DUP_NUM:
2230 if (syntax & REG_CONTEXT_INVALID_DUP)
2231 {
2232 *err = REG_BADRPT;
2233 return NULL;
2234 }
2235 /* FALLTHROUGH */
2236 case OP_DUP_ASTERISK:
2237 case OP_DUP_PLUS:
2238 case OP_DUP_QUESTION:
2239 if (syntax & REG_CONTEXT_INVALID_OPS)
2240 {
2241 *err = REG_BADRPT;
2242 return NULL;
2243 }
2244 else if (syntax & REG_CONTEXT_INDEP_OPS)
2245 {
2246 fetch_token (token, regexp, syntax);
2247 return parse_expression (regexp, preg, token, syntax, nest, err);
2248 }
2249 /* else fall through */
2250 case OP_CLOSE_SUBEXP:
2251 if ((token->type == OP_CLOSE_SUBEXP) &&
2252 !(syntax & REG_UNMATCHED_RIGHT_PAREN_ORD))
2253 {
2254 *err = REG_ERPAREN;
2255 return NULL;
2256 }
2257 /* else fall through */
2258 case OP_CLOSE_DUP_NUM:
2259 /* We treat it as a normal character. */
2260
2261 /* Then we can these characters as normal characters. */
2262 token->type = CHARACTER;
2263 /* mb_partial and word_char bits should be initialized already
2264 by peek_token. */
2265 tree = create_token_tree (dfa, NULL, NULL, token);
2266 if (BE (tree == NULL, 0))
2267 {
2268 *err = REG_ESPACE;
2269 return NULL;
2270 }
2271 break;
2272 case ANCHOR:
2273 if ((token->opr.ctx_type
2274 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2275 && dfa->word_ops_used == 0)
2276 init_word_char (dfa);
2277 if (token->opr.ctx_type == WORD_DELIM
2278 || token->opr.ctx_type == NOT_WORD_DELIM)
2279 {
2280 bin_tree_t *tree_first, *tree_last;
2281 if (token->opr.ctx_type == WORD_DELIM)
2282 {
2283 token->opr.ctx_type = WORD_FIRST;
2284 tree_first = create_token_tree (dfa, NULL, NULL, token);
2285 token->opr.ctx_type = WORD_LAST;
2286 }
2287 else
2288 {
2289 token->opr.ctx_type = INSIDE_WORD;
2290 tree_first = create_token_tree (dfa, NULL, NULL, token);
2291 token->opr.ctx_type = INSIDE_NOTWORD;
2292 }
2293 tree_last = create_token_tree (dfa, NULL, NULL, token);
2294 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2295 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2296 {
2297 *err = REG_ESPACE;
2298 return NULL;
2299 }
2300 }
2301 else
2302 {
2303 tree = create_token_tree (dfa, NULL, NULL, token);
2304 if (BE (tree == NULL, 0))
2305 {
2306 *err = REG_ESPACE;
2307 return NULL;
2308 }
2309 }
2310 /* We must return here, since ANCHORs can't be followed
2311 by repetition operators.
2312 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2313 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2314 fetch_token (token, regexp, syntax);
2315 return tree;
2316 case OP_PERIOD:
2317 tree = create_token_tree (dfa, NULL, NULL, token);
2318 if (BE (tree == NULL, 0))
2319 {
2320 *err = REG_ESPACE;
2321 return NULL;
2322 }
2323 if (dfa->mb_cur_max > 1)
2324 dfa->has_mb_node = 1;
2325 break;
2326 case OP_WORD:
2327 case OP_NOTWORD:
2328 tree = build_charclass_op (dfa, regexp->trans,
2329 (const unsigned char *) "alnum",
2330 (const unsigned char *) "_",
2331 token->type == OP_NOTWORD, err);
2332 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2333 return NULL;
2334 break;
2335 case OP_SPACE:
2336 case OP_NOTSPACE:
2337 tree = build_charclass_op (dfa, regexp->trans,
2338 (const unsigned char *) "space",
2339 (const unsigned char *) "",
2340 token->type == OP_NOTSPACE, err);
2341 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2342 return NULL;
2343 break;
2344 case OP_ALT:
2345 case END_OF_RE:
2346 return NULL;
2347 case BACK_SLASH:
2348 *err = REG_EESCAPE;
2349 return NULL;
2350 default:
2351 /* Must not happen? */
2352 #ifdef DEBUG
2353 assert (0);
2354 #endif
2355 return NULL;
2356 }
2357 fetch_token (token, regexp, syntax);
2358
2359 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2360 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2361 {
2362 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2363 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2364 return NULL;
2365 /* In BRE consecutive duplications are not allowed. */
2366 if ((syntax & REG_CONTEXT_INVALID_DUP)
2367 && (token->type == OP_DUP_ASTERISK
2368 || token->type == OP_OPEN_DUP_NUM))
2369 {
2370 *err = REG_BADRPT;
2371 return NULL;
2372 }
2373 }
2374
2375 return tree;
2376 }
2377
2378 /* This function build the following tree, from regular expression
2379 (<reg_exp>):
2380 SUBEXP
2381 |
2382 <reg_exp>
2383 */
2384
2385 static bin_tree_t *
parse_sub_exp(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,Idx nest,reg_errcode_t * err)2386 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2387 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2388 {
2389 re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2390 bin_tree_t *tree;
2391 size_t cur_nsub;
2392 cur_nsub = preg->re_nsub++;
2393
2394 fetch_token (token, regexp, syntax | REG_CARET_ANCHORS_HERE);
2395
2396 /* The subexpression may be a null string. */
2397 if (token->type == OP_CLOSE_SUBEXP)
2398 tree = NULL;
2399 else
2400 {
2401 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2402 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2403 *err = REG_EPAREN;
2404 if (BE (*err != REG_NOERROR, 0))
2405 return NULL;
2406 }
2407
2408 if (cur_nsub <= '9' - '1')
2409 dfa->completed_bkref_map |= 1 << cur_nsub;
2410
2411 tree = create_tree (dfa, tree, NULL, SUBEXP);
2412 if (BE (tree == NULL, 0))
2413 {
2414 *err = REG_ESPACE;
2415 return NULL;
2416 }
2417 tree->token.opr.idx = cur_nsub;
2418 return tree;
2419 }
2420
2421 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2422
2423 static bin_tree_t *
parse_dup_op(bin_tree_t * elem,re_string_t * regexp,re_dfa_t * dfa,re_token_t * token,reg_syntax_t syntax,reg_errcode_t * err)2424 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2425 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2426 {
2427 bin_tree_t *tree = NULL, *old_tree = NULL;
2428 Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2429 re_token_t start_token = *token;
2430
2431 if (token->type == OP_OPEN_DUP_NUM)
2432 {
2433 end = 0;
2434 start = fetch_number (regexp, token, syntax);
2435 if (start == REG_MISSING)
2436 {
2437 if (token->type == CHARACTER && token->opr.c == ',')
2438 start = 0; /* We treat "{,m}" as "{0,m}". */
2439 else
2440 {
2441 *err = REG_BADBR; /* <re>{} is invalid. */
2442 return NULL;
2443 }
2444 }
2445 if (BE (start != REG_ERROR, 1))
2446 {
2447 /* We treat "{n}" as "{n,n}". */
2448 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2449 : ((token->type == CHARACTER && token->opr.c == ',')
2450 ? fetch_number (regexp, token, syntax) : REG_ERROR));
2451 }
2452 if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2453 {
2454 /* Invalid sequence. */
2455 if (BE (!(syntax & REG_INVALID_INTERVAL_ORD), 0))
2456 {
2457 if (token->type == END_OF_RE)
2458 *err = REG_EBRACE;
2459 else
2460 *err = REG_BADBR;
2461
2462 return NULL;
2463 }
2464
2465 /* If the syntax bit is set, rollback. */
2466 re_string_set_index (regexp, start_idx);
2467 *token = start_token;
2468 token->type = CHARACTER;
2469 /* mb_partial and word_char bits should be already initialized by
2470 peek_token. */
2471 return elem;
2472 }
2473
2474 if (BE (end != REG_MISSING && start > end, 0))
2475 {
2476 /* First number greater than second. */
2477 *err = REG_BADBR;
2478 return NULL;
2479 }
2480 }
2481 else
2482 {
2483 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2484 end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2485 }
2486
2487 fetch_token (token, regexp, syntax);
2488
2489 if (BE (elem == NULL, 0))
2490 return NULL;
2491 if (BE (start == 0 && end == 0, 0))
2492 {
2493 postorder (elem, free_tree, NULL);
2494 return NULL;
2495 }
2496
2497 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2498 if (BE (start > 0, 0))
2499 {
2500 tree = elem;
2501 for (i = 2; i <= start; ++i)
2502 {
2503 elem = duplicate_tree (elem, dfa);
2504 tree = create_tree (dfa, tree, elem, CONCAT);
2505 if (BE (elem == NULL || tree == NULL, 0))
2506 goto parse_dup_op_espace;
2507 }
2508
2509 if (start == end)
2510 return tree;
2511
2512 /* Duplicate ELEM before it is marked optional. */
2513 elem = duplicate_tree (elem, dfa);
2514 old_tree = tree;
2515 }
2516 else
2517 old_tree = NULL;
2518
2519 if (elem->token.type == SUBEXP)
2520 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2521
2522 tree = create_tree (dfa, elem, NULL,
2523 (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2524 if (BE (tree == NULL, 0))
2525 goto parse_dup_op_espace;
2526
2527 /* This loop is actually executed only when end != REG_MISSING,
2528 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2529 already created the start+1-th copy. */
2530 if ((Idx) -1 < 0 || end != REG_MISSING)
2531 for (i = start + 2; i <= end; ++i)
2532 {
2533 elem = duplicate_tree (elem, dfa);
2534 tree = create_tree (dfa, tree, elem, CONCAT);
2535 if (BE (elem == NULL || tree == NULL, 0))
2536 goto parse_dup_op_espace;
2537
2538 tree = create_tree (dfa, tree, NULL, OP_ALT);
2539 if (BE (tree == NULL, 0))
2540 goto parse_dup_op_espace;
2541 }
2542
2543 if (old_tree)
2544 tree = create_tree (dfa, old_tree, tree, CONCAT);
2545
2546 return tree;
2547
2548 parse_dup_op_espace:
2549 *err = REG_ESPACE;
2550 return NULL;
2551 }
2552
2553 /* Size of the names for collating symbol/equivalence_class/character_class.
2554 I'm not sure, but maybe enough. */
2555 #define BRACKET_NAME_BUF_SIZE 32
2556
2557 #ifndef _LIBC
2558 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2559 Build the range expression which starts from START_ELEM, and ends
2560 at END_ELEM. The result are written to MBCSET and SBCSET.
2561 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2562 mbcset->range_ends, is a pointer argument sinse we may
2563 update it. */
2564
2565 static reg_errcode_t
build_range_exp(bitset sbcset,re_charset_t * mbcset,Idx * range_alloc,bracket_elem_t * start_elem,bracket_elem_t * end_elem)2566 build_range_exp (bitset sbcset,
2567 # ifdef RE_ENABLE_I18N
2568 re_charset_t *mbcset, Idx *range_alloc,
2569 # endif
2570 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2571 {
2572 unsigned int start_ch, end_ch;
2573 /* Equivalence Classes and Character Classes can't be a range start/end. */
2574 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2575 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2576 0))
2577 return REG_ERANGE;
2578
2579 /* We can handle no multi character collating elements without libc
2580 support. */
2581 if (BE ((start_elem->type == COLL_SYM
2582 && strlen ((char *) start_elem->opr.name) > 1)
2583 || (end_elem->type == COLL_SYM
2584 && strlen ((char *) end_elem->opr.name) > 1), 0))
2585 return REG_ECOLLATE;
2586
2587 # ifdef RE_ENABLE_I18N
2588 {
2589 wchar_t wc;
2590 wint_t start_wc, end_wc;
2591 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2592
2593 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2594 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2595 : 0));
2596 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2597 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2598 : 0));
2599 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2600 ? __btowc (start_ch) : start_elem->opr.wch);
2601 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2602 ? __btowc (end_ch) : end_elem->opr.wch);
2603 if (start_wc == WEOF || end_wc == WEOF)
2604 return REG_ECOLLATE;
2605 cmp_buf[0] = start_wc;
2606 cmp_buf[4] = end_wc;
2607 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2608 return REG_ERANGE;
2609
2610 /* Got valid collation sequence values, add them as a new entry.
2611 However, for !_LIBC we have no collation elements: if the
2612 character set is single byte, the single byte character set
2613 that we build below suffices. parse_bracket_exp passes
2614 no MBCSET if dfa->mb_cur_max == 1. */
2615 if (mbcset)
2616 {
2617 /* Check the space of the arrays. */
2618 if (BE (*range_alloc == mbcset->nranges, 0))
2619 {
2620 /* There is not enough space, need realloc. */
2621 wchar_t *new_array_start, *new_array_end;
2622 Idx new_nranges;
2623
2624 new_nranges = mbcset->nranges;
2625 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2626 are NULL if *range_alloc == 0. */
2627 new_array_start = re_x2realloc (mbcset->range_starts, wchar_t,
2628 &new_nranges);
2629 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2630 new_nranges);
2631
2632 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2633 return REG_ESPACE;
2634
2635 mbcset->range_starts = new_array_start;
2636 mbcset->range_ends = new_array_end;
2637 *range_alloc = new_nranges;
2638 }
2639
2640 mbcset->range_starts[mbcset->nranges] = start_wc;
2641 mbcset->range_ends[mbcset->nranges++] = end_wc;
2642 }
2643
2644 /* Build the table for single byte characters. */
2645 for (wc = 0; wc < SBC_MAX; ++wc)
2646 {
2647 cmp_buf[2] = wc;
2648 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2649 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2650 bitset_set (sbcset, wc);
2651 }
2652 }
2653 # else /* not RE_ENABLE_I18N */
2654 {
2655 unsigned int ch;
2656 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2657 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2658 : 0));
2659 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2660 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2661 : 0));
2662 if (start_ch > end_ch)
2663 return REG_ERANGE;
2664 /* Build the table for single byte characters. */
2665 for (ch = 0; ch < SBC_MAX; ++ch)
2666 if (start_ch <= ch && ch <= end_ch)
2667 bitset_set (sbcset, ch);
2668 }
2669 # endif /* not RE_ENABLE_I18N */
2670 return REG_NOERROR;
2671 }
2672 #endif /* not _LIBC */
2673
2674 #ifndef _LIBC
2675 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2676 Build the collating element which is represented by NAME.
2677 The result are written to MBCSET and SBCSET.
2678 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2679 pointer argument since we may update it. */
2680
2681 static reg_errcode_t
build_collating_symbol(bitset sbcset,re_charset_t * mbcset,Idx * coll_sym_alloc,const unsigned char * name)2682 build_collating_symbol (bitset sbcset,
2683 # ifdef RE_ENABLE_I18N
2684 re_charset_t *mbcset, Idx *coll_sym_alloc,
2685 # endif
2686 const unsigned char *name)
2687 {
2688 size_t name_len = strlen ((const char *) name);
2689 if (BE (name_len != 1, 0))
2690 return REG_ECOLLATE;
2691 else
2692 {
2693 bitset_set (sbcset, name[0]);
2694 return REG_NOERROR;
2695 }
2696 }
2697 #endif /* not _LIBC */
2698
2699 /* This function parse bracket expression like "[abc]", "[a-c]",
2700 "[[.a-a.]]" etc. */
2701
2702 static bin_tree_t *
parse_bracket_exp(re_string_t * regexp,re_dfa_t * dfa,re_token_t * token,reg_syntax_t syntax,reg_errcode_t * err)2703 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2704 reg_syntax_t syntax, reg_errcode_t *err)
2705 {
2706 #ifdef _LIBC
2707 const unsigned char *collseqmb;
2708 const char *collseqwc;
2709 uint32_t nrules;
2710 int32_t table_size;
2711 const int32_t *symb_table;
2712 const unsigned char *extra;
2713
2714 /* Local function for parse_bracket_exp used in _LIBC environement.
2715 Seek the collating symbol entry correspondings to NAME.
2716 Return the index of the symbol in the SYMB_TABLE. */
2717
2718 auto inline int32_t
2719 __attribute ((always_inline))
2720 seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
2721 {
2722 int32_t hash = elem_hash ((const char *) name, name_len);
2723 int32_t elem = hash % table_size;
2724 int32_t second = hash % (table_size - 2);
2725 while (symb_table[2 * elem] != 0)
2726 {
2727 /* First compare the hashing value. */
2728 if (symb_table[2 * elem] == hash
2729 /* Compare the length of the name. */
2730 && name_len == extra[symb_table[2 * elem + 1]]
2731 /* Compare the name. */
2732 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2733 name_len) == 0)
2734 {
2735 /* Yep, this is the entry. */
2736 break;
2737 }
2738
2739 /* Next entry. */
2740 elem += second;
2741 }
2742 return elem;
2743 }
2744
2745 /* Local function for parse_bracket_exp used in _LIBC environement.
2746 Look up the collation sequence value of BR_ELEM.
2747 Return the value if succeeded, UINT_MAX otherwise. */
2748
2749 auto inline unsigned int
2750 __attribute ((always_inline))
2751 lookup_collation_sequence_value (bracket_elem_t *br_elem)
2752 {
2753 if (br_elem->type == SB_CHAR)
2754 {
2755 /*
2756 if (MB_CUR_MAX == 1)
2757 */
2758 if (nrules == 0)
2759 return collseqmb[br_elem->opr.ch];
2760 else
2761 {
2762 wint_t wc = __btowc (br_elem->opr.ch);
2763 return __collseq_table_lookup (collseqwc, wc);
2764 }
2765 }
2766 else if (br_elem->type == MB_CHAR)
2767 {
2768 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2769 }
2770 else if (br_elem->type == COLL_SYM)
2771 {
2772 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2773 if (nrules != 0)
2774 {
2775 int32_t elem, idx;
2776 elem = seek_collating_symbol_entry (br_elem->opr.name,
2777 sym_name_len);
2778 if (symb_table[2 * elem] != 0)
2779 {
2780 /* We found the entry. */
2781 idx = symb_table[2 * elem + 1];
2782 /* Skip the name of collating element name. */
2783 idx += 1 + extra[idx];
2784 /* Skip the byte sequence of the collating element. */
2785 idx += 1 + extra[idx];
2786 /* Adjust for the alignment. */
2787 idx = (idx + 3) & ~3;
2788 /* Skip the multibyte collation sequence value. */
2789 idx += sizeof (unsigned int);
2790 /* Skip the wide char sequence of the collating element. */
2791 idx += sizeof (unsigned int) *
2792 (1 + *(unsigned int *) (extra + idx));
2793 /* Return the collation sequence value. */
2794 return *(unsigned int *) (extra + idx);
2795 }
2796 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2797 {
2798 /* No valid character. Match it as a single byte
2799 character. */
2800 return collseqmb[br_elem->opr.name[0]];
2801 }
2802 }
2803 else if (sym_name_len == 1)
2804 return collseqmb[br_elem->opr.name[0]];
2805 }
2806 return UINT_MAX;
2807 }
2808
2809 /* Local function for parse_bracket_exp used in _LIBC environement.
2810 Build the range expression which starts from START_ELEM, and ends
2811 at END_ELEM. The result are written to MBCSET and SBCSET.
2812 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2813 mbcset->range_ends, is a pointer argument sinse we may
2814 update it. */
2815
2816 auto inline reg_errcode_t
2817 __attribute ((always_inline))
2818 build_range_exp (bitset sbcset, re_charset_t *mbcset,
2819 Idx *range_alloc,
2820 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2821 {
2822 unsigned int ch;
2823 uint32_t start_collseq;
2824 uint32_t end_collseq;
2825
2826 /* Equivalence Classes and Character Classes can't be a range
2827 start/end. */
2828 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2829 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2830 0))
2831 return REG_ERANGE;
2832
2833 start_collseq = lookup_collation_sequence_value (start_elem);
2834 end_collseq = lookup_collation_sequence_value (end_elem);
2835 /* Check start/end collation sequence values. */
2836 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2837 return REG_ECOLLATE;
2838 if (BE ((syntax & REG_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2839 return REG_ERANGE;
2840
2841 /* Got valid collation sequence values, add them as a new entry.
2842 However, if we have no collation elements, and the character set
2843 is single byte, the single byte character set that we
2844 build below suffices. */
2845 if (nrules > 0 || dfa->mb_cur_max > 1)
2846 {
2847 /* Check the space of the arrays. */
2848 if (BE (*range_alloc == mbcset->nranges, 0))
2849 {
2850 /* There is not enough space, need realloc. */
2851 uint32_t *new_array_start;
2852 uint32_t *new_array_end;
2853 Idx new_nranges;
2854
2855 new_nranges = mbcset->nranges;
2856 new_array_start = re_x2realloc (mbcset->range_starts, uint32_t,
2857 &new_nranges);
2858 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2859 new_nranges);
2860
2861 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2862 return REG_ESPACE;
2863
2864 mbcset->range_starts = new_array_start;
2865 mbcset->range_ends = new_array_end;
2866 *range_alloc = new_nranges;
2867 }
2868
2869 mbcset->range_starts[mbcset->nranges] = start_collseq;
2870 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2871 }
2872
2873 /* Build the table for single byte characters. */
2874 for (ch = 0; ch < SBC_MAX; ch++)
2875 {
2876 uint32_t ch_collseq;
2877 /*
2878 if (MB_CUR_MAX == 1)
2879 */
2880 if (nrules == 0)
2881 ch_collseq = collseqmb[ch];
2882 else
2883 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2884 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2885 bitset_set (sbcset, ch);
2886 }
2887 return REG_NOERROR;
2888 }
2889
2890 /* Local function for parse_bracket_exp used in _LIBC environement.
2891 Build the collating element which is represented by NAME.
2892 The result are written to MBCSET and SBCSET.
2893 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2894 pointer argument sinse we may update it. */
2895
2896 auto inline reg_errcode_t
2897 __attribute ((always_inline))
2898 build_collating_symbol (bitset sbcset, re_charset_t *mbcset,
2899 Idx *coll_sym_alloc, const unsigned char *name)
2900 {
2901 int32_t elem, idx;
2902 size_t name_len = strlen ((const char *) name);
2903 if (nrules != 0)
2904 {
2905 elem = seek_collating_symbol_entry (name, name_len);
2906 if (symb_table[2 * elem] != 0)
2907 {
2908 /* We found the entry. */
2909 idx = symb_table[2 * elem + 1];
2910 /* Skip the name of collating element name. */
2911 idx += 1 + extra[idx];
2912 }
2913 else if (symb_table[2 * elem] == 0 && name_len == 1)
2914 {
2915 /* No valid character, treat it as a normal
2916 character. */
2917 bitset_set (sbcset, name[0]);
2918 return REG_NOERROR;
2919 }
2920 else
2921 return REG_ECOLLATE;
2922
2923 /* Got valid collation sequence, add it as a new entry. */
2924 /* Check the space of the arrays. */
2925 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2926 {
2927 /* Not enough, realloc it. */
2928 Idx new_coll_sym_alloc = mbcset->ncoll_syms;
2929 /* Use realloc since mbcset->coll_syms is NULL
2930 if *alloc == 0. */
2931 int32_t *new_coll_syms = re_x2realloc (mbcset->coll_syms, int32_t,
2932 &new_coll_sym_alloc);
2933 if (BE (new_coll_syms == NULL, 0))
2934 return REG_ESPACE;
2935 mbcset->coll_syms = new_coll_syms;
2936 *coll_sym_alloc = new_coll_sym_alloc;
2937 }
2938 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2939 return REG_NOERROR;
2940 }
2941 else
2942 {
2943 if (BE (name_len != 1, 0))
2944 return REG_ECOLLATE;
2945 else
2946 {
2947 bitset_set (sbcset, name[0]);
2948 return REG_NOERROR;
2949 }
2950 }
2951 }
2952 #endif
2953
2954 re_token_t br_token;
2955 re_bitset_ptr_t sbcset;
2956 #ifdef RE_ENABLE_I18N
2957 re_charset_t *mbcset;
2958 Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2959 Idx equiv_class_alloc = 0, char_class_alloc = 0;
2960 #endif /* not RE_ENABLE_I18N */
2961 bool non_match = false;
2962 bin_tree_t *work_tree;
2963 int token_len;
2964 bool first_round = true;
2965 #ifdef _LIBC
2966 collseqmb = (const unsigned char *)
2967 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
2968 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2969 if (nrules)
2970 {
2971 /*
2972 if (MB_CUR_MAX > 1)
2973 */
2974 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
2975 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
2976 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
2977 _NL_COLLATE_SYMB_TABLEMB);
2978 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
2979 _NL_COLLATE_SYMB_EXTRAMB);
2980 }
2981 #endif
2982 sbcset = re_calloc (bitset_word, BITSET_WORDS);
2983 #ifdef RE_ENABLE_I18N
2984 mbcset = re_calloc (re_charset_t, 1);
2985 #endif /* RE_ENABLE_I18N */
2986 #ifdef RE_ENABLE_I18N
2987 if (BE (sbcset == NULL || mbcset == NULL, 0))
2988 #else
2989 if (BE (sbcset == NULL, 0))
2990 #endif /* RE_ENABLE_I18N */
2991 {
2992 *err = REG_ESPACE;
2993 return NULL;
2994 }
2995
2996 token_len = peek_token_bracket (token, regexp, syntax);
2997 if (BE (token->type == END_OF_RE, 0))
2998 {
2999 *err = REG_BADPAT;
3000 goto parse_bracket_exp_free_return;
3001 }
3002 if (token->type == OP_NON_MATCH_LIST)
3003 {
3004 #ifdef RE_ENABLE_I18N
3005 mbcset->non_match = 1;
3006 #endif /* not RE_ENABLE_I18N */
3007 non_match = true;
3008 if (syntax & REG_HAT_LISTS_NOT_NEWLINE)
3009 bitset_set (sbcset, '\0');
3010 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3011 token_len = peek_token_bracket (token, regexp, syntax);
3012 if (BE (token->type == END_OF_RE, 0))
3013 {
3014 *err = REG_BADPAT;
3015 goto parse_bracket_exp_free_return;
3016 }
3017 }
3018
3019 /* We treat the first ']' as a normal character. */
3020 if (token->type == OP_CLOSE_BRACKET)
3021 token->type = CHARACTER;
3022
3023 while (1)
3024 {
3025 bracket_elem_t start_elem, end_elem;
3026 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3027 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3028 reg_errcode_t ret;
3029 int token_len2 = 0;
3030 bool is_range_exp = false;
3031 re_token_t token2;
3032
3033 start_elem.opr.name = start_name_buf;
3034 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3035 syntax, first_round);
3036 if (BE (ret != REG_NOERROR, 0))
3037 {
3038 *err = ret;
3039 goto parse_bracket_exp_free_return;
3040 }
3041 first_round = false;
3042
3043 /* Get information about the next token. We need it in any case. */
3044 token_len = peek_token_bracket (token, regexp, syntax);
3045
3046 /* Do not check for ranges if we know they are not allowed. */
3047 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3048 {
3049 if (BE (token->type == END_OF_RE, 0))
3050 {
3051 *err = REG_EBRACK;
3052 goto parse_bracket_exp_free_return;
3053 }
3054 if (token->type == OP_CHARSET_RANGE)
3055 {
3056 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3057 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3058 if (BE (token2.type == END_OF_RE, 0))
3059 {
3060 *err = REG_EBRACK;
3061 goto parse_bracket_exp_free_return;
3062 }
3063 if (token2.type == OP_CLOSE_BRACKET)
3064 {
3065 /* We treat the last '-' as a normal character. */
3066 re_string_skip_bytes (regexp, -token_len);
3067 token->type = CHARACTER;
3068 }
3069 else
3070 is_range_exp = true;
3071 }
3072 }
3073
3074 if (is_range_exp == true)
3075 {
3076 end_elem.opr.name = end_name_buf;
3077 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3078 dfa, syntax, true);
3079 if (BE (ret != REG_NOERROR, 0))
3080 {
3081 *err = ret;
3082 goto parse_bracket_exp_free_return;
3083 }
3084
3085 token_len = peek_token_bracket (token, regexp, syntax);
3086
3087 #ifdef _LIBC
3088 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3089 &start_elem, &end_elem);
3090 #else
3091 # ifdef RE_ENABLE_I18N
3092 *err = build_range_exp (sbcset,
3093 dfa->mb_cur_max > 1 ? mbcset : NULL,
3094 &range_alloc, &start_elem, &end_elem);
3095 # else
3096 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3097 # endif
3098 #endif /* RE_ENABLE_I18N */
3099 if (BE (*err != REG_NOERROR, 0))
3100 goto parse_bracket_exp_free_return;
3101 }
3102 else
3103 {
3104 switch (start_elem.type)
3105 {
3106 case SB_CHAR:
3107 bitset_set (sbcset, start_elem.opr.ch);
3108 break;
3109 #ifdef RE_ENABLE_I18N
3110 case MB_CHAR:
3111 /* Check whether the array has enough space. */
3112 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3113 {
3114 wchar_t *new_mbchars;
3115 /* Not enough, realloc it. */
3116 mbchar_alloc = mbcset->nmbchars;
3117 /* Use realloc since array is NULL if *alloc == 0. */
3118 new_mbchars = re_x2realloc (mbcset->mbchars, wchar_t,
3119 &mbchar_alloc);
3120 if (BE (new_mbchars == NULL, 0))
3121 goto parse_bracket_exp_espace;
3122 mbcset->mbchars = new_mbchars;
3123 }
3124 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3125 break;
3126 #endif /* RE_ENABLE_I18N */
3127 case EQUIV_CLASS:
3128 *err = build_equiv_class (sbcset,
3129 #ifdef RE_ENABLE_I18N
3130 mbcset, &equiv_class_alloc,
3131 #endif /* RE_ENABLE_I18N */
3132 start_elem.opr.name);
3133 if (BE (*err != REG_NOERROR, 0))
3134 goto parse_bracket_exp_free_return;
3135 break;
3136 case COLL_SYM:
3137 *err = build_collating_symbol (sbcset,
3138 #ifdef RE_ENABLE_I18N
3139 mbcset, &coll_sym_alloc,
3140 #endif /* RE_ENABLE_I18N */
3141 start_elem.opr.name);
3142 if (BE (*err != REG_NOERROR, 0))
3143 goto parse_bracket_exp_free_return;
3144 break;
3145 case CHAR_CLASS:
3146 *err = build_charclass (regexp->trans, sbcset,
3147 #ifdef RE_ENABLE_I18N
3148 mbcset, &char_class_alloc,
3149 #endif /* RE_ENABLE_I18N */
3150 start_elem.opr.name, syntax);
3151 if (BE (*err != REG_NOERROR, 0))
3152 goto parse_bracket_exp_free_return;
3153 break;
3154 default:
3155 assert (0);
3156 break;
3157 }
3158 }
3159 if (BE (token->type == END_OF_RE, 0))
3160 {
3161 *err = REG_EBRACK;
3162 goto parse_bracket_exp_free_return;
3163 }
3164 if (token->type == OP_CLOSE_BRACKET)
3165 break;
3166 }
3167
3168 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3169
3170 /* If it is non-matching list. */
3171 if (non_match)
3172 bitset_not (sbcset);
3173
3174 #ifdef RE_ENABLE_I18N
3175 /* Ensure only single byte characters are set. */
3176 if (dfa->mb_cur_max > 1)
3177 bitset_mask (sbcset, dfa->sb_char);
3178
3179 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3180 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3181 || mbcset->non_match)))
3182 {
3183 bin_tree_t *mbc_tree;
3184 int sbc_idx;
3185 /* Build a tree for complex bracket. */
3186 dfa->has_mb_node = 1;
3187 br_token.type = COMPLEX_BRACKET;
3188 br_token.opr.mbcset = mbcset;
3189 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3190 if (BE (mbc_tree == NULL, 0))
3191 goto parse_bracket_exp_espace;
3192 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3193 if (sbcset[sbc_idx])
3194 break;
3195 /* If there are no bits set in sbcset, there is no point
3196 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3197 if (sbc_idx < BITSET_WORDS)
3198 {
3199 /* Build a tree for simple bracket. */
3200 br_token.type = SIMPLE_BRACKET;
3201 br_token.opr.sbcset = sbcset;
3202 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3203 if (BE (work_tree == NULL, 0))
3204 goto parse_bracket_exp_espace;
3205
3206 /* Then join them by ALT node. */
3207 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3208 if (BE (work_tree == NULL, 0))
3209 goto parse_bracket_exp_espace;
3210 }
3211 else
3212 {
3213 re_free (sbcset);
3214 work_tree = mbc_tree;
3215 }
3216 }
3217 else
3218 #endif /* not RE_ENABLE_I18N */
3219 {
3220 #ifdef RE_ENABLE_I18N
3221 free_charset (mbcset);
3222 #endif
3223 /* Build a tree for simple bracket. */
3224 br_token.type = SIMPLE_BRACKET;
3225 br_token.opr.sbcset = sbcset;
3226 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3227 if (BE (work_tree == NULL, 0))
3228 goto parse_bracket_exp_espace;
3229 }
3230 return work_tree;
3231
3232 parse_bracket_exp_espace:
3233 *err = REG_ESPACE;
3234 parse_bracket_exp_free_return:
3235 re_free (sbcset);
3236 #ifdef RE_ENABLE_I18N
3237 free_charset (mbcset);
3238 #endif /* RE_ENABLE_I18N */
3239 return NULL;
3240 }
3241
3242 /* Parse an element in the bracket expression. */
3243
3244 static reg_errcode_t
parse_bracket_element(bracket_elem_t * elem,re_string_t * regexp,re_token_t * token,int token_len,re_dfa_t * dfa,reg_syntax_t syntax,bool accept_hyphen)3245 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3246 re_token_t *token, int token_len, re_dfa_t *dfa,
3247 reg_syntax_t syntax, bool accept_hyphen)
3248 {
3249 #ifdef RE_ENABLE_I18N
3250 int cur_char_size;
3251 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3252 if (cur_char_size > 1)
3253 {
3254 elem->type = MB_CHAR;
3255 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3256 re_string_skip_bytes (regexp, cur_char_size);
3257 return REG_NOERROR;
3258 }
3259 #endif /* RE_ENABLE_I18N */
3260 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3261 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3262 || token->type == OP_OPEN_EQUIV_CLASS)
3263 return parse_bracket_symbol (elem, regexp, token);
3264 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3265 {
3266 /* A '-' must only appear as anything but a range indicator before
3267 the closing bracket. Everything else is an error. */
3268 re_token_t token2;
3269 (void) peek_token_bracket (&token2, regexp, syntax);
3270 if (token2.type != OP_CLOSE_BRACKET)
3271 /* The actual error value is not standardized since this whole
3272 case is undefined. But ERANGE makes good sense. */
3273 return REG_ERANGE;
3274 }
3275 elem->type = SB_CHAR;
3276 elem->opr.ch = token->opr.c;
3277 return REG_NOERROR;
3278 }
3279
3280 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3281 such as [:<character_class>:], [.<collating_element>.], and
3282 [=<equivalent_class>=]. */
3283
3284 static reg_errcode_t
parse_bracket_symbol(bracket_elem_t * elem,re_string_t * regexp,re_token_t * token)3285 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3286 re_token_t *token)
3287 {
3288 unsigned char ch, delim = token->opr.c;
3289 int i = 0;
3290 if (re_string_eoi(regexp))
3291 return REG_EBRACK;
3292 for (;; ++i)
3293 {
3294 if (i >= BRACKET_NAME_BUF_SIZE)
3295 return REG_EBRACK;
3296 if (token->type == OP_OPEN_CHAR_CLASS)
3297 ch = re_string_fetch_byte_case (regexp);
3298 else
3299 ch = re_string_fetch_byte (regexp);
3300 if (re_string_eoi(regexp))
3301 return REG_EBRACK;
3302 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3303 break;
3304 elem->opr.name[i] = ch;
3305 }
3306 re_string_skip_bytes (regexp, 1);
3307 elem->opr.name[i] = '\0';
3308 switch (token->type)
3309 {
3310 case OP_OPEN_COLL_ELEM:
3311 elem->type = COLL_SYM;
3312 break;
3313 case OP_OPEN_EQUIV_CLASS:
3314 elem->type = EQUIV_CLASS;
3315 break;
3316 case OP_OPEN_CHAR_CLASS:
3317 elem->type = CHAR_CLASS;
3318 break;
3319 default:
3320 break;
3321 }
3322 return REG_NOERROR;
3323 }
3324
3325 /* Helper function for parse_bracket_exp.
3326 Build the equivalence class which is represented by NAME.
3327 The result are written to MBCSET and SBCSET.
3328 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3329 is a pointer argument sinse we may update it. */
3330
3331 static reg_errcode_t
build_equiv_class(bitset sbcset,re_charset_t * mbcset,Idx * equiv_class_alloc,const unsigned char * name)3332 build_equiv_class (bitset sbcset,
3333 #ifdef RE_ENABLE_I18N
3334 re_charset_t *mbcset, Idx *equiv_class_alloc,
3335 #endif
3336 const unsigned char *name)
3337 {
3338 #if defined _LIBC
3339 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3340 if (nrules != 0)
3341 {
3342 const int32_t *table, *indirect;
3343 const unsigned char *weights, *extra, *cp;
3344 unsigned char char_buf[2];
3345 int32_t idx1, idx2;
3346 unsigned int ch;
3347 size_t len;
3348 /* This #include defines a local function! */
3349 # include <locale/weight.h>
3350 /* Calculate the index for equivalence class. */
3351 cp = name;
3352 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3353 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3354 _NL_COLLATE_WEIGHTMB);
3355 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3356 _NL_COLLATE_EXTRAMB);
3357 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3358 _NL_COLLATE_INDIRECTMB);
3359 idx1 = findidx (&cp);
3360 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3361 /* This isn't a valid character. */
3362 return REG_ECOLLATE;
3363
3364 /* Build single byte matcing table for this equivalence class. */
3365 char_buf[1] = (unsigned char) '\0';
3366 len = weights[idx1];
3367 for (ch = 0; ch < SBC_MAX; ++ch)
3368 {
3369 char_buf[0] = ch;
3370 cp = char_buf;
3371 idx2 = findidx (&cp);
3372 /*
3373 idx2 = table[ch];
3374 */
3375 if (idx2 == 0)
3376 /* This isn't a valid character. */
3377 continue;
3378 if (len == weights[idx2])
3379 {
3380 int cnt = 0;
3381 while (cnt <= len &&
3382 weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3383 ++cnt;
3384
3385 if (cnt > len)
3386 bitset_set (sbcset, ch);
3387 }
3388 }
3389 /* Check whether the array has enough space. */
3390 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3391 {
3392 /* Not enough, realloc it. */
3393 Idx new_equiv_class_alloc = mbcset->nequiv_classes;
3394 /* Use realloc since the array is NULL if *alloc == 0. */
3395 int32_t *new_equiv_classes = re_x2realloc (mbcset->equiv_classes,
3396 int32_t,
3397 &new_equiv_class_alloc);
3398 if (BE (new_equiv_classes == NULL, 0))
3399 return REG_ESPACE;
3400 mbcset->equiv_classes = new_equiv_classes;
3401 *equiv_class_alloc = new_equiv_class_alloc;
3402 }
3403 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3404 }
3405 else
3406 #endif /* _LIBC */
3407 {
3408 if (BE (strlen ((const char *) name) != 1, 0))
3409 return REG_ECOLLATE;
3410 bitset_set (sbcset, *name);
3411 }
3412 return REG_NOERROR;
3413 }
3414
3415 /* Helper function for parse_bracket_exp.
3416 Build the character class which is represented by NAME.
3417 The result are written to MBCSET and SBCSET.
3418 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3419 is a pointer argument sinse we may update it. */
3420
3421 static reg_errcode_t
build_charclass(unsigned REG_TRANSLATE_TYPE trans,bitset sbcset,re_charset_t * mbcset,Idx * char_class_alloc,const unsigned char * class_name,reg_syntax_t syntax)3422 build_charclass (unsigned REG_TRANSLATE_TYPE trans, bitset sbcset,
3423 #ifdef RE_ENABLE_I18N
3424 re_charset_t *mbcset, Idx *char_class_alloc,
3425 #endif
3426 const unsigned char *class_name, reg_syntax_t syntax)
3427 {
3428 int i;
3429 const char *name = (const char *) class_name;
3430
3431 /* In case of REG_ICASE "upper" and "lower" match the both of
3432 upper and lower cases. */
3433 if ((syntax & REG_IGNORE_CASE)
3434 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3435 name = "alpha";
3436
3437 #ifdef RE_ENABLE_I18N
3438 /* Check the space of the arrays. */
3439 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3440 {
3441 /* Not enough, realloc it. */
3442 Idx new_char_class_alloc = mbcset->nchar_classes;
3443 /* Use realloc since array is NULL if *alloc == 0. */
3444 wctype_t *new_char_classes = re_x2realloc (mbcset->char_classes, wctype_t,
3445 &new_char_class_alloc);
3446 if (BE (new_char_classes == NULL, 0))
3447 return REG_ESPACE;
3448 mbcset->char_classes = new_char_classes;
3449 *char_class_alloc = new_char_class_alloc;
3450 }
3451 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3452 #endif /* RE_ENABLE_I18N */
3453
3454 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3455 for (i = 0; i < SBC_MAX; ++i) \
3456 { \
3457 if (ctype_func (i)) \
3458 { \
3459 int ch = trans ? trans[i] : i; \
3460 bitset_set (sbcset, ch); \
3461 } \
3462 }
3463
3464 if (strcmp (name, "alnum") == 0)
3465 BUILD_CHARCLASS_LOOP (isalnum)
3466 else if (strcmp (name, "cntrl") == 0)
3467 BUILD_CHARCLASS_LOOP (iscntrl)
3468 else if (strcmp (name, "lower") == 0)
3469 BUILD_CHARCLASS_LOOP (islower)
3470 else if (strcmp (name, "space") == 0)
3471 BUILD_CHARCLASS_LOOP (isspace)
3472 else if (strcmp (name, "alpha") == 0)
3473 BUILD_CHARCLASS_LOOP (isalpha)
3474 else if (strcmp (name, "digit") == 0)
3475 BUILD_CHARCLASS_LOOP (isdigit)
3476 else if (strcmp (name, "print") == 0)
3477 BUILD_CHARCLASS_LOOP (isprint)
3478 else if (strcmp (name, "upper") == 0)
3479 BUILD_CHARCLASS_LOOP (isupper)
3480 else if (strcmp (name, "blank") == 0)
3481 BUILD_CHARCLASS_LOOP (isblank)
3482 else if (strcmp (name, "graph") == 0)
3483 BUILD_CHARCLASS_LOOP (isgraph)
3484 else if (strcmp (name, "punct") == 0)
3485 BUILD_CHARCLASS_LOOP (ispunct)
3486 else if (strcmp (name, "xdigit") == 0)
3487 BUILD_CHARCLASS_LOOP (isxdigit)
3488 else
3489 return REG_ECTYPE;
3490
3491 return REG_NOERROR;
3492 }
3493
3494 static bin_tree_t *
build_charclass_op(re_dfa_t * dfa,unsigned REG_TRANSLATE_TYPE trans,const unsigned char * class_name,const unsigned char * extra,bool non_match,reg_errcode_t * err)3495 build_charclass_op (re_dfa_t *dfa, unsigned REG_TRANSLATE_TYPE trans,
3496 const unsigned char *class_name,
3497 const unsigned char *extra,
3498 bool non_match, reg_errcode_t *err)
3499 {
3500 re_bitset_ptr_t sbcset;
3501 #ifdef RE_ENABLE_I18N
3502 re_charset_t *mbcset;
3503 Idx alloc = 0;
3504 #endif /* not RE_ENABLE_I18N */
3505 reg_errcode_t ret;
3506 re_token_t br_token;
3507 bin_tree_t *tree;
3508
3509 sbcset = re_calloc (bitset_word, BITSET_WORDS);
3510 #ifdef RE_ENABLE_I18N
3511 mbcset = re_calloc (re_charset_t, 1);
3512 #endif /* RE_ENABLE_I18N */
3513
3514 #ifdef RE_ENABLE_I18N
3515 if (BE (sbcset == NULL || mbcset == NULL, 0))
3516 #else /* not RE_ENABLE_I18N */
3517 if (BE (sbcset == NULL, 0))
3518 #endif /* not RE_ENABLE_I18N */
3519 {
3520 *err = REG_ESPACE;
3521 return NULL;
3522 }
3523
3524 if (non_match)
3525 {
3526 #ifdef RE_ENABLE_I18N
3527 /*
3528 if (syntax & REG_HAT_LISTS_NOT_NEWLINE)
3529 bitset_set(cset->sbcset, '\0');
3530 */
3531 mbcset->non_match = 1;
3532 #endif /* not RE_ENABLE_I18N */
3533 }
3534
3535 /* We don't care the syntax in this case. */
3536 ret = build_charclass (trans, sbcset,
3537 #ifdef RE_ENABLE_I18N
3538 mbcset, &alloc,
3539 #endif /* RE_ENABLE_I18N */
3540 class_name, 0);
3541
3542 if (BE (ret != REG_NOERROR, 0))
3543 {
3544 re_free (sbcset);
3545 #ifdef RE_ENABLE_I18N
3546 free_charset (mbcset);
3547 #endif /* RE_ENABLE_I18N */
3548 *err = ret;
3549 return NULL;
3550 }
3551 /* \w match '_' also. */
3552 for (; *extra; extra++)
3553 bitset_set (sbcset, *extra);
3554
3555 /* If it is non-matching list. */
3556 if (non_match)
3557 bitset_not (sbcset);
3558
3559 #ifdef RE_ENABLE_I18N
3560 /* Ensure only single byte characters are set. */
3561 if (dfa->mb_cur_max > 1)
3562 bitset_mask (sbcset, dfa->sb_char);
3563 #endif
3564
3565 /* Build a tree for simple bracket. */
3566 br_token.type = SIMPLE_BRACKET;
3567 br_token.opr.sbcset = sbcset;
3568 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3569 if (BE (tree == NULL, 0))
3570 goto build_word_op_espace;
3571
3572 #ifdef RE_ENABLE_I18N
3573 if (dfa->mb_cur_max > 1)
3574 {
3575 bin_tree_t *mbc_tree;
3576 /* Build a tree for complex bracket. */
3577 br_token.type = COMPLEX_BRACKET;
3578 br_token.opr.mbcset = mbcset;
3579 dfa->has_mb_node = 1;
3580 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3581 if (BE (mbc_tree == NULL, 0))
3582 goto build_word_op_espace;
3583 /* Then join them by ALT node. */
3584 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3585 if (BE (mbc_tree != NULL, 1))
3586 return tree;
3587 }
3588 else
3589 {
3590 free_charset (mbcset);
3591 return tree;
3592 }
3593 #else /* not RE_ENABLE_I18N */
3594 return tree;
3595 #endif /* not RE_ENABLE_I18N */
3596
3597 build_word_op_espace:
3598 re_free (sbcset);
3599 #ifdef RE_ENABLE_I18N
3600 free_charset (mbcset);
3601 #endif /* RE_ENABLE_I18N */
3602 *err = REG_ESPACE;
3603 return NULL;
3604 }
3605
3606 /* This is intended for the expressions like "a{1,3}".
3607 Fetch a number from `input', and return the number.
3608 Return REG_MISSING if the number field is empty like "{,1}".
3609 Return REG_ERROR if an error occurred. */
3610
3611 static Idx
fetch_number(re_string_t * input,re_token_t * token,reg_syntax_t syntax)3612 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3613 {
3614 Idx num = REG_MISSING;
3615 unsigned char c;
3616 while (1)
3617 {
3618 fetch_token (token, input, syntax);
3619 c = token->opr.c;
3620 if (BE (token->type == END_OF_RE, 0))
3621 return REG_ERROR;
3622 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3623 break;
3624 num = ((token->type != CHARACTER || c < '0' || '9' < c
3625 || num == REG_ERROR)
3626 ? REG_ERROR
3627 : ((num == REG_MISSING) ? c - '0' : num * 10 + c - '0'));
3628 num = (num > REG_DUP_MAX) ? REG_ERROR : num;
3629 }
3630 return num;
3631 }
3632
3633 #ifdef RE_ENABLE_I18N
3634 static void
free_charset(re_charset_t * cset)3635 free_charset (re_charset_t *cset)
3636 {
3637 re_free (cset->mbchars);
3638 # ifdef _LIBC
3639 re_free (cset->coll_syms);
3640 re_free (cset->equiv_classes);
3641 re_free (cset->range_starts);
3642 re_free (cset->range_ends);
3643 # endif
3644 re_free (cset->char_classes);
3645 re_free (cset);
3646 }
3647 #endif /* RE_ENABLE_I18N */
3648
3649 /* Functions for binary tree operation. */
3650
3651 /* Create a tree node. */
3652
3653 static bin_tree_t *
create_tree(re_dfa_t * dfa,bin_tree_t * left,bin_tree_t * right,re_token_type_t type)3654 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3655 re_token_type_t type)
3656 {
3657 re_token_t t;
3658 t.type = type;
3659 return create_token_tree (dfa, left, right, &t);
3660 }
3661
3662 static bin_tree_t *
create_token_tree(re_dfa_t * dfa,bin_tree_t * left,bin_tree_t * right,const re_token_t * token)3663 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3664 const re_token_t *token)
3665 {
3666 bin_tree_t *tree;
3667 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3668 {
3669 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3670
3671 if (storage == NULL)
3672 return NULL;
3673 storage->next = dfa->str_tree_storage;
3674 dfa->str_tree_storage = storage;
3675 dfa->str_tree_storage_idx = 0;
3676 }
3677 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3678
3679 tree->parent = NULL;
3680 tree->left = left;
3681 tree->right = right;
3682 tree->token = *token;
3683 tree->token.duplicated = 0;
3684 tree->token.opt_subexp = 0;
3685 tree->first = NULL;
3686 tree->next = NULL;
3687 tree->node_idx = REG_MISSING;
3688
3689 if (left != NULL)
3690 left->parent = tree;
3691 if (right != NULL)
3692 right->parent = tree;
3693 return tree;
3694 }
3695
3696 /* Mark the tree SRC as an optional subexpression.
3697 To be called from preorder or postorder. */
3698
3699 static reg_errcode_t
mark_opt_subexp(void * extra,bin_tree_t * node)3700 mark_opt_subexp (void *extra, bin_tree_t *node)
3701 {
3702 Idx idx = (Idx) (long) extra;
3703 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3704 node->token.opt_subexp = 1;
3705
3706 return REG_NOERROR;
3707 }
3708
3709 /* Free the allocated memory inside NODE. */
3710
3711 static void
free_token(re_token_t * node)3712 free_token (re_token_t *node)
3713 {
3714 #ifdef RE_ENABLE_I18N
3715 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3716 free_charset (node->opr.mbcset);
3717 else
3718 #endif /* RE_ENABLE_I18N */
3719 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3720 re_free (node->opr.sbcset);
3721 }
3722
3723 /* Worker function for tree walking. Free the allocated memory inside NODE
3724 and its children. */
3725
3726 static reg_errcode_t
free_tree(void * extra,bin_tree_t * node)3727 free_tree (void *extra, bin_tree_t *node)
3728 {
3729 free_token (&node->token);
3730 return REG_NOERROR;
3731 }
3732
3733
3734 /* Duplicate the node SRC, and return new node. This is a preorder
3735 visit similar to the one implemented by the generic visitor, but
3736 we need more infrastructure to maintain two parallel trees --- so,
3737 it's easier to duplicate. */
3738
3739 static bin_tree_t *
duplicate_tree(const bin_tree_t * root,re_dfa_t * dfa)3740 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3741 {
3742 const bin_tree_t *node;
3743 bin_tree_t *dup_root;
3744 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3745
3746 for (node = root; ; )
3747 {
3748 /* Create a new tree and link it back to the current parent. */
3749 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3750 if (*p_new == NULL)
3751 return NULL;
3752 (*p_new)->parent = dup_node;
3753 (*p_new)->token.duplicated = 1;
3754 dup_node = *p_new;
3755
3756 /* Go to the left node, or up and to the right. */
3757 if (node->left)
3758 {
3759 node = node->left;
3760 p_new = &dup_node->left;
3761 }
3762 else
3763 {
3764 const bin_tree_t *prev = NULL;
3765 while (node->right == prev || node->right == NULL)
3766 {
3767 prev = node;
3768 node = node->parent;
3769 dup_node = dup_node->parent;
3770 if (!node)
3771 return dup_root;
3772 }
3773 node = node->right;
3774 p_new = &dup_node->right;
3775 }
3776 }
3777 }
3778
3779 __RCSID("$MirOS: src/gnu/usr.bin/cvs/lib/regcomp.c,v 1.2 2011/07/28 15:54:33 tg Exp $");
3780