1 /* Extended regular expression matching and search library.
2    Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3    This file is part of the GNU C Library.
4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5 
6    This program is free software; you can redistribute it and/or modify
7    it under the terms of the GNU General Public License as published by
8    the Free Software Foundation; either version 2, or (at your option)
9    any later version.
10 
11    This program is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14    GNU General Public License for more details.
15 
16    You should have received a copy of the GNU General Public License along
17    with this program; if not, write to the Free Software Foundation,
18    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
19 
20 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
21 					  Idx length, reg_syntax_t syntax);
22 static void re_compile_fastmap_iter (regex_t *bufp,
23 				     const re_dfastate_t *init_state,
24 				     char *fastmap);
25 static reg_errcode_t init_dfa (re_dfa_t *dfa, Idx pat_len);
26 #ifdef RE_ENABLE_I18N
27 static void free_charset (re_charset_t *cset);
28 #endif /* RE_ENABLE_I18N */
29 static void free_workarea_compile (regex_t *preg);
30 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
31 #ifdef RE_ENABLE_I18N
32 static void optimize_utf8 (re_dfa_t *dfa);
33 #endif
34 static reg_errcode_t analyze (regex_t *preg);
35 static reg_errcode_t preorder (bin_tree_t *root,
36 			       reg_errcode_t (fn (void *, bin_tree_t *)),
37 			       void *extra);
38 static reg_errcode_t postorder (bin_tree_t *root,
39 				reg_errcode_t (fn (void *, bin_tree_t *)),
40 				void *extra);
41 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
42 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
43 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
44 				 bin_tree_t *node);
45 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
46 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
47 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
48 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
49 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
50 				   unsigned int constraint);
51 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
52 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
53 					 Idx node, bool root);
54 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
55 static Idx fetch_number (re_string_t *input, re_token_t *token,
56 			 reg_syntax_t syntax);
57 static int peek_token (re_token_t *token, re_string_t *input,
58 			reg_syntax_t syntax);
59 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
60 			  reg_syntax_t syntax, reg_errcode_t *err);
61 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
62 				  re_token_t *token, reg_syntax_t syntax,
63 				  Idx nest, reg_errcode_t *err);
64 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
65 				 re_token_t *token, reg_syntax_t syntax,
66 				 Idx nest, reg_errcode_t *err);
67 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
68 				     re_token_t *token, reg_syntax_t syntax,
69 				     Idx nest, reg_errcode_t *err);
70 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
71 				  re_token_t *token, reg_syntax_t syntax,
72 				  Idx nest, reg_errcode_t *err);
73 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
74 				 re_dfa_t *dfa, re_token_t *token,
75 				 reg_syntax_t syntax, reg_errcode_t *err);
76 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
77 				      re_token_t *token, reg_syntax_t syntax,
78 				      reg_errcode_t *err);
79 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
80 					    re_string_t *regexp,
81 					    re_token_t *token, int token_len,
82 					    re_dfa_t *dfa,
83 					    reg_syntax_t syntax,
84 					    bool accept_hyphen);
85 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
86 					  re_string_t *regexp,
87 					  re_token_t *token);
88 #ifdef RE_ENABLE_I18N
89 static reg_errcode_t build_equiv_class (bitset sbcset,
90 					re_charset_t *mbcset,
91 					Idx *equiv_class_alloc,
92 					const unsigned char *name);
93 static reg_errcode_t build_charclass (unsigned REG_TRANSLATE_TYPE trans,
94 				      bitset sbcset,
95 				      re_charset_t *mbcset,
96 				      Idx *char_class_alloc,
97 				      const unsigned char *class_name,
98 				      reg_syntax_t syntax);
99 #else  /* not RE_ENABLE_I18N */
100 static reg_errcode_t build_equiv_class (bitset sbcset,
101 					const unsigned char *name);
102 static reg_errcode_t build_charclass (unsigned REG_TRANSLATE_TYPE trans,
103 				      bitset sbcset,
104 				      const unsigned char *class_name,
105 				      reg_syntax_t syntax);
106 #endif /* not RE_ENABLE_I18N */
107 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
108 				       unsigned REG_TRANSLATE_TYPE trans,
109 				       const unsigned char *class_name,
110 				       const unsigned char *extra,
111 				       bool non_match, reg_errcode_t *err);
112 static bin_tree_t *create_tree (re_dfa_t *dfa,
113 				bin_tree_t *left, bin_tree_t *right,
114 				re_token_type_t type);
115 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
116 				      bin_tree_t *left, bin_tree_t *right,
117 				      const re_token_t *token);
118 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
119 static void free_token (re_token_t *node);
120 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
121 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
122 
123 /* This table gives an error message for each of the error codes listed
124    in regex.h.  Obviously the order here has to be same as there.
125    POSIX doesn't require that we do anything for REG_NOERROR,
126    but why not be nice?  */
127 
128 const char __re_error_msgid[] attribute_hidden =
129   {
130 #define REG_NOERROR_IDX	0
131     gettext_noop ("Success")	/* REG_NOERROR */
132     "\0"
133 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
134     gettext_noop ("No match")	/* REG_NOMATCH */
135     "\0"
136 #define REG_BADPAT_IDX	(REG_NOMATCH_IDX + sizeof "No match")
137     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
138     "\0"
139 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
140     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
141     "\0"
142 #define REG_ECTYPE_IDX	(REG_ECOLLATE_IDX + sizeof "Invalid collation character")
143     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
144     "\0"
145 #define REG_EESCAPE_IDX	(REG_ECTYPE_IDX + sizeof "Invalid character class name")
146     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
147     "\0"
148 #define REG_ESUBREG_IDX	(REG_EESCAPE_IDX + sizeof "Trailing backslash")
149     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
150     "\0"
151 #define REG_EBRACK_IDX	(REG_ESUBREG_IDX + sizeof "Invalid back reference")
152     gettext_noop ("Unmatched [ or [^")	/* REG_EBRACK */
153     "\0"
154 #define REG_EPAREN_IDX	(REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
155     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
156     "\0"
157 #define REG_EBRACE_IDX	(REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
158     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
159     "\0"
160 #define REG_BADBR_IDX	(REG_EBRACE_IDX + sizeof "Unmatched \\{")
161     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
162     "\0"
163 #define REG_ERANGE_IDX	(REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
164     gettext_noop ("Invalid range end")	/* REG_ERANGE */
165     "\0"
166 #define REG_ESPACE_IDX	(REG_ERANGE_IDX + sizeof "Invalid range end")
167     gettext_noop ("Memory exhausted") /* REG_ESPACE */
168     "\0"
169 #define REG_BADRPT_IDX	(REG_ESPACE_IDX + sizeof "Memory exhausted")
170     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
171     "\0"
172 #define REG_EEND_IDX	(REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
173     gettext_noop ("Premature end of regular expression") /* REG_EEND */
174     "\0"
175 #define REG_ESIZE_IDX	(REG_EEND_IDX + sizeof "Premature end of regular expression")
176     gettext_noop ("Regular expression too big") /* REG_ESIZE */
177     "\0"
178 #define REG_ERPAREN_IDX	(REG_ESIZE_IDX + sizeof "Regular expression too big")
179     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
180   };
181 
182 const size_t __re_error_msgid_idx[] attribute_hidden =
183   {
184     REG_NOERROR_IDX,
185     REG_NOMATCH_IDX,
186     REG_BADPAT_IDX,
187     REG_ECOLLATE_IDX,
188     REG_ECTYPE_IDX,
189     REG_EESCAPE_IDX,
190     REG_ESUBREG_IDX,
191     REG_EBRACK_IDX,
192     REG_EPAREN_IDX,
193     REG_EBRACE_IDX,
194     REG_BADBR_IDX,
195     REG_ERANGE_IDX,
196     REG_ESPACE_IDX,
197     REG_BADRPT_IDX,
198     REG_EEND_IDX,
199     REG_ESIZE_IDX,
200     REG_ERPAREN_IDX
201   };
202 
203 /* Entry points for GNU code.  */
204 
205 /* re_compile_pattern is the GNU regular expression compiler: it
206    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
207    Returns 0 if the pattern was valid, otherwise an error string.
208 
209    Assumes the `re_allocated' (and perhaps `re_buffer') and `translate' fields
210    are set in BUFP on entry.  */
211 
212 const char *
re_compile_pattern(const char * pattern,size_t length,struct re_pattern_buffer * bufp)213 re_compile_pattern (const char *pattern, size_t length,
214 		    struct re_pattern_buffer *bufp)
215 {
216   reg_errcode_t ret;
217 
218   /* And GNU code determines whether or not to get register information
219      by passing null for the REGS argument to re_match, etc., not by
220      setting re_no_sub, unless REG_NO_SUB is set.  */
221   bufp->re_no_sub = !!(re_syntax_options & REG_NO_SUB);
222 
223   /* Match anchors at newline.  */
224   bufp->re_newline_anchor = 1;
225 
226   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
227 
228   if (!ret)
229     return NULL;
230   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
231 }
232 #ifdef _LIBC
weak_alias(__re_compile_pattern,re_compile_pattern)233 weak_alias (__re_compile_pattern, re_compile_pattern)
234 #endif
235 
236 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
237    also be assigned to arbitrarily: each pattern buffer stores its own
238    syntax, so it can be changed between regex compilations.  */
239 /* This has no initializer because initialized variables in Emacs
240    become read-only after dumping.  */
241 reg_syntax_t re_syntax_options;
242 
243 
244 /* Specify the precise syntax of regexps for compilation.  This provides
245    for compatibility for various utilities which historically have
246    different, incompatible syntaxes.
247 
248    The argument SYNTAX is a bit mask comprised of the various bits
249    defined in regex.h.  We return the old syntax.  */
250 
251 reg_syntax_t
252 re_set_syntax (reg_syntax_t syntax)
253 {
254   reg_syntax_t ret = re_syntax_options;
255 
256   re_syntax_options = syntax;
257   return ret;
258 }
259 #ifdef _LIBC
weak_alias(__re_set_syntax,re_set_syntax)260 weak_alias (__re_set_syntax, re_set_syntax)
261 #endif
262 
263 int
264 re_compile_fastmap (struct re_pattern_buffer *bufp)
265 {
266   re_dfa_t *dfa = (re_dfa_t *) bufp->re_buffer;
267   char *fastmap = bufp->re_fastmap;
268 
269   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
270   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
271   if (dfa->init_state != dfa->init_state_word)
272     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
273   if (dfa->init_state != dfa->init_state_nl)
274     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
275   if (dfa->init_state != dfa->init_state_begbuf)
276     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
277   bufp->re_fastmap_accurate = 1;
278   return 0;
279 }
280 #ifdef _LIBC
weak_alias(__re_compile_fastmap,re_compile_fastmap)281 weak_alias (__re_compile_fastmap, re_compile_fastmap)
282 #endif
283 
284 static inline void
285 __attribute ((always_inline))
286 re_set_fastmap (char *fastmap, bool icase, int ch)
287 {
288   fastmap[ch] = 1;
289   if (icase)
290     fastmap[tolower (ch)] = 1;
291 }
292 
293 /* Helper function for re_compile_fastmap.
294    Compile fastmap for the initial_state INIT_STATE.  */
295 
296 static void
re_compile_fastmap_iter(regex_t * bufp,const re_dfastate_t * init_state,char * fastmap)297 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
298 			 char *fastmap)
299 {
300   re_dfa_t *dfa = (re_dfa_t *) bufp->re_buffer;
301   Idx node_cnt;
302   bool icase = (dfa->mb_cur_max == 1 && (bufp->re_syntax & REG_IGNORE_CASE));
303   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
304     {
305       Idx node = init_state->nodes.elems[node_cnt];
306       re_token_type_t type = dfa->nodes[node].type;
307 
308       if (type == CHARACTER)
309 	{
310 	  re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
311 #ifdef RE_ENABLE_I18N
312 	  if ((bufp->re_syntax & REG_IGNORE_CASE) && dfa->mb_cur_max > 1)
313 	    {
314 	      unsigned char buf[MB_LEN_MAX];
315 	      unsigned char *p;
316 	      wchar_t wc;
317 	      mbstate_t state;
318 
319 	      p = buf;
320 	      *p++ = dfa->nodes[node].opr.c;
321 	      while (++node < dfa->nodes_len
322 		     &&	dfa->nodes[node].type == CHARACTER
323 		     && dfa->nodes[node].mb_partial)
324 		*p++ = dfa->nodes[node].opr.c;
325 	      memset (&state, 0, sizeof (state));
326 	      if (mbrtowc (&wc, (const char *) buf, p - buf,
327 			   &state) == p - buf
328 		  && (__wcrtomb ((char *) buf, towlower (wc), &state)
329 		      != (size_t) -1))
330 		re_set_fastmap (fastmap, false, buf[0]);
331 	    }
332 #endif
333 	}
334       else if (type == SIMPLE_BRACKET)
335 	{
336 	  int i, j, ch;
337 	  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
338 	    for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
339 	      if (dfa->nodes[node].opr.sbcset[i] & ((bitset_word) 1 << j))
340 		re_set_fastmap (fastmap, icase, ch);
341 	}
342 #ifdef RE_ENABLE_I18N
343       else if (type == COMPLEX_BRACKET)
344 	{
345 	  Idx i;
346 	  re_charset_t *cset = dfa->nodes[node].opr.mbcset;
347 	  if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
348 	      || cset->nranges || cset->nchar_classes)
349 	    {
350 # ifdef _LIBC
351 	      if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
352 		{
353 		  /* In this case we want to catch the bytes which are
354 		     the first byte of any collation elements.
355 		     e.g. In da_DK, we want to catch 'a' since "aa"
356 			  is a valid collation element, and don't catch
357 			  'b' since 'b' is the only collation element
358 			  which starts from 'b'.  */
359 		  const int32_t *table = (const int32_t *)
360 		    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
361 		  for (i = 0; i < SBC_MAX; ++i)
362 		    if (table[i] < 0)
363 		      re_set_fastmap (fastmap, icase, i);
364 		}
365 # else
366 	      if (dfa->mb_cur_max > 1)
367 		for (i = 0; i < SBC_MAX; ++i)
368 		  if (__btowc (i) == WEOF)
369 		    re_set_fastmap (fastmap, icase, i);
370 # endif /* not _LIBC */
371 	    }
372 	  for (i = 0; i < cset->nmbchars; ++i)
373 	    {
374 	      char buf[256];
375 	      mbstate_t state;
376 	      memset (&state, '\0', sizeof (state));
377 	      if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
378 		re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
379 	      if ((bufp->re_syntax & REG_IGNORE_CASE) && dfa->mb_cur_max > 1)
380 		{
381 		  if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
382 		      != (size_t) -1)
383 		    re_set_fastmap (fastmap, false, *(unsigned char *) buf);
384 		}
385 	    }
386 	}
387 #endif /* RE_ENABLE_I18N */
388       else if (type == OP_PERIOD
389 #ifdef RE_ENABLE_I18N
390 	       || type == OP_UTF8_PERIOD
391 #endif /* RE_ENABLE_I18N */
392 	       || type == END_OF_RE)
393 	{
394 	  memset (fastmap, '\1', sizeof (char) * SBC_MAX);
395 	  if (type == END_OF_RE)
396 	    bufp->re_can_be_null = 1;
397 	  return;
398 	}
399     }
400 }
401 
402 /* Entry point for POSIX code.  */
403 /* regcomp takes a regular expression as a string and compiles it.
404 
405    PREG is a regex_t *.  We do not expect any fields to be initialized,
406    since POSIX says we shouldn't.  Thus, we set
407 
408      `re_buffer' to the compiled pattern;
409      `re_used' to the length of the compiled pattern;
410      `re_syntax' to REG_SYNTAX_POSIX_EXTENDED if the
411        REG_EXTENDED bit in CFLAGS is set; otherwise, to
412        REG_SYNTAX_POSIX_BASIC;
413      `re_newline_anchor' to REG_NEWLINE being set in CFLAGS;
414      `re_fastmap' to an allocated space for the fastmap;
415      `re_fastmap_accurate' to zero;
416      `re_nsub' to the number of subexpressions in PATTERN.
417 
418    PATTERN is the address of the pattern string.
419 
420    CFLAGS is a series of bits which affect compilation.
421 
422      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
423      use POSIX basic syntax.
424 
425      If REG_NEWLINE is set, then . and [^...] don't match newline.
426      Also, regexec will try a match beginning after every newline.
427 
428      If REG_ICASE is set, then we considers upper- and lowercase
429      versions of letters to be equivalent when matching.
430 
431      If REG_NOSUB is set, then when PREG is passed to regexec, that
432      routine will report only success or failure, and nothing about the
433      registers.
434 
435    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
436    the return codes and their meanings.)  */
437 
438 int
regcomp(regex_t * __restrict preg,const char * __restrict pattern,int cflags)439 regcomp (regex_t *__restrict preg, const char *__restrict pattern, int cflags)
440 {
441   reg_errcode_t ret;
442   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? REG_SYNTAX_POSIX_EXTENDED
443 			: REG_SYNTAX_POSIX_BASIC);
444 
445   preg->re_buffer = NULL;
446   preg->re_allocated = 0;
447   preg->re_used = 0;
448 
449   /* Try to allocate space for the fastmap.  */
450   preg->re_fastmap = re_malloc (char, SBC_MAX);
451   if (BE (preg->re_fastmap == NULL, 0))
452     return REG_ESPACE;
453 
454   syntax |= (cflags & REG_ICASE) ? REG_IGNORE_CASE : 0;
455 
456   /* If REG_NEWLINE is set, newlines are treated differently.  */
457   if (cflags & REG_NEWLINE)
458     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
459       syntax &= ~REG_DOT_NEWLINE;
460       syntax |= REG_HAT_LISTS_NOT_NEWLINE;
461       /* It also changes the matching behavior.  */
462       preg->re_newline_anchor = 1;
463     }
464   else
465     preg->re_newline_anchor = 0;
466   preg->re_no_sub = !!(cflags & REG_NOSUB);
467   preg->re_translate = NULL;
468 
469   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
470 
471   /* POSIX doesn't distinguish between an unmatched open-group and an
472      unmatched close-group: both are REG_EPAREN.  */
473   if (ret == REG_ERPAREN)
474     ret = REG_EPAREN;
475 
476   /* We have already checked preg->re_fastmap != NULL.  */
477   if (BE (ret == REG_NOERROR, 1))
478     /* Compute the fastmap now, since regexec cannot modify the pattern
479        buffer.  This function never fails in this implementation.  */
480     (void) re_compile_fastmap (preg);
481   else
482     {
483       /* Some error occurred while compiling the expression.  */
484       re_free (preg->re_fastmap);
485       preg->re_fastmap = NULL;
486     }
487 
488   return (int) ret;
489 }
490 #ifdef _LIBC
weak_alias(__regcomp,regcomp)491 weak_alias (__regcomp, regcomp)
492 #endif
493 
494 /* Returns a message corresponding to an error code, ERRCODE, returned
495    from either regcomp or regexec.   We don't use PREG here.  */
496 
497 size_t
498 regerror (int errcode, const regex_t *__restrict preg,
499 	  char *__restrict errbuf, size_t errbuf_size)
500 {
501   const char *msg;
502   size_t msg_size;
503 
504   if (BE (errcode < 0
505 	  || errcode >= (int) (sizeof (__re_error_msgid_idx)
506 			       / sizeof (__re_error_msgid_idx[0])), 0))
507     /* Only error codes returned by the rest of the code should be passed
508        to this routine.  If we are given anything else, or if other regex
509        code generates an invalid error code, then the program has a bug.
510        Dump core so we can fix it.  */
511     abort ();
512 
513   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
514 
515   msg_size = strlen (msg) + 1; /* Includes the null.  */
516 
517   if (BE (errbuf_size != 0, 1))
518     {
519       if (BE (msg_size > errbuf_size, 0))
520 	{
521 #if defined HAVE_MEMPCPY || defined _LIBC
522 	  *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
523 #else
524 	  memcpy (errbuf, msg, errbuf_size - 1);
525 	  errbuf[errbuf_size - 1] = 0;
526 #endif
527 	}
528       else
529 	memcpy (errbuf, msg, msg_size);
530     }
531 
532   return msg_size;
533 }
534 #ifdef _LIBC
535 weak_alias (__regerror, regerror)
536 #endif
537 
538 
539 #ifdef RE_ENABLE_I18N
540 /* This static array is used for the map to single-byte characters when
541    UTF-8 is used.  Otherwise we would allocate memory just to initialize
542    it the same all the time.  UTF-8 is the preferred encoding so this is
543    a worthwhile optimization.  */
544 static const bitset utf8_sb_map =
545 {
546   /* Set the first 128 bits.  */
547 # if 2 < BITSET_WORDS
548   BITSET_WORD_MAX,
549 # endif
550 # if 4 < BITSET_WORDS
551   BITSET_WORD_MAX,
552 # endif
553 # if 6 < BITSET_WORDS
554   BITSET_WORD_MAX,
555 # endif
556 # if 8 < BITSET_WORDS
557 #  error "Invalid BITSET_WORDS"
558 # endif
559   (BITSET_WORD_MAX
560    >> (SBC_MAX % BITSET_WORD_BITS == 0
561        ? 0
562        : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
563 };
564 #endif
565 
566 
567 static void
free_dfa_content(re_dfa_t * dfa)568 free_dfa_content (re_dfa_t *dfa)
569 {
570   Idx i, j;
571 
572   if (dfa->nodes)
573     for (i = 0; i < dfa->nodes_len; ++i)
574       free_token (dfa->nodes + i);
575   re_free (dfa->nexts);
576   for (i = 0; i < dfa->nodes_len; ++i)
577     {
578       if (dfa->eclosures != NULL)
579 	re_node_set_free (dfa->eclosures + i);
580       if (dfa->inveclosures != NULL)
581 	re_node_set_free (dfa->inveclosures + i);
582       if (dfa->edests != NULL)
583 	re_node_set_free (dfa->edests + i);
584     }
585   re_free (dfa->edests);
586   re_free (dfa->eclosures);
587   re_free (dfa->inveclosures);
588   re_free (dfa->nodes);
589 
590   if (dfa->state_table)
591     for (i = 0; i <= dfa->state_hash_mask; ++i)
592       {
593 	struct re_state_table_entry *entry = dfa->state_table + i;
594 	for (j = 0; j < entry->num; ++j)
595 	  {
596 	    re_dfastate_t *state = entry->array[j];
597 	    free_state (state);
598 	  }
599         re_free (entry->array);
600       }
601   re_free (dfa->state_table);
602 #ifdef RE_ENABLE_I18N
603   if (dfa->sb_char != utf8_sb_map)
604     re_free (dfa->sb_char);
605 #endif
606   re_free (dfa->subexp_map);
607 #ifdef DEBUG
608   re_free (dfa->re_str);
609 #endif
610 
611   re_free (dfa);
612 }
613 
614 
615 /* Free dynamically allocated space used by PREG.  */
616 
617 void
regfree(regex_t * preg)618 regfree (regex_t *preg)
619 {
620   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
621   if (BE (dfa != NULL, 1))
622     free_dfa_content (dfa);
623   preg->re_buffer = NULL;
624   preg->re_allocated = 0;
625 
626   re_free (preg->re_fastmap);
627   preg->re_fastmap = NULL;
628 
629   re_free (preg->re_translate);
630   preg->re_translate = NULL;
631 }
632 #ifdef _LIBC
633 weak_alias (__regfree, regfree)
634 #endif
635 
636 /* Entry points compatible with 4.2 BSD regex library.  We don't define
637    them unless specifically requested.  */
638 
639 #if defined _REGEX_RE_COMP || defined _LIBC
640 
641 /* BSD has one and only one pattern buffer.  */
642 static struct re_pattern_buffer re_comp_buf;
643 
644 char *
645 # ifdef _LIBC
646 /* Make these definitions weak in libc, so POSIX programs can redefine
647    these names if they don't use our functions, and still use
648    regcomp/regexec above without link errors.  */
649 weak_function
650 # endif
re_comp(const char * s)651 re_comp (const char *s)
652 {
653   reg_errcode_t ret;
654   char *fastmap;
655 
656   if (!s)
657     {
658       if (!re_comp_buf.re_buffer)
659 	return gettext ("No previous regular expression");
660       return 0;
661     }
662 
663   if (re_comp_buf.re_buffer)
664     {
665       fastmap = re_comp_buf.re_fastmap;
666       re_comp_buf.re_fastmap = NULL;
667       __regfree (&re_comp_buf);
668       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
669       re_comp_buf.re_fastmap = fastmap;
670     }
671 
672   if (re_comp_buf.re_fastmap == NULL)
673     {
674       re_comp_buf.re_fastmap = (char *) malloc (SBC_MAX);
675       if (re_comp_buf.re_fastmap == NULL)
676 	return (char *) gettext (__re_error_msgid
677 				 + __re_error_msgid_idx[(int) REG_ESPACE]);
678     }
679 
680   /* Since `re_exec' always passes NULL for the `regs' argument, we
681      don't need to initialize the pattern buffer fields which affect it.  */
682 
683   /* Match anchors at newlines.  */
684   re_comp_buf.re_newline_anchor = 1;
685 
686   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
687 
688   if (!ret)
689     return NULL;
690 
691   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
692   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
693 }
694 
695 #ifdef _LIBC
libc_freeres_fn(free_mem)696 libc_freeres_fn (free_mem)
697 {
698   __regfree (&re_comp_buf);
699 }
700 #endif
701 
702 #endif /* _REGEX_RE_COMP */
703 
704 /* Internal entry point.
705    Compile the regular expression PATTERN, whose length is LENGTH.
706    SYNTAX indicate regular expression's syntax.  */
707 
708 static reg_errcode_t
re_compile_internal(regex_t * preg,const char * pattern,Idx length,reg_syntax_t syntax)709 re_compile_internal (regex_t *preg, const char *pattern, Idx length,
710 		     reg_syntax_t syntax)
711 {
712   reg_errcode_t err = REG_NOERROR;
713   re_dfa_t *dfa;
714   re_string_t regexp;
715 
716   /* Initialize the pattern buffer.  */
717   preg->re_fastmap_accurate = 0;
718   preg->re_syntax = syntax;
719   preg->re_not_bol = preg->re_not_eol = 0;
720   preg->re_used = 0;
721   preg->re_nsub = 0;
722   preg->re_can_be_null = 0;
723   preg->re_regs_allocated = REG_UNALLOCATED;
724 
725   /* Initialize the dfa.  */
726   dfa = (re_dfa_t *) preg->re_buffer;
727   if (BE (preg->re_allocated < sizeof (re_dfa_t), 0))
728     {
729       /* If zero allocated, but buffer is non-null, try to realloc
730 	 enough space.  This loses if buffer's address is bogus, but
731 	 that is the user's responsibility.  If buffer is null this
732 	 is a simple allocation.  */
733       dfa = re_realloc (preg->re_buffer, re_dfa_t, 1);
734       if (dfa == NULL)
735 	return REG_ESPACE;
736       preg->re_allocated = sizeof (re_dfa_t);
737       preg->re_buffer = (unsigned char *) dfa;
738     }
739   preg->re_used = sizeof (re_dfa_t);
740 
741   __libc_lock_init (dfa->lock);
742 
743   err = init_dfa (dfa, length);
744   if (BE (err != REG_NOERROR, 0))
745     {
746       free_dfa_content (dfa);
747       preg->re_buffer = NULL;
748       preg->re_allocated = 0;
749       return err;
750     }
751 #ifdef DEBUG
752   dfa->re_str = re_malloc (char, length + 1);
753   strncpy (dfa->re_str, pattern, length + 1);
754 #endif
755 
756   err = re_string_construct (&regexp, pattern, length, preg->re_translate,
757 			     syntax & REG_IGNORE_CASE, dfa);
758   if (BE (err != REG_NOERROR, 0))
759     {
760     re_compile_internal_free_return:
761       free_workarea_compile (preg);
762       re_string_destruct (&regexp);
763       free_dfa_content (dfa);
764       preg->re_buffer = NULL;
765       preg->re_allocated = 0;
766       return err;
767     }
768 
769   /* Parse the regular expression, and build a structure tree.  */
770   preg->re_nsub = 0;
771   dfa->str_tree = parse (&regexp, preg, syntax, &err);
772   if (BE (dfa->str_tree == NULL, 0))
773     goto re_compile_internal_free_return;
774 
775   /* Analyze the tree and create the nfa.  */
776   err = analyze (preg);
777   if (BE (err != REG_NOERROR, 0))
778     goto re_compile_internal_free_return;
779 
780 #ifdef RE_ENABLE_I18N
781   /* If possible, do searching in single byte encoding to speed things up.  */
782   if (dfa->is_utf8 && !(syntax & REG_IGNORE_CASE) && preg->re_translate == NULL)
783     optimize_utf8 (dfa);
784 #endif
785 
786   /* Then create the initial state of the dfa.  */
787   err = create_initial_state (dfa);
788 
789   /* Release work areas.  */
790   free_workarea_compile (preg);
791   re_string_destruct (&regexp);
792 
793   if (BE (err != REG_NOERROR, 0))
794     {
795       free_dfa_content (dfa);
796       preg->re_buffer = NULL;
797       preg->re_allocated = 0;
798     }
799 
800   return err;
801 }
802 
803 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
804    as the initial length of some arrays.  */
805 
806 static reg_errcode_t
init_dfa(re_dfa_t * dfa,Idx pat_len)807 init_dfa (re_dfa_t *dfa, Idx pat_len)
808 {
809   __re_size_t table_size;
810 #ifndef _LIBC
811   char *codeset_name;
812 #endif
813 
814   memset (dfa, '\0', sizeof (re_dfa_t));
815 
816   /* Force allocation of str_tree_storage the first time.  */
817   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
818 
819   dfa->nodes_alloc = pat_len + 1;
820   dfa->nodes = re_xmalloc (re_token_t, dfa->nodes_alloc);
821 
822   /*  table_size = 2 ^ ceil(log pat_len) */
823   for (table_size = 1; table_size <= pat_len; table_size <<= 1)
824     if (0 < (Idx) -1 && table_size == 0)
825       return REG_ESPACE;
826 
827   dfa->state_table = re_calloc (struct re_state_table_entry, table_size);
828   dfa->state_hash_mask = table_size - 1;
829 
830   dfa->mb_cur_max = MB_CUR_MAX;
831 #ifdef _LIBC
832   if (dfa->mb_cur_max == 6
833       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
834     dfa->is_utf8 = 1;
835   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
836 		       != 0);
837 #else
838 # ifdef HAVE_LANGINFO_CODESET
839   codeset_name = nl_langinfo (CODESET);
840 # else
841   codeset_name = getenv ("LC_ALL");
842   if (codeset_name == NULL || codeset_name[0] == '\0')
843     codeset_name = getenv ("LC_CTYPE");
844   if (codeset_name == NULL || codeset_name[0] == '\0')
845     codeset_name = getenv ("LANG");
846   if (codeset_name == NULL)
847     codeset_name = "";
848   else if (strchr (codeset_name, '.') !=  NULL)
849     codeset_name = strchr (codeset_name, '.') + 1;
850 # endif
851 
852   if (strcasecmp (codeset_name, "UTF-8") == 0
853       || strcasecmp (codeset_name, "UTF8") == 0)
854     dfa->is_utf8 = 1;
855 
856   /* We check exhaustively in the loop below if this charset is a
857      superset of ASCII.  */
858   dfa->map_notascii = 0;
859 #endif
860 
861 #ifdef RE_ENABLE_I18N
862   if (dfa->mb_cur_max > 1)
863     {
864       if (dfa->is_utf8)
865 	dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
866       else
867 	{
868 	  int i, j, ch;
869 
870 	  dfa->sb_char = re_calloc (bitset_word, BITSET_WORDS);
871 	  if (BE (dfa->sb_char == NULL, 0))
872 	    return REG_ESPACE;
873 
874 	  /* Set the bits corresponding to single byte chars.  */
875 	  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
876 	    for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
877 	      {
878 		wint_t wch = __btowc (ch);
879 		if (wch != WEOF)
880 		  dfa->sb_char[i] |= (bitset_word) 1 << j;
881 # ifndef _LIBC
882 		if (isascii (ch) && wch != ch)
883 		  dfa->map_notascii = 1;
884 # endif
885 	      }
886 	}
887     }
888 #endif
889 
890   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
891     return REG_ESPACE;
892   return REG_NOERROR;
893 }
894 
895 /* Initialize WORD_CHAR table, which indicate which character is
896    "word".  In this case "word" means that it is the word construction
897    character used by some operators like "\<", "\>", etc.  */
898 
899 static void
init_word_char(re_dfa_t * dfa)900 init_word_char (re_dfa_t *dfa)
901 {
902   int i, j, ch;
903   dfa->word_ops_used = 1;
904   for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
905     for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
906       if (isalnum (ch) || ch == '_')
907 	dfa->word_char[i] |= (bitset_word) 1 << j;
908 }
909 
910 /* Free the work area which are only used while compiling.  */
911 
912 static void
free_workarea_compile(regex_t * preg)913 free_workarea_compile (regex_t *preg)
914 {
915   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
916   bin_tree_storage_t *storage, *next;
917   for (storage = dfa->str_tree_storage; storage; storage = next)
918     {
919       next = storage->next;
920       re_free (storage);
921     }
922   dfa->str_tree_storage = NULL;
923   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
924   dfa->str_tree = NULL;
925   re_free (dfa->org_indices);
926   dfa->org_indices = NULL;
927 }
928 
929 /* Create initial states for all contexts.  */
930 
931 static reg_errcode_t
create_initial_state(re_dfa_t * dfa)932 create_initial_state (re_dfa_t *dfa)
933 {
934   Idx first, i;
935   reg_errcode_t err;
936   re_node_set init_nodes;
937 
938   /* Initial states have the epsilon closure of the node which is
939      the first node of the regular expression.  */
940   first = dfa->str_tree->first->node_idx;
941   dfa->init_node = first;
942   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
943   if (BE (err != REG_NOERROR, 0))
944     return err;
945 
946   /* The back-references which are in initial states can epsilon transit,
947      since in this case all of the subexpressions can be null.
948      Then we add epsilon closures of the nodes which are the next nodes of
949      the back-references.  */
950   if (dfa->nbackref > 0)
951     for (i = 0; i < init_nodes.nelem; ++i)
952       {
953 	Idx node_idx = init_nodes.elems[i];
954 	re_token_type_t type = dfa->nodes[node_idx].type;
955 
956 	Idx clexp_idx;
957 	if (type != OP_BACK_REF)
958 	  continue;
959 	for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
960 	  {
961 	    re_token_t *clexp_node;
962 	    clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
963 	    if (clexp_node->type == OP_CLOSE_SUBEXP
964 		&& clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
965 	      break;
966 	  }
967 	if (clexp_idx == init_nodes.nelem)
968 	  continue;
969 
970 	if (type == OP_BACK_REF)
971 	  {
972 	    Idx dest_idx = dfa->edests[node_idx].elems[0];
973 	    if (!re_node_set_contains (&init_nodes, dest_idx))
974 	      {
975 		re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
976 		i = 0;
977 	      }
978 	  }
979       }
980 
981   /* It must be the first time to invoke acquire_state.  */
982   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
983   /* We don't check ERR here, since the initial state must not be NULL.  */
984   if (BE (dfa->init_state == NULL, 0))
985     return err;
986   if (dfa->init_state->has_constraint)
987     {
988       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
989 						       CONTEXT_WORD);
990       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
991 						     CONTEXT_NEWLINE);
992       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
993 							 &init_nodes,
994 							 CONTEXT_NEWLINE
995 							 | CONTEXT_BEGBUF);
996       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
997 	      || dfa->init_state_begbuf == NULL, 0))
998 	return err;
999     }
1000   else
1001     dfa->init_state_word = dfa->init_state_nl
1002       = dfa->init_state_begbuf = dfa->init_state;
1003 
1004   re_node_set_free (&init_nodes);
1005   return REG_NOERROR;
1006 }
1007 
1008 #ifdef RE_ENABLE_I18N
1009 /* If it is possible to do searching in single byte encoding instead of UTF-8
1010    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1011    DFA nodes where needed.  */
1012 
1013 static void
optimize_utf8(re_dfa_t * dfa)1014 optimize_utf8 (re_dfa_t *dfa)
1015 {
1016   Idx node;
1017   int i;
1018   bool mb_chars = false;
1019   bool has_period = false;
1020 
1021   for (node = 0; node < dfa->nodes_len; ++node)
1022     switch (dfa->nodes[node].type)
1023       {
1024       case CHARACTER:
1025 	if (dfa->nodes[node].opr.c >= 0x80)
1026 	  mb_chars = true;
1027 	break;
1028       case ANCHOR:
1029 	switch (dfa->nodes[node].opr.idx)
1030 	  {
1031 	  case LINE_FIRST:
1032 	  case LINE_LAST:
1033 	  case BUF_FIRST:
1034 	  case BUF_LAST:
1035 	    break;
1036 	  default:
1037 	    /* Word anchors etc. cannot be handled.  */
1038 	    return;
1039 	  }
1040 	break;
1041       case OP_PERIOD:
1042         has_period = true;
1043         break;
1044       case OP_BACK_REF:
1045       case OP_ALT:
1046       case END_OF_RE:
1047       case OP_DUP_ASTERISK:
1048       case OP_OPEN_SUBEXP:
1049       case OP_CLOSE_SUBEXP:
1050 	break;
1051       case COMPLEX_BRACKET:
1052 	return;
1053       case SIMPLE_BRACKET:
1054 	/* Just double check.  */
1055 	{
1056 	  int rshift =
1057 	    (SBC_MAX / 2 % BITSET_WORD_BITS == 0
1058 	     ? 0
1059 	     : BITSET_WORD_BITS - SBC_MAX / 2 % BITSET_WORD_BITS);
1060 	  for (i = SBC_MAX / 2 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1061 	    {
1062 	      if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1063 		return;
1064 	      rshift = 0;
1065 	    }
1066 	}
1067 	break;
1068       default:
1069 	abort ();
1070       }
1071 
1072   if (mb_chars || has_period)
1073     for (node = 0; node < dfa->nodes_len; ++node)
1074       {
1075 	if (dfa->nodes[node].type == CHARACTER
1076 	    && dfa->nodes[node].opr.c >= 0x80)
1077 	  dfa->nodes[node].mb_partial = 0;
1078 	else if (dfa->nodes[node].type == OP_PERIOD)
1079 	  dfa->nodes[node].type = OP_UTF8_PERIOD;
1080       }
1081 
1082   /* The search can be in single byte locale.  */
1083   dfa->mb_cur_max = 1;
1084   dfa->is_utf8 = 0;
1085   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1086 }
1087 #endif
1088 
1089 /* Analyze the structure tree, and calculate "first", "next", "edest",
1090    "eclosure", and "inveclosure".  */
1091 
1092 static reg_errcode_t
analyze(regex_t * preg)1093 analyze (regex_t *preg)
1094 {
1095   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
1096   reg_errcode_t ret;
1097 
1098   /* Allocate arrays.  */
1099   dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1100   dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1101   dfa->edests = re_xmalloc (re_node_set, dfa->nodes_alloc);
1102   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1103   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1104 	  || dfa->eclosures == NULL, 0))
1105     return REG_ESPACE;
1106 
1107   dfa->subexp_map = re_xmalloc (Idx, preg->re_nsub);
1108   if (dfa->subexp_map != NULL)
1109     {
1110       Idx i;
1111       for (i = 0; i < preg->re_nsub; i++)
1112 	dfa->subexp_map[i] = i;
1113       preorder (dfa->str_tree, optimize_subexps, dfa);
1114       for (i = 0; i < preg->re_nsub; i++)
1115 	if (dfa->subexp_map[i] != i)
1116 	  break;
1117       if (i == preg->re_nsub)
1118 	{
1119 	  free (dfa->subexp_map);
1120 	  dfa->subexp_map = NULL;
1121 	}
1122     }
1123 
1124   ret = postorder (dfa->str_tree, lower_subexps, preg);
1125   if (BE (ret != REG_NOERROR, 0))
1126     return ret;
1127   ret = postorder (dfa->str_tree, calc_first, dfa);
1128   if (BE (ret != REG_NOERROR, 0))
1129     return ret;
1130   preorder (dfa->str_tree, calc_next, dfa);
1131   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1132   if (BE (ret != REG_NOERROR, 0))
1133     return ret;
1134   ret = calc_eclosure (dfa);
1135   if (BE (ret != REG_NOERROR, 0))
1136     return ret;
1137 
1138   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1139      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1140   if ((!preg->re_no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1141       || dfa->nbackref)
1142     {
1143       dfa->inveclosures = re_xmalloc (re_node_set, dfa->nodes_len);
1144       if (BE (dfa->inveclosures == NULL, 0))
1145         return REG_ESPACE;
1146       ret = calc_inveclosure (dfa);
1147     }
1148 
1149   return ret;
1150 }
1151 
1152 /* Our parse trees are very unbalanced, so we cannot use a stack to
1153    implement parse tree visits.  Instead, we use parent pointers and
1154    some hairy code in these two functions.  */
1155 static reg_errcode_t
postorder(bin_tree_t * root,reg_errcode_t (fn (void *,bin_tree_t *)),void * extra)1156 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1157 	   void *extra)
1158 {
1159   bin_tree_t *node, *prev;
1160 
1161   for (node = root; ; )
1162     {
1163       /* Descend down the tree, preferably to the left (or to the right
1164 	 if that's the only child).  */
1165       while (node->left || node->right)
1166 	if (node->left)
1167           node = node->left;
1168         else
1169           node = node->right;
1170 
1171       do
1172 	{
1173 	  reg_errcode_t err = fn (extra, node);
1174 	  if (BE (err != REG_NOERROR, 0))
1175 	    return err;
1176           if (node->parent == NULL)
1177 	    return REG_NOERROR;
1178 	  prev = node;
1179 	  node = node->parent;
1180 	}
1181       /* Go up while we have a node that is reached from the right.  */
1182       while (node->right == prev || node->right == NULL);
1183       node = node->right;
1184     }
1185 }
1186 
1187 static reg_errcode_t
preorder(bin_tree_t * root,reg_errcode_t (fn (void *,bin_tree_t *)),void * extra)1188 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1189 	  void *extra)
1190 {
1191   bin_tree_t *node;
1192 
1193   for (node = root; ; )
1194     {
1195       reg_errcode_t err = fn (extra, node);
1196       if (BE (err != REG_NOERROR, 0))
1197 	return err;
1198 
1199       /* Go to the left node, or up and to the right.  */
1200       if (node->left)
1201 	node = node->left;
1202       else
1203 	{
1204 	  bin_tree_t *prev = NULL;
1205 	  while (node->right == prev || node->right == NULL)
1206 	    {
1207 	      prev = node;
1208 	      node = node->parent;
1209 	      if (!node)
1210 	        return REG_NOERROR;
1211 	    }
1212 	  node = node->right;
1213 	}
1214     }
1215 }
1216 
1217 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1218    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1219    backreferences as well.  Requires a preorder visit.  */
1220 static reg_errcode_t
optimize_subexps(void * extra,bin_tree_t * node)1221 optimize_subexps (void *extra, bin_tree_t *node)
1222 {
1223   re_dfa_t *dfa = (re_dfa_t *) extra;
1224 
1225   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1226     {
1227       int idx = node->token.opr.idx;
1228       node->token.opr.idx = dfa->subexp_map[idx];
1229       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1230     }
1231 
1232   else if (node->token.type == SUBEXP
1233            && node->left && node->left->token.type == SUBEXP)
1234     {
1235       Idx other_idx = node->left->token.opr.idx;
1236 
1237       node->left = node->left->left;
1238       if (node->left)
1239         node->left->parent = node;
1240 
1241       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1242       if (other_idx < BITSET_WORD_BITS)
1243 	dfa->used_bkref_map &= ~ ((bitset_word) 1 << other_idx);
1244     }
1245 
1246   return REG_NOERROR;
1247 }
1248 
1249 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1250    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1251 static reg_errcode_t
lower_subexps(void * extra,bin_tree_t * node)1252 lower_subexps (void *extra, bin_tree_t *node)
1253 {
1254   regex_t *preg = (regex_t *) extra;
1255   reg_errcode_t err = REG_NOERROR;
1256 
1257   if (node->left && node->left->token.type == SUBEXP)
1258     {
1259       node->left = lower_subexp (&err, preg, node->left);
1260       if (node->left)
1261 	node->left->parent = node;
1262     }
1263   if (node->right && node->right->token.type == SUBEXP)
1264     {
1265       node->right = lower_subexp (&err, preg, node->right);
1266       if (node->right)
1267 	node->right->parent = node;
1268     }
1269 
1270   return err;
1271 }
1272 
1273 static bin_tree_t *
lower_subexp(reg_errcode_t * err,regex_t * preg,bin_tree_t * node)1274 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1275 {
1276   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
1277   bin_tree_t *body = node->left;
1278   bin_tree_t *op, *cls, *tree1, *tree;
1279 
1280   if (preg->re_no_sub
1281       /* We do not optimize empty subexpressions, because otherwise we may
1282 	 have bad CONCAT nodes with NULL children.  This is obviously not
1283 	 very common, so we do not lose much.  An example that triggers
1284 	 this case is the sed "script" /\(\)/x.  */
1285       && node->left != NULL
1286       && ! (node->token.opr.idx < BITSET_WORD_BITS
1287 	    && dfa->used_bkref_map & ((bitset_word) 1 << node->token.opr.idx)))
1288     return node->left;
1289 
1290   /* Convert the SUBEXP node to the concatenation of an
1291      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1292   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1293   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1294   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1295   tree = create_tree (dfa, op, tree1, CONCAT);
1296   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1297     {
1298       *err = REG_ESPACE;
1299       return NULL;
1300     }
1301 
1302   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1303   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1304   return tree;
1305 }
1306 
1307 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1308    nodes.  Requires a postorder visit.  */
1309 static reg_errcode_t
calc_first(void * extra,bin_tree_t * node)1310 calc_first (void *extra, bin_tree_t *node)
1311 {
1312   re_dfa_t *dfa = (re_dfa_t *) extra;
1313   if (node->token.type == CONCAT)
1314     {
1315       node->first = node->left->first;
1316       node->node_idx = node->left->node_idx;
1317     }
1318   else
1319     {
1320       node->first = node;
1321       node->node_idx = re_dfa_add_node (dfa, node->token);
1322       if (BE (node->node_idx == REG_MISSING, 0))
1323         return REG_ESPACE;
1324     }
1325   return REG_NOERROR;
1326 }
1327 
1328 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1329 static reg_errcode_t
calc_next(void * extra,bin_tree_t * node)1330 calc_next (void *extra, bin_tree_t *node)
1331 {
1332   switch (node->token.type)
1333     {
1334     case OP_DUP_ASTERISK:
1335       node->left->next = node;
1336       break;
1337     case CONCAT:
1338       node->left->next = node->right->first;
1339       node->right->next = node->next;
1340       break;
1341     default:
1342       if (node->left)
1343 	node->left->next = node->next;
1344       if (node->right)
1345         node->right->next = node->next;
1346       break;
1347     }
1348   return REG_NOERROR;
1349 }
1350 
1351 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1352 static reg_errcode_t
link_nfa_nodes(void * extra,bin_tree_t * node)1353 link_nfa_nodes (void *extra, bin_tree_t *node)
1354 {
1355   re_dfa_t *dfa = (re_dfa_t *) extra;
1356   Idx idx = node->node_idx;
1357   reg_errcode_t err = REG_NOERROR;
1358 
1359   switch (node->token.type)
1360     {
1361     case CONCAT:
1362       break;
1363 
1364     case END_OF_RE:
1365       assert (node->next == NULL);
1366       break;
1367 
1368     case OP_DUP_ASTERISK:
1369     case OP_ALT:
1370       {
1371 	Idx left, right;
1372 	dfa->has_plural_match = 1;
1373 	if (node->left != NULL)
1374 	  left = node->left->first->node_idx;
1375 	else
1376 	  left = node->next->node_idx;
1377 	if (node->right != NULL)
1378 	  right = node->right->first->node_idx;
1379 	else
1380 	  right = node->next->node_idx;
1381 	assert (REG_VALID_INDEX (left));
1382 	assert (REG_VALID_INDEX (right));
1383 	err = re_node_set_init_2 (dfa->edests + idx, left, right);
1384       }
1385       break;
1386 
1387     case ANCHOR:
1388     case OP_OPEN_SUBEXP:
1389     case OP_CLOSE_SUBEXP:
1390       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1391       break;
1392 
1393     case OP_BACK_REF:
1394       dfa->nexts[idx] = node->next->node_idx;
1395       if (node->token.type == OP_BACK_REF)
1396 	re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1397       break;
1398 
1399     default:
1400       assert (!IS_EPSILON_NODE (node->token.type));
1401       dfa->nexts[idx] = node->next->node_idx;
1402       break;
1403     }
1404 
1405   return err;
1406 }
1407 
1408 /* Duplicate the epsilon closure of the node ROOT_NODE.
1409    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1410    to their own constraint.  */
1411 
1412 static reg_errcode_t
duplicate_node_closure(re_dfa_t * dfa,Idx top_org_node,Idx top_clone_node,Idx root_node,unsigned int init_constraint)1413 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node,
1414 			Idx top_clone_node, Idx root_node,
1415 			unsigned int init_constraint)
1416 {
1417   Idx org_node, clone_node;
1418   bool ok;
1419   unsigned int constraint = init_constraint;
1420   for (org_node = top_org_node, clone_node = top_clone_node;;)
1421     {
1422       Idx org_dest, clone_dest;
1423       if (dfa->nodes[org_node].type == OP_BACK_REF)
1424 	{
1425 	  /* If the back reference epsilon-transit, its destination must
1426 	     also have the constraint.  Then duplicate the epsilon closure
1427 	     of the destination of the back reference, and store it in
1428 	     edests of the back reference.  */
1429 	  org_dest = dfa->nexts[org_node];
1430 	  re_node_set_empty (dfa->edests + clone_node);
1431 	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1432 	  if (BE (clone_dest == REG_MISSING, 0))
1433 	    return REG_ESPACE;
1434 	  dfa->nexts[clone_node] = dfa->nexts[org_node];
1435 	  ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1436 	  if (BE (! ok, 0))
1437 	    return REG_ESPACE;
1438 	}
1439       else if (dfa->edests[org_node].nelem == 0)
1440 	{
1441 	  /* In case of the node can't epsilon-transit, don't duplicate the
1442 	     destination and store the original destination as the
1443 	     destination of the node.  */
1444 	  dfa->nexts[clone_node] = dfa->nexts[org_node];
1445 	  break;
1446 	}
1447       else if (dfa->edests[org_node].nelem == 1)
1448 	{
1449 	  /* In case of the node can epsilon-transit, and it has only one
1450 	     destination.  */
1451 	  org_dest = dfa->edests[org_node].elems[0];
1452 	  re_node_set_empty (dfa->edests + clone_node);
1453 	  if (dfa->nodes[org_node].type == ANCHOR)
1454 	    {
1455 	      /* In case of the node has another constraint, append it.  */
1456 	      if (org_node == root_node && clone_node != org_node)
1457 		{
1458 		  /* ...but if the node is root_node itself, it means the
1459 		     epsilon closure have a loop, then tie it to the
1460 		     destination of the root_node.  */
1461 		  ok = re_node_set_insert (dfa->edests + clone_node,
1462 					    org_dest);
1463 		  if (BE (! ok, 0))
1464 		    return REG_ESPACE;
1465 		  break;
1466 		}
1467 	      constraint |= dfa->nodes[org_node].opr.ctx_type;
1468 	    }
1469 	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1470 	  if (BE (clone_dest == REG_MISSING, 0))
1471 	    return REG_ESPACE;
1472 	  ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1473 	  if (BE (! ok, 0))
1474 	    return REG_ESPACE;
1475 	}
1476       else /* dfa->edests[org_node].nelem == 2 */
1477 	{
1478 	  /* In case of the node can epsilon-transit, and it has two
1479 	     destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1480 	  org_dest = dfa->edests[org_node].elems[0];
1481 	  re_node_set_empty (dfa->edests + clone_node);
1482 	  /* Search for a duplicated node which satisfies the constraint.  */
1483 	  clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1484 	  if (clone_dest == REG_MISSING)
1485 	    {
1486 	      /* There are no such a duplicated node, create a new one.  */
1487 	      reg_errcode_t err;
1488 	      clone_dest = duplicate_node (dfa, org_dest, constraint);
1489 	      if (BE (clone_dest == REG_MISSING, 0))
1490 		return REG_ESPACE;
1491 	      ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1492 	      if (BE (! ok, 0))
1493 		return REG_ESPACE;
1494 	      err = duplicate_node_closure (dfa, org_dest, clone_dest,
1495 					    root_node, constraint);
1496 	      if (BE (err != REG_NOERROR, 0))
1497 		return err;
1498 	    }
1499 	  else
1500 	    {
1501 	      /* There are a duplicated node which satisfy the constraint,
1502 		 use it to avoid infinite loop.  */
1503 	      ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1504 	      if (BE (! ok, 0))
1505 		return REG_ESPACE;
1506 	    }
1507 
1508 	  org_dest = dfa->edests[org_node].elems[1];
1509 	  clone_dest = duplicate_node (dfa, org_dest, constraint);
1510 	  if (BE (clone_dest == REG_MISSING, 0))
1511 	    return REG_ESPACE;
1512 	  ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1513 	  if (BE (! ok, 0))
1514 	    return REG_ESPACE;
1515 	}
1516       org_node = org_dest;
1517       clone_node = clone_dest;
1518     }
1519   return REG_NOERROR;
1520 }
1521 
1522 /* Search for a node which is duplicated from the node ORG_NODE, and
1523    satisfies the constraint CONSTRAINT.  */
1524 
1525 static Idx
search_duplicated_node(const re_dfa_t * dfa,Idx org_node,unsigned int constraint)1526 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1527 			unsigned int constraint)
1528 {
1529   Idx idx;
1530   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1531     {
1532       if (org_node == dfa->org_indices[idx]
1533 	  && constraint == dfa->nodes[idx].constraint)
1534 	return idx; /* Found.  */
1535     }
1536   return REG_MISSING; /* Not found.  */
1537 }
1538 
1539 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1540    Return the index of the new node, or REG_MISSING if insufficient storage is
1541    available.  */
1542 
1543 static Idx
duplicate_node(re_dfa_t * dfa,Idx org_idx,unsigned int constraint)1544 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1545 {
1546   Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1547   if (BE (dup_idx != REG_MISSING, 1))
1548     {
1549       dfa->nodes[dup_idx].constraint = constraint;
1550       if (dfa->nodes[org_idx].type == ANCHOR)
1551 	dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1552       dfa->nodes[dup_idx].duplicated = 1;
1553 
1554       /* Store the index of the original node.  */
1555       dfa->org_indices[dup_idx] = org_idx;
1556     }
1557   return dup_idx;
1558 }
1559 
1560 static reg_errcode_t
calc_inveclosure(re_dfa_t * dfa)1561 calc_inveclosure (re_dfa_t *dfa)
1562 {
1563   Idx src, idx;
1564   bool ok;
1565   for (idx = 0; idx < dfa->nodes_len; ++idx)
1566     re_node_set_init_empty (dfa->inveclosures + idx);
1567 
1568   for (src = 0; src < dfa->nodes_len; ++src)
1569     {
1570       Idx *elems = dfa->eclosures[src].elems;
1571       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1572 	{
1573 	  ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1574 	  if (BE (! ok, 0))
1575 	    return REG_ESPACE;
1576 	}
1577     }
1578 
1579   return REG_NOERROR;
1580 }
1581 
1582 /* Calculate "eclosure" for all the node in DFA.  */
1583 
1584 static reg_errcode_t
calc_eclosure(re_dfa_t * dfa)1585 calc_eclosure (re_dfa_t *dfa)
1586 {
1587   Idx node_idx;
1588   bool incomplete;
1589 #ifdef DEBUG
1590   assert (dfa->nodes_len > 0);
1591 #endif
1592   incomplete = false;
1593   /* For each nodes, calculate epsilon closure.  */
1594   for (node_idx = 0; ; ++node_idx)
1595     {
1596       reg_errcode_t err;
1597       re_node_set eclosure_elem;
1598       if (node_idx == dfa->nodes_len)
1599 	{
1600 	  if (!incomplete)
1601 	    break;
1602 	  incomplete = false;
1603 	  node_idx = 0;
1604 	}
1605 
1606 #ifdef DEBUG
1607       assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1608 #endif
1609 
1610       /* If we have already calculated, skip it.  */
1611       if (dfa->eclosures[node_idx].nelem != 0)
1612 	continue;
1613       /* Calculate epsilon closure of `node_idx'.  */
1614       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1615       if (BE (err != REG_NOERROR, 0))
1616 	return err;
1617 
1618       if (dfa->eclosures[node_idx].nelem == 0)
1619 	{
1620 	  incomplete = true;
1621 	  re_node_set_free (&eclosure_elem);
1622 	}
1623     }
1624   return REG_NOERROR;
1625 }
1626 
1627 /* Calculate epsilon closure of NODE.  */
1628 
1629 static reg_errcode_t
calc_eclosure_iter(re_node_set * new_set,re_dfa_t * dfa,Idx node,bool root)1630 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1631 {
1632   reg_errcode_t err;
1633   unsigned int constraint;
1634   Idx i;
1635   bool incomplete;
1636   bool ok;
1637   re_node_set eclosure;
1638   incomplete = false;
1639   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1640   if (BE (err != REG_NOERROR, 0))
1641     return err;
1642 
1643   /* This indicates that we are calculating this node now.
1644      We reference this value to avoid infinite loop.  */
1645   dfa->eclosures[node].nelem = REG_MISSING;
1646 
1647   constraint = ((dfa->nodes[node].type == ANCHOR)
1648 		? dfa->nodes[node].opr.ctx_type : 0);
1649   /* If the current node has constraints, duplicate all nodes.
1650      Since they must inherit the constraints.  */
1651   if (constraint
1652       && dfa->edests[node].nelem
1653       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1654     {
1655       err = duplicate_node_closure (dfa, node, node, node, constraint);
1656       if (BE (err != REG_NOERROR, 0))
1657 	return err;
1658     }
1659 
1660   /* Expand each epsilon destination nodes.  */
1661   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1662     for (i = 0; i < dfa->edests[node].nelem; ++i)
1663       {
1664 	re_node_set eclosure_elem;
1665 	Idx edest = dfa->edests[node].elems[i];
1666 	/* If calculating the epsilon closure of `edest' is in progress,
1667 	   return intermediate result.  */
1668 	if (dfa->eclosures[edest].nelem == REG_MISSING)
1669 	  {
1670 	    incomplete = true;
1671 	    continue;
1672 	  }
1673 	/* If we haven't calculated the epsilon closure of `edest' yet,
1674 	   calculate now. Otherwise use calculated epsilon closure.  */
1675 	if (dfa->eclosures[edest].nelem == 0)
1676 	  {
1677 	    err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1678 	    if (BE (err != REG_NOERROR, 0))
1679 	      return err;
1680 	  }
1681 	else
1682 	  eclosure_elem = dfa->eclosures[edest];
1683 	/* Merge the epsilon closure of `edest'.  */
1684 	re_node_set_merge (&eclosure, &eclosure_elem);
1685 	/* If the epsilon closure of `edest' is incomplete,
1686 	   the epsilon closure of this node is also incomplete.  */
1687 	if (dfa->eclosures[edest].nelem == 0)
1688 	  {
1689 	    incomplete = true;
1690 	    re_node_set_free (&eclosure_elem);
1691 	  }
1692       }
1693 
1694   /* Epsilon closures include itself.  */
1695   ok = re_node_set_insert (&eclosure, node);
1696   if (BE (! ok, 0))
1697     return REG_ESPACE;
1698   if (incomplete && !root)
1699     dfa->eclosures[node].nelem = 0;
1700   else
1701     dfa->eclosures[node] = eclosure;
1702   *new_set = eclosure;
1703   return REG_NOERROR;
1704 }
1705 
1706 /* Functions for token which are used in the parser.  */
1707 
1708 /* Fetch a token from INPUT.
1709    We must not use this function inside bracket expressions.  */
1710 
1711 static void
fetch_token(re_token_t * result,re_string_t * input,reg_syntax_t syntax)1712 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1713 {
1714   re_string_skip_bytes (input, peek_token (result, input, syntax));
1715 }
1716 
1717 /* Peek a token from INPUT, and return the length of the token.
1718    We must not use this function inside bracket expressions.  */
1719 
1720 static int
peek_token(re_token_t * token,re_string_t * input,reg_syntax_t syntax)1721 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1722 {
1723   unsigned char c;
1724 
1725   if (re_string_eoi (input))
1726     {
1727       token->type = END_OF_RE;
1728       return 0;
1729     }
1730 
1731   c = re_string_peek_byte (input, 0);
1732   token->opr.c = c;
1733 
1734   token->word_char = 0;
1735 #ifdef RE_ENABLE_I18N
1736   token->mb_partial = 0;
1737   if (input->mb_cur_max > 1 &&
1738       !re_string_first_byte (input, re_string_cur_idx (input)))
1739     {
1740       token->type = CHARACTER;
1741       token->mb_partial = 1;
1742       return 1;
1743     }
1744 #endif
1745   if (c == '\\')
1746     {
1747       unsigned char c2;
1748       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1749 	{
1750 	  token->type = BACK_SLASH;
1751 	  return 1;
1752 	}
1753 
1754       c2 = re_string_peek_byte_case (input, 1);
1755       token->opr.c = c2;
1756       token->type = CHARACTER;
1757 #ifdef RE_ENABLE_I18N
1758       if (input->mb_cur_max > 1)
1759 	{
1760 	  wint_t wc = re_string_wchar_at (input,
1761 					  re_string_cur_idx (input) + 1);
1762 	  token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1763 	}
1764       else
1765 #endif
1766 	token->word_char = IS_WORD_CHAR (c2) != 0;
1767 
1768       switch (c2)
1769 	{
1770 	case '|':
1771 	  if (!(syntax & REG_LIMITED_OPS) && !(syntax & REG_NO_BK_VBAR))
1772 	    token->type = OP_ALT;
1773 	  break;
1774 	case '1': case '2': case '3': case '4': case '5':
1775 	case '6': case '7': case '8': case '9':
1776 	  if (!(syntax & REG_NO_BK_REFS))
1777 	    {
1778 	      token->type = OP_BACK_REF;
1779 	      token->opr.idx = c2 - '1';
1780 	    }
1781 	  break;
1782 	case '<':
1783 	  if (!(syntax & REG_NO_GNU_OPS))
1784 	    {
1785 	      token->type = ANCHOR;
1786 	      token->opr.ctx_type = WORD_FIRST;
1787 	    }
1788 	  break;
1789 	case '>':
1790 	  if (!(syntax & REG_NO_GNU_OPS))
1791 	    {
1792 	      token->type = ANCHOR;
1793 	      token->opr.ctx_type = WORD_LAST;
1794 	    }
1795 	  break;
1796 	case 'b':
1797 	  if (!(syntax & REG_NO_GNU_OPS))
1798 	    {
1799 	      token->type = ANCHOR;
1800 	      token->opr.ctx_type = WORD_DELIM;
1801 	    }
1802 	  break;
1803 	case 'B':
1804 	  if (!(syntax & REG_NO_GNU_OPS))
1805 	    {
1806 	      token->type = ANCHOR;
1807 	      token->opr.ctx_type = NOT_WORD_DELIM;
1808 	    }
1809 	  break;
1810 	case 'w':
1811 	  if (!(syntax & REG_NO_GNU_OPS))
1812 	    token->type = OP_WORD;
1813 	  break;
1814 	case 'W':
1815 	  if (!(syntax & REG_NO_GNU_OPS))
1816 	    token->type = OP_NOTWORD;
1817 	  break;
1818 	case 's':
1819 	  if (!(syntax & REG_NO_GNU_OPS))
1820 	    token->type = OP_SPACE;
1821 	  break;
1822 	case 'S':
1823 	  if (!(syntax & REG_NO_GNU_OPS))
1824 	    token->type = OP_NOTSPACE;
1825 	  break;
1826 	case '`':
1827 	  if (!(syntax & REG_NO_GNU_OPS))
1828 	    {
1829 	      token->type = ANCHOR;
1830 	      token->opr.ctx_type = BUF_FIRST;
1831 	    }
1832 	  break;
1833 	case '\'':
1834 	  if (!(syntax & REG_NO_GNU_OPS))
1835 	    {
1836 	      token->type = ANCHOR;
1837 	      token->opr.ctx_type = BUF_LAST;
1838 	    }
1839 	  break;
1840 	case '(':
1841 	  if (!(syntax & REG_NO_BK_PARENS))
1842 	    token->type = OP_OPEN_SUBEXP;
1843 	  break;
1844 	case ')':
1845 	  if (!(syntax & REG_NO_BK_PARENS))
1846 	    token->type = OP_CLOSE_SUBEXP;
1847 	  break;
1848 	case '+':
1849 	  if (!(syntax & REG_LIMITED_OPS) && (syntax & REG_BK_PLUS_QM))
1850 	    token->type = OP_DUP_PLUS;
1851 	  break;
1852 	case '?':
1853 	  if (!(syntax & REG_LIMITED_OPS) && (syntax & REG_BK_PLUS_QM))
1854 	    token->type = OP_DUP_QUESTION;
1855 	  break;
1856 	case '{':
1857 	  if ((syntax & REG_INTERVALS) && (!(syntax & REG_NO_BK_BRACES)))
1858 	    token->type = OP_OPEN_DUP_NUM;
1859 	  break;
1860 	case '}':
1861 	  if ((syntax & REG_INTERVALS) && (!(syntax & REG_NO_BK_BRACES)))
1862 	    token->type = OP_CLOSE_DUP_NUM;
1863 	  break;
1864 	default:
1865 	  break;
1866 	}
1867       return 2;
1868     }
1869 
1870   token->type = CHARACTER;
1871 #ifdef RE_ENABLE_I18N
1872   if (input->mb_cur_max > 1)
1873     {
1874       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1875       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1876     }
1877   else
1878 #endif
1879     token->word_char = IS_WORD_CHAR (token->opr.c);
1880 
1881   switch (c)
1882     {
1883     case '\n':
1884       if (syntax & REG_NEWLINE_ALT)
1885 	token->type = OP_ALT;
1886       break;
1887     case '|':
1888       if (!(syntax & REG_LIMITED_OPS) && (syntax & REG_NO_BK_VBAR))
1889 	token->type = OP_ALT;
1890       break;
1891     case '*':
1892       token->type = OP_DUP_ASTERISK;
1893       break;
1894     case '+':
1895       if (!(syntax & REG_LIMITED_OPS) && !(syntax & REG_BK_PLUS_QM))
1896 	token->type = OP_DUP_PLUS;
1897       break;
1898     case '?':
1899       if (!(syntax & REG_LIMITED_OPS) && !(syntax & REG_BK_PLUS_QM))
1900 	token->type = OP_DUP_QUESTION;
1901       break;
1902     case '{':
1903       if ((syntax & REG_INTERVALS) && (syntax & REG_NO_BK_BRACES))
1904 	token->type = OP_OPEN_DUP_NUM;
1905       break;
1906     case '}':
1907       if ((syntax & REG_INTERVALS) && (syntax & REG_NO_BK_BRACES))
1908 	token->type = OP_CLOSE_DUP_NUM;
1909       break;
1910     case '(':
1911       if (syntax & REG_NO_BK_PARENS)
1912 	token->type = OP_OPEN_SUBEXP;
1913       break;
1914     case ')':
1915       if (syntax & REG_NO_BK_PARENS)
1916 	token->type = OP_CLOSE_SUBEXP;
1917       break;
1918     case '[':
1919       token->type = OP_OPEN_BRACKET;
1920       break;
1921     case '.':
1922       token->type = OP_PERIOD;
1923       break;
1924     case '^':
1925       if (!(syntax & (REG_CONTEXT_INDEP_ANCHORS | REG_CARET_ANCHORS_HERE)) &&
1926 	  re_string_cur_idx (input) != 0)
1927 	{
1928 	  char prev = re_string_peek_byte (input, -1);
1929 	  if (!(syntax & REG_NEWLINE_ALT) || prev != '\n')
1930 	    break;
1931 	}
1932       token->type = ANCHOR;
1933       token->opr.ctx_type = LINE_FIRST;
1934       break;
1935     case '$':
1936       if (!(syntax & REG_CONTEXT_INDEP_ANCHORS) &&
1937 	  re_string_cur_idx (input) + 1 != re_string_length (input))
1938 	{
1939 	  re_token_t next;
1940 	  re_string_skip_bytes (input, 1);
1941 	  peek_token (&next, input, syntax);
1942 	  re_string_skip_bytes (input, -1);
1943 	  if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1944 	    break;
1945 	}
1946       token->type = ANCHOR;
1947       token->opr.ctx_type = LINE_LAST;
1948       break;
1949     default:
1950       break;
1951     }
1952   return 1;
1953 }
1954 
1955 /* Peek a token from INPUT, and return the length of the token.
1956    We must not use this function out of bracket expressions.  */
1957 
1958 static int
peek_token_bracket(re_token_t * token,re_string_t * input,reg_syntax_t syntax)1959 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1960 {
1961   unsigned char c;
1962   if (re_string_eoi (input))
1963     {
1964       token->type = END_OF_RE;
1965       return 0;
1966     }
1967   c = re_string_peek_byte (input, 0);
1968   token->opr.c = c;
1969 
1970 #ifdef RE_ENABLE_I18N
1971   if (input->mb_cur_max > 1 &&
1972       !re_string_first_byte (input, re_string_cur_idx (input)))
1973     {
1974       token->type = CHARACTER;
1975       return 1;
1976     }
1977 #endif /* RE_ENABLE_I18N */
1978 
1979   if (c == '\\' && (syntax & REG_BACKSLASH_ESCAPE_IN_LISTS)
1980       && re_string_cur_idx (input) + 1 < re_string_length (input))
1981     {
1982       /* In this case, '\' escape a character.  */
1983       unsigned char c2;
1984       re_string_skip_bytes (input, 1);
1985       c2 = re_string_peek_byte (input, 0);
1986       token->opr.c = c2;
1987       token->type = CHARACTER;
1988       return 1;
1989     }
1990   if (c == '[') /* '[' is a special char in a bracket exps.  */
1991     {
1992       unsigned char c2;
1993       int token_len;
1994       if (re_string_cur_idx (input) + 1 < re_string_length (input))
1995 	c2 = re_string_peek_byte (input, 1);
1996       else
1997 	c2 = 0;
1998       token->opr.c = c2;
1999       token_len = 2;
2000       switch (c2)
2001 	{
2002 	case '.':
2003 	  token->type = OP_OPEN_COLL_ELEM;
2004 	  break;
2005 	case '=':
2006 	  token->type = OP_OPEN_EQUIV_CLASS;
2007 	  break;
2008 	case ':':
2009 	  if (syntax & REG_CHAR_CLASSES)
2010 	    {
2011 	      token->type = OP_OPEN_CHAR_CLASS;
2012 	      break;
2013 	    }
2014 	  /* else fall through.  */
2015 	default:
2016 	  token->type = CHARACTER;
2017 	  token->opr.c = c;
2018 	  token_len = 1;
2019 	  break;
2020 	}
2021       return token_len;
2022     }
2023   switch (c)
2024     {
2025     case '-':
2026       token->type = OP_CHARSET_RANGE;
2027       break;
2028     case ']':
2029       token->type = OP_CLOSE_BRACKET;
2030       break;
2031     case '^':
2032       token->type = OP_NON_MATCH_LIST;
2033       break;
2034     default:
2035       token->type = CHARACTER;
2036     }
2037   return 1;
2038 }
2039 
2040 /* Functions for parser.  */
2041 
2042 /* Entry point of the parser.
2043    Parse the regular expression REGEXP and return the structure tree.
2044    If an error is occured, ERR is set by error code, and return NULL.
2045    This function build the following tree, from regular expression <reg_exp>:
2046 	   CAT
2047 	   / \
2048 	  /   \
2049    <reg_exp>  EOR
2050 
2051    CAT means concatenation.
2052    EOR means end of regular expression.  */
2053 
2054 static bin_tree_t *
parse(re_string_t * regexp,regex_t * preg,reg_syntax_t syntax,reg_errcode_t * err)2055 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2056        reg_errcode_t *err)
2057 {
2058   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2059   bin_tree_t *tree, *eor, *root;
2060   re_token_t current_token;
2061   dfa->syntax = syntax;
2062   fetch_token (&current_token, regexp, syntax | REG_CARET_ANCHORS_HERE);
2063   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2064   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2065     return NULL;
2066   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2067   if (tree != NULL)
2068     root = create_tree (dfa, tree, eor, CONCAT);
2069   else
2070     root = eor;
2071   if (BE (eor == NULL || root == NULL, 0))
2072     {
2073       *err = REG_ESPACE;
2074       return NULL;
2075     }
2076   return root;
2077 }
2078 
2079 /* This function build the following tree, from regular expression
2080    <branch1>|<branch2>:
2081 	   ALT
2082 	   / \
2083 	  /   \
2084    <branch1> <branch2>
2085 
2086    ALT means alternative, which represents the operator `|'.  */
2087 
2088 static bin_tree_t *
parse_reg_exp(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,Idx nest,reg_errcode_t * err)2089 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2090 	       reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2091 {
2092   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2093   bin_tree_t *tree, *branch = NULL;
2094   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2095   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2096     return NULL;
2097 
2098   while (token->type == OP_ALT)
2099     {
2100       fetch_token (token, regexp, syntax | REG_CARET_ANCHORS_HERE);
2101       if (token->type != OP_ALT && token->type != END_OF_RE
2102 	  && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2103 	{
2104 	  branch = parse_branch (regexp, preg, token, syntax, nest, err);
2105 	  if (BE (*err != REG_NOERROR && branch == NULL, 0))
2106 	    return NULL;
2107 	}
2108       else
2109 	branch = NULL;
2110       tree = create_tree (dfa, tree, branch, OP_ALT);
2111       if (BE (tree == NULL, 0))
2112 	{
2113 	  *err = REG_ESPACE;
2114 	  return NULL;
2115 	}
2116     }
2117   return tree;
2118 }
2119 
2120 /* This function build the following tree, from regular expression
2121    <exp1><exp2>:
2122 	CAT
2123 	/ \
2124        /   \
2125    <exp1> <exp2>
2126 
2127    CAT means concatenation.  */
2128 
2129 static bin_tree_t *
parse_branch(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,Idx nest,reg_errcode_t * err)2130 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2131 	      reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2132 {
2133   bin_tree_t *tree, *exp;
2134   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2135   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2136   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2137     return NULL;
2138 
2139   while (token->type != OP_ALT && token->type != END_OF_RE
2140 	 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2141     {
2142       exp = parse_expression (regexp, preg, token, syntax, nest, err);
2143       if (BE (*err != REG_NOERROR && exp == NULL, 0))
2144 	{
2145 	  return NULL;
2146 	}
2147       if (tree != NULL && exp != NULL)
2148 	{
2149 	  tree = create_tree (dfa, tree, exp, CONCAT);
2150 	  if (tree == NULL)
2151 	    {
2152 	      *err = REG_ESPACE;
2153 	      return NULL;
2154 	    }
2155 	}
2156       else if (tree == NULL)
2157 	tree = exp;
2158       /* Otherwise exp == NULL, we don't need to create new tree.  */
2159     }
2160   return tree;
2161 }
2162 
2163 /* This function build the following tree, from regular expression a*:
2164 	 *
2165 	 |
2166 	 a
2167 */
2168 
2169 static bin_tree_t *
parse_expression(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,Idx nest,reg_errcode_t * err)2170 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2171 		  reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2172 {
2173   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2174   bin_tree_t *tree;
2175   switch (token->type)
2176     {
2177     case CHARACTER:
2178       tree = create_token_tree (dfa, NULL, NULL, token);
2179       if (BE (tree == NULL, 0))
2180 	{
2181 	  *err = REG_ESPACE;
2182 	  return NULL;
2183 	}
2184 #ifdef RE_ENABLE_I18N
2185       if (dfa->mb_cur_max > 1)
2186 	{
2187 	  while (!re_string_eoi (regexp)
2188 		 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2189 	    {
2190 	      bin_tree_t *mbc_remain;
2191 	      fetch_token (token, regexp, syntax);
2192 	      mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2193 	      tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2194 	      if (BE (mbc_remain == NULL || tree == NULL, 0))
2195 		{
2196 		  *err = REG_ESPACE;
2197 		  return NULL;
2198 		}
2199 	    }
2200 	}
2201 #endif
2202       break;
2203     case OP_OPEN_SUBEXP:
2204       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2205       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2206 	return NULL;
2207       break;
2208     case OP_OPEN_BRACKET:
2209       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2210       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2211 	return NULL;
2212       break;
2213     case OP_BACK_REF:
2214       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2215 	{
2216 	  *err = REG_ESUBREG;
2217 	  return NULL;
2218 	}
2219       dfa->used_bkref_map |= 1 << token->opr.idx;
2220       tree = create_token_tree (dfa, NULL, NULL, token);
2221       if (BE (tree == NULL, 0))
2222 	{
2223 	  *err = REG_ESPACE;
2224 	  return NULL;
2225 	}
2226       ++dfa->nbackref;
2227       dfa->has_mb_node = 1;
2228       break;
2229     case OP_OPEN_DUP_NUM:
2230       if (syntax & REG_CONTEXT_INVALID_DUP)
2231 	{
2232 	  *err = REG_BADRPT;
2233 	  return NULL;
2234 	}
2235       /* FALLTHROUGH */
2236     case OP_DUP_ASTERISK:
2237     case OP_DUP_PLUS:
2238     case OP_DUP_QUESTION:
2239       if (syntax & REG_CONTEXT_INVALID_OPS)
2240 	{
2241 	  *err = REG_BADRPT;
2242 	  return NULL;
2243 	}
2244       else if (syntax & REG_CONTEXT_INDEP_OPS)
2245 	{
2246 	  fetch_token (token, regexp, syntax);
2247 	  return parse_expression (regexp, preg, token, syntax, nest, err);
2248 	}
2249       /* else fall through  */
2250     case OP_CLOSE_SUBEXP:
2251       if ((token->type == OP_CLOSE_SUBEXP) &&
2252 	  !(syntax & REG_UNMATCHED_RIGHT_PAREN_ORD))
2253 	{
2254 	  *err = REG_ERPAREN;
2255 	  return NULL;
2256 	}
2257       /* else fall through  */
2258     case OP_CLOSE_DUP_NUM:
2259       /* We treat it as a normal character.  */
2260 
2261       /* Then we can these characters as normal characters.  */
2262       token->type = CHARACTER;
2263       /* mb_partial and word_char bits should be initialized already
2264 	 by peek_token.  */
2265       tree = create_token_tree (dfa, NULL, NULL, token);
2266       if (BE (tree == NULL, 0))
2267 	{
2268 	  *err = REG_ESPACE;
2269 	  return NULL;
2270 	}
2271       break;
2272     case ANCHOR:
2273       if ((token->opr.ctx_type
2274 	   & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2275 	  && dfa->word_ops_used == 0)
2276 	init_word_char (dfa);
2277       if (token->opr.ctx_type == WORD_DELIM
2278           || token->opr.ctx_type == NOT_WORD_DELIM)
2279 	{
2280 	  bin_tree_t *tree_first, *tree_last;
2281 	  if (token->opr.ctx_type == WORD_DELIM)
2282 	    {
2283 	      token->opr.ctx_type = WORD_FIRST;
2284 	      tree_first = create_token_tree (dfa, NULL, NULL, token);
2285 	      token->opr.ctx_type = WORD_LAST;
2286             }
2287           else
2288             {
2289 	      token->opr.ctx_type = INSIDE_WORD;
2290 	      tree_first = create_token_tree (dfa, NULL, NULL, token);
2291 	      token->opr.ctx_type = INSIDE_NOTWORD;
2292             }
2293 	  tree_last = create_token_tree (dfa, NULL, NULL, token);
2294 	  tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2295 	  if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2296 	    {
2297 	      *err = REG_ESPACE;
2298 	      return NULL;
2299 	    }
2300 	}
2301       else
2302 	{
2303 	  tree = create_token_tree (dfa, NULL, NULL, token);
2304 	  if (BE (tree == NULL, 0))
2305 	    {
2306 	      *err = REG_ESPACE;
2307 	      return NULL;
2308 	    }
2309 	}
2310       /* We must return here, since ANCHORs can't be followed
2311 	 by repetition operators.
2312 	 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2313 	     it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2314       fetch_token (token, regexp, syntax);
2315       return tree;
2316     case OP_PERIOD:
2317       tree = create_token_tree (dfa, NULL, NULL, token);
2318       if (BE (tree == NULL, 0))
2319 	{
2320 	  *err = REG_ESPACE;
2321 	  return NULL;
2322 	}
2323       if (dfa->mb_cur_max > 1)
2324 	dfa->has_mb_node = 1;
2325       break;
2326     case OP_WORD:
2327     case OP_NOTWORD:
2328       tree = build_charclass_op (dfa, regexp->trans,
2329 				 (const unsigned char *) "alnum",
2330 				 (const unsigned char *) "_",
2331 				 token->type == OP_NOTWORD, err);
2332       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2333 	return NULL;
2334       break;
2335     case OP_SPACE:
2336     case OP_NOTSPACE:
2337       tree = build_charclass_op (dfa, regexp->trans,
2338 				 (const unsigned char *) "space",
2339 				 (const unsigned char *) "",
2340 				 token->type == OP_NOTSPACE, err);
2341       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2342 	return NULL;
2343       break;
2344     case OP_ALT:
2345     case END_OF_RE:
2346       return NULL;
2347     case BACK_SLASH:
2348       *err = REG_EESCAPE;
2349       return NULL;
2350     default:
2351       /* Must not happen?  */
2352 #ifdef DEBUG
2353       assert (0);
2354 #endif
2355       return NULL;
2356     }
2357   fetch_token (token, regexp, syntax);
2358 
2359   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2360 	 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2361     {
2362       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2363       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2364 	return NULL;
2365       /* In BRE consecutive duplications are not allowed.  */
2366       if ((syntax & REG_CONTEXT_INVALID_DUP)
2367 	  && (token->type == OP_DUP_ASTERISK
2368 	      || token->type == OP_OPEN_DUP_NUM))
2369 	{
2370 	  *err = REG_BADRPT;
2371 	  return NULL;
2372 	}
2373     }
2374 
2375   return tree;
2376 }
2377 
2378 /* This function build the following tree, from regular expression
2379    (<reg_exp>):
2380 	 SUBEXP
2381 	    |
2382 	<reg_exp>
2383 */
2384 
2385 static bin_tree_t *
parse_sub_exp(re_string_t * regexp,regex_t * preg,re_token_t * token,reg_syntax_t syntax,Idx nest,reg_errcode_t * err)2386 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2387 	       reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2388 {
2389   re_dfa_t *dfa = (re_dfa_t *) preg->re_buffer;
2390   bin_tree_t *tree;
2391   size_t cur_nsub;
2392   cur_nsub = preg->re_nsub++;
2393 
2394   fetch_token (token, regexp, syntax | REG_CARET_ANCHORS_HERE);
2395 
2396   /* The subexpression may be a null string.  */
2397   if (token->type == OP_CLOSE_SUBEXP)
2398     tree = NULL;
2399   else
2400     {
2401       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2402       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2403         *err = REG_EPAREN;
2404       if (BE (*err != REG_NOERROR, 0))
2405 	return NULL;
2406     }
2407 
2408   if (cur_nsub <= '9' - '1')
2409     dfa->completed_bkref_map |= 1 << cur_nsub;
2410 
2411   tree = create_tree (dfa, tree, NULL, SUBEXP);
2412   if (BE (tree == NULL, 0))
2413     {
2414       *err = REG_ESPACE;
2415       return NULL;
2416     }
2417   tree->token.opr.idx = cur_nsub;
2418   return tree;
2419 }
2420 
2421 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2422 
2423 static bin_tree_t *
parse_dup_op(bin_tree_t * elem,re_string_t * regexp,re_dfa_t * dfa,re_token_t * token,reg_syntax_t syntax,reg_errcode_t * err)2424 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2425 	      re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2426 {
2427   bin_tree_t *tree = NULL, *old_tree = NULL;
2428   Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2429   re_token_t start_token = *token;
2430 
2431   if (token->type == OP_OPEN_DUP_NUM)
2432     {
2433       end = 0;
2434       start = fetch_number (regexp, token, syntax);
2435       if (start == REG_MISSING)
2436 	{
2437 	  if (token->type == CHARACTER && token->opr.c == ',')
2438 	    start = 0; /* We treat "{,m}" as "{0,m}".  */
2439 	  else
2440 	    {
2441 	      *err = REG_BADBR; /* <re>{} is invalid.  */
2442 	      return NULL;
2443 	    }
2444 	}
2445       if (BE (start != REG_ERROR, 1))
2446 	{
2447 	  /* We treat "{n}" as "{n,n}".  */
2448 	  end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2449 		 : ((token->type == CHARACTER && token->opr.c == ',')
2450 		    ? fetch_number (regexp, token, syntax) : REG_ERROR));
2451 	}
2452       if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2453 	{
2454 	  /* Invalid sequence.  */
2455 	  if (BE (!(syntax & REG_INVALID_INTERVAL_ORD), 0))
2456 	    {
2457 	      if (token->type == END_OF_RE)
2458 		*err = REG_EBRACE;
2459 	      else
2460 		*err = REG_BADBR;
2461 
2462 	      return NULL;
2463 	    }
2464 
2465 	  /* If the syntax bit is set, rollback.  */
2466 	  re_string_set_index (regexp, start_idx);
2467 	  *token = start_token;
2468 	  token->type = CHARACTER;
2469 	  /* mb_partial and word_char bits should be already initialized by
2470 	     peek_token.  */
2471 	  return elem;
2472 	}
2473 
2474       if (BE (end != REG_MISSING && start > end, 0))
2475 	{
2476 	  /* First number greater than second.  */
2477 	  *err = REG_BADBR;
2478 	  return NULL;
2479 	}
2480     }
2481   else
2482     {
2483       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2484       end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2485     }
2486 
2487   fetch_token (token, regexp, syntax);
2488 
2489   if (BE (elem == NULL, 0))
2490     return NULL;
2491   if (BE (start == 0 && end == 0, 0))
2492     {
2493       postorder (elem, free_tree, NULL);
2494       return NULL;
2495     }
2496 
2497   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2498   if (BE (start > 0, 0))
2499     {
2500       tree = elem;
2501       for (i = 2; i <= start; ++i)
2502 	{
2503 	  elem = duplicate_tree (elem, dfa);
2504 	  tree = create_tree (dfa, tree, elem, CONCAT);
2505 	  if (BE (elem == NULL || tree == NULL, 0))
2506 	    goto parse_dup_op_espace;
2507 	}
2508 
2509       if (start == end)
2510 	return tree;
2511 
2512       /* Duplicate ELEM before it is marked optional.  */
2513       elem = duplicate_tree (elem, dfa);
2514       old_tree = tree;
2515     }
2516   else
2517     old_tree = NULL;
2518 
2519   if (elem->token.type == SUBEXP)
2520     postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2521 
2522   tree = create_tree (dfa, elem, NULL,
2523 		      (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2524   if (BE (tree == NULL, 0))
2525     goto parse_dup_op_espace;
2526 
2527   /* This loop is actually executed only when end != REG_MISSING,
2528      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2529      already created the start+1-th copy.  */
2530   if ((Idx) -1 < 0 || end != REG_MISSING)
2531     for (i = start + 2; i <= end; ++i)
2532       {
2533 	elem = duplicate_tree (elem, dfa);
2534 	tree = create_tree (dfa, tree, elem, CONCAT);
2535 	if (BE (elem == NULL || tree == NULL, 0))
2536 	  goto parse_dup_op_espace;
2537 
2538 	tree = create_tree (dfa, tree, NULL, OP_ALT);
2539 	if (BE (tree == NULL, 0))
2540 	  goto parse_dup_op_espace;
2541       }
2542 
2543   if (old_tree)
2544     tree = create_tree (dfa, old_tree, tree, CONCAT);
2545 
2546   return tree;
2547 
2548  parse_dup_op_espace:
2549   *err = REG_ESPACE;
2550   return NULL;
2551 }
2552 
2553 /* Size of the names for collating symbol/equivalence_class/character_class.
2554    I'm not sure, but maybe enough.  */
2555 #define BRACKET_NAME_BUF_SIZE 32
2556 
2557 #ifndef _LIBC
2558   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2559      Build the range expression which starts from START_ELEM, and ends
2560      at END_ELEM.  The result are written to MBCSET and SBCSET.
2561      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2562      mbcset->range_ends, is a pointer argument sinse we may
2563      update it.  */
2564 
2565 static reg_errcode_t
build_range_exp(bitset sbcset,re_charset_t * mbcset,Idx * range_alloc,bracket_elem_t * start_elem,bracket_elem_t * end_elem)2566 build_range_exp (bitset sbcset,
2567 # ifdef RE_ENABLE_I18N
2568 		 re_charset_t *mbcset, Idx *range_alloc,
2569 # endif
2570 		 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2571 {
2572   unsigned int start_ch, end_ch;
2573   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2574   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2575 	  || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2576 	  0))
2577     return REG_ERANGE;
2578 
2579   /* We can handle no multi character collating elements without libc
2580      support.  */
2581   if (BE ((start_elem->type == COLL_SYM
2582 	   && strlen ((char *) start_elem->opr.name) > 1)
2583 	  || (end_elem->type == COLL_SYM
2584 	      && strlen ((char *) end_elem->opr.name) > 1), 0))
2585     return REG_ECOLLATE;
2586 
2587 # ifdef RE_ENABLE_I18N
2588   {
2589     wchar_t wc;
2590     wint_t start_wc, end_wc;
2591     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2592 
2593     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2594 		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2595 		   : 0));
2596     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2597 	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2598 		 : 0));
2599     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2600 		? __btowc (start_ch) : start_elem->opr.wch);
2601     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2602 	      ? __btowc (end_ch) : end_elem->opr.wch);
2603     if (start_wc == WEOF || end_wc == WEOF)
2604       return REG_ECOLLATE;
2605     cmp_buf[0] = start_wc;
2606     cmp_buf[4] = end_wc;
2607     if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2608       return REG_ERANGE;
2609 
2610     /* Got valid collation sequence values, add them as a new entry.
2611        However, for !_LIBC we have no collation elements: if the
2612        character set is single byte, the single byte character set
2613        that we build below suffices.  parse_bracket_exp passes
2614        no MBCSET if dfa->mb_cur_max == 1.  */
2615     if (mbcset)
2616       {
2617         /* Check the space of the arrays.  */
2618         if (BE (*range_alloc == mbcset->nranges, 0))
2619           {
2620 	    /* There is not enough space, need realloc.  */
2621 	    wchar_t *new_array_start, *new_array_end;
2622 	    Idx new_nranges;
2623 
2624 	    new_nranges = mbcset->nranges;
2625 	    /* Use realloc since mbcset->range_starts and mbcset->range_ends
2626 	       are NULL if *range_alloc == 0.  */
2627 	    new_array_start = re_x2realloc (mbcset->range_starts, wchar_t,
2628 					    &new_nranges);
2629 	    new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2630 				        new_nranges);
2631 
2632 	    if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2633 	      return REG_ESPACE;
2634 
2635 	    mbcset->range_starts = new_array_start;
2636 	    mbcset->range_ends = new_array_end;
2637 	    *range_alloc = new_nranges;
2638           }
2639 
2640         mbcset->range_starts[mbcset->nranges] = start_wc;
2641         mbcset->range_ends[mbcset->nranges++] = end_wc;
2642       }
2643 
2644     /* Build the table for single byte characters.  */
2645     for (wc = 0; wc < SBC_MAX; ++wc)
2646       {
2647 	cmp_buf[2] = wc;
2648 	if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2649 	    && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2650 	  bitset_set (sbcset, wc);
2651       }
2652   }
2653 # else /* not RE_ENABLE_I18N */
2654   {
2655     unsigned int ch;
2656     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2657 		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2658 		   : 0));
2659     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2660 	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2661 		 : 0));
2662     if (start_ch > end_ch)
2663       return REG_ERANGE;
2664     /* Build the table for single byte characters.  */
2665     for (ch = 0; ch < SBC_MAX; ++ch)
2666       if (start_ch <= ch  && ch <= end_ch)
2667 	bitset_set (sbcset, ch);
2668   }
2669 # endif /* not RE_ENABLE_I18N */
2670   return REG_NOERROR;
2671 }
2672 #endif /* not _LIBC */
2673 
2674 #ifndef _LIBC
2675 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2676    Build the collating element which is represented by NAME.
2677    The result are written to MBCSET and SBCSET.
2678    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2679    pointer argument since we may update it.  */
2680 
2681 static reg_errcode_t
build_collating_symbol(bitset sbcset,re_charset_t * mbcset,Idx * coll_sym_alloc,const unsigned char * name)2682 build_collating_symbol (bitset sbcset,
2683 # ifdef RE_ENABLE_I18N
2684 			re_charset_t *mbcset, Idx *coll_sym_alloc,
2685 # endif
2686 			const unsigned char *name)
2687 {
2688   size_t name_len = strlen ((const char *) name);
2689   if (BE (name_len != 1, 0))
2690     return REG_ECOLLATE;
2691   else
2692     {
2693       bitset_set (sbcset, name[0]);
2694       return REG_NOERROR;
2695     }
2696 }
2697 #endif /* not _LIBC */
2698 
2699 /* This function parse bracket expression like "[abc]", "[a-c]",
2700    "[[.a-a.]]" etc.  */
2701 
2702 static bin_tree_t *
parse_bracket_exp(re_string_t * regexp,re_dfa_t * dfa,re_token_t * token,reg_syntax_t syntax,reg_errcode_t * err)2703 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2704 		   reg_syntax_t syntax, reg_errcode_t *err)
2705 {
2706 #ifdef _LIBC
2707   const unsigned char *collseqmb;
2708   const char *collseqwc;
2709   uint32_t nrules;
2710   int32_t table_size;
2711   const int32_t *symb_table;
2712   const unsigned char *extra;
2713 
2714   /* Local function for parse_bracket_exp used in _LIBC environement.
2715      Seek the collating symbol entry correspondings to NAME.
2716      Return the index of the symbol in the SYMB_TABLE.  */
2717 
2718   auto inline int32_t
2719   __attribute ((always_inline))
2720   seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
2721     {
2722       int32_t hash = elem_hash ((const char *) name, name_len);
2723       int32_t elem = hash % table_size;
2724       int32_t second = hash % (table_size - 2);
2725       while (symb_table[2 * elem] != 0)
2726 	{
2727 	  /* First compare the hashing value.  */
2728 	  if (symb_table[2 * elem] == hash
2729 	      /* Compare the length of the name.  */
2730 	      && name_len == extra[symb_table[2 * elem + 1]]
2731 	      /* Compare the name.  */
2732 	      && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2733 			 name_len) == 0)
2734 	    {
2735 	      /* Yep, this is the entry.  */
2736 	      break;
2737 	    }
2738 
2739 	  /* Next entry.  */
2740 	  elem += second;
2741 	}
2742       return elem;
2743     }
2744 
2745   /* Local function for parse_bracket_exp used in _LIBC environement.
2746      Look up the collation sequence value of BR_ELEM.
2747      Return the value if succeeded, UINT_MAX otherwise.  */
2748 
2749   auto inline unsigned int
2750   __attribute ((always_inline))
2751   lookup_collation_sequence_value (bracket_elem_t *br_elem)
2752     {
2753       if (br_elem->type == SB_CHAR)
2754 	{
2755 	  /*
2756 	  if (MB_CUR_MAX == 1)
2757 	  */
2758 	  if (nrules == 0)
2759 	    return collseqmb[br_elem->opr.ch];
2760 	  else
2761 	    {
2762 	      wint_t wc = __btowc (br_elem->opr.ch);
2763 	      return __collseq_table_lookup (collseqwc, wc);
2764 	    }
2765 	}
2766       else if (br_elem->type == MB_CHAR)
2767 	{
2768 	  return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2769 	}
2770       else if (br_elem->type == COLL_SYM)
2771 	{
2772 	  size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2773 	  if (nrules != 0)
2774 	    {
2775 	      int32_t elem, idx;
2776 	      elem = seek_collating_symbol_entry (br_elem->opr.name,
2777 						  sym_name_len);
2778 	      if (symb_table[2 * elem] != 0)
2779 		{
2780 		  /* We found the entry.  */
2781 		  idx = symb_table[2 * elem + 1];
2782 		  /* Skip the name of collating element name.  */
2783 		  idx += 1 + extra[idx];
2784 		  /* Skip the byte sequence of the collating element.  */
2785 		  idx += 1 + extra[idx];
2786 		  /* Adjust for the alignment.  */
2787 		  idx = (idx + 3) & ~3;
2788 		  /* Skip the multibyte collation sequence value.  */
2789 		  idx += sizeof (unsigned int);
2790 		  /* Skip the wide char sequence of the collating element.  */
2791 		  idx += sizeof (unsigned int) *
2792 		    (1 + *(unsigned int *) (extra + idx));
2793 		  /* Return the collation sequence value.  */
2794 		  return *(unsigned int *) (extra + idx);
2795 		}
2796 	      else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2797 		{
2798 		  /* No valid character.  Match it as a single byte
2799 		     character.  */
2800 		  return collseqmb[br_elem->opr.name[0]];
2801 		}
2802 	    }
2803 	  else if (sym_name_len == 1)
2804 	    return collseqmb[br_elem->opr.name[0]];
2805 	}
2806       return UINT_MAX;
2807     }
2808 
2809   /* Local function for parse_bracket_exp used in _LIBC environement.
2810      Build the range expression which starts from START_ELEM, and ends
2811      at END_ELEM.  The result are written to MBCSET and SBCSET.
2812      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2813      mbcset->range_ends, is a pointer argument sinse we may
2814      update it.  */
2815 
2816   auto inline reg_errcode_t
2817   __attribute ((always_inline))
2818   build_range_exp (bitset sbcset, re_charset_t *mbcset,
2819 		   Idx *range_alloc,
2820 		   bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2821     {
2822       unsigned int ch;
2823       uint32_t start_collseq;
2824       uint32_t end_collseq;
2825 
2826       /* Equivalence Classes and Character Classes can't be a range
2827 	 start/end.  */
2828       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2829 	      || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2830 	      0))
2831 	return REG_ERANGE;
2832 
2833       start_collseq = lookup_collation_sequence_value (start_elem);
2834       end_collseq = lookup_collation_sequence_value (end_elem);
2835       /* Check start/end collation sequence values.  */
2836       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2837 	return REG_ECOLLATE;
2838       if (BE ((syntax & REG_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2839 	return REG_ERANGE;
2840 
2841       /* Got valid collation sequence values, add them as a new entry.
2842 	 However, if we have no collation elements, and the character set
2843 	 is single byte, the single byte character set that we
2844 	 build below suffices. */
2845       if (nrules > 0 || dfa->mb_cur_max > 1)
2846 	{
2847           /* Check the space of the arrays.  */
2848           if (BE (*range_alloc == mbcset->nranges, 0))
2849 	    {
2850 	      /* There is not enough space, need realloc.  */
2851 	      uint32_t *new_array_start;
2852 	      uint32_t *new_array_end;
2853 	      Idx new_nranges;
2854 
2855 	      new_nranges = mbcset->nranges;
2856 	      new_array_start = re_x2realloc (mbcset->range_starts, uint32_t,
2857 					      &new_nranges);
2858 	      new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2859 				          new_nranges);
2860 
2861 	      if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2862 	        return REG_ESPACE;
2863 
2864 	      mbcset->range_starts = new_array_start;
2865 	      mbcset->range_ends = new_array_end;
2866 	      *range_alloc = new_nranges;
2867 	    }
2868 
2869           mbcset->range_starts[mbcset->nranges] = start_collseq;
2870           mbcset->range_ends[mbcset->nranges++] = end_collseq;
2871 	}
2872 
2873       /* Build the table for single byte characters.  */
2874       for (ch = 0; ch < SBC_MAX; ch++)
2875 	{
2876 	  uint32_t ch_collseq;
2877 	  /*
2878 	  if (MB_CUR_MAX == 1)
2879 	  */
2880 	  if (nrules == 0)
2881 	    ch_collseq = collseqmb[ch];
2882 	  else
2883 	    ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2884 	  if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2885 	    bitset_set (sbcset, ch);
2886 	}
2887       return REG_NOERROR;
2888     }
2889 
2890   /* Local function for parse_bracket_exp used in _LIBC environement.
2891      Build the collating element which is represented by NAME.
2892      The result are written to MBCSET and SBCSET.
2893      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2894      pointer argument sinse we may update it.  */
2895 
2896   auto inline reg_errcode_t
2897   __attribute ((always_inline))
2898   build_collating_symbol (bitset sbcset, re_charset_t *mbcset,
2899 			  Idx *coll_sym_alloc, const unsigned char *name)
2900     {
2901       int32_t elem, idx;
2902       size_t name_len = strlen ((const char *) name);
2903       if (nrules != 0)
2904 	{
2905 	  elem = seek_collating_symbol_entry (name, name_len);
2906 	  if (symb_table[2 * elem] != 0)
2907 	    {
2908 	      /* We found the entry.  */
2909 	      idx = symb_table[2 * elem + 1];
2910 	      /* Skip the name of collating element name.  */
2911 	      idx += 1 + extra[idx];
2912 	    }
2913 	  else if (symb_table[2 * elem] == 0 && name_len == 1)
2914 	    {
2915 	      /* No valid character, treat it as a normal
2916 		 character.  */
2917 	      bitset_set (sbcset, name[0]);
2918 	      return REG_NOERROR;
2919 	    }
2920 	  else
2921 	    return REG_ECOLLATE;
2922 
2923 	  /* Got valid collation sequence, add it as a new entry.  */
2924 	  /* Check the space of the arrays.  */
2925 	  if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2926 	    {
2927 	      /* Not enough, realloc it.  */
2928 	      Idx new_coll_sym_alloc = mbcset->ncoll_syms;
2929 	      /* Use realloc since mbcset->coll_syms is NULL
2930 		 if *alloc == 0.  */
2931 	      int32_t *new_coll_syms = re_x2realloc (mbcset->coll_syms, int32_t,
2932 						     &new_coll_sym_alloc);
2933 	      if (BE (new_coll_syms == NULL, 0))
2934 		return REG_ESPACE;
2935 	      mbcset->coll_syms = new_coll_syms;
2936 	      *coll_sym_alloc = new_coll_sym_alloc;
2937 	    }
2938 	  mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2939 	  return REG_NOERROR;
2940 	}
2941       else
2942 	{
2943 	  if (BE (name_len != 1, 0))
2944 	    return REG_ECOLLATE;
2945 	  else
2946 	    {
2947 	      bitset_set (sbcset, name[0]);
2948 	      return REG_NOERROR;
2949 	    }
2950 	}
2951     }
2952 #endif
2953 
2954   re_token_t br_token;
2955   re_bitset_ptr_t sbcset;
2956 #ifdef RE_ENABLE_I18N
2957   re_charset_t *mbcset;
2958   Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2959   Idx equiv_class_alloc = 0, char_class_alloc = 0;
2960 #endif /* not RE_ENABLE_I18N */
2961   bool non_match = false;
2962   bin_tree_t *work_tree;
2963   int token_len;
2964   bool first_round = true;
2965 #ifdef _LIBC
2966   collseqmb = (const unsigned char *)
2967     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
2968   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2969   if (nrules)
2970     {
2971       /*
2972       if (MB_CUR_MAX > 1)
2973       */
2974 	collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
2975       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
2976       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
2977 						  _NL_COLLATE_SYMB_TABLEMB);
2978       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
2979 						   _NL_COLLATE_SYMB_EXTRAMB);
2980     }
2981 #endif
2982   sbcset = re_calloc (bitset_word, BITSET_WORDS);
2983 #ifdef RE_ENABLE_I18N
2984   mbcset = re_calloc (re_charset_t, 1);
2985 #endif /* RE_ENABLE_I18N */
2986 #ifdef RE_ENABLE_I18N
2987   if (BE (sbcset == NULL || mbcset == NULL, 0))
2988 #else
2989   if (BE (sbcset == NULL, 0))
2990 #endif /* RE_ENABLE_I18N */
2991     {
2992       *err = REG_ESPACE;
2993       return NULL;
2994     }
2995 
2996   token_len = peek_token_bracket (token, regexp, syntax);
2997   if (BE (token->type == END_OF_RE, 0))
2998     {
2999       *err = REG_BADPAT;
3000       goto parse_bracket_exp_free_return;
3001     }
3002   if (token->type == OP_NON_MATCH_LIST)
3003     {
3004 #ifdef RE_ENABLE_I18N
3005       mbcset->non_match = 1;
3006 #endif /* not RE_ENABLE_I18N */
3007       non_match = true;
3008       if (syntax & REG_HAT_LISTS_NOT_NEWLINE)
3009 	bitset_set (sbcset, '\0');
3010       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3011       token_len = peek_token_bracket (token, regexp, syntax);
3012       if (BE (token->type == END_OF_RE, 0))
3013 	{
3014 	  *err = REG_BADPAT;
3015 	  goto parse_bracket_exp_free_return;
3016 	}
3017     }
3018 
3019   /* We treat the first ']' as a normal character.  */
3020   if (token->type == OP_CLOSE_BRACKET)
3021     token->type = CHARACTER;
3022 
3023   while (1)
3024     {
3025       bracket_elem_t start_elem, end_elem;
3026       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3027       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3028       reg_errcode_t ret;
3029       int token_len2 = 0;
3030       bool is_range_exp = false;
3031       re_token_t token2;
3032 
3033       start_elem.opr.name = start_name_buf;
3034       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3035 				   syntax, first_round);
3036       if (BE (ret != REG_NOERROR, 0))
3037 	{
3038 	  *err = ret;
3039 	  goto parse_bracket_exp_free_return;
3040 	}
3041       first_round = false;
3042 
3043       /* Get information about the next token.  We need it in any case.  */
3044       token_len = peek_token_bracket (token, regexp, syntax);
3045 
3046       /* Do not check for ranges if we know they are not allowed.  */
3047       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3048 	{
3049 	  if (BE (token->type == END_OF_RE, 0))
3050 	    {
3051 	      *err = REG_EBRACK;
3052 	      goto parse_bracket_exp_free_return;
3053 	    }
3054 	  if (token->type == OP_CHARSET_RANGE)
3055 	    {
3056 	      re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3057 	      token_len2 = peek_token_bracket (&token2, regexp, syntax);
3058 	      if (BE (token2.type == END_OF_RE, 0))
3059 		{
3060 		  *err = REG_EBRACK;
3061 		  goto parse_bracket_exp_free_return;
3062 		}
3063 	      if (token2.type == OP_CLOSE_BRACKET)
3064 		{
3065 		  /* We treat the last '-' as a normal character.  */
3066 		  re_string_skip_bytes (regexp, -token_len);
3067 		  token->type = CHARACTER;
3068 		}
3069 	      else
3070 		is_range_exp = true;
3071 	    }
3072 	}
3073 
3074       if (is_range_exp == true)
3075 	{
3076 	  end_elem.opr.name = end_name_buf;
3077 	  ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3078 				       dfa, syntax, true);
3079 	  if (BE (ret != REG_NOERROR, 0))
3080 	    {
3081 	      *err = ret;
3082 	      goto parse_bracket_exp_free_return;
3083 	    }
3084 
3085 	  token_len = peek_token_bracket (token, regexp, syntax);
3086 
3087 #ifdef _LIBC
3088 	  *err = build_range_exp (sbcset, mbcset, &range_alloc,
3089 				  &start_elem, &end_elem);
3090 #else
3091 # ifdef RE_ENABLE_I18N
3092 	  *err = build_range_exp (sbcset,
3093 				  dfa->mb_cur_max > 1 ? mbcset : NULL,
3094 				  &range_alloc, &start_elem, &end_elem);
3095 # else
3096 	  *err = build_range_exp (sbcset, &start_elem, &end_elem);
3097 # endif
3098 #endif /* RE_ENABLE_I18N */
3099 	  if (BE (*err != REG_NOERROR, 0))
3100 	    goto parse_bracket_exp_free_return;
3101 	}
3102       else
3103 	{
3104 	  switch (start_elem.type)
3105 	    {
3106 	    case SB_CHAR:
3107 	      bitset_set (sbcset, start_elem.opr.ch);
3108 	      break;
3109 #ifdef RE_ENABLE_I18N
3110 	    case MB_CHAR:
3111 	      /* Check whether the array has enough space.  */
3112 	      if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3113 		{
3114 		  wchar_t *new_mbchars;
3115 		  /* Not enough, realloc it.  */
3116 		  mbchar_alloc = mbcset->nmbchars;
3117 		  /* Use realloc since array is NULL if *alloc == 0.  */
3118 		  new_mbchars = re_x2realloc (mbcset->mbchars, wchar_t,
3119 					      &mbchar_alloc);
3120 		  if (BE (new_mbchars == NULL, 0))
3121 		    goto parse_bracket_exp_espace;
3122 		  mbcset->mbchars = new_mbchars;
3123 		}
3124 	      mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3125 	      break;
3126 #endif /* RE_ENABLE_I18N */
3127 	    case EQUIV_CLASS:
3128 	      *err = build_equiv_class (sbcset,
3129 #ifdef RE_ENABLE_I18N
3130 					mbcset, &equiv_class_alloc,
3131 #endif /* RE_ENABLE_I18N */
3132 					start_elem.opr.name);
3133 	      if (BE (*err != REG_NOERROR, 0))
3134 		goto parse_bracket_exp_free_return;
3135 	      break;
3136 	    case COLL_SYM:
3137 	      *err = build_collating_symbol (sbcset,
3138 #ifdef RE_ENABLE_I18N
3139 					     mbcset, &coll_sym_alloc,
3140 #endif /* RE_ENABLE_I18N */
3141 					     start_elem.opr.name);
3142 	      if (BE (*err != REG_NOERROR, 0))
3143 		goto parse_bracket_exp_free_return;
3144 	      break;
3145 	    case CHAR_CLASS:
3146 	      *err = build_charclass (regexp->trans, sbcset,
3147 #ifdef RE_ENABLE_I18N
3148 				      mbcset, &char_class_alloc,
3149 #endif /* RE_ENABLE_I18N */
3150 				      start_elem.opr.name, syntax);
3151 	      if (BE (*err != REG_NOERROR, 0))
3152 	       goto parse_bracket_exp_free_return;
3153 	      break;
3154 	    default:
3155 	      assert (0);
3156 	      break;
3157 	    }
3158 	}
3159       if (BE (token->type == END_OF_RE, 0))
3160 	{
3161 	  *err = REG_EBRACK;
3162 	  goto parse_bracket_exp_free_return;
3163 	}
3164       if (token->type == OP_CLOSE_BRACKET)
3165 	break;
3166     }
3167 
3168   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3169 
3170   /* If it is non-matching list.  */
3171   if (non_match)
3172     bitset_not (sbcset);
3173 
3174 #ifdef RE_ENABLE_I18N
3175   /* Ensure only single byte characters are set.  */
3176   if (dfa->mb_cur_max > 1)
3177     bitset_mask (sbcset, dfa->sb_char);
3178 
3179   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3180       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3181 						     || mbcset->non_match)))
3182     {
3183       bin_tree_t *mbc_tree;
3184       int sbc_idx;
3185       /* Build a tree for complex bracket.  */
3186       dfa->has_mb_node = 1;
3187       br_token.type = COMPLEX_BRACKET;
3188       br_token.opr.mbcset = mbcset;
3189       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3190       if (BE (mbc_tree == NULL, 0))
3191 	goto parse_bracket_exp_espace;
3192       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3193 	if (sbcset[sbc_idx])
3194 	  break;
3195       /* If there are no bits set in sbcset, there is no point
3196 	 of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3197       if (sbc_idx < BITSET_WORDS)
3198 	{
3199           /* Build a tree for simple bracket.  */
3200           br_token.type = SIMPLE_BRACKET;
3201           br_token.opr.sbcset = sbcset;
3202           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3203           if (BE (work_tree == NULL, 0))
3204             goto parse_bracket_exp_espace;
3205 
3206           /* Then join them by ALT node.  */
3207           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3208           if (BE (work_tree == NULL, 0))
3209             goto parse_bracket_exp_espace;
3210 	}
3211       else
3212 	{
3213 	  re_free (sbcset);
3214 	  work_tree = mbc_tree;
3215 	}
3216     }
3217   else
3218 #endif /* not RE_ENABLE_I18N */
3219     {
3220 #ifdef RE_ENABLE_I18N
3221       free_charset (mbcset);
3222 #endif
3223       /* Build a tree for simple bracket.  */
3224       br_token.type = SIMPLE_BRACKET;
3225       br_token.opr.sbcset = sbcset;
3226       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3227       if (BE (work_tree == NULL, 0))
3228         goto parse_bracket_exp_espace;
3229     }
3230   return work_tree;
3231 
3232  parse_bracket_exp_espace:
3233   *err = REG_ESPACE;
3234  parse_bracket_exp_free_return:
3235   re_free (sbcset);
3236 #ifdef RE_ENABLE_I18N
3237   free_charset (mbcset);
3238 #endif /* RE_ENABLE_I18N */
3239   return NULL;
3240 }
3241 
3242 /* Parse an element in the bracket expression.  */
3243 
3244 static reg_errcode_t
parse_bracket_element(bracket_elem_t * elem,re_string_t * regexp,re_token_t * token,int token_len,re_dfa_t * dfa,reg_syntax_t syntax,bool accept_hyphen)3245 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3246 		       re_token_t *token, int token_len, re_dfa_t *dfa,
3247 		       reg_syntax_t syntax, bool accept_hyphen)
3248 {
3249 #ifdef RE_ENABLE_I18N
3250   int cur_char_size;
3251   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3252   if (cur_char_size > 1)
3253     {
3254       elem->type = MB_CHAR;
3255       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3256       re_string_skip_bytes (regexp, cur_char_size);
3257       return REG_NOERROR;
3258     }
3259 #endif /* RE_ENABLE_I18N */
3260   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3261   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3262       || token->type == OP_OPEN_EQUIV_CLASS)
3263     return parse_bracket_symbol (elem, regexp, token);
3264   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3265     {
3266       /* A '-' must only appear as anything but a range indicator before
3267 	 the closing bracket.  Everything else is an error.  */
3268       re_token_t token2;
3269       (void) peek_token_bracket (&token2, regexp, syntax);
3270       if (token2.type != OP_CLOSE_BRACKET)
3271 	/* The actual error value is not standardized since this whole
3272 	   case is undefined.  But ERANGE makes good sense.  */
3273 	return REG_ERANGE;
3274     }
3275   elem->type = SB_CHAR;
3276   elem->opr.ch = token->opr.c;
3277   return REG_NOERROR;
3278 }
3279 
3280 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3281    such as [:<character_class>:], [.<collating_element>.], and
3282    [=<equivalent_class>=].  */
3283 
3284 static reg_errcode_t
parse_bracket_symbol(bracket_elem_t * elem,re_string_t * regexp,re_token_t * token)3285 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3286 		      re_token_t *token)
3287 {
3288   unsigned char ch, delim = token->opr.c;
3289   int i = 0;
3290   if (re_string_eoi(regexp))
3291     return REG_EBRACK;
3292   for (;; ++i)
3293     {
3294       if (i >= BRACKET_NAME_BUF_SIZE)
3295 	return REG_EBRACK;
3296       if (token->type == OP_OPEN_CHAR_CLASS)
3297 	ch = re_string_fetch_byte_case (regexp);
3298       else
3299 	ch = re_string_fetch_byte (regexp);
3300       if (re_string_eoi(regexp))
3301 	return REG_EBRACK;
3302       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3303 	break;
3304       elem->opr.name[i] = ch;
3305     }
3306   re_string_skip_bytes (regexp, 1);
3307   elem->opr.name[i] = '\0';
3308   switch (token->type)
3309     {
3310     case OP_OPEN_COLL_ELEM:
3311       elem->type = COLL_SYM;
3312       break;
3313     case OP_OPEN_EQUIV_CLASS:
3314       elem->type = EQUIV_CLASS;
3315       break;
3316     case OP_OPEN_CHAR_CLASS:
3317       elem->type = CHAR_CLASS;
3318       break;
3319     default:
3320       break;
3321     }
3322   return REG_NOERROR;
3323 }
3324 
3325   /* Helper function for parse_bracket_exp.
3326      Build the equivalence class which is represented by NAME.
3327      The result are written to MBCSET and SBCSET.
3328      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3329      is a pointer argument sinse we may update it.  */
3330 
3331 static reg_errcode_t
build_equiv_class(bitset sbcset,re_charset_t * mbcset,Idx * equiv_class_alloc,const unsigned char * name)3332 build_equiv_class (bitset sbcset,
3333 #ifdef RE_ENABLE_I18N
3334 		   re_charset_t *mbcset, Idx *equiv_class_alloc,
3335 #endif
3336 		   const unsigned char *name)
3337 {
3338 #if defined _LIBC
3339   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3340   if (nrules != 0)
3341     {
3342       const int32_t *table, *indirect;
3343       const unsigned char *weights, *extra, *cp;
3344       unsigned char char_buf[2];
3345       int32_t idx1, idx2;
3346       unsigned int ch;
3347       size_t len;
3348       /* This #include defines a local function!  */
3349 # include <locale/weight.h>
3350       /* Calculate the index for equivalence class.  */
3351       cp = name;
3352       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3353       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3354 					       _NL_COLLATE_WEIGHTMB);
3355       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3356 						   _NL_COLLATE_EXTRAMB);
3357       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3358 						_NL_COLLATE_INDIRECTMB);
3359       idx1 = findidx (&cp);
3360       if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3361 	/* This isn't a valid character.  */
3362 	return REG_ECOLLATE;
3363 
3364       /* Build single byte matcing table for this equivalence class.  */
3365       char_buf[1] = (unsigned char) '\0';
3366       len = weights[idx1];
3367       for (ch = 0; ch < SBC_MAX; ++ch)
3368 	{
3369 	  char_buf[0] = ch;
3370 	  cp = char_buf;
3371 	  idx2 = findidx (&cp);
3372 /*
3373 	  idx2 = table[ch];
3374 */
3375 	  if (idx2 == 0)
3376 	    /* This isn't a valid character.  */
3377 	    continue;
3378 	  if (len == weights[idx2])
3379 	    {
3380 	      int cnt = 0;
3381 	      while (cnt <= len &&
3382 		     weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3383 		++cnt;
3384 
3385 	      if (cnt > len)
3386 		bitset_set (sbcset, ch);
3387 	    }
3388 	}
3389       /* Check whether the array has enough space.  */
3390       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3391 	{
3392 	  /* Not enough, realloc it.  */
3393 	  Idx new_equiv_class_alloc = mbcset->nequiv_classes;
3394 	  /* Use realloc since the array is NULL if *alloc == 0.  */
3395 	  int32_t *new_equiv_classes = re_x2realloc (mbcset->equiv_classes,
3396 						     int32_t,
3397 						     &new_equiv_class_alloc);
3398 	  if (BE (new_equiv_classes == NULL, 0))
3399 	    return REG_ESPACE;
3400 	  mbcset->equiv_classes = new_equiv_classes;
3401 	  *equiv_class_alloc = new_equiv_class_alloc;
3402 	}
3403       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3404     }
3405   else
3406 #endif /* _LIBC */
3407     {
3408       if (BE (strlen ((const char *) name) != 1, 0))
3409 	return REG_ECOLLATE;
3410       bitset_set (sbcset, *name);
3411     }
3412   return REG_NOERROR;
3413 }
3414 
3415   /* Helper function for parse_bracket_exp.
3416      Build the character class which is represented by NAME.
3417      The result are written to MBCSET and SBCSET.
3418      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3419      is a pointer argument sinse we may update it.  */
3420 
3421 static reg_errcode_t
build_charclass(unsigned REG_TRANSLATE_TYPE trans,bitset sbcset,re_charset_t * mbcset,Idx * char_class_alloc,const unsigned char * class_name,reg_syntax_t syntax)3422 build_charclass (unsigned REG_TRANSLATE_TYPE trans, bitset sbcset,
3423 #ifdef RE_ENABLE_I18N
3424 		 re_charset_t *mbcset, Idx *char_class_alloc,
3425 #endif
3426 		 const unsigned char *class_name, reg_syntax_t syntax)
3427 {
3428   int i;
3429   const char *name = (const char *) class_name;
3430 
3431   /* In case of REG_ICASE "upper" and "lower" match the both of
3432      upper and lower cases.  */
3433   if ((syntax & REG_IGNORE_CASE)
3434       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3435     name = "alpha";
3436 
3437 #ifdef RE_ENABLE_I18N
3438   /* Check the space of the arrays.  */
3439   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3440     {
3441       /* Not enough, realloc it.  */
3442       Idx new_char_class_alloc = mbcset->nchar_classes;
3443       /* Use realloc since array is NULL if *alloc == 0.  */
3444       wctype_t *new_char_classes = re_x2realloc (mbcset->char_classes, wctype_t,
3445 						 &new_char_class_alloc);
3446       if (BE (new_char_classes == NULL, 0))
3447 	return REG_ESPACE;
3448       mbcset->char_classes = new_char_classes;
3449       *char_class_alloc = new_char_class_alloc;
3450     }
3451   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3452 #endif /* RE_ENABLE_I18N */
3453 
3454 #define BUILD_CHARCLASS_LOOP(ctype_func)	\
3455     for (i = 0; i < SBC_MAX; ++i)		\
3456       {						\
3457 	if (ctype_func (i))			\
3458 	  {					\
3459 	    int ch = trans ? trans[i] : i;	\
3460 	    bitset_set (sbcset, ch);		\
3461 	  }					\
3462       }
3463 
3464   if (strcmp (name, "alnum") == 0)
3465     BUILD_CHARCLASS_LOOP (isalnum)
3466   else if (strcmp (name, "cntrl") == 0)
3467     BUILD_CHARCLASS_LOOP (iscntrl)
3468   else if (strcmp (name, "lower") == 0)
3469     BUILD_CHARCLASS_LOOP (islower)
3470   else if (strcmp (name, "space") == 0)
3471     BUILD_CHARCLASS_LOOP (isspace)
3472   else if (strcmp (name, "alpha") == 0)
3473     BUILD_CHARCLASS_LOOP (isalpha)
3474   else if (strcmp (name, "digit") == 0)
3475     BUILD_CHARCLASS_LOOP (isdigit)
3476   else if (strcmp (name, "print") == 0)
3477     BUILD_CHARCLASS_LOOP (isprint)
3478   else if (strcmp (name, "upper") == 0)
3479     BUILD_CHARCLASS_LOOP (isupper)
3480   else if (strcmp (name, "blank") == 0)
3481     BUILD_CHARCLASS_LOOP (isblank)
3482   else if (strcmp (name, "graph") == 0)
3483     BUILD_CHARCLASS_LOOP (isgraph)
3484   else if (strcmp (name, "punct") == 0)
3485     BUILD_CHARCLASS_LOOP (ispunct)
3486   else if (strcmp (name, "xdigit") == 0)
3487     BUILD_CHARCLASS_LOOP (isxdigit)
3488   else
3489     return REG_ECTYPE;
3490 
3491   return REG_NOERROR;
3492 }
3493 
3494 static bin_tree_t *
build_charclass_op(re_dfa_t * dfa,unsigned REG_TRANSLATE_TYPE trans,const unsigned char * class_name,const unsigned char * extra,bool non_match,reg_errcode_t * err)3495 build_charclass_op (re_dfa_t *dfa, unsigned REG_TRANSLATE_TYPE trans,
3496 		    const unsigned char *class_name,
3497 		    const unsigned char *extra,
3498 		    bool non_match, reg_errcode_t *err)
3499 {
3500   re_bitset_ptr_t sbcset;
3501 #ifdef RE_ENABLE_I18N
3502   re_charset_t *mbcset;
3503   Idx alloc = 0;
3504 #endif /* not RE_ENABLE_I18N */
3505   reg_errcode_t ret;
3506   re_token_t br_token;
3507   bin_tree_t *tree;
3508 
3509   sbcset = re_calloc (bitset_word, BITSET_WORDS);
3510 #ifdef RE_ENABLE_I18N
3511   mbcset = re_calloc (re_charset_t, 1);
3512 #endif /* RE_ENABLE_I18N */
3513 
3514 #ifdef RE_ENABLE_I18N
3515   if (BE (sbcset == NULL || mbcset == NULL, 0))
3516 #else /* not RE_ENABLE_I18N */
3517   if (BE (sbcset == NULL, 0))
3518 #endif /* not RE_ENABLE_I18N */
3519     {
3520       *err = REG_ESPACE;
3521       return NULL;
3522     }
3523 
3524   if (non_match)
3525     {
3526 #ifdef RE_ENABLE_I18N
3527       /*
3528       if (syntax & REG_HAT_LISTS_NOT_NEWLINE)
3529 	bitset_set(cset->sbcset, '\0');
3530       */
3531       mbcset->non_match = 1;
3532 #endif /* not RE_ENABLE_I18N */
3533     }
3534 
3535   /* We don't care the syntax in this case.  */
3536   ret = build_charclass (trans, sbcset,
3537 #ifdef RE_ENABLE_I18N
3538 			 mbcset, &alloc,
3539 #endif /* RE_ENABLE_I18N */
3540 			 class_name, 0);
3541 
3542   if (BE (ret != REG_NOERROR, 0))
3543     {
3544       re_free (sbcset);
3545 #ifdef RE_ENABLE_I18N
3546       free_charset (mbcset);
3547 #endif /* RE_ENABLE_I18N */
3548       *err = ret;
3549       return NULL;
3550     }
3551   /* \w match '_' also.  */
3552   for (; *extra; extra++)
3553     bitset_set (sbcset, *extra);
3554 
3555   /* If it is non-matching list.  */
3556   if (non_match)
3557     bitset_not (sbcset);
3558 
3559 #ifdef RE_ENABLE_I18N
3560   /* Ensure only single byte characters are set.  */
3561   if (dfa->mb_cur_max > 1)
3562     bitset_mask (sbcset, dfa->sb_char);
3563 #endif
3564 
3565   /* Build a tree for simple bracket.  */
3566   br_token.type = SIMPLE_BRACKET;
3567   br_token.opr.sbcset = sbcset;
3568   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3569   if (BE (tree == NULL, 0))
3570     goto build_word_op_espace;
3571 
3572 #ifdef RE_ENABLE_I18N
3573   if (dfa->mb_cur_max > 1)
3574     {
3575       bin_tree_t *mbc_tree;
3576       /* Build a tree for complex bracket.  */
3577       br_token.type = COMPLEX_BRACKET;
3578       br_token.opr.mbcset = mbcset;
3579       dfa->has_mb_node = 1;
3580       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3581       if (BE (mbc_tree == NULL, 0))
3582 	goto build_word_op_espace;
3583       /* Then join them by ALT node.  */
3584       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3585       if (BE (mbc_tree != NULL, 1))
3586 	return tree;
3587     }
3588   else
3589     {
3590       free_charset (mbcset);
3591       return tree;
3592     }
3593 #else /* not RE_ENABLE_I18N */
3594   return tree;
3595 #endif /* not RE_ENABLE_I18N */
3596 
3597  build_word_op_espace:
3598   re_free (sbcset);
3599 #ifdef RE_ENABLE_I18N
3600   free_charset (mbcset);
3601 #endif /* RE_ENABLE_I18N */
3602   *err = REG_ESPACE;
3603   return NULL;
3604 }
3605 
3606 /* This is intended for the expressions like "a{1,3}".
3607    Fetch a number from `input', and return the number.
3608    Return REG_MISSING if the number field is empty like "{,1}".
3609    Return REG_ERROR if an error occurred.  */
3610 
3611 static Idx
fetch_number(re_string_t * input,re_token_t * token,reg_syntax_t syntax)3612 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3613 {
3614   Idx num = REG_MISSING;
3615   unsigned char c;
3616   while (1)
3617     {
3618       fetch_token (token, input, syntax);
3619       c = token->opr.c;
3620       if (BE (token->type == END_OF_RE, 0))
3621 	return REG_ERROR;
3622       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3623 	break;
3624       num = ((token->type != CHARACTER || c < '0' || '9' < c
3625 	      || num == REG_ERROR)
3626 	     ? REG_ERROR
3627 	     : ((num == REG_MISSING) ? c - '0' : num * 10 + c - '0'));
3628       num = (num > REG_DUP_MAX) ? REG_ERROR : num;
3629     }
3630   return num;
3631 }
3632 
3633 #ifdef RE_ENABLE_I18N
3634 static void
free_charset(re_charset_t * cset)3635 free_charset (re_charset_t *cset)
3636 {
3637   re_free (cset->mbchars);
3638 # ifdef _LIBC
3639   re_free (cset->coll_syms);
3640   re_free (cset->equiv_classes);
3641   re_free (cset->range_starts);
3642   re_free (cset->range_ends);
3643 # endif
3644   re_free (cset->char_classes);
3645   re_free (cset);
3646 }
3647 #endif /* RE_ENABLE_I18N */
3648 
3649 /* Functions for binary tree operation.  */
3650 
3651 /* Create a tree node.  */
3652 
3653 static bin_tree_t *
create_tree(re_dfa_t * dfa,bin_tree_t * left,bin_tree_t * right,re_token_type_t type)3654 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3655 	     re_token_type_t type)
3656 {
3657   re_token_t t;
3658   t.type = type;
3659   return create_token_tree (dfa, left, right, &t);
3660 }
3661 
3662 static bin_tree_t *
create_token_tree(re_dfa_t * dfa,bin_tree_t * left,bin_tree_t * right,const re_token_t * token)3663 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3664 		   const re_token_t *token)
3665 {
3666   bin_tree_t *tree;
3667   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3668     {
3669       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3670 
3671       if (storage == NULL)
3672 	return NULL;
3673       storage->next = dfa->str_tree_storage;
3674       dfa->str_tree_storage = storage;
3675       dfa->str_tree_storage_idx = 0;
3676     }
3677   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3678 
3679   tree->parent = NULL;
3680   tree->left = left;
3681   tree->right = right;
3682   tree->token = *token;
3683   tree->token.duplicated = 0;
3684   tree->token.opt_subexp = 0;
3685   tree->first = NULL;
3686   tree->next = NULL;
3687   tree->node_idx = REG_MISSING;
3688 
3689   if (left != NULL)
3690     left->parent = tree;
3691   if (right != NULL)
3692     right->parent = tree;
3693   return tree;
3694 }
3695 
3696 /* Mark the tree SRC as an optional subexpression.
3697    To be called from preorder or postorder.  */
3698 
3699 static reg_errcode_t
mark_opt_subexp(void * extra,bin_tree_t * node)3700 mark_opt_subexp (void *extra, bin_tree_t *node)
3701 {
3702   Idx idx = (Idx) (long) extra;
3703   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3704     node->token.opt_subexp = 1;
3705 
3706   return REG_NOERROR;
3707 }
3708 
3709 /* Free the allocated memory inside NODE. */
3710 
3711 static void
free_token(re_token_t * node)3712 free_token (re_token_t *node)
3713 {
3714 #ifdef RE_ENABLE_I18N
3715   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3716     free_charset (node->opr.mbcset);
3717   else
3718 #endif /* RE_ENABLE_I18N */
3719     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3720       re_free (node->opr.sbcset);
3721 }
3722 
3723 /* Worker function for tree walking.  Free the allocated memory inside NODE
3724    and its children. */
3725 
3726 static reg_errcode_t
free_tree(void * extra,bin_tree_t * node)3727 free_tree (void *extra, bin_tree_t *node)
3728 {
3729   free_token (&node->token);
3730   return REG_NOERROR;
3731 }
3732 
3733 
3734 /* Duplicate the node SRC, and return new node.  This is a preorder
3735    visit similar to the one implemented by the generic visitor, but
3736    we need more infrastructure to maintain two parallel trees --- so,
3737    it's easier to duplicate.  */
3738 
3739 static bin_tree_t *
duplicate_tree(const bin_tree_t * root,re_dfa_t * dfa)3740 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3741 {
3742   const bin_tree_t *node;
3743   bin_tree_t *dup_root;
3744   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3745 
3746   for (node = root; ; )
3747     {
3748       /* Create a new tree and link it back to the current parent.  */
3749       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3750       if (*p_new == NULL)
3751 	return NULL;
3752       (*p_new)->parent = dup_node;
3753       (*p_new)->token.duplicated = 1;
3754       dup_node = *p_new;
3755 
3756       /* Go to the left node, or up and to the right.  */
3757       if (node->left)
3758 	{
3759 	  node = node->left;
3760 	  p_new = &dup_node->left;
3761 	}
3762       else
3763 	{
3764 	  const bin_tree_t *prev = NULL;
3765 	  while (node->right == prev || node->right == NULL)
3766 	    {
3767 	      prev = node;
3768 	      node = node->parent;
3769 	      dup_node = dup_node->parent;
3770 	      if (!node)
3771 	        return dup_root;
3772 	    }
3773 	  node = node->right;
3774 	  p_new = &dup_node->right;
3775 	}
3776     }
3777 }
3778 
3779 __RCSID("$MirOS: src/gnu/usr.bin/cvs/lib/regcomp.c,v 1.2 2011/07/28 15:54:33 tg Exp $");
3780