1 /* Extended regular expression matching and search library.
2    Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3    This file is part of the GNU C Library.
4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5 
6    The GNU C Library is free software; you can redistribute it and/or
7    modify it under the terms of the GNU Lesser General Public
8    License as published by the Free Software Foundation; either
9    version 2.1 of the License, or (at your option) any later version.
10 
11    The GNU C Library is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14    Lesser General Public License for more details.
15 
16    You should have received a copy of the GNU Lesser General Public
17    License along with the GNU C Library; if not, write to the Free
18    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19    02111-1307 USA.  */
20 
21 static void re_string_construct_common (const char *str, int len,
22 					re_string_t *pstr,
23 					RE_TRANSLATE_TYPE trans, int icase,
24 					const re_dfa_t *dfa) internal_function;
25 #ifdef RE_ENABLE_I18N
26 static int re_string_skip_chars (re_string_t *pstr, int new_raw_idx,
27 				 wint_t *last_wc) internal_function;
28 #endif /* RE_ENABLE_I18N */
29 static reg_errcode_t register_state (re_dfa_t *dfa, re_dfastate_t *newstate,
30 				     unsigned int hash) internal_function;
31 static re_dfastate_t *create_ci_newstate (re_dfa_t *dfa,
32 					  const re_node_set *nodes,
33 					  unsigned int hash) internal_function;
34 static re_dfastate_t *create_cd_newstate (re_dfa_t *dfa,
35 					  const re_node_set *nodes,
36 					  unsigned int context,
37 					  unsigned int hash) internal_function;
38 static unsigned int inline calc_state_hash (const re_node_set *nodes,
39 					    unsigned int context) internal_function;
40 
41 /* Functions for string operation.  */
42 
43 /* This function allocate the buffers.  It is necessary to call
44    re_string_reconstruct before using the object.  */
45 
46 static reg_errcode_t
re_string_allocate(pstr,str,len,init_len,trans,icase,dfa)47 re_string_allocate (pstr, str, len, init_len, trans, icase, dfa)
48      re_string_t *pstr;
49      const char *str;
50      int len, init_len, icase;
51      RE_TRANSLATE_TYPE trans;
52      const re_dfa_t *dfa;
53 {
54   reg_errcode_t ret;
55   int init_buf_len;
56 
57   /* Ensure at least one character fits into the buffers.  */
58   if (init_len < dfa->mb_cur_max)
59     init_len = dfa->mb_cur_max;
60   init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
61   re_string_construct_common (str, len, pstr, trans, icase, dfa);
62 
63   ret = re_string_realloc_buffers (pstr, init_buf_len);
64   if (BE (ret != REG_NOERROR, 0))
65     return ret;
66 
67   pstr->word_char = dfa->word_char;
68   pstr->word_ops_used = dfa->word_ops_used;
69   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
70   pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
71   pstr->valid_raw_len = pstr->valid_len;
72   return REG_NOERROR;
73 }
74 
75 /* This function allocate the buffers, and initialize them.  */
76 
77 static reg_errcode_t
re_string_construct(pstr,str,len,trans,icase,dfa)78 re_string_construct (pstr, str, len, trans, icase, dfa)
79      re_string_t *pstr;
80      const char *str;
81      int len, icase;
82      RE_TRANSLATE_TYPE trans;
83      const re_dfa_t *dfa;
84 {
85   reg_errcode_t ret;
86   memset (pstr, '\0', sizeof (re_string_t));
87   re_string_construct_common (str, len, pstr, trans, icase, dfa);
88 
89   if (len > 0)
90     {
91       ret = re_string_realloc_buffers (pstr, len + 1);
92       if (BE (ret != REG_NOERROR, 0))
93 	return ret;
94     }
95   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
96 
97   if (icase)
98     {
99 #ifdef RE_ENABLE_I18N
100       if (dfa->mb_cur_max > 1)
101 	{
102 	  while (1)
103 	    {
104 	      ret = build_wcs_upper_buffer (pstr);
105 	      if (BE (ret != REG_NOERROR, 0))
106 		return ret;
107 	      if (pstr->valid_raw_len >= len)
108 		break;
109 	      if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
110 		break;
111 	      ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
112 	      if (BE (ret != REG_NOERROR, 0))
113 		return ret;
114 	    }
115 	}
116       else
117 #endif /* RE_ENABLE_I18N  */
118 	build_upper_buffer (pstr);
119     }
120   else
121     {
122 #ifdef RE_ENABLE_I18N
123       if (dfa->mb_cur_max > 1)
124 	build_wcs_buffer (pstr);
125       else
126 #endif /* RE_ENABLE_I18N  */
127 	{
128 	  if (trans != NULL)
129 	    re_string_translate_buffer (pstr);
130 	  else
131 	    {
132 	      pstr->valid_len = pstr->bufs_len;
133 	      pstr->valid_raw_len = pstr->bufs_len;
134 	    }
135 	}
136     }
137 
138   return REG_NOERROR;
139 }
140 
141 /* Helper functions for re_string_allocate, and re_string_construct.  */
142 
143 static reg_errcode_t
re_string_realloc_buffers(pstr,new_buf_len)144 re_string_realloc_buffers (pstr, new_buf_len)
145      re_string_t *pstr;
146      int new_buf_len;
147 {
148 #ifdef RE_ENABLE_I18N
149   if (pstr->mb_cur_max > 1)
150     {
151       wint_t *new_array = re_realloc (pstr->wcs, wint_t, new_buf_len);
152       if (BE (new_array == NULL, 0))
153 	return REG_ESPACE;
154       pstr->wcs = new_array;
155       if (pstr->offsets != NULL)
156 	{
157 	  int *new_array = re_realloc (pstr->offsets, int, new_buf_len);
158 	  if (BE (new_array == NULL, 0))
159 	    return REG_ESPACE;
160 	  pstr->offsets = new_array;
161 	}
162     }
163 #endif /* RE_ENABLE_I18N  */
164   if (pstr->mbs_allocated)
165     {
166       unsigned char *new_array = re_realloc (pstr->mbs, unsigned char,
167 					     new_buf_len);
168       if (BE (new_array == NULL, 0))
169 	return REG_ESPACE;
170       pstr->mbs = new_array;
171     }
172   pstr->bufs_len = new_buf_len;
173   return REG_NOERROR;
174 }
175 
176 
177 static void
re_string_construct_common(str,len,pstr,trans,icase,dfa)178 re_string_construct_common (str, len, pstr, trans, icase, dfa)
179      const char *str;
180      int len;
181      re_string_t *pstr;
182      RE_TRANSLATE_TYPE trans;
183      int icase;
184      const re_dfa_t *dfa;
185 {
186   pstr->raw_mbs = (const unsigned char *) str;
187   pstr->len = len;
188   pstr->raw_len = len;
189   pstr->trans = (unsigned RE_TRANSLATE_TYPE) trans;
190   pstr->icase = icase ? 1 : 0;
191   pstr->mbs_allocated = (trans != NULL || icase);
192   pstr->mb_cur_max = dfa->mb_cur_max;
193   pstr->is_utf8 = dfa->is_utf8;
194   pstr->map_notascii = dfa->map_notascii;
195   pstr->stop = pstr->len;
196   pstr->raw_stop = pstr->stop;
197 }
198 
199 #ifdef RE_ENABLE_I18N
200 
201 /* Build wide character buffer PSTR->WCS.
202    If the byte sequence of the string are:
203      <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
204    Then wide character buffer will be:
205      <wc1>   , WEOF    , <wc2>   , WEOF    , <wc3>
206    We use WEOF for padding, they indicate that the position isn't
207    a first byte of a multibyte character.
208 
209    Note that this function assumes PSTR->VALID_LEN elements are already
210    built and starts from PSTR->VALID_LEN.  */
211 
212 static void
build_wcs_buffer(pstr)213 build_wcs_buffer (pstr)
214      re_string_t *pstr;
215 {
216 #ifdef _LIBC
217   unsigned char buf[MB_CUR_MAX];
218   assert (MB_CUR_MAX >= pstr->mb_cur_max);
219 #else
220   unsigned char buf[64];
221 #endif
222   mbstate_t prev_st;
223   int byte_idx, end_idx, remain_len;
224   size_t mbclen;
225 
226   /* Build the buffers from pstr->valid_len to either pstr->len or
227      pstr->bufs_len.  */
228   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
229   for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
230     {
231       wchar_t wc;
232       const char *p;
233 
234       remain_len = end_idx - byte_idx;
235       prev_st = pstr->cur_state;
236       /* Apply the translation if we need.  */
237       if (BE (pstr->trans != NULL, 0))
238 	{
239 	  int i, ch;
240 
241 	  for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
242 	    {
243 	      ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
244 	      buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
245 	    }
246 	  p = (const char *) buf;
247 	}
248       else
249 	p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
250       mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
251       if (BE (mbclen == (size_t) -2, 0))
252 	{
253 	  /* The buffer doesn't have enough space, finish to build.  */
254 	  pstr->cur_state = prev_st;
255 	  break;
256 	}
257       else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
258 	{
259 	  /* We treat these cases as a singlebyte character.  */
260 	  mbclen = 1;
261 	  wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
262 	  if (BE (pstr->trans != NULL, 0))
263 	    wc = pstr->trans[wc];
264 	  pstr->cur_state = prev_st;
265 	}
266 
267       /* Write wide character and padding.  */
268       pstr->wcs[byte_idx++] = wc;
269       /* Write paddings.  */
270       for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
271 	pstr->wcs[byte_idx++] = WEOF;
272     }
273   pstr->valid_len = byte_idx;
274   pstr->valid_raw_len = byte_idx;
275 }
276 
277 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
278    but for REG_ICASE.  */
279 
280 static int
build_wcs_upper_buffer(pstr)281 build_wcs_upper_buffer (pstr)
282      re_string_t *pstr;
283 {
284   mbstate_t prev_st;
285   int src_idx, byte_idx, end_idx, remain_len;
286   size_t mbclen;
287 #ifdef _LIBC
288   char buf[MB_CUR_MAX];
289   assert (MB_CUR_MAX >= pstr->mb_cur_max);
290 #else
291   char buf[64];
292 #endif
293 
294   byte_idx = pstr->valid_len;
295   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
296 
297   /* The following optimization assumes that ASCII characters can be
298      mapped to wide characters with a simple cast.  */
299   if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
300     {
301       while (byte_idx < end_idx)
302 	{
303 	  wchar_t wc;
304 
305 	  if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
306 	      && mbsinit (&pstr->cur_state))
307 	    {
308 	      /* In case of a singlebyte character.  */
309 	      pstr->mbs[byte_idx]
310 		= toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
311 	      /* The next step uses the assumption that wchar_t is encoded
312 		 ASCII-safe: all ASCII values can be converted like this.  */
313 	      pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
314 	      ++byte_idx;
315 	      continue;
316 	    }
317 
318 	  remain_len = end_idx - byte_idx;
319 	  prev_st = pstr->cur_state;
320 	  mbclen = mbrtowc (&wc,
321 			    ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
322 			     + byte_idx), remain_len, &pstr->cur_state);
323 	  if (BE (mbclen + 2 > 2, 1))
324 	    {
325 	      wchar_t wcu = wc;
326 	      if (iswlower (wc))
327 		{
328 		  size_t mbcdlen;
329 
330 		  wcu = towupper (wc);
331 		  mbcdlen = wcrtomb (buf, wcu, &prev_st);
332 		  if (BE (mbclen == mbcdlen, 1))
333 		    memcpy (pstr->mbs + byte_idx, buf, mbclen);
334 		  else
335 		    {
336 		      src_idx = byte_idx;
337 		      goto offsets_needed;
338 		    }
339 		}
340 	      else
341 		memcpy (pstr->mbs + byte_idx,
342 			pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
343 	      pstr->wcs[byte_idx++] = wcu;
344 	      /* Write paddings.  */
345 	      for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
346 		pstr->wcs[byte_idx++] = WEOF;
347 	    }
348 	  else if (mbclen == (size_t) -1 || mbclen == 0)
349 	    {
350 	      /* It is an invalid character or '\0'.  Just use the byte.  */
351 	      int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
352 	      pstr->mbs[byte_idx] = ch;
353 	      /* And also cast it to wide char.  */
354 	      pstr->wcs[byte_idx++] = (wchar_t) ch;
355 	      if (BE (mbclen == (size_t) -1, 0))
356 		pstr->cur_state = prev_st;
357 	    }
358 	  else
359 	    {
360 	      /* The buffer doesn't have enough space, finish to build.  */
361 	      pstr->cur_state = prev_st;
362 	      break;
363 	    }
364 	}
365       pstr->valid_len = byte_idx;
366       pstr->valid_raw_len = byte_idx;
367       return REG_NOERROR;
368     }
369   else
370     for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
371       {
372 	wchar_t wc;
373 	const char *p;
374       offsets_needed:
375 	remain_len = end_idx - byte_idx;
376 	prev_st = pstr->cur_state;
377 	if (BE (pstr->trans != NULL, 0))
378 	  {
379 	    int i, ch;
380 
381 	    for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
382 	      {
383 		ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
384 		buf[i] = pstr->trans[ch];
385 	      }
386 	    p = (const char *) buf;
387 	  }
388 	else
389 	  p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
390 	mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
391 	if (BE (mbclen + 2 > 2, 1))
392 	  {
393 	    wchar_t wcu = wc;
394 	    if (iswlower (wc))
395 	      {
396 		size_t mbcdlen;
397 
398 		wcu = towupper (wc);
399 		mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
400 		if (BE (mbclen == mbcdlen, 1))
401 		  memcpy (pstr->mbs + byte_idx, buf, mbclen);
402 		else if (mbcdlen != (size_t) -1)
403 		  {
404 		    size_t i;
405 
406 		    if (byte_idx + mbcdlen > pstr->bufs_len)
407 		      {
408 			pstr->cur_state = prev_st;
409 			break;
410 		      }
411 
412 		    if (pstr->offsets == NULL)
413 		      {
414 			pstr->offsets = re_malloc (int, pstr->bufs_len);
415 
416 			if (pstr->offsets == NULL)
417 			  return REG_ESPACE;
418 		      }
419 		    if (!pstr->offsets_needed)
420 		      {
421 			for (i = 0; i < (size_t) byte_idx; ++i)
422 			  pstr->offsets[i] = i;
423 			pstr->offsets_needed = 1;
424 		      }
425 
426 		    memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
427 		    pstr->wcs[byte_idx] = wcu;
428 		    pstr->offsets[byte_idx] = src_idx;
429 		    for (i = 1; i < mbcdlen; ++i)
430 		      {
431 			pstr->offsets[byte_idx + i]
432 			  = src_idx + (i < mbclen ? i : mbclen - 1);
433 			pstr->wcs[byte_idx + i] = WEOF;
434 		      }
435 		    pstr->len += mbcdlen - mbclen;
436 		    if (pstr->raw_stop > src_idx)
437 		      pstr->stop += mbcdlen - mbclen;
438 		    end_idx = (pstr->bufs_len > pstr->len)
439 			      ? pstr->len : pstr->bufs_len;
440 		    byte_idx += mbcdlen;
441 		    src_idx += mbclen;
442 		    continue;
443 		  }
444                 else
445                   memcpy (pstr->mbs + byte_idx, p, mbclen);
446 	      }
447 	    else
448 	      memcpy (pstr->mbs + byte_idx, p, mbclen);
449 
450 	    if (BE (pstr->offsets_needed != 0, 0))
451 	      {
452 		size_t i;
453 		for (i = 0; i < mbclen; ++i)
454 		  pstr->offsets[byte_idx + i] = src_idx + i;
455 	      }
456 	    src_idx += mbclen;
457 
458 	    pstr->wcs[byte_idx++] = wcu;
459 	    /* Write paddings.  */
460 	    for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
461 	      pstr->wcs[byte_idx++] = WEOF;
462 	  }
463 	else if (mbclen == (size_t) -1 || mbclen == 0)
464 	  {
465 	    /* It is an invalid character or '\0'.  Just use the byte.  */
466 	    int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
467 
468 	    if (BE (pstr->trans != NULL, 0))
469 	      ch = pstr->trans [ch];
470 	    pstr->mbs[byte_idx] = ch;
471 
472 	    if (BE (pstr->offsets_needed != 0, 0))
473 	      pstr->offsets[byte_idx] = src_idx;
474 	    ++src_idx;
475 
476 	    /* And also cast it to wide char.  */
477 	    pstr->wcs[byte_idx++] = (wchar_t) ch;
478 	    if (BE (mbclen == (size_t) -1, 0))
479 	      pstr->cur_state = prev_st;
480 	  }
481 	else
482 	  {
483 	    /* The buffer doesn't have enough space, finish to build.  */
484 	    pstr->cur_state = prev_st;
485 	    break;
486 	  }
487       }
488   pstr->valid_len = byte_idx;
489   pstr->valid_raw_len = src_idx;
490   return REG_NOERROR;
491 }
492 
493 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
494    Return the index.  */
495 
496 static int
re_string_skip_chars(pstr,new_raw_idx,last_wc)497 re_string_skip_chars (pstr, new_raw_idx, last_wc)
498      re_string_t *pstr;
499      int new_raw_idx;
500      wint_t *last_wc;
501 {
502   mbstate_t prev_st;
503   int rawbuf_idx;
504   size_t mbclen;
505   wchar_t wc = 0;
506 
507   /* Skip the characters which are not necessary to check.  */
508   for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
509        rawbuf_idx < new_raw_idx;)
510     {
511       int remain_len;
512       remain_len = pstr->len - rawbuf_idx;
513       prev_st = pstr->cur_state;
514       mbclen = mbrtowc (&wc, (const char *) pstr->raw_mbs + rawbuf_idx,
515 			remain_len, &pstr->cur_state);
516       if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
517 	{
518 	  /* We treat these cases as a singlebyte character.  */
519 	  mbclen = 1;
520 	  pstr->cur_state = prev_st;
521 	}
522       /* Then proceed the next character.  */
523       rawbuf_idx += mbclen;
524     }
525   *last_wc = (wint_t) wc;
526   return rawbuf_idx;
527 }
528 #endif /* RE_ENABLE_I18N  */
529 
530 /* Build the buffer PSTR->MBS, and apply the translation if we need.
531    This function is used in case of REG_ICASE.  */
532 
533 static void
build_upper_buffer(pstr)534 build_upper_buffer (pstr)
535      re_string_t *pstr;
536 {
537   int char_idx, end_idx;
538   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
539 
540   for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
541     {
542       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
543       if (BE (pstr->trans != NULL, 0))
544 	ch = pstr->trans[ch];
545       if (islower (ch))
546 	pstr->mbs[char_idx] = toupper (ch);
547       else
548 	pstr->mbs[char_idx] = ch;
549     }
550   pstr->valid_len = char_idx;
551   pstr->valid_raw_len = char_idx;
552 }
553 
554 /* Apply TRANS to the buffer in PSTR.  */
555 
556 static void
re_string_translate_buffer(pstr)557 re_string_translate_buffer (pstr)
558      re_string_t *pstr;
559 {
560   int buf_idx, end_idx;
561   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
562 
563   for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
564     {
565       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
566       pstr->mbs[buf_idx] = pstr->trans[ch];
567     }
568 
569   pstr->valid_len = buf_idx;
570   pstr->valid_raw_len = buf_idx;
571 }
572 
573 /* This function re-construct the buffers.
574    Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
575    convert to upper case in case of REG_ICASE, apply translation.  */
576 
577 static reg_errcode_t
re_string_reconstruct(pstr,idx,eflags)578 re_string_reconstruct (pstr, idx, eflags)
579      re_string_t *pstr;
580      int idx, eflags;
581 {
582   int offset = idx - pstr->raw_mbs_idx;
583   if (BE (offset < 0, 0))
584     {
585       /* Reset buffer.  */
586 #ifdef RE_ENABLE_I18N
587       if (pstr->mb_cur_max > 1)
588 	memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
589 #endif /* RE_ENABLE_I18N */
590       pstr->len = pstr->raw_len;
591       pstr->stop = pstr->raw_stop;
592       pstr->valid_len = 0;
593       pstr->raw_mbs_idx = 0;
594       pstr->valid_raw_len = 0;
595       pstr->offsets_needed = 0;
596       pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
597 			   : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
598       if (!pstr->mbs_allocated)
599 	pstr->mbs = (unsigned char *) pstr->raw_mbs;
600       offset = idx;
601     }
602 
603   if (BE (offset != 0, 1))
604     {
605       /* Are the characters which are already checked remain?  */
606       if (BE (offset < pstr->valid_raw_len, 1)
607 #ifdef RE_ENABLE_I18N
608 	  /* Handling this would enlarge the code too much.
609 	     Accept a slowdown in that case.  */
610 	  && pstr->offsets_needed == 0
611 #endif
612 	 )
613 	{
614 	  /* Yes, move them to the front of the buffer.  */
615 	  pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags);
616 #ifdef RE_ENABLE_I18N
617 	  if (pstr->mb_cur_max > 1)
618 	    memmove (pstr->wcs, pstr->wcs + offset,
619 		     (pstr->valid_len - offset) * sizeof (wint_t));
620 #endif /* RE_ENABLE_I18N */
621 	  if (BE (pstr->mbs_allocated, 0))
622 	    memmove (pstr->mbs, pstr->mbs + offset,
623 		     pstr->valid_len - offset);
624 	  pstr->valid_len -= offset;
625 	  pstr->valid_raw_len -= offset;
626 #if DEBUG
627 	  assert (pstr->valid_len > 0);
628 #endif
629 	}
630       else
631 	{
632 	  /* No, skip all characters until IDX.  */
633 #ifdef RE_ENABLE_I18N
634 	  if (BE (pstr->offsets_needed, 0))
635 	    {
636 	      pstr->len = pstr->raw_len - idx + offset;
637 	      pstr->stop = pstr->raw_stop - idx + offset;
638 	      pstr->offsets_needed = 0;
639 	    }
640 #endif
641 	  pstr->valid_len = 0;
642 	  pstr->valid_raw_len = 0;
643 #ifdef RE_ENABLE_I18N
644 	  if (pstr->mb_cur_max > 1)
645 	    {
646 	      int wcs_idx;
647 	      wint_t wc = WEOF;
648 
649 	      if (pstr->is_utf8)
650 		{
651 		  const unsigned char *raw, *p, *q, *end;
652 
653 		  /* Special case UTF-8.  Multi-byte chars start with any
654 		     byte other than 0x80 - 0xbf.  */
655 		  raw = pstr->raw_mbs + pstr->raw_mbs_idx;
656 		  end = raw + (offset - pstr->mb_cur_max);
657 		  for (p = raw + offset - 1; p >= end; --p)
658 		    if ((*p & 0xc0) != 0x80)
659 		      {
660 			mbstate_t cur_state;
661 			wchar_t wc2;
662 			int mlen = raw + pstr->len - p;
663 			unsigned char buf[6];
664 
665 			q = p;
666 			if (BE (pstr->trans != NULL, 0))
667 			  {
668 			    int i = mlen < 6 ? mlen : 6;
669 			    while (--i >= 0)
670 			      buf[i] = pstr->trans[p[i]];
671 			    q = buf;
672 			  }
673 			/* XXX Don't use mbrtowc, we know which conversion
674 			   to use (UTF-8 -> UCS4).  */
675 			memset (&cur_state, 0, sizeof (cur_state));
676 			mlen = (mbrtowc (&wc2, (const char *) p, mlen,
677 					 &cur_state)
678 				- (raw + offset - p));
679 			if (mlen >= 0)
680 			  {
681 			    memset (&pstr->cur_state, '\0',
682 				    sizeof (mbstate_t));
683 			    pstr->valid_len = mlen;
684 			    wc = wc2;
685 			  }
686 			break;
687 		      }
688 		}
689 
690 	      if (wc == WEOF)
691 		pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
692 	      if (BE (pstr->valid_len, 0))
693 		{
694 		  for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
695 		    pstr->wcs[wcs_idx] = WEOF;
696 		  if (pstr->mbs_allocated)
697 		    memset (pstr->mbs, 255, pstr->valid_len);
698 		}
699 	      pstr->valid_raw_len = pstr->valid_len;
700 	      pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
701 				    && IS_WIDE_WORD_CHAR (wc))
702 				   ? CONTEXT_WORD
703 				   : ((IS_WIDE_NEWLINE (wc)
704 				       && pstr->newline_anchor)
705 				      ? CONTEXT_NEWLINE : 0));
706 	    }
707 	  else
708 #endif /* RE_ENABLE_I18N */
709 	    {
710 	      int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
711 	      if (pstr->trans)
712 		c = pstr->trans[c];
713 	      pstr->tip_context = (bitset_contain (pstr->word_char, c)
714 				   ? CONTEXT_WORD
715 				   : ((IS_NEWLINE (c) && pstr->newline_anchor)
716 				      ? CONTEXT_NEWLINE : 0));
717 	    }
718 	}
719       if (!BE (pstr->mbs_allocated, 0))
720 	pstr->mbs += offset;
721     }
722   pstr->raw_mbs_idx = idx;
723   pstr->len -= offset;
724   pstr->stop -= offset;
725 
726   /* Then build the buffers.  */
727 #ifdef RE_ENABLE_I18N
728   if (pstr->mb_cur_max > 1)
729     {
730       if (pstr->icase)
731 	{
732 	  int ret = build_wcs_upper_buffer (pstr);
733 	  if (BE (ret != REG_NOERROR, 0))
734 	    return ret;
735 	}
736       else
737 	build_wcs_buffer (pstr);
738     }
739   else
740 #endif /* RE_ENABLE_I18N */
741   if (BE (pstr->mbs_allocated, 0))
742     {
743       if (pstr->icase)
744 	build_upper_buffer (pstr);
745       else if (pstr->trans != NULL)
746 	re_string_translate_buffer (pstr);
747     }
748   else
749     pstr->valid_len = pstr->len;
750 
751   pstr->cur_idx = 0;
752   return REG_NOERROR;
753 }
754 
755 static unsigned char
re_string_peek_byte_case(pstr,idx)756 re_string_peek_byte_case (pstr, idx)
757      const re_string_t *pstr;
758      int idx;
759 {
760   int ch, off;
761 
762   /* Handle the common (easiest) cases first.  */
763   if (BE (!pstr->mbs_allocated, 1))
764     return re_string_peek_byte (pstr, idx);
765 
766 #ifdef RE_ENABLE_I18N
767   if (pstr->mb_cur_max > 1
768       && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
769     return re_string_peek_byte (pstr, idx);
770 #endif
771 
772   off = pstr->cur_idx + idx;
773 #ifdef RE_ENABLE_I18N
774   if (pstr->offsets_needed)
775     off = pstr->offsets[off];
776 #endif
777 
778   ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
779 
780 #ifdef RE_ENABLE_I18N
781   /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
782      this function returns CAPITAL LETTER I instead of first byte of
783      DOTLESS SMALL LETTER I.  The latter would confuse the parser,
784      since peek_byte_case doesn't advance cur_idx in any way.  */
785   if (pstr->offsets_needed && !isascii (ch))
786     return re_string_peek_byte (pstr, idx);
787 #endif
788 
789   return ch;
790 }
791 
792 static unsigned char
re_string_fetch_byte_case(pstr)793 re_string_fetch_byte_case (pstr)
794      re_string_t *pstr;
795 {
796   if (BE (!pstr->mbs_allocated, 1))
797     return re_string_fetch_byte (pstr);
798 
799 #ifdef RE_ENABLE_I18N
800   if (pstr->offsets_needed)
801     {
802       int off, ch;
803 
804       /* For tr_TR.UTF-8 [[:islower:]] there is
805 	 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs.  Skip
806 	 in that case the whole multi-byte character and return
807 	 the original letter.  On the other side, with
808 	 [[: DOTLESS SMALL LETTER I return [[:I, as doing
809 	 anything else would complicate things too much.  */
810 
811       if (!re_string_first_byte (pstr, pstr->cur_idx))
812 	return re_string_fetch_byte (pstr);
813 
814       off = pstr->offsets[pstr->cur_idx];
815       ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
816 
817       if (! isascii (ch))
818 	return re_string_fetch_byte (pstr);
819 
820       re_string_skip_bytes (pstr,
821 			    re_string_char_size_at (pstr, pstr->cur_idx));
822       return ch;
823     }
824 #endif
825 
826   return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
827 }
828 
829 static void
re_string_destruct(pstr)830 re_string_destruct (pstr)
831      re_string_t *pstr;
832 {
833 #ifdef RE_ENABLE_I18N
834   re_free (pstr->wcs);
835   re_free (pstr->offsets);
836 #endif /* RE_ENABLE_I18N  */
837   if (pstr->mbs_allocated)
838     re_free (pstr->mbs);
839 }
840 
841 /* Return the context at IDX in INPUT.  */
842 
843 static unsigned int
re_string_context_at(input,idx,eflags)844 re_string_context_at (input, idx, eflags)
845      const re_string_t *input;
846      int idx, eflags;
847 {
848   int c;
849   if (BE (idx < 0, 0))
850     /* In this case, we use the value stored in input->tip_context,
851        since we can't know the character in input->mbs[-1] here.  */
852     return input->tip_context;
853   if (BE (idx == input->len, 0))
854     return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
855 	    : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
856 #ifdef RE_ENABLE_I18N
857   if (input->mb_cur_max > 1)
858     {
859       wint_t wc;
860       int wc_idx = idx;
861       while(input->wcs[wc_idx] == WEOF)
862 	{
863 #ifdef DEBUG
864 	  /* It must not happen.  */
865 	  assert (wc_idx >= 0);
866 #endif
867 	  --wc_idx;
868 	  if (wc_idx < 0)
869 	    return input->tip_context;
870 	}
871       wc = input->wcs[wc_idx];
872       if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
873 	return CONTEXT_WORD;
874       return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
875 	      ? CONTEXT_NEWLINE : 0);
876     }
877   else
878 #endif
879     {
880       c = re_string_byte_at (input, idx);
881       if (bitset_contain (input->word_char, c))
882 	return CONTEXT_WORD;
883       return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
884     }
885 }
886 
887 /* Functions for set operation.  */
888 
889 static reg_errcode_t
re_node_set_alloc(set,size)890 re_node_set_alloc (set, size)
891      re_node_set *set;
892      int size;
893 {
894   set->alloc = size;
895   set->nelem = 0;
896   set->elems = re_malloc (int, size);
897   if (BE (set->elems == NULL, 0))
898     return REG_ESPACE;
899   return REG_NOERROR;
900 }
901 
902 static reg_errcode_t
re_node_set_init_1(set,elem)903 re_node_set_init_1 (set, elem)
904      re_node_set *set;
905      int elem;
906 {
907   set->alloc = 1;
908   set->nelem = 1;
909   set->elems = re_malloc (int, 1);
910   if (BE (set->elems == NULL, 0))
911     {
912       set->alloc = set->nelem = 0;
913       return REG_ESPACE;
914     }
915   set->elems[0] = elem;
916   return REG_NOERROR;
917 }
918 
919 static reg_errcode_t
re_node_set_init_2(set,elem1,elem2)920 re_node_set_init_2 (set, elem1, elem2)
921      re_node_set *set;
922      int elem1, elem2;
923 {
924   set->alloc = 2;
925   set->elems = re_malloc (int, 2);
926   if (BE (set->elems == NULL, 0))
927     return REG_ESPACE;
928   if (elem1 == elem2)
929     {
930       set->nelem = 1;
931       set->elems[0] = elem1;
932     }
933   else
934     {
935       set->nelem = 2;
936       if (elem1 < elem2)
937 	{
938 	  set->elems[0] = elem1;
939 	  set->elems[1] = elem2;
940 	}
941       else
942 	{
943 	  set->elems[0] = elem2;
944 	  set->elems[1] = elem1;
945 	}
946     }
947   return REG_NOERROR;
948 }
949 
950 static reg_errcode_t
re_node_set_init_copy(dest,src)951 re_node_set_init_copy (dest, src)
952      re_node_set *dest;
953      const re_node_set *src;
954 {
955   dest->nelem = src->nelem;
956   if (src->nelem > 0)
957     {
958       dest->alloc = dest->nelem;
959       dest->elems = re_malloc (int, dest->alloc);
960       if (BE (dest->elems == NULL, 0))
961 	{
962 	  dest->alloc = dest->nelem = 0;
963 	  return REG_ESPACE;
964 	}
965       memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
966     }
967   else
968     re_node_set_init_empty (dest);
969   return REG_NOERROR;
970 }
971 
972 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
973    DEST. Return value indicate the error code or REG_NOERROR if succeeded.
974    Note: We assume dest->elems is NULL, when dest->alloc is 0.  */
975 
976 static reg_errcode_t
re_node_set_add_intersect(dest,src1,src2)977 re_node_set_add_intersect (dest, src1, src2)
978      re_node_set *dest;
979      const re_node_set *src1, *src2;
980 {
981   int i1, i2, is, id, delta, sbase;
982   if (src1->nelem == 0 || src2->nelem == 0)
983     return REG_NOERROR;
984 
985   /* We need dest->nelem + 2 * elems_in_intersection; this is a
986      conservative estimate.  */
987   if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
988     {
989       int new_alloc = src1->nelem + src2->nelem + dest->alloc;
990       int *new_elems = re_realloc (dest->elems, int, new_alloc);
991       if (BE (new_elems == NULL, 0))
992         return REG_ESPACE;
993       dest->elems = new_elems;
994       dest->alloc = new_alloc;
995     }
996 
997   /* Find the items in the intersection of SRC1 and SRC2, and copy
998      into the top of DEST those that are not already in DEST itself.  */
999   sbase = dest->nelem + src1->nelem + src2->nelem;
1000   i1 = src1->nelem - 1;
1001   i2 = src2->nelem - 1;
1002   id = dest->nelem - 1;
1003   for (;;)
1004     {
1005       if (src1->elems[i1] == src2->elems[i2])
1006 	{
1007 	  /* Try to find the item in DEST.  Maybe we could binary search?  */
1008 	  while (id >= 0 && dest->elems[id] > src1->elems[i1])
1009 	    --id;
1010 
1011           if (id < 0 || dest->elems[id] != src1->elems[i1])
1012             dest->elems[--sbase] = src1->elems[i1];
1013 
1014 	  if (--i1 < 0 || --i2 < 0)
1015 	    break;
1016 	}
1017 
1018       /* Lower the highest of the two items.  */
1019       else if (src1->elems[i1] < src2->elems[i2])
1020 	{
1021 	  if (--i2 < 0)
1022 	    break;
1023 	}
1024       else
1025 	{
1026 	  if (--i1 < 0)
1027 	    break;
1028 	}
1029     }
1030 
1031   id = dest->nelem - 1;
1032   is = dest->nelem + src1->nelem + src2->nelem - 1;
1033   delta = is - sbase + 1;
1034 
1035   /* Now copy.  When DELTA becomes zero, the remaining
1036      DEST elements are already in place; this is more or
1037      less the same loop that is in re_node_set_merge.  */
1038   dest->nelem += delta;
1039   if (delta > 0 && id >= 0)
1040     for (;;)
1041       {
1042         if (dest->elems[is] > dest->elems[id])
1043           {
1044             /* Copy from the top.  */
1045             dest->elems[id + delta--] = dest->elems[is--];
1046             if (delta == 0)
1047               break;
1048           }
1049         else
1050           {
1051             /* Slide from the bottom.  */
1052             dest->elems[id + delta] = dest->elems[id];
1053             if (--id < 0)
1054               break;
1055           }
1056       }
1057 
1058   /* Copy remaining SRC elements.  */
1059   memcpy (dest->elems, dest->elems + sbase, delta * sizeof (int));
1060 
1061   return REG_NOERROR;
1062 }
1063 
1064 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1065    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1066 
1067 static reg_errcode_t
re_node_set_init_union(dest,src1,src2)1068 re_node_set_init_union (dest, src1, src2)
1069      re_node_set *dest;
1070      const re_node_set *src1, *src2;
1071 {
1072   int i1, i2, id;
1073   if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1074     {
1075       dest->alloc = src1->nelem + src2->nelem;
1076       dest->elems = re_malloc (int, dest->alloc);
1077       if (BE (dest->elems == NULL, 0))
1078 	return REG_ESPACE;
1079     }
1080   else
1081     {
1082       if (src1 != NULL && src1->nelem > 0)
1083 	return re_node_set_init_copy (dest, src1);
1084       else if (src2 != NULL && src2->nelem > 0)
1085 	return re_node_set_init_copy (dest, src2);
1086       else
1087 	re_node_set_init_empty (dest);
1088       return REG_NOERROR;
1089     }
1090   for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1091     {
1092       if (src1->elems[i1] > src2->elems[i2])
1093 	{
1094 	  dest->elems[id++] = src2->elems[i2++];
1095 	  continue;
1096 	}
1097       if (src1->elems[i1] == src2->elems[i2])
1098 	++i2;
1099       dest->elems[id++] = src1->elems[i1++];
1100     }
1101   if (i1 < src1->nelem)
1102     {
1103       memcpy (dest->elems + id, src1->elems + i1,
1104 	     (src1->nelem - i1) * sizeof (int));
1105       id += src1->nelem - i1;
1106     }
1107   else if (i2 < src2->nelem)
1108     {
1109       memcpy (dest->elems + id, src2->elems + i2,
1110 	     (src2->nelem - i2) * sizeof (int));
1111       id += src2->nelem - i2;
1112     }
1113   dest->nelem = id;
1114   return REG_NOERROR;
1115 }
1116 
1117 /* Calculate the union set of the sets DEST and SRC. And store it to
1118    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1119 
1120 static reg_errcode_t
re_node_set_merge(dest,src)1121 re_node_set_merge (dest, src)
1122      re_node_set *dest;
1123      const re_node_set *src;
1124 {
1125   int is, id, sbase, delta;
1126   if (src == NULL || src->nelem == 0)
1127     return REG_NOERROR;
1128   if (dest->alloc < 2 * src->nelem + dest->nelem)
1129     {
1130       int new_alloc = 2 * (src->nelem + dest->alloc);
1131       int *new_buffer = re_realloc (dest->elems, int, new_alloc);
1132       if (BE (new_buffer == NULL, 0))
1133 	return REG_ESPACE;
1134       dest->elems = new_buffer;
1135       dest->alloc = new_alloc;
1136     }
1137 
1138   if (BE (dest->nelem == 0, 0))
1139     {
1140       dest->nelem = src->nelem;
1141       memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1142       return REG_NOERROR;
1143     }
1144 
1145   /* Copy into the top of DEST the items of SRC that are not
1146      found in DEST.  Maybe we could binary search in DEST?  */
1147   for (sbase = dest->nelem + 2 * src->nelem,
1148        is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1149     {
1150       if (dest->elems[id] == src->elems[is])
1151         is--, id--;
1152       else if (dest->elems[id] < src->elems[is])
1153         dest->elems[--sbase] = src->elems[is--];
1154       else /* if (dest->elems[id] > src->elems[is]) */
1155         --id;
1156     }
1157 
1158   if (is >= 0)
1159     {
1160       /* If DEST is exhausted, the remaining items of SRC must be unique.  */
1161       sbase -= is + 1;
1162       memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (int));
1163     }
1164 
1165   id = dest->nelem - 1;
1166   is = dest->nelem + 2 * src->nelem - 1;
1167   delta = is - sbase + 1;
1168   if (delta == 0)
1169     return REG_NOERROR;
1170 
1171   /* Now copy.  When DELTA becomes zero, the remaining
1172      DEST elements are already in place.  */
1173   dest->nelem += delta;
1174   for (;;)
1175     {
1176       if (dest->elems[is] > dest->elems[id])
1177         {
1178 	  /* Copy from the top.  */
1179           dest->elems[id + delta--] = dest->elems[is--];
1180 	  if (delta == 0)
1181 	    break;
1182 	}
1183       else
1184         {
1185           /* Slide from the bottom.  */
1186           dest->elems[id + delta] = dest->elems[id];
1187 	  if (--id < 0)
1188 	    {
1189 	      /* Copy remaining SRC elements.  */
1190 	      memcpy (dest->elems, dest->elems + sbase,
1191 	              delta * sizeof (int));
1192 	      break;
1193 	    }
1194 	}
1195     }
1196 
1197   return REG_NOERROR;
1198 }
1199 
1200 /* Insert the new element ELEM to the re_node_set* SET.
1201    SET should not already have ELEM.
1202    return -1 if an error is occured, return 1 otherwise.  */
1203 
1204 static int
re_node_set_insert(set,elem)1205 re_node_set_insert (set, elem)
1206      re_node_set *set;
1207      int elem;
1208 {
1209   int idx;
1210   /* In case the set is empty.  */
1211   if (set->alloc == 0)
1212     {
1213       if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1))
1214 	return 1;
1215       else
1216 	return -1;
1217     }
1218 
1219   if (BE (set->nelem, 0) == 0)
1220     {
1221       /* We already guaranteed above that set->alloc != 0.  */
1222       set->elems[0] = elem;
1223       ++set->nelem;
1224       return 1;
1225     }
1226 
1227   /* Realloc if we need.  */
1228   if (set->alloc == set->nelem)
1229     {
1230       int *new_array;
1231       set->alloc = set->alloc * 2;
1232       new_array = re_realloc (set->elems, int, set->alloc);
1233       if (BE (new_array == NULL, 0))
1234 	return -1;
1235       set->elems = new_array;
1236     }
1237 
1238   /* Move the elements which follows the new element.  Test the
1239      first element separately to skip a check in the inner loop.  */
1240   if (elem < set->elems[0])
1241     {
1242       idx = 0;
1243       for (idx = set->nelem; idx > 0; idx--)
1244         set->elems[idx] = set->elems[idx - 1];
1245     }
1246   else
1247     {
1248       for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1249         set->elems[idx] = set->elems[idx - 1];
1250     }
1251 
1252   /* Insert the new element.  */
1253   set->elems[idx] = elem;
1254   ++set->nelem;
1255   return 1;
1256 }
1257 
1258 /* Insert the new element ELEM to the re_node_set* SET.
1259    SET should not already have any element greater than or equal to ELEM.
1260    Return -1 if an error is occured, return 1 otherwise.  */
1261 
1262 static int
re_node_set_insert_last(set,elem)1263 re_node_set_insert_last (set, elem)
1264      re_node_set *set;
1265      int elem;
1266 {
1267   /* Realloc if we need.  */
1268   if (set->alloc == set->nelem)
1269     {
1270       int *new_array;
1271       set->alloc = (set->alloc + 1) * 2;
1272       new_array = re_realloc (set->elems, int, set->alloc);
1273       if (BE (new_array == NULL, 0))
1274 	return -1;
1275       set->elems = new_array;
1276     }
1277 
1278   /* Insert the new element.  */
1279   set->elems[set->nelem++] = elem;
1280   return 1;
1281 }
1282 
1283 /* Compare two node sets SET1 and SET2.
1284    return 1 if SET1 and SET2 are equivalent, return 0 otherwise.  */
1285 
1286 static int
re_node_set_compare(set1,set2)1287 re_node_set_compare (set1, set2)
1288      const re_node_set *set1, *set2;
1289 {
1290   int i;
1291   if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1292     return 0;
1293   for (i = set1->nelem ; --i >= 0 ; )
1294     if (set1->elems[i] != set2->elems[i])
1295       return 0;
1296   return 1;
1297 }
1298 
1299 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise.  */
1300 
1301 static int
re_node_set_contains(set,elem)1302 re_node_set_contains (set, elem)
1303      const re_node_set *set;
1304      int elem;
1305 {
1306   unsigned int idx, right, mid;
1307   if (set->nelem <= 0)
1308     return 0;
1309 
1310   /* Binary search the element.  */
1311   idx = 0;
1312   right = set->nelem - 1;
1313   while (idx < right)
1314     {
1315       mid = (idx + right) / 2;
1316       if (set->elems[mid] < elem)
1317 	idx = mid + 1;
1318       else
1319 	right = mid;
1320     }
1321   return set->elems[idx] == elem ? idx + 1 : 0;
1322 }
1323 
1324 static void
re_node_set_remove_at(set,idx)1325 re_node_set_remove_at (set, idx)
1326      re_node_set *set;
1327      int idx;
1328 {
1329   if (idx < 0 || idx >= set->nelem)
1330     return;
1331   --set->nelem;
1332   for (; idx < set->nelem; idx++)
1333     set->elems[idx] = set->elems[idx + 1];
1334 }
1335 
1336 
1337 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1338    Or return -1, if an error will be occured.  */
1339 
1340 static int
re_dfa_add_node(dfa,token)1341 re_dfa_add_node (dfa, token)
1342      re_dfa_t *dfa;
1343      re_token_t token;
1344 {
1345   int type = token.type;
1346   if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1347     {
1348       int new_nodes_alloc = dfa->nodes_alloc * 2;
1349       int *new_nexts, *new_indices;
1350       re_node_set *new_edests, *new_eclosures;
1351 
1352       re_token_t *new_array = re_realloc (dfa->nodes, re_token_t,
1353 					  new_nodes_alloc);
1354       if (BE (new_array == NULL, 0))
1355 	return -1;
1356       dfa->nodes = new_array;
1357       new_nexts = re_realloc (dfa->nexts, int, new_nodes_alloc);
1358       new_indices = re_realloc (dfa->org_indices, int, new_nodes_alloc);
1359       new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1360       new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1361       if (BE (new_nexts == NULL || new_indices == NULL
1362 	      || new_edests == NULL || new_eclosures == NULL, 0))
1363 	return -1;
1364       dfa->nexts = new_nexts;
1365       dfa->org_indices = new_indices;
1366       dfa->edests = new_edests;
1367       dfa->eclosures = new_eclosures;
1368       dfa->nodes_alloc = new_nodes_alloc;
1369     }
1370   dfa->nodes[dfa->nodes_len] = token;
1371   dfa->nodes[dfa->nodes_len].constraint = 0;
1372 #ifdef RE_ENABLE_I18N
1373   dfa->nodes[dfa->nodes_len].accept_mb =
1374     (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
1375 #endif
1376   dfa->nexts[dfa->nodes_len] = -1;
1377   re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1378   re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1379   return dfa->nodes_len++;
1380 }
1381 
1382 static unsigned int inline
calc_state_hash(nodes,context)1383 calc_state_hash (nodes, context)
1384      const re_node_set *nodes;
1385      unsigned int context;
1386 {
1387   unsigned int hash = nodes->nelem + context;
1388   int i;
1389   for (i = 0 ; i < nodes->nelem ; i++)
1390     hash += nodes->elems[i];
1391   return hash;
1392 }
1393 
1394 /* Search for the state whose node_set is equivalent to NODES.
1395    Return the pointer to the state, if we found it in the DFA.
1396    Otherwise create the new one and return it.  In case of an error
1397    return NULL and set the error code in ERR.
1398    Note: - We assume NULL as the invalid state, then it is possible that
1399 	   return value is NULL and ERR is REG_NOERROR.
1400 	 - We never return non-NULL value in case of any errors, it is for
1401 	   optimization.  */
1402 
1403 static re_dfastate_t*
re_acquire_state(err,dfa,nodes)1404 re_acquire_state (err, dfa, nodes)
1405      reg_errcode_t *err;
1406      re_dfa_t *dfa;
1407      const re_node_set *nodes;
1408 {
1409   unsigned int hash;
1410   re_dfastate_t *new_state;
1411   struct re_state_table_entry *spot;
1412   int i;
1413   if (BE (nodes->nelem == 0, 0))
1414     {
1415       *err = REG_NOERROR;
1416       return NULL;
1417     }
1418   hash = calc_state_hash (nodes, 0);
1419   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1420 
1421   for (i = 0 ; i < spot->num ; i++)
1422     {
1423       re_dfastate_t *state = spot->array[i];
1424       if (hash != state->hash)
1425 	continue;
1426       if (re_node_set_compare (&state->nodes, nodes))
1427 	return state;
1428     }
1429 
1430   /* There are no appropriate state in the dfa, create the new one.  */
1431   new_state = create_ci_newstate (dfa, nodes, hash);
1432   if (BE (new_state != NULL, 1))
1433     return new_state;
1434   else
1435     {
1436       *err = REG_ESPACE;
1437       return NULL;
1438     }
1439 }
1440 
1441 /* Search for the state whose node_set is equivalent to NODES and
1442    whose context is equivalent to CONTEXT.
1443    Return the pointer to the state, if we found it in the DFA.
1444    Otherwise create the new one and return it.  In case of an error
1445    return NULL and set the error code in ERR.
1446    Note: - We assume NULL as the invalid state, then it is possible that
1447 	   return value is NULL and ERR is REG_NOERROR.
1448 	 - We never return non-NULL value in case of any errors, it is for
1449 	   optimization.  */
1450 
1451 static re_dfastate_t*
re_acquire_state_context(err,dfa,nodes,context)1452 re_acquire_state_context (err, dfa, nodes, context)
1453      reg_errcode_t *err;
1454      re_dfa_t *dfa;
1455      const re_node_set *nodes;
1456      unsigned int context;
1457 {
1458   unsigned int hash;
1459   re_dfastate_t *new_state;
1460   struct re_state_table_entry *spot;
1461   int i;
1462   if (nodes->nelem == 0)
1463     {
1464       *err = REG_NOERROR;
1465       return NULL;
1466     }
1467   hash = calc_state_hash (nodes, context);
1468   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1469 
1470   for (i = 0 ; i < spot->num ; i++)
1471     {
1472       re_dfastate_t *state = spot->array[i];
1473       if (state->hash == hash
1474 	  && state->context == context
1475 	  && re_node_set_compare (state->entrance_nodes, nodes))
1476 	return state;
1477     }
1478   /* There are no appropriate state in `dfa', create the new one.  */
1479   new_state = create_cd_newstate (dfa, nodes, context, hash);
1480   if (BE (new_state != NULL, 1))
1481     return new_state;
1482   else
1483     {
1484       *err = REG_ESPACE;
1485       return NULL;
1486     }
1487 }
1488 
1489 /* Finish initialization of the new state NEWSTATE, and using its hash value
1490    HASH put in the appropriate bucket of DFA's state table.  Return value
1491    indicates the error code if failed.  */
1492 
1493 static reg_errcode_t
register_state(dfa,newstate,hash)1494 register_state (dfa, newstate, hash)
1495      re_dfa_t *dfa;
1496      re_dfastate_t *newstate;
1497      unsigned int hash;
1498 {
1499   struct re_state_table_entry *spot;
1500   reg_errcode_t err;
1501   int i;
1502 
1503   newstate->hash = hash;
1504   err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1505   if (BE (err != REG_NOERROR, 0))
1506     return REG_ESPACE;
1507   for (i = 0; i < newstate->nodes.nelem; i++)
1508     {
1509       int elem = newstate->nodes.elems[i];
1510       if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1511         re_node_set_insert_last (&newstate->non_eps_nodes, elem);
1512     }
1513 
1514   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1515   if (BE (spot->alloc <= spot->num, 0))
1516     {
1517       int new_alloc = 2 * spot->num + 2;
1518       re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1519 					      new_alloc);
1520       if (BE (new_array == NULL, 0))
1521 	return REG_ESPACE;
1522       spot->array = new_array;
1523       spot->alloc = new_alloc;
1524     }
1525   spot->array[spot->num++] = newstate;
1526   return REG_NOERROR;
1527 }
1528 
1529 /* Create the new state which is independ of contexts.
1530    Return the new state if succeeded, otherwise return NULL.  */
1531 
1532 static re_dfastate_t *
create_ci_newstate(dfa,nodes,hash)1533 create_ci_newstate (dfa, nodes, hash)
1534      re_dfa_t *dfa;
1535      const re_node_set *nodes;
1536      unsigned int hash;
1537 {
1538   int i;
1539   reg_errcode_t err;
1540   re_dfastate_t *newstate;
1541 
1542   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1543   if (BE (newstate == NULL, 0))
1544     return NULL;
1545   err = re_node_set_init_copy (&newstate->nodes, nodes);
1546   if (BE (err != REG_NOERROR, 0))
1547     {
1548       re_free (newstate);
1549       return NULL;
1550     }
1551 
1552   newstate->entrance_nodes = &newstate->nodes;
1553   for (i = 0 ; i < nodes->nelem ; i++)
1554     {
1555       re_token_t *node = dfa->nodes + nodes->elems[i];
1556       re_token_type_t type = node->type;
1557       if (type == CHARACTER && !node->constraint)
1558 	continue;
1559 #ifdef RE_ENABLE_I18N
1560       newstate->accept_mb |= node->accept_mb;
1561 #endif /* RE_ENABLE_I18N */
1562 
1563       /* If the state has the halt node, the state is a halt state.  */
1564       if (type == END_OF_RE)
1565 	newstate->halt = 1;
1566       else if (type == OP_BACK_REF)
1567 	newstate->has_backref = 1;
1568       else if (type == ANCHOR || node->constraint)
1569 	newstate->has_constraint = 1;
1570     }
1571   err = register_state (dfa, newstate, hash);
1572   if (BE (err != REG_NOERROR, 0))
1573     {
1574       free_state (newstate);
1575       newstate = NULL;
1576     }
1577   return newstate;
1578 }
1579 
1580 /* Create the new state which is depend on the context CONTEXT.
1581    Return the new state if succeeded, otherwise return NULL.  */
1582 
1583 static re_dfastate_t *
create_cd_newstate(dfa,nodes,context,hash)1584 create_cd_newstate (dfa, nodes, context, hash)
1585      re_dfa_t *dfa;
1586      const re_node_set *nodes;
1587      unsigned int context, hash;
1588 {
1589   int i, nctx_nodes = 0;
1590   reg_errcode_t err;
1591   re_dfastate_t *newstate;
1592 
1593   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1594   if (BE (newstate == NULL, 0))
1595     return NULL;
1596   err = re_node_set_init_copy (&newstate->nodes, nodes);
1597   if (BE (err != REG_NOERROR, 0))
1598     {
1599       re_free (newstate);
1600       return NULL;
1601     }
1602 
1603   newstate->context = context;
1604   newstate->entrance_nodes = &newstate->nodes;
1605 
1606   for (i = 0 ; i < nodes->nelem ; i++)
1607     {
1608       unsigned int constraint = 0;
1609       re_token_t *node = dfa->nodes + nodes->elems[i];
1610       re_token_type_t type = node->type;
1611       if (node->constraint)
1612 	constraint = node->constraint;
1613 
1614       if (type == CHARACTER && !constraint)
1615 	continue;
1616 #ifdef RE_ENABLE_I18N
1617       newstate->accept_mb |= node->accept_mb;
1618 #endif /* RE_ENABLE_I18N */
1619 
1620       /* If the state has the halt node, the state is a halt state.  */
1621       if (type == END_OF_RE)
1622 	newstate->halt = 1;
1623       else if (type == OP_BACK_REF)
1624 	newstate->has_backref = 1;
1625       else if (type == ANCHOR)
1626 	constraint = node->opr.ctx_type;
1627 
1628       if (constraint)
1629 	{
1630 	  if (newstate->entrance_nodes == &newstate->nodes)
1631 	    {
1632 	      newstate->entrance_nodes = re_malloc (re_node_set, 1);
1633 	      if (BE (newstate->entrance_nodes == NULL, 0))
1634 		{
1635 		  free_state (newstate);
1636 		  return NULL;
1637 		}
1638 	      re_node_set_init_copy (newstate->entrance_nodes, nodes);
1639 	      nctx_nodes = 0;
1640 	      newstate->has_constraint = 1;
1641 	    }
1642 
1643 	  if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1644 	    {
1645 	      re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1646 	      ++nctx_nodes;
1647 	    }
1648 	}
1649     }
1650   err = register_state (dfa, newstate, hash);
1651   if (BE (err != REG_NOERROR, 0))
1652     {
1653       free_state (newstate);
1654       newstate = NULL;
1655     }
1656   return  newstate;
1657 }
1658 
1659 static void
free_state(state)1660 free_state (state)
1661      re_dfastate_t *state;
1662 {
1663   re_node_set_free (&state->non_eps_nodes);
1664   re_node_set_free (&state->inveclosure);
1665   if (state->entrance_nodes != &state->nodes)
1666     {
1667       re_node_set_free (state->entrance_nodes);
1668       re_free (state->entrance_nodes);
1669     }
1670   re_node_set_free (&state->nodes);
1671   re_free (state->word_trtable);
1672   re_free (state->trtable);
1673   re_free (state);
1674 }
1675