xref: /NextBSD/contrib/libgnuregex/regex_internal.c (revision eb1a5f8de9f7ea602c373a710f531abbf81141c4)
1 /* Extended regular expression matching and search library.
2    Copyright (C) 2002-2006, 2010, 2011 Free Software Foundation, Inc.
3    This file is part of the GNU C Library.
4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5 
6    The GNU C Library is free software; you can redistribute it and/or
7    modify it under the terms of the GNU Lesser General Public
8    License as published by the Free Software Foundation; either
9    version 2.1 of the License, or (at your option) any later version.
10 
11    The GNU C Library is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14    Lesser General Public License for more details.
15 
16    You should have received a copy of the GNU Lesser General Public
17    License along with the GNU C Library; if not, see
18    <http://www.gnu.org/licenses/>.  */
19 
20 static void re_string_construct_common (const char *str, int len,
21 					re_string_t *pstr,
22 					RE_TRANSLATE_TYPE trans, int icase,
23 					const re_dfa_t *dfa) internal_function;
24 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
25 					  const re_node_set *nodes,
26 					  unsigned int hash) internal_function;
27 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
28 					  const re_node_set *nodes,
29 					  unsigned int context,
30 					  unsigned int hash) internal_function;
31 
32 /* Functions for string operation.  */
33 
34 /* This function allocate the buffers.  It is necessary to call
35    re_string_reconstruct before using the object.  */
36 
37 static reg_errcode_t
38 internal_function __attribute_warn_unused_result__
re_string_allocate(re_string_t * pstr,const char * str,int len,int init_len,RE_TRANSLATE_TYPE trans,int icase,const re_dfa_t * dfa)39 re_string_allocate (re_string_t *pstr, const char *str, int len, int init_len,
40 		    RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
41 {
42   reg_errcode_t ret;
43   int init_buf_len;
44 
45   /* Ensure at least one character fits into the buffers.  */
46   if (init_len < dfa->mb_cur_max)
47     init_len = dfa->mb_cur_max;
48   init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
49   re_string_construct_common (str, len, pstr, trans, icase, dfa);
50 
51   ret = re_string_realloc_buffers (pstr, init_buf_len);
52   if (BE (ret != REG_NOERROR, 0))
53     return ret;
54 
55   pstr->word_char = dfa->word_char;
56   pstr->word_ops_used = dfa->word_ops_used;
57   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
58   pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
59   pstr->valid_raw_len = pstr->valid_len;
60   return REG_NOERROR;
61 }
62 
63 /* This function allocate the buffers, and initialize them.  */
64 
65 static reg_errcode_t
66 internal_function __attribute_warn_unused_result__
re_string_construct(re_string_t * pstr,const char * str,int len,RE_TRANSLATE_TYPE trans,int icase,const re_dfa_t * dfa)67 re_string_construct (re_string_t *pstr, const char *str, int len,
68 		     RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
69 {
70   reg_errcode_t ret;
71   memset (pstr, '\0', sizeof (re_string_t));
72   re_string_construct_common (str, len, pstr, trans, icase, dfa);
73 
74   if (len > 0)
75     {
76       ret = re_string_realloc_buffers (pstr, len + 1);
77       if (BE (ret != REG_NOERROR, 0))
78 	return ret;
79     }
80   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
81 
82   if (icase)
83     {
84 #ifdef RE_ENABLE_I18N
85       if (dfa->mb_cur_max > 1)
86 	{
87 	  while (1)
88 	    {
89 	      ret = build_wcs_upper_buffer (pstr);
90 	      if (BE (ret != REG_NOERROR, 0))
91 		return ret;
92 	      if (pstr->valid_raw_len >= len)
93 		break;
94 	      if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
95 		break;
96 	      ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
97 	      if (BE (ret != REG_NOERROR, 0))
98 		return ret;
99 	    }
100 	}
101       else
102 #endif /* RE_ENABLE_I18N  */
103 	build_upper_buffer (pstr);
104     }
105   else
106     {
107 #ifdef RE_ENABLE_I18N
108       if (dfa->mb_cur_max > 1)
109 	build_wcs_buffer (pstr);
110       else
111 #endif /* RE_ENABLE_I18N  */
112 	{
113 	  if (trans != NULL)
114 	    re_string_translate_buffer (pstr);
115 	  else
116 	    {
117 	      pstr->valid_len = pstr->bufs_len;
118 	      pstr->valid_raw_len = pstr->bufs_len;
119 	    }
120 	}
121     }
122 
123   return REG_NOERROR;
124 }
125 
126 /* Helper functions for re_string_allocate, and re_string_construct.  */
127 
128 static reg_errcode_t
129 internal_function __attribute_warn_unused_result__
re_string_realloc_buffers(re_string_t * pstr,int new_buf_len)130 re_string_realloc_buffers (re_string_t *pstr, int new_buf_len)
131 {
132 #ifdef RE_ENABLE_I18N
133   if (pstr->mb_cur_max > 1)
134     {
135       wint_t *new_wcs;
136 
137       /* Avoid overflow in realloc.  */
138       const size_t max_object_size = MAX (sizeof (wint_t), sizeof (int));
139       if (BE (SIZE_MAX / max_object_size < new_buf_len, 0))
140 	return REG_ESPACE;
141 
142       new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
143       if (BE (new_wcs == NULL, 0))
144 	return REG_ESPACE;
145       pstr->wcs = new_wcs;
146       if (pstr->offsets != NULL)
147 	{
148 	  int *new_offsets = re_realloc (pstr->offsets, int, new_buf_len);
149 	  if (BE (new_offsets == NULL, 0))
150 	    return REG_ESPACE;
151 	  pstr->offsets = new_offsets;
152 	}
153     }
154 #endif /* RE_ENABLE_I18N  */
155   if (pstr->mbs_allocated)
156     {
157       unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
158 					   new_buf_len);
159       if (BE (new_mbs == NULL, 0))
160 	return REG_ESPACE;
161       pstr->mbs = new_mbs;
162     }
163   pstr->bufs_len = new_buf_len;
164   return REG_NOERROR;
165 }
166 
167 
168 static void
169 internal_function
re_string_construct_common(const char * str,int len,re_string_t * pstr,RE_TRANSLATE_TYPE trans,int icase,const re_dfa_t * dfa)170 re_string_construct_common (const char *str, int len, re_string_t *pstr,
171 			    RE_TRANSLATE_TYPE trans, int icase,
172 			    const re_dfa_t *dfa)
173 {
174   pstr->raw_mbs = (const unsigned char *) str;
175   pstr->len = len;
176   pstr->raw_len = len;
177   pstr->trans = trans;
178   pstr->icase = icase ? 1 : 0;
179   pstr->mbs_allocated = (trans != NULL || icase);
180   pstr->mb_cur_max = dfa->mb_cur_max;
181   pstr->is_utf8 = dfa->is_utf8;
182   pstr->map_notascii = dfa->map_notascii;
183   pstr->stop = pstr->len;
184   pstr->raw_stop = pstr->stop;
185 }
186 
187 #ifdef RE_ENABLE_I18N
188 
189 /* Build wide character buffer PSTR->WCS.
190    If the byte sequence of the string are:
191      <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
192    Then wide character buffer will be:
193      <wc1>   , WEOF    , <wc2>   , WEOF    , <wc3>
194    We use WEOF for padding, they indicate that the position isn't
195    a first byte of a multibyte character.
196 
197    Note that this function assumes PSTR->VALID_LEN elements are already
198    built and starts from PSTR->VALID_LEN.  */
199 
200 static void
201 internal_function
build_wcs_buffer(re_string_t * pstr)202 build_wcs_buffer (re_string_t *pstr)
203 {
204 #ifdef _LIBC
205   unsigned char buf[MB_LEN_MAX];
206   assert (MB_LEN_MAX >= pstr->mb_cur_max);
207 #else
208   unsigned char buf[64];
209 #endif
210   mbstate_t prev_st;
211   int byte_idx, end_idx, remain_len;
212   size_t mbclen;
213 
214   /* Build the buffers from pstr->valid_len to either pstr->len or
215      pstr->bufs_len.  */
216   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
217   for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
218     {
219       wchar_t wc;
220       const char *p;
221 
222       remain_len = end_idx - byte_idx;
223       prev_st = pstr->cur_state;
224       /* Apply the translation if we need.  */
225       if (BE (pstr->trans != NULL, 0))
226 	{
227 	  int i, ch;
228 
229 	  for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
230 	    {
231 	      ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
232 	      buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
233 	    }
234 	  p = (const char *) buf;
235 	}
236       else
237 	p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
238       mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
239       if (BE (mbclen == (size_t) -1 || mbclen == 0
240 	      || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len), 0))
241 	{
242 	  /* We treat these cases as a singlebyte character.  */
243 	  mbclen = 1;
244 	  wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
245 	  if (BE (pstr->trans != NULL, 0))
246 	    wc = pstr->trans[wc];
247 	  pstr->cur_state = prev_st;
248 	}
249       else if (BE (mbclen == (size_t) -2, 0))
250 	{
251 	  /* The buffer doesn't have enough space, finish to build.  */
252 	  pstr->cur_state = prev_st;
253 	  break;
254 	}
255 
256       /* Write wide character and padding.  */
257       pstr->wcs[byte_idx++] = wc;
258       /* Write paddings.  */
259       for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
260 	pstr->wcs[byte_idx++] = WEOF;
261     }
262   pstr->valid_len = byte_idx;
263   pstr->valid_raw_len = byte_idx;
264 }
265 
266 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
267    but for REG_ICASE.  */
268 
269 static reg_errcode_t
270 internal_function __attribute_warn_unused_result__
build_wcs_upper_buffer(re_string_t * pstr)271 build_wcs_upper_buffer (re_string_t *pstr)
272 {
273   mbstate_t prev_st;
274   int src_idx, byte_idx, end_idx, remain_len;
275   size_t mbclen;
276 #ifdef _LIBC
277   char buf[MB_LEN_MAX];
278   assert (MB_LEN_MAX >= pstr->mb_cur_max);
279 #else
280   char buf[64];
281 #endif
282 
283   byte_idx = pstr->valid_len;
284   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
285 
286   /* The following optimization assumes that ASCII characters can be
287      mapped to wide characters with a simple cast.  */
288   if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
289     {
290       while (byte_idx < end_idx)
291 	{
292 	  wchar_t wc;
293 
294 	  if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
295 	      && mbsinit (&pstr->cur_state))
296 	    {
297 	      /* In case of a singlebyte character.  */
298 	      pstr->mbs[byte_idx]
299 		= toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
300 	      /* The next step uses the assumption that wchar_t is encoded
301 		 ASCII-safe: all ASCII values can be converted like this.  */
302 	      pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
303 	      ++byte_idx;
304 	      continue;
305 	    }
306 
307 	  remain_len = end_idx - byte_idx;
308 	  prev_st = pstr->cur_state;
309 	  mbclen = __mbrtowc (&wc,
310 			      ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
311 			       + byte_idx), remain_len, &pstr->cur_state);
312 	  if (BE (mbclen + 2 > 2, 1))
313 	    {
314 	      wchar_t wcu = wc;
315 	      if (iswlower (wc))
316 		{
317 		  size_t mbcdlen;
318 
319 		  wcu = towupper (wc);
320 		  mbcdlen = wcrtomb (buf, wcu, &prev_st);
321 		  if (BE (mbclen == mbcdlen, 1))
322 		    memcpy (pstr->mbs + byte_idx, buf, mbclen);
323 		  else
324 		    {
325 		      src_idx = byte_idx;
326 		      goto offsets_needed;
327 		    }
328 		}
329 	      else
330 		memcpy (pstr->mbs + byte_idx,
331 			pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
332 	      pstr->wcs[byte_idx++] = wcu;
333 	      /* Write paddings.  */
334 	      for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
335 		pstr->wcs[byte_idx++] = WEOF;
336 	    }
337 	  else if (mbclen == (size_t) -1 || mbclen == 0
338 		   || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
339 	    {
340 	      /* It is an invalid character, an incomplete character
341 		 at the end of the string, or '\0'.  Just use the byte.  */
342 	      int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
343 	      pstr->mbs[byte_idx] = ch;
344 	      /* And also cast it to wide char.  */
345 	      pstr->wcs[byte_idx++] = (wchar_t) ch;
346 	      if (BE (mbclen == (size_t) -1, 0))
347 		pstr->cur_state = prev_st;
348 	    }
349 	  else
350 	    {
351 	      /* The buffer doesn't have enough space, finish to build.  */
352 	      pstr->cur_state = prev_st;
353 	      break;
354 	    }
355 	}
356       pstr->valid_len = byte_idx;
357       pstr->valid_raw_len = byte_idx;
358       return REG_NOERROR;
359     }
360   else
361     for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
362       {
363 	wchar_t wc;
364 	const char *p;
365       offsets_needed:
366 	remain_len = end_idx - byte_idx;
367 	prev_st = pstr->cur_state;
368 	if (BE (pstr->trans != NULL, 0))
369 	  {
370 	    int i, ch;
371 
372 	    for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
373 	      {
374 		ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
375 		buf[i] = pstr->trans[ch];
376 	      }
377 	    p = (const char *) buf;
378 	  }
379 	else
380 	  p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
381 	mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
382 	if (BE (mbclen + 2 > 2, 1))
383 	  {
384 	    wchar_t wcu = wc;
385 	    if (iswlower (wc))
386 	      {
387 		size_t mbcdlen;
388 
389 		wcu = towupper (wc);
390 		mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
391 		if (BE (mbclen == mbcdlen, 1))
392 		  memcpy (pstr->mbs + byte_idx, buf, mbclen);
393 		else if (mbcdlen != (size_t) -1)
394 		  {
395 		    size_t i;
396 
397 		    if (byte_idx + mbcdlen > pstr->bufs_len)
398 		      {
399 			pstr->cur_state = prev_st;
400 			break;
401 		      }
402 
403 		    if (pstr->offsets == NULL)
404 		      {
405 			pstr->offsets = re_malloc (int, pstr->bufs_len);
406 
407 			if (pstr->offsets == NULL)
408 			  return REG_ESPACE;
409 		      }
410 		    if (!pstr->offsets_needed)
411 		      {
412 			for (i = 0; i < (size_t) byte_idx; ++i)
413 			  pstr->offsets[i] = i;
414 			pstr->offsets_needed = 1;
415 		      }
416 
417 		    memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
418 		    pstr->wcs[byte_idx] = wcu;
419 		    pstr->offsets[byte_idx] = src_idx;
420 		    for (i = 1; i < mbcdlen; ++i)
421 		      {
422 			pstr->offsets[byte_idx + i]
423 			  = src_idx + (i < mbclen ? i : mbclen - 1);
424 			pstr->wcs[byte_idx + i] = WEOF;
425 		      }
426 		    pstr->len += mbcdlen - mbclen;
427 		    if (pstr->raw_stop > src_idx)
428 		      pstr->stop += mbcdlen - mbclen;
429 		    end_idx = (pstr->bufs_len > pstr->len)
430 			      ? pstr->len : pstr->bufs_len;
431 		    byte_idx += mbcdlen;
432 		    src_idx += mbclen;
433 		    continue;
434 		  }
435 		else
436 		  memcpy (pstr->mbs + byte_idx, p, mbclen);
437 	      }
438 	    else
439 	      memcpy (pstr->mbs + byte_idx, p, mbclen);
440 
441 	    if (BE (pstr->offsets_needed != 0, 0))
442 	      {
443 		size_t i;
444 		for (i = 0; i < mbclen; ++i)
445 		  pstr->offsets[byte_idx + i] = src_idx + i;
446 	      }
447 	    src_idx += mbclen;
448 
449 	    pstr->wcs[byte_idx++] = wcu;
450 	    /* Write paddings.  */
451 	    for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
452 	      pstr->wcs[byte_idx++] = WEOF;
453 	  }
454 	else if (mbclen == (size_t) -1 || mbclen == 0
455 		 || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
456 	  {
457 	    /* It is an invalid character or '\0'.  Just use the byte.  */
458 	    int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
459 
460 	    if (BE (pstr->trans != NULL, 0))
461 	      ch = pstr->trans [ch];
462 	    pstr->mbs[byte_idx] = ch;
463 
464 	    if (BE (pstr->offsets_needed != 0, 0))
465 	      pstr->offsets[byte_idx] = src_idx;
466 	    ++src_idx;
467 
468 	    /* And also cast it to wide char.  */
469 	    pstr->wcs[byte_idx++] = (wchar_t) ch;
470 	    if (BE (mbclen == (size_t) -1, 0))
471 	      pstr->cur_state = prev_st;
472 	  }
473 	else
474 	  {
475 	    /* The buffer doesn't have enough space, finish to build.  */
476 	    pstr->cur_state = prev_st;
477 	    break;
478 	  }
479       }
480   pstr->valid_len = byte_idx;
481   pstr->valid_raw_len = src_idx;
482   return REG_NOERROR;
483 }
484 
485 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
486    Return the index.  */
487 
488 static int
489 internal_function
re_string_skip_chars(re_string_t * pstr,int new_raw_idx,wint_t * last_wc)490 re_string_skip_chars (re_string_t *pstr, int new_raw_idx, wint_t *last_wc)
491 {
492   mbstate_t prev_st;
493   int rawbuf_idx;
494   size_t mbclen;
495   wint_t wc = WEOF;
496 
497   /* Skip the characters which are not necessary to check.  */
498   for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
499        rawbuf_idx < new_raw_idx;)
500     {
501       wchar_t wc2;
502       int remain_len = pstr->raw_len - rawbuf_idx;
503       prev_st = pstr->cur_state;
504       mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
505 			  remain_len, &pstr->cur_state);
506       if (BE ((ssize_t) mbclen <= 0, 0))
507 	{
508 	  /* We treat these cases as a single byte character.  */
509 	  if (mbclen == 0 || remain_len == 0)
510 	    wc = L'\0';
511 	  else
512 	    wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
513 	  mbclen = 1;
514 	  pstr->cur_state = prev_st;
515 	}
516       else
517 	wc = (wint_t) wc2;
518       /* Then proceed the next character.  */
519       rawbuf_idx += mbclen;
520     }
521   *last_wc = wc;
522   return rawbuf_idx;
523 }
524 #endif /* RE_ENABLE_I18N  */
525 
526 /* Build the buffer PSTR->MBS, and apply the translation if we need.
527    This function is used in case of REG_ICASE.  */
528 
529 static void
530 internal_function
build_upper_buffer(re_string_t * pstr)531 build_upper_buffer (re_string_t *pstr)
532 {
533   int char_idx, end_idx;
534   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
535 
536   for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
537     {
538       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
539       if (BE (pstr->trans != NULL, 0))
540 	ch = pstr->trans[ch];
541       if (islower (ch))
542 	pstr->mbs[char_idx] = toupper (ch);
543       else
544 	pstr->mbs[char_idx] = ch;
545     }
546   pstr->valid_len = char_idx;
547   pstr->valid_raw_len = char_idx;
548 }
549 
550 /* Apply TRANS to the buffer in PSTR.  */
551 
552 static void
553 internal_function
re_string_translate_buffer(re_string_t * pstr)554 re_string_translate_buffer (re_string_t *pstr)
555 {
556   int buf_idx, end_idx;
557   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
558 
559   for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
560     {
561       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
562       pstr->mbs[buf_idx] = pstr->trans[ch];
563     }
564 
565   pstr->valid_len = buf_idx;
566   pstr->valid_raw_len = buf_idx;
567 }
568 
569 /* This function re-construct the buffers.
570    Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
571    convert to upper case in case of REG_ICASE, apply translation.  */
572 
573 static reg_errcode_t
574 internal_function __attribute_warn_unused_result__
re_string_reconstruct(re_string_t * pstr,int idx,int eflags)575 re_string_reconstruct (re_string_t *pstr, int idx, int eflags)
576 {
577   int offset = idx - pstr->raw_mbs_idx;
578   if (BE (offset < 0, 0))
579     {
580       /* Reset buffer.  */
581 #ifdef RE_ENABLE_I18N
582       if (pstr->mb_cur_max > 1)
583 	memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
584 #endif /* RE_ENABLE_I18N */
585       pstr->len = pstr->raw_len;
586       pstr->stop = pstr->raw_stop;
587       pstr->valid_len = 0;
588       pstr->raw_mbs_idx = 0;
589       pstr->valid_raw_len = 0;
590       pstr->offsets_needed = 0;
591       pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
592 			   : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
593       if (!pstr->mbs_allocated)
594 	pstr->mbs = (unsigned char *) pstr->raw_mbs;
595       offset = idx;
596     }
597 
598   if (BE (offset != 0, 1))
599     {
600       /* Should the already checked characters be kept?  */
601       if (BE (offset < pstr->valid_raw_len, 1))
602 	{
603 	  /* Yes, move them to the front of the buffer.  */
604 #ifdef RE_ENABLE_I18N
605 	  if (BE (pstr->offsets_needed, 0))
606 	    {
607 	      int low = 0, high = pstr->valid_len, mid;
608 	      do
609 		{
610 		  mid = (high + low) / 2;
611 		  if (pstr->offsets[mid] > offset)
612 		    high = mid;
613 		  else if (pstr->offsets[mid] < offset)
614 		    low = mid + 1;
615 		  else
616 		    break;
617 		}
618 	      while (low < high);
619 	      if (pstr->offsets[mid] < offset)
620 		++mid;
621 	      pstr->tip_context = re_string_context_at (pstr, mid - 1,
622 							eflags);
623 	      /* This can be quite complicated, so handle specially
624 		 only the common and easy case where the character with
625 		 different length representation of lower and upper
626 		 case is present at or after offset.  */
627 	      if (pstr->valid_len > offset
628 		  && mid == offset && pstr->offsets[mid] == offset)
629 		{
630 		  memmove (pstr->wcs, pstr->wcs + offset,
631 			   (pstr->valid_len - offset) * sizeof (wint_t));
632 		  memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
633 		  pstr->valid_len -= offset;
634 		  pstr->valid_raw_len -= offset;
635 		  for (low = 0; low < pstr->valid_len; low++)
636 		    pstr->offsets[low] = pstr->offsets[low + offset] - offset;
637 		}
638 	      else
639 		{
640 		  /* Otherwise, just find out how long the partial multibyte
641 		     character at offset is and fill it with WEOF/255.  */
642 		  pstr->len = pstr->raw_len - idx + offset;
643 		  pstr->stop = pstr->raw_stop - idx + offset;
644 		  pstr->offsets_needed = 0;
645 		  while (mid > 0 && pstr->offsets[mid - 1] == offset)
646 		    --mid;
647 		  while (mid < pstr->valid_len)
648 		    if (pstr->wcs[mid] != WEOF)
649 		      break;
650 		    else
651 		      ++mid;
652 		  if (mid == pstr->valid_len)
653 		    pstr->valid_len = 0;
654 		  else
655 		    {
656 		      pstr->valid_len = pstr->offsets[mid] - offset;
657 		      if (pstr->valid_len)
658 			{
659 			  for (low = 0; low < pstr->valid_len; ++low)
660 			    pstr->wcs[low] = WEOF;
661 			  memset (pstr->mbs, 255, pstr->valid_len);
662 			}
663 		    }
664 		  pstr->valid_raw_len = pstr->valid_len;
665 		}
666 	    }
667 	  else
668 #endif
669 	    {
670 	      pstr->tip_context = re_string_context_at (pstr, offset - 1,
671 							eflags);
672 #ifdef RE_ENABLE_I18N
673 	      if (pstr->mb_cur_max > 1)
674 		memmove (pstr->wcs, pstr->wcs + offset,
675 			 (pstr->valid_len - offset) * sizeof (wint_t));
676 #endif /* RE_ENABLE_I18N */
677 	      if (BE (pstr->mbs_allocated, 0))
678 		memmove (pstr->mbs, pstr->mbs + offset,
679 			 pstr->valid_len - offset);
680 	      pstr->valid_len -= offset;
681 	      pstr->valid_raw_len -= offset;
682 #if DEBUG
683 	      assert (pstr->valid_len > 0);
684 #endif
685 	    }
686 	}
687       else
688 	{
689 	  /* No, skip all characters until IDX.  */
690 	  int prev_valid_len = pstr->valid_len;
691 
692 #ifdef RE_ENABLE_I18N
693 	  if (BE (pstr->offsets_needed, 0))
694 	    {
695 	      pstr->len = pstr->raw_len - idx + offset;
696 	      pstr->stop = pstr->raw_stop - idx + offset;
697 	      pstr->offsets_needed = 0;
698 	    }
699 #endif
700 	  pstr->valid_len = 0;
701 #ifdef RE_ENABLE_I18N
702 	  if (pstr->mb_cur_max > 1)
703 	    {
704 	      int wcs_idx;
705 	      wint_t wc = WEOF;
706 
707 	      if (pstr->is_utf8)
708 		{
709 		  const unsigned char *raw, *p, *end;
710 
711 		  /* Special case UTF-8.  Multi-byte chars start with any
712 		     byte other than 0x80 - 0xbf.  */
713 		  raw = pstr->raw_mbs + pstr->raw_mbs_idx;
714 		  end = raw + (offset - pstr->mb_cur_max);
715 		  if (end < pstr->raw_mbs)
716 		    end = pstr->raw_mbs;
717 		  p = raw + offset - 1;
718 #ifdef _LIBC
719 		  /* We know the wchar_t encoding is UCS4, so for the simple
720 		     case, ASCII characters, skip the conversion step.  */
721 		  if (isascii (*p) && BE (pstr->trans == NULL, 1))
722 		    {
723 		      memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
724 		      /* pstr->valid_len = 0; */
725 		      wc = (wchar_t) *p;
726 		    }
727 		  else
728 #endif
729 		    for (; p >= end; --p)
730 		      if ((*p & 0xc0) != 0x80)
731 			{
732 			  mbstate_t cur_state;
733 			  wchar_t wc2;
734 			  int mlen = raw + pstr->len - p;
735 			  unsigned char buf[6];
736 			  size_t mbclen;
737 
738 			  const unsigned char *pp = p;
739 			  if (BE (pstr->trans != NULL, 0))
740 			    {
741 			      int i = mlen < 6 ? mlen : 6;
742 			      while (--i >= 0)
743 				buf[i] = pstr->trans[p[i]];
744 			      pp = buf;
745 			    }
746 			  /* XXX Don't use mbrtowc, we know which conversion
747 			     to use (UTF-8 -> UCS4).  */
748 			  memset (&cur_state, 0, sizeof (cur_state));
749 			  mbclen = __mbrtowc (&wc2, (const char *) pp, mlen,
750 					      &cur_state);
751 			  if (raw + offset - p <= mbclen
752 			      && mbclen < (size_t) -2)
753 			    {
754 			      memset (&pstr->cur_state, '\0',
755 				      sizeof (mbstate_t));
756 			      pstr->valid_len = mbclen - (raw + offset - p);
757 			      wc = wc2;
758 			    }
759 			  break;
760 			}
761 		}
762 
763 	      if (wc == WEOF)
764 		pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
765 	      if (wc == WEOF)
766 		pstr->tip_context
767 		  = re_string_context_at (pstr, prev_valid_len - 1, eflags);
768 	      else
769 		pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
770 				      && IS_WIDE_WORD_CHAR (wc))
771 				     ? CONTEXT_WORD
772 				     : ((IS_WIDE_NEWLINE (wc)
773 					 && pstr->newline_anchor)
774 					? CONTEXT_NEWLINE : 0));
775 	      if (BE (pstr->valid_len, 0))
776 		{
777 		  for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
778 		    pstr->wcs[wcs_idx] = WEOF;
779 		  if (pstr->mbs_allocated)
780 		    memset (pstr->mbs, 255, pstr->valid_len);
781 		}
782 	      pstr->valid_raw_len = pstr->valid_len;
783 	    }
784 	  else
785 #endif /* RE_ENABLE_I18N */
786 	    {
787 	      int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
788 	      pstr->valid_raw_len = 0;
789 	      if (pstr->trans)
790 		c = pstr->trans[c];
791 	      pstr->tip_context = (bitset_contain (pstr->word_char, c)
792 				   ? CONTEXT_WORD
793 				   : ((IS_NEWLINE (c) && pstr->newline_anchor)
794 				      ? CONTEXT_NEWLINE : 0));
795 	    }
796 	}
797       if (!BE (pstr->mbs_allocated, 0))
798 	pstr->mbs += offset;
799     }
800   pstr->raw_mbs_idx = idx;
801   pstr->len -= offset;
802   pstr->stop -= offset;
803 
804   /* Then build the buffers.  */
805 #ifdef RE_ENABLE_I18N
806   if (pstr->mb_cur_max > 1)
807     {
808       if (pstr->icase)
809 	{
810 	  reg_errcode_t ret = build_wcs_upper_buffer (pstr);
811 	  if (BE (ret != REG_NOERROR, 0))
812 	    return ret;
813 	}
814       else
815 	build_wcs_buffer (pstr);
816     }
817   else
818 #endif /* RE_ENABLE_I18N */
819     if (BE (pstr->mbs_allocated, 0))
820       {
821 	if (pstr->icase)
822 	  build_upper_buffer (pstr);
823 	else if (pstr->trans != NULL)
824 	  re_string_translate_buffer (pstr);
825       }
826     else
827       pstr->valid_len = pstr->len;
828 
829   pstr->cur_idx = 0;
830   return REG_NOERROR;
831 }
832 
833 static unsigned char
internal_function(pure)834 internal_function __attribute ((pure))
835 re_string_peek_byte_case (const re_string_t *pstr, int idx)
836 {
837   int ch, off;
838 
839   /* Handle the common (easiest) cases first.  */
840   if (BE (!pstr->mbs_allocated, 1))
841     return re_string_peek_byte (pstr, idx);
842 
843 #ifdef RE_ENABLE_I18N
844   if (pstr->mb_cur_max > 1
845       && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
846     return re_string_peek_byte (pstr, idx);
847 #endif
848 
849   off = pstr->cur_idx + idx;
850 #ifdef RE_ENABLE_I18N
851   if (pstr->offsets_needed)
852     off = pstr->offsets[off];
853 #endif
854 
855   ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
856 
857 #ifdef RE_ENABLE_I18N
858   /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
859      this function returns CAPITAL LETTER I instead of first byte of
860      DOTLESS SMALL LETTER I.  The latter would confuse the parser,
861      since peek_byte_case doesn't advance cur_idx in any way.  */
862   if (pstr->offsets_needed && !isascii (ch))
863     return re_string_peek_byte (pstr, idx);
864 #endif
865 
866   return ch;
867 }
868 
869 static unsigned char
870 internal_function
re_string_fetch_byte_case(re_string_t * pstr)871 re_string_fetch_byte_case (re_string_t *pstr)
872 {
873   if (BE (!pstr->mbs_allocated, 1))
874     return re_string_fetch_byte (pstr);
875 
876 #ifdef RE_ENABLE_I18N
877   if (pstr->offsets_needed)
878     {
879       int off, ch;
880 
881       /* For tr_TR.UTF-8 [[:islower:]] there is
882 	 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs.  Skip
883 	 in that case the whole multi-byte character and return
884 	 the original letter.  On the other side, with
885 	 [[: DOTLESS SMALL LETTER I return [[:I, as doing
886 	 anything else would complicate things too much.  */
887 
888       if (!re_string_first_byte (pstr, pstr->cur_idx))
889 	return re_string_fetch_byte (pstr);
890 
891       off = pstr->offsets[pstr->cur_idx];
892       ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
893 
894       if (! isascii (ch))
895 	return re_string_fetch_byte (pstr);
896 
897       re_string_skip_bytes (pstr,
898 			    re_string_char_size_at (pstr, pstr->cur_idx));
899       return ch;
900     }
901 #endif
902 
903   return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
904 }
905 
906 static void
907 internal_function
re_string_destruct(re_string_t * pstr)908 re_string_destruct (re_string_t *pstr)
909 {
910 #ifdef RE_ENABLE_I18N
911   re_free (pstr->wcs);
912   re_free (pstr->offsets);
913 #endif /* RE_ENABLE_I18N  */
914   if (pstr->mbs_allocated)
915     re_free (pstr->mbs);
916 }
917 
918 /* Return the context at IDX in INPUT.  */
919 
920 static unsigned int
921 internal_function
re_string_context_at(const re_string_t * input,int idx,int eflags)922 re_string_context_at (const re_string_t *input, int idx, int eflags)
923 {
924   int c;
925   if (BE (idx < 0, 0))
926     /* In this case, we use the value stored in input->tip_context,
927        since we can't know the character in input->mbs[-1] here.  */
928     return input->tip_context;
929   if (BE (idx == input->len, 0))
930     return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
931 	    : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
932 #ifdef RE_ENABLE_I18N
933   if (input->mb_cur_max > 1)
934     {
935       wint_t wc;
936       int wc_idx = idx;
937       while(input->wcs[wc_idx] == WEOF)
938 	{
939 #ifdef DEBUG
940 	  /* It must not happen.  */
941 	  assert (wc_idx >= 0);
942 #endif
943 	  --wc_idx;
944 	  if (wc_idx < 0)
945 	    return input->tip_context;
946 	}
947       wc = input->wcs[wc_idx];
948       if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
949 	return CONTEXT_WORD;
950       return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
951 	      ? CONTEXT_NEWLINE : 0);
952     }
953   else
954 #endif
955     {
956       c = re_string_byte_at (input, idx);
957       if (bitset_contain (input->word_char, c))
958 	return CONTEXT_WORD;
959       return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
960     }
961 }
962 
963 /* Functions for set operation.  */
964 
965 static reg_errcode_t
966 internal_function __attribute_warn_unused_result__
re_node_set_alloc(re_node_set * set,int size)967 re_node_set_alloc (re_node_set *set, int size)
968 {
969   set->alloc = size;
970   set->nelem = 0;
971   set->elems = re_malloc (int, size);
972   if (BE (set->elems == NULL, 0))
973     return REG_ESPACE;
974   return REG_NOERROR;
975 }
976 
977 static reg_errcode_t
978 internal_function __attribute_warn_unused_result__
re_node_set_init_1(re_node_set * set,int elem)979 re_node_set_init_1 (re_node_set *set, int elem)
980 {
981   set->alloc = 1;
982   set->nelem = 1;
983   set->elems = re_malloc (int, 1);
984   if (BE (set->elems == NULL, 0))
985     {
986       set->alloc = set->nelem = 0;
987       return REG_ESPACE;
988     }
989   set->elems[0] = elem;
990   return REG_NOERROR;
991 }
992 
993 static reg_errcode_t
994 internal_function __attribute_warn_unused_result__
re_node_set_init_2(re_node_set * set,int elem1,int elem2)995 re_node_set_init_2 (re_node_set *set, int elem1, int elem2)
996 {
997   set->alloc = 2;
998   set->elems = re_malloc (int, 2);
999   if (BE (set->elems == NULL, 0))
1000     return REG_ESPACE;
1001   if (elem1 == elem2)
1002     {
1003       set->nelem = 1;
1004       set->elems[0] = elem1;
1005     }
1006   else
1007     {
1008       set->nelem = 2;
1009       if (elem1 < elem2)
1010 	{
1011 	  set->elems[0] = elem1;
1012 	  set->elems[1] = elem2;
1013 	}
1014       else
1015 	{
1016 	  set->elems[0] = elem2;
1017 	  set->elems[1] = elem1;
1018 	}
1019     }
1020   return REG_NOERROR;
1021 }
1022 
1023 static reg_errcode_t
1024 internal_function __attribute_warn_unused_result__
re_node_set_init_copy(re_node_set * dest,const re_node_set * src)1025 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
1026 {
1027   dest->nelem = src->nelem;
1028   if (src->nelem > 0)
1029     {
1030       dest->alloc = dest->nelem;
1031       dest->elems = re_malloc (int, dest->alloc);
1032       if (BE (dest->elems == NULL, 0))
1033 	{
1034 	  dest->alloc = dest->nelem = 0;
1035 	  return REG_ESPACE;
1036 	}
1037       memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1038     }
1039   else
1040     re_node_set_init_empty (dest);
1041   return REG_NOERROR;
1042 }
1043 
1044 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1045    DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1046    Note: We assume dest->elems is NULL, when dest->alloc is 0.  */
1047 
1048 static reg_errcode_t
1049 internal_function __attribute_warn_unused_result__
re_node_set_add_intersect(re_node_set * dest,const re_node_set * src1,const re_node_set * src2)1050 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
1051 			   const re_node_set *src2)
1052 {
1053   int i1, i2, is, id, delta, sbase;
1054   if (src1->nelem == 0 || src2->nelem == 0)
1055     return REG_NOERROR;
1056 
1057   /* We need dest->nelem + 2 * elems_in_intersection; this is a
1058      conservative estimate.  */
1059   if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1060     {
1061       int new_alloc = src1->nelem + src2->nelem + dest->alloc;
1062       int *new_elems = re_realloc (dest->elems, int, new_alloc);
1063       if (BE (new_elems == NULL, 0))
1064 	return REG_ESPACE;
1065       dest->elems = new_elems;
1066       dest->alloc = new_alloc;
1067     }
1068 
1069   /* Find the items in the intersection of SRC1 and SRC2, and copy
1070      into the top of DEST those that are not already in DEST itself.  */
1071   sbase = dest->nelem + src1->nelem + src2->nelem;
1072   i1 = src1->nelem - 1;
1073   i2 = src2->nelem - 1;
1074   id = dest->nelem - 1;
1075   for (;;)
1076     {
1077       if (src1->elems[i1] == src2->elems[i2])
1078 	{
1079 	  /* Try to find the item in DEST.  Maybe we could binary search?  */
1080 	  while (id >= 0 && dest->elems[id] > src1->elems[i1])
1081 	    --id;
1082 
1083 	  if (id < 0 || dest->elems[id] != src1->elems[i1])
1084 	    dest->elems[--sbase] = src1->elems[i1];
1085 
1086 	  if (--i1 < 0 || --i2 < 0)
1087 	    break;
1088 	}
1089 
1090       /* Lower the highest of the two items.  */
1091       else if (src1->elems[i1] < src2->elems[i2])
1092 	{
1093 	  if (--i2 < 0)
1094 	    break;
1095 	}
1096       else
1097 	{
1098 	  if (--i1 < 0)
1099 	    break;
1100 	}
1101     }
1102 
1103   id = dest->nelem - 1;
1104   is = dest->nelem + src1->nelem + src2->nelem - 1;
1105   delta = is - sbase + 1;
1106 
1107   /* Now copy.  When DELTA becomes zero, the remaining
1108      DEST elements are already in place; this is more or
1109      less the same loop that is in re_node_set_merge.  */
1110   dest->nelem += delta;
1111   if (delta > 0 && id >= 0)
1112     for (;;)
1113       {
1114 	if (dest->elems[is] > dest->elems[id])
1115 	  {
1116 	    /* Copy from the top.  */
1117 	    dest->elems[id + delta--] = dest->elems[is--];
1118 	    if (delta == 0)
1119 	      break;
1120 	  }
1121 	else
1122 	  {
1123 	    /* Slide from the bottom.  */
1124 	    dest->elems[id + delta] = dest->elems[id];
1125 	    if (--id < 0)
1126 	      break;
1127 	  }
1128       }
1129 
1130   /* Copy remaining SRC elements.  */
1131   memcpy (dest->elems, dest->elems + sbase, delta * sizeof (int));
1132 
1133   return REG_NOERROR;
1134 }
1135 
1136 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1137    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1138 
1139 static reg_errcode_t
1140 internal_function __attribute_warn_unused_result__
re_node_set_init_union(re_node_set * dest,const re_node_set * src1,const re_node_set * src2)1141 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1142 			const re_node_set *src2)
1143 {
1144   int i1, i2, id;
1145   if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1146     {
1147       dest->alloc = src1->nelem + src2->nelem;
1148       dest->elems = re_malloc (int, dest->alloc);
1149       if (BE (dest->elems == NULL, 0))
1150 	return REG_ESPACE;
1151     }
1152   else
1153     {
1154       if (src1 != NULL && src1->nelem > 0)
1155 	return re_node_set_init_copy (dest, src1);
1156       else if (src2 != NULL && src2->nelem > 0)
1157 	return re_node_set_init_copy (dest, src2);
1158       else
1159 	re_node_set_init_empty (dest);
1160       return REG_NOERROR;
1161     }
1162   for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1163     {
1164       if (src1->elems[i1] > src2->elems[i2])
1165 	{
1166 	  dest->elems[id++] = src2->elems[i2++];
1167 	  continue;
1168 	}
1169       if (src1->elems[i1] == src2->elems[i2])
1170 	++i2;
1171       dest->elems[id++] = src1->elems[i1++];
1172     }
1173   if (i1 < src1->nelem)
1174     {
1175       memcpy (dest->elems + id, src1->elems + i1,
1176 	     (src1->nelem - i1) * sizeof (int));
1177       id += src1->nelem - i1;
1178     }
1179   else if (i2 < src2->nelem)
1180     {
1181       memcpy (dest->elems + id, src2->elems + i2,
1182 	     (src2->nelem - i2) * sizeof (int));
1183       id += src2->nelem - i2;
1184     }
1185   dest->nelem = id;
1186   return REG_NOERROR;
1187 }
1188 
1189 /* Calculate the union set of the sets DEST and SRC. And store it to
1190    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1191 
1192 static reg_errcode_t
1193 internal_function __attribute_warn_unused_result__
re_node_set_merge(re_node_set * dest,const re_node_set * src)1194 re_node_set_merge (re_node_set *dest, const re_node_set *src)
1195 {
1196   int is, id, sbase, delta;
1197   if (src == NULL || src->nelem == 0)
1198     return REG_NOERROR;
1199   if (dest->alloc < 2 * src->nelem + dest->nelem)
1200     {
1201       int new_alloc = 2 * (src->nelem + dest->alloc);
1202       int *new_buffer = re_realloc (dest->elems, int, new_alloc);
1203       if (BE (new_buffer == NULL, 0))
1204 	return REG_ESPACE;
1205       dest->elems = new_buffer;
1206       dest->alloc = new_alloc;
1207     }
1208 
1209   if (BE (dest->nelem == 0, 0))
1210     {
1211       dest->nelem = src->nelem;
1212       memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1213       return REG_NOERROR;
1214     }
1215 
1216   /* Copy into the top of DEST the items of SRC that are not
1217      found in DEST.  Maybe we could binary search in DEST?  */
1218   for (sbase = dest->nelem + 2 * src->nelem,
1219        is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1220     {
1221       if (dest->elems[id] == src->elems[is])
1222 	is--, id--;
1223       else if (dest->elems[id] < src->elems[is])
1224 	dest->elems[--sbase] = src->elems[is--];
1225       else /* if (dest->elems[id] > src->elems[is]) */
1226 	--id;
1227     }
1228 
1229   if (is >= 0)
1230     {
1231       /* If DEST is exhausted, the remaining items of SRC must be unique.  */
1232       sbase -= is + 1;
1233       memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (int));
1234     }
1235 
1236   id = dest->nelem - 1;
1237   is = dest->nelem + 2 * src->nelem - 1;
1238   delta = is - sbase + 1;
1239   if (delta == 0)
1240     return REG_NOERROR;
1241 
1242   /* Now copy.  When DELTA becomes zero, the remaining
1243      DEST elements are already in place.  */
1244   dest->nelem += delta;
1245   for (;;)
1246     {
1247       if (dest->elems[is] > dest->elems[id])
1248 	{
1249 	  /* Copy from the top.  */
1250 	  dest->elems[id + delta--] = dest->elems[is--];
1251 	  if (delta == 0)
1252 	    break;
1253 	}
1254       else
1255 	{
1256 	  /* Slide from the bottom.  */
1257 	  dest->elems[id + delta] = dest->elems[id];
1258 	  if (--id < 0)
1259 	    {
1260 	      /* Copy remaining SRC elements.  */
1261 	      memcpy (dest->elems, dest->elems + sbase,
1262 		      delta * sizeof (int));
1263 	      break;
1264 	    }
1265 	}
1266     }
1267 
1268   return REG_NOERROR;
1269 }
1270 
1271 /* Insert the new element ELEM to the re_node_set* SET.
1272    SET should not already have ELEM.
1273    return -1 if an error is occured, return 1 otherwise.  */
1274 
1275 static int
1276 internal_function __attribute_warn_unused_result__
re_node_set_insert(re_node_set * set,int elem)1277 re_node_set_insert (re_node_set *set, int elem)
1278 {
1279   int idx;
1280   /* In case the set is empty.  */
1281   if (set->alloc == 0)
1282     {
1283       if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1))
1284 	return 1;
1285       else
1286 	return -1;
1287     }
1288 
1289   if (BE (set->nelem, 0) == 0)
1290     {
1291       /* We already guaranteed above that set->alloc != 0.  */
1292       set->elems[0] = elem;
1293       ++set->nelem;
1294       return 1;
1295     }
1296 
1297   /* Realloc if we need.  */
1298   if (set->alloc == set->nelem)
1299     {
1300       int *new_elems;
1301       set->alloc = set->alloc * 2;
1302       new_elems = re_realloc (set->elems, int, set->alloc);
1303       if (BE (new_elems == NULL, 0))
1304 	return -1;
1305       set->elems = new_elems;
1306     }
1307 
1308   /* Move the elements which follows the new element.  Test the
1309      first element separately to skip a check in the inner loop.  */
1310   if (elem < set->elems[0])
1311     {
1312       idx = 0;
1313       for (idx = set->nelem; idx > 0; idx--)
1314 	set->elems[idx] = set->elems[idx - 1];
1315     }
1316   else
1317     {
1318       for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1319 	set->elems[idx] = set->elems[idx - 1];
1320     }
1321 
1322   /* Insert the new element.  */
1323   set->elems[idx] = elem;
1324   ++set->nelem;
1325   return 1;
1326 }
1327 
1328 /* Insert the new element ELEM to the re_node_set* SET.
1329    SET should not already have any element greater than or equal to ELEM.
1330    Return -1 if an error is occured, return 1 otherwise.  */
1331 
1332 static int
1333 internal_function __attribute_warn_unused_result__
re_node_set_insert_last(re_node_set * set,int elem)1334 re_node_set_insert_last (re_node_set *set, int elem)
1335 {
1336   /* Realloc if we need.  */
1337   if (set->alloc == set->nelem)
1338     {
1339       int *new_elems;
1340       set->alloc = (set->alloc + 1) * 2;
1341       new_elems = re_realloc (set->elems, int, set->alloc);
1342       if (BE (new_elems == NULL, 0))
1343 	return -1;
1344       set->elems = new_elems;
1345     }
1346 
1347   /* Insert the new element.  */
1348   set->elems[set->nelem++] = elem;
1349   return 1;
1350 }
1351 
1352 /* Compare two node sets SET1 and SET2.
1353    return 1 if SET1 and SET2 are equivalent, return 0 otherwise.  */
1354 
1355 static int
internal_function(pure)1356 internal_function __attribute ((pure))
1357 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1358 {
1359   int i;
1360   if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1361     return 0;
1362   for (i = set1->nelem ; --i >= 0 ; )
1363     if (set1->elems[i] != set2->elems[i])
1364       return 0;
1365   return 1;
1366 }
1367 
1368 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise.  */
1369 
1370 static int
internal_function(pure)1371 internal_function __attribute ((pure))
1372 re_node_set_contains (const re_node_set *set, int elem)
1373 {
1374   unsigned int idx, right, mid;
1375   if (set->nelem <= 0)
1376     return 0;
1377 
1378   /* Binary search the element.  */
1379   idx = 0;
1380   right = set->nelem - 1;
1381   while (idx < right)
1382     {
1383       mid = (idx + right) / 2;
1384       if (set->elems[mid] < elem)
1385 	idx = mid + 1;
1386       else
1387 	right = mid;
1388     }
1389   return set->elems[idx] == elem ? idx + 1 : 0;
1390 }
1391 
1392 static void
1393 internal_function
re_node_set_remove_at(re_node_set * set,int idx)1394 re_node_set_remove_at (re_node_set *set, int idx)
1395 {
1396   if (idx < 0 || idx >= set->nelem)
1397     return;
1398   --set->nelem;
1399   for (; idx < set->nelem; idx++)
1400     set->elems[idx] = set->elems[idx + 1];
1401 }
1402 
1403 
1404 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1405    Or return -1, if an error will be occured.  */
1406 
1407 static int
1408 internal_function
re_dfa_add_node(re_dfa_t * dfa,re_token_t token)1409 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1410 {
1411   int type = token.type;
1412   if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1413     {
1414       size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1415       int *new_nexts, *new_indices;
1416       re_node_set *new_edests, *new_eclosures;
1417       re_token_t *new_nodes;
1418 
1419       /* Avoid overflows in realloc.  */
1420       const size_t max_object_size = MAX (sizeof (re_token_t),
1421 					  MAX (sizeof (re_node_set),
1422 					       sizeof (int)));
1423       if (BE (SIZE_MAX / max_object_size < new_nodes_alloc, 0))
1424 	return -1;
1425 
1426       new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1427       if (BE (new_nodes == NULL, 0))
1428 	return -1;
1429       dfa->nodes = new_nodes;
1430       new_nexts = re_realloc (dfa->nexts, int, new_nodes_alloc);
1431       new_indices = re_realloc (dfa->org_indices, int, new_nodes_alloc);
1432       new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1433       new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1434       if (BE (new_nexts == NULL || new_indices == NULL
1435 	      || new_edests == NULL || new_eclosures == NULL, 0))
1436 	return -1;
1437       dfa->nexts = new_nexts;
1438       dfa->org_indices = new_indices;
1439       dfa->edests = new_edests;
1440       dfa->eclosures = new_eclosures;
1441       dfa->nodes_alloc = new_nodes_alloc;
1442     }
1443   dfa->nodes[dfa->nodes_len] = token;
1444   dfa->nodes[dfa->nodes_len].constraint = 0;
1445 #ifdef RE_ENABLE_I18N
1446   dfa->nodes[dfa->nodes_len].accept_mb =
1447     (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
1448 #endif
1449   dfa->nexts[dfa->nodes_len] = -1;
1450   re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1451   re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1452   return dfa->nodes_len++;
1453 }
1454 
1455 static inline unsigned int
1456 internal_function
calc_state_hash(const re_node_set * nodes,unsigned int context)1457 calc_state_hash (const re_node_set *nodes, unsigned int context)
1458 {
1459   unsigned int hash = nodes->nelem + context;
1460   int i;
1461   for (i = 0 ; i < nodes->nelem ; i++)
1462     hash += nodes->elems[i];
1463   return hash;
1464 }
1465 
1466 /* Search for the state whose node_set is equivalent to NODES.
1467    Return the pointer to the state, if we found it in the DFA.
1468    Otherwise create the new one and return it.  In case of an error
1469    return NULL and set the error code in ERR.
1470    Note: - We assume NULL as the invalid state, then it is possible that
1471 	   return value is NULL and ERR is REG_NOERROR.
1472 	 - We never return non-NULL value in case of any errors, it is for
1473 	   optimization.  */
1474 
1475 static re_dfastate_t *
1476 internal_function __attribute_warn_unused_result__
re_acquire_state(reg_errcode_t * err,const re_dfa_t * dfa,const re_node_set * nodes)1477 re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1478 		  const re_node_set *nodes)
1479 {
1480   unsigned int hash;
1481   re_dfastate_t *new_state;
1482   struct re_state_table_entry *spot;
1483   int i;
1484   if (BE (nodes->nelem == 0, 0))
1485     {
1486       *err = REG_NOERROR;
1487       return NULL;
1488     }
1489   hash = calc_state_hash (nodes, 0);
1490   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1491 
1492   for (i = 0 ; i < spot->num ; i++)
1493     {
1494       re_dfastate_t *state = spot->array[i];
1495       if (hash != state->hash)
1496 	continue;
1497       if (re_node_set_compare (&state->nodes, nodes))
1498 	return state;
1499     }
1500 
1501   /* There are no appropriate state in the dfa, create the new one.  */
1502   new_state = create_ci_newstate (dfa, nodes, hash);
1503   if (BE (new_state == NULL, 0))
1504     *err = REG_ESPACE;
1505 
1506   return new_state;
1507 }
1508 
1509 /* Search for the state whose node_set is equivalent to NODES and
1510    whose context is equivalent to CONTEXT.
1511    Return the pointer to the state, if we found it in the DFA.
1512    Otherwise create the new one and return it.  In case of an error
1513    return NULL and set the error code in ERR.
1514    Note: - We assume NULL as the invalid state, then it is possible that
1515 	   return value is NULL and ERR is REG_NOERROR.
1516 	 - We never return non-NULL value in case of any errors, it is for
1517 	   optimization.  */
1518 
1519 static re_dfastate_t *
1520 internal_function __attribute_warn_unused_result__
re_acquire_state_context(reg_errcode_t * err,const re_dfa_t * dfa,const re_node_set * nodes,unsigned int context)1521 re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1522 			  const re_node_set *nodes, unsigned int context)
1523 {
1524   unsigned int hash;
1525   re_dfastate_t *new_state;
1526   struct re_state_table_entry *spot;
1527   int i;
1528   if (nodes->nelem == 0)
1529     {
1530       *err = REG_NOERROR;
1531       return NULL;
1532     }
1533   hash = calc_state_hash (nodes, context);
1534   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1535 
1536   for (i = 0 ; i < spot->num ; i++)
1537     {
1538       re_dfastate_t *state = spot->array[i];
1539       if (state->hash == hash
1540 	  && state->context == context
1541 	  && re_node_set_compare (state->entrance_nodes, nodes))
1542 	return state;
1543     }
1544   /* There are no appropriate state in `dfa', create the new one.  */
1545   new_state = create_cd_newstate (dfa, nodes, context, hash);
1546   if (BE (new_state == NULL, 0))
1547     *err = REG_ESPACE;
1548 
1549   return new_state;
1550 }
1551 
1552 /* Finish initialization of the new state NEWSTATE, and using its hash value
1553    HASH put in the appropriate bucket of DFA's state table.  Return value
1554    indicates the error code if failed.  */
1555 
1556 static reg_errcode_t
1557 __attribute_warn_unused_result__
register_state(const re_dfa_t * dfa,re_dfastate_t * newstate,unsigned int hash)1558 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1559 		unsigned int hash)
1560 {
1561   struct re_state_table_entry *spot;
1562   reg_errcode_t err;
1563   int i;
1564 
1565   newstate->hash = hash;
1566   err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1567   if (BE (err != REG_NOERROR, 0))
1568     return REG_ESPACE;
1569   for (i = 0; i < newstate->nodes.nelem; i++)
1570     {
1571       int elem = newstate->nodes.elems[i];
1572       if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1573 	if (re_node_set_insert_last (&newstate->non_eps_nodes, elem) < 0)
1574 	  return REG_ESPACE;
1575     }
1576 
1577   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1578   if (BE (spot->alloc <= spot->num, 0))
1579     {
1580       int new_alloc = 2 * spot->num + 2;
1581       re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1582 					      new_alloc);
1583       if (BE (new_array == NULL, 0))
1584 	return REG_ESPACE;
1585       spot->array = new_array;
1586       spot->alloc = new_alloc;
1587     }
1588   spot->array[spot->num++] = newstate;
1589   return REG_NOERROR;
1590 }
1591 
1592 static void
free_state(re_dfastate_t * state)1593 free_state (re_dfastate_t *state)
1594 {
1595   re_node_set_free (&state->non_eps_nodes);
1596   re_node_set_free (&state->inveclosure);
1597   if (state->entrance_nodes != &state->nodes)
1598     {
1599       re_node_set_free (state->entrance_nodes);
1600       re_free (state->entrance_nodes);
1601     }
1602   re_node_set_free (&state->nodes);
1603   re_free (state->word_trtable);
1604   re_free (state->trtable);
1605   re_free (state);
1606 }
1607 
1608 /* Create the new state which is independ of contexts.
1609    Return the new state if succeeded, otherwise return NULL.  */
1610 
1611 static re_dfastate_t *
1612 internal_function __attribute_warn_unused_result__
create_ci_newstate(const re_dfa_t * dfa,const re_node_set * nodes,unsigned int hash)1613 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1614 		    unsigned int hash)
1615 {
1616   int i;
1617   reg_errcode_t err;
1618   re_dfastate_t *newstate;
1619 
1620   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1621   if (BE (newstate == NULL, 0))
1622     return NULL;
1623   err = re_node_set_init_copy (&newstate->nodes, nodes);
1624   if (BE (err != REG_NOERROR, 0))
1625     {
1626       re_free (newstate);
1627       return NULL;
1628     }
1629 
1630   newstate->entrance_nodes = &newstate->nodes;
1631   for (i = 0 ; i < nodes->nelem ; i++)
1632     {
1633       re_token_t *node = dfa->nodes + nodes->elems[i];
1634       re_token_type_t type = node->type;
1635       if (type == CHARACTER && !node->constraint)
1636 	continue;
1637 #ifdef RE_ENABLE_I18N
1638       newstate->accept_mb |= node->accept_mb;
1639 #endif /* RE_ENABLE_I18N */
1640 
1641       /* If the state has the halt node, the state is a halt state.  */
1642       if (type == END_OF_RE)
1643 	newstate->halt = 1;
1644       else if (type == OP_BACK_REF)
1645 	newstate->has_backref = 1;
1646       else if (type == ANCHOR || node->constraint)
1647 	newstate->has_constraint = 1;
1648     }
1649   err = register_state (dfa, newstate, hash);
1650   if (BE (err != REG_NOERROR, 0))
1651     {
1652       free_state (newstate);
1653       newstate = NULL;
1654     }
1655   return newstate;
1656 }
1657 
1658 /* Create the new state which is depend on the context CONTEXT.
1659    Return the new state if succeeded, otherwise return NULL.  */
1660 
1661 static re_dfastate_t *
1662 internal_function __attribute_warn_unused_result__
create_cd_newstate(const re_dfa_t * dfa,const re_node_set * nodes,unsigned int context,unsigned int hash)1663 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1664 		    unsigned int context, unsigned int hash)
1665 {
1666   int i, nctx_nodes = 0;
1667   reg_errcode_t err;
1668   re_dfastate_t *newstate;
1669 
1670   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1671   if (BE (newstate == NULL, 0))
1672     return NULL;
1673   err = re_node_set_init_copy (&newstate->nodes, nodes);
1674   if (BE (err != REG_NOERROR, 0))
1675     {
1676       re_free (newstate);
1677       return NULL;
1678     }
1679 
1680   newstate->context = context;
1681   newstate->entrance_nodes = &newstate->nodes;
1682 
1683   for (i = 0 ; i < nodes->nelem ; i++)
1684     {
1685       re_token_t *node = dfa->nodes + nodes->elems[i];
1686       re_token_type_t type = node->type;
1687       unsigned int constraint = node->constraint;
1688 
1689       if (type == CHARACTER && !constraint)
1690 	continue;
1691 #ifdef RE_ENABLE_I18N
1692       newstate->accept_mb |= node->accept_mb;
1693 #endif /* RE_ENABLE_I18N */
1694 
1695       /* If the state has the halt node, the state is a halt state.  */
1696       if (type == END_OF_RE)
1697 	newstate->halt = 1;
1698       else if (type == OP_BACK_REF)
1699 	newstate->has_backref = 1;
1700 
1701       if (constraint)
1702 	{
1703 	  if (newstate->entrance_nodes == &newstate->nodes)
1704 	    {
1705 	      newstate->entrance_nodes = re_malloc (re_node_set, 1);
1706 	      if (BE (newstate->entrance_nodes == NULL, 0))
1707 		{
1708 		  free_state (newstate);
1709 		  return NULL;
1710 		}
1711 	      if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
1712 		  != REG_NOERROR)
1713 		return NULL;
1714 	      nctx_nodes = 0;
1715 	      newstate->has_constraint = 1;
1716 	    }
1717 
1718 	  if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1719 	    {
1720 	      re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1721 	      ++nctx_nodes;
1722 	    }
1723 	}
1724     }
1725   err = register_state (dfa, newstate, hash);
1726   if (BE (err != REG_NOERROR, 0))
1727     {
1728       free_state (newstate);
1729       newstate = NULL;
1730     }
1731   return  newstate;
1732 }
1733