1 /* Extended regular expression matching and search library.
2    Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3    This file is part of the GNU C Library.
4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5 
6    This program is free software; you can redistribute it and/or modify
7    it under the terms of the GNU General Public License as published by
8    the Free Software Foundation; either version 2, or (at your option)
9    any later version.
10 
11    This program is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14    GNU General Public License for more details.
15 
16    You should have received a copy of the GNU General Public License along
17    with this program; if not, write to the Free Software Foundation,
18    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
19 
20 static void re_string_construct_common (const char *str, Idx len,
21 					re_string_t *pstr,
22 					REG_TRANSLATE_TYPE trans, bool icase,
23 					const re_dfa_t *dfa) internal_function;
24 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
25 					  const re_node_set *nodes,
26 					  re_hashval_t hash) internal_function;
27 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
28 					  const re_node_set *nodes,
29 					  unsigned int context,
30 					  re_hashval_t hash) internal_function;
31 
32 /* Functions for string operation.  */
33 
34 /* This function allocate the buffers.  It is necessary to call
35    re_string_reconstruct before using the object.  */
36 
37 static reg_errcode_t
38 internal_function
re_string_allocate(re_string_t * pstr,const char * str,Idx len,Idx init_len,REG_TRANSLATE_TYPE trans,bool icase,const re_dfa_t * dfa)39 re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
40 		    REG_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
41 {
42   reg_errcode_t ret;
43   Idx init_buf_len;
44 
45   /* Ensure at least one character fits into the buffers.  */
46   if (init_len < dfa->mb_cur_max)
47     init_len = dfa->mb_cur_max;
48   init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
49   re_string_construct_common (str, len, pstr, trans, icase, dfa);
50 
51   ret = re_string_realloc_buffers (pstr, init_buf_len);
52   if (BE (ret != REG_NOERROR, 0))
53     return ret;
54 
55   pstr->word_char = dfa->word_char;
56   pstr->word_ops_used = dfa->word_ops_used;
57   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
58   pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
59   pstr->valid_raw_len = pstr->valid_len;
60   return REG_NOERROR;
61 }
62 
63 /* This function allocate the buffers, and initialize them.  */
64 
65 static reg_errcode_t
66 internal_function
re_string_construct(re_string_t * pstr,const char * str,Idx len,REG_TRANSLATE_TYPE trans,bool icase,const re_dfa_t * dfa)67 re_string_construct (re_string_t *pstr, const char *str, Idx len,
68 		     REG_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
69 {
70   reg_errcode_t ret;
71   memset (pstr, '\0', sizeof (re_string_t));
72   re_string_construct_common (str, len, pstr, trans, icase, dfa);
73 
74   if (len > 0)
75     {
76       ret = re_string_realloc_buffers (pstr, len + 1);
77       if (BE (ret != REG_NOERROR, 0))
78 	return ret;
79     }
80   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
81 
82   if (icase)
83     {
84 #ifdef RE_ENABLE_I18N
85       if (dfa->mb_cur_max > 1)
86 	{
87 	  while (1)
88 	    {
89 	      ret = build_wcs_upper_buffer (pstr);
90 	      if (BE (ret != REG_NOERROR, 0))
91 		return ret;
92 	      if (pstr->valid_raw_len >= len)
93 		break;
94 	      if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
95 		break;
96 	      ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
97 	      if (BE (ret != REG_NOERROR, 0))
98 		return ret;
99 	    }
100 	}
101       else
102 #endif /* RE_ENABLE_I18N  */
103 	build_upper_buffer (pstr);
104     }
105   else
106     {
107 #ifdef RE_ENABLE_I18N
108       if (dfa->mb_cur_max > 1)
109 	build_wcs_buffer (pstr);
110       else
111 #endif /* RE_ENABLE_I18N  */
112 	{
113 	  if (trans != NULL)
114 	    re_string_translate_buffer (pstr);
115 	  else
116 	    {
117 	      pstr->valid_len = pstr->bufs_len;
118 	      pstr->valid_raw_len = pstr->bufs_len;
119 	    }
120 	}
121     }
122 
123   return REG_NOERROR;
124 }
125 
126 /* Helper functions for re_string_allocate, and re_string_construct.  */
127 
128 static reg_errcode_t
129 internal_function
re_string_realloc_buffers(re_string_t * pstr,Idx new_buf_len)130 re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
131 {
132 #ifdef RE_ENABLE_I18N
133   if (pstr->mb_cur_max > 1)
134     {
135       wint_t *new_wcs = re_xrealloc (pstr->wcs, wint_t, new_buf_len);
136       if (BE (new_wcs == NULL, 0))
137 	return REG_ESPACE;
138       pstr->wcs = new_wcs;
139       if (pstr->offsets != NULL)
140 	{
141 	  Idx *new_offsets = re_xrealloc (pstr->offsets, Idx, new_buf_len);
142 	  if (BE (new_offsets == NULL, 0))
143 	    return REG_ESPACE;
144 	  pstr->offsets = new_offsets;
145 	}
146     }
147 #endif /* RE_ENABLE_I18N  */
148   if (pstr->mbs_allocated)
149     {
150       unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
151 					   new_buf_len);
152       if (BE (new_mbs == NULL, 0))
153 	return REG_ESPACE;
154       pstr->mbs = new_mbs;
155     }
156   pstr->bufs_len = new_buf_len;
157   return REG_NOERROR;
158 }
159 
160 
161 static void
162 internal_function
re_string_construct_common(const char * str,Idx len,re_string_t * pstr,REG_TRANSLATE_TYPE trans,bool icase,const re_dfa_t * dfa)163 re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
164 			    REG_TRANSLATE_TYPE trans, bool icase,
165 			    const re_dfa_t *dfa)
166 {
167   pstr->raw_mbs = (const unsigned char *) str;
168   pstr->len = len;
169   pstr->raw_len = len;
170   pstr->trans = (unsigned REG_TRANSLATE_TYPE) trans;
171   pstr->icase = icase;
172   pstr->mbs_allocated = (trans != NULL || icase);
173   pstr->mb_cur_max = dfa->mb_cur_max;
174   pstr->is_utf8 = dfa->is_utf8;
175   pstr->map_notascii = dfa->map_notascii;
176   pstr->stop = pstr->len;
177   pstr->raw_stop = pstr->stop;
178 }
179 
180 #ifdef RE_ENABLE_I18N
181 
182 /* Build wide character buffer PSTR->WCS.
183    If the byte sequence of the string are:
184      <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
185    Then wide character buffer will be:
186      <wc1>   , WEOF    , <wc2>   , WEOF    , <wc3>
187    We use WEOF for padding, they indicate that the position isn't
188    a first byte of a multibyte character.
189 
190    Note that this function assumes PSTR->VALID_LEN elements are already
191    built and starts from PSTR->VALID_LEN.  */
192 
193 static void
194 internal_function
build_wcs_buffer(re_string_t * pstr)195 build_wcs_buffer (re_string_t *pstr)
196 {
197 #ifdef _LIBC
198   unsigned char buf[MB_LEN_MAX];
199   assert (MB_LEN_MAX >= pstr->mb_cur_max);
200 #else
201   unsigned char buf[64];
202 #endif
203   mbstate_t prev_st;
204   Idx byte_idx, end_idx, remain_len;
205   size_t mbclen;
206 
207   /* Build the buffers from pstr->valid_len to either pstr->len or
208      pstr->bufs_len.  */
209   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
210   for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
211     {
212       wchar_t wc;
213       const char *p;
214 
215       remain_len = end_idx - byte_idx;
216       prev_st = pstr->cur_state;
217       /* Apply the translation if we need.  */
218       if (BE (pstr->trans != NULL, 0))
219 	{
220 	  int i, ch;
221 
222 	  for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
223 	    {
224 	      ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
225 	      buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
226 	    }
227 	  p = (const char *) buf;
228 	}
229       else
230 	p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
231       mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
232       if (BE (mbclen == (size_t) -2, 0))
233 	{
234 	  /* The buffer doesn't have enough space, finish to build.  */
235 	  pstr->cur_state = prev_st;
236 	  break;
237 	}
238       else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
239 	{
240 	  /* We treat these cases as a singlebyte character.  */
241 	  mbclen = 1;
242 	  wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
243 	  if (BE (pstr->trans != NULL, 0))
244 	    wc = pstr->trans[wc];
245 	  pstr->cur_state = prev_st;
246 	}
247 
248       /* Write wide character and padding.  */
249       pstr->wcs[byte_idx++] = wc;
250       /* Write paddings.  */
251       for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
252 	pstr->wcs[byte_idx++] = WEOF;
253     }
254   pstr->valid_len = byte_idx;
255   pstr->valid_raw_len = byte_idx;
256 }
257 
258 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
259    but for REG_ICASE.  */
260 
261 static reg_errcode_t
262 internal_function
build_wcs_upper_buffer(re_string_t * pstr)263 build_wcs_upper_buffer (re_string_t *pstr)
264 {
265   mbstate_t prev_st;
266   Idx src_idx, byte_idx, end_idx, remain_len;
267   size_t mbclen;
268 #ifdef _LIBC
269   char buf[MB_LEN_MAX];
270   assert (MB_LEN_MAX >= pstr->mb_cur_max);
271 #else
272   char buf[64];
273 #endif
274 
275   byte_idx = pstr->valid_len;
276   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
277 
278   /* The following optimization assumes that ASCII characters can be
279      mapped to wide characters with a simple cast.  */
280   if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
281     {
282       while (byte_idx < end_idx)
283 	{
284 	  wchar_t wc;
285 
286 	  if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
287 	      && mbsinit (&pstr->cur_state))
288 	    {
289 	      /* In case of a singlebyte character.  */
290 	      pstr->mbs[byte_idx]
291 		= toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
292 	      /* The next step uses the assumption that wchar_t is encoded
293 		 ASCII-safe: all ASCII values can be converted like this.  */
294 	      pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
295 	      ++byte_idx;
296 	      continue;
297 	    }
298 
299 	  remain_len = end_idx - byte_idx;
300 	  prev_st = pstr->cur_state;
301 	  mbclen = mbrtowc (&wc,
302 			    ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
303 			     + byte_idx), remain_len, &pstr->cur_state);
304 	  if (BE ((size_t) (mbclen + 2) > 2, 1))
305 	    {
306 	      wchar_t wcu = wc;
307 	      if (iswlower (wc))
308 		{
309 		  size_t mbcdlen;
310 
311 		  wcu = towupper (wc);
312 		  mbcdlen = wcrtomb (buf, wcu, &prev_st);
313 		  if (BE (mbclen == mbcdlen, 1))
314 		    memcpy (pstr->mbs + byte_idx, buf, mbclen);
315 		  else
316 		    {
317 		      src_idx = byte_idx;
318 		      goto offsets_needed;
319 		    }
320 		}
321 	      else
322 		memcpy (pstr->mbs + byte_idx,
323 			pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
324 	      pstr->wcs[byte_idx++] = wcu;
325 	      /* Write paddings.  */
326 	      for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
327 		pstr->wcs[byte_idx++] = WEOF;
328 	    }
329 	  else if (mbclen == (size_t) -1 || mbclen == 0)
330 	    {
331 	      /* It is an invalid character or '\0'.  Just use the byte.  */
332 	      int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
333 	      pstr->mbs[byte_idx] = ch;
334 	      /* And also cast it to wide char.  */
335 	      pstr->wcs[byte_idx++] = (wchar_t) ch;
336 	      if (BE (mbclen == (size_t) -1, 0))
337 		pstr->cur_state = prev_st;
338 	    }
339 	  else
340 	    {
341 	      /* The buffer doesn't have enough space, finish to build.  */
342 	      pstr->cur_state = prev_st;
343 	      break;
344 	    }
345 	}
346       pstr->valid_len = byte_idx;
347       pstr->valid_raw_len = byte_idx;
348       return REG_NOERROR;
349     }
350   else
351     for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
352       {
353 	wchar_t wc;
354 	const char *p;
355       offsets_needed:
356 	remain_len = end_idx - byte_idx;
357 	prev_st = pstr->cur_state;
358 	if (BE (pstr->trans != NULL, 0))
359 	  {
360 	    int i, ch;
361 
362 	    for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
363 	      {
364 		ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
365 		buf[i] = pstr->trans[ch];
366 	      }
367 	    p = (const char *) buf;
368 	  }
369 	else
370 	  p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
371 	mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
372 	if (BE ((size_t) (mbclen + 2) > 2, 1))
373 	  {
374 	    wchar_t wcu = wc;
375 	    if (iswlower (wc))
376 	      {
377 		size_t mbcdlen;
378 
379 		wcu = towupper (wc);
380 		mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
381 		if (BE (mbclen == mbcdlen, 1))
382 		  memcpy (pstr->mbs + byte_idx, buf, mbclen);
383 		else if (mbcdlen != (size_t) -1)
384 		  {
385 		    size_t i;
386 
387 		    if (byte_idx + mbcdlen > pstr->bufs_len)
388 		      {
389 			pstr->cur_state = prev_st;
390 			break;
391 		      }
392 
393 		    if (pstr->offsets == NULL)
394 		      {
395 			pstr->offsets = re_xmalloc (Idx, pstr->bufs_len);
396 
397 			if (pstr->offsets == NULL)
398 			  return REG_ESPACE;
399 		      }
400 		    if (!pstr->offsets_needed)
401 		      {
402 			for (i = 0; i < (size_t) byte_idx; ++i)
403 			  pstr->offsets[i] = i;
404 			pstr->offsets_needed = 1;
405 		      }
406 
407 		    memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
408 		    pstr->wcs[byte_idx] = wcu;
409 		    pstr->offsets[byte_idx] = src_idx;
410 		    for (i = 1; i < mbcdlen; ++i)
411 		      {
412 			pstr->offsets[byte_idx + i]
413 			  = src_idx + (i < mbclen ? i : mbclen - 1);
414 			pstr->wcs[byte_idx + i] = WEOF;
415 		      }
416 		    pstr->len += mbcdlen - mbclen;
417 		    if (pstr->raw_stop > src_idx)
418 		      pstr->stop += mbcdlen - mbclen;
419 		    end_idx = (pstr->bufs_len > pstr->len)
420 			      ? pstr->len : pstr->bufs_len;
421 		    byte_idx += mbcdlen;
422 		    src_idx += mbclen;
423 		    continue;
424 		  }
425                 else
426                   memcpy (pstr->mbs + byte_idx, p, mbclen);
427 	      }
428 	    else
429 	      memcpy (pstr->mbs + byte_idx, p, mbclen);
430 
431 	    if (BE (pstr->offsets_needed != 0, 0))
432 	      {
433 		size_t i;
434 		for (i = 0; i < mbclen; ++i)
435 		  pstr->offsets[byte_idx + i] = src_idx + i;
436 	      }
437 	    src_idx += mbclen;
438 
439 	    pstr->wcs[byte_idx++] = wcu;
440 	    /* Write paddings.  */
441 	    for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
442 	      pstr->wcs[byte_idx++] = WEOF;
443 	  }
444 	else if (mbclen == (size_t) -1 || mbclen == 0)
445 	  {
446 	    /* It is an invalid character or '\0'.  Just use the byte.  */
447 	    int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
448 
449 	    if (BE (pstr->trans != NULL, 0))
450 	      ch = pstr->trans [ch];
451 	    pstr->mbs[byte_idx] = ch;
452 
453 	    if (BE (pstr->offsets_needed != 0, 0))
454 	      pstr->offsets[byte_idx] = src_idx;
455 	    ++src_idx;
456 
457 	    /* And also cast it to wide char.  */
458 	    pstr->wcs[byte_idx++] = (wchar_t) ch;
459 	    if (BE (mbclen == (size_t) -1, 0))
460 	      pstr->cur_state = prev_st;
461 	  }
462 	else
463 	  {
464 	    /* The buffer doesn't have enough space, finish to build.  */
465 	    pstr->cur_state = prev_st;
466 	    break;
467 	  }
468       }
469   pstr->valid_len = byte_idx;
470   pstr->valid_raw_len = src_idx;
471   return REG_NOERROR;
472 }
473 
474 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
475    Return the index.  */
476 
477 static Idx
478 internal_function
re_string_skip_chars(re_string_t * pstr,Idx new_raw_idx,wint_t * last_wc)479 re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
480 {
481   mbstate_t prev_st;
482   Idx rawbuf_idx;
483   size_t mbclen;
484   wchar_t wc = 0;
485 
486   /* Skip the characters which are not necessary to check.  */
487   for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
488        rawbuf_idx < new_raw_idx;)
489     {
490       Idx remain_len;
491       remain_len = pstr->len - rawbuf_idx;
492       prev_st = pstr->cur_state;
493       mbclen = mbrtowc (&wc, (const char *) pstr->raw_mbs + rawbuf_idx,
494 			remain_len, &pstr->cur_state);
495       if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
496 	{
497 	  /* We treat these cases as a singlebyte character.  */
498 	  mbclen = 1;
499 	  pstr->cur_state = prev_st;
500 	}
501       /* Then proceed the next character.  */
502       rawbuf_idx += mbclen;
503     }
504   *last_wc = (wint_t) wc;
505   return rawbuf_idx;
506 }
507 #endif /* RE_ENABLE_I18N  */
508 
509 /* Build the buffer PSTR->MBS, and apply the translation if we need.
510    This function is used in case of REG_ICASE.  */
511 
512 static void
513 internal_function
build_upper_buffer(re_string_t * pstr)514 build_upper_buffer (re_string_t *pstr)
515 {
516   Idx char_idx, end_idx;
517   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
518 
519   for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
520     {
521       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
522       if (BE (pstr->trans != NULL, 0))
523 	ch = pstr->trans[ch];
524       if (islower (ch))
525 	pstr->mbs[char_idx] = toupper (ch);
526       else
527 	pstr->mbs[char_idx] = ch;
528     }
529   pstr->valid_len = char_idx;
530   pstr->valid_raw_len = char_idx;
531 }
532 
533 /* Apply TRANS to the buffer in PSTR.  */
534 
535 static void
536 internal_function
re_string_translate_buffer(re_string_t * pstr)537 re_string_translate_buffer (re_string_t *pstr)
538 {
539   Idx buf_idx, end_idx;
540   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
541 
542   for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
543     {
544       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
545       pstr->mbs[buf_idx] = pstr->trans[ch];
546     }
547 
548   pstr->valid_len = buf_idx;
549   pstr->valid_raw_len = buf_idx;
550 }
551 
552 /* This function re-construct the buffers.
553    Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
554    convert to upper case in case of REG_ICASE, apply translation.  */
555 
556 static reg_errcode_t
557 internal_function
re_string_reconstruct(re_string_t * pstr,Idx idx,int eflags)558 re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
559 {
560   Idx offset;
561 
562   if (BE (pstr->raw_mbs_idx <= idx, 0))
563     offset = idx - pstr->raw_mbs_idx;
564   else
565     {
566       /* Reset buffer.  */
567 #ifdef RE_ENABLE_I18N
568       if (pstr->mb_cur_max > 1)
569 	memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
570 #endif /* RE_ENABLE_I18N */
571       pstr->len = pstr->raw_len;
572       pstr->stop = pstr->raw_stop;
573       pstr->valid_len = 0;
574       pstr->raw_mbs_idx = 0;
575       pstr->valid_raw_len = 0;
576       pstr->offsets_needed = 0;
577       pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
578 			   : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
579       if (!pstr->mbs_allocated)
580 	pstr->mbs = (unsigned char *) pstr->raw_mbs;
581       offset = idx;
582     }
583 
584   if (BE (offset != 0, 1))
585     {
586       /* Are the characters which are already checked remain?  */
587       if (BE (offset < pstr->valid_raw_len, 1)
588 #ifdef RE_ENABLE_I18N
589 	  /* Handling this would enlarge the code too much.
590 	     Accept a slowdown in that case.  */
591 	  && pstr->offsets_needed == 0
592 #endif
593 	 )
594 	{
595 	  /* Yes, move them to the front of the buffer.  */
596 	  pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags);
597 #ifdef RE_ENABLE_I18N
598 	  if (pstr->mb_cur_max > 1)
599 	    memmove (pstr->wcs, pstr->wcs + offset,
600 		     (pstr->valid_len - offset) * sizeof (wint_t));
601 #endif /* RE_ENABLE_I18N */
602 	  if (BE (pstr->mbs_allocated, 0))
603 	    memmove (pstr->mbs, pstr->mbs + offset,
604 		     pstr->valid_len - offset);
605 	  pstr->valid_len -= offset;
606 	  pstr->valid_raw_len -= offset;
607 #if DEBUG
608 	  assert (pstr->valid_len > 0);
609 #endif
610 	}
611       else
612 	{
613 	  /* No, skip all characters until IDX.  */
614 #ifdef RE_ENABLE_I18N
615 	  if (BE (pstr->offsets_needed, 0))
616 	    {
617 	      pstr->len = pstr->raw_len - idx + offset;
618 	      pstr->stop = pstr->raw_stop - idx + offset;
619 	      pstr->offsets_needed = 0;
620 	    }
621 #endif
622 	  pstr->valid_len = 0;
623 	  pstr->valid_raw_len = 0;
624 #ifdef RE_ENABLE_I18N
625 	  if (pstr->mb_cur_max > 1)
626 	    {
627 	      Idx wcs_idx;
628 	      wint_t wc = WEOF;
629 
630 	      if (pstr->is_utf8)
631 		{
632 		  const unsigned char *raw, *p, *end;
633 
634 		  /* Special case UTF-8.  Multi-byte chars start with any
635 		     byte other than 0x80 - 0xbf.  */
636 		  raw = pstr->raw_mbs + pstr->raw_mbs_idx;
637 		  end = raw + (offset - pstr->mb_cur_max);
638 		  for (p = raw + offset - 1; p >= end; --p)
639 		    if ((*p & 0xc0) != 0x80)
640 		      {
641 			mbstate_t cur_state;
642 			wchar_t wc2;
643 			Idx mlen = raw + pstr->len - p;
644 			size_t mbclen;
645 
646 			/* XXX Don't use mbrtowc, we know which conversion
647 			   to use (UTF-8 -> UCS4).  */
648 			memset (&cur_state, 0, sizeof (cur_state));
649 			mbclen = mbrtowc (&wc2, (const char *) p, mlen,
650 					  &cur_state);
651 			if (raw + offset - p <= mbclen && mbclen < (size_t) -2)
652 			  {
653 			    memset (&pstr->cur_state, '\0',
654 				    sizeof (mbstate_t));
655 			    pstr->valid_len = mbclen - (raw + offset - p);
656 			    wc = wc2;
657 			  }
658 			break;
659 		      }
660 		}
661 
662 	      if (wc == WEOF)
663 		pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
664 	      if (BE (pstr->valid_len, 0))
665 		{
666 		  for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
667 		    pstr->wcs[wcs_idx] = WEOF;
668 		  if (pstr->mbs_allocated)
669 		    memset (pstr->mbs, -1, pstr->valid_len);
670 		}
671 	      pstr->valid_raw_len = pstr->valid_len;
672 	      pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
673 				    && IS_WIDE_WORD_CHAR (wc))
674 				   ? CONTEXT_WORD
675 				   : ((IS_WIDE_NEWLINE (wc)
676 				       && pstr->newline_anchor)
677 				      ? CONTEXT_NEWLINE : 0));
678 	    }
679 	  else
680 #endif /* RE_ENABLE_I18N */
681 	    {
682 	      int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
683 	      if (pstr->trans)
684 		c = pstr->trans[c];
685 	      pstr->tip_context = (bitset_contain (pstr->word_char, c)
686 				   ? CONTEXT_WORD
687 				   : ((IS_NEWLINE (c) && pstr->newline_anchor)
688 				      ? CONTEXT_NEWLINE : 0));
689 	    }
690 	}
691       if (!BE (pstr->mbs_allocated, 0))
692 	pstr->mbs += offset;
693     }
694   pstr->raw_mbs_idx = idx;
695   pstr->len -= offset;
696   pstr->stop -= offset;
697 
698   /* Then build the buffers.  */
699 #ifdef RE_ENABLE_I18N
700   if (pstr->mb_cur_max > 1)
701     {
702       if (pstr->icase)
703 	{
704 	  reg_errcode_t ret = build_wcs_upper_buffer (pstr);
705 	  if (BE (ret != REG_NOERROR, 0))
706 	    return ret;
707 	}
708       else
709 	build_wcs_buffer (pstr);
710     }
711   else
712 #endif /* RE_ENABLE_I18N */
713   if (BE (pstr->mbs_allocated, 0))
714     {
715       if (pstr->icase)
716 	build_upper_buffer (pstr);
717       else if (pstr->trans != NULL)
718 	re_string_translate_buffer (pstr);
719     }
720   else
721     pstr->valid_len = pstr->len;
722 
723   pstr->cur_idx = 0;
724   return REG_NOERROR;
725 }
726 
727 static unsigned char
internal_function(pure)728 internal_function __attribute ((pure))
729 re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
730 {
731   int ch;
732   Idx off;
733 
734   /* Handle the common (easiest) cases first.  */
735   if (BE (!pstr->mbs_allocated, 1))
736     return re_string_peek_byte (pstr, idx);
737 
738 #ifdef RE_ENABLE_I18N
739   if (pstr->mb_cur_max > 1
740       && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
741     return re_string_peek_byte (pstr, idx);
742 #endif
743 
744   off = pstr->cur_idx + idx;
745 #ifdef RE_ENABLE_I18N
746   if (pstr->offsets_needed)
747     off = pstr->offsets[off];
748 #endif
749 
750   ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
751 
752 #ifdef RE_ENABLE_I18N
753   /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
754      this function returns CAPITAL LETTER I instead of first byte of
755      DOTLESS SMALL LETTER I.  The latter would confuse the parser,
756      since peek_byte_case doesn't advance cur_idx in any way.  */
757   if (pstr->offsets_needed && !isascii (ch))
758     return re_string_peek_byte (pstr, idx);
759 #endif
760 
761   return ch;
762 }
763 
764 static unsigned char
internal_function(pure)765 internal_function __attribute ((pure))
766 re_string_fetch_byte_case (re_string_t *pstr)
767 {
768   if (BE (!pstr->mbs_allocated, 1))
769     return re_string_fetch_byte (pstr);
770 
771 #ifdef RE_ENABLE_I18N
772   if (pstr->offsets_needed)
773     {
774       Idx off;
775       int ch;
776 
777       /* For tr_TR.UTF-8 [[:islower:]] there is
778 	 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs.  Skip
779 	 in that case the whole multi-byte character and return
780 	 the original letter.  On the other side, with
781 	 [[: DOTLESS SMALL LETTER I return [[:I, as doing
782 	 anything else would complicate things too much.  */
783 
784       if (!re_string_first_byte (pstr, pstr->cur_idx))
785 	return re_string_fetch_byte (pstr);
786 
787       off = pstr->offsets[pstr->cur_idx];
788       ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
789 
790       if (! isascii (ch))
791 	return re_string_fetch_byte (pstr);
792 
793       re_string_skip_bytes (pstr,
794 			    re_string_char_size_at (pstr, pstr->cur_idx));
795       return ch;
796     }
797 #endif
798 
799   return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
800 }
801 
802 static void
803 internal_function
re_string_destruct(re_string_t * pstr)804 re_string_destruct (re_string_t *pstr)
805 {
806 #ifdef RE_ENABLE_I18N
807   re_free (pstr->wcs);
808   re_free (pstr->offsets);
809 #endif /* RE_ENABLE_I18N  */
810   if (pstr->mbs_allocated)
811     re_free (pstr->mbs);
812 }
813 
814 /* Return the context at IDX in INPUT.  */
815 
816 static unsigned int
817 internal_function
re_string_context_at(const re_string_t * input,Idx idx,int eflags)818 re_string_context_at (const re_string_t *input, Idx idx, int eflags)
819 {
820   int c;
821   if (BE (! REG_VALID_INDEX (idx), 0))
822     /* In this case, we use the value stored in input->tip_context,
823        since we can't know the character in input->mbs[-1] here.  */
824     return input->tip_context;
825   if (BE (idx == input->len, 0))
826     return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
827 	    : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
828 #ifdef RE_ENABLE_I18N
829   if (input->mb_cur_max > 1)
830     {
831       wint_t wc;
832       Idx wc_idx = idx;
833       while(input->wcs[wc_idx] == WEOF)
834 	{
835 #ifdef DEBUG
836 	  /* It must not happen.  */
837 	  assert (REG_VALID_INDEX (wc_idx));
838 #endif
839 	  --wc_idx;
840 	  if (! REG_VALID_INDEX (wc_idx))
841 	    return input->tip_context;
842 	}
843       wc = input->wcs[wc_idx];
844       if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
845 	return CONTEXT_WORD;
846       return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
847 	      ? CONTEXT_NEWLINE : 0);
848     }
849   else
850 #endif
851     {
852       c = re_string_byte_at (input, idx);
853       if (bitset_contain (input->word_char, c))
854 	return CONTEXT_WORD;
855       return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
856     }
857 }
858 
859 /* Functions for set operation.  */
860 
861 static reg_errcode_t
862 internal_function
re_node_set_alloc(re_node_set * set,Idx size)863 re_node_set_alloc (re_node_set *set, Idx size)
864 {
865   set->alloc = size;
866   set->nelem = 0;
867   set->elems = re_xmalloc (Idx, size);
868   if (BE (set->elems == NULL, 0))
869     return REG_ESPACE;
870   return REG_NOERROR;
871 }
872 
873 static reg_errcode_t
874 internal_function
re_node_set_init_1(re_node_set * set,Idx elem)875 re_node_set_init_1 (re_node_set *set, Idx elem)
876 {
877   set->alloc = 1;
878   set->nelem = 1;
879   set->elems = re_malloc (Idx, 1);
880   if (BE (set->elems == NULL, 0))
881     {
882       set->alloc = set->nelem = 0;
883       return REG_ESPACE;
884     }
885   set->elems[0] = elem;
886   return REG_NOERROR;
887 }
888 
889 static reg_errcode_t
890 internal_function
re_node_set_init_2(re_node_set * set,Idx elem1,Idx elem2)891 re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
892 {
893   set->alloc = 2;
894   set->elems = re_malloc (Idx, 2);
895   if (BE (set->elems == NULL, 0))
896     return REG_ESPACE;
897   if (elem1 == elem2)
898     {
899       set->nelem = 1;
900       set->elems[0] = elem1;
901     }
902   else
903     {
904       set->nelem = 2;
905       if (elem1 < elem2)
906 	{
907 	  set->elems[0] = elem1;
908 	  set->elems[1] = elem2;
909 	}
910       else
911 	{
912 	  set->elems[0] = elem2;
913 	  set->elems[1] = elem1;
914 	}
915     }
916   return REG_NOERROR;
917 }
918 
919 static reg_errcode_t
920 internal_function
re_node_set_init_copy(re_node_set * dest,const re_node_set * src)921 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
922 {
923   dest->nelem = src->nelem;
924   if (src->nelem > 0)
925     {
926       dest->alloc = dest->nelem;
927       dest->elems = re_malloc (Idx, dest->alloc);
928       if (BE (dest->elems == NULL, 0))
929 	{
930 	  dest->alloc = dest->nelem = 0;
931 	  return REG_ESPACE;
932 	}
933       memcpy (dest->elems, src->elems, src->nelem * sizeof dest->elems[0]);
934     }
935   else
936     re_node_set_init_empty (dest);
937   return REG_NOERROR;
938 }
939 
940 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
941    DEST. Return value indicate the error code or REG_NOERROR if succeeded.
942    Note: We assume dest->elems is NULL, when dest->alloc is 0.  */
943 
944 static reg_errcode_t
945 internal_function
re_node_set_add_intersect(re_node_set * dest,const re_node_set * src1,const re_node_set * src2)946 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
947 			   const re_node_set *src2)
948 {
949   Idx i1, i2, is, id, delta, sbase;
950   if (src1->nelem == 0 || src2->nelem == 0)
951     return REG_NOERROR;
952 
953   /* We need dest->nelem + 2 * elems_in_intersection; this is a
954      conservative estimate.  */
955   if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
956     {
957       Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
958       Idx *new_elems;
959       if (sizeof (Idx) < 3
960 	  && (new_alloc < dest->alloc
961 	      || ((Idx) (src1->nelem + src2->nelem) < src1->nelem)))
962 	return REG_ESPACE;
963       new_elems = re_xrealloc (dest->elems, Idx, new_alloc);
964       if (BE (new_elems == NULL, 0))
965         return REG_ESPACE;
966       dest->elems = new_elems;
967       dest->alloc = new_alloc;
968     }
969 
970   /* Find the items in the intersection of SRC1 and SRC2, and copy
971      into the top of DEST those that are not already in DEST itself.  */
972   sbase = dest->nelem + src1->nelem + src2->nelem;
973   i1 = src1->nelem - 1;
974   i2 = src2->nelem - 1;
975   id = dest->nelem - 1;
976   for (;;)
977     {
978       if (src1->elems[i1] == src2->elems[i2])
979 	{
980 	  /* Try to find the item in DEST.  Maybe we could binary search?  */
981 	  while (REG_VALID_INDEX (id) && dest->elems[id] > src1->elems[i1])
982 	    --id;
983 
984           if (! REG_VALID_INDEX (id) || dest->elems[id] != src1->elems[i1])
985             dest->elems[--sbase] = src1->elems[i1];
986 
987 	  if (! REG_VALID_INDEX (--i1) || ! REG_VALID_INDEX (--i2))
988 	    break;
989 	}
990 
991       /* Lower the highest of the two items.  */
992       else if (src1->elems[i1] < src2->elems[i2])
993 	{
994 	  if (! REG_VALID_INDEX (--i2))
995 	    break;
996 	}
997       else
998 	{
999 	  if (! REG_VALID_INDEX (--i1))
1000 	    break;
1001 	}
1002     }
1003 
1004   id = dest->nelem - 1;
1005   is = dest->nelem + src1->nelem + src2->nelem - 1;
1006   delta = is - sbase + 1;
1007 
1008   /* Now copy.  When DELTA becomes zero, the remaining
1009      DEST elements are already in place; this is more or
1010      less the same loop that is in re_node_set_merge.  */
1011   dest->nelem += delta;
1012   if (delta > 0 && REG_VALID_INDEX (id))
1013     for (;;)
1014       {
1015         if (dest->elems[is] > dest->elems[id])
1016           {
1017             /* Copy from the top.  */
1018             dest->elems[id + delta--] = dest->elems[is--];
1019             if (delta == 0)
1020               break;
1021           }
1022         else
1023           {
1024             /* Slide from the bottom.  */
1025             dest->elems[id + delta] = dest->elems[id];
1026             if (! REG_VALID_INDEX (--id))
1027               break;
1028           }
1029       }
1030 
1031   /* Copy remaining SRC elements.  */
1032   memcpy (dest->elems, dest->elems + sbase, delta * sizeof dest->elems[0]);
1033 
1034   return REG_NOERROR;
1035 }
1036 
1037 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1038    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1039 
1040 static reg_errcode_t
1041 internal_function
re_node_set_init_union(re_node_set * dest,const re_node_set * src1,const re_node_set * src2)1042 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1043 			const re_node_set *src2)
1044 {
1045   Idx i1, i2, id;
1046   if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1047     {
1048       dest->alloc = src1->nelem + src2->nelem;
1049       if (sizeof (Idx) < 2 && dest->alloc < src1->nelem)
1050 	return REG_ESPACE;
1051       dest->elems = re_xmalloc (Idx, dest->alloc);
1052       if (BE (dest->elems == NULL, 0))
1053 	return REG_ESPACE;
1054     }
1055   else
1056     {
1057       if (src1 != NULL && src1->nelem > 0)
1058 	return re_node_set_init_copy (dest, src1);
1059       else if (src2 != NULL && src2->nelem > 0)
1060 	return re_node_set_init_copy (dest, src2);
1061       else
1062 	re_node_set_init_empty (dest);
1063       return REG_NOERROR;
1064     }
1065   for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1066     {
1067       if (src1->elems[i1] > src2->elems[i2])
1068 	{
1069 	  dest->elems[id++] = src2->elems[i2++];
1070 	  continue;
1071 	}
1072       if (src1->elems[i1] == src2->elems[i2])
1073 	++i2;
1074       dest->elems[id++] = src1->elems[i1++];
1075     }
1076   if (i1 < src1->nelem)
1077     {
1078       memcpy (dest->elems + id, src1->elems + i1,
1079 	     (src1->nelem - i1) * sizeof dest->elems[0]);
1080       id += src1->nelem - i1;
1081     }
1082   else if (i2 < src2->nelem)
1083     {
1084       memcpy (dest->elems + id, src2->elems + i2,
1085 	     (src2->nelem - i2) * sizeof dest->elems[0]);
1086       id += src2->nelem - i2;
1087     }
1088   dest->nelem = id;
1089   return REG_NOERROR;
1090 }
1091 
1092 /* Calculate the union set of the sets DEST and SRC. And store it to
1093    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1094 
1095 static reg_errcode_t
1096 internal_function
re_node_set_merge(re_node_set * dest,const re_node_set * src)1097 re_node_set_merge (re_node_set *dest, const re_node_set *src)
1098 {
1099   Idx is, id, sbase, delta;
1100   if (src == NULL || src->nelem == 0)
1101     return REG_NOERROR;
1102   if (sizeof (Idx) < 3
1103       && ((Idx) (2 * src->nelem) < src->nelem
1104 	  || (Idx) (2 * src->nelem + dest->nelem) < dest->nelem))
1105     return REG_ESPACE;
1106   if (dest->alloc < 2 * src->nelem + dest->nelem)
1107     {
1108       Idx new_alloc = src->nelem + dest->alloc;
1109       Idx *new_buffer;
1110       if (sizeof (Idx) < 4 && new_alloc < dest->alloc)
1111 	return REG_ESPACE;
1112       new_buffer = re_x2realloc (dest->elems, Idx, &new_alloc);
1113       if (BE (new_buffer == NULL, 0))
1114 	return REG_ESPACE;
1115       dest->elems = new_buffer;
1116       dest->alloc = new_alloc;
1117     }
1118 
1119   if (BE (dest->nelem == 0, 0))
1120     {
1121       dest->nelem = src->nelem;
1122       memcpy (dest->elems, src->elems, src->nelem * sizeof dest->elems[0]);
1123       return REG_NOERROR;
1124     }
1125 
1126   /* Copy into the top of DEST the items of SRC that are not
1127      found in DEST.  Maybe we could binary search in DEST?  */
1128   for (sbase = dest->nelem + 2 * src->nelem,
1129        is = src->nelem - 1, id = dest->nelem - 1;
1130        REG_VALID_INDEX (is) && REG_VALID_INDEX (id); )
1131     {
1132       if (dest->elems[id] == src->elems[is])
1133         is--, id--;
1134       else if (dest->elems[id] < src->elems[is])
1135         dest->elems[--sbase] = src->elems[is--];
1136       else /* if (dest->elems[id] > src->elems[is]) */
1137         --id;
1138     }
1139 
1140   if (REG_VALID_INDEX (is))
1141     {
1142       /* If DEST is exhausted, the remaining items of SRC must be unique.  */
1143       sbase -= is + 1;
1144       memcpy (dest->elems + sbase, src->elems,
1145 	      (is + 1) * sizeof dest->elems[0]);
1146     }
1147 
1148   id = dest->nelem - 1;
1149   is = dest->nelem + 2 * src->nelem - 1;
1150   delta = is - sbase + 1;
1151   if (delta == 0)
1152     return REG_NOERROR;
1153 
1154   /* Now copy.  When DELTA becomes zero, the remaining
1155      DEST elements are already in place.  */
1156   dest->nelem += delta;
1157   for (;;)
1158     {
1159       if (dest->elems[is] > dest->elems[id])
1160         {
1161 	  /* Copy from the top.  */
1162           dest->elems[id + delta--] = dest->elems[is--];
1163 	  if (delta == 0)
1164 	    break;
1165 	}
1166       else
1167         {
1168           /* Slide from the bottom.  */
1169           dest->elems[id + delta] = dest->elems[id];
1170 	  if (! REG_VALID_INDEX (--id))
1171 	    {
1172 	      /* Copy remaining SRC elements.  */
1173 	      memcpy (dest->elems, dest->elems + sbase,
1174 	              delta * sizeof dest->elems[0]);
1175 	      break;
1176 	    }
1177 	}
1178     }
1179 
1180   return REG_NOERROR;
1181 }
1182 
1183 /* Insert the new element ELEM to the re_node_set* SET.
1184    SET should not already have ELEM.
1185    Return true if successful.  */
1186 
1187 static bool
1188 internal_function
re_node_set_insert(re_node_set * set,Idx elem)1189 re_node_set_insert (re_node_set *set, Idx elem)
1190 {
1191   Idx idx;
1192   /* In case the set is empty.  */
1193   if (set->alloc == 0)
1194     return re_node_set_init_1 (set, elem) == REG_NOERROR;
1195 
1196   if (BE (set->nelem, 0) == 0)
1197     {
1198       /* We already guaranteed above that set->alloc != 0.  */
1199       set->elems[0] = elem;
1200       ++set->nelem;
1201       return true;
1202     }
1203 
1204   /* Realloc if we need.  */
1205   if (set->alloc == set->nelem)
1206     {
1207       Idx *new_elems = re_x2realloc (set->elems, Idx, &set->alloc);
1208       if (BE (new_elems == NULL, 0))
1209 	return false;
1210       set->elems = new_elems;
1211     }
1212 
1213   /* Move the elements which follows the new element.  Test the
1214      first element separately to skip a check in the inner loop.  */
1215   if (elem < set->elems[0])
1216     {
1217       idx = 0;
1218       for (idx = set->nelem; idx > 0; idx--)
1219         set->elems[idx] = set->elems[idx - 1];
1220     }
1221   else
1222     {
1223       for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1224         set->elems[idx] = set->elems[idx - 1];
1225     }
1226 
1227   /* Insert the new element.  */
1228   set->elems[idx] = elem;
1229   ++set->nelem;
1230   return true;
1231 }
1232 
1233 /* Insert the new element ELEM to the re_node_set* SET.
1234    SET should not already have any element greater than or equal to ELEM.
1235    Return true if successful.  */
1236 
1237 static bool
1238 internal_function
re_node_set_insert_last(re_node_set * set,Idx elem)1239 re_node_set_insert_last (re_node_set *set, Idx elem)
1240 {
1241   /* Realloc if we need.  */
1242   if (set->alloc == set->nelem)
1243     {
1244       Idx *new_elems;
1245       new_elems = re_x2realloc (set->elems, Idx, &set->alloc);
1246       if (BE (new_elems == NULL, 0))
1247 	return false;
1248       set->elems = new_elems;
1249     }
1250 
1251   /* Insert the new element.  */
1252   set->elems[set->nelem++] = elem;
1253   return true;
1254 }
1255 
1256 /* Compare two node sets SET1 and SET2.
1257    Return true if SET1 and SET2 are equivalent.  */
1258 
1259 static bool
internal_function(pure)1260 internal_function __attribute ((pure))
1261 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1262 {
1263   Idx i;
1264   if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1265     return false;
1266   for (i = set1->nelem ; REG_VALID_INDEX (--i) ; )
1267     if (set1->elems[i] != set2->elems[i])
1268       return false;
1269   return true;
1270 }
1271 
1272 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise.  */
1273 
1274 static Idx
internal_function(pure)1275 internal_function __attribute ((pure))
1276 re_node_set_contains (const re_node_set *set, Idx elem)
1277 {
1278   __re_size_t idx, right, mid;
1279   if (! REG_VALID_NONZERO_INDEX (set->nelem))
1280     return 0;
1281 
1282   /* Binary search the element.  */
1283   idx = 0;
1284   right = set->nelem - 1;
1285   while (idx < right)
1286     {
1287       mid = (idx + right) / 2;
1288       if (set->elems[mid] < elem)
1289 	idx = mid + 1;
1290       else
1291 	right = mid;
1292     }
1293   return set->elems[idx] == elem ? idx + 1 : 0;
1294 }
1295 
1296 static void
1297 internal_function
re_node_set_remove_at(re_node_set * set,Idx idx)1298 re_node_set_remove_at (re_node_set *set, Idx idx)
1299 {
1300   if (idx < 0 || idx >= set->nelem)
1301     return;
1302   --set->nelem;
1303   for (; idx < set->nelem; idx++)
1304     set->elems[idx] = set->elems[idx + 1];
1305 }
1306 
1307 
1308 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1309    Or return REG_MISSING if an error occurred.  */
1310 
1311 static Idx
1312 internal_function
re_dfa_add_node(re_dfa_t * dfa,re_token_t token)1313 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1314 {
1315   int type = token.type;
1316   if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1317     {
1318       Idx new_nodes_alloc = dfa->nodes_alloc;
1319       Idx *new_nexts, *new_indices;
1320       re_node_set *new_edests, *new_eclosures;
1321 
1322       re_token_t *new_nodes = re_x2realloc (dfa->nodes, re_token_t,
1323 					    &new_nodes_alloc);
1324       if (BE (new_nodes == NULL, 0))
1325 	return REG_MISSING;
1326       dfa->nodes = new_nodes;
1327       new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1328       new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
1329       new_edests = re_xrealloc (dfa->edests, re_node_set, new_nodes_alloc);
1330       new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1331       if (BE (new_nexts == NULL || new_indices == NULL
1332 	      || new_edests == NULL || new_eclosures == NULL, 0))
1333 	return REG_MISSING;
1334       dfa->nexts = new_nexts;
1335       dfa->org_indices = new_indices;
1336       dfa->edests = new_edests;
1337       dfa->eclosures = new_eclosures;
1338       dfa->nodes_alloc = new_nodes_alloc;
1339     }
1340   dfa->nodes[dfa->nodes_len] = token;
1341   dfa->nodes[dfa->nodes_len].constraint = 0;
1342 #ifdef RE_ENABLE_I18N
1343   dfa->nodes[dfa->nodes_len].accept_mb =
1344     (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
1345 #endif
1346   dfa->nexts[dfa->nodes_len] = REG_MISSING;
1347   re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1348   re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1349   return dfa->nodes_len++;
1350 }
1351 
1352 static inline re_hashval_t
1353 internal_function
calc_state_hash(const re_node_set * nodes,unsigned int context)1354 calc_state_hash (const re_node_set *nodes, unsigned int context)
1355 {
1356   re_hashval_t hash = nodes->nelem + context;
1357   Idx i;
1358   for (i = 0 ; i < nodes->nelem ; i++)
1359     hash += nodes->elems[i];
1360   return hash;
1361 }
1362 
1363 /* Search for the state whose node_set is equivalent to NODES.
1364    Return the pointer to the state, if we found it in the DFA.
1365    Otherwise create the new one and return it.  In case of an error
1366    return NULL and set the error code in ERR.
1367    Note: - We assume NULL as the invalid state, then it is possible that
1368 	   return value is NULL and ERR is REG_NOERROR.
1369 	 - We never return non-NULL value in case of any errors, it is for
1370 	   optimization.  */
1371 
1372 static re_dfastate_t*
1373 internal_function
re_acquire_state(reg_errcode_t * err,re_dfa_t * dfa,const re_node_set * nodes)1374 re_acquire_state (reg_errcode_t *err, re_dfa_t *dfa, const re_node_set *nodes)
1375 {
1376   re_hashval_t hash;
1377   re_dfastate_t *new_state;
1378   struct re_state_table_entry *spot;
1379   Idx i;
1380 #ifdef lint
1381   /* Suppress bogus uninitialized-variable warnings.  */
1382   *err = REG_NOERROR;
1383 #endif
1384   if (BE (nodes->nelem == 0, 0))
1385     {
1386       *err = REG_NOERROR;
1387       return NULL;
1388     }
1389   hash = calc_state_hash (nodes, 0);
1390   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1391 
1392   for (i = 0 ; i < spot->num ; i++)
1393     {
1394       re_dfastate_t *state = spot->array[i];
1395       if (hash != state->hash)
1396 	continue;
1397       if (re_node_set_compare (&state->nodes, nodes))
1398 	return state;
1399     }
1400 
1401   /* There are no appropriate state in the dfa, create the new one.  */
1402   new_state = create_ci_newstate (dfa, nodes, hash);
1403   if (BE (new_state != NULL, 1))
1404     return new_state;
1405   else
1406     {
1407       *err = REG_ESPACE;
1408       return NULL;
1409     }
1410 }
1411 
1412 /* Search for the state whose node_set is equivalent to NODES and
1413    whose context is equivalent to CONTEXT.
1414    Return the pointer to the state, if we found it in the DFA.
1415    Otherwise create the new one and return it.  In case of an error
1416    return NULL and set the error code in ERR.
1417    Note: - We assume NULL as the invalid state, then it is possible that
1418 	   return value is NULL and ERR is REG_NOERROR.
1419 	 - We never return non-NULL value in case of any errors, it is for
1420 	   optimization.  */
1421 
1422 static re_dfastate_t*
1423 internal_function
re_acquire_state_context(reg_errcode_t * err,re_dfa_t * dfa,const re_node_set * nodes,unsigned int context)1424 re_acquire_state_context (reg_errcode_t *err, re_dfa_t *dfa,
1425 			  const re_node_set *nodes, unsigned int context)
1426 {
1427   re_hashval_t hash;
1428   re_dfastate_t *new_state;
1429   struct re_state_table_entry *spot;
1430   Idx i;
1431 #ifdef lint
1432   /* Suppress bogus uninitialized-variable warnings.  */
1433   *err = REG_NOERROR;
1434 #endif
1435   if (nodes->nelem == 0)
1436     {
1437       *err = REG_NOERROR;
1438       return NULL;
1439     }
1440   hash = calc_state_hash (nodes, context);
1441   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1442 
1443   for (i = 0 ; i < spot->num ; i++)
1444     {
1445       re_dfastate_t *state = spot->array[i];
1446       if (state->hash == hash
1447 	  && state->context == context
1448 	  && re_node_set_compare (state->entrance_nodes, nodes))
1449 	return state;
1450     }
1451   /* There are no appropriate state in `dfa', create the new one.  */
1452   new_state = create_cd_newstate (dfa, nodes, context, hash);
1453   if (BE (new_state != NULL, 1))
1454     return new_state;
1455   else
1456     {
1457       *err = REG_ESPACE;
1458       return NULL;
1459     }
1460 }
1461 
1462 /* Finish initialization of the new state NEWSTATE, and using its hash value
1463    HASH put in the appropriate bucket of DFA's state table.  Return value
1464    indicates the error code if failed.  */
1465 
1466 static reg_errcode_t
1467 internal_function
register_state(const re_dfa_t * dfa,re_dfastate_t * newstate,re_hashval_t hash)1468 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate, re_hashval_t hash)
1469 {
1470   struct re_state_table_entry *spot;
1471   reg_errcode_t err;
1472   Idx i;
1473 
1474   newstate->hash = hash;
1475   err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1476   if (BE (err != REG_NOERROR, 0))
1477     return REG_ESPACE;
1478   for (i = 0; i < newstate->nodes.nelem; i++)
1479     {
1480       Idx elem = newstate->nodes.elems[i];
1481       if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1482 	{
1483 	  bool ok = re_node_set_insert_last (&newstate->non_eps_nodes, elem);
1484 	  if (BE (! ok, 0))
1485 	    return REG_ESPACE;
1486 	}
1487     }
1488 
1489   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1490   if (BE (spot->alloc <= spot->num, 0))
1491     {
1492       Idx new_alloc = spot->num;
1493       re_dfastate_t **new_array = re_x2realloc (spot->array, re_dfastate_t *,
1494 						&new_alloc);
1495       if (BE (new_array == NULL, 0))
1496 	return REG_ESPACE;
1497       spot->array = new_array;
1498       spot->alloc = new_alloc;
1499     }
1500   spot->array[spot->num++] = newstate;
1501   return REG_NOERROR;
1502 }
1503 
1504 /* Create the new state which is independ of contexts.
1505    Return the new state if succeeded, otherwise return NULL.  */
1506 
1507 static re_dfastate_t *
1508 internal_function
create_ci_newstate(const re_dfa_t * dfa,const re_node_set * nodes,re_hashval_t hash)1509 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1510 		    re_hashval_t hash)
1511 {
1512   Idx i;
1513   reg_errcode_t err;
1514   re_dfastate_t *newstate;
1515 
1516   newstate = re_calloc (re_dfastate_t, 1);
1517   if (BE (newstate == NULL, 0))
1518     return NULL;
1519   err = re_node_set_init_copy (&newstate->nodes, nodes);
1520   if (BE (err != REG_NOERROR, 0))
1521     {
1522       re_free (newstate);
1523       return NULL;
1524     }
1525 
1526   newstate->entrance_nodes = &newstate->nodes;
1527   for (i = 0 ; i < nodes->nelem ; i++)
1528     {
1529       re_token_t *node = dfa->nodes + nodes->elems[i];
1530       re_token_type_t type = node->type;
1531       if (type == CHARACTER && !node->constraint)
1532 	continue;
1533 #ifdef RE_ENABLE_I18N
1534       newstate->accept_mb |= node->accept_mb;
1535 #endif /* RE_ENABLE_I18N */
1536 
1537       /* If the state has the halt node, the state is a halt state.  */
1538       if (type == END_OF_RE)
1539 	newstate->halt = 1;
1540       else if (type == OP_BACK_REF)
1541 	newstate->has_backref = 1;
1542       else if (type == ANCHOR || node->constraint)
1543 	newstate->has_constraint = 1;
1544     }
1545   err = register_state (dfa, newstate, hash);
1546   if (BE (err != REG_NOERROR, 0))
1547     {
1548       free_state (newstate);
1549       newstate = NULL;
1550     }
1551   return newstate;
1552 }
1553 
1554 /* Create the new state which is depend on the context CONTEXT.
1555    Return the new state if succeeded, otherwise return NULL.  */
1556 
1557 static re_dfastate_t *
1558 internal_function
create_cd_newstate(const re_dfa_t * dfa,const re_node_set * nodes,unsigned int context,re_hashval_t hash)1559 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1560 		    unsigned int context, re_hashval_t hash)
1561 {
1562   Idx i, nctx_nodes = 0;
1563   reg_errcode_t err;
1564   re_dfastate_t *newstate;
1565 
1566   newstate = re_calloc (re_dfastate_t, 1);
1567   if (BE (newstate == NULL, 0))
1568     return NULL;
1569   err = re_node_set_init_copy (&newstate->nodes, nodes);
1570   if (BE (err != REG_NOERROR, 0))
1571     {
1572       re_free (newstate);
1573       return NULL;
1574     }
1575 
1576   newstate->context = context;
1577   newstate->entrance_nodes = &newstate->nodes;
1578 
1579   for (i = 0 ; i < nodes->nelem ; i++)
1580     {
1581       unsigned int constraint = 0;
1582       re_token_t *node = dfa->nodes + nodes->elems[i];
1583       re_token_type_t type = node->type;
1584       if (node->constraint)
1585 	constraint = node->constraint;
1586 
1587       if (type == CHARACTER && !constraint)
1588 	continue;
1589 #ifdef RE_ENABLE_I18N
1590       newstate->accept_mb |= node->accept_mb;
1591 #endif /* RE_ENABLE_I18N */
1592 
1593       /* If the state has the halt node, the state is a halt state.  */
1594       if (type == END_OF_RE)
1595 	newstate->halt = 1;
1596       else if (type == OP_BACK_REF)
1597 	newstate->has_backref = 1;
1598       else if (type == ANCHOR)
1599 	constraint = node->opr.ctx_type;
1600 
1601       if (constraint)
1602 	{
1603 	  if (newstate->entrance_nodes == &newstate->nodes)
1604 	    {
1605 	      newstate->entrance_nodes = re_malloc (re_node_set, 1);
1606 	      if (BE (newstate->entrance_nodes == NULL, 0))
1607 		{
1608 		  free_state (newstate);
1609 		  return NULL;
1610 		}
1611 	      re_node_set_init_copy (newstate->entrance_nodes, nodes);
1612 	      nctx_nodes = 0;
1613 	      newstate->has_constraint = 1;
1614 	    }
1615 
1616 	  if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1617 	    {
1618 	      re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1619 	      ++nctx_nodes;
1620 	    }
1621 	}
1622     }
1623   err = register_state (dfa, newstate, hash);
1624   if (BE (err != REG_NOERROR, 0))
1625     {
1626       free_state (newstate);
1627       newstate = NULL;
1628     }
1629   return  newstate;
1630 }
1631 
1632 static void
1633 internal_function
free_state(re_dfastate_t * state)1634 free_state (re_dfastate_t *state)
1635 {
1636   re_node_set_free (&state->non_eps_nodes);
1637   re_node_set_free (&state->inveclosure);
1638   if (state->entrance_nodes != &state->nodes)
1639     {
1640       re_node_set_free (state->entrance_nodes);
1641       re_free (state->entrance_nodes);
1642     }
1643   re_node_set_free (&state->nodes);
1644   re_free (state->word_trtable);
1645   re_free (state->trtable);
1646   re_free (state);
1647 }
1648 
1649 __RCSID("$MirOS: src/gnu/usr.bin/cvs/lib/regex_internal.c,v 1.2 2011/07/28 15:54:34 tg Exp $");
1650