1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
9 any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License along
17 with this program; if not, write to the Free Software Foundation,
18 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
19
20 static void re_string_construct_common (const char *str, Idx len,
21 re_string_t *pstr,
22 REG_TRANSLATE_TYPE trans, bool icase,
23 const re_dfa_t *dfa) internal_function;
24 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
25 const re_node_set *nodes,
26 re_hashval_t hash) internal_function;
27 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
28 const re_node_set *nodes,
29 unsigned int context,
30 re_hashval_t hash) internal_function;
31
32 /* Functions for string operation. */
33
34 /* This function allocate the buffers. It is necessary to call
35 re_string_reconstruct before using the object. */
36
37 static reg_errcode_t
38 internal_function
re_string_allocate(re_string_t * pstr,const char * str,Idx len,Idx init_len,REG_TRANSLATE_TYPE trans,bool icase,const re_dfa_t * dfa)39 re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
40 REG_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
41 {
42 reg_errcode_t ret;
43 Idx init_buf_len;
44
45 /* Ensure at least one character fits into the buffers. */
46 if (init_len < dfa->mb_cur_max)
47 init_len = dfa->mb_cur_max;
48 init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
49 re_string_construct_common (str, len, pstr, trans, icase, dfa);
50
51 ret = re_string_realloc_buffers (pstr, init_buf_len);
52 if (BE (ret != REG_NOERROR, 0))
53 return ret;
54
55 pstr->word_char = dfa->word_char;
56 pstr->word_ops_used = dfa->word_ops_used;
57 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
58 pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
59 pstr->valid_raw_len = pstr->valid_len;
60 return REG_NOERROR;
61 }
62
63 /* This function allocate the buffers, and initialize them. */
64
65 static reg_errcode_t
66 internal_function
re_string_construct(re_string_t * pstr,const char * str,Idx len,REG_TRANSLATE_TYPE trans,bool icase,const re_dfa_t * dfa)67 re_string_construct (re_string_t *pstr, const char *str, Idx len,
68 REG_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
69 {
70 reg_errcode_t ret;
71 memset (pstr, '\0', sizeof (re_string_t));
72 re_string_construct_common (str, len, pstr, trans, icase, dfa);
73
74 if (len > 0)
75 {
76 ret = re_string_realloc_buffers (pstr, len + 1);
77 if (BE (ret != REG_NOERROR, 0))
78 return ret;
79 }
80 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
81
82 if (icase)
83 {
84 #ifdef RE_ENABLE_I18N
85 if (dfa->mb_cur_max > 1)
86 {
87 while (1)
88 {
89 ret = build_wcs_upper_buffer (pstr);
90 if (BE (ret != REG_NOERROR, 0))
91 return ret;
92 if (pstr->valid_raw_len >= len)
93 break;
94 if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
95 break;
96 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
97 if (BE (ret != REG_NOERROR, 0))
98 return ret;
99 }
100 }
101 else
102 #endif /* RE_ENABLE_I18N */
103 build_upper_buffer (pstr);
104 }
105 else
106 {
107 #ifdef RE_ENABLE_I18N
108 if (dfa->mb_cur_max > 1)
109 build_wcs_buffer (pstr);
110 else
111 #endif /* RE_ENABLE_I18N */
112 {
113 if (trans != NULL)
114 re_string_translate_buffer (pstr);
115 else
116 {
117 pstr->valid_len = pstr->bufs_len;
118 pstr->valid_raw_len = pstr->bufs_len;
119 }
120 }
121 }
122
123 return REG_NOERROR;
124 }
125
126 /* Helper functions for re_string_allocate, and re_string_construct. */
127
128 static reg_errcode_t
129 internal_function
re_string_realloc_buffers(re_string_t * pstr,Idx new_buf_len)130 re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
131 {
132 #ifdef RE_ENABLE_I18N
133 if (pstr->mb_cur_max > 1)
134 {
135 wint_t *new_wcs = re_xrealloc (pstr->wcs, wint_t, new_buf_len);
136 if (BE (new_wcs == NULL, 0))
137 return REG_ESPACE;
138 pstr->wcs = new_wcs;
139 if (pstr->offsets != NULL)
140 {
141 Idx *new_offsets = re_xrealloc (pstr->offsets, Idx, new_buf_len);
142 if (BE (new_offsets == NULL, 0))
143 return REG_ESPACE;
144 pstr->offsets = new_offsets;
145 }
146 }
147 #endif /* RE_ENABLE_I18N */
148 if (pstr->mbs_allocated)
149 {
150 unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
151 new_buf_len);
152 if (BE (new_mbs == NULL, 0))
153 return REG_ESPACE;
154 pstr->mbs = new_mbs;
155 }
156 pstr->bufs_len = new_buf_len;
157 return REG_NOERROR;
158 }
159
160
161 static void
162 internal_function
re_string_construct_common(const char * str,Idx len,re_string_t * pstr,REG_TRANSLATE_TYPE trans,bool icase,const re_dfa_t * dfa)163 re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
164 REG_TRANSLATE_TYPE trans, bool icase,
165 const re_dfa_t *dfa)
166 {
167 pstr->raw_mbs = (const unsigned char *) str;
168 pstr->len = len;
169 pstr->raw_len = len;
170 pstr->trans = (unsigned REG_TRANSLATE_TYPE) trans;
171 pstr->icase = icase;
172 pstr->mbs_allocated = (trans != NULL || icase);
173 pstr->mb_cur_max = dfa->mb_cur_max;
174 pstr->is_utf8 = dfa->is_utf8;
175 pstr->map_notascii = dfa->map_notascii;
176 pstr->stop = pstr->len;
177 pstr->raw_stop = pstr->stop;
178 }
179
180 #ifdef RE_ENABLE_I18N
181
182 /* Build wide character buffer PSTR->WCS.
183 If the byte sequence of the string are:
184 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
185 Then wide character buffer will be:
186 <wc1> , WEOF , <wc2> , WEOF , <wc3>
187 We use WEOF for padding, they indicate that the position isn't
188 a first byte of a multibyte character.
189
190 Note that this function assumes PSTR->VALID_LEN elements are already
191 built and starts from PSTR->VALID_LEN. */
192
193 static void
194 internal_function
build_wcs_buffer(re_string_t * pstr)195 build_wcs_buffer (re_string_t *pstr)
196 {
197 #ifdef _LIBC
198 unsigned char buf[MB_LEN_MAX];
199 assert (MB_LEN_MAX >= pstr->mb_cur_max);
200 #else
201 unsigned char buf[64];
202 #endif
203 mbstate_t prev_st;
204 Idx byte_idx, end_idx, remain_len;
205 size_t mbclen;
206
207 /* Build the buffers from pstr->valid_len to either pstr->len or
208 pstr->bufs_len. */
209 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
210 for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
211 {
212 wchar_t wc;
213 const char *p;
214
215 remain_len = end_idx - byte_idx;
216 prev_st = pstr->cur_state;
217 /* Apply the translation if we need. */
218 if (BE (pstr->trans != NULL, 0))
219 {
220 int i, ch;
221
222 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
223 {
224 ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
225 buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
226 }
227 p = (const char *) buf;
228 }
229 else
230 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
231 mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
232 if (BE (mbclen == (size_t) -2, 0))
233 {
234 /* The buffer doesn't have enough space, finish to build. */
235 pstr->cur_state = prev_st;
236 break;
237 }
238 else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
239 {
240 /* We treat these cases as a singlebyte character. */
241 mbclen = 1;
242 wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
243 if (BE (pstr->trans != NULL, 0))
244 wc = pstr->trans[wc];
245 pstr->cur_state = prev_st;
246 }
247
248 /* Write wide character and padding. */
249 pstr->wcs[byte_idx++] = wc;
250 /* Write paddings. */
251 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
252 pstr->wcs[byte_idx++] = WEOF;
253 }
254 pstr->valid_len = byte_idx;
255 pstr->valid_raw_len = byte_idx;
256 }
257
258 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
259 but for REG_ICASE. */
260
261 static reg_errcode_t
262 internal_function
build_wcs_upper_buffer(re_string_t * pstr)263 build_wcs_upper_buffer (re_string_t *pstr)
264 {
265 mbstate_t prev_st;
266 Idx src_idx, byte_idx, end_idx, remain_len;
267 size_t mbclen;
268 #ifdef _LIBC
269 char buf[MB_LEN_MAX];
270 assert (MB_LEN_MAX >= pstr->mb_cur_max);
271 #else
272 char buf[64];
273 #endif
274
275 byte_idx = pstr->valid_len;
276 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
277
278 /* The following optimization assumes that ASCII characters can be
279 mapped to wide characters with a simple cast. */
280 if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
281 {
282 while (byte_idx < end_idx)
283 {
284 wchar_t wc;
285
286 if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
287 && mbsinit (&pstr->cur_state))
288 {
289 /* In case of a singlebyte character. */
290 pstr->mbs[byte_idx]
291 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
292 /* The next step uses the assumption that wchar_t is encoded
293 ASCII-safe: all ASCII values can be converted like this. */
294 pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
295 ++byte_idx;
296 continue;
297 }
298
299 remain_len = end_idx - byte_idx;
300 prev_st = pstr->cur_state;
301 mbclen = mbrtowc (&wc,
302 ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
303 + byte_idx), remain_len, &pstr->cur_state);
304 if (BE ((size_t) (mbclen + 2) > 2, 1))
305 {
306 wchar_t wcu = wc;
307 if (iswlower (wc))
308 {
309 size_t mbcdlen;
310
311 wcu = towupper (wc);
312 mbcdlen = wcrtomb (buf, wcu, &prev_st);
313 if (BE (mbclen == mbcdlen, 1))
314 memcpy (pstr->mbs + byte_idx, buf, mbclen);
315 else
316 {
317 src_idx = byte_idx;
318 goto offsets_needed;
319 }
320 }
321 else
322 memcpy (pstr->mbs + byte_idx,
323 pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
324 pstr->wcs[byte_idx++] = wcu;
325 /* Write paddings. */
326 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
327 pstr->wcs[byte_idx++] = WEOF;
328 }
329 else if (mbclen == (size_t) -1 || mbclen == 0)
330 {
331 /* It is an invalid character or '\0'. Just use the byte. */
332 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
333 pstr->mbs[byte_idx] = ch;
334 /* And also cast it to wide char. */
335 pstr->wcs[byte_idx++] = (wchar_t) ch;
336 if (BE (mbclen == (size_t) -1, 0))
337 pstr->cur_state = prev_st;
338 }
339 else
340 {
341 /* The buffer doesn't have enough space, finish to build. */
342 pstr->cur_state = prev_st;
343 break;
344 }
345 }
346 pstr->valid_len = byte_idx;
347 pstr->valid_raw_len = byte_idx;
348 return REG_NOERROR;
349 }
350 else
351 for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
352 {
353 wchar_t wc;
354 const char *p;
355 offsets_needed:
356 remain_len = end_idx - byte_idx;
357 prev_st = pstr->cur_state;
358 if (BE (pstr->trans != NULL, 0))
359 {
360 int i, ch;
361
362 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
363 {
364 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
365 buf[i] = pstr->trans[ch];
366 }
367 p = (const char *) buf;
368 }
369 else
370 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
371 mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
372 if (BE ((size_t) (mbclen + 2) > 2, 1))
373 {
374 wchar_t wcu = wc;
375 if (iswlower (wc))
376 {
377 size_t mbcdlen;
378
379 wcu = towupper (wc);
380 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
381 if (BE (mbclen == mbcdlen, 1))
382 memcpy (pstr->mbs + byte_idx, buf, mbclen);
383 else if (mbcdlen != (size_t) -1)
384 {
385 size_t i;
386
387 if (byte_idx + mbcdlen > pstr->bufs_len)
388 {
389 pstr->cur_state = prev_st;
390 break;
391 }
392
393 if (pstr->offsets == NULL)
394 {
395 pstr->offsets = re_xmalloc (Idx, pstr->bufs_len);
396
397 if (pstr->offsets == NULL)
398 return REG_ESPACE;
399 }
400 if (!pstr->offsets_needed)
401 {
402 for (i = 0; i < (size_t) byte_idx; ++i)
403 pstr->offsets[i] = i;
404 pstr->offsets_needed = 1;
405 }
406
407 memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
408 pstr->wcs[byte_idx] = wcu;
409 pstr->offsets[byte_idx] = src_idx;
410 for (i = 1; i < mbcdlen; ++i)
411 {
412 pstr->offsets[byte_idx + i]
413 = src_idx + (i < mbclen ? i : mbclen - 1);
414 pstr->wcs[byte_idx + i] = WEOF;
415 }
416 pstr->len += mbcdlen - mbclen;
417 if (pstr->raw_stop > src_idx)
418 pstr->stop += mbcdlen - mbclen;
419 end_idx = (pstr->bufs_len > pstr->len)
420 ? pstr->len : pstr->bufs_len;
421 byte_idx += mbcdlen;
422 src_idx += mbclen;
423 continue;
424 }
425 else
426 memcpy (pstr->mbs + byte_idx, p, mbclen);
427 }
428 else
429 memcpy (pstr->mbs + byte_idx, p, mbclen);
430
431 if (BE (pstr->offsets_needed != 0, 0))
432 {
433 size_t i;
434 for (i = 0; i < mbclen; ++i)
435 pstr->offsets[byte_idx + i] = src_idx + i;
436 }
437 src_idx += mbclen;
438
439 pstr->wcs[byte_idx++] = wcu;
440 /* Write paddings. */
441 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
442 pstr->wcs[byte_idx++] = WEOF;
443 }
444 else if (mbclen == (size_t) -1 || mbclen == 0)
445 {
446 /* It is an invalid character or '\0'. Just use the byte. */
447 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
448
449 if (BE (pstr->trans != NULL, 0))
450 ch = pstr->trans [ch];
451 pstr->mbs[byte_idx] = ch;
452
453 if (BE (pstr->offsets_needed != 0, 0))
454 pstr->offsets[byte_idx] = src_idx;
455 ++src_idx;
456
457 /* And also cast it to wide char. */
458 pstr->wcs[byte_idx++] = (wchar_t) ch;
459 if (BE (mbclen == (size_t) -1, 0))
460 pstr->cur_state = prev_st;
461 }
462 else
463 {
464 /* The buffer doesn't have enough space, finish to build. */
465 pstr->cur_state = prev_st;
466 break;
467 }
468 }
469 pstr->valid_len = byte_idx;
470 pstr->valid_raw_len = src_idx;
471 return REG_NOERROR;
472 }
473
474 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
475 Return the index. */
476
477 static Idx
478 internal_function
re_string_skip_chars(re_string_t * pstr,Idx new_raw_idx,wint_t * last_wc)479 re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
480 {
481 mbstate_t prev_st;
482 Idx rawbuf_idx;
483 size_t mbclen;
484 wchar_t wc = 0;
485
486 /* Skip the characters which are not necessary to check. */
487 for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
488 rawbuf_idx < new_raw_idx;)
489 {
490 Idx remain_len;
491 remain_len = pstr->len - rawbuf_idx;
492 prev_st = pstr->cur_state;
493 mbclen = mbrtowc (&wc, (const char *) pstr->raw_mbs + rawbuf_idx,
494 remain_len, &pstr->cur_state);
495 if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
496 {
497 /* We treat these cases as a singlebyte character. */
498 mbclen = 1;
499 pstr->cur_state = prev_st;
500 }
501 /* Then proceed the next character. */
502 rawbuf_idx += mbclen;
503 }
504 *last_wc = (wint_t) wc;
505 return rawbuf_idx;
506 }
507 #endif /* RE_ENABLE_I18N */
508
509 /* Build the buffer PSTR->MBS, and apply the translation if we need.
510 This function is used in case of REG_ICASE. */
511
512 static void
513 internal_function
build_upper_buffer(re_string_t * pstr)514 build_upper_buffer (re_string_t *pstr)
515 {
516 Idx char_idx, end_idx;
517 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
518
519 for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
520 {
521 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
522 if (BE (pstr->trans != NULL, 0))
523 ch = pstr->trans[ch];
524 if (islower (ch))
525 pstr->mbs[char_idx] = toupper (ch);
526 else
527 pstr->mbs[char_idx] = ch;
528 }
529 pstr->valid_len = char_idx;
530 pstr->valid_raw_len = char_idx;
531 }
532
533 /* Apply TRANS to the buffer in PSTR. */
534
535 static void
536 internal_function
re_string_translate_buffer(re_string_t * pstr)537 re_string_translate_buffer (re_string_t *pstr)
538 {
539 Idx buf_idx, end_idx;
540 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
541
542 for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
543 {
544 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
545 pstr->mbs[buf_idx] = pstr->trans[ch];
546 }
547
548 pstr->valid_len = buf_idx;
549 pstr->valid_raw_len = buf_idx;
550 }
551
552 /* This function re-construct the buffers.
553 Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
554 convert to upper case in case of REG_ICASE, apply translation. */
555
556 static reg_errcode_t
557 internal_function
re_string_reconstruct(re_string_t * pstr,Idx idx,int eflags)558 re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
559 {
560 Idx offset;
561
562 if (BE (pstr->raw_mbs_idx <= idx, 0))
563 offset = idx - pstr->raw_mbs_idx;
564 else
565 {
566 /* Reset buffer. */
567 #ifdef RE_ENABLE_I18N
568 if (pstr->mb_cur_max > 1)
569 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
570 #endif /* RE_ENABLE_I18N */
571 pstr->len = pstr->raw_len;
572 pstr->stop = pstr->raw_stop;
573 pstr->valid_len = 0;
574 pstr->raw_mbs_idx = 0;
575 pstr->valid_raw_len = 0;
576 pstr->offsets_needed = 0;
577 pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
578 : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
579 if (!pstr->mbs_allocated)
580 pstr->mbs = (unsigned char *) pstr->raw_mbs;
581 offset = idx;
582 }
583
584 if (BE (offset != 0, 1))
585 {
586 /* Are the characters which are already checked remain? */
587 if (BE (offset < pstr->valid_raw_len, 1)
588 #ifdef RE_ENABLE_I18N
589 /* Handling this would enlarge the code too much.
590 Accept a slowdown in that case. */
591 && pstr->offsets_needed == 0
592 #endif
593 )
594 {
595 /* Yes, move them to the front of the buffer. */
596 pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags);
597 #ifdef RE_ENABLE_I18N
598 if (pstr->mb_cur_max > 1)
599 memmove (pstr->wcs, pstr->wcs + offset,
600 (pstr->valid_len - offset) * sizeof (wint_t));
601 #endif /* RE_ENABLE_I18N */
602 if (BE (pstr->mbs_allocated, 0))
603 memmove (pstr->mbs, pstr->mbs + offset,
604 pstr->valid_len - offset);
605 pstr->valid_len -= offset;
606 pstr->valid_raw_len -= offset;
607 #if DEBUG
608 assert (pstr->valid_len > 0);
609 #endif
610 }
611 else
612 {
613 /* No, skip all characters until IDX. */
614 #ifdef RE_ENABLE_I18N
615 if (BE (pstr->offsets_needed, 0))
616 {
617 pstr->len = pstr->raw_len - idx + offset;
618 pstr->stop = pstr->raw_stop - idx + offset;
619 pstr->offsets_needed = 0;
620 }
621 #endif
622 pstr->valid_len = 0;
623 pstr->valid_raw_len = 0;
624 #ifdef RE_ENABLE_I18N
625 if (pstr->mb_cur_max > 1)
626 {
627 Idx wcs_idx;
628 wint_t wc = WEOF;
629
630 if (pstr->is_utf8)
631 {
632 const unsigned char *raw, *p, *end;
633
634 /* Special case UTF-8. Multi-byte chars start with any
635 byte other than 0x80 - 0xbf. */
636 raw = pstr->raw_mbs + pstr->raw_mbs_idx;
637 end = raw + (offset - pstr->mb_cur_max);
638 for (p = raw + offset - 1; p >= end; --p)
639 if ((*p & 0xc0) != 0x80)
640 {
641 mbstate_t cur_state;
642 wchar_t wc2;
643 Idx mlen = raw + pstr->len - p;
644 size_t mbclen;
645
646 /* XXX Don't use mbrtowc, we know which conversion
647 to use (UTF-8 -> UCS4). */
648 memset (&cur_state, 0, sizeof (cur_state));
649 mbclen = mbrtowc (&wc2, (const char *) p, mlen,
650 &cur_state);
651 if (raw + offset - p <= mbclen && mbclen < (size_t) -2)
652 {
653 memset (&pstr->cur_state, '\0',
654 sizeof (mbstate_t));
655 pstr->valid_len = mbclen - (raw + offset - p);
656 wc = wc2;
657 }
658 break;
659 }
660 }
661
662 if (wc == WEOF)
663 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
664 if (BE (pstr->valid_len, 0))
665 {
666 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
667 pstr->wcs[wcs_idx] = WEOF;
668 if (pstr->mbs_allocated)
669 memset (pstr->mbs, -1, pstr->valid_len);
670 }
671 pstr->valid_raw_len = pstr->valid_len;
672 pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
673 && IS_WIDE_WORD_CHAR (wc))
674 ? CONTEXT_WORD
675 : ((IS_WIDE_NEWLINE (wc)
676 && pstr->newline_anchor)
677 ? CONTEXT_NEWLINE : 0));
678 }
679 else
680 #endif /* RE_ENABLE_I18N */
681 {
682 int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
683 if (pstr->trans)
684 c = pstr->trans[c];
685 pstr->tip_context = (bitset_contain (pstr->word_char, c)
686 ? CONTEXT_WORD
687 : ((IS_NEWLINE (c) && pstr->newline_anchor)
688 ? CONTEXT_NEWLINE : 0));
689 }
690 }
691 if (!BE (pstr->mbs_allocated, 0))
692 pstr->mbs += offset;
693 }
694 pstr->raw_mbs_idx = idx;
695 pstr->len -= offset;
696 pstr->stop -= offset;
697
698 /* Then build the buffers. */
699 #ifdef RE_ENABLE_I18N
700 if (pstr->mb_cur_max > 1)
701 {
702 if (pstr->icase)
703 {
704 reg_errcode_t ret = build_wcs_upper_buffer (pstr);
705 if (BE (ret != REG_NOERROR, 0))
706 return ret;
707 }
708 else
709 build_wcs_buffer (pstr);
710 }
711 else
712 #endif /* RE_ENABLE_I18N */
713 if (BE (pstr->mbs_allocated, 0))
714 {
715 if (pstr->icase)
716 build_upper_buffer (pstr);
717 else if (pstr->trans != NULL)
718 re_string_translate_buffer (pstr);
719 }
720 else
721 pstr->valid_len = pstr->len;
722
723 pstr->cur_idx = 0;
724 return REG_NOERROR;
725 }
726
727 static unsigned char
internal_function(pure)728 internal_function __attribute ((pure))
729 re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
730 {
731 int ch;
732 Idx off;
733
734 /* Handle the common (easiest) cases first. */
735 if (BE (!pstr->mbs_allocated, 1))
736 return re_string_peek_byte (pstr, idx);
737
738 #ifdef RE_ENABLE_I18N
739 if (pstr->mb_cur_max > 1
740 && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
741 return re_string_peek_byte (pstr, idx);
742 #endif
743
744 off = pstr->cur_idx + idx;
745 #ifdef RE_ENABLE_I18N
746 if (pstr->offsets_needed)
747 off = pstr->offsets[off];
748 #endif
749
750 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
751
752 #ifdef RE_ENABLE_I18N
753 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
754 this function returns CAPITAL LETTER I instead of first byte of
755 DOTLESS SMALL LETTER I. The latter would confuse the parser,
756 since peek_byte_case doesn't advance cur_idx in any way. */
757 if (pstr->offsets_needed && !isascii (ch))
758 return re_string_peek_byte (pstr, idx);
759 #endif
760
761 return ch;
762 }
763
764 static unsigned char
internal_function(pure)765 internal_function __attribute ((pure))
766 re_string_fetch_byte_case (re_string_t *pstr)
767 {
768 if (BE (!pstr->mbs_allocated, 1))
769 return re_string_fetch_byte (pstr);
770
771 #ifdef RE_ENABLE_I18N
772 if (pstr->offsets_needed)
773 {
774 Idx off;
775 int ch;
776
777 /* For tr_TR.UTF-8 [[:islower:]] there is
778 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
779 in that case the whole multi-byte character and return
780 the original letter. On the other side, with
781 [[: DOTLESS SMALL LETTER I return [[:I, as doing
782 anything else would complicate things too much. */
783
784 if (!re_string_first_byte (pstr, pstr->cur_idx))
785 return re_string_fetch_byte (pstr);
786
787 off = pstr->offsets[pstr->cur_idx];
788 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
789
790 if (! isascii (ch))
791 return re_string_fetch_byte (pstr);
792
793 re_string_skip_bytes (pstr,
794 re_string_char_size_at (pstr, pstr->cur_idx));
795 return ch;
796 }
797 #endif
798
799 return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
800 }
801
802 static void
803 internal_function
re_string_destruct(re_string_t * pstr)804 re_string_destruct (re_string_t *pstr)
805 {
806 #ifdef RE_ENABLE_I18N
807 re_free (pstr->wcs);
808 re_free (pstr->offsets);
809 #endif /* RE_ENABLE_I18N */
810 if (pstr->mbs_allocated)
811 re_free (pstr->mbs);
812 }
813
814 /* Return the context at IDX in INPUT. */
815
816 static unsigned int
817 internal_function
re_string_context_at(const re_string_t * input,Idx idx,int eflags)818 re_string_context_at (const re_string_t *input, Idx idx, int eflags)
819 {
820 int c;
821 if (BE (! REG_VALID_INDEX (idx), 0))
822 /* In this case, we use the value stored in input->tip_context,
823 since we can't know the character in input->mbs[-1] here. */
824 return input->tip_context;
825 if (BE (idx == input->len, 0))
826 return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
827 : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
828 #ifdef RE_ENABLE_I18N
829 if (input->mb_cur_max > 1)
830 {
831 wint_t wc;
832 Idx wc_idx = idx;
833 while(input->wcs[wc_idx] == WEOF)
834 {
835 #ifdef DEBUG
836 /* It must not happen. */
837 assert (REG_VALID_INDEX (wc_idx));
838 #endif
839 --wc_idx;
840 if (! REG_VALID_INDEX (wc_idx))
841 return input->tip_context;
842 }
843 wc = input->wcs[wc_idx];
844 if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
845 return CONTEXT_WORD;
846 return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
847 ? CONTEXT_NEWLINE : 0);
848 }
849 else
850 #endif
851 {
852 c = re_string_byte_at (input, idx);
853 if (bitset_contain (input->word_char, c))
854 return CONTEXT_WORD;
855 return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
856 }
857 }
858
859 /* Functions for set operation. */
860
861 static reg_errcode_t
862 internal_function
re_node_set_alloc(re_node_set * set,Idx size)863 re_node_set_alloc (re_node_set *set, Idx size)
864 {
865 set->alloc = size;
866 set->nelem = 0;
867 set->elems = re_xmalloc (Idx, size);
868 if (BE (set->elems == NULL, 0))
869 return REG_ESPACE;
870 return REG_NOERROR;
871 }
872
873 static reg_errcode_t
874 internal_function
re_node_set_init_1(re_node_set * set,Idx elem)875 re_node_set_init_1 (re_node_set *set, Idx elem)
876 {
877 set->alloc = 1;
878 set->nelem = 1;
879 set->elems = re_malloc (Idx, 1);
880 if (BE (set->elems == NULL, 0))
881 {
882 set->alloc = set->nelem = 0;
883 return REG_ESPACE;
884 }
885 set->elems[0] = elem;
886 return REG_NOERROR;
887 }
888
889 static reg_errcode_t
890 internal_function
re_node_set_init_2(re_node_set * set,Idx elem1,Idx elem2)891 re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
892 {
893 set->alloc = 2;
894 set->elems = re_malloc (Idx, 2);
895 if (BE (set->elems == NULL, 0))
896 return REG_ESPACE;
897 if (elem1 == elem2)
898 {
899 set->nelem = 1;
900 set->elems[0] = elem1;
901 }
902 else
903 {
904 set->nelem = 2;
905 if (elem1 < elem2)
906 {
907 set->elems[0] = elem1;
908 set->elems[1] = elem2;
909 }
910 else
911 {
912 set->elems[0] = elem2;
913 set->elems[1] = elem1;
914 }
915 }
916 return REG_NOERROR;
917 }
918
919 static reg_errcode_t
920 internal_function
re_node_set_init_copy(re_node_set * dest,const re_node_set * src)921 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
922 {
923 dest->nelem = src->nelem;
924 if (src->nelem > 0)
925 {
926 dest->alloc = dest->nelem;
927 dest->elems = re_malloc (Idx, dest->alloc);
928 if (BE (dest->elems == NULL, 0))
929 {
930 dest->alloc = dest->nelem = 0;
931 return REG_ESPACE;
932 }
933 memcpy (dest->elems, src->elems, src->nelem * sizeof dest->elems[0]);
934 }
935 else
936 re_node_set_init_empty (dest);
937 return REG_NOERROR;
938 }
939
940 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
941 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
942 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
943
944 static reg_errcode_t
945 internal_function
re_node_set_add_intersect(re_node_set * dest,const re_node_set * src1,const re_node_set * src2)946 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
947 const re_node_set *src2)
948 {
949 Idx i1, i2, is, id, delta, sbase;
950 if (src1->nelem == 0 || src2->nelem == 0)
951 return REG_NOERROR;
952
953 /* We need dest->nelem + 2 * elems_in_intersection; this is a
954 conservative estimate. */
955 if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
956 {
957 Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
958 Idx *new_elems;
959 if (sizeof (Idx) < 3
960 && (new_alloc < dest->alloc
961 || ((Idx) (src1->nelem + src2->nelem) < src1->nelem)))
962 return REG_ESPACE;
963 new_elems = re_xrealloc (dest->elems, Idx, new_alloc);
964 if (BE (new_elems == NULL, 0))
965 return REG_ESPACE;
966 dest->elems = new_elems;
967 dest->alloc = new_alloc;
968 }
969
970 /* Find the items in the intersection of SRC1 and SRC2, and copy
971 into the top of DEST those that are not already in DEST itself. */
972 sbase = dest->nelem + src1->nelem + src2->nelem;
973 i1 = src1->nelem - 1;
974 i2 = src2->nelem - 1;
975 id = dest->nelem - 1;
976 for (;;)
977 {
978 if (src1->elems[i1] == src2->elems[i2])
979 {
980 /* Try to find the item in DEST. Maybe we could binary search? */
981 while (REG_VALID_INDEX (id) && dest->elems[id] > src1->elems[i1])
982 --id;
983
984 if (! REG_VALID_INDEX (id) || dest->elems[id] != src1->elems[i1])
985 dest->elems[--sbase] = src1->elems[i1];
986
987 if (! REG_VALID_INDEX (--i1) || ! REG_VALID_INDEX (--i2))
988 break;
989 }
990
991 /* Lower the highest of the two items. */
992 else if (src1->elems[i1] < src2->elems[i2])
993 {
994 if (! REG_VALID_INDEX (--i2))
995 break;
996 }
997 else
998 {
999 if (! REG_VALID_INDEX (--i1))
1000 break;
1001 }
1002 }
1003
1004 id = dest->nelem - 1;
1005 is = dest->nelem + src1->nelem + src2->nelem - 1;
1006 delta = is - sbase + 1;
1007
1008 /* Now copy. When DELTA becomes zero, the remaining
1009 DEST elements are already in place; this is more or
1010 less the same loop that is in re_node_set_merge. */
1011 dest->nelem += delta;
1012 if (delta > 0 && REG_VALID_INDEX (id))
1013 for (;;)
1014 {
1015 if (dest->elems[is] > dest->elems[id])
1016 {
1017 /* Copy from the top. */
1018 dest->elems[id + delta--] = dest->elems[is--];
1019 if (delta == 0)
1020 break;
1021 }
1022 else
1023 {
1024 /* Slide from the bottom. */
1025 dest->elems[id + delta] = dest->elems[id];
1026 if (! REG_VALID_INDEX (--id))
1027 break;
1028 }
1029 }
1030
1031 /* Copy remaining SRC elements. */
1032 memcpy (dest->elems, dest->elems + sbase, delta * sizeof dest->elems[0]);
1033
1034 return REG_NOERROR;
1035 }
1036
1037 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1038 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1039
1040 static reg_errcode_t
1041 internal_function
re_node_set_init_union(re_node_set * dest,const re_node_set * src1,const re_node_set * src2)1042 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1043 const re_node_set *src2)
1044 {
1045 Idx i1, i2, id;
1046 if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1047 {
1048 dest->alloc = src1->nelem + src2->nelem;
1049 if (sizeof (Idx) < 2 && dest->alloc < src1->nelem)
1050 return REG_ESPACE;
1051 dest->elems = re_xmalloc (Idx, dest->alloc);
1052 if (BE (dest->elems == NULL, 0))
1053 return REG_ESPACE;
1054 }
1055 else
1056 {
1057 if (src1 != NULL && src1->nelem > 0)
1058 return re_node_set_init_copy (dest, src1);
1059 else if (src2 != NULL && src2->nelem > 0)
1060 return re_node_set_init_copy (dest, src2);
1061 else
1062 re_node_set_init_empty (dest);
1063 return REG_NOERROR;
1064 }
1065 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1066 {
1067 if (src1->elems[i1] > src2->elems[i2])
1068 {
1069 dest->elems[id++] = src2->elems[i2++];
1070 continue;
1071 }
1072 if (src1->elems[i1] == src2->elems[i2])
1073 ++i2;
1074 dest->elems[id++] = src1->elems[i1++];
1075 }
1076 if (i1 < src1->nelem)
1077 {
1078 memcpy (dest->elems + id, src1->elems + i1,
1079 (src1->nelem - i1) * sizeof dest->elems[0]);
1080 id += src1->nelem - i1;
1081 }
1082 else if (i2 < src2->nelem)
1083 {
1084 memcpy (dest->elems + id, src2->elems + i2,
1085 (src2->nelem - i2) * sizeof dest->elems[0]);
1086 id += src2->nelem - i2;
1087 }
1088 dest->nelem = id;
1089 return REG_NOERROR;
1090 }
1091
1092 /* Calculate the union set of the sets DEST and SRC. And store it to
1093 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1094
1095 static reg_errcode_t
1096 internal_function
re_node_set_merge(re_node_set * dest,const re_node_set * src)1097 re_node_set_merge (re_node_set *dest, const re_node_set *src)
1098 {
1099 Idx is, id, sbase, delta;
1100 if (src == NULL || src->nelem == 0)
1101 return REG_NOERROR;
1102 if (sizeof (Idx) < 3
1103 && ((Idx) (2 * src->nelem) < src->nelem
1104 || (Idx) (2 * src->nelem + dest->nelem) < dest->nelem))
1105 return REG_ESPACE;
1106 if (dest->alloc < 2 * src->nelem + dest->nelem)
1107 {
1108 Idx new_alloc = src->nelem + dest->alloc;
1109 Idx *new_buffer;
1110 if (sizeof (Idx) < 4 && new_alloc < dest->alloc)
1111 return REG_ESPACE;
1112 new_buffer = re_x2realloc (dest->elems, Idx, &new_alloc);
1113 if (BE (new_buffer == NULL, 0))
1114 return REG_ESPACE;
1115 dest->elems = new_buffer;
1116 dest->alloc = new_alloc;
1117 }
1118
1119 if (BE (dest->nelem == 0, 0))
1120 {
1121 dest->nelem = src->nelem;
1122 memcpy (dest->elems, src->elems, src->nelem * sizeof dest->elems[0]);
1123 return REG_NOERROR;
1124 }
1125
1126 /* Copy into the top of DEST the items of SRC that are not
1127 found in DEST. Maybe we could binary search in DEST? */
1128 for (sbase = dest->nelem + 2 * src->nelem,
1129 is = src->nelem - 1, id = dest->nelem - 1;
1130 REG_VALID_INDEX (is) && REG_VALID_INDEX (id); )
1131 {
1132 if (dest->elems[id] == src->elems[is])
1133 is--, id--;
1134 else if (dest->elems[id] < src->elems[is])
1135 dest->elems[--sbase] = src->elems[is--];
1136 else /* if (dest->elems[id] > src->elems[is]) */
1137 --id;
1138 }
1139
1140 if (REG_VALID_INDEX (is))
1141 {
1142 /* If DEST is exhausted, the remaining items of SRC must be unique. */
1143 sbase -= is + 1;
1144 memcpy (dest->elems + sbase, src->elems,
1145 (is + 1) * sizeof dest->elems[0]);
1146 }
1147
1148 id = dest->nelem - 1;
1149 is = dest->nelem + 2 * src->nelem - 1;
1150 delta = is - sbase + 1;
1151 if (delta == 0)
1152 return REG_NOERROR;
1153
1154 /* Now copy. When DELTA becomes zero, the remaining
1155 DEST elements are already in place. */
1156 dest->nelem += delta;
1157 for (;;)
1158 {
1159 if (dest->elems[is] > dest->elems[id])
1160 {
1161 /* Copy from the top. */
1162 dest->elems[id + delta--] = dest->elems[is--];
1163 if (delta == 0)
1164 break;
1165 }
1166 else
1167 {
1168 /* Slide from the bottom. */
1169 dest->elems[id + delta] = dest->elems[id];
1170 if (! REG_VALID_INDEX (--id))
1171 {
1172 /* Copy remaining SRC elements. */
1173 memcpy (dest->elems, dest->elems + sbase,
1174 delta * sizeof dest->elems[0]);
1175 break;
1176 }
1177 }
1178 }
1179
1180 return REG_NOERROR;
1181 }
1182
1183 /* Insert the new element ELEM to the re_node_set* SET.
1184 SET should not already have ELEM.
1185 Return true if successful. */
1186
1187 static bool
1188 internal_function
re_node_set_insert(re_node_set * set,Idx elem)1189 re_node_set_insert (re_node_set *set, Idx elem)
1190 {
1191 Idx idx;
1192 /* In case the set is empty. */
1193 if (set->alloc == 0)
1194 return re_node_set_init_1 (set, elem) == REG_NOERROR;
1195
1196 if (BE (set->nelem, 0) == 0)
1197 {
1198 /* We already guaranteed above that set->alloc != 0. */
1199 set->elems[0] = elem;
1200 ++set->nelem;
1201 return true;
1202 }
1203
1204 /* Realloc if we need. */
1205 if (set->alloc == set->nelem)
1206 {
1207 Idx *new_elems = re_x2realloc (set->elems, Idx, &set->alloc);
1208 if (BE (new_elems == NULL, 0))
1209 return false;
1210 set->elems = new_elems;
1211 }
1212
1213 /* Move the elements which follows the new element. Test the
1214 first element separately to skip a check in the inner loop. */
1215 if (elem < set->elems[0])
1216 {
1217 idx = 0;
1218 for (idx = set->nelem; idx > 0; idx--)
1219 set->elems[idx] = set->elems[idx - 1];
1220 }
1221 else
1222 {
1223 for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1224 set->elems[idx] = set->elems[idx - 1];
1225 }
1226
1227 /* Insert the new element. */
1228 set->elems[idx] = elem;
1229 ++set->nelem;
1230 return true;
1231 }
1232
1233 /* Insert the new element ELEM to the re_node_set* SET.
1234 SET should not already have any element greater than or equal to ELEM.
1235 Return true if successful. */
1236
1237 static bool
1238 internal_function
re_node_set_insert_last(re_node_set * set,Idx elem)1239 re_node_set_insert_last (re_node_set *set, Idx elem)
1240 {
1241 /* Realloc if we need. */
1242 if (set->alloc == set->nelem)
1243 {
1244 Idx *new_elems;
1245 new_elems = re_x2realloc (set->elems, Idx, &set->alloc);
1246 if (BE (new_elems == NULL, 0))
1247 return false;
1248 set->elems = new_elems;
1249 }
1250
1251 /* Insert the new element. */
1252 set->elems[set->nelem++] = elem;
1253 return true;
1254 }
1255
1256 /* Compare two node sets SET1 and SET2.
1257 Return true if SET1 and SET2 are equivalent. */
1258
1259 static bool
internal_function(pure)1260 internal_function __attribute ((pure))
1261 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1262 {
1263 Idx i;
1264 if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1265 return false;
1266 for (i = set1->nelem ; REG_VALID_INDEX (--i) ; )
1267 if (set1->elems[i] != set2->elems[i])
1268 return false;
1269 return true;
1270 }
1271
1272 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
1273
1274 static Idx
internal_function(pure)1275 internal_function __attribute ((pure))
1276 re_node_set_contains (const re_node_set *set, Idx elem)
1277 {
1278 __re_size_t idx, right, mid;
1279 if (! REG_VALID_NONZERO_INDEX (set->nelem))
1280 return 0;
1281
1282 /* Binary search the element. */
1283 idx = 0;
1284 right = set->nelem - 1;
1285 while (idx < right)
1286 {
1287 mid = (idx + right) / 2;
1288 if (set->elems[mid] < elem)
1289 idx = mid + 1;
1290 else
1291 right = mid;
1292 }
1293 return set->elems[idx] == elem ? idx + 1 : 0;
1294 }
1295
1296 static void
1297 internal_function
re_node_set_remove_at(re_node_set * set,Idx idx)1298 re_node_set_remove_at (re_node_set *set, Idx idx)
1299 {
1300 if (idx < 0 || idx >= set->nelem)
1301 return;
1302 --set->nelem;
1303 for (; idx < set->nelem; idx++)
1304 set->elems[idx] = set->elems[idx + 1];
1305 }
1306
1307
1308 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1309 Or return REG_MISSING if an error occurred. */
1310
1311 static Idx
1312 internal_function
re_dfa_add_node(re_dfa_t * dfa,re_token_t token)1313 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1314 {
1315 int type = token.type;
1316 if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1317 {
1318 Idx new_nodes_alloc = dfa->nodes_alloc;
1319 Idx *new_nexts, *new_indices;
1320 re_node_set *new_edests, *new_eclosures;
1321
1322 re_token_t *new_nodes = re_x2realloc (dfa->nodes, re_token_t,
1323 &new_nodes_alloc);
1324 if (BE (new_nodes == NULL, 0))
1325 return REG_MISSING;
1326 dfa->nodes = new_nodes;
1327 new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1328 new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
1329 new_edests = re_xrealloc (dfa->edests, re_node_set, new_nodes_alloc);
1330 new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1331 if (BE (new_nexts == NULL || new_indices == NULL
1332 || new_edests == NULL || new_eclosures == NULL, 0))
1333 return REG_MISSING;
1334 dfa->nexts = new_nexts;
1335 dfa->org_indices = new_indices;
1336 dfa->edests = new_edests;
1337 dfa->eclosures = new_eclosures;
1338 dfa->nodes_alloc = new_nodes_alloc;
1339 }
1340 dfa->nodes[dfa->nodes_len] = token;
1341 dfa->nodes[dfa->nodes_len].constraint = 0;
1342 #ifdef RE_ENABLE_I18N
1343 dfa->nodes[dfa->nodes_len].accept_mb =
1344 (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
1345 #endif
1346 dfa->nexts[dfa->nodes_len] = REG_MISSING;
1347 re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1348 re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1349 return dfa->nodes_len++;
1350 }
1351
1352 static inline re_hashval_t
1353 internal_function
calc_state_hash(const re_node_set * nodes,unsigned int context)1354 calc_state_hash (const re_node_set *nodes, unsigned int context)
1355 {
1356 re_hashval_t hash = nodes->nelem + context;
1357 Idx i;
1358 for (i = 0 ; i < nodes->nelem ; i++)
1359 hash += nodes->elems[i];
1360 return hash;
1361 }
1362
1363 /* Search for the state whose node_set is equivalent to NODES.
1364 Return the pointer to the state, if we found it in the DFA.
1365 Otherwise create the new one and return it. In case of an error
1366 return NULL and set the error code in ERR.
1367 Note: - We assume NULL as the invalid state, then it is possible that
1368 return value is NULL and ERR is REG_NOERROR.
1369 - We never return non-NULL value in case of any errors, it is for
1370 optimization. */
1371
1372 static re_dfastate_t*
1373 internal_function
re_acquire_state(reg_errcode_t * err,re_dfa_t * dfa,const re_node_set * nodes)1374 re_acquire_state (reg_errcode_t *err, re_dfa_t *dfa, const re_node_set *nodes)
1375 {
1376 re_hashval_t hash;
1377 re_dfastate_t *new_state;
1378 struct re_state_table_entry *spot;
1379 Idx i;
1380 #ifdef lint
1381 /* Suppress bogus uninitialized-variable warnings. */
1382 *err = REG_NOERROR;
1383 #endif
1384 if (BE (nodes->nelem == 0, 0))
1385 {
1386 *err = REG_NOERROR;
1387 return NULL;
1388 }
1389 hash = calc_state_hash (nodes, 0);
1390 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1391
1392 for (i = 0 ; i < spot->num ; i++)
1393 {
1394 re_dfastate_t *state = spot->array[i];
1395 if (hash != state->hash)
1396 continue;
1397 if (re_node_set_compare (&state->nodes, nodes))
1398 return state;
1399 }
1400
1401 /* There are no appropriate state in the dfa, create the new one. */
1402 new_state = create_ci_newstate (dfa, nodes, hash);
1403 if (BE (new_state != NULL, 1))
1404 return new_state;
1405 else
1406 {
1407 *err = REG_ESPACE;
1408 return NULL;
1409 }
1410 }
1411
1412 /* Search for the state whose node_set is equivalent to NODES and
1413 whose context is equivalent to CONTEXT.
1414 Return the pointer to the state, if we found it in the DFA.
1415 Otherwise create the new one and return it. In case of an error
1416 return NULL and set the error code in ERR.
1417 Note: - We assume NULL as the invalid state, then it is possible that
1418 return value is NULL and ERR is REG_NOERROR.
1419 - We never return non-NULL value in case of any errors, it is for
1420 optimization. */
1421
1422 static re_dfastate_t*
1423 internal_function
re_acquire_state_context(reg_errcode_t * err,re_dfa_t * dfa,const re_node_set * nodes,unsigned int context)1424 re_acquire_state_context (reg_errcode_t *err, re_dfa_t *dfa,
1425 const re_node_set *nodes, unsigned int context)
1426 {
1427 re_hashval_t hash;
1428 re_dfastate_t *new_state;
1429 struct re_state_table_entry *spot;
1430 Idx i;
1431 #ifdef lint
1432 /* Suppress bogus uninitialized-variable warnings. */
1433 *err = REG_NOERROR;
1434 #endif
1435 if (nodes->nelem == 0)
1436 {
1437 *err = REG_NOERROR;
1438 return NULL;
1439 }
1440 hash = calc_state_hash (nodes, context);
1441 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1442
1443 for (i = 0 ; i < spot->num ; i++)
1444 {
1445 re_dfastate_t *state = spot->array[i];
1446 if (state->hash == hash
1447 && state->context == context
1448 && re_node_set_compare (state->entrance_nodes, nodes))
1449 return state;
1450 }
1451 /* There are no appropriate state in `dfa', create the new one. */
1452 new_state = create_cd_newstate (dfa, nodes, context, hash);
1453 if (BE (new_state != NULL, 1))
1454 return new_state;
1455 else
1456 {
1457 *err = REG_ESPACE;
1458 return NULL;
1459 }
1460 }
1461
1462 /* Finish initialization of the new state NEWSTATE, and using its hash value
1463 HASH put in the appropriate bucket of DFA's state table. Return value
1464 indicates the error code if failed. */
1465
1466 static reg_errcode_t
1467 internal_function
register_state(const re_dfa_t * dfa,re_dfastate_t * newstate,re_hashval_t hash)1468 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate, re_hashval_t hash)
1469 {
1470 struct re_state_table_entry *spot;
1471 reg_errcode_t err;
1472 Idx i;
1473
1474 newstate->hash = hash;
1475 err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1476 if (BE (err != REG_NOERROR, 0))
1477 return REG_ESPACE;
1478 for (i = 0; i < newstate->nodes.nelem; i++)
1479 {
1480 Idx elem = newstate->nodes.elems[i];
1481 if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1482 {
1483 bool ok = re_node_set_insert_last (&newstate->non_eps_nodes, elem);
1484 if (BE (! ok, 0))
1485 return REG_ESPACE;
1486 }
1487 }
1488
1489 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1490 if (BE (spot->alloc <= spot->num, 0))
1491 {
1492 Idx new_alloc = spot->num;
1493 re_dfastate_t **new_array = re_x2realloc (spot->array, re_dfastate_t *,
1494 &new_alloc);
1495 if (BE (new_array == NULL, 0))
1496 return REG_ESPACE;
1497 spot->array = new_array;
1498 spot->alloc = new_alloc;
1499 }
1500 spot->array[spot->num++] = newstate;
1501 return REG_NOERROR;
1502 }
1503
1504 /* Create the new state which is independ of contexts.
1505 Return the new state if succeeded, otherwise return NULL. */
1506
1507 static re_dfastate_t *
1508 internal_function
create_ci_newstate(const re_dfa_t * dfa,const re_node_set * nodes,re_hashval_t hash)1509 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1510 re_hashval_t hash)
1511 {
1512 Idx i;
1513 reg_errcode_t err;
1514 re_dfastate_t *newstate;
1515
1516 newstate = re_calloc (re_dfastate_t, 1);
1517 if (BE (newstate == NULL, 0))
1518 return NULL;
1519 err = re_node_set_init_copy (&newstate->nodes, nodes);
1520 if (BE (err != REG_NOERROR, 0))
1521 {
1522 re_free (newstate);
1523 return NULL;
1524 }
1525
1526 newstate->entrance_nodes = &newstate->nodes;
1527 for (i = 0 ; i < nodes->nelem ; i++)
1528 {
1529 re_token_t *node = dfa->nodes + nodes->elems[i];
1530 re_token_type_t type = node->type;
1531 if (type == CHARACTER && !node->constraint)
1532 continue;
1533 #ifdef RE_ENABLE_I18N
1534 newstate->accept_mb |= node->accept_mb;
1535 #endif /* RE_ENABLE_I18N */
1536
1537 /* If the state has the halt node, the state is a halt state. */
1538 if (type == END_OF_RE)
1539 newstate->halt = 1;
1540 else if (type == OP_BACK_REF)
1541 newstate->has_backref = 1;
1542 else if (type == ANCHOR || node->constraint)
1543 newstate->has_constraint = 1;
1544 }
1545 err = register_state (dfa, newstate, hash);
1546 if (BE (err != REG_NOERROR, 0))
1547 {
1548 free_state (newstate);
1549 newstate = NULL;
1550 }
1551 return newstate;
1552 }
1553
1554 /* Create the new state which is depend on the context CONTEXT.
1555 Return the new state if succeeded, otherwise return NULL. */
1556
1557 static re_dfastate_t *
1558 internal_function
create_cd_newstate(const re_dfa_t * dfa,const re_node_set * nodes,unsigned int context,re_hashval_t hash)1559 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1560 unsigned int context, re_hashval_t hash)
1561 {
1562 Idx i, nctx_nodes = 0;
1563 reg_errcode_t err;
1564 re_dfastate_t *newstate;
1565
1566 newstate = re_calloc (re_dfastate_t, 1);
1567 if (BE (newstate == NULL, 0))
1568 return NULL;
1569 err = re_node_set_init_copy (&newstate->nodes, nodes);
1570 if (BE (err != REG_NOERROR, 0))
1571 {
1572 re_free (newstate);
1573 return NULL;
1574 }
1575
1576 newstate->context = context;
1577 newstate->entrance_nodes = &newstate->nodes;
1578
1579 for (i = 0 ; i < nodes->nelem ; i++)
1580 {
1581 unsigned int constraint = 0;
1582 re_token_t *node = dfa->nodes + nodes->elems[i];
1583 re_token_type_t type = node->type;
1584 if (node->constraint)
1585 constraint = node->constraint;
1586
1587 if (type == CHARACTER && !constraint)
1588 continue;
1589 #ifdef RE_ENABLE_I18N
1590 newstate->accept_mb |= node->accept_mb;
1591 #endif /* RE_ENABLE_I18N */
1592
1593 /* If the state has the halt node, the state is a halt state. */
1594 if (type == END_OF_RE)
1595 newstate->halt = 1;
1596 else if (type == OP_BACK_REF)
1597 newstate->has_backref = 1;
1598 else if (type == ANCHOR)
1599 constraint = node->opr.ctx_type;
1600
1601 if (constraint)
1602 {
1603 if (newstate->entrance_nodes == &newstate->nodes)
1604 {
1605 newstate->entrance_nodes = re_malloc (re_node_set, 1);
1606 if (BE (newstate->entrance_nodes == NULL, 0))
1607 {
1608 free_state (newstate);
1609 return NULL;
1610 }
1611 re_node_set_init_copy (newstate->entrance_nodes, nodes);
1612 nctx_nodes = 0;
1613 newstate->has_constraint = 1;
1614 }
1615
1616 if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1617 {
1618 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1619 ++nctx_nodes;
1620 }
1621 }
1622 }
1623 err = register_state (dfa, newstate, hash);
1624 if (BE (err != REG_NOERROR, 0))
1625 {
1626 free_state (newstate);
1627 newstate = NULL;
1628 }
1629 return newstate;
1630 }
1631
1632 static void
1633 internal_function
free_state(re_dfastate_t * state)1634 free_state (re_dfastate_t *state)
1635 {
1636 re_node_set_free (&state->non_eps_nodes);
1637 re_node_set_free (&state->inveclosure);
1638 if (state->entrance_nodes != &state->nodes)
1639 {
1640 re_node_set_free (state->entrance_nodes);
1641 re_free (state->entrance_nodes);
1642 }
1643 re_node_set_free (&state->nodes);
1644 re_free (state->word_trtable);
1645 re_free (state->trtable);
1646 re_free (state);
1647 }
1648
1649 __RCSID("$MirOS: src/gnu/usr.bin/cvs/lib/regex_internal.c,v 1.2 2011/07/28 15:54:34 tg Exp $");
1650