1 /* regcomp.c
2 */
3
4 /*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
8 /* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
10 * a regular expression.
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
16 */
17
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30 */
31
32 #ifdef PERL_EXT_RE_BUILD
33 /* need to replace pregcomp et al, so enable that */
34 # ifndef PERL_IN_XSUB_RE
35 # define PERL_IN_XSUB_RE
36 # endif
37 /* need access to debugger hooks */
38 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
39 # define DEBUGGING
40 # endif
41 #endif
42
43 #ifdef PERL_IN_XSUB_RE
44 /* We *really* need to overwrite these symbols: */
45 # define Perl_pregcomp my_regcomp
46 # define Perl_regdump my_regdump
47 # define Perl_regprop my_regprop
48 # define Perl_pregfree my_regfree
49 # define Perl_re_intuit_string my_re_intuit_string
50 /* *These* symbols are masked to allow static link. */
51 # define Perl_regnext my_regnext
52 # define Perl_save_re_context my_save_re_context
53 # define Perl_reginitcolors my_reginitcolors
54
55 # define PERL_NO_GET_CONTEXT
56 #endif
57
58 /*
59 * pregcomp and pregexec -- regsub and regerror are not used in perl
60 *
61 * Copyright (c) 1986 by University of Toronto.
62 * Written by Henry Spencer. Not derived from licensed software.
63 *
64 * Permission is granted to anyone to use this software for any
65 * purpose on any computer system, and to redistribute it freely,
66 * subject to the following restrictions:
67 *
68 * 1. The author is not responsible for the consequences of use of
69 * this software, no matter how awful, even if they arise
70 * from defects in it.
71 *
72 * 2. The origin of this software must not be misrepresented, either
73 * by explicit claim or by omission.
74 *
75 * 3. Altered versions must be plainly marked as such, and must not
76 * be misrepresented as being the original software.
77 *
78 *
79 **** Alterations to Henry's code are...
80 ****
81 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
82 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
83 ****
84 **** You may distribute under the terms of either the GNU General Public
85 **** License or the Artistic License, as specified in the README file.
86
87 *
88 * Beware that some of this code is subtly aware of the way operator
89 * precedence is structured in regular expressions. Serious changes in
90 * regular-expression syntax might require a total rethink.
91 */
92 #include "EXTERN.h"
93 #define PERL_IN_REGCOMP_C
94 #include "perl.h"
95
96 #ifndef PERL_IN_XSUB_RE
97 # include "INTERN.h"
98 #endif
99
100 #define REG_COMP_C
101 #include "regcomp.h"
102
103 #ifdef op
104 #undef op
105 #endif /* op */
106
107 #ifdef MSDOS
108 # if defined(BUGGY_MSC6)
109 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
110 # pragma optimize("a",off)
111 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
112 # pragma optimize("w",on )
113 # endif /* BUGGY_MSC6 */
114 #endif /* MSDOS */
115
116 #ifndef STATIC
117 #define STATIC static
118 #endif
119
120 typedef struct RExC_state_t {
121 U32 flags; /* are we folding, multilining? */
122 char *precomp; /* uncompiled string. */
123 regexp *rx;
124 char *start; /* Start of input for compile */
125 char *end; /* End of input for compile */
126 char *parse; /* Input-scan pointer. */
127 I32 whilem_seen; /* number of WHILEM in this expr */
128 regnode *emit_start; /* Start of emitted-code area */
129 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
130 I32 naughty; /* How bad is this pattern? */
131 I32 sawback; /* Did we see \1, ...? */
132 U32 seen;
133 I32 size; /* Code size. */
134 I32 npar; /* () count. */
135 I32 extralen;
136 I32 seen_zerolen;
137 I32 seen_evals;
138 I32 utf8;
139 I32 orig_utf8;
140 #if ADD_TO_REGEXEC
141 char *starttry; /* -Dr: where regtry was called. */
142 #define RExC_starttry (pRExC_state->starttry)
143 #endif
144 } RExC_state_t;
145
146 #define RExC_flags (pRExC_state->flags)
147 #define RExC_precomp (pRExC_state->precomp)
148 #define RExC_rx (pRExC_state->rx)
149 #define RExC_start (pRExC_state->start)
150 #define RExC_end (pRExC_state->end)
151 #define RExC_parse (pRExC_state->parse)
152 #define RExC_whilem_seen (pRExC_state->whilem_seen)
153 #define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
154 #define RExC_emit (pRExC_state->emit)
155 #define RExC_emit_start (pRExC_state->emit_start)
156 #define RExC_naughty (pRExC_state->naughty)
157 #define RExC_sawback (pRExC_state->sawback)
158 #define RExC_seen (pRExC_state->seen)
159 #define RExC_size (pRExC_state->size)
160 #define RExC_npar (pRExC_state->npar)
161 #define RExC_extralen (pRExC_state->extralen)
162 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
163 #define RExC_seen_evals (pRExC_state->seen_evals)
164 #define RExC_utf8 (pRExC_state->utf8)
165 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
166
167 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
168 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
169 ((*s) == '{' && regcurly(s)))
170
171 #ifdef SPSTART
172 #undef SPSTART /* dratted cpp namespace... */
173 #endif
174 /*
175 * Flags to be passed up and down.
176 */
177 #define WORST 0 /* Worst case. */
178 #define HASWIDTH 0x1 /* Known to match non-null strings. */
179 #define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
180 #define SPSTART 0x4 /* Starts with * or +. */
181 #define TRYAGAIN 0x8 /* Weeded out a declaration. */
182
183 /* Length of a variant. */
184
185 typedef struct scan_data_t {
186 I32 len_min;
187 I32 len_delta;
188 I32 pos_min;
189 I32 pos_delta;
190 SV *last_found;
191 I32 last_end; /* min value, <0 unless valid. */
192 I32 last_start_min;
193 I32 last_start_max;
194 SV **longest; /* Either &l_fixed, or &l_float. */
195 SV *longest_fixed;
196 I32 offset_fixed;
197 SV *longest_float;
198 I32 offset_float_min;
199 I32 offset_float_max;
200 I32 flags;
201 I32 whilem_c;
202 I32 *last_closep;
203 struct regnode_charclass_class *start_class;
204 } scan_data_t;
205
206 /*
207 * Forward declarations for pregcomp()'s friends.
208 */
209
210 static const scan_data_t zero_scan_data =
211 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
212
213 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
214 #define SF_BEFORE_SEOL 0x1
215 #define SF_BEFORE_MEOL 0x2
216 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
217 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
218
219 #ifdef NO_UNARY_PLUS
220 # define SF_FIX_SHIFT_EOL (0+2)
221 # define SF_FL_SHIFT_EOL (0+4)
222 #else
223 # define SF_FIX_SHIFT_EOL (+2)
224 # define SF_FL_SHIFT_EOL (+4)
225 #endif
226
227 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
228 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
229
230 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
231 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
232 #define SF_IS_INF 0x40
233 #define SF_HAS_PAR 0x80
234 #define SF_IN_PAR 0x100
235 #define SF_HAS_EVAL 0x200
236 #define SCF_DO_SUBSTR 0x400
237 #define SCF_DO_STCLASS_AND 0x0800
238 #define SCF_DO_STCLASS_OR 0x1000
239 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
240 #define SCF_WHILEM_VISITED_POS 0x2000
241
242 #define UTF (RExC_utf8 != 0)
243 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
244 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
245
246 #define OOB_UNICODE 12345678
247 #define OOB_NAMEDCLASS -1
248
249 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
250 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
251
252
253 /* length of regex to show in messages that don't mark a position within */
254 #define RegexLengthToShowInErrorMessages 127
255
256 /*
257 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
258 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
259 * op/pragma/warn/regcomp.
260 */
261 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
262 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
263
264 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
265
266 /*
267 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
268 * arg. Show regex, up to a maximum length. If it's too long, chop and add
269 * "...".
270 */
271 #define FAIL(msg) STMT_START { \
272 const char *ellipses = ""; \
273 IV len = RExC_end - RExC_precomp; \
274 \
275 if (!SIZE_ONLY) \
276 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
277 if (len > RegexLengthToShowInErrorMessages) { \
278 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
279 len = RegexLengthToShowInErrorMessages - 10; \
280 ellipses = "..."; \
281 } \
282 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
283 msg, (int)len, RExC_precomp, ellipses); \
284 } STMT_END
285
286 /*
287 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
288 * args. Show regex, up to a maximum length. If it's too long, chop and add
289 * "...".
290 */
291 #define FAIL2(pat,msg) STMT_START { \
292 const char *ellipses = ""; \
293 IV len = RExC_end - RExC_precomp; \
294 \
295 if (!SIZE_ONLY) \
296 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
297 if (len > RegexLengthToShowInErrorMessages) { \
298 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
299 len = RegexLengthToShowInErrorMessages - 10; \
300 ellipses = "..."; \
301 } \
302 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
303 msg, (int)len, RExC_precomp, ellipses); \
304 } STMT_END
305
306
307 /*
308 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
309 */
310 #define Simple_vFAIL(m) STMT_START { \
311 const IV offset = RExC_parse - RExC_precomp; \
312 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
313 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
314 } STMT_END
315
316 /*
317 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
318 */
319 #define vFAIL(m) STMT_START { \
320 if (!SIZE_ONLY) \
321 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
322 Simple_vFAIL(m); \
323 } STMT_END
324
325 /*
326 * Like Simple_vFAIL(), but accepts two arguments.
327 */
328 #define Simple_vFAIL2(m,a1) STMT_START { \
329 const IV offset = RExC_parse - RExC_precomp; \
330 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
331 (int)offset, RExC_precomp, RExC_precomp + offset); \
332 } STMT_END
333
334 /*
335 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
336 */
337 #define vFAIL2(m,a1) STMT_START { \
338 if (!SIZE_ONLY) \
339 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
340 Simple_vFAIL2(m, a1); \
341 } STMT_END
342
343
344 /*
345 * Like Simple_vFAIL(), but accepts three arguments.
346 */
347 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
348 const IV offset = RExC_parse - RExC_precomp; \
349 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
350 (int)offset, RExC_precomp, RExC_precomp + offset); \
351 } STMT_END
352
353 /*
354 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
355 */
356 #define vFAIL3(m,a1,a2) STMT_START { \
357 if (!SIZE_ONLY) \
358 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
359 Simple_vFAIL3(m, a1, a2); \
360 } STMT_END
361
362 /*
363 * Like Simple_vFAIL(), but accepts four arguments.
364 */
365 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
366 const IV offset = RExC_parse - RExC_precomp; \
367 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
368 (int)offset, RExC_precomp, RExC_precomp + offset); \
369 } STMT_END
370
371 #define vWARN(loc,m) STMT_START { \
372 const IV offset = loc - RExC_precomp; \
373 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
374 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
375 } STMT_END
376
377 #define vWARNdep(loc,m) STMT_START { \
378 const IV offset = loc - RExC_precomp; \
379 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
380 "%s" REPORT_LOCATION, \
381 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
382 } STMT_END
383
384
385 #define vWARN2(loc, m, a1) STMT_START { \
386 const IV offset = loc - RExC_precomp; \
387 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
388 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
389 } STMT_END
390
391 #define vWARN3(loc, m, a1, a2) STMT_START { \
392 const IV offset = loc - RExC_precomp; \
393 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
394 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
395 } STMT_END
396
397 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
398 const IV offset = loc - RExC_precomp; \
399 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
400 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
401 } STMT_END
402
403 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
404 const IV offset = loc - RExC_precomp; \
405 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
406 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
407 } STMT_END
408
409
410 /* Allow for side effects in s */
411 #define REGC(c,s) STMT_START { \
412 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
413 } STMT_END
414
415 /* Macros for recording node offsets. 20001227 mjd@plover.com
416 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
417 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
418 * Element 0 holds the number n.
419 */
420
421 #define MJD_OFFSET_DEBUG(x)
422 /* #define MJD_OFFSET_DEBUG(x) Perl_warn_nocontext x */
423
424
425 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
426 if (! SIZE_ONLY) { \
427 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
428 __LINE__, (node), (byte))); \
429 if((node) < 0) { \
430 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
431 } else { \
432 RExC_offsets[2*(node)-1] = (byte); \
433 } \
434 } \
435 } STMT_END
436
437 #define Set_Node_Offset(node,byte) \
438 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
439 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
440
441 #define Set_Node_Length_To_R(node,len) STMT_START { \
442 if (! SIZE_ONLY) { \
443 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
444 __LINE__, (int)(node), (int)(len))); \
445 if((node) < 0) { \
446 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
447 } else { \
448 RExC_offsets[2*(node)] = (len); \
449 } \
450 } \
451 } STMT_END
452
453 #define Set_Node_Length(node,len) \
454 Set_Node_Length_To_R((node)-RExC_emit_start, len)
455 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
456 #define Set_Node_Cur_Length(node) \
457 Set_Node_Length(node, RExC_parse - parse_start)
458
459 /* Get offsets and lengths */
460 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
461 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
462
463 static void clear_re(pTHX_ void *r);
464
465 /* Mark that we cannot extend a found fixed substring at this point.
466 Updata the longest found anchored substring and the longest found
467 floating substrings if needed. */
468
469 STATIC void
S_scan_commit(pTHX_ RExC_state_t * pRExC_state,scan_data_t * data)470 S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
471 {
472 const STRLEN l = CHR_SVLEN(data->last_found);
473 const STRLEN old_l = CHR_SVLEN(*data->longest);
474
475 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
476 SvSetMagicSV(*data->longest, data->last_found);
477 if (*data->longest == data->longest_fixed) {
478 data->offset_fixed = l ? data->last_start_min : data->pos_min;
479 if (data->flags & SF_BEFORE_EOL)
480 data->flags
481 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
482 else
483 data->flags &= ~SF_FIX_BEFORE_EOL;
484 }
485 else {
486 data->offset_float_min = l ? data->last_start_min : data->pos_min;
487 data->offset_float_max = (l
488 ? data->last_start_max
489 : data->pos_min + data->pos_delta);
490 if ((U32)data->offset_float_max > (U32)I32_MAX)
491 data->offset_float_max = I32_MAX;
492 if (data->flags & SF_BEFORE_EOL)
493 data->flags
494 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
495 else
496 data->flags &= ~SF_FL_BEFORE_EOL;
497 }
498 }
499 SvCUR_set(data->last_found, 0);
500 {
501 SV * const sv = data->last_found;
502 MAGIC * const mg =
503 SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL;
504 if (mg && mg->mg_len > 0)
505 mg->mg_len = 0;
506 }
507 data->last_end = -1;
508 data->flags &= ~SF_BEFORE_EOL;
509 }
510
511 /* Can match anything (initialization) */
512 STATIC void
S_cl_anything(pTHX_ RExC_state_t * pRExC_state,struct regnode_charclass_class * cl)513 S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
514 {
515 ANYOF_CLASS_ZERO(cl);
516 ANYOF_BITMAP_SETALL(cl);
517 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
518 if (LOC)
519 cl->flags |= ANYOF_LOCALE;
520 }
521
522 /* Can match anything (initialization) */
523 STATIC int
S_cl_is_anything(pTHX_ const struct regnode_charclass_class * cl)524 S_cl_is_anything(pTHX_ const struct regnode_charclass_class *cl)
525 {
526 int value;
527
528 for (value = 0; value <= ANYOF_MAX; value += 2)
529 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
530 return 1;
531 if (!(cl->flags & ANYOF_UNICODE_ALL))
532 return 0;
533 if (!ANYOF_BITMAP_TESTALLSET(cl))
534 return 0;
535 return 1;
536 }
537
538 /* Can match anything (initialization) */
539 STATIC void
S_cl_init(pTHX_ RExC_state_t * pRExC_state,struct regnode_charclass_class * cl)540 S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
541 {
542 Zero(cl, 1, struct regnode_charclass_class);
543 cl->type = ANYOF;
544 cl_anything(pRExC_state, cl);
545 }
546
547 STATIC void
S_cl_init_zero(pTHX_ RExC_state_t * pRExC_state,struct regnode_charclass_class * cl)548 S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
549 {
550 Zero(cl, 1, struct regnode_charclass_class);
551 cl->type = ANYOF;
552 cl_anything(pRExC_state, cl);
553 if (LOC)
554 cl->flags |= ANYOF_LOCALE;
555 }
556
557 /* 'And' a given class with another one. Can create false positives */
558 /* We assume that cl is not inverted */
559 STATIC void
S_cl_and(pTHX_ struct regnode_charclass_class * cl,const struct regnode_charclass_class * and_with)560 S_cl_and(pTHX_ struct regnode_charclass_class *cl,
561 const struct regnode_charclass_class *and_with)
562 {
563 if (!(and_with->flags & ANYOF_CLASS)
564 && !(cl->flags & ANYOF_CLASS)
565 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
566 && !(and_with->flags & ANYOF_FOLD)
567 && !(cl->flags & ANYOF_FOLD)) {
568 int i;
569
570 if (and_with->flags & ANYOF_INVERT)
571 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
572 cl->bitmap[i] &= ~and_with->bitmap[i];
573 else
574 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
575 cl->bitmap[i] &= and_with->bitmap[i];
576 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
577 if (!(and_with->flags & ANYOF_EOS))
578 cl->flags &= ~ANYOF_EOS;
579
580 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
581 !(and_with->flags & ANYOF_INVERT)) {
582 cl->flags &= ~ANYOF_UNICODE_ALL;
583 cl->flags |= ANYOF_UNICODE;
584 ARG_SET(cl, ARG(and_with));
585 }
586 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
587 !(and_with->flags & ANYOF_INVERT))
588 cl->flags &= ~ANYOF_UNICODE_ALL;
589 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
590 !(and_with->flags & ANYOF_INVERT))
591 cl->flags &= ~ANYOF_UNICODE;
592 }
593
594 /* 'OR' a given class with another one. Can create false positives */
595 /* We assume that cl is not inverted */
596 STATIC void
S_cl_or(pTHX_ RExC_state_t * pRExC_state,struct regnode_charclass_class * cl,const struct regnode_charclass_class * or_with)597 S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
598 {
599 if (or_with->flags & ANYOF_INVERT) {
600 /* We do not use
601 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
602 * <= (B1 | !B2) | (CL1 | !CL2)
603 * which is wasteful if CL2 is small, but we ignore CL2:
604 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
605 * XXXX Can we handle case-fold? Unclear:
606 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
607 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
608 */
609 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
610 && !(or_with->flags & ANYOF_FOLD)
611 && !(cl->flags & ANYOF_FOLD) ) {
612 int i;
613
614 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
615 cl->bitmap[i] |= ~or_with->bitmap[i];
616 } /* XXXX: logic is complicated otherwise */
617 else {
618 cl_anything(pRExC_state, cl);
619 }
620 } else {
621 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
622 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
623 && (!(or_with->flags & ANYOF_FOLD)
624 || (cl->flags & ANYOF_FOLD)) ) {
625 int i;
626
627 /* OR char bitmap and class bitmap separately */
628 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
629 cl->bitmap[i] |= or_with->bitmap[i];
630 if (or_with->flags & ANYOF_CLASS) {
631 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
632 cl->classflags[i] |= or_with->classflags[i];
633 cl->flags |= ANYOF_CLASS;
634 }
635 }
636 else { /* XXXX: logic is complicated, leave it along for a moment. */
637 cl_anything(pRExC_state, cl);
638 }
639 }
640 if (or_with->flags & ANYOF_EOS)
641 cl->flags |= ANYOF_EOS;
642
643 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
644 ARG(cl) != ARG(or_with)) {
645 cl->flags |= ANYOF_UNICODE_ALL;
646 cl->flags &= ~ANYOF_UNICODE;
647 }
648 if (or_with->flags & ANYOF_UNICODE_ALL) {
649 cl->flags |= ANYOF_UNICODE_ALL;
650 cl->flags &= ~ANYOF_UNICODE;
651 }
652 }
653
654 /*
655 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
656 * These need to be revisited when a newer toolchain becomes available.
657 */
658 #if defined(__sparc64__) && defined(__GNUC__)
659 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
660 # undef SPARC64_GCC_WORKAROUND
661 # define SPARC64_GCC_WORKAROUND 1
662 # endif
663 #endif
664
665 /* REx optimizer. Converts nodes into quickier variants "in place".
666 Finds fixed substrings. */
667
668 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
669 to the position after last scanned or to NULL. */
670
671 STATIC I32
S_study_chunk(pTHX_ RExC_state_t * pRExC_state,regnode ** scanp,I32 * deltap,regnode * last,scan_data_t * data,U32 flags)672 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
673 /* scanp: Start here (read-write). */
674 /* deltap: Write maxlen-minlen here. */
675 /* last: Stop before this one. */
676 {
677 I32 min = 0, pars = 0, code;
678 regnode *scan = *scanp, *next;
679 I32 delta = 0;
680 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
681 int is_inf_internal = 0; /* The studied chunk is infinite */
682 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
683 scan_data_t data_fake;
684 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
685
686 while (scan && OP(scan) != END && scan < last) {
687 /* Peephole optimizer: */
688
689 if (PL_regkind[(U8)OP(scan)] == EXACT) {
690 /* Merge several consecutive EXACTish nodes into one. */
691 regnode *n = regnext(scan);
692 U32 stringok = 1;
693 #ifdef DEBUGGING
694 regnode *stop = scan;
695 #endif
696
697 next = scan + NODE_SZ_STR(scan);
698 /* Skip NOTHING, merge EXACT*. */
699 while (n &&
700 ( PL_regkind[(U8)OP(n)] == NOTHING ||
701 (stringok && (OP(n) == OP(scan))))
702 && NEXT_OFF(n)
703 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
704 if (OP(n) == TAIL || n > next)
705 stringok = 0;
706 if (PL_regkind[(U8)OP(n)] == NOTHING) {
707 NEXT_OFF(scan) += NEXT_OFF(n);
708 next = n + NODE_STEP_REGNODE;
709 #ifdef DEBUGGING
710 if (stringok)
711 stop = n;
712 #endif
713 n = regnext(n);
714 }
715 else if (stringok) {
716 const int oldl = STR_LEN(scan);
717 regnode *nnext = regnext(n);
718
719 if (oldl + STR_LEN(n) > U8_MAX)
720 break;
721 NEXT_OFF(scan) += NEXT_OFF(n);
722 STR_LEN(scan) += STR_LEN(n);
723 next = n + NODE_SZ_STR(n);
724 /* Now we can overwrite *n : */
725 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
726 #ifdef DEBUGGING
727 stop = next - 1;
728 #endif
729 n = nnext;
730 }
731 }
732
733 if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) {
734 /*
735 Two problematic code points in Unicode casefolding of EXACT nodes:
736
737 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
738 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
739
740 which casefold to
741
742 Unicode UTF-8
743
744 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
745 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
746
747 This means that in case-insensitive matching (or "loose matching",
748 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
749 length of the above casefolded versions) can match a target string
750 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
751 This would rather mess up the minimum length computation.
752
753 What we'll do is to look for the tail four bytes, and then peek
754 at the preceding two bytes to see whether we need to decrease
755 the minimum length by four (six minus two).
756
757 Thanks to the design of UTF-8, there cannot be false matches:
758 A sequence of valid UTF-8 bytes cannot be a subsequence of
759 another valid sequence of UTF-8 bytes.
760
761 */
762 char *s0 = STRING(scan), *s, *t;
763 char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
764 const char * const t0 = "\xcc\x88\xcc\x81";
765 const char * const t1 = t0 + 3;
766
767 for (s = s0 + 2;
768 s < s2 && (t = ninstr(s, s1, t0, t1));
769 s = t + 4) {
770 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
771 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
772 min -= 4;
773 }
774 }
775
776 #ifdef DEBUGGING
777 /* Allow dumping */
778 n = scan + NODE_SZ_STR(scan);
779 while (n <= stop) {
780 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
781 OP(n) = OPTIMIZED;
782 NEXT_OFF(n) = 0;
783 }
784 n++;
785 }
786 #endif
787 }
788 /* Follow the next-chain of the current node and optimize
789 away all the NOTHINGs from it. */
790 if (OP(scan) != CURLYX) {
791 const int max = (reg_off_by_arg[OP(scan)]
792 ? I32_MAX
793 /* I32 may be smaller than U16 on CRAYs! */
794 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
795 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
796 int noff;
797 regnode *n = scan;
798
799 /* Skip NOTHING and LONGJMP. */
800 while ((n = regnext(n))
801 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
802 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
803 && off + noff < max)
804 off += noff;
805 if (reg_off_by_arg[OP(scan)])
806 ARG(scan) = off;
807 else
808 NEXT_OFF(scan) = off;
809 }
810 /* The principal pseudo-switch. Cannot be a switch, since we
811 look into several different things. */
812 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
813 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
814 next = regnext(scan);
815 code = OP(scan);
816
817 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
818 I32 max1 = 0, min1 = I32_MAX, num = 0;
819 struct regnode_charclass_class accum;
820
821 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
822 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
823 if (flags & SCF_DO_STCLASS)
824 cl_init_zero(pRExC_state, &accum);
825 while (OP(scan) == code) {
826 I32 deltanext, minnext, f = 0, fake;
827 struct regnode_charclass_class this_class;
828
829 num++;
830 data_fake.flags = 0;
831 if (data) {
832 data_fake.whilem_c = data->whilem_c;
833 data_fake.last_closep = data->last_closep;
834 }
835 else
836 data_fake.last_closep = &fake;
837 next = regnext(scan);
838 scan = NEXTOPER(scan);
839 if (code != BRANCH)
840 scan = NEXTOPER(scan);
841 if (flags & SCF_DO_STCLASS) {
842 cl_init(pRExC_state, &this_class);
843 data_fake.start_class = &this_class;
844 f = SCF_DO_STCLASS_AND;
845 }
846 if (flags & SCF_WHILEM_VISITED_POS)
847 f |= SCF_WHILEM_VISITED_POS;
848 /* we suppose the run is continuous, last=next...*/
849 minnext = study_chunk(pRExC_state, &scan, &deltanext,
850 next, &data_fake, f);
851 if (min1 > minnext)
852 min1 = minnext;
853 if (max1 < minnext + deltanext)
854 max1 = minnext + deltanext;
855 if (deltanext == I32_MAX)
856 is_inf = is_inf_internal = 1;
857 scan = next;
858 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
859 pars++;
860 if (data && (data_fake.flags & SF_HAS_EVAL))
861 data->flags |= SF_HAS_EVAL;
862 if (data)
863 data->whilem_c = data_fake.whilem_c;
864 if (flags & SCF_DO_STCLASS)
865 cl_or(pRExC_state, &accum, &this_class);
866 if (code == SUSPEND)
867 break;
868 }
869 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
870 min1 = 0;
871 if (flags & SCF_DO_SUBSTR) {
872 data->pos_min += min1;
873 data->pos_delta += max1 - min1;
874 if (max1 != min1 || is_inf)
875 data->longest = &(data->longest_float);
876 }
877 min += min1;
878 delta += max1 - min1;
879 if (flags & SCF_DO_STCLASS_OR) {
880 cl_or(pRExC_state, data->start_class, &accum);
881 if (min1) {
882 cl_and(data->start_class, &and_with);
883 flags &= ~SCF_DO_STCLASS;
884 }
885 }
886 else if (flags & SCF_DO_STCLASS_AND) {
887 if (min1) {
888 cl_and(data->start_class, &accum);
889 flags &= ~SCF_DO_STCLASS;
890 }
891 else {
892 /* Switch to OR mode: cache the old value of
893 * data->start_class */
894 StructCopy(data->start_class, &and_with,
895 struct regnode_charclass_class);
896 flags &= ~SCF_DO_STCLASS_AND;
897 StructCopy(&accum, data->start_class,
898 struct regnode_charclass_class);
899 flags |= SCF_DO_STCLASS_OR;
900 data->start_class->flags |= ANYOF_EOS;
901 }
902 }
903
904 }
905 else if (code == BRANCHJ) /* single branch is optimized. */
906 scan = NEXTOPER(NEXTOPER(scan));
907 else /* single branch is optimized. */
908 scan = NEXTOPER(scan);
909 continue;
910 }
911 else if (OP(scan) == EXACT) {
912 I32 l = STR_LEN(scan);
913 UV uc = *((U8*)STRING(scan));
914 if (UTF) {
915 const U8 * const s = (U8*)STRING(scan);
916 l = utf8_length((U8 *)s, (U8 *)s + l);
917 uc = utf8_to_uvchr((U8 *)s, NULL);
918 }
919 min += l;
920 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
921 /* The code below prefers earlier match for fixed
922 offset, later match for variable offset. */
923 if (data->last_end == -1) { /* Update the start info. */
924 data->last_start_min = data->pos_min;
925 data->last_start_max = is_inf
926 ? I32_MAX : data->pos_min + data->pos_delta;
927 }
928 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
929 {
930 SV * const sv = data->last_found;
931 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
932 mg_find(sv, PERL_MAGIC_utf8) : NULL;
933 if (mg && mg->mg_len >= 0)
934 mg->mg_len += utf8_length((U8*)STRING(scan),
935 (U8*)STRING(scan)+STR_LEN(scan));
936 }
937 if (UTF)
938 SvUTF8_on(data->last_found);
939 data->last_end = data->pos_min + l;
940 data->pos_min += l; /* As in the first entry. */
941 data->flags &= ~SF_BEFORE_EOL;
942 }
943 if (flags & SCF_DO_STCLASS_AND) {
944 /* Check whether it is compatible with what we know already! */
945 int compat = 1;
946
947 if (uc >= 0x100 ||
948 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
949 && !ANYOF_BITMAP_TEST(data->start_class, uc)
950 && (!(data->start_class->flags & ANYOF_FOLD)
951 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
952 )
953 compat = 0;
954 ANYOF_CLASS_ZERO(data->start_class);
955 ANYOF_BITMAP_ZERO(data->start_class);
956 if (compat)
957 ANYOF_BITMAP_SET(data->start_class, uc);
958 data->start_class->flags &= ~ANYOF_EOS;
959 if (uc < 0x100)
960 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
961 }
962 else if (flags & SCF_DO_STCLASS_OR) {
963 /* false positive possible if the class is case-folded */
964 if (uc < 0x100)
965 ANYOF_BITMAP_SET(data->start_class, uc);
966 else
967 data->start_class->flags |= ANYOF_UNICODE_ALL;
968 data->start_class->flags &= ~ANYOF_EOS;
969 cl_and(data->start_class, &and_with);
970 }
971 flags &= ~SCF_DO_STCLASS;
972 }
973 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
974 I32 l = STR_LEN(scan);
975 UV uc = *((U8*)STRING(scan));
976
977 /* Search for fixed substrings supports EXACT only. */
978 if (flags & SCF_DO_SUBSTR)
979 scan_commit(pRExC_state, data);
980 if (UTF) {
981 U8 *s = (U8 *)STRING(scan);
982 l = utf8_length(s, s + l);
983 uc = utf8_to_uvchr(s, NULL);
984 }
985 min += l;
986 if (data && (flags & SCF_DO_SUBSTR))
987 data->pos_min += l;
988 if (flags & SCF_DO_STCLASS_AND) {
989 /* Check whether it is compatible with what we know already! */
990 int compat = 1;
991
992 if (uc >= 0x100 ||
993 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
994 && !ANYOF_BITMAP_TEST(data->start_class, uc)
995 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
996 compat = 0;
997 ANYOF_CLASS_ZERO(data->start_class);
998 ANYOF_BITMAP_ZERO(data->start_class);
999 if (compat) {
1000 ANYOF_BITMAP_SET(data->start_class, uc);
1001 data->start_class->flags &= ~ANYOF_EOS;
1002 data->start_class->flags |= ANYOF_FOLD;
1003 if (OP(scan) == EXACTFL)
1004 data->start_class->flags |= ANYOF_LOCALE;
1005 }
1006 }
1007 else if (flags & SCF_DO_STCLASS_OR) {
1008 if (data->start_class->flags & ANYOF_FOLD) {
1009 /* false positive possible if the class is case-folded.
1010 Assume that the locale settings are the same... */
1011 if (uc < 0x100)
1012 ANYOF_BITMAP_SET(data->start_class, uc);
1013 data->start_class->flags &= ~ANYOF_EOS;
1014 }
1015 cl_and(data->start_class, &and_with);
1016 }
1017 flags &= ~SCF_DO_STCLASS;
1018 }
1019 else if (strchr((const char*)PL_varies,OP(scan))) {
1020 I32 mincount, maxcount, minnext, deltanext, fl = 0;
1021 I32 f = flags, pos_before = 0;
1022 regnode *oscan = scan;
1023 struct regnode_charclass_class this_class;
1024 struct regnode_charclass_class *oclass = NULL;
1025 I32 next_is_eval = 0;
1026
1027 switch (PL_regkind[(U8)OP(scan)]) {
1028 case WHILEM: /* End of (?:...)* . */
1029 scan = NEXTOPER(scan);
1030 goto finish;
1031 case PLUS:
1032 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
1033 next = NEXTOPER(scan);
1034 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
1035 mincount = 1;
1036 maxcount = REG_INFTY;
1037 next = regnext(scan);
1038 scan = NEXTOPER(scan);
1039 goto do_curly;
1040 }
1041 }
1042 if (flags & SCF_DO_SUBSTR)
1043 data->pos_min++;
1044 min++;
1045 /* Fall through. */
1046 case STAR:
1047 if (flags & SCF_DO_STCLASS) {
1048 mincount = 0;
1049 maxcount = REG_INFTY;
1050 next = regnext(scan);
1051 scan = NEXTOPER(scan);
1052 goto do_curly;
1053 }
1054 is_inf = is_inf_internal = 1;
1055 scan = regnext(scan);
1056 if (flags & SCF_DO_SUBSTR) {
1057 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
1058 data->longest = &(data->longest_float);
1059 }
1060 goto optimize_curly_tail;
1061 case CURLY:
1062 mincount = ARG1(scan);
1063 maxcount = ARG2(scan);
1064 next = regnext(scan);
1065 if (OP(scan) == CURLYX) {
1066 I32 lp = (data ? *(data->last_closep) : 0);
1067
1068 scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1069 }
1070 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
1071 next_is_eval = (OP(scan) == EVAL);
1072 do_curly:
1073 if (flags & SCF_DO_SUBSTR) {
1074 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
1075 pos_before = data->pos_min;
1076 }
1077 if (data) {
1078 fl = data->flags;
1079 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1080 if (is_inf)
1081 data->flags |= SF_IS_INF;
1082 }
1083 if (flags & SCF_DO_STCLASS) {
1084 cl_init(pRExC_state, &this_class);
1085 oclass = data->start_class;
1086 data->start_class = &this_class;
1087 f |= SCF_DO_STCLASS_AND;
1088 f &= ~SCF_DO_STCLASS_OR;
1089 }
1090 /* These are the cases when once a subexpression
1091 fails at a particular position, it cannot succeed
1092 even after backtracking at the enclosing scope.
1093
1094 XXXX what if minimal match and we are at the
1095 initial run of {n,m}? */
1096 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1097 f &= ~SCF_WHILEM_VISITED_POS;
1098
1099 /* This will finish on WHILEM, setting scan, or on NULL: */
1100 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1101 mincount == 0
1102 ? (f & ~SCF_DO_SUBSTR) : f);
1103
1104 if (flags & SCF_DO_STCLASS)
1105 data->start_class = oclass;
1106 if (mincount == 0 || minnext == 0) {
1107 if (flags & SCF_DO_STCLASS_OR) {
1108 cl_or(pRExC_state, data->start_class, &this_class);
1109 }
1110 else if (flags & SCF_DO_STCLASS_AND) {
1111 /* Switch to OR mode: cache the old value of
1112 * data->start_class */
1113 StructCopy(data->start_class, &and_with,
1114 struct regnode_charclass_class);
1115 flags &= ~SCF_DO_STCLASS_AND;
1116 StructCopy(&this_class, data->start_class,
1117 struct regnode_charclass_class);
1118 flags |= SCF_DO_STCLASS_OR;
1119 data->start_class->flags |= ANYOF_EOS;
1120 }
1121 } else { /* Non-zero len */
1122 if (flags & SCF_DO_STCLASS_OR) {
1123 cl_or(pRExC_state, data->start_class, &this_class);
1124 cl_and(data->start_class, &and_with);
1125 }
1126 else if (flags & SCF_DO_STCLASS_AND)
1127 cl_and(data->start_class, &this_class);
1128 flags &= ~SCF_DO_STCLASS;
1129 }
1130 if (!scan) /* It was not CURLYX, but CURLY. */
1131 scan = next;
1132 if ( /* ? quantifier ok, except for (?{ ... }) */
1133 (next_is_eval || !(mincount == 0 && maxcount == 1))
1134 && (minnext == 0) && (deltanext == 0)
1135 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
1136 && maxcount <= REG_INFTY/3 /* Complement check for big count */
1137 && ckWARN(WARN_REGEXP))
1138 {
1139 vWARN(RExC_parse,
1140 "Quantifier unexpected on zero-length expression");
1141 }
1142
1143 min += minnext * mincount;
1144 is_inf_internal |= ((maxcount == REG_INFTY
1145 && (minnext + deltanext) > 0)
1146 || deltanext == I32_MAX);
1147 is_inf |= is_inf_internal;
1148 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1149
1150 /* Try powerful optimization CURLYX => CURLYN. */
1151 if ( OP(oscan) == CURLYX && data
1152 && data->flags & SF_IN_PAR
1153 && !(data->flags & SF_HAS_EVAL)
1154 && !deltanext && minnext == 1 ) {
1155 /* Try to optimize to CURLYN. */
1156 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
1157 regnode *nxt1 = nxt;
1158 #ifdef DEBUGGING
1159 regnode *nxt2;
1160 #endif
1161
1162 /* Skip open. */
1163 nxt = regnext(nxt);
1164 if (!strchr((const char*)PL_simple,OP(nxt))
1165 && !(PL_regkind[(U8)OP(nxt)] == EXACT
1166 && STR_LEN(nxt) == 1))
1167 goto nogo;
1168 #ifdef DEBUGGING
1169 nxt2 = nxt;
1170 #endif
1171 nxt = regnext(nxt);
1172 if (OP(nxt) != CLOSE)
1173 goto nogo;
1174 /* Now we know that nxt2 is the only contents: */
1175 oscan->flags = (U8)ARG(nxt);
1176 OP(oscan) = CURLYN;
1177 OP(nxt1) = NOTHING; /* was OPEN. */
1178 #ifdef DEBUGGING
1179 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1180 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1181 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1182 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1183 OP(nxt + 1) = OPTIMIZED; /* was count. */
1184 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
1185 #endif
1186 }
1187 nogo:
1188
1189 /* Try optimization CURLYX => CURLYM. */
1190 if ( OP(oscan) == CURLYX && data
1191 && !(data->flags & SF_HAS_PAR)
1192 && !(data->flags & SF_HAS_EVAL)
1193 && !deltanext /* atom is fixed width */
1194 && minnext != 0 /* CURLYM can't handle zero width */
1195 ) {
1196 /* XXXX How to optimize if data == 0? */
1197 /* Optimize to a simpler form. */
1198 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1199 regnode *nxt2;
1200
1201 OP(oscan) = CURLYM;
1202 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
1203 && (OP(nxt2) != WHILEM))
1204 nxt = nxt2;
1205 OP(nxt2) = SUCCEED; /* Whas WHILEM */
1206 /* Need to optimize away parenths. */
1207 if (data->flags & SF_IN_PAR) {
1208 /* Set the parenth number. */
1209 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1210
1211 if (OP(nxt) != CLOSE)
1212 FAIL("Panic opt close");
1213 oscan->flags = (U8)ARG(nxt);
1214 OP(nxt1) = OPTIMIZED; /* was OPEN. */
1215 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1216 #ifdef DEBUGGING
1217 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1218 OP(nxt + 1) = OPTIMIZED; /* was count. */
1219 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1220 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
1221 #endif
1222 #if 0
1223 while ( nxt1 && (OP(nxt1) != WHILEM)) {
1224 regnode *nnxt = regnext(nxt1);
1225
1226 if (nnxt == nxt) {
1227 if (reg_off_by_arg[OP(nxt1)])
1228 ARG_SET(nxt1, nxt2 - nxt1);
1229 else if (nxt2 - nxt1 < U16_MAX)
1230 NEXT_OFF(nxt1) = nxt2 - nxt1;
1231 else
1232 OP(nxt) = NOTHING; /* Cannot beautify */
1233 }
1234 nxt1 = nnxt;
1235 }
1236 #endif
1237 /* Optimize again: */
1238 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
1239 NULL, 0);
1240 }
1241 else
1242 oscan->flags = 0;
1243 }
1244 else if ((OP(oscan) == CURLYX)
1245 && (flags & SCF_WHILEM_VISITED_POS)
1246 /* See the comment on a similar expression above.
1247 However, this time it not a subexpression
1248 we care about, but the expression itself. */
1249 && (maxcount == REG_INFTY)
1250 && data && ++data->whilem_c < 16) {
1251 /* This stays as CURLYX, we can put the count/of pair. */
1252 /* Find WHILEM (as in regexec.c) */
1253 regnode *nxt = oscan + NEXT_OFF(oscan);
1254
1255 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1256 nxt += ARG(nxt);
1257 PREVOPER(nxt)->flags = (U8)(data->whilem_c
1258 | (RExC_whilem_seen << 4)); /* On WHILEM */
1259 }
1260 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
1261 pars++;
1262 if (flags & SCF_DO_SUBSTR) {
1263 SV *last_str = Nullsv;
1264 int counted = mincount != 0;
1265
1266 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
1267 #if defined(SPARC64_GCC_WORKAROUND)
1268 I32 b = 0;
1269 STRLEN l = 0;
1270 const char *s = NULL;
1271 I32 old = 0;
1272
1273 if (pos_before >= data->last_start_min)
1274 b = pos_before;
1275 else
1276 b = data->last_start_min;
1277
1278 l = 0;
1279 s = SvPV_const(data->last_found, l);
1280 old = b - data->last_start_min;
1281
1282 #else
1283 I32 b = pos_before >= data->last_start_min
1284 ? pos_before : data->last_start_min;
1285 STRLEN l;
1286 const char *s = SvPV_const(data->last_found, l);
1287 I32 old = b - data->last_start_min;
1288 #endif
1289
1290 if (UTF)
1291 old = utf8_hop((U8*)s, old) - (U8*)s;
1292
1293 l -= old;
1294 /* Get the added string: */
1295 last_str = newSVpvn(s + old, l);
1296 if (UTF)
1297 SvUTF8_on(last_str);
1298 if (deltanext == 0 && pos_before == b) {
1299 /* What was added is a constant string */
1300 if (mincount > 1) {
1301 SvGROW(last_str, (mincount * l) + 1);
1302 repeatcpy(SvPVX(last_str) + l,
1303 SvPVX_const(last_str), l, mincount - 1);
1304 SvCUR_set(last_str, SvCUR(last_str) * mincount);
1305 /* Add additional parts. */
1306 SvCUR_set(data->last_found,
1307 SvCUR(data->last_found) - l);
1308 sv_catsv(data->last_found, last_str);
1309 {
1310 SV * sv = data->last_found;
1311 MAGIC *mg =
1312 SvUTF8(sv) && SvMAGICAL(sv) ?
1313 mg_find(sv, PERL_MAGIC_utf8) : NULL;
1314 if (mg && mg->mg_len >= 0)
1315 mg->mg_len += CHR_SVLEN(last_str);
1316 }
1317 data->last_end += l * (mincount - 1);
1318 }
1319 } else {
1320 /* start offset must point into the last copy */
1321 data->last_start_min += minnext * (mincount - 1);
1322 data->last_start_max += is_inf ? I32_MAX
1323 : (maxcount - 1) * (minnext + data->pos_delta);
1324 }
1325 }
1326 /* It is counted once already... */
1327 data->pos_min += minnext * (mincount - counted);
1328 data->pos_delta += - counted * deltanext +
1329 (minnext + deltanext) * maxcount - minnext * mincount;
1330 if (mincount != maxcount) {
1331 /* Cannot extend fixed substrings found inside
1332 the group. */
1333 scan_commit(pRExC_state,data);
1334 if (mincount && last_str) {
1335 sv_setsv(data->last_found, last_str);
1336 data->last_end = data->pos_min;
1337 data->last_start_min =
1338 data->pos_min - CHR_SVLEN(last_str);
1339 data->last_start_max = is_inf
1340 ? I32_MAX
1341 : data->pos_min + data->pos_delta
1342 - CHR_SVLEN(last_str);
1343 }
1344 data->longest = &(data->longest_float);
1345 }
1346 SvREFCNT_dec(last_str);
1347 }
1348 if (data && (fl & SF_HAS_EVAL))
1349 data->flags |= SF_HAS_EVAL;
1350 optimize_curly_tail:
1351 if (OP(oscan) != CURLYX) {
1352 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
1353 && NEXT_OFF(next))
1354 NEXT_OFF(oscan) += NEXT_OFF(next);
1355 }
1356 continue;
1357 default: /* REF and CLUMP only? */
1358 if (flags & SCF_DO_SUBSTR) {
1359 scan_commit(pRExC_state,data); /* Cannot expect anything... */
1360 data->longest = &(data->longest_float);
1361 }
1362 is_inf = is_inf_internal = 1;
1363 if (flags & SCF_DO_STCLASS_OR)
1364 cl_anything(pRExC_state, data->start_class);
1365 flags &= ~SCF_DO_STCLASS;
1366 break;
1367 }
1368 }
1369 else if (strchr((const char*)PL_simple,OP(scan))) {
1370 int value = 0;
1371
1372 if (flags & SCF_DO_SUBSTR) {
1373 scan_commit(pRExC_state,data);
1374 data->pos_min++;
1375 }
1376 min++;
1377 if (flags & SCF_DO_STCLASS) {
1378 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1379
1380 /* Some of the logic below assumes that switching
1381 locale on will only add false positives. */
1382 switch (PL_regkind[(U8)OP(scan)]) {
1383 case SANY:
1384 default:
1385 do_default:
1386 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1387 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1388 cl_anything(pRExC_state, data->start_class);
1389 break;
1390 case REG_ANY:
1391 if (OP(scan) == SANY)
1392 goto do_default;
1393 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1394 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1395 || (data->start_class->flags & ANYOF_CLASS));
1396 cl_anything(pRExC_state, data->start_class);
1397 }
1398 if (flags & SCF_DO_STCLASS_AND || !value)
1399 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1400 break;
1401 case ANYOF:
1402 if (flags & SCF_DO_STCLASS_AND)
1403 cl_and(data->start_class,
1404 (struct regnode_charclass_class*)scan);
1405 else
1406 cl_or(pRExC_state, data->start_class,
1407 (struct regnode_charclass_class*)scan);
1408 break;
1409 case ALNUM:
1410 if (flags & SCF_DO_STCLASS_AND) {
1411 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1412 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1413 for (value = 0; value < 256; value++)
1414 if (!isALNUM(value))
1415 ANYOF_BITMAP_CLEAR(data->start_class, value);
1416 }
1417 }
1418 else {
1419 if (data->start_class->flags & ANYOF_LOCALE)
1420 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1421 else {
1422 for (value = 0; value < 256; value++)
1423 if (isALNUM(value))
1424 ANYOF_BITMAP_SET(data->start_class, value);
1425 }
1426 }
1427 break;
1428 case ALNUML:
1429 if (flags & SCF_DO_STCLASS_AND) {
1430 if (data->start_class->flags & ANYOF_LOCALE)
1431 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1432 }
1433 else {
1434 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1435 data->start_class->flags |= ANYOF_LOCALE;
1436 }
1437 break;
1438 case NALNUM:
1439 if (flags & SCF_DO_STCLASS_AND) {
1440 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1441 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1442 for (value = 0; value < 256; value++)
1443 if (isALNUM(value))
1444 ANYOF_BITMAP_CLEAR(data->start_class, value);
1445 }
1446 }
1447 else {
1448 if (data->start_class->flags & ANYOF_LOCALE)
1449 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1450 else {
1451 for (value = 0; value < 256; value++)
1452 if (!isALNUM(value))
1453 ANYOF_BITMAP_SET(data->start_class, value);
1454 }
1455 }
1456 break;
1457 case NALNUML:
1458 if (flags & SCF_DO_STCLASS_AND) {
1459 if (data->start_class->flags & ANYOF_LOCALE)
1460 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1461 }
1462 else {
1463 data->start_class->flags |= ANYOF_LOCALE;
1464 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1465 }
1466 break;
1467 case SPACE:
1468 if (flags & SCF_DO_STCLASS_AND) {
1469 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1470 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1471 for (value = 0; value < 256; value++)
1472 if (!isSPACE(value))
1473 ANYOF_BITMAP_CLEAR(data->start_class, value);
1474 }
1475 }
1476 else {
1477 if (data->start_class->flags & ANYOF_LOCALE)
1478 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1479 else {
1480 for (value = 0; value < 256; value++)
1481 if (isSPACE(value))
1482 ANYOF_BITMAP_SET(data->start_class, value);
1483 }
1484 }
1485 break;
1486 case SPACEL:
1487 if (flags & SCF_DO_STCLASS_AND) {
1488 if (data->start_class->flags & ANYOF_LOCALE)
1489 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1490 }
1491 else {
1492 data->start_class->flags |= ANYOF_LOCALE;
1493 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1494 }
1495 break;
1496 case NSPACE:
1497 if (flags & SCF_DO_STCLASS_AND) {
1498 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1499 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1500 for (value = 0; value < 256; value++)
1501 if (isSPACE(value))
1502 ANYOF_BITMAP_CLEAR(data->start_class, value);
1503 }
1504 }
1505 else {
1506 if (data->start_class->flags & ANYOF_LOCALE)
1507 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1508 else {
1509 for (value = 0; value < 256; value++)
1510 if (!isSPACE(value))
1511 ANYOF_BITMAP_SET(data->start_class, value);
1512 }
1513 }
1514 break;
1515 case NSPACEL:
1516 if (flags & SCF_DO_STCLASS_AND) {
1517 if (data->start_class->flags & ANYOF_LOCALE) {
1518 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1519 for (value = 0; value < 256; value++)
1520 if (!isSPACE(value))
1521 ANYOF_BITMAP_CLEAR(data->start_class, value);
1522 }
1523 }
1524 else {
1525 data->start_class->flags |= ANYOF_LOCALE;
1526 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1527 }
1528 break;
1529 case DIGIT:
1530 if (flags & SCF_DO_STCLASS_AND) {
1531 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1532 for (value = 0; value < 256; value++)
1533 if (!isDIGIT(value))
1534 ANYOF_BITMAP_CLEAR(data->start_class, value);
1535 }
1536 else {
1537 if (data->start_class->flags & ANYOF_LOCALE)
1538 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1539 else {
1540 for (value = 0; value < 256; value++)
1541 if (isDIGIT(value))
1542 ANYOF_BITMAP_SET(data->start_class, value);
1543 }
1544 }
1545 break;
1546 case NDIGIT:
1547 if (flags & SCF_DO_STCLASS_AND) {
1548 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1549 for (value = 0; value < 256; value++)
1550 if (isDIGIT(value))
1551 ANYOF_BITMAP_CLEAR(data->start_class, value);
1552 }
1553 else {
1554 if (data->start_class->flags & ANYOF_LOCALE)
1555 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1556 else {
1557 for (value = 0; value < 256; value++)
1558 if (!isDIGIT(value))
1559 ANYOF_BITMAP_SET(data->start_class, value);
1560 }
1561 }
1562 break;
1563 }
1564 if (flags & SCF_DO_STCLASS_OR)
1565 cl_and(data->start_class, &and_with);
1566 flags &= ~SCF_DO_STCLASS;
1567 }
1568 }
1569 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
1570 data->flags |= (OP(scan) == MEOL
1571 ? SF_BEFORE_MEOL
1572 : SF_BEFORE_SEOL);
1573 }
1574 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
1575 /* Lookbehind, or need to calculate parens/evals/stclass: */
1576 && (scan->flags || data || (flags & SCF_DO_STCLASS))
1577 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
1578 /* Lookahead/lookbehind */
1579 I32 deltanext, minnext, fake = 0;
1580 regnode *nscan;
1581 struct regnode_charclass_class intrnl;
1582 int f = 0;
1583
1584 data_fake.flags = 0;
1585 if (data) {
1586 data_fake.whilem_c = data->whilem_c;
1587 data_fake.last_closep = data->last_closep;
1588 }
1589 else
1590 data_fake.last_closep = &fake;
1591 if ( flags & SCF_DO_STCLASS && !scan->flags
1592 && OP(scan) == IFMATCH ) { /* Lookahead */
1593 cl_init(pRExC_state, &intrnl);
1594 data_fake.start_class = &intrnl;
1595 f |= SCF_DO_STCLASS_AND;
1596 }
1597 if (flags & SCF_WHILEM_VISITED_POS)
1598 f |= SCF_WHILEM_VISITED_POS;
1599 next = regnext(scan);
1600 nscan = NEXTOPER(NEXTOPER(scan));
1601 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
1602 if (scan->flags) {
1603 if (deltanext) {
1604 vFAIL("Variable length lookbehind not implemented");
1605 }
1606 else if (minnext > U8_MAX) {
1607 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
1608 }
1609 scan->flags = (U8)minnext;
1610 }
1611 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1612 pars++;
1613 if (data && (data_fake.flags & SF_HAS_EVAL))
1614 data->flags |= SF_HAS_EVAL;
1615 if (data)
1616 data->whilem_c = data_fake.whilem_c;
1617 if (f & SCF_DO_STCLASS_AND) {
1618 const int was = (data->start_class->flags & ANYOF_EOS);
1619
1620 cl_and(data->start_class, &intrnl);
1621 if (was)
1622 data->start_class->flags |= ANYOF_EOS;
1623 }
1624 }
1625 else if (OP(scan) == OPEN) {
1626 pars++;
1627 }
1628 else if (OP(scan) == CLOSE) {
1629 if ((I32)ARG(scan) == is_par) {
1630 next = regnext(scan);
1631
1632 if ( next && (OP(next) != WHILEM) && next < last)
1633 is_par = 0; /* Disable optimization */
1634 }
1635 if (data)
1636 *(data->last_closep) = ARG(scan);
1637 }
1638 else if (OP(scan) == EVAL) {
1639 if (data)
1640 data->flags |= SF_HAS_EVAL;
1641 }
1642 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
1643 if (flags & SCF_DO_SUBSTR) {
1644 scan_commit(pRExC_state,data);
1645 data->longest = &(data->longest_float);
1646 }
1647 is_inf = is_inf_internal = 1;
1648 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1649 cl_anything(pRExC_state, data->start_class);
1650 flags &= ~SCF_DO_STCLASS;
1651 }
1652 /* Else: zero-length, ignore. */
1653 scan = regnext(scan);
1654 }
1655
1656 finish:
1657 *scanp = scan;
1658 *deltap = is_inf_internal ? I32_MAX : delta;
1659 if (flags & SCF_DO_SUBSTR && is_inf)
1660 data->pos_delta = I32_MAX - data->pos_min;
1661 if (is_par > U8_MAX)
1662 is_par = 0;
1663 if (is_par && pars==1 && data) {
1664 data->flags |= SF_IN_PAR;
1665 data->flags &= ~SF_HAS_PAR;
1666 }
1667 else if (pars && data) {
1668 data->flags |= SF_HAS_PAR;
1669 data->flags &= ~SF_IN_PAR;
1670 }
1671 if (flags & SCF_DO_STCLASS_OR)
1672 cl_and(data->start_class, &and_with);
1673 return min;
1674 }
1675
1676 STATIC I32
S_add_data(pTHX_ RExC_state_t * pRExC_state,I32 n,const char * s)1677 S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, const char *s)
1678 {
1679 if (RExC_rx->data) {
1680 Renewc(RExC_rx->data,
1681 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
1682 char, struct reg_data);
1683 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1684 RExC_rx->data->count += n;
1685 }
1686 else {
1687 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
1688 char, struct reg_data);
1689 Newx(RExC_rx->data->what, n, U8);
1690 RExC_rx->data->count = n;
1691 }
1692 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1693 return RExC_rx->data->count - n;
1694 }
1695
1696 void
Perl_reginitcolors(pTHX)1697 Perl_reginitcolors(pTHX)
1698 {
1699 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
1700 if (s) {
1701 char *t = savepv(s);
1702 int i = 0;
1703 PL_colors[0] = t;
1704 while (++i < 6) {
1705 t = strchr(t, '\t');
1706 if (t) {
1707 *t = '\0';
1708 PL_colors[i] = ++t;
1709 }
1710 else
1711 PL_colors[i] = t = (char *)"";
1712 }
1713 } else {
1714 int i = 0;
1715 while (i < 6)
1716 PL_colors[i++] = (char *)"";
1717 }
1718 PL_colorset = 1;
1719 }
1720
1721
1722 /*
1723 - pregcomp - compile a regular expression into internal code
1724 *
1725 * We can't allocate space until we know how big the compiled form will be,
1726 * but we can't compile it (and thus know how big it is) until we've got a
1727 * place to put the code. So we cheat: we compile it twice, once with code
1728 * generation turned off and size counting turned on, and once "for real".
1729 * This also means that we don't allocate space until we are sure that the
1730 * thing really will compile successfully, and we never have to move the
1731 * code and thus invalidate pointers into it. (Note that it has to be in
1732 * one piece because free() must be able to free it all.) [NB: not true in perl]
1733 *
1734 * Beware that the optimization-preparation code in here knows about some
1735 * of the structure of the compiled regexp. [I'll say.]
1736 */
1737 regexp *
Perl_pregcomp(pTHX_ char * exp,char * xend,PMOP * pm)1738 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
1739 {
1740 register regexp *r;
1741 regnode *scan;
1742 regnode *first;
1743 I32 flags;
1744 I32 minlen = 0;
1745 I32 sawplus = 0;
1746 I32 sawopen = 0;
1747 scan_data_t data;
1748 RExC_state_t RExC_state;
1749 RExC_state_t *pRExC_state = &RExC_state;
1750
1751 if (exp == NULL)
1752 FAIL("NULL regexp argument");
1753
1754 RExC_orig_utf8 = RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
1755
1756 DEBUG_r({
1757 if (!PL_colorset) reginitcolors();
1758 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
1759 PL_colors[4],PL_colors[5],PL_colors[0],
1760 (int)(xend - exp), exp, PL_colors[1]);
1761 });
1762
1763 redo_first_pass:
1764 RExC_precomp = exp;
1765 RExC_flags = pm->op_pmflags;
1766 RExC_sawback = 0;
1767
1768 RExC_seen = 0;
1769 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1770 RExC_seen_evals = 0;
1771 RExC_extralen = 0;
1772
1773 /* First pass: determine size, legality. */
1774 RExC_parse = exp;
1775 RExC_start = exp;
1776 RExC_end = xend;
1777 RExC_naughty = 0;
1778 RExC_npar = 1;
1779 RExC_size = 0L;
1780 RExC_emit = &PL_regdummy;
1781 RExC_whilem_seen = 0;
1782 #if 0 /* REGC() is (currently) a NOP at the first pass.
1783 * Clever compilers notice this and complain. --jhi */
1784 REGC((U8)REG_MAGIC, (char*)RExC_emit);
1785 #endif
1786 if (reg(pRExC_state, 0, &flags) == NULL) {
1787 RExC_precomp = Nullch;
1788 return(NULL);
1789 }
1790 if (RExC_utf8 && !RExC_orig_utf8) {
1791 STRLEN len = xend-exp;
1792 DEBUG_r(PerlIO_printf(Perl_debug_log,
1793 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
1794 exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)exp, &len);
1795 xend = exp + len;
1796 RExC_orig_utf8 = RExC_utf8;
1797 SAVEFREEPV(exp);
1798 goto redo_first_pass;
1799 }
1800
1801 DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
1802
1803 /* Small enough for pointer-storage convention?
1804 If extralen==0, this means that we will not need long jumps. */
1805 if (RExC_size >= 0x10000L && RExC_extralen)
1806 RExC_size += RExC_extralen;
1807 else
1808 RExC_extralen = 0;
1809 if (RExC_whilem_seen > 15)
1810 RExC_whilem_seen = 15;
1811
1812 /* Allocate space and initialize. */
1813 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
1814 char, regexp);
1815 if (r == NULL)
1816 FAIL("Regexp out of space");
1817
1818 #ifdef DEBUGGING
1819 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
1820 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
1821 #endif
1822 r->refcnt = 1;
1823 r->prelen = xend - exp;
1824 r->precomp = savepvn(RExC_precomp, r->prelen);
1825 r->subbeg = NULL;
1826 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
1827 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
1828
1829 r->substrs = 0; /* Useful during FAIL. */
1830 r->startp = 0; /* Useful during FAIL. */
1831 r->endp = 0; /* Useful during FAIL. */
1832
1833 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1834 if (r->offsets) {
1835 r->offsets[0] = RExC_size;
1836 }
1837 DEBUG_r(PerlIO_printf(Perl_debug_log,
1838 "%s %"UVuf" bytes for offset annotations.\n",
1839 r->offsets ? "Got" : "Couldn't get",
1840 (UV)((2*RExC_size+1) * sizeof(U32))));
1841
1842 RExC_rx = r;
1843
1844 /* Second pass: emit code. */
1845 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
1846 RExC_parse = exp;
1847 RExC_end = xend;
1848 RExC_naughty = 0;
1849 RExC_npar = 1;
1850 RExC_emit_start = r->program;
1851 RExC_emit = r->program;
1852 /* Store the count of eval-groups for security checks: */
1853 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1854 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
1855 r->data = 0;
1856 if (reg(pRExC_state, 0, &flags) == NULL)
1857 return(NULL);
1858
1859 /* Dig out information for optimizations. */
1860 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
1861 pm->op_pmflags = RExC_flags;
1862 if (UTF)
1863 r->reganch |= ROPT_UTF8; /* Unicode in it? */
1864 r->regstclass = NULL;
1865 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
1866 r->reganch |= ROPT_NAUGHTY;
1867 scan = r->program + 1; /* First BRANCH. */
1868
1869 /* XXXX To minimize changes to RE engine we always allocate
1870 3-units-long substrs field. */
1871 Newxz(r->substrs, 1, struct reg_substr_data);
1872
1873 StructCopy(&zero_scan_data, &data, scan_data_t);
1874 /* XXXX Should not we check for something else? Usually it is OPEN1... */
1875 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
1876 I32 fake;
1877 STRLEN longest_float_length, longest_fixed_length;
1878 struct regnode_charclass_class ch_class;
1879 int stclass_flag;
1880 I32 last_close = 0;
1881
1882 first = scan;
1883 /* Skip introductions and multiplicators >= 1. */
1884 while ((OP(first) == OPEN && (sawopen = 1)) ||
1885 /* An OR of *one* alternative - should not happen now. */
1886 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1887 (OP(first) == PLUS) ||
1888 (OP(first) == MINMOD) ||
1889 /* An {n,m} with n>0 */
1890 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
1891 if (OP(first) == PLUS)
1892 sawplus = 1;
1893 else
1894 first += regarglen[(U8)OP(first)];
1895 first = NEXTOPER(first);
1896 }
1897
1898 /* Starting-point info. */
1899 again:
1900 if (PL_regkind[(U8)OP(first)] == EXACT) {
1901 if (OP(first) == EXACT)
1902 ; /* Empty, get anchored substr later. */
1903 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
1904 r->regstclass = first;
1905 }
1906 else if (strchr((const char*)PL_simple,OP(first)))
1907 r->regstclass = first;
1908 else if (PL_regkind[(U8)OP(first)] == BOUND ||
1909 PL_regkind[(U8)OP(first)] == NBOUND)
1910 r->regstclass = first;
1911 else if (PL_regkind[(U8)OP(first)] == BOL) {
1912 r->reganch |= (OP(first) == MBOL
1913 ? ROPT_ANCH_MBOL
1914 : (OP(first) == SBOL
1915 ? ROPT_ANCH_SBOL
1916 : ROPT_ANCH_BOL));
1917 first = NEXTOPER(first);
1918 goto again;
1919 }
1920 else if (OP(first) == GPOS) {
1921 r->reganch |= ROPT_ANCH_GPOS;
1922 first = NEXTOPER(first);
1923 goto again;
1924 }
1925 else if (!sawopen && (OP(first) == STAR &&
1926 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
1927 !(r->reganch & ROPT_ANCH) )
1928 {
1929 /* turn .* into ^.* with an implied $*=1 */
1930 const int type =
1931 (OP(NEXTOPER(first)) == REG_ANY)
1932 ? ROPT_ANCH_MBOL
1933 : ROPT_ANCH_SBOL;
1934 r->reganch |= type | ROPT_IMPLICIT;
1935 first = NEXTOPER(first);
1936 goto again;
1937 }
1938 if (sawplus && (!sawopen || !RExC_sawback)
1939 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
1940 /* x+ must match at the 1st pos of run of x's */
1941 r->reganch |= ROPT_SKIP;
1942
1943 /* Scan is after the zeroth branch, first is atomic matcher. */
1944 DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
1945 (IV)(first - scan + 1)));
1946 /*
1947 * If there's something expensive in the r.e., find the
1948 * longest literal string that must appear and make it the
1949 * regmust. Resolve ties in favor of later strings, since
1950 * the regstart check works with the beginning of the r.e.
1951 * and avoiding duplication strengthens checking. Not a
1952 * strong reason, but sufficient in the absence of others.
1953 * [Now we resolve ties in favor of the earlier string if
1954 * it happens that c_offset_min has been invalidated, since the
1955 * earlier string may buy us something the later one won't.]
1956 */
1957 minlen = 0;
1958
1959 data.longest_fixed = newSVpvn("",0);
1960 data.longest_float = newSVpvn("",0);
1961 data.last_found = newSVpvn("",0);
1962 data.longest = &(data.longest_fixed);
1963 first = scan;
1964 if (!r->regstclass) {
1965 cl_init(pRExC_state, &ch_class);
1966 data.start_class = &ch_class;
1967 stclass_flag = SCF_DO_STCLASS_AND;
1968 } else /* XXXX Check for BOUND? */
1969 stclass_flag = 0;
1970 data.last_closep = &last_close;
1971
1972 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
1973 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
1974 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
1975 && data.last_start_min == 0 && data.last_end > 0
1976 && !RExC_seen_zerolen
1977 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
1978 r->reganch |= ROPT_CHECK_ALL;
1979 scan_commit(pRExC_state, &data);
1980 SvREFCNT_dec(data.last_found);
1981
1982 longest_float_length = CHR_SVLEN(data.longest_float);
1983 if (longest_float_length
1984 || (data.flags & SF_FL_BEFORE_EOL
1985 && (!(data.flags & SF_FL_BEFORE_MEOL)
1986 || (RExC_flags & PMf_MULTILINE)))) {
1987 int t;
1988
1989 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
1990 && data.offset_fixed == data.offset_float_min
1991 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1992 goto remove_float; /* As in (a)+. */
1993
1994 if (SvUTF8(data.longest_float)) {
1995 r->float_utf8 = data.longest_float;
1996 r->float_substr = Nullsv;
1997 } else {
1998 r->float_substr = data.longest_float;
1999 r->float_utf8 = Nullsv;
2000 }
2001 r->float_min_offset = data.offset_float_min;
2002 r->float_max_offset = data.offset_float_max;
2003 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
2004 && (!(data.flags & SF_FL_BEFORE_MEOL)
2005 || (RExC_flags & PMf_MULTILINE)));
2006 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
2007 }
2008 else {
2009 remove_float:
2010 r->float_substr = r->float_utf8 = Nullsv;
2011 SvREFCNT_dec(data.longest_float);
2012 longest_float_length = 0;
2013 }
2014
2015 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
2016 if (longest_fixed_length
2017 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
2018 && (!(data.flags & SF_FIX_BEFORE_MEOL)
2019 || (RExC_flags & PMf_MULTILINE)))) {
2020 int t;
2021
2022 if (SvUTF8(data.longest_fixed)) {
2023 r->anchored_utf8 = data.longest_fixed;
2024 r->anchored_substr = Nullsv;
2025 } else {
2026 r->anchored_substr = data.longest_fixed;
2027 r->anchored_utf8 = Nullsv;
2028 }
2029 r->anchored_offset = data.offset_fixed;
2030 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
2031 && (!(data.flags & SF_FIX_BEFORE_MEOL)
2032 || (RExC_flags & PMf_MULTILINE)));
2033 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
2034 }
2035 else {
2036 r->anchored_substr = r->anchored_utf8 = Nullsv;
2037 SvREFCNT_dec(data.longest_fixed);
2038 longest_fixed_length = 0;
2039 }
2040 if (r->regstclass
2041 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
2042 r->regstclass = NULL;
2043 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
2044 && stclass_flag
2045 && !(data.start_class->flags & ANYOF_EOS)
2046 && !cl_is_anything(data.start_class))
2047 {
2048 const I32 n = add_data(pRExC_state, 1, "f");
2049
2050 Newx(RExC_rx->data->data[n], 1,
2051 struct regnode_charclass_class);
2052 StructCopy(data.start_class,
2053 (struct regnode_charclass_class*)RExC_rx->data->data[n],
2054 struct regnode_charclass_class);
2055 r->regstclass = (regnode*)RExC_rx->data->data[n];
2056 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
2057 PL_regdata = r->data; /* for regprop() */
2058 DEBUG_r({ SV *sv = sv_newmortal();
2059 regprop(sv, (regnode*)data.start_class);
2060 PerlIO_printf(Perl_debug_log,
2061 "synthetic stclass \"%s\".\n",
2062 SvPVX_const(sv));});
2063 }
2064
2065 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
2066 if (longest_fixed_length > longest_float_length) {
2067 r->check_substr = r->anchored_substr;
2068 r->check_utf8 = r->anchored_utf8;
2069 r->check_offset_min = r->check_offset_max = r->anchored_offset;
2070 if (r->reganch & ROPT_ANCH_SINGLE)
2071 r->reganch |= ROPT_NOSCAN;
2072 }
2073 else {
2074 r->check_substr = r->float_substr;
2075 r->check_utf8 = r->float_utf8;
2076 r->check_offset_min = data.offset_float_min;
2077 r->check_offset_max = data.offset_float_max;
2078 }
2079 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
2080 This should be changed ASAP! */
2081 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
2082 r->reganch |= RE_USE_INTUIT;
2083 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
2084 r->reganch |= RE_INTUIT_TAIL;
2085 }
2086 }
2087 else {
2088 /* Several toplevels. Best we can is to set minlen. */
2089 I32 fake;
2090 struct regnode_charclass_class ch_class;
2091 I32 last_close = 0;
2092
2093 DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
2094 scan = r->program + 1;
2095 cl_init(pRExC_state, &ch_class);
2096 data.start_class = &ch_class;
2097 data.last_closep = &last_close;
2098 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
2099 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
2100 = r->float_substr = r->float_utf8 = Nullsv;
2101 if (!(data.start_class->flags & ANYOF_EOS)
2102 && !cl_is_anything(data.start_class))
2103 {
2104 const I32 n = add_data(pRExC_state, 1, "f");
2105
2106 Newx(RExC_rx->data->data[n], 1,
2107 struct regnode_charclass_class);
2108 StructCopy(data.start_class,
2109 (struct regnode_charclass_class*)RExC_rx->data->data[n],
2110 struct regnode_charclass_class);
2111 r->regstclass = (regnode*)RExC_rx->data->data[n];
2112 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
2113 DEBUG_r({ SV* sv = sv_newmortal();
2114 regprop(sv, (regnode*)data.start_class);
2115 PerlIO_printf(Perl_debug_log,
2116 "synthetic stclass \"%s\".\n",
2117 SvPVX_const(sv));});
2118 }
2119 }
2120
2121 r->minlen = minlen;
2122 if (RExC_seen & REG_SEEN_GPOS)
2123 r->reganch |= ROPT_GPOS_SEEN;
2124 if (RExC_seen & REG_SEEN_LOOKBEHIND)
2125 r->reganch |= ROPT_LOOKBEHIND_SEEN;
2126 if (RExC_seen & REG_SEEN_EVAL)
2127 r->reganch |= ROPT_EVAL_SEEN;
2128 if (RExC_seen & REG_SEEN_CANY)
2129 r->reganch |= ROPT_CANY_SEEN;
2130 Newxz(r->startp, RExC_npar, I32);
2131 Newxz(r->endp, RExC_npar, I32);
2132 PL_regdata = r->data; /* for regprop() */
2133 DEBUG_r(regdump(r));
2134 return(r);
2135 }
2136
2137 /*
2138 - reg - regular expression, i.e. main body or parenthesized thing
2139 *
2140 * Caller must absorb opening parenthesis.
2141 *
2142 * Combining parenthesis handling with the base level of regular expression
2143 * is a trifle forced, but the need to tie the tails of the branches to what
2144 * follows makes it hard to avoid.
2145 */
2146 STATIC regnode *
S_reg(pTHX_ RExC_state_t * pRExC_state,I32 paren,I32 * flagp)2147 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
2148 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
2149 {
2150 register regnode *ret; /* Will be the head of the group. */
2151 register regnode *br;
2152 register regnode *lastbr;
2153 register regnode *ender = 0;
2154 register I32 parno = 0;
2155 I32 flags, oregflags = RExC_flags, have_branch = 0, open = 0;
2156
2157 /* for (?g), (?gc), and (?o) warnings; warning
2158 about (?c) will warn about (?g) -- japhy */
2159
2160 I32 wastedflags = 0x00,
2161 wasted_o = 0x01,
2162 wasted_g = 0x02,
2163 wasted_gc = 0x02 | 0x04,
2164 wasted_c = 0x04;
2165
2166 char * parse_start = RExC_parse; /* MJD */
2167 char * const oregcomp_parse = RExC_parse;
2168 char c;
2169
2170 *flagp = 0; /* Tentatively. */
2171
2172
2173 /* Make an OPEN node, if parenthesized. */
2174 if (paren) {
2175 if (*RExC_parse == '?') { /* (?...) */
2176 U32 posflags = 0, negflags = 0;
2177 U32 *flagsp = &posflags;
2178 int logical = 0;
2179 const char * const seqstart = RExC_parse;
2180
2181 RExC_parse++;
2182 paren = *RExC_parse++;
2183 ret = NULL; /* For look-ahead/behind. */
2184 switch (paren) {
2185 case '<': /* (?<...) */
2186 RExC_seen |= REG_SEEN_LOOKBEHIND;
2187 if (*RExC_parse == '!')
2188 paren = ',';
2189 if (*RExC_parse != '=' && *RExC_parse != '!')
2190 goto unknown;
2191 RExC_parse++;
2192 case '=': /* (?=...) */
2193 case '!': /* (?!...) */
2194 RExC_seen_zerolen++;
2195 case ':': /* (?:...) */
2196 case '>': /* (?>...) */
2197 break;
2198 case '$': /* (?$...) */
2199 case '@': /* (?@...) */
2200 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
2201 break;
2202 case '#': /* (?#...) */
2203 while (*RExC_parse && *RExC_parse != ')')
2204 RExC_parse++;
2205 if (*RExC_parse != ')')
2206 FAIL("Sequence (?#... not terminated");
2207 nextchar(pRExC_state);
2208 *flagp = TRYAGAIN;
2209 return NULL;
2210 case 'p': /* (?p...) */
2211 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
2212 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
2213 /* FALL THROUGH*/
2214 case '?': /* (??...) */
2215 logical = 1;
2216 if (*RExC_parse != '{')
2217 goto unknown;
2218 paren = *RExC_parse++;
2219 /* FALL THROUGH */
2220 case '{': /* (?{...}) */
2221 {
2222 I32 count = 1, n = 0;
2223 char c;
2224 char *s = RExC_parse;
2225 SV *sv;
2226 OP_4tree *sop, *rop;
2227
2228 RExC_seen_zerolen++;
2229 RExC_seen |= REG_SEEN_EVAL;
2230 while (count && (c = *RExC_parse)) {
2231 if (c == '\\' && RExC_parse[1])
2232 RExC_parse++;
2233 else if (c == '{')
2234 count++;
2235 else if (c == '}')
2236 count--;
2237 RExC_parse++;
2238 }
2239 if (*RExC_parse != ')')
2240 {
2241 RExC_parse = s;
2242 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2243 }
2244 if (!SIZE_ONLY) {
2245 PAD *pad;
2246
2247 if (RExC_parse - 1 - s)
2248 sv = newSVpvn(s, RExC_parse - 1 - s);
2249 else
2250 sv = newSVpvn("", 0);
2251
2252 ENTER;
2253 Perl_save_re_context(aTHX);
2254 rop = sv_compile_2op(sv, &sop, "re", &pad);
2255 sop->op_private |= OPpREFCOUNTED;
2256 /* re_dup will OpREFCNT_inc */
2257 OpREFCNT_set(sop, 1);
2258 LEAVE;
2259
2260 n = add_data(pRExC_state, 3, "nop");
2261 RExC_rx->data->data[n] = (void*)rop;
2262 RExC_rx->data->data[n+1] = (void*)sop;
2263 RExC_rx->data->data[n+2] = (void*)pad;
2264 SvREFCNT_dec(sv);
2265 }
2266 else { /* First pass */
2267 if (PL_reginterp_cnt < ++RExC_seen_evals
2268 && IN_PERL_RUNTIME)
2269 /* No compiled RE interpolated, has runtime
2270 components ===> unsafe. */
2271 FAIL("Eval-group not allowed at runtime, use re 'eval'");
2272 if (PL_tainting && PL_tainted)
2273 FAIL("Eval-group in insecure regular expression");
2274 }
2275
2276 nextchar(pRExC_state);
2277 if (logical) {
2278 ret = reg_node(pRExC_state, LOGICAL);
2279 if (!SIZE_ONLY)
2280 ret->flags = 2;
2281 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
2282 /* deal with the length of this later - MJD */
2283 return ret;
2284 }
2285 ret = reganode(pRExC_state, EVAL, n);
2286 Set_Node_Length(ret, RExC_parse - parse_start + 1);
2287 Set_Node_Offset(ret, parse_start);
2288 return ret;
2289 }
2290 case '(': /* (?(?{...})...) and (?(?=...)...) */
2291 {
2292 if (RExC_parse[0] == '?') { /* (?(?...)) */
2293 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2294 || RExC_parse[1] == '<'
2295 || RExC_parse[1] == '{') { /* Lookahead or eval. */
2296 I32 flag;
2297
2298 ret = reg_node(pRExC_state, LOGICAL);
2299 if (!SIZE_ONLY)
2300 ret->flags = 1;
2301 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
2302 goto insert_if;
2303 }
2304 }
2305 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
2306 /* (?(1)...) */
2307 parno = atoi(RExC_parse++);
2308
2309 while (isDIGIT(*RExC_parse))
2310 RExC_parse++;
2311 ret = reganode(pRExC_state, GROUPP, parno);
2312
2313 if ((c = *nextchar(pRExC_state)) != ')')
2314 vFAIL("Switch condition not recognized");
2315 insert_if:
2316 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2317 br = regbranch(pRExC_state, &flags, 1);
2318 if (br == NULL)
2319 br = reganode(pRExC_state, LONGJMP, 0);
2320 else
2321 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2322 c = *nextchar(pRExC_state);
2323 if (flags&HASWIDTH)
2324 *flagp |= HASWIDTH;
2325 if (c == '|') {
2326 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2327 regbranch(pRExC_state, &flags, 1);
2328 regtail(pRExC_state, ret, lastbr);
2329 if (flags&HASWIDTH)
2330 *flagp |= HASWIDTH;
2331 c = *nextchar(pRExC_state);
2332 }
2333 else
2334 lastbr = NULL;
2335 if (c != ')')
2336 vFAIL("Switch (?(condition)... contains too many branches");
2337 ender = reg_node(pRExC_state, TAIL);
2338 regtail(pRExC_state, br, ender);
2339 if (lastbr) {
2340 regtail(pRExC_state, lastbr, ender);
2341 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
2342 }
2343 else
2344 regtail(pRExC_state, ret, ender);
2345 return ret;
2346 }
2347 else {
2348 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
2349 }
2350 }
2351 case 0:
2352 RExC_parse--; /* for vFAIL to print correctly */
2353 vFAIL("Sequence (? incomplete");
2354 break;
2355 default:
2356 --RExC_parse;
2357 parse_flags: /* (?i) */
2358 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
2359 /* (?g), (?gc) and (?o) are useless here
2360 and must be globally applied -- japhy */
2361
2362 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2363 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2364 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2365 if (! (wastedflags & wflagbit) ) {
2366 wastedflags |= wflagbit;
2367 vWARN5(
2368 RExC_parse + 1,
2369 "Useless (%s%c) - %suse /%c modifier",
2370 flagsp == &negflags ? "?-" : "?",
2371 *RExC_parse,
2372 flagsp == &negflags ? "don't " : "",
2373 *RExC_parse
2374 );
2375 }
2376 }
2377 }
2378 else if (*RExC_parse == 'c') {
2379 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2380 if (! (wastedflags & wasted_c) ) {
2381 wastedflags |= wasted_gc;
2382 vWARN3(
2383 RExC_parse + 1,
2384 "Useless (%sc) - %suse /gc modifier",
2385 flagsp == &negflags ? "?-" : "?",
2386 flagsp == &negflags ? "don't " : ""
2387 );
2388 }
2389 }
2390 }
2391 else { pmflag(flagsp, *RExC_parse); }
2392
2393 ++RExC_parse;
2394 }
2395 if (*RExC_parse == '-') {
2396 flagsp = &negflags;
2397 wastedflags = 0; /* reset so (?g-c) warns twice */
2398 ++RExC_parse;
2399 goto parse_flags;
2400 }
2401 RExC_flags |= posflags;
2402 RExC_flags &= ~negflags;
2403 if (*RExC_parse == ':') {
2404 RExC_parse++;
2405 paren = ':';
2406 break;
2407 }
2408 unknown:
2409 if (*RExC_parse != ')') {
2410 RExC_parse++;
2411 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
2412 }
2413 nextchar(pRExC_state);
2414 *flagp = TRYAGAIN;
2415 return NULL;
2416 }
2417 }
2418 else { /* (...) */
2419 parno = RExC_npar;
2420 RExC_npar++;
2421 ret = reganode(pRExC_state, OPEN, parno);
2422 Set_Node_Length(ret, 1); /* MJD */
2423 Set_Node_Offset(ret, RExC_parse); /* MJD */
2424 open = 1;
2425 }
2426 }
2427 else /* ! paren */
2428 ret = NULL;
2429
2430 /* Pick up the branches, linking them together. */
2431 parse_start = RExC_parse; /* MJD */
2432 br = regbranch(pRExC_state, &flags, 1);
2433 /* branch_len = (paren != 0); */
2434
2435 if (br == NULL)
2436 return(NULL);
2437 if (*RExC_parse == '|') {
2438 if (!SIZE_ONLY && RExC_extralen) {
2439 reginsert(pRExC_state, BRANCHJ, br);
2440 }
2441 else { /* MJD */
2442 reginsert(pRExC_state, BRANCH, br);
2443 Set_Node_Length(br, paren != 0);
2444 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2445 }
2446 have_branch = 1;
2447 if (SIZE_ONLY)
2448 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
2449 }
2450 else if (paren == ':') {
2451 *flagp |= flags&SIMPLE;
2452 }
2453 if (open) { /* Starts with OPEN. */
2454 regtail(pRExC_state, ret, br); /* OPEN -> first. */
2455 }
2456 else if (paren != '?') /* Not Conditional */
2457 ret = br;
2458 *flagp |= flags & (SPSTART | HASWIDTH);
2459 lastbr = br;
2460 while (*RExC_parse == '|') {
2461 if (!SIZE_ONLY && RExC_extralen) {
2462 ender = reganode(pRExC_state, LONGJMP,0);
2463 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
2464 }
2465 if (SIZE_ONLY)
2466 RExC_extralen += 2; /* Account for LONGJMP. */
2467 nextchar(pRExC_state);
2468 br = regbranch(pRExC_state, &flags, 0);
2469
2470 if (br == NULL)
2471 return(NULL);
2472 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
2473 lastbr = br;
2474 if (flags&HASWIDTH)
2475 *flagp |= HASWIDTH;
2476 *flagp |= flags&SPSTART;
2477 }
2478
2479 if (have_branch || paren != ':') {
2480 /* Make a closing node, and hook it on the end. */
2481 switch (paren) {
2482 case ':':
2483 ender = reg_node(pRExC_state, TAIL);
2484 break;
2485 case 1:
2486 ender = reganode(pRExC_state, CLOSE, parno);
2487 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2488 Set_Node_Length(ender,1); /* MJD */
2489 break;
2490 case '<':
2491 case ',':
2492 case '=':
2493 case '!':
2494 *flagp &= ~HASWIDTH;
2495 /* FALL THROUGH */
2496 case '>':
2497 ender = reg_node(pRExC_state, SUCCEED);
2498 break;
2499 case 0:
2500 ender = reg_node(pRExC_state, END);
2501 break;
2502 }
2503 regtail(pRExC_state, lastbr, ender);
2504
2505 if (have_branch) {
2506 /* Hook the tails of the branches to the closing node. */
2507 for (br = ret; br != NULL; br = regnext(br)) {
2508 regoptail(pRExC_state, br, ender);
2509 }
2510 }
2511 }
2512
2513 {
2514 const char *p;
2515 static const char parens[] = "=!<,>";
2516
2517 if (paren && (p = strchr(parens, paren))) {
2518 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2519 int flag = (p - parens) > 1;
2520
2521 if (paren == '>')
2522 node = SUSPEND, flag = 0;
2523 reginsert(pRExC_state, node,ret);
2524 Set_Node_Cur_Length(ret);
2525 Set_Node_Offset(ret, parse_start + 1);
2526 ret->flags = flag;
2527 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
2528 }
2529 }
2530
2531 /* Check for proper termination. */
2532 if (paren) {
2533 RExC_flags = oregflags;
2534 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2535 RExC_parse = oregcomp_parse;
2536 vFAIL("Unmatched (");
2537 }
2538 }
2539 else if (!paren && RExC_parse < RExC_end) {
2540 if (*RExC_parse == ')') {
2541 RExC_parse++;
2542 vFAIL("Unmatched )");
2543 }
2544 else
2545 FAIL("Junk on end of regexp"); /* "Can't happen". */
2546 /* NOTREACHED */
2547 }
2548
2549 return(ret);
2550 }
2551
2552 /*
2553 - regbranch - one alternative of an | operator
2554 *
2555 * Implements the concatenation operator.
2556 */
2557 STATIC regnode *
S_regbranch(pTHX_ RExC_state_t * pRExC_state,I32 * flagp,I32 first)2558 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
2559 {
2560 register regnode *ret;
2561 register regnode *chain = NULL;
2562 register regnode *latest;
2563 I32 flags = 0, c = 0;
2564
2565 if (first)
2566 ret = NULL;
2567 else {
2568 if (!SIZE_ONLY && RExC_extralen)
2569 ret = reganode(pRExC_state, BRANCHJ,0);
2570 else {
2571 ret = reg_node(pRExC_state, BRANCH);
2572 Set_Node_Length(ret, 1);
2573 }
2574 }
2575
2576 if (!first && SIZE_ONLY)
2577 RExC_extralen += 1; /* BRANCHJ */
2578
2579 *flagp = WORST; /* Tentatively. */
2580
2581 RExC_parse--;
2582 nextchar(pRExC_state);
2583 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
2584 flags &= ~TRYAGAIN;
2585 latest = regpiece(pRExC_state, &flags);
2586 if (latest == NULL) {
2587 if (flags & TRYAGAIN)
2588 continue;
2589 return(NULL);
2590 }
2591 else if (ret == NULL)
2592 ret = latest;
2593 *flagp |= flags&HASWIDTH;
2594 if (chain == NULL) /* First piece. */
2595 *flagp |= flags&SPSTART;
2596 else {
2597 RExC_naughty++;
2598 regtail(pRExC_state, chain, latest);
2599 }
2600 chain = latest;
2601 c++;
2602 }
2603 if (chain == NULL) { /* Loop ran zero times. */
2604 chain = reg_node(pRExC_state, NOTHING);
2605 if (ret == NULL)
2606 ret = chain;
2607 }
2608 if (c == 1) {
2609 *flagp |= flags&SIMPLE;
2610 }
2611
2612 return(ret);
2613 }
2614
2615 /*
2616 - regpiece - something followed by possible [*+?]
2617 *
2618 * Note that the branching code sequences used for ? and the general cases
2619 * of * and + are somewhat optimized: they use the same NOTHING node as
2620 * both the endmarker for their branch list and the body of the last branch.
2621 * It might seem that this node could be dispensed with entirely, but the
2622 * endmarker role is not redundant.
2623 */
2624 STATIC regnode *
S_regpiece(pTHX_ RExC_state_t * pRExC_state,I32 * flagp)2625 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2626 {
2627 register regnode *ret;
2628 register char op;
2629 register char *next;
2630 I32 flags;
2631 const char * const origparse = RExC_parse;
2632 char *maxpos;
2633 I32 min;
2634 I32 max = REG_INFTY;
2635 char *parse_start;
2636
2637 ret = regatom(pRExC_state, &flags);
2638 if (ret == NULL) {
2639 if (flags & TRYAGAIN)
2640 *flagp |= TRYAGAIN;
2641 return(NULL);
2642 }
2643
2644 op = *RExC_parse;
2645
2646 if (op == '{' && regcurly(RExC_parse)) {
2647 parse_start = RExC_parse; /* MJD */
2648 next = RExC_parse + 1;
2649 maxpos = Nullch;
2650 while (isDIGIT(*next) || *next == ',') {
2651 if (*next == ',') {
2652 if (maxpos)
2653 break;
2654 else
2655 maxpos = next;
2656 }
2657 next++;
2658 }
2659 if (*next == '}') { /* got one */
2660 if (!maxpos)
2661 maxpos = next;
2662 RExC_parse++;
2663 min = atoi(RExC_parse);
2664 if (*maxpos == ',')
2665 maxpos++;
2666 else
2667 maxpos = RExC_parse;
2668 max = atoi(maxpos);
2669 if (!max && *maxpos != '0')
2670 max = REG_INFTY; /* meaning "infinity" */
2671 else if (max >= REG_INFTY)
2672 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
2673 RExC_parse = next;
2674 nextchar(pRExC_state);
2675
2676 do_curly:
2677 if ((flags&SIMPLE)) {
2678 RExC_naughty += 2 + RExC_naughty / 2;
2679 reginsert(pRExC_state, CURLY, ret);
2680 Set_Node_Offset(ret, parse_start+1); /* MJD */
2681 Set_Node_Cur_Length(ret);
2682 }
2683 else {
2684 regnode *w = reg_node(pRExC_state, WHILEM);
2685
2686 w->flags = 0;
2687 regtail(pRExC_state, ret, w);
2688 if (!SIZE_ONLY && RExC_extralen) {
2689 reginsert(pRExC_state, LONGJMP,ret);
2690 reginsert(pRExC_state, NOTHING,ret);
2691 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
2692 }
2693 reginsert(pRExC_state, CURLYX,ret);
2694 /* MJD hk */
2695 Set_Node_Offset(ret, parse_start+1);
2696 Set_Node_Length(ret,
2697 op == '{' ? (RExC_parse - parse_start) : 1);
2698
2699 if (!SIZE_ONLY && RExC_extralen)
2700 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
2701 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
2702 if (SIZE_ONLY)
2703 RExC_whilem_seen++, RExC_extralen += 3;
2704 RExC_naughty += 4 + RExC_naughty; /* compound interest */
2705 }
2706 ret->flags = 0;
2707
2708 if (min > 0)
2709 *flagp = WORST;
2710 if (max > 0)
2711 *flagp |= HASWIDTH;
2712 if (max && max < min)
2713 vFAIL("Can't do {n,m} with n > m");
2714 if (!SIZE_ONLY) {
2715 ARG1_SET(ret, (U16)min);
2716 ARG2_SET(ret, (U16)max);
2717 }
2718
2719 goto nest_check;
2720 }
2721 }
2722
2723 if (!ISMULT1(op)) {
2724 *flagp = flags;
2725 return(ret);
2726 }
2727
2728 #if 0 /* Now runtime fix should be reliable. */
2729
2730 /* if this is reinstated, don't forget to put this back into perldiag:
2731
2732 =item Regexp *+ operand could be empty at {#} in regex m/%s/
2733
2734 (F) The part of the regexp subject to either the * or + quantifier
2735 could match an empty string. The {#} shows in the regular
2736 expression about where the problem was discovered.
2737
2738 */
2739
2740 if (!(flags&HASWIDTH) && op != '?')
2741 vFAIL("Regexp *+ operand could be empty");
2742 #endif
2743
2744 parse_start = RExC_parse;
2745 nextchar(pRExC_state);
2746
2747 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
2748
2749 if (op == '*' && (flags&SIMPLE)) {
2750 reginsert(pRExC_state, STAR, ret);
2751 ret->flags = 0;
2752 RExC_naughty += 4;
2753 }
2754 else if (op == '*') {
2755 min = 0;
2756 goto do_curly;
2757 }
2758 else if (op == '+' && (flags&SIMPLE)) {
2759 reginsert(pRExC_state, PLUS, ret);
2760 ret->flags = 0;
2761 RExC_naughty += 3;
2762 }
2763 else if (op == '+') {
2764 min = 1;
2765 goto do_curly;
2766 }
2767 else if (op == '?') {
2768 min = 0; max = 1;
2769 goto do_curly;
2770 }
2771 nest_check:
2772 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
2773 vWARN3(RExC_parse,
2774 "%.*s matches null string many times",
2775 RExC_parse - origparse,
2776 origparse);
2777 }
2778
2779 if (*RExC_parse == '?') {
2780 nextchar(pRExC_state);
2781 reginsert(pRExC_state, MINMOD, ret);
2782 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
2783 }
2784 if (ISMULT2(RExC_parse)) {
2785 RExC_parse++;
2786 vFAIL("Nested quantifiers");
2787 }
2788
2789 return(ret);
2790 }
2791
2792 /*
2793 - regatom - the lowest level
2794 *
2795 * Optimization: gobbles an entire sequence of ordinary characters so that
2796 * it can turn them into a single node, which is smaller to store and
2797 * faster to run. Backslashed characters are exceptions, each becoming a
2798 * separate node; the code is simpler that way and it's not worth fixing.
2799 *
2800 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
2801 STATIC regnode *
S_regatom(pTHX_ RExC_state_t * pRExC_state,I32 * flagp)2802 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2803 {
2804 register regnode *ret = 0;
2805 I32 flags;
2806 char *parse_start = RExC_parse;
2807
2808 *flagp = WORST; /* Tentatively. */
2809
2810 tryagain:
2811 switch (*RExC_parse) {
2812 case '^':
2813 RExC_seen_zerolen++;
2814 nextchar(pRExC_state);
2815 if (RExC_flags & PMf_MULTILINE)
2816 ret = reg_node(pRExC_state, MBOL);
2817 else if (RExC_flags & PMf_SINGLELINE)
2818 ret = reg_node(pRExC_state, SBOL);
2819 else
2820 ret = reg_node(pRExC_state, BOL);
2821 Set_Node_Length(ret, 1); /* MJD */
2822 break;
2823 case '$':
2824 nextchar(pRExC_state);
2825 if (*RExC_parse)
2826 RExC_seen_zerolen++;
2827 if (RExC_flags & PMf_MULTILINE)
2828 ret = reg_node(pRExC_state, MEOL);
2829 else if (RExC_flags & PMf_SINGLELINE)
2830 ret = reg_node(pRExC_state, SEOL);
2831 else
2832 ret = reg_node(pRExC_state, EOL);
2833 Set_Node_Length(ret, 1); /* MJD */
2834 break;
2835 case '.':
2836 nextchar(pRExC_state);
2837 if (RExC_flags & PMf_SINGLELINE)
2838 ret = reg_node(pRExC_state, SANY);
2839 else
2840 ret = reg_node(pRExC_state, REG_ANY);
2841 *flagp |= HASWIDTH|SIMPLE;
2842 RExC_naughty++;
2843 Set_Node_Length(ret, 1); /* MJD */
2844 break;
2845 case '[':
2846 {
2847 char *oregcomp_parse = ++RExC_parse;
2848 ret = regclass(pRExC_state);
2849 if (*RExC_parse != ']') {
2850 RExC_parse = oregcomp_parse;
2851 vFAIL("Unmatched [");
2852 }
2853 nextchar(pRExC_state);
2854 *flagp |= HASWIDTH|SIMPLE;
2855 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
2856 break;
2857 }
2858 case '(':
2859 nextchar(pRExC_state);
2860 ret = reg(pRExC_state, 1, &flags);
2861 if (ret == NULL) {
2862 if (flags & TRYAGAIN) {
2863 if (RExC_parse == RExC_end) {
2864 /* Make parent create an empty node if needed. */
2865 *flagp |= TRYAGAIN;
2866 return(NULL);
2867 }
2868 goto tryagain;
2869 }
2870 return(NULL);
2871 }
2872 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
2873 break;
2874 case '|':
2875 case ')':
2876 if (flags & TRYAGAIN) {
2877 *flagp |= TRYAGAIN;
2878 return NULL;
2879 }
2880 vFAIL("Internal urp");
2881 /* Supposed to be caught earlier. */
2882 break;
2883 case '{':
2884 if (!regcurly(RExC_parse)) {
2885 RExC_parse++;
2886 goto defchar;
2887 }
2888 /* FALL THROUGH */
2889 case '?':
2890 case '+':
2891 case '*':
2892 RExC_parse++;
2893 vFAIL("Quantifier follows nothing");
2894 break;
2895 case '\\':
2896 switch (*++RExC_parse) {
2897 case 'A':
2898 RExC_seen_zerolen++;
2899 ret = reg_node(pRExC_state, SBOL);
2900 *flagp |= SIMPLE;
2901 nextchar(pRExC_state);
2902 Set_Node_Length(ret, 2); /* MJD */
2903 break;
2904 case 'G':
2905 ret = reg_node(pRExC_state, GPOS);
2906 RExC_seen |= REG_SEEN_GPOS;
2907 *flagp |= SIMPLE;
2908 nextchar(pRExC_state);
2909 Set_Node_Length(ret, 2); /* MJD */
2910 break;
2911 case 'Z':
2912 ret = reg_node(pRExC_state, SEOL);
2913 *flagp |= SIMPLE;
2914 RExC_seen_zerolen++; /* Do not optimize RE away */
2915 nextchar(pRExC_state);
2916 break;
2917 case 'z':
2918 ret = reg_node(pRExC_state, EOS);
2919 *flagp |= SIMPLE;
2920 RExC_seen_zerolen++; /* Do not optimize RE away */
2921 nextchar(pRExC_state);
2922 Set_Node_Length(ret, 2); /* MJD */
2923 break;
2924 case 'C':
2925 ret = reg_node(pRExC_state, CANY);
2926 RExC_seen |= REG_SEEN_CANY;
2927 *flagp |= HASWIDTH|SIMPLE;
2928 nextchar(pRExC_state);
2929 Set_Node_Length(ret, 2); /* MJD */
2930 break;
2931 case 'X':
2932 ret = reg_node(pRExC_state, CLUMP);
2933 *flagp |= HASWIDTH;
2934 nextchar(pRExC_state);
2935 Set_Node_Length(ret, 2); /* MJD */
2936 break;
2937 case 'w':
2938 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
2939 *flagp |= HASWIDTH|SIMPLE;
2940 nextchar(pRExC_state);
2941 Set_Node_Length(ret, 2); /* MJD */
2942 break;
2943 case 'W':
2944 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
2945 *flagp |= HASWIDTH|SIMPLE;
2946 nextchar(pRExC_state);
2947 Set_Node_Length(ret, 2); /* MJD */
2948 break;
2949 case 'b':
2950 RExC_seen_zerolen++;
2951 RExC_seen |= REG_SEEN_LOOKBEHIND;
2952 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
2953 *flagp |= SIMPLE;
2954 nextchar(pRExC_state);
2955 Set_Node_Length(ret, 2); /* MJD */
2956 break;
2957 case 'B':
2958 RExC_seen_zerolen++;
2959 RExC_seen |= REG_SEEN_LOOKBEHIND;
2960 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
2961 *flagp |= SIMPLE;
2962 nextchar(pRExC_state);
2963 Set_Node_Length(ret, 2); /* MJD */
2964 break;
2965 case 's':
2966 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
2967 *flagp |= HASWIDTH|SIMPLE;
2968 nextchar(pRExC_state);
2969 Set_Node_Length(ret, 2); /* MJD */
2970 break;
2971 case 'S':
2972 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
2973 *flagp |= HASWIDTH|SIMPLE;
2974 nextchar(pRExC_state);
2975 Set_Node_Length(ret, 2); /* MJD */
2976 break;
2977 case 'd':
2978 ret = reg_node(pRExC_state, DIGIT);
2979 *flagp |= HASWIDTH|SIMPLE;
2980 nextchar(pRExC_state);
2981 Set_Node_Length(ret, 2); /* MJD */
2982 break;
2983 case 'D':
2984 ret = reg_node(pRExC_state, NDIGIT);
2985 *flagp |= HASWIDTH|SIMPLE;
2986 nextchar(pRExC_state);
2987 Set_Node_Length(ret, 2); /* MJD */
2988 break;
2989 case 'p':
2990 case 'P':
2991 {
2992 char* oldregxend = RExC_end;
2993 char* parse_start = RExC_parse - 2;
2994
2995 if (RExC_parse[1] == '{') {
2996 /* a lovely hack--pretend we saw [\pX] instead */
2997 RExC_end = strchr(RExC_parse, '}');
2998 if (!RExC_end) {
2999 U8 c = (U8)*RExC_parse;
3000 RExC_parse += 2;
3001 RExC_end = oldregxend;
3002 vFAIL2("Missing right brace on \\%c{}", c);
3003 }
3004 RExC_end++;
3005 }
3006 else {
3007 RExC_end = RExC_parse + 2;
3008 if (RExC_end > oldregxend)
3009 RExC_end = oldregxend;
3010 }
3011 RExC_parse--;
3012
3013 ret = regclass(pRExC_state);
3014
3015 RExC_end = oldregxend;
3016 RExC_parse--;
3017
3018 Set_Node_Offset(ret, parse_start + 2);
3019 Set_Node_Cur_Length(ret);
3020 nextchar(pRExC_state);
3021 *flagp |= HASWIDTH|SIMPLE;
3022 }
3023 break;
3024 case 'n':
3025 case 'r':
3026 case 't':
3027 case 'f':
3028 case 'e':
3029 case 'a':
3030 case 'x':
3031 case 'c':
3032 case '0':
3033 goto defchar;
3034 case '1': case '2': case '3': case '4':
3035 case '5': case '6': case '7': case '8': case '9':
3036 {
3037 const I32 num = atoi(RExC_parse);
3038
3039 if (num > 9 && num >= RExC_npar)
3040 goto defchar;
3041 else {
3042 char * parse_start = RExC_parse - 1; /* MJD */
3043 while (isDIGIT(*RExC_parse))
3044 RExC_parse++;
3045
3046 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
3047 vFAIL("Reference to nonexistent group");
3048 RExC_sawback = 1;
3049 ret = reganode(pRExC_state,
3050 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
3051 num);
3052 *flagp |= HASWIDTH;
3053
3054 /* override incorrect value set in reganode MJD */
3055 Set_Node_Offset(ret, parse_start+1);
3056 Set_Node_Cur_Length(ret); /* MJD */
3057 RExC_parse--;
3058 nextchar(pRExC_state);
3059 }
3060 }
3061 break;
3062 case '\0':
3063 if (RExC_parse >= RExC_end)
3064 FAIL("Trailing \\");
3065 /* FALL THROUGH */
3066 default:
3067 /* Do not generate "unrecognized" warnings here, we fall
3068 back into the quick-grab loop below */
3069 parse_start--;
3070 goto defchar;
3071 }
3072 break;
3073
3074 case '#':
3075 if (RExC_flags & PMf_EXTENDED) {
3076 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
3077 if (RExC_parse < RExC_end)
3078 goto tryagain;
3079 }
3080 /* FALL THROUGH */
3081
3082 default: {
3083 register STRLEN len;
3084 register UV ender;
3085 register char *p;
3086 char *oldp, *s;
3087 STRLEN foldlen;
3088 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
3089
3090 parse_start = RExC_parse - 1;
3091
3092 RExC_parse++;
3093
3094 defchar:
3095 ender = 0;
3096 ret = reg_node(pRExC_state,
3097 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
3098 s = STRING(ret);
3099 for (len = 0, p = RExC_parse - 1;
3100 len < 127 && p < RExC_end;
3101 len++)
3102 {
3103 oldp = p;
3104
3105 if (RExC_flags & PMf_EXTENDED)
3106 p = regwhite(p, RExC_end);
3107 switch (*p) {
3108 case '^':
3109 case '$':
3110 case '.':
3111 case '[':
3112 case '(':
3113 case ')':
3114 case '|':
3115 goto loopdone;
3116 case '\\':
3117 switch (*++p) {
3118 case 'A':
3119 case 'C':
3120 case 'X':
3121 case 'G':
3122 case 'Z':
3123 case 'z':
3124 case 'w':
3125 case 'W':
3126 case 'b':
3127 case 'B':
3128 case 's':
3129 case 'S':
3130 case 'd':
3131 case 'D':
3132 case 'p':
3133 case 'P':
3134 --p;
3135 goto loopdone;
3136 case 'n':
3137 ender = '\n';
3138 p++;
3139 break;
3140 case 'r':
3141 ender = '\r';
3142 p++;
3143 break;
3144 case 't':
3145 ender = '\t';
3146 p++;
3147 break;
3148 case 'f':
3149 ender = '\f';
3150 p++;
3151 break;
3152 case 'e':
3153 ender = ASCII_TO_NATIVE('\033');
3154 p++;
3155 break;
3156 case 'a':
3157 ender = ASCII_TO_NATIVE('\007');
3158 p++;
3159 break;
3160 case 'x':
3161 if (*++p == '{') {
3162 char* const e = strchr(p, '}');
3163
3164 if (!e) {
3165 RExC_parse = p + 1;
3166 vFAIL("Missing right brace on \\x{}");
3167 }
3168 else {
3169 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3170 | PERL_SCAN_DISALLOW_PREFIX;
3171 STRLEN numlen = e - p - 1;
3172 ender = grok_hex(p + 1, &numlen, &flags, NULL);
3173 if (ender > 0xff)
3174 RExC_utf8 = 1;
3175 p = e + 1;
3176 }
3177 }
3178 else {
3179 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3180 STRLEN numlen = 2;
3181 ender = grok_hex(p, &numlen, &flags, NULL);
3182 p += numlen;
3183 }
3184 break;
3185 case 'c':
3186 p++;
3187 ender = UCHARAT(p++);
3188 ender = toCTRL(ender);
3189 break;
3190 case '0': case '1': case '2': case '3':case '4':
3191 case '5': case '6': case '7': case '8':case '9':
3192 if (*p == '0' ||
3193 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
3194 I32 flags = 0;
3195 STRLEN numlen = 3;
3196 ender = grok_oct(p, &numlen, &flags, NULL);
3197 p += numlen;
3198 }
3199 else {
3200 --p;
3201 goto loopdone;
3202 }
3203 break;
3204 case '\0':
3205 if (p >= RExC_end)
3206 FAIL("Trailing \\");
3207 /* FALL THROUGH */
3208 default:
3209 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
3210 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
3211 goto normal_default;
3212 }
3213 break;
3214 default:
3215 normal_default:
3216 if (UTF8_IS_START(*p) && UTF) {
3217 STRLEN numlen;
3218 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
3219 &numlen, 0);
3220 p += numlen;
3221 }
3222 else
3223 ender = *p++;
3224 break;
3225 }
3226 if (RExC_flags & PMf_EXTENDED)
3227 p = regwhite(p, RExC_end);
3228 if (UTF && FOLD) {
3229 /* Prime the casefolded buffer. */
3230 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
3231 }
3232 if (ISMULT2(p)) { /* Back off on ?+*. */
3233 if (len)
3234 p = oldp;
3235 else if (UTF) {
3236 STRLEN unilen;
3237
3238 if (FOLD) {
3239 /* Emit all the Unicode characters. */
3240 STRLEN numlen;
3241 for (foldbuf = tmpbuf;
3242 foldlen;
3243 foldlen -= numlen) {
3244 ender = utf8_to_uvchr(foldbuf, &numlen);
3245 if (numlen > 0) {
3246 reguni(pRExC_state, ender, s, &unilen);
3247 s += unilen;
3248 len += unilen;
3249 /* In EBCDIC the numlen
3250 * and unilen can differ. */
3251 foldbuf += numlen;
3252 if (numlen >= foldlen)
3253 break;
3254 }
3255 else
3256 break; /* "Can't happen." */
3257 }
3258 }
3259 else {
3260 reguni(pRExC_state, ender, s, &unilen);
3261 if (unilen > 0) {
3262 s += unilen;
3263 len += unilen;
3264 }
3265 }
3266 }
3267 else {
3268 len++;
3269 REGC((char)ender, s++);
3270 }
3271 break;
3272 }
3273 if (UTF) {
3274 STRLEN unilen;
3275
3276 if (FOLD) {
3277 /* Emit all the Unicode characters. */
3278 STRLEN numlen;
3279 for (foldbuf = tmpbuf;
3280 foldlen;
3281 foldlen -= numlen) {
3282 ender = utf8_to_uvchr(foldbuf, &numlen);
3283 if (numlen > 0) {
3284 reguni(pRExC_state, ender, s, &unilen);
3285 len += unilen;
3286 s += unilen;
3287 /* In EBCDIC the numlen
3288 * and unilen can differ. */
3289 foldbuf += numlen;
3290 if (numlen >= foldlen)
3291 break;
3292 }
3293 else
3294 break;
3295 }
3296 }
3297 else {
3298 reguni(pRExC_state, ender, s, &unilen);
3299 if (unilen > 0) {
3300 s += unilen;
3301 len += unilen;
3302 }
3303 }
3304 len--;
3305 }
3306 else
3307 REGC((char)ender, s++);
3308 }
3309 loopdone:
3310 RExC_parse = p - 1;
3311 Set_Node_Cur_Length(ret); /* MJD */
3312 nextchar(pRExC_state);
3313 {
3314 /* len is STRLEN which is unsigned, need to copy to signed */
3315 IV iv = len;
3316 if (iv < 0)
3317 vFAIL("Internal disaster");
3318 }
3319 if (len > 0)
3320 *flagp |= HASWIDTH;
3321 if (len == 1 && UNI_IS_INVARIANT(ender))
3322 *flagp |= SIMPLE;
3323 if (!SIZE_ONLY)
3324 STR_LEN(ret) = len;
3325 if (SIZE_ONLY)
3326 RExC_size += STR_SZ(len);
3327 else
3328 RExC_emit += STR_SZ(len);
3329 }
3330 break;
3331 }
3332
3333 /* If the encoding pragma is in effect recode the text of
3334 * any EXACT-kind nodes. */
3335 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
3336 STRLEN oldlen = STR_LEN(ret);
3337 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
3338
3339 if (RExC_utf8)
3340 SvUTF8_on(sv);
3341 if (sv_utf8_downgrade(sv, TRUE)) {
3342 const char * const s = sv_recode_to_utf8(sv, PL_encoding);
3343 const STRLEN newlen = SvCUR(sv);
3344
3345 if (SvUTF8(sv))
3346 RExC_utf8 = 1;
3347 if (!SIZE_ONLY) {
3348 DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
3349 (int)oldlen, STRING(ret),
3350 (int)newlen, s));
3351 Copy(s, STRING(ret), newlen, char);
3352 STR_LEN(ret) += newlen - oldlen;
3353 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
3354 } else
3355 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
3356 }
3357 }
3358
3359 return(ret);
3360 }
3361
3362 STATIC char *
S_regwhite(pTHX_ char * p,const char * e)3363 S_regwhite(pTHX_ char *p, const char *e)
3364 {
3365 while (p < e) {
3366 if (isSPACE(*p))
3367 ++p;
3368 else if (*p == '#') {
3369 do {
3370 p++;
3371 } while (p < e && *p != '\n');
3372 }
3373 else
3374 break;
3375 }
3376 return p;
3377 }
3378
3379 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3380 Character classes ([:foo:]) can also be negated ([:^foo:]).
3381 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3382 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
3383 but trigger failures because they are currently unimplemented. */
3384
3385 #define POSIXCC_DONE(c) ((c) == ':')
3386 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
3387 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
3388
3389 STATIC I32
S_regpposixcc(pTHX_ RExC_state_t * pRExC_state,I32 value)3390 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
3391 {
3392 I32 namedclass = OOB_NAMEDCLASS;
3393
3394 if (value == '[' && RExC_parse + 1 < RExC_end &&
3395 /* I smell either [: or [= or [. -- POSIX has been here, right? */
3396 POSIXCC(UCHARAT(RExC_parse))) {
3397 const char c = UCHARAT(RExC_parse);
3398 char* s = RExC_parse++;
3399
3400 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
3401 RExC_parse++;
3402 if (RExC_parse == RExC_end)
3403 /* Grandfather lone [:, [=, [. */
3404 RExC_parse = s;
3405 else {
3406 const char* t = RExC_parse++; /* skip over the c */
3407 const char *posixcc;
3408
3409 assert(*t == c);
3410
3411 if (UCHARAT(RExC_parse) == ']') {
3412 RExC_parse++; /* skip over the ending ] */
3413 posixcc = s + 1;
3414 if (*s == ':') {
3415 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3416 const I32 skip = t - posixcc;
3417
3418 /* Initially switch on the length of the name. */
3419 switch (skip) {
3420 case 4:
3421 if (memEQ(posixcc, "word", 4)) {
3422 /* this is not POSIX, this is the Perl \w */;
3423 namedclass
3424 = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3425 }
3426 break;
3427 case 5:
3428 /* Names all of length 5. */
3429 /* alnum alpha ascii blank cntrl digit graph lower
3430 print punct space upper */
3431 /* Offset 4 gives the best switch position. */
3432 switch (posixcc[4]) {
3433 case 'a':
3434 if (memEQ(posixcc, "alph", 4)) {
3435 /* a */
3436 namedclass
3437 = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3438 }
3439 break;
3440 case 'e':
3441 if (memEQ(posixcc, "spac", 4)) {
3442 /* e */
3443 namedclass
3444 = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
3445 }
3446 break;
3447 case 'h':
3448 if (memEQ(posixcc, "grap", 4)) {
3449 /* h */
3450 namedclass
3451 = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3452 }
3453 break;
3454 case 'i':
3455 if (memEQ(posixcc, "asci", 4)) {
3456 /* i */
3457 namedclass
3458 = complement ? ANYOF_NASCII : ANYOF_ASCII;
3459 }
3460 break;
3461 case 'k':
3462 if (memEQ(posixcc, "blan", 4)) {
3463 /* k */
3464 namedclass
3465 = complement ? ANYOF_NBLANK : ANYOF_BLANK;
3466 }
3467 break;
3468 case 'l':
3469 if (memEQ(posixcc, "cntr", 4)) {
3470 /* l */
3471 namedclass
3472 = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3473 }
3474 break;
3475 case 'm':
3476 if (memEQ(posixcc, "alnu", 4)) {
3477 /* m */
3478 namedclass
3479 = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3480 }
3481 break;
3482 case 'r':
3483 if (memEQ(posixcc, "lowe", 4)) {
3484 /* r */
3485 namedclass
3486 = complement ? ANYOF_NLOWER : ANYOF_LOWER;
3487 }
3488 if (memEQ(posixcc, "uppe", 4)) {
3489 /* r */
3490 namedclass
3491 = complement ? ANYOF_NUPPER : ANYOF_UPPER;
3492 }
3493 break;
3494 case 't':
3495 if (memEQ(posixcc, "digi", 4)) {
3496 /* t */
3497 namedclass
3498 = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3499 }
3500 if (memEQ(posixcc, "prin", 4)) {
3501 /* t */
3502 namedclass
3503 = complement ? ANYOF_NPRINT : ANYOF_PRINT;
3504 }
3505 if (memEQ(posixcc, "punc", 4)) {
3506 /* t */
3507 namedclass
3508 = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3509 }
3510 break;
3511 }
3512 break;
3513 case 6:
3514 if (memEQ(posixcc, "xdigit", 6)) {
3515 namedclass
3516 = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3517 }
3518 break;
3519 }
3520
3521 if (namedclass == OOB_NAMEDCLASS)
3522 {
3523 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3524 t - s - 1, s + 1);
3525 }
3526 assert (posixcc[skip] == ':');
3527 assert (posixcc[skip+1] == ']');
3528 } else if (!SIZE_ONLY) {
3529 /* [[=foo=]] and [[.foo.]] are still future. */
3530
3531 /* adjust RExC_parse so the warning shows after
3532 the class closes */
3533 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
3534 RExC_parse++;
3535 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3536 }
3537 } else {
3538 /* Maternal grandfather:
3539 * "[:" ending in ":" but not in ":]" */
3540 RExC_parse = s;
3541 }
3542 }
3543 }
3544
3545 return namedclass;
3546 }
3547
3548 STATIC void
S_checkposixcc(pTHX_ RExC_state_t * pRExC_state)3549 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
3550 {
3551 if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
3552 const char *s = RExC_parse;
3553 const char c = *s++;
3554
3555 while(*s && isALNUM(*s))
3556 s++;
3557 if (*s && c == *s && s[1] == ']') {
3558 if (ckWARN(WARN_REGEXP))
3559 vWARN3(s+2,
3560 "POSIX syntax [%c %c] belongs inside character classes",
3561 c, c);
3562
3563 /* [[=foo=]] and [[.foo.]] are still future. */
3564 if (POSIXCC_NOTYET(c)) {
3565 /* adjust RExC_parse so the error shows after
3566 the class closes */
3567 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3568 ;
3569 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3570 }
3571 }
3572 }
3573 }
3574
3575 STATIC regnode *
S_regclass(pTHX_ RExC_state_t * pRExC_state)3576 S_regclass(pTHX_ RExC_state_t *pRExC_state)
3577 {
3578 register UV value;
3579 register UV nextvalue;
3580 register IV prevvalue = OOB_UNICODE;
3581 register IV range = 0;
3582 register regnode *ret;
3583 STRLEN numlen;
3584 IV namedclass;
3585 char *rangebegin = 0;
3586 bool need_class = 0;
3587 SV *listsv = Nullsv;
3588 register char *e;
3589 UV n;
3590 bool optimize_invert = TRUE;
3591 AV* unicode_alternate = 0;
3592 #ifdef EBCDIC
3593 UV literal_endpoint = 0;
3594 #endif
3595
3596 ret = reganode(pRExC_state, ANYOF, 0);
3597
3598 if (!SIZE_ONLY)
3599 ANYOF_FLAGS(ret) = 0;
3600
3601 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
3602 RExC_naughty++;
3603 RExC_parse++;
3604 if (!SIZE_ONLY)
3605 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3606 }
3607
3608 if (SIZE_ONLY)
3609 RExC_size += ANYOF_SKIP;
3610 else {
3611 RExC_emit += ANYOF_SKIP;
3612 if (FOLD)
3613 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3614 if (LOC)
3615 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
3616 ANYOF_BITMAP_ZERO(ret);
3617 listsv = newSVpvn("# comment\n", 10);
3618 }
3619
3620 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3621
3622 if (!SIZE_ONLY && POSIXCC(nextvalue))
3623 checkposixcc(pRExC_state);
3624
3625 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
3626 if (UCHARAT(RExC_parse) == ']')
3627 goto charclassloop;
3628
3629 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
3630
3631 charclassloop:
3632
3633 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3634
3635 if (!range)
3636 rangebegin = RExC_parse;
3637 if (UTF) {
3638 value = utf8n_to_uvchr((U8*)RExC_parse,
3639 RExC_end - RExC_parse,
3640 &numlen, 0);
3641 RExC_parse += numlen;
3642 }
3643 else
3644 value = UCHARAT(RExC_parse++);
3645 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3646 if (value == '[' && POSIXCC(nextvalue))
3647 namedclass = regpposixcc(pRExC_state, value);
3648 else if (value == '\\') {
3649 if (UTF) {
3650 value = utf8n_to_uvchr((U8*)RExC_parse,
3651 RExC_end - RExC_parse,
3652 &numlen, 0);
3653 RExC_parse += numlen;
3654 }
3655 else
3656 value = UCHARAT(RExC_parse++);
3657 /* Some compilers cannot handle switching on 64-bit integer
3658 * values, therefore value cannot be an UV. Yes, this will
3659 * be a problem later if we want switch on Unicode.
3660 * A similar issue a little bit later when switching on
3661 * namedclass. --jhi */
3662 switch ((I32)value) {
3663 case 'w': namedclass = ANYOF_ALNUM; break;
3664 case 'W': namedclass = ANYOF_NALNUM; break;
3665 case 's': namedclass = ANYOF_SPACE; break;
3666 case 'S': namedclass = ANYOF_NSPACE; break;
3667 case 'd': namedclass = ANYOF_DIGIT; break;
3668 case 'D': namedclass = ANYOF_NDIGIT; break;
3669 case 'p':
3670 case 'P':
3671 if (RExC_parse >= RExC_end)
3672 vFAIL2("Empty \\%c{}", (U8)value);
3673 if (*RExC_parse == '{') {
3674 const U8 c = (U8)value;
3675 e = strchr(RExC_parse++, '}');
3676 if (!e)
3677 vFAIL2("Missing right brace on \\%c{}", c);
3678 while (isSPACE(UCHARAT(RExC_parse)))
3679 RExC_parse++;
3680 if (e == RExC_parse)
3681 vFAIL2("Empty \\%c{}", c);
3682 n = e - RExC_parse;
3683 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
3684 n--;
3685 }
3686 else {
3687 e = RExC_parse;
3688 n = 1;
3689 }
3690 if (!SIZE_ONLY) {
3691 if (UCHARAT(RExC_parse) == '^') {
3692 RExC_parse++;
3693 n--;
3694 value = value == 'p' ? 'P' : 'p'; /* toggle */
3695 while (isSPACE(UCHARAT(RExC_parse))) {
3696 RExC_parse++;
3697 n--;
3698 }
3699 }
3700 if (value == 'p')
3701 Perl_sv_catpvf(aTHX_ listsv,
3702 "+utf8::%.*s\n", (int)n, RExC_parse);
3703 else
3704 Perl_sv_catpvf(aTHX_ listsv,
3705 "!utf8::%.*s\n", (int)n, RExC_parse);
3706 }
3707 RExC_parse = e + 1;
3708 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3709 namedclass = ANYOF_MAX; /* no official name, but it's named */
3710 break;
3711 case 'n': value = '\n'; break;
3712 case 'r': value = '\r'; break;
3713 case 't': value = '\t'; break;
3714 case 'f': value = '\f'; break;
3715 case 'b': value = '\b'; break;
3716 case 'e': value = ASCII_TO_NATIVE('\033');break;
3717 case 'a': value = ASCII_TO_NATIVE('\007');break;
3718 case 'x':
3719 if (*RExC_parse == '{') {
3720 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3721 | PERL_SCAN_DISALLOW_PREFIX;
3722 e = strchr(RExC_parse++, '}');
3723 if (!e)
3724 vFAIL("Missing right brace on \\x{}");
3725
3726 numlen = e - RExC_parse;
3727 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3728 RExC_parse = e + 1;
3729 }
3730 else {
3731 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3732 numlen = 2;
3733 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3734 RExC_parse += numlen;
3735 }
3736 break;
3737 case 'c':
3738 value = UCHARAT(RExC_parse++);
3739 value = toCTRL(value);
3740 break;
3741 case '0': case '1': case '2': case '3': case '4':
3742 case '5': case '6': case '7': case '8': case '9':
3743 {
3744 I32 flags = 0;
3745 numlen = 3;
3746 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
3747 RExC_parse += numlen;
3748 break;
3749 }
3750 default:
3751 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
3752 vWARN2(RExC_parse,
3753 "Unrecognized escape \\%c in character class passed through",
3754 (int)value);
3755 break;
3756 }
3757 } /* end of \blah */
3758 #ifdef EBCDIC
3759 else
3760 literal_endpoint++;
3761 #endif
3762
3763 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3764
3765 if (!SIZE_ONLY && !need_class)
3766 ANYOF_CLASS_ZERO(ret);
3767
3768 need_class = 1;
3769
3770 /* a bad range like a-\d, a-[:digit:] ? */
3771 if (range) {
3772 if (!SIZE_ONLY) {
3773 if (ckWARN(WARN_REGEXP))
3774 vWARN4(RExC_parse,
3775 "False [] range \"%*.*s\"",
3776 RExC_parse - rangebegin,
3777 RExC_parse - rangebegin,
3778 rangebegin);
3779 if (prevvalue < 256) {
3780 ANYOF_BITMAP_SET(ret, prevvalue);
3781 ANYOF_BITMAP_SET(ret, '-');
3782 }
3783 else {
3784 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3785 Perl_sv_catpvf(aTHX_ listsv,
3786 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
3787 }
3788 }
3789
3790 range = 0; /* this was not a true range */
3791 }
3792
3793 if (!SIZE_ONLY) {
3794 const char *what = NULL;
3795 char yesno = 0;
3796
3797 if (namedclass > OOB_NAMEDCLASS)
3798 optimize_invert = FALSE;
3799 /* Possible truncation here but in some 64-bit environments
3800 * the compiler gets heartburn about switch on 64-bit values.
3801 * A similar issue a little earlier when switching on value.
3802 * --jhi */
3803 switch ((I32)namedclass) {
3804 case ANYOF_ALNUM:
3805 if (LOC)
3806 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
3807 else {
3808 for (value = 0; value < 256; value++)
3809 if (isALNUM(value))
3810 ANYOF_BITMAP_SET(ret, value);
3811 }
3812 yesno = '+';
3813 what = "Word";
3814 break;
3815 case ANYOF_NALNUM:
3816 if (LOC)
3817 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
3818 else {
3819 for (value = 0; value < 256; value++)
3820 if (!isALNUM(value))
3821 ANYOF_BITMAP_SET(ret, value);
3822 }
3823 yesno = '!';
3824 what = "Word";
3825 break;
3826 case ANYOF_ALNUMC:
3827 if (LOC)
3828 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
3829 else {
3830 for (value = 0; value < 256; value++)
3831 if (isALNUMC(value))
3832 ANYOF_BITMAP_SET(ret, value);
3833 }
3834 yesno = '+';
3835 what = "Alnum";
3836 break;
3837 case ANYOF_NALNUMC:
3838 if (LOC)
3839 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
3840 else {
3841 for (value = 0; value < 256; value++)
3842 if (!isALNUMC(value))
3843 ANYOF_BITMAP_SET(ret, value);
3844 }
3845 yesno = '!';
3846 what = "Alnum";
3847 break;
3848 case ANYOF_ALPHA:
3849 if (LOC)
3850 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
3851 else {
3852 for (value = 0; value < 256; value++)
3853 if (isALPHA(value))
3854 ANYOF_BITMAP_SET(ret, value);
3855 }
3856 yesno = '+';
3857 what = "Alpha";
3858 break;
3859 case ANYOF_NALPHA:
3860 if (LOC)
3861 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
3862 else {
3863 for (value = 0; value < 256; value++)
3864 if (!isALPHA(value))
3865 ANYOF_BITMAP_SET(ret, value);
3866 }
3867 yesno = '!';
3868 what = "Alpha";
3869 break;
3870 case ANYOF_ASCII:
3871 if (LOC)
3872 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
3873 else {
3874 #ifndef EBCDIC
3875 for (value = 0; value < 128; value++)
3876 ANYOF_BITMAP_SET(ret, value);
3877 #else /* EBCDIC */
3878 for (value = 0; value < 256; value++) {
3879 if (isASCII(value))
3880 ANYOF_BITMAP_SET(ret, value);
3881 }
3882 #endif /* EBCDIC */
3883 }
3884 yesno = '+';
3885 what = "ASCII";
3886 break;
3887 case ANYOF_NASCII:
3888 if (LOC)
3889 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
3890 else {
3891 #ifndef EBCDIC
3892 for (value = 128; value < 256; value++)
3893 ANYOF_BITMAP_SET(ret, value);
3894 #else /* EBCDIC */
3895 for (value = 0; value < 256; value++) {
3896 if (!isASCII(value))
3897 ANYOF_BITMAP_SET(ret, value);
3898 }
3899 #endif /* EBCDIC */
3900 }
3901 yesno = '!';
3902 what = "ASCII";
3903 break;
3904 case ANYOF_BLANK:
3905 if (LOC)
3906 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3907 else {
3908 for (value = 0; value < 256; value++)
3909 if (isBLANK(value))
3910 ANYOF_BITMAP_SET(ret, value);
3911 }
3912 yesno = '+';
3913 what = "Blank";
3914 break;
3915 case ANYOF_NBLANK:
3916 if (LOC)
3917 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3918 else {
3919 for (value = 0; value < 256; value++)
3920 if (!isBLANK(value))
3921 ANYOF_BITMAP_SET(ret, value);
3922 }
3923 yesno = '!';
3924 what = "Blank";
3925 break;
3926 case ANYOF_CNTRL:
3927 if (LOC)
3928 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
3929 else {
3930 for (value = 0; value < 256; value++)
3931 if (isCNTRL(value))
3932 ANYOF_BITMAP_SET(ret, value);
3933 }
3934 yesno = '+';
3935 what = "Cntrl";
3936 break;
3937 case ANYOF_NCNTRL:
3938 if (LOC)
3939 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
3940 else {
3941 for (value = 0; value < 256; value++)
3942 if (!isCNTRL(value))
3943 ANYOF_BITMAP_SET(ret, value);
3944 }
3945 yesno = '!';
3946 what = "Cntrl";
3947 break;
3948 case ANYOF_DIGIT:
3949 if (LOC)
3950 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3951 else {
3952 /* consecutive digits assumed */
3953 for (value = '0'; value <= '9'; value++)
3954 ANYOF_BITMAP_SET(ret, value);
3955 }
3956 yesno = '+';
3957 what = "Digit";
3958 break;
3959 case ANYOF_NDIGIT:
3960 if (LOC)
3961 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3962 else {
3963 /* consecutive digits assumed */
3964 for (value = 0; value < '0'; value++)
3965 ANYOF_BITMAP_SET(ret, value);
3966 for (value = '9' + 1; value < 256; value++)
3967 ANYOF_BITMAP_SET(ret, value);
3968 }
3969 yesno = '!';
3970 what = "Digit";
3971 break;
3972 case ANYOF_GRAPH:
3973 if (LOC)
3974 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
3975 else {
3976 for (value = 0; value < 256; value++)
3977 if (isGRAPH(value))
3978 ANYOF_BITMAP_SET(ret, value);
3979 }
3980 yesno = '+';
3981 what = "Graph";
3982 break;
3983 case ANYOF_NGRAPH:
3984 if (LOC)
3985 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
3986 else {
3987 for (value = 0; value < 256; value++)
3988 if (!isGRAPH(value))
3989 ANYOF_BITMAP_SET(ret, value);
3990 }
3991 yesno = '!';
3992 what = "Graph";
3993 break;
3994 case ANYOF_LOWER:
3995 if (LOC)
3996 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
3997 else {
3998 for (value = 0; value < 256; value++)
3999 if (isLOWER(value))
4000 ANYOF_BITMAP_SET(ret, value);
4001 }
4002 yesno = '+';
4003 what = "Lower";
4004 break;
4005 case ANYOF_NLOWER:
4006 if (LOC)
4007 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
4008 else {
4009 for (value = 0; value < 256; value++)
4010 if (!isLOWER(value))
4011 ANYOF_BITMAP_SET(ret, value);
4012 }
4013 yesno = '!';
4014 what = "Lower";
4015 break;
4016 case ANYOF_PRINT:
4017 if (LOC)
4018 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
4019 else {
4020 for (value = 0; value < 256; value++)
4021 if (isPRINT(value))
4022 ANYOF_BITMAP_SET(ret, value);
4023 }
4024 yesno = '+';
4025 what = "Print";
4026 break;
4027 case ANYOF_NPRINT:
4028 if (LOC)
4029 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
4030 else {
4031 for (value = 0; value < 256; value++)
4032 if (!isPRINT(value))
4033 ANYOF_BITMAP_SET(ret, value);
4034 }
4035 yesno = '!';
4036 what = "Print";
4037 break;
4038 case ANYOF_PSXSPC:
4039 if (LOC)
4040 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
4041 else {
4042 for (value = 0; value < 256; value++)
4043 if (isPSXSPC(value))
4044 ANYOF_BITMAP_SET(ret, value);
4045 }
4046 yesno = '+';
4047 what = "Space";
4048 break;
4049 case ANYOF_NPSXSPC:
4050 if (LOC)
4051 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
4052 else {
4053 for (value = 0; value < 256; value++)
4054 if (!isPSXSPC(value))
4055 ANYOF_BITMAP_SET(ret, value);
4056 }
4057 yesno = '!';
4058 what = "Space";
4059 break;
4060 case ANYOF_PUNCT:
4061 if (LOC)
4062 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
4063 else {
4064 for (value = 0; value < 256; value++)
4065 if (isPUNCT(value))
4066 ANYOF_BITMAP_SET(ret, value);
4067 }
4068 yesno = '+';
4069 what = "Punct";
4070 break;
4071 case ANYOF_NPUNCT:
4072 if (LOC)
4073 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
4074 else {
4075 for (value = 0; value < 256; value++)
4076 if (!isPUNCT(value))
4077 ANYOF_BITMAP_SET(ret, value);
4078 }
4079 yesno = '!';
4080 what = "Punct";
4081 break;
4082 case ANYOF_SPACE:
4083 if (LOC)
4084 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
4085 else {
4086 for (value = 0; value < 256; value++)
4087 if (isSPACE(value))
4088 ANYOF_BITMAP_SET(ret, value);
4089 }
4090 yesno = '+';
4091 what = "SpacePerl";
4092 break;
4093 case ANYOF_NSPACE:
4094 if (LOC)
4095 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
4096 else {
4097 for (value = 0; value < 256; value++)
4098 if (!isSPACE(value))
4099 ANYOF_BITMAP_SET(ret, value);
4100 }
4101 yesno = '!';
4102 what = "SpacePerl";
4103 break;
4104 case ANYOF_UPPER:
4105 if (LOC)
4106 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
4107 else {
4108 for (value = 0; value < 256; value++)
4109 if (isUPPER(value))
4110 ANYOF_BITMAP_SET(ret, value);
4111 }
4112 yesno = '+';
4113 what = "Upper";
4114 break;
4115 case ANYOF_NUPPER:
4116 if (LOC)
4117 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
4118 else {
4119 for (value = 0; value < 256; value++)
4120 if (!isUPPER(value))
4121 ANYOF_BITMAP_SET(ret, value);
4122 }
4123 yesno = '!';
4124 what = "Upper";
4125 break;
4126 case ANYOF_XDIGIT:
4127 if (LOC)
4128 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
4129 else {
4130 for (value = 0; value < 256; value++)
4131 if (isXDIGIT(value))
4132 ANYOF_BITMAP_SET(ret, value);
4133 }
4134 yesno = '+';
4135 what = "XDigit";
4136 break;
4137 case ANYOF_NXDIGIT:
4138 if (LOC)
4139 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
4140 else {
4141 for (value = 0; value < 256; value++)
4142 if (!isXDIGIT(value))
4143 ANYOF_BITMAP_SET(ret, value);
4144 }
4145 yesno = '!';
4146 what = "XDigit";
4147 break;
4148 case ANYOF_MAX:
4149 /* this is to handle \p and \P */
4150 break;
4151 default:
4152 vFAIL("Invalid [::] class");
4153 break;
4154 }
4155 if (what) {
4156 /* Strings such as "+utf8::isWord\n" */
4157 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
4158 }
4159 if (LOC)
4160 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
4161 continue;
4162 }
4163 } /* end of namedclass \blah */
4164
4165 if (range) {
4166 if (prevvalue > (IV)value) /* b-a */ {
4167 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
4168 RExC_parse - rangebegin,
4169 RExC_parse - rangebegin,
4170 rangebegin);
4171 range = 0; /* not a valid range */
4172 }
4173 }
4174 else {
4175 prevvalue = value; /* save the beginning of the range */
4176 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
4177 RExC_parse[1] != ']') {
4178 RExC_parse++;
4179
4180 /* a bad range like \w-, [:word:]- ? */
4181 if (namedclass > OOB_NAMEDCLASS) {
4182 if (ckWARN(WARN_REGEXP))
4183 vWARN4(RExC_parse,
4184 "False [] range \"%*.*s\"",
4185 RExC_parse - rangebegin,
4186 RExC_parse - rangebegin,
4187 rangebegin);
4188 if (!SIZE_ONLY)
4189 ANYOF_BITMAP_SET(ret, '-');
4190 } else
4191 range = 1; /* yeah, it's a range! */
4192 continue; /* but do it the next time */
4193 }
4194 }
4195
4196 /* now is the next time */
4197 if (!SIZE_ONLY) {
4198 IV i;
4199
4200 if (prevvalue < 256) {
4201 const IV ceilvalue = value < 256 ? value : 255;
4202
4203 #ifdef EBCDIC
4204 /* In EBCDIC [\x89-\x91] should include
4205 * the \x8e but [i-j] should not. */
4206 if (literal_endpoint == 2 &&
4207 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
4208 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
4209 {
4210 if (isLOWER(prevvalue)) {
4211 for (i = prevvalue; i <= ceilvalue; i++)
4212 if (isLOWER(i))
4213 ANYOF_BITMAP_SET(ret, i);
4214 } else {
4215 for (i = prevvalue; i <= ceilvalue; i++)
4216 if (isUPPER(i))
4217 ANYOF_BITMAP_SET(ret, i);
4218 }
4219 }
4220 else
4221 #endif
4222 for (i = prevvalue; i <= ceilvalue; i++)
4223 ANYOF_BITMAP_SET(ret, i);
4224 }
4225 if (value > 255 || UTF) {
4226 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
4227 const UV natvalue = NATIVE_TO_UNI(value);
4228
4229 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4230 if (prevnatvalue < natvalue) { /* what about > ? */
4231 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
4232 prevnatvalue, natvalue);
4233 }
4234 else if (prevnatvalue == natvalue) {
4235 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
4236 if (FOLD) {
4237 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
4238 STRLEN foldlen;
4239 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
4240
4241 /* If folding and foldable and a single
4242 * character, insert also the folded version
4243 * to the charclass. */
4244 if (f != value) {
4245 if (foldlen == (STRLEN)UNISKIP(f))
4246 Perl_sv_catpvf(aTHX_ listsv,
4247 "%04"UVxf"\n", f);
4248 else {
4249 /* Any multicharacter foldings
4250 * require the following transform:
4251 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
4252 * where E folds into "pq" and F folds
4253 * into "rst", all other characters
4254 * fold to single characters. We save
4255 * away these multicharacter foldings,
4256 * to be later saved as part of the
4257 * additional "s" data. */
4258 SV *sv;
4259
4260 if (!unicode_alternate)
4261 unicode_alternate = newAV();
4262 sv = newSVpvn((char*)foldbuf, foldlen);
4263 SvUTF8_on(sv);
4264 av_push(unicode_alternate, sv);
4265 }
4266 }
4267
4268 /* If folding and the value is one of the Greek
4269 * sigmas insert a few more sigmas to make the
4270 * folding rules of the sigmas to work right.
4271 * Note that not all the possible combinations
4272 * are handled here: some of them are handled
4273 * by the standard folding rules, and some of
4274 * them (literal or EXACTF cases) are handled
4275 * during runtime in regexec.c:S_find_byclass(). */
4276 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
4277 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4278 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
4279 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4280 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4281 }
4282 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
4283 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4284 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4285 }
4286 }
4287 }
4288 #ifdef EBCDIC
4289 literal_endpoint = 0;
4290 #endif
4291 }
4292
4293 range = 0; /* this range (if it was one) is done now */
4294 }
4295
4296 if (need_class) {
4297 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
4298 if (SIZE_ONLY)
4299 RExC_size += ANYOF_CLASS_ADD_SKIP;
4300 else
4301 RExC_emit += ANYOF_CLASS_ADD_SKIP;
4302 }
4303
4304 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
4305 if (!SIZE_ONLY &&
4306 /* If the only flag is folding (plus possibly inversion). */
4307 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
4308 ) {
4309 for (value = 0; value < 256; ++value) {
4310 if (ANYOF_BITMAP_TEST(ret, value)) {
4311 UV fold = PL_fold[value];
4312
4313 if (fold != value)
4314 ANYOF_BITMAP_SET(ret, fold);
4315 }
4316 }
4317 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
4318 }
4319
4320 /* optimize inverted simple patterns (e.g. [^a-z]) */
4321 if (!SIZE_ONLY && optimize_invert &&
4322 /* If the only flag is inversion. */
4323 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
4324 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
4325 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
4326 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
4327 }
4328
4329 if (!SIZE_ONLY) {
4330 AV *av = newAV();
4331 SV *rv;
4332
4333 /* The 0th element stores the character class description
4334 * in its textual form: used later (regexec.c:Perl_regclass_swash())
4335 * to initialize the appropriate swash (which gets stored in
4336 * the 1st element), and also useful for dumping the regnode.
4337 * The 2nd element stores the multicharacter foldings,
4338 * used later (regexec.c:S_reginclass()). */
4339 av_store(av, 0, listsv);
4340 av_store(av, 1, NULL);
4341 av_store(av, 2, (SV*)unicode_alternate);
4342 rv = newRV_noinc((SV*)av);
4343 n = add_data(pRExC_state, 1, "s");
4344 RExC_rx->data->data[n] = (void*)rv;
4345 ARG_SET(ret, n);
4346 }
4347
4348 return ret;
4349 }
4350
4351 STATIC char*
S_nextchar(pTHX_ RExC_state_t * pRExC_state)4352 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
4353 {
4354 char* retval = RExC_parse++;
4355
4356 for (;;) {
4357 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
4358 RExC_parse[2] == '#') {
4359 while (*RExC_parse != ')') {
4360 if (RExC_parse == RExC_end)
4361 FAIL("Sequence (?#... not terminated");
4362 RExC_parse++;
4363 }
4364 RExC_parse++;
4365 continue;
4366 }
4367 if (RExC_flags & PMf_EXTENDED) {
4368 if (isSPACE(*RExC_parse)) {
4369 RExC_parse++;
4370 continue;
4371 }
4372 else if (*RExC_parse == '#') {
4373 while (RExC_parse < RExC_end)
4374 if (*RExC_parse++ == '\n') break;
4375 continue;
4376 }
4377 }
4378 return retval;
4379 }
4380 }
4381
4382 /*
4383 - reg_node - emit a node
4384 */
4385 STATIC regnode * /* Location. */
S_reg_node(pTHX_ RExC_state_t * pRExC_state,U8 op)4386 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
4387 {
4388 register regnode *ptr;
4389 regnode * const ret = RExC_emit;
4390
4391 if (SIZE_ONLY) {
4392 SIZE_ALIGN(RExC_size);
4393 RExC_size += 1;
4394 return(ret);
4395 }
4396
4397 NODE_ALIGN_FILL(ret);
4398 ptr = ret;
4399 FILL_ADVANCE_NODE(ptr, op);
4400 if (RExC_offsets) { /* MJD */
4401 MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
4402 "reg_node", __LINE__,
4403 reg_name[op],
4404 RExC_emit - RExC_emit_start > RExC_offsets[0]
4405 ? "Overwriting end of array!\n" : "OK",
4406 RExC_emit - RExC_emit_start,
4407 RExC_parse - RExC_start,
4408 RExC_offsets[0]));
4409 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
4410 }
4411
4412 RExC_emit = ptr;
4413
4414 return(ret);
4415 }
4416
4417 /*
4418 - reganode - emit a node with an argument
4419 */
4420 STATIC regnode * /* Location. */
S_reganode(pTHX_ RExC_state_t * pRExC_state,U8 op,U32 arg)4421 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
4422 {
4423 register regnode *ptr;
4424 regnode * const ret = RExC_emit;
4425
4426 if (SIZE_ONLY) {
4427 SIZE_ALIGN(RExC_size);
4428 RExC_size += 2;
4429 return(ret);
4430 }
4431
4432 NODE_ALIGN_FILL(ret);
4433 ptr = ret;
4434 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
4435 if (RExC_offsets) { /* MJD */
4436 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
4437 "reganode",
4438 __LINE__,
4439 reg_name[op],
4440 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
4441 "Overwriting end of array!\n" : "OK",
4442 RExC_emit - RExC_emit_start,
4443 RExC_parse - RExC_start,
4444 RExC_offsets[0]));
4445 Set_Cur_Node_Offset;
4446 }
4447
4448 RExC_emit = ptr;
4449
4450 return(ret);
4451 }
4452
4453 /*
4454 - reguni - emit (if appropriate) a Unicode character
4455 */
4456 STATIC void
S_reguni(pTHX_ const RExC_state_t * pRExC_state,UV uv,char * s,STRLEN * lenp)4457 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
4458 {
4459 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
4460 }
4461
4462 /*
4463 - reginsert - insert an operator in front of already-emitted operand
4464 *
4465 * Means relocating the operand.
4466 */
4467 STATIC void
S_reginsert(pTHX_ RExC_state_t * pRExC_state,U8 op,regnode * opnd)4468 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
4469 {
4470 register regnode *src;
4471 register regnode *dst;
4472 register regnode *place;
4473 const int offset = regarglen[(U8)op];
4474
4475 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
4476
4477 if (SIZE_ONLY) {
4478 RExC_size += NODE_STEP_REGNODE + offset;
4479 return;
4480 }
4481
4482 src = RExC_emit;
4483 RExC_emit += NODE_STEP_REGNODE + offset;
4484 dst = RExC_emit;
4485 while (src > opnd) {
4486 StructCopy(--src, --dst, regnode);
4487 if (RExC_offsets) { /* MJD 20010112 */
4488 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n",
4489 "reg_insert",
4490 __LINE__,
4491 reg_name[op],
4492 dst - RExC_emit_start > RExC_offsets[0]
4493 ? "Overwriting end of array!\n" : "OK",
4494 src - RExC_emit_start,
4495 dst - RExC_emit_start,
4496 RExC_offsets[0]));
4497 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
4498 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
4499 }
4500 }
4501
4502
4503 place = opnd; /* Op node, where operand used to be. */
4504 if (RExC_offsets) { /* MJD */
4505 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
4506 "reginsert",
4507 __LINE__,
4508 reg_name[op],
4509 place - RExC_emit_start > RExC_offsets[0]
4510 ? "Overwriting end of array!\n" : "OK",
4511 place - RExC_emit_start,
4512 RExC_parse - RExC_start,
4513 RExC_offsets[0]));
4514 Set_Node_Offset(place, RExC_parse);
4515 Set_Node_Length(place, 1);
4516 }
4517 src = NEXTOPER(place);
4518 FILL_ADVANCE_NODE(place, op);
4519 Zero(src, offset, regnode);
4520 }
4521
4522 /*
4523 - regtail - set the next-pointer at the end of a node chain of p to val.
4524 */
4525 STATIC void
S_regtail(pTHX_ RExC_state_t * pRExC_state,regnode * p,regnode * val)4526 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4527 {
4528 register regnode *scan;
4529
4530 if (SIZE_ONLY)
4531 return;
4532
4533 /* Find last node. */
4534 scan = p;
4535 for (;;) {
4536 regnode * const temp = regnext(scan);
4537 if (temp == NULL)
4538 break;
4539 scan = temp;
4540 }
4541
4542 if (reg_off_by_arg[OP(scan)]) {
4543 ARG_SET(scan, val - scan);
4544 }
4545 else {
4546 NEXT_OFF(scan) = val - scan;
4547 }
4548 }
4549
4550 /*
4551 - regoptail - regtail on operand of first argument; nop if operandless
4552 */
4553 STATIC void
S_regoptail(pTHX_ RExC_state_t * pRExC_state,regnode * p,regnode * val)4554 S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4555 {
4556 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
4557 if (p == NULL || SIZE_ONLY)
4558 return;
4559 if (PL_regkind[(U8)OP(p)] == BRANCH) {
4560 regtail(pRExC_state, NEXTOPER(p), val);
4561 }
4562 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
4563 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
4564 }
4565 else
4566 return;
4567 }
4568
4569 /*
4570 - regcurly - a little FSA that accepts {\d+,?\d*}
4571 */
4572 STATIC I32
S_regcurly(pTHX_ register const char * s)4573 S_regcurly(pTHX_ register const char *s)
4574 {
4575 if (*s++ != '{')
4576 return FALSE;
4577 if (!isDIGIT(*s))
4578 return FALSE;
4579 while (isDIGIT(*s))
4580 s++;
4581 if (*s == ',')
4582 s++;
4583 while (isDIGIT(*s))
4584 s++;
4585 if (*s != '}')
4586 return FALSE;
4587 return TRUE;
4588 }
4589
4590 /*
4591 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
4592 */
4593 void
Perl_regdump(pTHX_ regexp * r)4594 Perl_regdump(pTHX_ regexp *r)
4595 {
4596 #ifdef DEBUGGING
4597 SV *sv = sv_newmortal();
4598
4599 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
4600
4601 /* Header fields of interest. */
4602 if (r->anchored_substr)
4603 PerlIO_printf(Perl_debug_log,
4604 "anchored \"%s%.*s%s\"%s at %"IVdf" ",
4605 PL_colors[0],
4606 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
4607 SvPVX_const(r->anchored_substr),
4608 PL_colors[1],
4609 SvTAIL(r->anchored_substr) ? "$" : "",
4610 (IV)r->anchored_offset);
4611 else if (r->anchored_utf8)
4612 PerlIO_printf(Perl_debug_log,
4613 "anchored utf8 \"%s%.*s%s\"%s at %"IVdf" ",
4614 PL_colors[0],
4615 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
4616 SvPVX_const(r->anchored_utf8),
4617 PL_colors[1],
4618 SvTAIL(r->anchored_utf8) ? "$" : "",
4619 (IV)r->anchored_offset);
4620 if (r->float_substr)
4621 PerlIO_printf(Perl_debug_log,
4622 "floating \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
4623 PL_colors[0],
4624 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
4625 SvPVX_const(r->float_substr),
4626 PL_colors[1],
4627 SvTAIL(r->float_substr) ? "$" : "",
4628 (IV)r->float_min_offset, (UV)r->float_max_offset);
4629 else if (r->float_utf8)
4630 PerlIO_printf(Perl_debug_log,
4631 "floating utf8 \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
4632 PL_colors[0],
4633 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
4634 SvPVX_const(r->float_utf8),
4635 PL_colors[1],
4636 SvTAIL(r->float_utf8) ? "$" : "",
4637 (IV)r->float_min_offset, (UV)r->float_max_offset);
4638 if (r->check_substr || r->check_utf8)
4639 PerlIO_printf(Perl_debug_log,
4640 r->check_substr == r->float_substr
4641 && r->check_utf8 == r->float_utf8
4642 ? "(checking floating" : "(checking anchored");
4643 if (r->reganch & ROPT_NOSCAN)
4644 PerlIO_printf(Perl_debug_log, " noscan");
4645 if (r->reganch & ROPT_CHECK_ALL)
4646 PerlIO_printf(Perl_debug_log, " isall");
4647 if (r->check_substr || r->check_utf8)
4648 PerlIO_printf(Perl_debug_log, ") ");
4649
4650 if (r->regstclass) {
4651 regprop(sv, r->regstclass);
4652 PerlIO_printf(Perl_debug_log, "stclass \"%s\" ", SvPVX_const(sv));
4653 }
4654 if (r->reganch & ROPT_ANCH) {
4655 PerlIO_printf(Perl_debug_log, "anchored");
4656 if (r->reganch & ROPT_ANCH_BOL)
4657 PerlIO_printf(Perl_debug_log, "(BOL)");
4658 if (r->reganch & ROPT_ANCH_MBOL)
4659 PerlIO_printf(Perl_debug_log, "(MBOL)");
4660 if (r->reganch & ROPT_ANCH_SBOL)
4661 PerlIO_printf(Perl_debug_log, "(SBOL)");
4662 if (r->reganch & ROPT_ANCH_GPOS)
4663 PerlIO_printf(Perl_debug_log, "(GPOS)");
4664 PerlIO_putc(Perl_debug_log, ' ');
4665 }
4666 if (r->reganch & ROPT_GPOS_SEEN)
4667 PerlIO_printf(Perl_debug_log, "GPOS ");
4668 if (r->reganch & ROPT_SKIP)
4669 PerlIO_printf(Perl_debug_log, "plus ");
4670 if (r->reganch & ROPT_IMPLICIT)
4671 PerlIO_printf(Perl_debug_log, "implicit ");
4672 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
4673 if (r->reganch & ROPT_EVAL_SEEN)
4674 PerlIO_printf(Perl_debug_log, "with eval ");
4675 PerlIO_printf(Perl_debug_log, "\n");
4676 if (r->offsets) {
4677 U32 i;
4678 const U32 len = r->offsets[0];
4679 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
4680 for (i = 1; i <= len; i++)
4681 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
4682 (UV)r->offsets[i*2-1],
4683 (UV)r->offsets[i*2]);
4684 PerlIO_printf(Perl_debug_log, "\n");
4685 }
4686 #endif /* DEBUGGING */
4687 }
4688
4689 /*
4690 - regprop - printable representation of opcode
4691 */
4692 void
Perl_regprop(pTHX_ SV * sv,regnode * o)4693 Perl_regprop(pTHX_ SV *sv, regnode *o)
4694 {
4695 #ifdef DEBUGGING
4696 register int k;
4697
4698 sv_setpvn(sv, "", 0);
4699 if (OP(o) >= reg_num) /* regnode.type is unsigned */
4700 /* It would be nice to FAIL() here, but this may be called from
4701 regexec.c, and it would be hard to supply pRExC_state. */
4702 Perl_croak(aTHX_ "Corrupted regexp opcode");
4703 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
4704
4705 k = PL_regkind[(U8)OP(o)];
4706
4707 if (k == EXACT) {
4708 SV * const dsv = sv_2mortal(newSVpvn("", 0));
4709 /* Using is_utf8_string() is a crude hack but it may
4710 * be the best for now since we have no flag "this EXACTish
4711 * node was UTF-8" --jhi */
4712 const bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
4713 const char * const s = do_utf8 ?
4714 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
4715 UNI_DISPLAY_REGEX) :
4716 STRING(o);
4717 const int len = do_utf8 ?
4718 strlen(s) :
4719 STR_LEN(o);
4720 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
4721 PL_colors[0],
4722 len, s,
4723 PL_colors[1]);
4724 }
4725 else if (k == CURLY) {
4726 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
4727 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4728 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
4729 }
4730 else if (k == WHILEM && o->flags) /* Ordinal/of */
4731 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
4732 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
4733 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
4734 else if (k == LOGICAL)
4735 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
4736 else if (k == ANYOF) {
4737 int i, rangestart = -1;
4738 const U8 flags = ANYOF_FLAGS(o);
4739 const char * const anyofs[] = { /* Should be synchronized with
4740 * ANYOF_ #xdefines in regcomp.h */
4741 "\\w",
4742 "\\W",
4743 "\\s",
4744 "\\S",
4745 "\\d",
4746 "\\D",
4747 "[:alnum:]",
4748 "[:^alnum:]",
4749 "[:alpha:]",
4750 "[:^alpha:]",
4751 "[:ascii:]",
4752 "[:^ascii:]",
4753 "[:ctrl:]",
4754 "[:^ctrl:]",
4755 "[:graph:]",
4756 "[:^graph:]",
4757 "[:lower:]",
4758 "[:^lower:]",
4759 "[:print:]",
4760 "[:^print:]",
4761 "[:punct:]",
4762 "[:^punct:]",
4763 "[:upper:]",
4764 "[:^upper:]",
4765 "[:xdigit:]",
4766 "[:^xdigit:]",
4767 "[:space:]",
4768 "[:^space:]",
4769 "[:blank:]",
4770 "[:^blank:]"
4771 };
4772
4773 if (flags & ANYOF_LOCALE)
4774 sv_catpv(sv, "{loc}");
4775 if (flags & ANYOF_FOLD)
4776 sv_catpv(sv, "{i}");
4777 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
4778 if (flags & ANYOF_INVERT)
4779 sv_catpv(sv, "^");
4780 for (i = 0; i <= 256; i++) {
4781 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4782 if (rangestart == -1)
4783 rangestart = i;
4784 } else if (rangestart != -1) {
4785 if (i <= rangestart + 3)
4786 for (; rangestart < i; rangestart++)
4787 put_byte(sv, rangestart);
4788 else {
4789 put_byte(sv, rangestart);
4790 sv_catpv(sv, "-");
4791 put_byte(sv, i - 1);
4792 }
4793 rangestart = -1;
4794 }
4795 }
4796
4797 if (o->flags & ANYOF_CLASS)
4798 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
4799 if (ANYOF_CLASS_TEST(o,i))
4800 sv_catpv(sv, anyofs[i]);
4801
4802 if (flags & ANYOF_UNICODE)
4803 sv_catpv(sv, "{unicode}");
4804 else if (flags & ANYOF_UNICODE_ALL)
4805 sv_catpv(sv, "{unicode_all}");
4806
4807 {
4808 SV *lv;
4809 SV * const sw = regclass_swash(o, FALSE, &lv, 0);
4810
4811 if (lv) {
4812 if (sw) {
4813 U8 s[UTF8_MAXBYTES_CASE+1];
4814
4815 for (i = 0; i <= 256; i++) { /* just the first 256 */
4816 uvchr_to_utf8(s, i);
4817
4818 if (i < 256 && swash_fetch(sw, s, TRUE)) {
4819 if (rangestart == -1)
4820 rangestart = i;
4821 } else if (rangestart != -1) {
4822 if (i <= rangestart + 3)
4823 for (; rangestart < i; rangestart++) {
4824 const U8 * const e = uvchr_to_utf8(s,rangestart);
4825 U8 *p;
4826 for(p = s; p < e; p++)
4827 put_byte(sv, *p);
4828 }
4829 else {
4830 const U8 *e = uvchr_to_utf8(s,rangestart);
4831 U8 *p;
4832 for (p = s; p < e; p++)
4833 put_byte(sv, *p);
4834 sv_catpvn(sv, "-", 1);
4835 e = uvchr_to_utf8(s, i-1);
4836 for (p = s; p < e; p++)
4837 put_byte(sv, *p);
4838 }
4839 rangestart = -1;
4840 }
4841 }
4842
4843 sv_catpv(sv, "..."); /* et cetera */
4844 }
4845
4846 {
4847 char *s = savesvpv(lv);
4848 char *origs = s;
4849
4850 while(*s && *s != '\n') s++;
4851
4852 if (*s == '\n') {
4853 const char * const t = ++s;
4854
4855 while (*s) {
4856 if (*s == '\n')
4857 *s = ' ';
4858 s++;
4859 }
4860 if (s[-1] == ' ')
4861 s[-1] = 0;
4862
4863 sv_catpv(sv, t);
4864 }
4865
4866 Safefree(origs);
4867 }
4868 }
4869 }
4870
4871 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4872 }
4873 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
4874 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
4875 #endif /* DEBUGGING */
4876 }
4877
4878 SV *
Perl_re_intuit_string(pTHX_ regexp * prog)4879 Perl_re_intuit_string(pTHX_ regexp *prog)
4880 { /* Assume that RE_INTUIT is set */
4881 DEBUG_r(
4882 {
4883 const char * const s = SvPV_nolen_const(prog->check_substr
4884 ? prog->check_substr : prog->check_utf8);
4885
4886 if (!PL_colorset) reginitcolors();
4887 PerlIO_printf(Perl_debug_log,
4888 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
4889 PL_colors[4],
4890 prog->check_substr ? "" : "utf8 ",
4891 PL_colors[5],PL_colors[0],
4892 s,
4893 PL_colors[1],
4894 (strlen(s) > 60 ? "..." : ""));
4895 } );
4896
4897 return prog->check_substr ? prog->check_substr : prog->check_utf8;
4898 }
4899
4900 void
Perl_pregfree(pTHX_ struct regexp * r)4901 Perl_pregfree(pTHX_ struct regexp *r)
4902 {
4903 #ifdef DEBUGGING
4904 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
4905 #endif
4906
4907 if (!r || (--r->refcnt > 0))
4908 return;
4909 DEBUG_r({
4910 const char *s = (r->reganch & ROPT_UTF8)
4911 ? pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60, UNI_DISPLAY_REGEX)
4912 : pv_display(dsv, r->precomp, r->prelen, 0, 60);
4913 const int len = SvCUR(dsv);
4914 if (!PL_colorset)
4915 reginitcolors();
4916 PerlIO_printf(Perl_debug_log,
4917 "%sFreeing REx:%s `%s%*.*s%s%s'\n",
4918 PL_colors[4],PL_colors[5],PL_colors[0],
4919 len, len, s,
4920 PL_colors[1],
4921 len > 60 ? "..." : "");
4922 });
4923
4924 /* gcov results gave these as non-null 100% of the time, so there's no
4925 optimisation in checking them before calling Safefree */
4926 Safefree(r->precomp);
4927 Safefree(r->offsets); /* 20010421 MJD */
4928 if (RX_MATCH_COPIED(r))
4929 Safefree(r->subbeg);
4930 if (r->substrs) {
4931 if (r->anchored_substr)
4932 SvREFCNT_dec(r->anchored_substr);
4933 if (r->anchored_utf8)
4934 SvREFCNT_dec(r->anchored_utf8);
4935 if (r->float_substr)
4936 SvREFCNT_dec(r->float_substr);
4937 if (r->float_utf8)
4938 SvREFCNT_dec(r->float_utf8);
4939 Safefree(r->substrs);
4940 }
4941 if (r->data) {
4942 int n = r->data->count;
4943 PAD* new_comppad = NULL;
4944 PAD* old_comppad;
4945 PADOFFSET refcnt;
4946
4947 while (--n >= 0) {
4948 /* If you add a ->what type here, update the comment in regcomp.h */
4949 switch (r->data->what[n]) {
4950 case 's':
4951 SvREFCNT_dec((SV*)r->data->data[n]);
4952 break;
4953 case 'f':
4954 Safefree(r->data->data[n]);
4955 break;
4956 case 'p':
4957 new_comppad = (AV*)r->data->data[n];
4958 break;
4959 case 'o':
4960 if (new_comppad == NULL)
4961 Perl_croak(aTHX_ "panic: pregfree comppad");
4962 PAD_SAVE_LOCAL(old_comppad,
4963 /* Watch out for global destruction's random ordering. */
4964 (SvTYPE(new_comppad) == SVt_PVAV) ?
4965 new_comppad : Null(PAD *)
4966 );
4967 OP_REFCNT_LOCK;
4968 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
4969 OP_REFCNT_UNLOCK;
4970 if (!refcnt)
4971 op_free((OP_4tree*)r->data->data[n]);
4972
4973 PAD_RESTORE_LOCAL(old_comppad);
4974 SvREFCNT_dec((SV*)new_comppad);
4975 new_comppad = NULL;
4976 break;
4977 case 'n':
4978 break;
4979 default:
4980 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
4981 }
4982 }
4983 Safefree(r->data->what);
4984 Safefree(r->data);
4985 }
4986 Safefree(r->startp);
4987 Safefree(r->endp);
4988 Safefree(r);
4989 }
4990
4991 /*
4992 - regnext - dig the "next" pointer out of a node
4993 *
4994 * [Note, when REGALIGN is defined there are two places in regmatch()
4995 * that bypass this code for speed.]
4996 */
4997 regnode *
Perl_regnext(pTHX_ register regnode * p)4998 Perl_regnext(pTHX_ register regnode *p)
4999 {
5000 register I32 offset;
5001
5002 if (p == &PL_regdummy)
5003 return(NULL);
5004
5005 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
5006 if (offset == 0)
5007 return(NULL);
5008
5009 return(p+offset);
5010 }
5011
5012 STATIC void
S_re_croak2(pTHX_ const char * pat1,const char * pat2,...)5013 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
5014 {
5015 va_list args;
5016 STRLEN l1 = strlen(pat1);
5017 STRLEN l2 = strlen(pat2);
5018 char buf[512];
5019 SV *msv;
5020 const char *message;
5021
5022 if (l1 > 510)
5023 l1 = 510;
5024 if (l1 + l2 > 510)
5025 l2 = 510 - l1;
5026 Copy(pat1, buf, l1 , char);
5027 Copy(pat2, buf + l1, l2 , char);
5028 buf[l1 + l2] = '\n';
5029 buf[l1 + l2 + 1] = '\0';
5030 #ifdef I_STDARG
5031 /* ANSI variant takes additional second argument */
5032 va_start(args, pat2);
5033 #else
5034 va_start(args);
5035 #endif
5036 msv = vmess(buf, &args);
5037 va_end(args);
5038 message = SvPV_const(msv,l1);
5039 if (l1 > 512)
5040 l1 = 512;
5041 Copy(message, buf, l1 , char);
5042 buf[l1-1] = '\0'; /* Overwrite \n */
5043 Perl_croak(aTHX_ "%s", buf);
5044 }
5045
5046 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
5047
5048 void
Perl_save_re_context(pTHX)5049 Perl_save_re_context(pTHX)
5050 {
5051 SAVEI32(PL_reg_flags); /* from regexec.c */
5052 SAVEPPTR(PL_bostr);
5053 SAVEPPTR(PL_reginput); /* String-input pointer. */
5054 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
5055 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
5056 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
5057 SAVEVPTR(PL_regendp); /* Ditto for endp. */
5058 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
5059 SAVEVPTR(PL_reglastcloseparen); /* Similarly for lastcloseparen. */
5060 SAVEPPTR(PL_regtill); /* How far we are required to go. */
5061 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
5062 PL_reg_start_tmp = 0;
5063 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
5064 PL_reg_start_tmpl = 0;
5065 SAVEVPTR(PL_regdata);
5066 SAVEI32(PL_reg_eval_set); /* from regexec.c */
5067 SAVEI32(PL_regnarrate); /* from regexec.c */
5068 SAVEVPTR(PL_regprogram); /* from regexec.c */
5069 SAVEINT(PL_regindent); /* from regexec.c */
5070 SAVEVPTR(PL_regcc); /* from regexec.c */
5071 SAVEVPTR(PL_curcop);
5072 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
5073 SAVEVPTR(PL_reg_re); /* from regexec.c */
5074 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
5075 SAVESPTR(PL_reg_sv); /* from regexec.c */
5076 SAVEBOOL(PL_reg_match_utf8); /* from regexec.c */
5077 SAVEVPTR(PL_reg_magic); /* from regexec.c */
5078 SAVEI32(PL_reg_oldpos); /* from regexec.c */
5079 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
5080 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
5081 SAVEPPTR(PL_reg_oldsaved); /* old saved substr during match */
5082 PL_reg_oldsaved = Nullch;
5083 SAVEI32(PL_reg_oldsavedlen); /* old length of saved substr during match */
5084 PL_reg_oldsavedlen = 0;
5085 SAVEI32(PL_reg_maxiter); /* max wait until caching pos */
5086 PL_reg_maxiter = 0;
5087 SAVEI32(PL_reg_leftiter); /* wait until caching pos */
5088 PL_reg_leftiter = 0;
5089 SAVEGENERICPV(PL_reg_poscache); /* cache of pos of WHILEM */
5090 PL_reg_poscache = Nullch;
5091 SAVEI32(PL_reg_poscache_size); /* size of pos cache of WHILEM */
5092 PL_reg_poscache_size = 0;
5093 SAVEPPTR(PL_regprecomp); /* uncompiled string. */
5094 SAVEI32(PL_regnpar); /* () count. */
5095 SAVEI32(PL_regsize); /* from regexec.c */
5096
5097 {
5098 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
5099 REGEXP *rx;
5100
5101 if (PL_curpm && (rx = PM_GETRE(PL_curpm))) {
5102 U32 i;
5103 for (i = 1; i <= rx->nparens; i++) {
5104 GV *mgv;
5105 char digits[TYPE_CHARS(long)];
5106 sprintf(digits, "%lu", (long)i);
5107 if ((mgv = gv_fetchpv(digits, FALSE, SVt_PV)))
5108 save_scalar(mgv);
5109 }
5110 }
5111 }
5112
5113 #ifdef DEBUGGING
5114 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
5115 #endif
5116 }
5117
5118 static void
clear_re(pTHX_ void * r)5119 clear_re(pTHX_ void *r)
5120 {
5121 ReREFCNT_dec((regexp *)r);
5122 }
5123
5124 #ifdef DEBUGGING
5125
5126 STATIC void
S_put_byte(pTHX_ SV * sv,int c)5127 S_put_byte(pTHX_ SV *sv, int c)
5128 {
5129 if (isCNTRL(c) || c == 255 || !isPRINT(c))
5130 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
5131 else if (c == '-' || c == ']' || c == '\\' || c == '^')
5132 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
5133 else
5134 Perl_sv_catpvf(aTHX_ sv, "%c", c);
5135 }
5136
5137
5138 STATIC regnode *
S_dumpuntil(pTHX_ regnode * start,regnode * node,regnode * last,SV * sv,I32 l)5139 S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
5140 {
5141 register U8 op = EXACT; /* Arbitrary non-END op. */
5142 register regnode *next;
5143
5144 while (op != END && (!last || node < last)) {
5145 /* While that wasn't END last time... */
5146
5147 NODE_ALIGN(node);
5148 op = OP(node);
5149 if (op == CLOSE)
5150 l--;
5151 next = regnext(node);
5152 /* Where, what. */
5153 if (OP(node) == OPTIMIZED)
5154 goto after_print;
5155 regprop(sv, node);
5156 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
5157 (int)(2*l + 1), "", SvPVX_const(sv));
5158 if (next == NULL) /* Next ptr. */
5159 PerlIO_printf(Perl_debug_log, "(0)");
5160 else
5161 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
5162 (void)PerlIO_putc(Perl_debug_log, '\n');
5163 after_print:
5164 if (PL_regkind[(U8)op] == BRANCHJ) {
5165 register regnode *nnode = (OP(next) == LONGJMP
5166 ? regnext(next)
5167 : next);
5168 if (last && nnode > last)
5169 nnode = last;
5170 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
5171 }
5172 else if (PL_regkind[(U8)op] == BRANCH) {
5173 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
5174 }
5175 else if ( op == CURLY) { /* "next" might be very big: optimizer */
5176 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
5177 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
5178 }
5179 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
5180 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
5181 next, sv, l + 1);
5182 }
5183 else if ( op == PLUS || op == STAR) {
5184 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
5185 }
5186 else if (op == ANYOF) {
5187 /* arglen 1 + class block */
5188 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
5189 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
5190 node = NEXTOPER(node);
5191 }
5192 else if (PL_regkind[(U8)op] == EXACT) {
5193 /* Literal string, where present. */
5194 node += NODE_SZ_STR(node) - 1;
5195 node = NEXTOPER(node);
5196 }
5197 else {
5198 node = NEXTOPER(node);
5199 node += regarglen[(U8)op];
5200 }
5201 if (op == CURLYX || op == OPEN)
5202 l++;
5203 else if (op == WHILEM)
5204 l--;
5205 }
5206 return node;
5207 }
5208
5209 #endif /* DEBUGGING */
5210
5211 /*
5212 * Local variables:
5213 * c-indentation-style: bsd
5214 * c-basic-offset: 4
5215 * indent-tabs-mode: t
5216 * End:
5217 *
5218 * ex: set ts=8 sts=4 sw=4 noet:
5219 */
5220