1 /* regcomp.h 2 * 3 * Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 4 * 2000, 2001, 2002, 2003, 2005 by Larry Wall and others 5 * 6 * You may distribute under the terms of either the GNU General Public 7 * License or the Artistic License, as specified in the README file. 8 * 9 */ 10 11 typedef OP OP_4tree; /* Will be redefined later. */ 12 13 /* 14 * The "internal use only" fields in regexp.h are present to pass info from 15 * compile to execute that permits the execute phase to run lots faster on 16 * simple cases. They are: 17 * 18 * regstart sv that must begin a match; Nullch if none obvious 19 * reganch is the match anchored (at beginning-of-line only)? 20 * regmust string (pointer into program) that match must include, or NULL 21 * [regmust changed to SV* for bminstr()--law] 22 * regmlen length of regmust string 23 * [regmlen not used currently] 24 * 25 * Regstart and reganch permit very fast decisions on suitable starting points 26 * for a match, cutting down the work a lot. Regmust permits fast rejection 27 * of lines that cannot possibly match. The regmust tests are costly enough 28 * that pregcomp() supplies a regmust only if the r.e. contains something 29 * potentially expensive (at present, the only such thing detected is * or + 30 * at the start of the r.e., which can involve a lot of backup). Regmlen is 31 * supplied because the test in pregexec() needs it and pregcomp() is computing 32 * it anyway. 33 * [regmust is now supplied always. The tests that use regmust have a 34 * heuristic that disables the test if it usually matches.] 35 * 36 * [In fact, we now use regmust in many cases to locate where the search 37 * starts in the string, so if regback is >= 0, the regmust search is never 38 * wasted effort. The regback variable says how many characters back from 39 * where regmust matched is the earliest possible start of the match. 40 * For instance, /[a-z].foo/ has a regmust of 'foo' and a regback of 2.] 41 */ 42 43 /* 44 * Structure for regexp "program". This is essentially a linear encoding 45 * of a nondeterministic finite-state machine (aka syntax charts or 46 * "railroad normal form" in parsing technology). Each node is an opcode 47 * plus a "next" pointer, possibly plus an operand. "Next" pointers of 48 * all nodes except BRANCH implement concatenation; a "next" pointer with 49 * a BRANCH on both ends of it is connecting two alternatives. (Here we 50 * have one of the subtle syntax dependencies: an individual BRANCH (as 51 * opposed to a collection of them) is never concatenated with anything 52 * because of operator precedence.) The operand of some types of node is 53 * a literal string; for others, it is a node leading into a sub-FSM. In 54 * particular, the operand of a BRANCH node is the first node of the branch. 55 * (NB this is *not* a tree structure: the tail of the branch connects 56 * to the thing following the set of BRANCHes.) The opcodes are: 57 */ 58 59 /* 60 * A node is one char of opcode followed by two chars of "next" pointer. 61 * "Next" pointers are stored as two 8-bit pieces, high order first. The 62 * value is a positive offset from the opcode of the node containing it. 63 * An operand, if any, simply follows the node. (Note that much of the 64 * code generation knows about this implicit relationship.) 65 * 66 * Using two bytes for the "next" pointer is vast overkill for most things, 67 * but allows patterns to get big without disasters. 68 * 69 * [The "next" pointer is always aligned on an even 70 * boundary, and reads the offset directly as a short. Also, there is no 71 * special test to reverse the sign of BACK pointers since the offset is 72 * stored negative.] 73 */ 74 75 struct regnode_string { 76 U8 str_len; 77 U8 type; 78 U16 next_off; 79 char string[1]; 80 }; 81 82 struct regnode_1 { 83 U8 flags; 84 U8 type; 85 U16 next_off; 86 U32 arg1; 87 }; 88 89 struct regnode_2 { 90 U8 flags; 91 U8 type; 92 U16 next_off; 93 U16 arg1; 94 U16 arg2; 95 }; 96 97 #define ANYOF_BITMAP_SIZE 32 /* 256 b/(8 b/B) */ 98 #define ANYOF_CLASSBITMAP_SIZE 4 /* up to 32 (8*4) named classes */ 99 100 struct regnode_charclass { 101 U8 flags; 102 U8 type; 103 U16 next_off; 104 U32 arg1; 105 char bitmap[ANYOF_BITMAP_SIZE]; /* only compile-time */ 106 }; 107 108 struct regnode_charclass_class { /* has [[:blah:]] classes */ 109 U8 flags; /* should have ANYOF_CLASS here */ 110 U8 type; 111 U16 next_off; 112 U32 arg1; 113 char bitmap[ANYOF_BITMAP_SIZE]; /* both compile-time */ 114 char classflags[ANYOF_CLASSBITMAP_SIZE]; /* and run-time */ 115 }; 116 117 /* XXX fix this description. 118 Impose a limit of REG_INFTY on various pattern matching operations 119 to limit stack growth and to avoid "infinite" recursions. 120 */ 121 /* The default size for REG_INFTY is I16_MAX, which is the same as 122 SHORT_MAX (see perl.h). Unfortunately I16 isn't necessarily 16 bits 123 (see handy.h). On the Cray C90, sizeof(short)==4 and hence I16_MAX is 124 ((1<<31)-1), while on the Cray T90, sizeof(short)==8 and I16_MAX is 125 ((1<<63)-1). To limit stack growth to reasonable sizes, supply a 126 smaller default. 127 --Andy Dougherty 11 June 1998 128 */ 129 #if SHORTSIZE > 2 130 # ifndef REG_INFTY 131 # define REG_INFTY ((1<<15)-1) 132 # endif 133 #endif 134 135 #ifndef REG_INFTY 136 # define REG_INFTY I16_MAX 137 #endif 138 139 #define ARG_VALUE(arg) (arg) 140 #define ARG__SET(arg,val) ((arg) = (val)) 141 142 #undef ARG 143 #undef ARG1 144 #undef ARG2 145 146 #define ARG(p) ARG_VALUE(ARG_LOC(p)) 147 #define ARG1(p) ARG_VALUE(ARG1_LOC(p)) 148 #define ARG2(p) ARG_VALUE(ARG2_LOC(p)) 149 #define ARG_SET(p, val) ARG__SET(ARG_LOC(p), (val)) 150 #define ARG1_SET(p, val) ARG__SET(ARG1_LOC(p), (val)) 151 #define ARG2_SET(p, val) ARG__SET(ARG2_LOC(p), (val)) 152 153 #undef NEXT_OFF 154 #undef NODE_ALIGN 155 156 #define NEXT_OFF(p) ((p)->next_off) 157 #define NODE_ALIGN(node) 158 #define NODE_ALIGN_FILL(node) ((node)->flags = 0xde) /* deadbeef */ 159 160 #define SIZE_ALIGN NODE_ALIGN 161 162 #undef OP 163 #undef OPERAND 164 #undef MASK 165 #undef STRING 166 167 #define OP(p) ((p)->type) 168 #define OPERAND(p) (((struct regnode_string *)p)->string) 169 #define MASK(p) ((char*)OPERAND(p)) 170 #define STR_LEN(p) (((struct regnode_string *)p)->str_len) 171 #define STRING(p) (((struct regnode_string *)p)->string) 172 #define STR_SZ(l) ((l + sizeof(regnode) - 1) / sizeof(regnode)) 173 #define NODE_SZ_STR(p) (STR_SZ(STR_LEN(p))+1) 174 175 #undef NODE_ALIGN 176 #undef ARG_LOC 177 #undef NEXTOPER 178 #undef PREVOPER 179 180 #define NODE_ALIGN(node) 181 #define ARG_LOC(p) (((struct regnode_1 *)p)->arg1) 182 #define ARG1_LOC(p) (((struct regnode_2 *)p)->arg1) 183 #define ARG2_LOC(p) (((struct regnode_2 *)p)->arg2) 184 #define NODE_STEP_REGNODE 1 /* sizeof(regnode)/sizeof(regnode) */ 185 #define EXTRA_STEP_2ARGS EXTRA_SIZE(struct regnode_2) 186 187 #define NODE_STEP_B 4 188 189 #define NEXTOPER(p) ((p) + NODE_STEP_REGNODE) 190 #define PREVOPER(p) ((p) - NODE_STEP_REGNODE) 191 192 #define FILL_ADVANCE_NODE(ptr, op) STMT_START { \ 193 (ptr)->type = op; (ptr)->next_off = 0; (ptr)++; } STMT_END 194 #define FILL_ADVANCE_NODE_ARG(ptr, op, arg) STMT_START { \ 195 ARG_SET(ptr, arg); FILL_ADVANCE_NODE(ptr, op); (ptr) += 1; } STMT_END 196 197 #define REG_MAGIC 0234 198 199 #define SIZE_ONLY (RExC_emit == &PL_regdummy) 200 201 /* Flags for node->flags of ANYOF */ 202 203 #define ANYOF_CLASS 0x08 /* has [[:blah:]] classes */ 204 #define ANYOF_INVERT 0x04 205 #define ANYOF_FOLD 0x02 206 #define ANYOF_LOCALE 0x01 207 208 /* Used for regstclass only */ 209 #define ANYOF_EOS 0x10 /* Can match an empty string too */ 210 211 /* There is a character or a range past 0xff */ 212 #define ANYOF_UNICODE 0x20 213 #define ANYOF_UNICODE_ALL 0x40 /* Can match any char past 0xff */ 214 215 /* size of node is large (includes class pointer) */ 216 #define ANYOF_LARGE 0x80 217 218 /* Are there any runtime flags on in this node? */ 219 #define ANYOF_RUNTIME(s) (ANYOF_FLAGS(s) & 0x0f) 220 221 #define ANYOF_FLAGS_ALL 0xff 222 223 /* Character classes for node->classflags of ANYOF */ 224 /* Should be synchronized with a table in regprop() */ 225 /* 2n should pair with 2n+1 */ 226 227 #define ANYOF_ALNUM 0 /* \w, PL_utf8_alnum, utf8::IsWord, ALNUM */ 228 #define ANYOF_NALNUM 1 229 #define ANYOF_SPACE 2 /* \s */ 230 #define ANYOF_NSPACE 3 231 #define ANYOF_DIGIT 4 232 #define ANYOF_NDIGIT 5 233 #define ANYOF_ALNUMC 6 /* isalnum(3), utf8::IsAlnum, ALNUMC */ 234 #define ANYOF_NALNUMC 7 235 #define ANYOF_ALPHA 8 236 #define ANYOF_NALPHA 9 237 #define ANYOF_ASCII 10 238 #define ANYOF_NASCII 11 239 #define ANYOF_CNTRL 12 240 #define ANYOF_NCNTRL 13 241 #define ANYOF_GRAPH 14 242 #define ANYOF_NGRAPH 15 243 #define ANYOF_LOWER 16 244 #define ANYOF_NLOWER 17 245 #define ANYOF_PRINT 18 246 #define ANYOF_NPRINT 19 247 #define ANYOF_PUNCT 20 248 #define ANYOF_NPUNCT 21 249 #define ANYOF_UPPER 22 250 #define ANYOF_NUPPER 23 251 #define ANYOF_XDIGIT 24 252 #define ANYOF_NXDIGIT 25 253 #define ANYOF_PSXSPC 26 /* POSIX space: \s plus the vertical tab */ 254 #define ANYOF_NPSXSPC 27 255 #define ANYOF_BLANK 28 /* GNU extension: space and tab: non-vertical space */ 256 #define ANYOF_NBLANK 29 257 258 #define ANYOF_MAX 32 259 260 /* Backward source code compatibility. */ 261 262 #define ANYOF_ALNUML ANYOF_ALNUM 263 #define ANYOF_NALNUML ANYOF_NALNUM 264 #define ANYOF_SPACEL ANYOF_SPACE 265 #define ANYOF_NSPACEL ANYOF_NSPACE 266 267 /* Utility macros for the bitmap and classes of ANYOF */ 268 269 #define ANYOF_SIZE (sizeof(struct regnode_charclass)) 270 #define ANYOF_CLASS_SIZE (sizeof(struct regnode_charclass_class)) 271 272 #define ANYOF_FLAGS(p) ((p)->flags) 273 274 #define ANYOF_BIT(c) (1 << ((c) & 7)) 275 276 #define ANYOF_CLASS_BYTE(p, c) (((struct regnode_charclass_class*)(p))->classflags[((c) >> 3) & 3]) 277 #define ANYOF_CLASS_SET(p, c) (ANYOF_CLASS_BYTE(p, c) |= ANYOF_BIT(c)) 278 #define ANYOF_CLASS_CLEAR(p, c) (ANYOF_CLASS_BYTE(p, c) &= ~ANYOF_BIT(c)) 279 #define ANYOF_CLASS_TEST(p, c) (ANYOF_CLASS_BYTE(p, c) & ANYOF_BIT(c)) 280 281 #define ANYOF_CLASS_ZERO(ret) Zero(((struct regnode_charclass_class*)(ret))->classflags, ANYOF_CLASSBITMAP_SIZE, char) 282 #define ANYOF_BITMAP_ZERO(ret) Zero(((struct regnode_charclass*)(ret))->bitmap, ANYOF_BITMAP_SIZE, char) 283 284 #define ANYOF_BITMAP(p) (((struct regnode_charclass*)(p))->bitmap) 285 #define ANYOF_BITMAP_BYTE(p, c) (ANYOF_BITMAP(p)[((c) >> 3) & 31]) 286 #define ANYOF_BITMAP_SET(p, c) (ANYOF_BITMAP_BYTE(p, c) |= ANYOF_BIT(c)) 287 #define ANYOF_BITMAP_CLEAR(p,c) (ANYOF_BITMAP_BYTE(p, c) &= ~ANYOF_BIT(c)) 288 #define ANYOF_BITMAP_TEST(p, c) (ANYOF_BITMAP_BYTE(p, c) & ANYOF_BIT(c)) 289 290 #define ANYOF_BITMAP_SETALL(p) \ 291 memset (ANYOF_BITMAP(p), 255, ANYOF_BITMAP_SIZE) 292 #define ANYOF_BITMAP_CLEARALL(p) \ 293 Zero (ANYOF_BITMAP(p), ANYOF_BITMAP_SIZE) 294 /* Check that all 256 bits are all set. Used in S_cl_is_anything() */ 295 #define ANYOF_BITMAP_TESTALLSET(p) \ 296 memEQ (ANYOF_BITMAP(p), "\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377", ANYOF_BITMAP_SIZE) 297 298 #define ANYOF_SKIP ((ANYOF_SIZE - 1)/sizeof(regnode)) 299 #define ANYOF_CLASS_SKIP ((ANYOF_CLASS_SIZE - 1)/sizeof(regnode)) 300 #define ANYOF_CLASS_ADD_SKIP (ANYOF_CLASS_SKIP - ANYOF_SKIP) 301 302 /* 303 * Utility definitions. 304 */ 305 #ifndef CHARMASK 306 # define UCHARAT(p) ((int)*(const U8*)(p)) 307 #else 308 # define UCHARAT(p) ((int)*(p)&CHARMASK) 309 #endif 310 311 #define EXTRA_SIZE(guy) ((sizeof(guy)-1)/sizeof(struct regnode)) 312 313 #define REG_SEEN_ZERO_LEN 1 314 #define REG_SEEN_LOOKBEHIND 2 315 #define REG_SEEN_GPOS 4 316 #define REG_SEEN_EVAL 8 317 #define REG_SEEN_CANY 16 318 #define REG_SEEN_SANY REG_SEEN_CANY /* src bckwrd cmpt */ 319 320 START_EXTERN_C 321 322 #include "regnodes.h" 323 324 /* The following have no fixed length. U8 so we can do strchr() on it. */ 325 #ifndef DOINIT 326 EXTCONST U8 PL_varies[]; 327 #else 328 EXTCONST U8 PL_varies[] = { 329 BRANCH, BACK, STAR, PLUS, CURLY, CURLYX, REF, REFF, REFFL, 330 WHILEM, CURLYM, CURLYN, BRANCHJ, IFTHEN, SUSPEND, CLUMP, 0 331 }; 332 #endif 333 334 /* The following always have a length of 1. U8 we can do strchr() on it. */ 335 /* (Note that length 1 means "one character" under UTF8, not "one octet".) */ 336 #ifndef DOINIT 337 EXTCONST U8 PL_simple[]; 338 #else 339 EXTCONST U8 PL_simple[] = { 340 REG_ANY, SANY, CANY, 341 ANYOF, 342 ALNUM, ALNUML, 343 NALNUM, NALNUML, 344 SPACE, SPACEL, 345 NSPACE, NSPACEL, 346 DIGIT, NDIGIT, 347 0 348 }; 349 #endif 350 351 END_EXTERN_C 352 353 typedef struct re_scream_pos_data_s 354 { 355 char **scream_olds; /* match pos */ 356 I32 *scream_pos; /* Internal iterator of scream. */ 357 } re_scream_pos_data; 358 359 /* .what is a character array with one character for each member of .data 360 * The character describes the function of the corresponding .data item: 361 * f - start-class data for regstclass optimization 362 * n - Root of op tree for (?{EVAL}) item 363 * o - Start op for (?{EVAL}) item 364 * p - Pad for (?{EVAL} item 365 * s - swash for unicode-style character class, and the multicharacter 366 * strings resulting from casefolding the single-character entries 367 * in the character class 368 * 20010712 mjd@plover.com 369 * (Remember to update re_dup() and pregfree() if you add any items.) 370 */ 371 struct reg_data { 372 U32 count; 373 U8 *what; 374 void* data[1]; 375 }; 376 377 struct reg_substr_datum { 378 I32 min_offset; 379 I32 max_offset; 380 SV *substr; /* non-utf8 variant */ 381 SV *utf8_substr; /* utf8 variant */ 382 }; 383 384 struct reg_substr_data { 385 struct reg_substr_datum data[3]; /* Actual array */ 386 }; 387 388 #define anchored_substr substrs->data[0].substr 389 #define anchored_utf8 substrs->data[0].utf8_substr 390 #define anchored_offset substrs->data[0].min_offset 391 #define float_substr substrs->data[1].substr 392 #define float_utf8 substrs->data[1].utf8_substr 393 #define float_min_offset substrs->data[1].min_offset 394 #define float_max_offset substrs->data[1].max_offset 395 #define check_substr substrs->data[2].substr 396 #define check_utf8 substrs->data[2].utf8_substr 397 #define check_offset_min substrs->data[2].min_offset 398 #define check_offset_max substrs->data[2].max_offset 399