1 /* $OpenBSD: nfa.c,v 1.9 2003/06/04 17:34:44 millert Exp $ */
2
3 /* nfa - NFA construction routines */
4
5 /*-
6 * Copyright (c) 1990 The Regents of the University of California.
7 * All rights reserved.
8 *
9 * This code is derived from software contributed to Berkeley by
10 * Vern Paxson.
11 *
12 * The United States Government has rights in this work pursuant
13 * to contract no. DE-AC03-76SF00098 between the United States
14 * Department of Energy and the University of California.
15 *
16 * Redistribution and use in source and binary forms, with or without
17 * modification, are permitted provided that the following conditions
18 * are met:
19 *
20 * 1. Redistributions of source code must retain the above copyright
21 * notice, this list of conditions and the following disclaimer.
22 * 2. Redistributions in binary form must reproduce the above copyright
23 * notice, this list of conditions and the following disclaimer in the
24 * documentation and/or other materials provided with the distribution.
25 *
26 * Neither the name of the University nor the names of its contributors
27 * may be used to endorse or promote products derived from this software
28 * without specific prior written permission.
29 *
30 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
31 * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
32 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
33 * PURPOSE.
34 */
35
36 /* $Header: /cvs/src/usr.bin/lex/nfa.c,v 1.9 2003/06/04 17:34:44 millert Exp $ */
37
38 #include "flexdef.h"
39
40
41 /* declare functions that have forward references */
42
43 int dupmachine PROTO((int));
44 void mkxtion PROTO((int, int));
45
46
47 /* add_accept - add an accepting state to a machine
48 *
49 * accepting_number becomes mach's accepting number.
50 */
51
add_accept(mach,accepting_number)52 void add_accept( mach, accepting_number )
53 int mach, accepting_number;
54 {
55 /* Hang the accepting number off an epsilon state. if it is associated
56 * with a state that has a non-epsilon out-transition, then the state
57 * will accept BEFORE it makes that transition, i.e., one character
58 * too soon.
59 */
60
61 if ( transchar[finalst[mach]] == SYM_EPSILON )
62 accptnum[finalst[mach]] = accepting_number;
63
64 else
65 {
66 int astate = mkstate( SYM_EPSILON );
67 accptnum[astate] = accepting_number;
68 (void) link_machines( mach, astate );
69 }
70 }
71
72
73 /* copysingl - make a given number of copies of a singleton machine
74 *
75 * synopsis
76 *
77 * newsng = copysingl( singl, num );
78 *
79 * newsng - a new singleton composed of num copies of singl
80 * singl - a singleton machine
81 * num - the number of copies of singl to be present in newsng
82 */
83
copysingl(singl,num)84 int copysingl( singl, num )
85 int singl, num;
86 {
87 int copy, i;
88
89 copy = mkstate( SYM_EPSILON );
90
91 for ( i = 1; i <= num; ++i )
92 copy = link_machines( copy, dupmachine( singl ) );
93
94 return copy;
95 }
96
97
98 /* dumpnfa - debugging routine to write out an nfa */
99
dumpnfa(state1)100 void dumpnfa( state1 )
101 int state1;
102
103 {
104 int sym, tsp1, tsp2, anum, ns;
105
106 fprintf( stderr,
107 _( "\n\n********** beginning dump of nfa with start state %d\n" ),
108 state1 );
109
110 /* We probably should loop starting at firstst[state1] and going to
111 * lastst[state1], but they're not maintained properly when we "or"
112 * all of the rules together. So we use our knowledge that the machine
113 * starts at state 1 and ends at lastnfa.
114 */
115
116 /* for ( ns = firstst[state1]; ns <= lastst[state1]; ++ns ) */
117 for ( ns = 1; ns <= lastnfa; ++ns )
118 {
119 fprintf( stderr, _( "state # %4d\t" ), ns );
120
121 sym = transchar[ns];
122 tsp1 = trans1[ns];
123 tsp2 = trans2[ns];
124 anum = accptnum[ns];
125
126 fprintf( stderr, "%3d: %4d, %4d", sym, tsp1, tsp2 );
127
128 if ( anum != NIL )
129 fprintf( stderr, " [%d]", anum );
130
131 fprintf( stderr, "\n" );
132 }
133
134 fprintf( stderr, _( "********** end of dump\n" ) );
135 }
136
137
138 /* dupmachine - make a duplicate of a given machine
139 *
140 * synopsis
141 *
142 * copy = dupmachine( mach );
143 *
144 * copy - holds duplicate of mach
145 * mach - machine to be duplicated
146 *
147 * note that the copy of mach is NOT an exact duplicate; rather, all the
148 * transition states values are adjusted so that the copy is self-contained,
149 * as the original should have been.
150 *
151 * also note that the original MUST be contiguous, with its low and high
152 * states accessible by the arrays firstst and lastst
153 */
154
dupmachine(mach)155 int dupmachine( mach )
156 int mach;
157 {
158 int i, init, state_offset;
159 int state = 0;
160 int last = lastst[mach];
161
162 for ( i = firstst[mach]; i <= last; ++i )
163 {
164 state = mkstate( transchar[i] );
165
166 if ( trans1[i] != NO_TRANSITION )
167 {
168 mkxtion( finalst[state], trans1[i] + state - i );
169
170 if ( transchar[i] == SYM_EPSILON &&
171 trans2[i] != NO_TRANSITION )
172 mkxtion( finalst[state],
173 trans2[i] + state - i );
174 }
175
176 accptnum[state] = accptnum[i];
177 }
178
179 if ( state == 0 )
180 flexfatal( _( "empty machine in dupmachine()" ) );
181
182 state_offset = state - i + 1;
183
184 init = mach + state_offset;
185 firstst[init] = firstst[mach] + state_offset;
186 finalst[init] = finalst[mach] + state_offset;
187 lastst[init] = lastst[mach] + state_offset;
188
189 return init;
190 }
191
192
193 /* finish_rule - finish up the processing for a rule
194 *
195 * An accepting number is added to the given machine. If variable_trail_rule
196 * is true then the rule has trailing context and both the head and trail
197 * are variable size. Otherwise if headcnt or trailcnt is non-zero then
198 * the machine recognizes a pattern with trailing context and headcnt is
199 * the number of characters in the matched part of the pattern, or zero
200 * if the matched part has variable length. trailcnt is the number of
201 * trailing context characters in the pattern, or zero if the trailing
202 * context has variable length.
203 */
204
finish_rule(mach,variable_trail_rule,headcnt,trailcnt)205 void finish_rule( mach, variable_trail_rule, headcnt, trailcnt )
206 int mach, variable_trail_rule, headcnt, trailcnt;
207 {
208 char action_text[MAXLINE];
209
210 add_accept( mach, num_rules );
211
212 /* We did this in new_rule(), but it often gets the wrong
213 * number because we do it before we start parsing the current rule.
214 */
215 rule_linenum[num_rules] = linenum;
216
217 /* If this is a continued action, then the line-number has already
218 * been updated, giving us the wrong number.
219 */
220 if ( continued_action )
221 --rule_linenum[num_rules];
222
223 snprintf( action_text, sizeof action_text, "case %d:\n", num_rules );
224 add_action( action_text );
225
226 if ( variable_trail_rule )
227 {
228 rule_type[num_rules] = RULE_VARIABLE;
229
230 if ( performance_report > 0 )
231 fprintf( stderr,
232 _( "Variable trailing context rule at line %d\n" ),
233 rule_linenum[num_rules] );
234
235 variable_trailing_context_rules = true;
236 }
237
238 else
239 {
240 rule_type[num_rules] = RULE_NORMAL;
241
242 if ( headcnt > 0 || trailcnt > 0 )
243 {
244 /* Do trailing context magic to not match the trailing
245 * characters.
246 */
247 char *scanner_cp = "yy_c_buf_p = yy_cp";
248 char *scanner_bp = "yy_bp";
249
250 add_action(
251 "*yy_cp = yy_hold_char; /* undo effects of setting up yytext */\n" );
252
253 if ( headcnt > 0 )
254 {
255 snprintf( action_text, sizeof action_text,
256 "%s = %s + %d;\n",
257 scanner_cp, scanner_bp, headcnt );
258 add_action( action_text );
259 }
260
261 else
262 {
263 snprintf( action_text, sizeof action_text,
264 "%s -= %d;\n",
265 scanner_cp, trailcnt );
266 add_action( action_text );
267 }
268
269 add_action(
270 "YY_DO_BEFORE_ACTION; /* set up yytext again */\n" );
271 }
272 }
273
274 /* Okay, in the action code at this point yytext and yyleng have
275 * their proper final values for this rule, so here's the point
276 * to do any user action. But don't do it for continued actions,
277 * as that'll result in multiple YY_RULE_SETUP's.
278 */
279 if ( ! continued_action )
280 add_action( "YY_RULE_SETUP\n" );
281
282 line_directive_out( (FILE *) 0, 1 );
283 }
284
285
286 /* link_machines - connect two machines together
287 *
288 * synopsis
289 *
290 * new = link_machines( first, last );
291 *
292 * new - a machine constructed by connecting first to last
293 * first - the machine whose successor is to be last
294 * last - the machine whose predecessor is to be first
295 *
296 * note: this routine concatenates the machine first with the machine
297 * last to produce a machine new which will pattern-match first first
298 * and then last, and will fail if either of the sub-patterns fails.
299 * FIRST is set to new by the operation. last is unmolested.
300 */
301
link_machines(first,last)302 int link_machines( first, last )
303 int first, last;
304 {
305 if ( first == NIL )
306 return last;
307
308 else if ( last == NIL )
309 return first;
310
311 else
312 {
313 mkxtion( finalst[first], last );
314 finalst[first] = finalst[last];
315 lastst[first] = MAX( lastst[first], lastst[last] );
316 firstst[first] = MIN( firstst[first], firstst[last] );
317
318 return first;
319 }
320 }
321
322
323 /* mark_beginning_as_normal - mark each "beginning" state in a machine
324 * as being a "normal" (i.e., not trailing context-
325 * associated) states
326 *
327 * The "beginning" states are the epsilon closure of the first state
328 */
329
mark_beginning_as_normal(mach)330 void mark_beginning_as_normal( mach )
331 int mach;
332 {
333 switch ( state_type[mach] )
334 {
335 case STATE_NORMAL:
336 /* Oh, we've already visited here. */
337 return;
338
339 case STATE_TRAILING_CONTEXT:
340 state_type[mach] = STATE_NORMAL;
341
342 if ( transchar[mach] == SYM_EPSILON )
343 {
344 if ( trans1[mach] != NO_TRANSITION )
345 mark_beginning_as_normal(
346 trans1[mach] );
347
348 if ( trans2[mach] != NO_TRANSITION )
349 mark_beginning_as_normal(
350 trans2[mach] );
351 }
352 break;
353
354 default:
355 flexerror(
356 _( "bad state type in mark_beginning_as_normal()" ) );
357 break;
358 }
359 }
360
361
362 /* mkbranch - make a machine that branches to two machines
363 *
364 * synopsis
365 *
366 * branch = mkbranch( first, second );
367 *
368 * branch - a machine which matches either first's pattern or second's
369 * first, second - machines whose patterns are to be or'ed (the | operator)
370 *
371 * Note that first and second are NEITHER destroyed by the operation. Also,
372 * the resulting machine CANNOT be used with any other "mk" operation except
373 * more mkbranch's. Compare with mkor()
374 */
375
mkbranch(first,second)376 int mkbranch( first, second )
377 int first, second;
378 {
379 int eps;
380
381 if ( first == NO_TRANSITION )
382 return second;
383
384 else if ( second == NO_TRANSITION )
385 return first;
386
387 eps = mkstate( SYM_EPSILON );
388
389 mkxtion( eps, first );
390 mkxtion( eps, second );
391
392 return eps;
393 }
394
395
396 /* mkclos - convert a machine into a closure
397 *
398 * synopsis
399 * new = mkclos( state );
400 *
401 * new - a new state which matches the closure of "state"
402 */
403
mkclos(state)404 int mkclos( state )
405 int state;
406 {
407 return mkopt( mkposcl( state ) );
408 }
409
410
411 /* mkopt - make a machine optional
412 *
413 * synopsis
414 *
415 * new = mkopt( mach );
416 *
417 * new - a machine which optionally matches whatever mach matched
418 * mach - the machine to make optional
419 *
420 * notes:
421 * 1. mach must be the last machine created
422 * 2. mach is destroyed by the call
423 */
424
mkopt(mach)425 int mkopt( mach )
426 int mach;
427 {
428 int eps;
429
430 if ( ! SUPER_FREE_EPSILON(finalst[mach]) )
431 {
432 eps = mkstate( SYM_EPSILON );
433 mach = link_machines( mach, eps );
434 }
435
436 /* Can't skimp on the following if FREE_EPSILON(mach) is true because
437 * some state interior to "mach" might point back to the beginning
438 * for a closure.
439 */
440 eps = mkstate( SYM_EPSILON );
441 mach = link_machines( eps, mach );
442
443 mkxtion( mach, finalst[mach] );
444
445 return mach;
446 }
447
448
449 /* mkor - make a machine that matches either one of two machines
450 *
451 * synopsis
452 *
453 * new = mkor( first, second );
454 *
455 * new - a machine which matches either first's pattern or second's
456 * first, second - machines whose patterns are to be or'ed (the | operator)
457 *
458 * note that first and second are both destroyed by the operation
459 * the code is rather convoluted because an attempt is made to minimize
460 * the number of epsilon states needed
461 */
462
mkor(first,second)463 int mkor( first, second )
464 int first, second;
465 {
466 int eps, orend;
467
468 if ( first == NIL )
469 return second;
470
471 else if ( second == NIL )
472 return first;
473
474 else
475 {
476 /* See comment in mkopt() about why we can't use the first
477 * state of "first" or "second" if they satisfy "FREE_EPSILON".
478 */
479 eps = mkstate( SYM_EPSILON );
480
481 first = link_machines( eps, first );
482
483 mkxtion( first, second );
484
485 if ( SUPER_FREE_EPSILON(finalst[first]) &&
486 accptnum[finalst[first]] == NIL )
487 {
488 orend = finalst[first];
489 mkxtion( finalst[second], orend );
490 }
491
492 else if ( SUPER_FREE_EPSILON(finalst[second]) &&
493 accptnum[finalst[second]] == NIL )
494 {
495 orend = finalst[second];
496 mkxtion( finalst[first], orend );
497 }
498
499 else
500 {
501 eps = mkstate( SYM_EPSILON );
502
503 first = link_machines( first, eps );
504 orend = finalst[first];
505
506 mkxtion( finalst[second], orend );
507 }
508 }
509
510 finalst[first] = orend;
511 return first;
512 }
513
514
515 /* mkposcl - convert a machine into a positive closure
516 *
517 * synopsis
518 * new = mkposcl( state );
519 *
520 * new - a machine matching the positive closure of "state"
521 */
522
mkposcl(state)523 int mkposcl( state )
524 int state;
525 {
526 int eps;
527
528 if ( SUPER_FREE_EPSILON(finalst[state]) )
529 {
530 mkxtion( finalst[state], state );
531 return state;
532 }
533
534 else
535 {
536 eps = mkstate( SYM_EPSILON );
537 mkxtion( eps, state );
538 return link_machines( state, eps );
539 }
540 }
541
542
543 /* mkrep - make a replicated machine
544 *
545 * synopsis
546 * new = mkrep( mach, lb, ub );
547 *
548 * new - a machine that matches whatever "mach" matched from "lb"
549 * number of times to "ub" number of times
550 *
551 * note
552 * if "ub" is INFINITY then "new" matches "lb" or more occurrences of "mach"
553 */
554
mkrep(mach,lb,ub)555 int mkrep( mach, lb, ub )
556 int mach, lb, ub;
557 {
558 int base_mach, tail, copy, i;
559
560 base_mach = copysingl( mach, lb - 1 );
561
562 if ( ub == INFINITY )
563 {
564 copy = dupmachine( mach );
565 mach = link_machines( mach,
566 link_machines( base_mach, mkclos( copy ) ) );
567 }
568
569 else
570 {
571 tail = mkstate( SYM_EPSILON );
572
573 for ( i = lb; i < ub; ++i )
574 {
575 copy = dupmachine( mach );
576 tail = mkopt( link_machines( copy, tail ) );
577 }
578
579 mach = link_machines( mach, link_machines( base_mach, tail ) );
580 }
581
582 return mach;
583 }
584
585
586 /* mkstate - create a state with a transition on a given symbol
587 *
588 * synopsis
589 *
590 * state = mkstate( sym );
591 *
592 * state - a new state matching sym
593 * sym - the symbol the new state is to have an out-transition on
594 *
595 * note that this routine makes new states in ascending order through the
596 * state array (and increments LASTNFA accordingly). The routine DUPMACHINE
597 * relies on machines being made in ascending order and that they are
598 * CONTIGUOUS. Change it and you will have to rewrite DUPMACHINE (kludge
599 * that it admittedly is)
600 */
601
mkstate(sym)602 int mkstate( sym )
603 int sym;
604 {
605 if ( ++lastnfa >= current_mns )
606 {
607 if ( (current_mns += MNS_INCREMENT) >= MAXIMUM_MNS )
608 lerrif(
609 _( "input rules are too complicated (>= %d NFA states)" ),
610 current_mns );
611
612 ++num_reallocs;
613
614 firstst = reallocate_integer_array( firstst, current_mns );
615 lastst = reallocate_integer_array( lastst, current_mns );
616 finalst = reallocate_integer_array( finalst, current_mns );
617 transchar = reallocate_integer_array( transchar, current_mns );
618 trans1 = reallocate_integer_array( trans1, current_mns );
619 trans2 = reallocate_integer_array( trans2, current_mns );
620 accptnum = reallocate_integer_array( accptnum, current_mns );
621 assoc_rule =
622 reallocate_integer_array( assoc_rule, current_mns );
623 state_type =
624 reallocate_integer_array( state_type, current_mns );
625 }
626
627 firstst[lastnfa] = lastnfa;
628 finalst[lastnfa] = lastnfa;
629 lastst[lastnfa] = lastnfa;
630 transchar[lastnfa] = sym;
631 trans1[lastnfa] = NO_TRANSITION;
632 trans2[lastnfa] = NO_TRANSITION;
633 accptnum[lastnfa] = NIL;
634 assoc_rule[lastnfa] = num_rules;
635 state_type[lastnfa] = current_state_type;
636
637 /* Fix up equivalence classes base on this transition. Note that any
638 * character which has its own transition gets its own equivalence
639 * class. Thus only characters which are only in character classes
640 * have a chance at being in the same equivalence class. E.g. "a|b"
641 * puts 'a' and 'b' into two different equivalence classes. "[ab]"
642 * puts them in the same equivalence class (barring other differences
643 * elsewhere in the input).
644 */
645
646 if ( sym < 0 )
647 {
648 /* We don't have to update the equivalence classes since
649 * that was already done when the ccl was created for the
650 * first time.
651 */
652 }
653
654 else if ( sym == SYM_EPSILON )
655 ++numeps;
656
657 else
658 {
659 check_char( sym );
660
661 if ( useecs )
662 /* Map NUL's to csize. */
663 mkechar( sym ? sym : csize, nextecm, ecgroup );
664 }
665
666 return lastnfa;
667 }
668
669
670 /* mkxtion - make a transition from one state to another
671 *
672 * synopsis
673 *
674 * mkxtion( statefrom, stateto );
675 *
676 * statefrom - the state from which the transition is to be made
677 * stateto - the state to which the transition is to be made
678 */
679
mkxtion(statefrom,stateto)680 void mkxtion( statefrom, stateto )
681 int statefrom, stateto;
682 {
683 if ( trans1[statefrom] == NO_TRANSITION )
684 trans1[statefrom] = stateto;
685
686 else if ( (transchar[statefrom] != SYM_EPSILON) ||
687 (trans2[statefrom] != NO_TRANSITION) )
688 flexfatal( _( "found too many transitions in mkxtion()" ) );
689
690 else
691 { /* second out-transition for an epsilon state */
692 ++eps2;
693 trans2[statefrom] = stateto;
694 }
695 }
696
697 /* new_rule - initialize for a new rule */
698
new_rule()699 void new_rule()
700 {
701 if ( ++num_rules >= current_max_rules )
702 {
703 ++num_reallocs;
704 current_max_rules += MAX_RULES_INCREMENT;
705 rule_type = reallocate_integer_array( rule_type,
706 current_max_rules );
707 rule_linenum = reallocate_integer_array( rule_linenum,
708 current_max_rules );
709 rule_useful = reallocate_integer_array( rule_useful,
710 current_max_rules );
711 }
712
713 if ( num_rules > MAX_RULE )
714 lerrif( _( "too many rules (> %d)!" ), MAX_RULE );
715
716 rule_linenum[num_rules] = linenum;
717 rule_useful[num_rules] = false;
718 }
719