/*	$OpenBSD: scanner.c,v 1.6 2003/08/01 22:01:37 david Exp $	*/
/*	$NetBSD: scanner.c,v 1.3 1995/09/28 10:34:36 tls Exp $	*/

/*
 * Copyright (c) 1983, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Edward Wang at The University of California, Berkeley.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
35 
/*
 * Version identification strings, compiled only when "lint" is not
 * defined.  The SCCS id is disabled with #if 0, so rcsid is the only
 * object actually defined.  It is never written, hence const.
 */
#ifndef lint
#if 0
static char sccsid[] = "@(#)scanner.c	8.1 (Berkeley) 6/6/93";
#else
static const char rcsid[] = "$OpenBSD: scanner.c,v 1.6 2003/08/01 22:01:37 david Exp $";
#endif
#endif /* not lint */
43 
#include "value.h"
#include "token.h"
#include "context.h"
#include "string.h"
#include <stdio.h>
#include <stdlib.h>
49 
s_getc()50 s_getc()
51 {
52 	int c;
53 
54 	switch (cx.x_type) {
55 	case X_FILE:
56 		c = getc(cx.x_fp);
57 		if (cx.x_bol && c != EOF) {
58 			cx.x_bol = 0;
59 			cx.x_lineno++;
60 		}
61 		if (c == '\n')
62 			cx.x_bol = 1;
63 		return c;
64 	case X_BUF:
65 		if (*cx.x_bufp != 0)
66 			return *cx.x_bufp++ & 0xff;
67 		else
68 			return EOF;
69 	}
70 	/*NOTREACHED*/
71 }
72 
s_ungetc(c)73 s_ungetc(c)
74 {
75 	if (c == EOF)
76 		return EOF;
77 	switch (cx.x_type) {
78 	case X_FILE:
79 		cx.x_bol = 0;
80 		return ungetc(c, cx.x_fp);
81 	case X_BUF:
82 		if (cx.x_bufp > cx.x_buf)
83 			return *--cx.x_bufp = c;
84 		else
85 			return EOF;
86 	}
87 	/*NOTREACHED*/
88 }
89 
s_gettok()90 s_gettok()
91 {
92 	char buf[100];
93 	char *p = buf;
94 	int c;
95 	int state = 0;
96 
97 loop:
98 	c = s_getc();
99 	switch (state) {
100 	case 0:
101 		switch (c) {
102 		case ' ':
103 		case '\t':
104 			break;
105 		case '\n':
106 		case ';':
107 			cx.x_token = T_EOL;
108 			state = -1;
109 			break;
110 		case '#':
111 			state = 1;
112 			break;
113 		case EOF:
114 			cx.x_token = T_EOF;
115 			state = -1;
116 			break;
117 		case 'a': case 'b': case 'c': case 'd': case 'e':
118 		case 'f': case 'g': case 'h': case 'i': case 'j':
119 		case 'k': case 'l': case 'm': case 'n': case 'o':
120 		case 'p': case 'q': case 'r': case 's': case 't':
121 		case 'u': case 'v': case 'w': case 'x': case 'y':
122 		case 'z':
123 		case 'A': case 'B': case 'C': case 'D': case 'E':
124 		case 'F': case 'G': case 'H': case 'I': case 'J':
125 		case 'K': case 'L': case 'M': case 'N': case 'O':
126 		case 'P': case 'Q': case 'R': case 'S': case 'T':
127 		case 'U': case 'V': case 'W': case 'X': case 'Y':
128 		case 'Z':
129 		case '_': case '.':
130 			*p++ = c;
131 			state = 2;
132 			break;
133 		case '"':
134 			state = 3;
135 			break;
136 		case '\'':
137 			state = 4;
138 			break;
139 		case '\\':
140 			switch (c = s_gettok1()) {
141 			case -1:
142 				break;
143 			case -2:
144 				state = 0;
145 				break;
146 			default:
147 				*p++ = c;
148 				state = 2;
149 			}
150 			break;
151 		case '0':
152 			cx.x_val.v_num = 0;
153 			state = 10;
154 			break;
155 		case '1': case '2': case '3': case '4':
156 		case '5': case '6': case '7': case '8': case '9':
157 			cx.x_val.v_num = c - '0';
158 			state = 11;
159 			break;
160 		case '>':
161 			state = 20;
162 			break;
163 		case '<':
164 			state = 21;
165 			break;
166 		case '=':
167 			state = 22;
168 			break;
169 		case '!':
170 			state = 23;
171 			break;
172 		case '&':
173 			state = 24;
174 			break;
175 		case '|':
176 			state = 25;
177 			break;
178 		case '$':
179 			state = 26;
180 			break;
181 		case '~':
182 			cx.x_token = T_COMP;
183 			state = -1;
184 			break;
185 		case '+':
186 			cx.x_token = T_PLUS;
187 			state = -1;
188 			break;
189 		case '-':
190 			cx.x_token = T_MINUS;
191 			state = -1;
192 			break;
193 		case '*':
194 			cx.x_token = T_MUL;
195 			state = -1;
196 			break;
197 		case '/':
198 			cx.x_token = T_DIV;
199 			state = -1;
200 			break;
201 		case '%':
202 			cx.x_token = T_MOD;
203 			state = -1;
204 			break;
205 		case '^':
206 			cx.x_token = T_XOR;
207 			state = -1;
208 			break;
209 		case '(':
210 			cx.x_token = T_LP;
211 			state = -1;
212 			break;
213 		case ')':
214 			cx.x_token = T_RP;
215 			state = -1;
216 			break;
217 		case ',':
218 			cx.x_token = T_COMMA;
219 			state = -1;
220 			break;
221 		case '?':
222 			cx.x_token = T_QUEST;
223 			state = -1;
224 			break;
225 		case ':':
226 			cx.x_token = T_COLON;
227 			state = -1;
228 			break;
229 		case '[':
230 			cx.x_token = T_LB;
231 			state = -1;
232 			break;
233 		case ']':
234 			cx.x_token = T_RB;
235 			state = -1;
236 			break;
237 		default:
238 			cx.x_val.v_num = c;
239 			cx.x_token = T_CHAR;
240 			state = -1;
241 			break;
242 		}
243 		break;
244 	case 1:				/* got # */
245 		if (c == '\n' || c == EOF) {
246 			(void) s_ungetc(c);
247 			state = 0;
248 		}
249 		break;
250 	case 2:				/* unquoted string */
251 		switch (c) {
252 		case 'a': case 'b': case 'c': case 'd': case 'e':
253 		case 'f': case 'g': case 'h': case 'i': case 'j':
254 		case 'k': case 'l': case 'm': case 'n': case 'o':
255 		case 'p': case 'q': case 'r': case 's': case 't':
256 		case 'u': case 'v': case 'w': case 'x': case 'y':
257 		case 'z':
258 		case 'A': case 'B': case 'C': case 'D': case 'E':
259 		case 'F': case 'G': case 'H': case 'I': case 'J':
260 		case 'K': case 'L': case 'M': case 'N': case 'O':
261 		case 'P': case 'Q': case 'R': case 'S': case 'T':
262 		case 'U': case 'V': case 'W': case 'X': case 'Y':
263 		case 'Z':
264 		case '_': case '.':
265 		case '0': case '1': case '2': case '3': case '4':
266 		case '5': case '6': case '7': case '8': case '9':
267 			if (p < buf + sizeof buf - 1)
268 				*p++ = c;
269 			break;
270 		case '"':
271 			state = 3;
272 			break;
273 		case '\'':
274 			state = 4;
275 			break;
276 		case '\\':
277 			switch (c = s_gettok1()) {
278 			case -2:
279 				(void) s_ungetc(' ');
280 			case -1:
281 				break;
282 			default:
283 				if (p < buf + sizeof buf - 1)
284 					*p++ = c;
285 			}
286 			break;
287 		default:
288 			(void) s_ungetc(c);
289 		case EOF:
290 			*p = 0;
291 			cx.x_token = T_STR;
292 			switch (*buf) {
293 			case 'i':
294 				if (buf[1] == 'f' && buf[2] == 0)
295 					cx.x_token = T_IF;
296 				break;
297 			case 't':
298 				if (buf[1] == 'h' && buf[2] == 'e'
299 				    && buf[3] == 'n' && buf[4] == 0)
300 					cx.x_token = T_THEN;
301 				break;
302 			case 'e':
303 				if (buf[1] == 'n' && buf[2] == 'd'
304 				    && buf[3] == 'i' && buf[4] == 'f'
305 				    && buf[5] == 0)
306 					cx.x_token = T_ENDIF;
307 				else if (buf[1] == 'l' && buf[2] == 's')
308 					if (buf[3] == 'i' && buf[4] == 'f'
309 					    && buf[5] == 0)
310 						cx.x_token = T_ELSIF;
311 					else if (buf[3] == 'e' && buf[4] == 0)
312 						cx.x_token = T_ELSE;
313 				break;
314 			}
315 			if (cx.x_token == T_STR
316 			    && (cx.x_val.v_str = str_cpy(buf)) == 0) {
317 				p_memerror();
318 				cx.x_token = T_EOF;
319 			}
320 			state = -1;
321 			break;
322 		}
323 		break;
324 	case 3:				/* " quoted string */
325 		switch (c) {
326 		case '\n':
327 			(void) s_ungetc(c);
328 		case EOF:
329 		case '"':
330 			state = 2;
331 			break;
332 		case '\\':
333 			switch (c = s_gettok1()) {
334 			case -1:
335 			case -2:	/* newlines are invisible */
336 				break;
337 			default:
338 				if (p < buf + sizeof buf - 1)
339 					*p++ = c;
340 			}
341 			break;
342 		default:
343 			if (p < buf + sizeof buf - 1)
344 				*p++ = c;
345 			break;
346 		}
347 		break;
348 	case 4:				/* ' quoted string */
349 		switch (c) {
350 		case '\n':
351 			(void) s_ungetc(c);
352 		case EOF:
353 		case '\'':
354 			state = 2;
355 			break;
356 		case '\\':
357 			switch (c = s_gettok1()) {
358 			case -1:
359 			case -2:	/* newlines are invisible */
360 				break;
361 			default:
362 				if (p < buf + sizeof buf - 1)
363 					*p++ = c;
364 			}
365 			break;
366 		default:
367 			if (p < buf + sizeof buf - 1)
368 				*p++ = c;
369 			break;
370 		}
371 		break;
372 	case 10:			/* got 0 */
373 		switch (c) {
374 		case 'x':
375 		case 'X':
376 			cx.x_val.v_num = 0;
377 			state = 12;
378 			break;
379 		case '0': case '1': case '2': case '3': case '4':
380 		case '5': case '6': case '7':
381 			cx.x_val.v_num = c - '0';
382 			state = 13;
383 			break;
384 		case '8': case '9':
385 			cx.x_val.v_num = c - '0';
386 			state = 11;
387 			break;
388 		default:
389 			(void) s_ungetc(c);
390 			state = -1;
391 			cx.x_token = T_NUM;
392 		}
393 		break;
394 	case 11:			/* decimal number */
395 		switch (c) {
396 		case '0': case '1': case '2': case '3': case '4':
397 		case '5': case '6': case '7': case '8': case '9':
398 			cx.x_val.v_num = cx.x_val.v_num * 10 + c - '0';
399 			break;
400 		default:
401 			(void) s_ungetc(c);
402 			state = -1;
403 			cx.x_token = T_NUM;
404 		}
405 		break;
406 	case 12:			/* hex number */
407 		switch (c) {
408 		case '0': case '1': case '2': case '3': case '4':
409 		case '5': case '6': case '7': case '8': case '9':
410 			cx.x_val.v_num = cx.x_val.v_num * 16 + c - '0';
411 			break;
412 		case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
413 			cx.x_val.v_num = cx.x_val.v_num * 16 + c - 'a' + 10;
414 			break;
415 		case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
416 			cx.x_val.v_num = cx.x_val.v_num * 16 + c - 'A' + 10;
417 			break;
418 		default:
419 			(void) s_ungetc(c);
420 			state = -1;
421 			cx.x_token = T_NUM;
422 		}
423 		break;
424 	case 13:			/* octal number */
425 		switch (c) {
426 		case '0': case '1': case '2': case '3': case '4':
427 		case '5': case '6': case '7':
428 			cx.x_val.v_num = cx.x_val.v_num * 8 + c - '0';
429 			break;
430 		default:
431 			(void) s_ungetc(c);
432 			state = -1;
433 			cx.x_token = T_NUM;
434 		}
435 		break;
436 	case 20:			/* got > */
437 		switch (c) {
438 		case '=':
439 			cx.x_token = T_GE;
440 			state = -1;
441 			break;
442 		case '>':
443 			cx.x_token = T_RS;
444 			state = -1;
445 			break;
446 		default:
447 			(void) s_ungetc(c);
448 			cx.x_token = T_GT;
449 			state = -1;
450 		}
451 		break;
452 	case 21:			/* got < */
453 		switch (c) {
454 		case '=':
455 			cx.x_token = T_LE;
456 			state = -1;
457 			break;
458 		case '<':
459 			cx.x_token = T_LS;
460 			state = -1;
461 			break;
462 		default:
463 			(void) s_ungetc(c);
464 			cx.x_token = T_LT;
465 			state = -1;
466 		}
467 		break;
468 	case 22:			/* got = */
469 		switch (c) {
470 		case '=':
471 			cx.x_token = T_EQ;
472 			state = -1;
473 			break;
474 		default:
475 			(void) s_ungetc(c);
476 			cx.x_token = T_ASSIGN;
477 			state = -1;
478 		}
479 		break;
480 	case 23:			/* got ! */
481 		switch (c) {
482 		case '=':
483 			cx.x_token = T_NE;
484 			state = -1;
485 			break;
486 		default:
487 			(void) s_ungetc(c);
488 			cx.x_token = T_NOT;
489 			state = -1;
490 		}
491 		break;
492 	case 24:			/* got & */
493 		switch (c) {
494 		case '&':
495 			cx.x_token = T_ANDAND;
496 			state = -1;
497 			break;
498 		default:
499 			(void) s_ungetc(c);
500 			cx.x_token = T_AND;
501 			state = -1;
502 		}
503 		break;
504 	case 25:			/* got | */
505 		switch (c) {
506 		case '|':
507 			cx.x_token = T_OROR;
508 			state = -1;
509 			break;
510 		default:
511 			(void) s_ungetc(c);
512 			cx.x_token = T_OR;
513 			state = -1;
514 		}
515 		break;
516 	case 26:			/* got $ */
517 		switch (c) {
518 		case '?':
519 			cx.x_token = T_DQ;
520 			state = -1;
521 			break;
522 		default:
523 			(void) s_ungetc(c);
524 			cx.x_token = T_DOLLAR;
525 			state = -1;
526 		}
527 		break;
528 	default:
529 		abort();
530 	}
531 	if (state >= 0)
532 		goto loop;
533 	return cx.x_token;
534 }
535 
/*
 * Decode what follows a backslash; the backslash itself has already
 * been consumed by the caller.
 *
 * Returns:
 *	-1	end of input directly after the backslash
 *	-2	the backslash was followed by a newline
 *	otherwise the decoded character: b, f, n, r and t map to the
 *	matching C control characters, one to three octal digits give
 *	the character with that value, and any other character stands
 *	for itself.
 *
 * The first non-octal character after a one- or two-digit octal escape
 * is pushed back with s_ungetc().
 *
 * NOTE(review): a three-digit escape is not range checked, so escapes
 * above 377 octal return values above 255 (up to 511).
 */
int
s_gettok1(void)
{
	int c;
	int n;

	c = s_getc();			/* got \ */
	switch (c) {
	case EOF:
		return -1;
	case '\n':
		return -2;
	case 'b':
		return '\b';
	case 'f':
		return '\f';
	case 'n':
		return '\n';
	case 'r':
		return '\r';
	case 't':
		return '\t';
	case '0': case '1': case '2': case '3': case '4':
	case '5': case '6': case '7':
		break;			/* octal escape, decoded below */
	default:
		return c;
	}
	n = c - '0';
	c = s_getc();			/* got \[0-7] */
	if (c < '0' || c > '7') {
		(void) s_ungetc(c);
		return n;
	}
	n = n * 8 + c - '0';
	c = s_getc();			/* got \[0-7][0-7] */
	if (c < '0' || c > '7') {
		(void) s_ungetc(c);
		return n;
	}
	return n * 8 + c - '0';
}
577