1 /*	$OpenBSD: lex.c,v 1.12 2011/09/28 19:27:18 millert Exp $	*/
2 /****************************************************************
3 Copyright (C) Lucent Technologies 1997
4 All Rights Reserved
5 
6 Permission to use, copy, modify, and distribute this software and
7 its documentation for any purpose and without fee is hereby
8 granted, provided that the above copyright notice appear in all
9 copies and that both that the copyright notice and this
10 permission notice and warranty disclaimer appear in supporting
11 documentation, and that the name Lucent Technologies or any of
12 its entities not be used in advertising or publicity pertaining
13 to distribution of the software without specific, written prior
14 permission.
15 
16 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
17 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
18 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
19 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
20 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
21 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
22 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
23 THIS SOFTWARE.
24 ****************************************************************/
25 
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include <ctype.h>
30 #include "awk.h"
31 #include "awkgram.h"
32 
33 __RCSID("$MirOS: src/usr.bin/awk/lex.c,v 1.3 2014/03/13 00:37:36 tg Exp $");
34 
extern int	infunc;		/* defined in another file; read by yylex() and word() */

/* Line counter and bracket-balance counters kept by the lexer. */
int	lineno = 1;	/* bumped by input() on '\n', dropped by unput() on '\n' */
int	bracecnt = 0;	/* '{' seen minus '}' seen */
int	brackcnt = 0;	/* '[' seen minus ']' seen */
int	parencnt = 0;	/* '(' seen minus ')' seen */
41 
/* One row of the reserved-word table searched by binsearch(). */
struct Keyword {
	const char *word;	/* spelling of the reserved word */
	int	sub;		/* value word() stores in yylval.i */
	int	type;		/* token type word() returns */
};
typedef struct Keyword Keyword;
47 
48 Keyword keywords[] ={	/* keep sorted: binary searched */
49 	{ "BEGIN",	XBEGIN,		XBEGIN },
50 	{ "END",	XEND,		XEND },
51 	{ "NF",		VARNF,		VARNF },
52 	{ "and",	FAND,		BLTIN },
53 	{ "atan2",	FATAN,		BLTIN },
54 	{ "break",	BREAK,		BREAK },
55 	{ "close",	CLOSE,		CLOSE },
56 	{ "compl",	FCOMPL,		BLTIN },
57 	{ "continue",	CONTINUE,	CONTINUE },
58 	{ "cos",	FCOS,		BLTIN },
59 	{ "delete",	DELETE,		DELETE },
60 	{ "do",		DO,		DO },
61 	{ "else",	ELSE,		ELSE },
62 	{ "exit",	EXIT,		EXIT },
63 	{ "exp",	FEXP,		BLTIN },
64 	{ "fflush",	FFLUSH,		BLTIN },
65 	{ "for",	FOR,		FOR },
66 	{ "func",	FUNC,		FUNC },
67 	{ "function",	FUNC,		FUNC },
68 	{ "getline",	GETLINE,	GETLINE },
69 	{ "gsub",	GSUB,		GSUB },
70 	{ "if",		IF,		IF },
71 	{ "in",		IN,		IN },
72 	{ "index",	INDEX,		INDEX },
73 	{ "int",	FINT,		BLTIN },
74 	{ "length",	FLENGTH,	BLTIN },
75 	{ "log",	FLOG,		BLTIN },
76 	{ "lshift",	FLSHIFT,	BLTIN },
77 	{ "match",	MATCHFCN,	MATCHFCN },
78 	{ "next",	NEXT,		NEXT },
79 	{ "nextfile",	NEXTFILE,	NEXTFILE },
80 	{ "or",		FFOR,		BLTIN },
81 	{ "print",	PRINT,		PRINT },
82 	{ "printf",	PRINTF,		PRINTF },
83 	{ "rand",	FRAND,		BLTIN },
84 	{ "return",	RETURN,		RETURN },
85 	{ "rshift",	FRSHIFT,	BLTIN },
86 	{ "sin",	FSIN,		BLTIN },
87 	{ "split",	SPLIT,		SPLIT },
88 	{ "sprintf",	SPRINTF,	SPRINTF },
89 	{ "sqrt",	FSQRT,		BLTIN },
90 	{ "srand",	FSRAND,		BLTIN },
91 	{ "sub",	SUB,		SUB },
92 	{ "substr",	SUBSTR,		SUBSTR },
93 	{ "system",	FSYSTEM,	BLTIN },
94 	{ "tolower",	FTOLOWER,	BLTIN },
95 	{ "toupper",	FTOUPPER,	BLTIN },
96 	{ "while",	WHILE,		WHILE },
97 	{ "xor",	FXOR,		BLTIN },
98 };
99 
/*
 * RET(x): return token x from the enclosing function; when dbg is set,
 * first print the token's name as given by tokname().
 * Wrapped in do { } while (0) so that "RET(x);" is exactly one statement
 * and stays correct inside an unbraced if/else.
 * x is evaluated twice when dbg is set, so pass side-effect-free expressions.
 */
#define	RET(x)	do { if (dbg) printf("lex %s\n", tokname(x)); return (x); } while (0)
101 
102 int peek(void);
103 int gettok(char **, int *);
104 int binsearch(char *, Keyword *, int);
105 
/*
 * peek: report the next input character without consuming it.
 * Reads one character with input() and immediately pushes it back with
 * unput(); returns that character (0 at end of input).
 */
int peek(void)
{
	int next;

	next = input();
	unput(next);
	return next;
}
112 
/*
 * gettok: read the next raw token from the input into the buffer *pbuf.
 *
 * pbuf, psz: address and size of a growable buffer; the buffer may be
 *	enlarged through adjbuf(), in which case both are updated before a
 *	name or number is returned.
 *
 * Returns 0 at end of input; 'a' when the token is a name (a letter or '_'
 * followed by letters, digits or '_'); '0' when it is a number; otherwise
 * the single character read, which is also left in the buffer as a
 * one-character string.
 */
int gettok(char **pbuf, int *psz)	/* get next input token */
{
	int c, retc;
	char *buf = *pbuf;
	int sz = *psz;
	char *bp = buf;

	c = input();
	if (c == 0)
		return 0;
	buf[0] = c;
	buf[1] = 0;
	if (!isalnum(c) && c != '.' && c != '_')
		return c;

	*bp++ = c;
	if (isalpha(c) || c == '_') {	/* it's a varname */
		for ( ; (c = input()) != 0; ) {
			if (bp-buf >= sz)
				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
					FATAL( "out of space for name %.10s...", buf );
			if (isalnum(c) || c == '_')
				*bp++ = c;
			else {
				unput(c);
				break;
			}
		}
		/*
		 * When the loop ends on end of input the size check above is
		 * skipped, so bp may sit one past the buffer; make room for
		 * the terminator before writing it.
		 */
		if (bp-buf >= sz)
			if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
				FATAL( "out of space for name %.10s...", buf );
		*bp = 0;
		retc = 'a';	/* alphanumeric */
	} else {	/* maybe it's a number, but could be . */
		char *rem;
		/* read input until can't be a number */
		for ( ; (c = input()) != 0; ) {
			if (bp-buf >= sz)
				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
					FATAL( "out of space for number %.10s...", buf );
			if (isdigit(c) || c == 'e' || c == 'E'
			  || c == '.' || c == '+' || c == '-')
				*bp++ = c;
			else {
				unput(c);
				break;
			}
		}
		/* same end-of-input case as in the name loop above */
		if (bp-buf >= sz)
			if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
				FATAL( "out of space for number %.10s...", buf );
		*bp = 0;
		strtod(buf, &rem);	/* parse the number */
		if (rem == buf) {	/* it wasn't a valid number at all */
			/*
			 * Push the over-read characters back BEFORE truncating:
			 * rem+1 is buf+1, so truncating first would leave
			 * nothing to push back and silently drop them.
			 */
			unputstr(rem+1); /* put rest back for later */
			buf[1] = 0;	/* return one character as token */
			retc = buf[0];	/* character is its own type */
		} else {	/* some prefix was a number */
			unputstr(rem);	/* put rest back for later */
			rem[0] = 0;	/* truncate buf after number part */
			retc = '0';	/* type is number */
		}
	}
	*pbuf = buf;
	*psz = sz;
	return retc;
}
175 
int	word(char *w);
int	string(void);
int	regexpr(void);

/* One-shot flags tested (and cleared) at the top of yylex(). */
int	sc = 0;		/* 1 => return a } right now */
int	reg = 0;	/* 1 => return a REGEXPR now */
181 
yylex(void)182 int yylex(void)
183 {
184 	int c;
185 	static char *buf = 0;
186 	static int bufsize = 5; /* BUG: setting this small causes core dump! */
187 
188 	if (buf == 0 && (buf = (char *) malloc(bufsize)) == NULL)
189 		FATAL( "out of space in yylex" );
190 	if (sc) {
191 		sc = 0;
192 		RET('}');
193 	}
194 	if (reg) {
195 		reg = 0;
196 		return regexpr();
197 	}
198 	for (;;) {
199 		c = gettok(&buf, &bufsize);
200 		if (c == 0)
201 			return 0;
202 		if (isalpha(c) || c == '_')
203 			return word(buf);
204 		if (isdigit(c)) {
205 			yylval.cp = setsymtab(buf, tostring(buf), atof(buf), CON|NUM, symtab);
206 			/* should this also have STR set? */
207 			RET(NUMBER);
208 		}
209 
210 		yylval.i = c;
211 		switch (c) {
212 		case '\n':	/* {EOL} */
213 			RET(NL);
214 		case '\r':	/* assume \n is coming */
215 		case ' ':	/* {WS}+ */
216 		case '\t':
217 			break;
218 		case '#':	/* #.* strip comments */
219 			while ((c = input()) != '\n' && c != 0)
220 				;
221 			unput(c);
222 			break;
223 		case ';':
224 			RET(';');
225 		case '\\':
226 			if (peek() == '\n') {
227 				input();
228 			} else if (peek() == '\r') {
229 				input(); input();	/* \n */
230 				lineno++;
231 			} else {
232 				RET(c);
233 			}
234 			break;
235 		case '&':
236 			if (peek() == '&') {
237 				input(); RET(AND);
238 			} else
239 				RET('&');
240 		case '|':
241 			if (peek() == '|') {
242 				input(); RET(BOR);
243 			} else
244 				RET('|');
245 		case '!':
246 			if (peek() == '=') {
247 				input(); yylval.i = NE; RET(NE);
248 			} else if (peek() == '~') {
249 				input(); yylval.i = NOTMATCH; RET(MATCHOP);
250 			} else
251 				RET(NOT);
252 		case '~':
253 			yylval.i = MATCH;
254 			RET(MATCHOP);
255 		case '<':
256 			if (peek() == '=') {
257 				input(); yylval.i = LE; RET(LE);
258 			} else {
259 				yylval.i = LT; RET(LT);
260 			}
261 		case '=':
262 			if (peek() == '=') {
263 				input(); yylval.i = EQ; RET(EQ);
264 			} else {
265 				yylval.i = ASSIGN; RET(ASGNOP);
266 			}
267 		case '>':
268 			if (peek() == '=') {
269 				input(); yylval.i = GE; RET(GE);
270 			} else if (peek() == '>') {
271 				input(); yylval.i = APPEND; RET(APPEND);
272 			} else {
273 				yylval.i = GT; RET(GT);
274 			}
275 		case '+':
276 			if (peek() == '+') {
277 				input(); yylval.i = INCR; RET(INCR);
278 			} else if (peek() == '=') {
279 				input(); yylval.i = ADDEQ; RET(ASGNOP);
280 			} else
281 				RET('+');
282 		case '-':
283 			if (peek() == '-') {
284 				input(); yylval.i = DECR; RET(DECR);
285 			} else if (peek() == '=') {
286 				input(); yylval.i = SUBEQ; RET(ASGNOP);
287 			} else
288 				RET('-');
289 		case '*':
290 			if (peek() == '=') {	/* *= */
291 				input(); yylval.i = MULTEQ; RET(ASGNOP);
292 			} else if (peek() == '*') {	/* ** or **= */
293 				input();	/* eat 2nd * */
294 				if (peek() == '=') {
295 					input(); yylval.i = POWEQ; RET(ASGNOP);
296 				} else {
297 					RET(POWER);
298 				}
299 			} else
300 				RET('*');
301 		case '/':
302 			RET('/');
303 		case '%':
304 			if (peek() == '=') {
305 				input(); yylval.i = MODEQ; RET(ASGNOP);
306 			} else
307 				RET('%');
308 		case '^':
309 			if (peek() == '=') {
310 				input(); yylval.i = POWEQ; RET(ASGNOP);
311 			} else
312 				RET(POWER);
313 
314 		case '$':
315 			/* BUG: awkward, if not wrong */
316 			c = gettok(&buf, &bufsize);
317 			if (isalpha(c)) {
318 				if (strcmp(buf, "NF") == 0) {	/* very special */
319 					unputstr("(NF)");
320 					RET(INDIRECT);
321 				}
322 				c = peek();
323 				if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) {
324 					unputstr(buf);
325 					RET(INDIRECT);
326 				}
327 				yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab);
328 				RET(IVAR);
329 			} else if (c == 0) {	/*  */
330 				SYNTAX( "unexpected end of input after $" );
331 				RET(';');
332 			} else {
333 				unputstr(buf);
334 				RET(INDIRECT);
335 			}
336 
337 		case '}':
338 			if (--bracecnt < 0)
339 				SYNTAX( "extra }" );
340 			sc = 1;
341 			RET(';');
342 		case ']':
343 			if (--brackcnt < 0)
344 				SYNTAX( "extra ]" );
345 			RET(']');
346 		case ')':
347 			if (--parencnt < 0)
348 				SYNTAX( "extra )" );
349 			RET(')');
350 		case '{':
351 			bracecnt++;
352 			RET('{');
353 		case '[':
354 			brackcnt++;
355 			RET('[');
356 		case '(':
357 			parencnt++;
358 			RET('(');
359 
360 		case '"':
361 			return string();	/* BUG: should be like tran.c ? */
362 
363 		default:
364 			RET(c);
365 		}
366 	}
367 }
368 
string(void)369 int string(void)
370 {
371 	int c, n;
372 	char *s, *bp;
373 	static char *buf = 0;
374 	static int bufsz = 500;
375 
376 	if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL)
377 		FATAL("out of space for strings");
378 	for (bp = buf; (c = input()) != '"'; ) {
379 		if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, "string"))
380 			FATAL("out of space for string %.10s...", buf);
381 		switch (c) {
382 		case '\n':
383 		case '\r':
384 		case 0:
385 			SYNTAX( "non-terminated string %.10s...", buf );
386 			lineno++;
387 			if (c == 0)	/* hopeless */
388 				FATAL( "giving up" );
389 			break;
390 		case '\\':
391 			c = input();
392 			switch (c) {
393 			case '"': *bp++ = '"'; break;
394 			case 'n': *bp++ = '\n'; break;
395 			case 't': *bp++ = '\t'; break;
396 			case 'f': *bp++ = '\f'; break;
397 			case 'r': *bp++ = '\r'; break;
398 			case 'b': *bp++ = '\b'; break;
399 			case 'v': *bp++ = '\v'; break;
400 			case 'a': *bp++ = '\007'; break;
401 			case '\\': *bp++ = '\\'; break;
402 
403 			case '0': case '1': case '2': /* octal: \d \dd \ddd */
404 			case '3': case '4': case '5': case '6': case '7':
405 				n = c - '0';
406 				if ((c = peek()) >= '0' && c < '8') {
407 					n = 8 * n + input() - '0';
408 					if ((c = peek()) >= '0' && c < '8')
409 						n = 8 * n + input() - '0';
410 				}
411 				*bp++ = n;
412 				break;
413 
414 			case 'x':	/* hex  \x0-9a-fA-F + */
415 			    {	char xbuf[100], *px;
416 				for (px = xbuf; (c = input()) != 0 && px-xbuf < 100-2; ) {
417 					if (isdigit(c)
418 					 || (c >= 'a' && c <= 'f')
419 					 || (c >= 'A' && c <= 'F'))
420 						*px++ = c;
421 					else
422 						break;
423 				}
424 				*px = 0;
425 				unput(c);
426 	  			sscanf(xbuf, "%x", (unsigned int *) &n);
427 				*bp++ = n;
428 				break;
429 			    }
430 
431 			default:
432 				*bp++ = c;
433 				break;
434 			}
435 			break;
436 		default:
437 			*bp++ = c;
438 			break;
439 		}
440 	}
441 	*bp = 0;
442 	s = tostring(buf);
443 	*bp++ = ' '; *bp++ = 0;
444 	yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
445 	RET(STRING);
446 }
447 
448 
/*
 * binsearch: look up w among the n entries of kp, which must be ordered
 * by strcmp() on their word fields.
 * Returns the index of the matching entry, or -1 when w is not present.
 */
int binsearch(char *w, Keyword *kp, int n)
{
	int lo = 0;
	int hi = n - 1;

	while (lo <= hi) {
		int probe = (lo + hi) / 2;
		int order = strcmp(w, kp[probe].word);

		if (order == 0)
			return probe;
		if (order < 0)
			hi = probe - 1;	/* w sorts before the probe: keep lower half */
		else
			lo = probe + 1;	/* w sorts after the probe: keep upper half */
	}
	return -1;
}
466 
word(char * w)467 int word(char *w)
468 {
469 	Keyword *kp;
470 	int c, n;
471 
472 	n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0]));
473 /* BUG: this ought to be inside the if; in theory could fault (daniel barrett) */
474 	kp = keywords + n;
475 	if (n != -1) {	/* found in table */
476 		yylval.i = kp->sub;
477 		switch (kp->type) {	/* special handling */
478 		case BLTIN:
479 			if (kp->sub == FSYSTEM && safe)
480 				SYNTAX( "system is unsafe" );
481 			RET(kp->type);
482 		case FUNC:
483 			if (infunc)
484 				SYNTAX( "illegal nested function" );
485 			RET(kp->type);
486 		case RETURN:
487 			if (!infunc)
488 				SYNTAX( "return not in function" );
489 			RET(kp->type);
490 		case VARNF:
491 			yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab);
492 			RET(VARNF);
493 		default:
494 			RET(kp->type);
495 		}
496 	}
497 	c = peek();	/* look for '(' */
498 	if (c != '(' && infunc && (n=isarg(w)) >= 0) {
499 		yylval.i = n;
500 		RET(ARG);
501 	} else {
502 		yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab);
503 		if (c == '(') {
504 			RET(CALL);
505 		} else {
506 			RET(VAR);
507 		}
508 	}
509 }
510 
startreg(void)511 void startreg(void)	/* next call to yylex will return a regular expression */
512 {
513 	reg = 1;
514 }
515 
regexpr(void)516 int regexpr(void)
517 {
518 	int c, openclass = 0;
519 	static char *buf = 0;
520 	static int bufsz = 500;
521 	char *bp;
522 
523 	if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL)
524 		FATAL("out of space for rex expr");
525 	bp = buf;
526 	for ( ; ((c = input()) != '/' || openclass == 1) && c != 0; ) {
527 		if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, "regexpr"))
528 			FATAL("out of space for reg expr %.10s...", buf);
529 		if (c == '\n') {
530 			SYNTAX( "newline in regular expression %.10s...", buf );
531 			unput('\n');
532 			break;
533 		} else if (c == '\\') {
534 			*bp++ = '\\';
535 			*bp++ = input();
536 		} else {
537 			if (c == '[')
538 				openclass = 1;
539 			else if (c == ']')
540 				openclass = 0;
541 			*bp++ = c;
542 		}
543 	}
544 	*bp = 0;
545 	if (c == 0)
546 		SYNTAX("non-terminated regular expression %.10s...", buf);
547 	yylval.s = tostring(buf);
548 	unput('/');
549 	RET(REGEXPR);
550 }
551 
552 /* low-level lexical stuff, sort of inherited from lex */
553 
char	ebuf[300];		/* circular record of characters returned by input() */
char	*ep = ebuf;		/* next slot to fill in ebuf */
char	yysbuf[100];		/* pushback buffer */
char	*yysptr = yysbuf;	/* one past the most recently pushed-back character */
FILE	*yyin = NULL;		/* not referenced elsewhere in this file */
559 
input(void)560 int input(void)	/* get next lexical input character */
561 {
562 	int c;
563 	extern char *lexprog;
564 
565 	if (yysptr > yysbuf)
566 		c = (uschar)*--yysptr;
567 	else if (lexprog != NULL) {	/* awk '...' */
568 		if ((c = (uschar)*lexprog) != 0)
569 			lexprog++;
570 	} else				/* awk -f ... */
571 		c = pgetc();
572 	if (c == '\n')
573 		lineno++;
574 	else if (c == EOF)
575 		c = 0;
576 	if (ep >= ebuf + sizeof ebuf)
577 		ep = ebuf;
578 	return *ep++ = c;
579 }
580 
unput(int c)581 void unput(int c)	/* put lexical character back on input */
582 {
583 	if (c == '\n')
584 		lineno--;
585 	if (yysptr >= yysbuf + sizeof(yysbuf))
586 		FATAL("pushed back too much: %.20s...", yysbuf);
587 	*yysptr++ = c;
588 	if (--ep < ebuf)
589 		ep = ebuf + sizeof(ebuf) - 1;
590 }
591 
/*
 * unputstr: push the whole string s back onto the input.
 * Characters are pushed last-to-first so that subsequent input() calls
 * return them in their original order.
 */
void unputstr(const char *s)	/* put a string back on input */
{
	const char *p;

	p = s + strlen(s);
	while (p > s)
		unput(*--p);
}
599