1 /* $OpenBSD: lex.c,v 1.12 2011/09/28 19:27:18 millert Exp $ */
2 /****************************************************************
3 Copyright (C) Lucent Technologies 1997
4 All Rights Reserved
5
6 Permission to use, copy, modify, and distribute this software and
7 its documentation for any purpose and without fee is hereby
8 granted, provided that the above copyright notice appear in all
9 copies and that both that the copyright notice and this
10 permission notice and warranty disclaimer appear in supporting
11 documentation, and that the name Lucent Technologies or any of
12 its entities not be used in advertising or publicity pertaining
13 to distribution of the software without specific, written prior
14 permission.
15
16 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
17 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
18 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
19 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
20 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
21 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
22 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
23 THIS SOFTWARE.
24 ****************************************************************/
25
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include <ctype.h>
30 #include "awk.h"
31 #include "awkgram.h"
32
33 __RCSID("$MirOS: src/usr.bin/awk/lex.c,v 1.3 2014/03/13 00:37:36 tg Exp $");
34
/*
 * Lexer-wide state.
 * infunc is defined in another file; word() treats a nonzero value as
 * "currently inside a function definition" (it rejects a nested function
 * keyword when set and rejects return when clear).
 */
extern int infunc;

int lineno = 1;		/* current input line: input() adds 1 per '\n', unput() takes it back */
int bracecnt = 0;	/* '{' seen minus '}' seen; yylex() reports "extra }" when it goes negative */
int brackcnt = 0;	/* same bookkeeping for '[' and ']' */
int parencnt = 0;	/* same bookkeeping for '(' and ')' */
41
/*
 * One reserved word or built-in function name.
 * word() stores `sub` in yylval.i and returns `type` as the token, so
 * several names can share one token (e.g. BLTIN) and be told apart by sub.
 */
typedef struct Keyword {
	const char *word;	/* spelling as it appears in the program text */
	int	sub;		/* value handed to the parser through yylval.i */
	int	type;		/* token code returned by the lexer */
} Keyword;

/*
 * The table is searched by binsearch(), which compares with strcmp(), so the
 * entries must stay in strcmp() order: upper-case names sort before
 * lower-case ones.
 */
Keyword keywords[] ={	/* keep sorted: binary searched */
	{ "BEGIN",	XBEGIN,		XBEGIN },
	{ "END",	XEND,		XEND },
	{ "NF",		VARNF,		VARNF },
	{ "and",	FAND,		BLTIN },
	{ "atan2",	FATAN,		BLTIN },
	{ "break",	BREAK,		BREAK },
	{ "close",	CLOSE,		CLOSE },
	{ "compl",	FCOMPL,		BLTIN },
	{ "continue",	CONTINUE,	CONTINUE },
	{ "cos",	FCOS,		BLTIN },
	{ "delete",	DELETE,		DELETE },
	{ "do",		DO,		DO },
	{ "else",	ELSE,		ELSE },
	{ "exit",	EXIT,		EXIT },
	{ "exp",	FEXP,		BLTIN },
	{ "fflush",	FFLUSH,		BLTIN },
	{ "for",	FOR,		FOR },
	{ "func",	FUNC,		FUNC },		/* short spelling of "function" */
	{ "function",	FUNC,		FUNC },
	{ "getline",	GETLINE,	GETLINE },
	{ "gsub",	GSUB,		GSUB },
	{ "if",		IF,		IF },
	{ "in",		IN,		IN },
	{ "index",	INDEX,		INDEX },
	{ "int",	FINT,		BLTIN },
	{ "length",	FLENGTH,	BLTIN },
	{ "log",	FLOG,		BLTIN },
	{ "lshift",	FLSHIFT,	BLTIN },
	{ "match",	MATCHFCN,	MATCHFCN },
	{ "next",	NEXT,		NEXT },
	{ "nextfile",	NEXTFILE,	NEXTFILE },
	{ "or",		FFOR,		BLTIN },
	{ "print",	PRINT,		PRINT },
	{ "printf",	PRINTF,		PRINTF },
	{ "rand",	FRAND,		BLTIN },
	{ "return",	RETURN,		RETURN },
	{ "rshift",	FRSHIFT,	BLTIN },
	{ "sin",	FSIN,		BLTIN },
	{ "split",	SPLIT,		SPLIT },
	{ "sprintf",	SPRINTF,	SPRINTF },
	{ "sqrt",	FSQRT,		BLTIN },
	{ "srand",	FSRAND,		BLTIN },
	{ "sub",	SUB,		SUB },
	{ "substr",	SUBSTR,		SUBSTR },
	{ "system",	FSYSTEM,	BLTIN },
	{ "tolower",	FTOLOWER,	BLTIN },
	{ "toupper",	FTOUPPER,	BLTIN },
	{ "while",	WHILE,		WHILE },
	{ "xor",	FXOR,		BLTIN },
};
99
/*
 * RET(x): return token x from the enclosing function, printing a trace line
 * first when dbg is nonzero.
 *
 * The body is wrapped in do { } while (0) so that "RET(x);" expands to
 * exactly one statement and remains correct as the unbraced arm of an
 * if/else.  x is expanded twice (once for the trace, once for the return),
 * so it must not have side effects.
 */
#define	RET(x)	do { if (dbg) printf("lex %s\n", tokname(x)); return (x); } while (0)
101
102 int peek(void);
103 int gettok(char **, int *);
104 int binsearch(char *, Keyword *, int);
105
/*
 * peek - look at the next input character without consuming it.
 * The character is read with input() and immediately pushed back with
 * unput(), so the following input() call returns the same value.
 */
int peek(void)
{
	const int next = input();

	unput(next);
	return next;
}
112
/*
 * gettok - read the next raw token from the input.
 *
 * pbuf, psz:	address and size of the caller's token buffer.  The buffer
 *		may be enlarged with adjbuf(); on the name/number paths the
 *		possibly new address and size are stored back through
 *		pbuf/psz.  The buffer must hold at least two bytes on entry.
 *
 * Returns:
 *	0	at end of input
 *	'a'	for a name (a letter or '_' followed by letters, digits
 *		and '_'); the text is left in the buffer
 *	'0'	for a number; the text is left in the buffer
 *	c	for any other single character c, which is also left in the
 *		buffer as a one-character string
 */
int gettok(char **pbuf, int *psz)	/* get next input token */
{
	int c, retc;
	char *buf = *pbuf;
	int sz = *psz;
	char *bp = buf;

	c = input();
	if (c == 0)
		return 0;
	buf[0] = c;
	buf[1] = 0;
	if (!isalnum(c) && c != '.' && c != '_')
		return c;

	*bp++ = c;
	if (isalpha(c) || c == '_') {	/* it's a varname */
		for ( ; (c = input()) != 0; ) {
			/*
			 * Keep room for this character AND the terminating
			 * NUL; otherwise a name that exactly fills the buffer
			 * and is followed by end of input would be terminated
			 * one byte past the end.
			 */
			if (bp-buf+2 > sz)
				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
					FATAL( "out of space for name %.10s...", buf );
			if (isalnum(c) || c == '_')
				*bp++ = c;
			else {
				unput(c);
				break;
			}
		}
		*bp = 0;
		retc = 'a';	/* alphanumeric */
	} else {	/* maybe it's a number, but could be . */
		char *rem;
		/* read input until can't be a number */
		for ( ; (c = input()) != 0; ) {
			/* same reservation as above: character plus NUL */
			if (bp-buf+2 > sz)
				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
					FATAL( "out of space for number %.10s...", buf );
			if (isdigit(c) || c == 'e' || c == 'E'
			  || c == '.' || c == '+' || c == '-')
				*bp++ = c;
			else {
				unput(c);
				break;
			}
		}
		*bp = 0;
		strtod(buf, &rem);	/* parse the number */
		if (rem == buf) {	/* it wasn't a valid number at all */
			/*
			 * Push the tail back BEFORE truncating buf: once
			 * buf[1] is NUL, rem+1 is an empty string and the
			 * remaining characters would be silently dropped.
			 */
			unputstr(rem+1);	/* put rest back for later */
			buf[1] = 0;		/* return one character as token */
			retc = buf[0];		/* character is its own type */
		} else {	/* some prefix was a number */
			unputstr(rem);	/* put rest back for later */
			rem[0] = 0;	/* truncate buf after number part */
			retc = '0';	/* type is number */
		}
	}
	*pbuf = buf;
	*psz = sz;
	return retc;
}
175
/* helpers used by yylex(), defined further down in this file */
int	word(char *);
int	string(void);
int	regexpr(void);

/*
 * One-shot flags consulted at the top of yylex(); each is cleared as soon
 * as it has been acted on.
 */
int	sc	= 0;	/* 1 => return a } right now (set after a '}' was reported as ';') */
int	reg	= 0;	/* 1 => return a REGEXPR now (set by startreg()) */
181
yylex(void)182 int yylex(void)
183 {
184 int c;
185 static char *buf = 0;
186 static int bufsize = 5; /* BUG: setting this small causes core dump! */
187
188 if (buf == 0 && (buf = (char *) malloc(bufsize)) == NULL)
189 FATAL( "out of space in yylex" );
190 if (sc) {
191 sc = 0;
192 RET('}');
193 }
194 if (reg) {
195 reg = 0;
196 return regexpr();
197 }
198 for (;;) {
199 c = gettok(&buf, &bufsize);
200 if (c == 0)
201 return 0;
202 if (isalpha(c) || c == '_')
203 return word(buf);
204 if (isdigit(c)) {
205 yylval.cp = setsymtab(buf, tostring(buf), atof(buf), CON|NUM, symtab);
206 /* should this also have STR set? */
207 RET(NUMBER);
208 }
209
210 yylval.i = c;
211 switch (c) {
212 case '\n': /* {EOL} */
213 RET(NL);
214 case '\r': /* assume \n is coming */
215 case ' ': /* {WS}+ */
216 case '\t':
217 break;
218 case '#': /* #.* strip comments */
219 while ((c = input()) != '\n' && c != 0)
220 ;
221 unput(c);
222 break;
223 case ';':
224 RET(';');
225 case '\\':
226 if (peek() == '\n') {
227 input();
228 } else if (peek() == '\r') {
229 input(); input(); /* \n */
230 lineno++;
231 } else {
232 RET(c);
233 }
234 break;
235 case '&':
236 if (peek() == '&') {
237 input(); RET(AND);
238 } else
239 RET('&');
240 case '|':
241 if (peek() == '|') {
242 input(); RET(BOR);
243 } else
244 RET('|');
245 case '!':
246 if (peek() == '=') {
247 input(); yylval.i = NE; RET(NE);
248 } else if (peek() == '~') {
249 input(); yylval.i = NOTMATCH; RET(MATCHOP);
250 } else
251 RET(NOT);
252 case '~':
253 yylval.i = MATCH;
254 RET(MATCHOP);
255 case '<':
256 if (peek() == '=') {
257 input(); yylval.i = LE; RET(LE);
258 } else {
259 yylval.i = LT; RET(LT);
260 }
261 case '=':
262 if (peek() == '=') {
263 input(); yylval.i = EQ; RET(EQ);
264 } else {
265 yylval.i = ASSIGN; RET(ASGNOP);
266 }
267 case '>':
268 if (peek() == '=') {
269 input(); yylval.i = GE; RET(GE);
270 } else if (peek() == '>') {
271 input(); yylval.i = APPEND; RET(APPEND);
272 } else {
273 yylval.i = GT; RET(GT);
274 }
275 case '+':
276 if (peek() == '+') {
277 input(); yylval.i = INCR; RET(INCR);
278 } else if (peek() == '=') {
279 input(); yylval.i = ADDEQ; RET(ASGNOP);
280 } else
281 RET('+');
282 case '-':
283 if (peek() == '-') {
284 input(); yylval.i = DECR; RET(DECR);
285 } else if (peek() == '=') {
286 input(); yylval.i = SUBEQ; RET(ASGNOP);
287 } else
288 RET('-');
289 case '*':
290 if (peek() == '=') { /* *= */
291 input(); yylval.i = MULTEQ; RET(ASGNOP);
292 } else if (peek() == '*') { /* ** or **= */
293 input(); /* eat 2nd * */
294 if (peek() == '=') {
295 input(); yylval.i = POWEQ; RET(ASGNOP);
296 } else {
297 RET(POWER);
298 }
299 } else
300 RET('*');
301 case '/':
302 RET('/');
303 case '%':
304 if (peek() == '=') {
305 input(); yylval.i = MODEQ; RET(ASGNOP);
306 } else
307 RET('%');
308 case '^':
309 if (peek() == '=') {
310 input(); yylval.i = POWEQ; RET(ASGNOP);
311 } else
312 RET(POWER);
313
314 case '$':
315 /* BUG: awkward, if not wrong */
316 c = gettok(&buf, &bufsize);
317 if (isalpha(c)) {
318 if (strcmp(buf, "NF") == 0) { /* very special */
319 unputstr("(NF)");
320 RET(INDIRECT);
321 }
322 c = peek();
323 if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) {
324 unputstr(buf);
325 RET(INDIRECT);
326 }
327 yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab);
328 RET(IVAR);
329 } else if (c == 0) { /* */
330 SYNTAX( "unexpected end of input after $" );
331 RET(';');
332 } else {
333 unputstr(buf);
334 RET(INDIRECT);
335 }
336
337 case '}':
338 if (--bracecnt < 0)
339 SYNTAX( "extra }" );
340 sc = 1;
341 RET(';');
342 case ']':
343 if (--brackcnt < 0)
344 SYNTAX( "extra ]" );
345 RET(']');
346 case ')':
347 if (--parencnt < 0)
348 SYNTAX( "extra )" );
349 RET(')');
350 case '{':
351 bracecnt++;
352 RET('{');
353 case '[':
354 brackcnt++;
355 RET('[');
356 case '(':
357 parencnt++;
358 RET('(');
359
360 case '"':
361 return string(); /* BUG: should be like tran.c ? */
362
363 default:
364 RET(c);
365 }
366 }
367 }
368
string(void)369 int string(void)
370 {
371 int c, n;
372 char *s, *bp;
373 static char *buf = 0;
374 static int bufsz = 500;
375
376 if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL)
377 FATAL("out of space for strings");
378 for (bp = buf; (c = input()) != '"'; ) {
379 if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, "string"))
380 FATAL("out of space for string %.10s...", buf);
381 switch (c) {
382 case '\n':
383 case '\r':
384 case 0:
385 SYNTAX( "non-terminated string %.10s...", buf );
386 lineno++;
387 if (c == 0) /* hopeless */
388 FATAL( "giving up" );
389 break;
390 case '\\':
391 c = input();
392 switch (c) {
393 case '"': *bp++ = '"'; break;
394 case 'n': *bp++ = '\n'; break;
395 case 't': *bp++ = '\t'; break;
396 case 'f': *bp++ = '\f'; break;
397 case 'r': *bp++ = '\r'; break;
398 case 'b': *bp++ = '\b'; break;
399 case 'v': *bp++ = '\v'; break;
400 case 'a': *bp++ = '\007'; break;
401 case '\\': *bp++ = '\\'; break;
402
403 case '0': case '1': case '2': /* octal: \d \dd \ddd */
404 case '3': case '4': case '5': case '6': case '7':
405 n = c - '0';
406 if ((c = peek()) >= '0' && c < '8') {
407 n = 8 * n + input() - '0';
408 if ((c = peek()) >= '0' && c < '8')
409 n = 8 * n + input() - '0';
410 }
411 *bp++ = n;
412 break;
413
414 case 'x': /* hex \x0-9a-fA-F + */
415 { char xbuf[100], *px;
416 for (px = xbuf; (c = input()) != 0 && px-xbuf < 100-2; ) {
417 if (isdigit(c)
418 || (c >= 'a' && c <= 'f')
419 || (c >= 'A' && c <= 'F'))
420 *px++ = c;
421 else
422 break;
423 }
424 *px = 0;
425 unput(c);
426 sscanf(xbuf, "%x", (unsigned int *) &n);
427 *bp++ = n;
428 break;
429 }
430
431 default:
432 *bp++ = c;
433 break;
434 }
435 break;
436 default:
437 *bp++ = c;
438 break;
439 }
440 }
441 *bp = 0;
442 s = tostring(buf);
443 *bp++ = ' '; *bp++ = 0;
444 yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
445 RET(STRING);
446 }
447
448
/*
 * binsearch - find w among the first n entries of kp.
 * The entries must be ordered by strcmp() on their word field.
 * Returns the index of the matching entry, or -1 if there is none.
 */
int binsearch(char *w, Keyword *kp, int n)
{
	int lo = 0;
	int hi = n - 1;

	while (lo <= hi) {
		int probe = (lo + hi) / 2;
		int order = strcmp(w, kp[probe].word);

		if (order == 0)
			return probe;
		if (order < 0)
			hi = probe - 1;		/* w sorts before the probe */
		else
			lo = probe + 1;		/* w sorts after the probe */
	}
	return -1;
}
466
word(char * w)467 int word(char *w)
468 {
469 Keyword *kp;
470 int c, n;
471
472 n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0]));
473 /* BUG: this ought to be inside the if; in theory could fault (daniel barrett) */
474 kp = keywords + n;
475 if (n != -1) { /* found in table */
476 yylval.i = kp->sub;
477 switch (kp->type) { /* special handling */
478 case BLTIN:
479 if (kp->sub == FSYSTEM && safe)
480 SYNTAX( "system is unsafe" );
481 RET(kp->type);
482 case FUNC:
483 if (infunc)
484 SYNTAX( "illegal nested function" );
485 RET(kp->type);
486 case RETURN:
487 if (!infunc)
488 SYNTAX( "return not in function" );
489 RET(kp->type);
490 case VARNF:
491 yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab);
492 RET(VARNF);
493 default:
494 RET(kp->type);
495 }
496 }
497 c = peek(); /* look for '(' */
498 if (c != '(' && infunc && (n=isarg(w)) >= 0) {
499 yylval.i = n;
500 RET(ARG);
501 } else {
502 yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab);
503 if (c == '(') {
504 RET(CALL);
505 } else {
506 RET(VAR);
507 }
508 }
509 }
510
startreg(void)511 void startreg(void) /* next call to yylex will return a regular expression */
512 {
513 reg = 1;
514 }
515
regexpr(void)516 int regexpr(void)
517 {
518 int c, openclass = 0;
519 static char *buf = 0;
520 static int bufsz = 500;
521 char *bp;
522
523 if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL)
524 FATAL("out of space for rex expr");
525 bp = buf;
526 for ( ; ((c = input()) != '/' || openclass == 1) && c != 0; ) {
527 if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, "regexpr"))
528 FATAL("out of space for reg expr %.10s...", buf);
529 if (c == '\n') {
530 SYNTAX( "newline in regular expression %.10s...", buf );
531 unput('\n');
532 break;
533 } else if (c == '\\') {
534 *bp++ = '\\';
535 *bp++ = input();
536 } else {
537 if (c == '[')
538 openclass = 1;
539 else if (c == ']')
540 openclass = 0;
541 *bp++ = c;
542 }
543 }
544 *bp = 0;
545 if (c == 0)
546 SYNTAX("non-terminated regular expression %.10s...", buf);
547 yylval.s = tostring(buf);
548 unput('/');
549 RET(REGEXPR);
550 }
551
552 /* low-level lexical stuff, sort of inherited from lex */
553
char	ebuf[300];		/* circular record of the characters most recently returned by input() */
char	*ep = ebuf;		/* next write position in ebuf; input() advances it, unput() steps it back */
char	yysbuf[100];	/* pushback buffer */
char	*yysptr = yysbuf;	/* one past the last pushed-back character; equal to yysbuf when empty */
FILE	*yyin = 0;		/* not referenced by the functions in this part of the file */
559
input(void)560 int input(void) /* get next lexical input character */
561 {
562 int c;
563 extern char *lexprog;
564
565 if (yysptr > yysbuf)
566 c = (uschar)*--yysptr;
567 else if (lexprog != NULL) { /* awk '...' */
568 if ((c = (uschar)*lexprog) != 0)
569 lexprog++;
570 } else /* awk -f ... */
571 c = pgetc();
572 if (c == '\n')
573 lineno++;
574 else if (c == EOF)
575 c = 0;
576 if (ep >= ebuf + sizeof ebuf)
577 ep = ebuf;
578 return *ep++ = c;
579 }
580
unput(int c)581 void unput(int c) /* put lexical character back on input */
582 {
583 if (c == '\n')
584 lineno--;
585 if (yysptr >= yysbuf + sizeof(yysbuf))
586 FATAL("pushed back too much: %.20s...", yysbuf);
587 *yysptr++ = c;
588 if (--ep < ebuf)
589 ep = ebuf + sizeof(ebuf) - 1;
590 }
591
/*
 * unputstr - push the whole string s back onto the input.
 * Characters are pushed last-to-first, so subsequent input() calls return
 * them in their original order.
 */
void unputstr(const char *s)	/* put a string back on input */
{
	const char *p;

	for (p = s + strlen(s); p > s; )
		unput(*--p);
}
599