xref: /dragonfly/contrib/awk/lex.c (revision e2ee60a4f1757f9ded9e1041053222b631f387b6)
1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
3 All Rights Reserved
4 
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
13 permission.
14 
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 THIS SOFTWARE.
23 ****************************************************************/
24 
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <string.h>
28 #include <ctype.h>
29 #include "awk.h"
30 #include "awkgram.tab.h"
31 
32 extern YYSTYPE      yylval;
33 extern bool         infunc;
34 
35 int       lineno    = 1;
36 int       bracecnt = 0;
37 int       brackcnt  = 0;
38 int       parencnt = 0;
39 
40 typedef struct Keyword {
41           const char *word;
42           int       sub;
43           int       type;
44 } Keyword;
45 
46 const Keyword keywords[] = {  /* keep sorted: binary searched */
47           { "BEGIN",          XBEGIN,             XBEGIN },
48           { "END",  XEND,               XEND },
49           { "NF",             VARNF,              VARNF },
50           { "atan2",          FATAN,              BLTIN },
51           { "break",          BREAK,              BREAK },
52           { "close",          CLOSE,              CLOSE },
53           { "continue",       CONTINUE, CONTINUE },
54           { "cos",  FCOS,               BLTIN },
55           { "delete",         DELETE,             DELETE },
56           { "do",             DO,                 DO },
57           { "else", ELSE,               ELSE },
58           { "exit", EXIT,               EXIT },
59           { "exp",  FEXP,               BLTIN },
60           { "fflush",         FFLUSH,             BLTIN },
61           { "for",  FOR,                FOR },
62           { "func", FUNC,               FUNC },
63           { "function",       FUNC,               FUNC },
64           { "getline",        GETLINE,  GETLINE },
65           { "gsub", GSUB,               GSUB },
66           { "if",             IF,                 IF },
67           { "in",             IN,                 IN },
68           { "index",          INDEX,              INDEX },
69           { "int",  FINT,               BLTIN },
70           { "length",         FLENGTH,  BLTIN },
71           { "log",  FLOG,               BLTIN },
72           { "match",          MATCHFCN, MATCHFCN },
73           { "next", NEXT,               NEXT },
74           { "nextfile",       NEXTFILE, NEXTFILE },
75           { "print",          PRINT,              PRINT },
76           { "printf",         PRINTF,             PRINTF },
77           { "rand", FRAND,              BLTIN },
78           { "return",         RETURN,             RETURN },
79           { "sin",  FSIN,               BLTIN },
80           { "split",          SPLIT,              SPLIT },
81           { "sprintf",        SPRINTF,  SPRINTF },
82           { "sqrt", FSQRT,              BLTIN },
83           { "srand",          FSRAND,             BLTIN },
84           { "sub",  SUB,                SUB },
85           { "substr",         SUBSTR,             SUBSTR },
86           { "system",         FSYSTEM,  BLTIN },
87           { "tolower",        FTOLOWER, BLTIN },
88           { "toupper",        FTOUPPER, BLTIN },
89           { "while",          WHILE,              WHILE },
90 };
91 
/*
 * RET(x): return token x from the enclosing function; when dbg is set,
 * print the token's name first.  Expands to a brace block, so x is
 * evaluated twice when tracing is on.
 */
#define	RET(x)	{ \
	if (dbg) \
		printf("lex %s\n", tokname(x)); \
	return (x); \
}
93 
/*
 * peek - report the next input character without consuming it.
 *
 * Implemented as input() followed by unput(), so it carries unput()'s
 * side effects (including the lineno adjustment for '\n').
 * Returns 0 at end of input.
 */
static int peek(void)
{
	const int next = input();

	unput(next);
	return next;
}
100 
gettok(char ** pbuf,int * psz)101 static int gettok(char **pbuf, int *psz)          /* get next input token */
102 {
103           int c, retc;
104           char *buf = *pbuf;
105           int sz = *psz;
106           char *bp = buf;
107 
108           c = input();
109           if (c == 0)
110                     return 0;
111           buf[0] = c;
112           buf[1] = 0;
113           if (!isalnum(c) && c != '.' && c != '_')
114                     return c;
115 
116           *bp++ = c;
117           if (isalpha(c) || c == '_') { /* it's a varname */
118                     for ( ; (c = input()) != 0; ) {
119                               if (bp-buf >= sz)
120                                         if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
121                                                   FATAL( "out of space for name %.10s...", buf );
122                               if (isalnum(c) || c == '_')
123                                         *bp++ = c;
124                               else {
125                                         *bp = 0;
126                                         unput(c);
127                                         break;
128                               }
129                     }
130                     *bp = 0;
131                     retc = 'a';         /* alphanumeric */
132           } else {  /* maybe it's a number, but could be . */
133                     char *rem;
134                     /* read input until can't be a number */
135                     for ( ; (c = input()) != 0; ) {
136                               if (bp-buf >= sz)
137                                         if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
138                                                   FATAL( "out of space for number %.10s...", buf );
139                               if (isdigit(c) || c == 'e' || c == 'E'
140                                 || c == '.' || c == '+' || c == '-')
141                                         *bp++ = c;
142                               else {
143                                         unput(c);
144                                         break;
145                               }
146                     }
147                     *bp = 0;
148                     strtod(buf, &rem);  /* parse the number */
149                     if (rem == buf) {   /* it wasn't a valid number at all */
150                               buf[1] = 0;         /* return one character as token */
151                               retc = (uschar)buf[0];        /* character is its own type */
152                               unputstr(rem+1); /* put rest back for later */
153                     } else {  /* some prefix was a number */
154                               unputstr(rem);      /* put rest back for later */
155                               rem[0] = 0;         /* truncate buf after number part */
156                               retc = '0';         /* type is number */
157                     }
158           }
159           *pbuf = buf;
160           *psz = sz;
161           return retc;
162 }
163 
int	word(char *w);
int	string(void);
int	regexpr(void);

bool	sc = false;	/* true => return a } right now */
bool	reg = false;	/* true => return a REGEXPR now */
169 
yylex(void)170 int yylex(void)
171 {
172           int c;
173           static char *buf = NULL;
174           static int bufsize = 5; /* BUG: setting this small causes core dump! */
175 
176           if (buf == NULL && (buf = (char *) malloc(bufsize)) == NULL)
177                     FATAL( "out of space in yylex" );
178           if (sc) {
179                     sc = false;
180                     RET('}');
181           }
182           if (reg) {
183                     reg = false;
184                     return regexpr();
185           }
186           for (;;) {
187                     c = gettok(&buf, &bufsize);
188                     if (c == 0)
189                               return 0;
190                     if (isalpha(c) || c == '_')
191                               return word(buf);
192                     if (isdigit(c)) {
193                               char *cp = tostring(buf);
194                               double result;
195 
196                               if (is_number(cp, & result))
197                                         yylval.cp = setsymtab(buf, cp, result, CON|NUM, symtab);
198                               else
199                                         yylval.cp = setsymtab(buf, cp, 0.0, STR, symtab);
200                               free(cp);
201                               /* should this also have STR set? */
202                               RET(NUMBER);
203                     }
204 
205                     yylval.i = c;
206                     switch (c) {
207                     case '\n':          /* {EOL} */
208                               lineno++;
209                               RET(NL);
210                     case '\r':          /* assume \n is coming */
211                     case ' ': /* {WS}+ */
212                     case '\t':
213                               break;
214                     case '#': /* #.* strip comments */
215                               while ((c = input()) != '\n' && c != 0)
216                                         ;
217                               unput(c);
218                               /*
219                                * Next line is a hack, itcompensates for
220                                * unput's treatment of \n.
221                                */
222                               lineno++;
223                               break;
224                     case ';':
225                               RET(';');
226                     case '\\':
227                               if (peek() == '\n') {
228                                         input();
229                                         lineno++;
230                               } else if (peek() == '\r') {
231                                         input(); input();   /* \n */
232                                         lineno++;
233                               } else {
234                                         RET(c);
235                               }
236                               break;
237                     case '&':
238                               if (peek() == '&') {
239                                         input(); RET(AND);
240                               } else
241                                         RET('&');
242                     case '|':
243                               if (peek() == '|') {
244                                         input(); RET(BOR);
245                               } else
246                                         RET('|');
247                     case '!':
248                               if (peek() == '=') {
249                                         input(); yylval.i = NE; RET(NE);
250                               } else if (peek() == '~') {
251                                         input(); yylval.i = NOTMATCH; RET(MATCHOP);
252                               } else
253                                         RET(NOT);
254                     case '~':
255                               yylval.i = MATCH;
256                               RET(MATCHOP);
257                     case '<':
258                               if (peek() == '=') {
259                                         input(); yylval.i = LE; RET(LE);
260                               } else {
261                                         yylval.i = LT; RET(LT);
262                               }
263                     case '=':
264                               if (peek() == '=') {
265                                         input(); yylval.i = EQ; RET(EQ);
266                               } else {
267                                         yylval.i = ASSIGN; RET(ASGNOP);
268                               }
269                     case '>':
270                               if (peek() == '=') {
271                                         input(); yylval.i = GE; RET(GE);
272                               } else if (peek() == '>') {
273                                         input(); yylval.i = APPEND; RET(APPEND);
274                               } else {
275                                         yylval.i = GT; RET(GT);
276                               }
277                     case '+':
278                               if (peek() == '+') {
279                                         input(); yylval.i = INCR; RET(INCR);
280                               } else if (peek() == '=') {
281                                         input(); yylval.i = ADDEQ; RET(ASGNOP);
282                               } else
283                                         RET('+');
284                     case '-':
285                               if (peek() == '-') {
286                                         input(); yylval.i = DECR; RET(DECR);
287                               } else if (peek() == '=') {
288                                         input(); yylval.i = SUBEQ; RET(ASGNOP);
289                               } else
290                                         RET('-');
291                     case '*':
292                               if (peek() == '=') {          /* *= */
293                                         input(); yylval.i = MULTEQ; RET(ASGNOP);
294                               } else if (peek() == '*') {   /* ** or **= */
295                                         input();  /* eat 2nd * */
296                                         if (peek() == '=') {
297                                                   input(); yylval.i = POWEQ; RET(ASGNOP);
298                                         } else {
299                                                   RET(POWER);
300                                         }
301                               } else
302                                         RET('*');
303                     case '/':
304                               RET('/');
305                     case '%':
306                               if (peek() == '=') {
307                                         input(); yylval.i = MODEQ; RET(ASGNOP);
308                               } else
309                                         RET('%');
310                     case '^':
311                               if (peek() == '=') {
312                                         input(); yylval.i = POWEQ; RET(ASGNOP);
313                               } else
314                                         RET(POWER);
315 
316                     case '$':
317                               /* BUG: awkward, if not wrong */
318                               c = gettok(&buf, &bufsize);
319                               if (isalpha(c)) {
320                                         if (strcmp(buf, "NF") == 0) { /* very special */
321                                                   unputstr("(NF)");
322                                                   RET(INDIRECT);
323                                         }
324                                         c = peek();
325                                         if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) {
326                                                   unputstr(buf);
327                                                   RET(INDIRECT);
328                                         }
329                                         yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab);
330                                         RET(IVAR);
331                               } else if (c == 0) {          /*  */
332                                         SYNTAX( "unexpected end of input after $" );
333                                         RET(';');
334                               } else {
335                                         unputstr(buf);
336                                         RET(INDIRECT);
337                               }
338 
339                     case '}':
340                               if (--bracecnt < 0)
341                                         SYNTAX( "extra }" );
342                               sc = true;
343                               RET(';');
344                     case ']':
345                               if (--brackcnt < 0)
346                                         SYNTAX( "extra ]" );
347                               RET(']');
348                     case ')':
349                               if (--parencnt < 0)
350                                         SYNTAX( "extra )" );
351                               RET(')');
352                     case '{':
353                               bracecnt++;
354                               RET('{');
355                     case '[':
356                               brackcnt++;
357                               RET('[');
358                     case '(':
359                               parencnt++;
360                               RET('(');
361 
362                     case '"':
363                               return string();    /* BUG: should be like tran.c ? */
364 
365                     default:
366                               RET(c);
367                     }
368           }
369 }
370 
371 extern int runetochar(char *str, int c);
372 
string(void)373 int string(void)
374 {
375           int c, n;
376           char *s, *bp;
377           static char *buf = NULL;
378           static int bufsz = 500;
379 
380           if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL)
381                     FATAL("out of space for strings");
382           for (bp = buf; (c = input()) != '"'; ) {
383                     if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, "string"))
384                               FATAL("out of space for string %.10s...", buf);
385                     switch (c) {
386                     case '\n':
387                     case '\r':
388                     case 0:
389                               *bp = '\0';
390                               SYNTAX( "non-terminated string %.10s...", buf );
391                               if (c == 0)         /* hopeless */
392                                         FATAL( "giving up" );
393                               lineno++;
394                               break;
395                     case '\\':
396                               c = input();
397                               switch (c) {
398                               case '\n': break;
399                               case '"': *bp++ = '"'; break;
400                               case 'n': *bp++ = '\n'; break;
401                               case 't': *bp++ = '\t'; break;
402                               case 'f': *bp++ = '\f'; break;
403                               case 'r': *bp++ = '\r'; break;
404                               case 'b': *bp++ = '\b'; break;
405                               case 'v': *bp++ = '\v'; break;
406                               case 'a': *bp++ = '\a'; break;
407                               case '\\': *bp++ = '\\'; break;
408 
409                               case '0': case '1': case '2': /* octal: \d \dd \ddd */
410                               case '3': case '4': case '5': case '6': case '7':
411                                         n = c - '0';
412                                         if ((c = peek()) >= '0' && c < '8') {
413                                                   n = 8 * n + input() - '0';
414                                                   if ((c = peek()) >= '0' && c < '8')
415                                                             n = 8 * n + input() - '0';
416                                         }
417                                         *bp++ = n;
418                                         break;
419 
420                               case 'x': /* hex  \x0-9a-fA-F (exactly two) */
421                                   {
422                                         int i;
423 
424                                         if (!isxdigit(peek())) {
425                                                   unput(c);
426                                                   break;
427                                         }
428                                         n = 0;
429                                         for (i = 0; i < 2; i++) {
430                                                   c = input();
431                                                   if (c == 0)
432                                                             break;
433                                                   if (isxdigit(c)) {
434                                                             c = tolower(c);
435                                                             n *= 16;
436                                                             if (isdigit(c))
437                                                                       n += (c - '0');
438                                                             else
439                                                                       n += 10 + (c - 'a');
440                                                   } else {
441                                                             unput(c);
442                                                             break;
443                                                   }
444                                         }
445                                         if (i)
446                                                   *bp++ = n;
447                                         break;
448                                   }
449 
450                               case 'u': /* utf  \u0-9a-fA-F (1..8) */
451                                   {
452                                         int i;
453 
454                                         n = 0;
455                                         for (i = 0; i < 8; i++) {
456                                                   c = input();
457                                                   if (!isxdigit(c) || c == 0)
458                                                             break;
459                                                   c = tolower(c);
460                                                   n *= 16;
461                                                   if (isdigit(c))
462                                                             n += (c - '0');
463                                                   else
464                                                             n += 10 + (c - 'a');
465                                         }
466                                         unput(c);
467                                         bp += runetochar(bp, n);
468                                         break;
469                                   }
470 
471                               default:
472                                         *bp++ = c;
473                                         break;
474                               }
475                               break;
476                     default:
477                               *bp++ = c;
478                               break;
479                     }
480           }
481           *bp = 0;
482           s = tostring(buf);
483           *bp++ = ' '; *bp++ = '\0';
484           yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
485           free(s);
486           RET(STRING);
487 }
488 
489 
binsearch(char * w,const Keyword * kp,int n)490 static int binsearch(char *w, const Keyword *kp, int n)
491 {
492           int cond, low, mid, high;
493 
494           low = 0;
495           high = n - 1;
496           while (low <= high) {
497                     mid = (low + high) / 2;
498                     if ((cond = strcmp(w, kp[mid].word)) < 0)
499                               high = mid - 1;
500                     else if (cond > 0)
501                               low = mid + 1;
502                     else
503                               return mid;
504           }
505           return -1;
506 }
507 
word(char * w)508 int word(char *w)
509 {
510           const Keyword *kp;
511           int c, n;
512 
513           n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0]));
514           if (n != -1) {      /* found in table */
515                     kp = keywords + n;
516                     yylval.i = kp->sub;
517                     switch (kp->type) { /* special handling */
518                     case BLTIN:
519                               if (kp->sub == FSYSTEM && safe)
520                                         SYNTAX( "system is unsafe" );
521                               RET(kp->type);
522                     case FUNC:
523                               if (infunc)
524                                         SYNTAX( "illegal nested function" );
525                               RET(kp->type);
526                     case RETURN:
527                               if (!infunc)
528                                         SYNTAX( "return not in function" );
529                               RET(kp->type);
530                     case VARNF:
531                               yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab);
532                               RET(VARNF);
533                     default:
534                               RET(kp->type);
535                     }
536           }
537           c = peek();         /* look for '(' */
538           if (c != '(' && infunc && (n=isarg(w)) >= 0) {
539                     yylval.i = n;
540                     RET(ARG);
541           } else {
542                     yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab);
543                     if (c == '(') {
544                               RET(CALL);
545                     } else {
546                               RET(VAR);
547                     }
548           }
549 }
550 
startreg(void)551 void startreg(void) /* next call to yylex will return a regular expression */
552 {
553           reg = true;
554 }
555 
regexpr(void)556 int regexpr(void)
557 {
558           int c;
559           static char *buf = NULL;
560           static int bufsz = 500;
561           char *bp;
562 
563           if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL)
564                     FATAL("out of space for reg expr");
565           bp = buf;
566           for ( ; (c = input()) != '/' && c != 0; ) {
567                     if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, "regexpr"))
568                               FATAL("out of space for reg expr %.10s...", buf);
569                     if (c == '\n') {
570                               *bp = '\0';
571                               SYNTAX( "newline in regular expression %.10s...", buf );
572                               unput('\n');
573                               break;
574                     } else if (c == '\\') {
575                               *bp++ = '\\';
576                               *bp++ = input();
577                     } else {
578                               *bp++ = c;
579                     }
580           }
581           *bp = 0;
582           if (c == 0)
583                     SYNTAX("non-terminated regular expression %.10s...", buf);
584           yylval.s = tostring(buf);
585           unput('/');
586           RET(REGEXPR);
587 }
588 
589 /* low-level lexical stuff, sort of inherited from lex */
590 
char	ebuf[300];		/* circular record of characters returned by input() */
char	*ep = ebuf;		/* next slot in ebuf; advanced by input(), backed up by unput() */
char	yysbuf[100];		/* pushback buffer */
char	*yysptr = yysbuf;	/* one past the most recently pushed-back character */
FILE	*yyin = NULL;		/* not referenced by the code in this part of the file */
596 
input(void)597 int input(void)     /* get next lexical input character */
598 {
599           int c;
600           extern char *lexprog;
601 
602           if (yysptr > yysbuf)
603                     c = (uschar)*--yysptr;
604           else if (lexprog != NULL) {   /* awk '...' */
605                     if ((c = (uschar)*lexprog) != 0)
606                               lexprog++;
607           } else                                  /* awk -f ... */
608                     c = pgetc();
609           if (c == EOF)
610                     c = 0;
611           if (ep >= ebuf + sizeof ebuf)
612                     ep = ebuf;
613           *ep = c;
614           if (c != 0) {
615                     ep++;
616           }
617           return (c);
618 }
619 
unput(int c)620 void unput(int c)   /* put lexical character back on input */
621 {
622           if (c == '\n')
623                     lineno--;
624           if (yysptr >= yysbuf + sizeof(yysbuf))
625                     FATAL("pushed back too much: %.20s...", yysbuf);
626           *yysptr++ = c;
627           if (--ep < ebuf)
628                     ep = ebuf + sizeof(ebuf) - 1;
629 }
630 
/*
 * unputstr - put the string s back on the input.
 *
 * Characters are pushed last-to-first so that input() returns them in
 * their original order.  An empty string pushes nothing.
 */
void unputstr(const char *s)
{
	const char *p = s + strlen(s);

	while (p > s)
		unput(*--p);
}
638