1 /**	$MirOS: src/usr.bin/indent/lexi.c,v 1.4 2007/05/06 18:04:44 tg Exp $ */
2 /*	$OpenBSD: lexi.c,v 1.12 2005/03/06 14:34:25 millert Exp $	*/
3 
4 /*
5  * Copyright (c) 1980, 1993
6  *	The Regents of the University of California.
7  * Copyright (c) 1976 Board of Trustees of the University of Illinois.
8  * Copyright (c) 1985 Sun Microsystems, Inc.
9  * All rights reserved.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  * 3. Neither the name of the University nor the names of its contributors
20  *    may be used to endorse or promote products derived from this software
21  *    without specific prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 
36 /*
37  * Here we have the token scanner for indent.  It scans off one token and puts
38  * it in the global variable "token".  It returns a code, indicating the type
39  * of token scanned.
40  */
41 
42 #include <sys/cdefs.h>
43 #include <stdio.h>
44 #include <ctype.h>
45 #include <stdlib.h>
46 #include <string.h>
47 #include <err.h>
48 #include "indent_globs.h"
49 #include "indent_codes.h"
50 
51 __SCCSID("@(#)lexi.c	8.1 (Berkeley) 6/6/93");
52 __RCSID("$MirOS: src/usr.bin/indent/lexi.c,v 1.4 2007/05/06 18:04:44 tg Exp $");
53 
/*
 * Character classes stored in chartype[] below.  A table value of 0 means
 * the character belongs to neither class.
 */
#define alphanum 1	/* letters, digits, '_' and '$' */
#define opchar 3	/* operator punctuation; only referenced from code
			 * that is currently disabled with #ifdef undef */
56 
/*
 * One entry of the reserved word table: the spelling of the word and the
 * class code that lexi() switches on once the word has been recognized.
 */
struct templ {
    char       *rwd;		/* spelling of the reserved word */
    int         rwcode;		/* class of the word, see below */
};

/*
 * Built-in reserved words.  The class codes are interpreted by lexi():
 *	0	no special handling, returned as a plain identifier
 *	1	"switch"
 *	2	"case" and "default"
 *	3	"struct", "union" and "enum"
 *	4	declaration keywords
 *	5	"if", "while" and "for"
 *	6	"else" and "do"
 *	7	"sizeof"
 */
struct templ specialsinit[] = {
	{ "switch", 1 },
	{ "case", 2 },
	{ "break", 0 },
	{ "struct", 3 },
	{ "union", 3 },
	{ "enum", 3 },
	{ "default", 2 },
	{ "int", 4 },
	{ "char", 4 },
	{ "float", 4 },
	{ "double", 4 },
	{ "long", 4 },
	{ "short", 4 },
	{ "typedef", 4 },	/* was misspelled "typdef" and never matched */
	{ "unsigned", 4 },
	{ "register", 4 },
	{ "static", 4 },
	{ "global", 4 },
	{ "extern", 4 },
	{ "void", 4 },
	{ "goto", 0 },
	{ "return", 0 },
	{ "if", 5 },
	{ "while", 5 },
	{ "for", 5 },
	{ "else", 6 },
	{ "do", 6 },
	{ "sizeof", 7 },
};

/*
 * The table that is actually searched.  It starts out as the static table
 * above and is replaced by a heap copy the first time addkey() adds a word.
 */
struct templ *specials = specialsinit;
/* number of entries currently in use in specials[] */
int	nspecials = sizeof(specialsinit) / sizeof(specialsinit[0]);
/* allocated capacity of specials[]; only set once addkey() has copied it */
int	maxspecials;
96 
97 char        chartype[128] =
98 {				/* this is used to facilitate the decision of
99 				 * what type (alphanumeric, operator) each
100 				 * character is */
101     0, 0, 0, 0, 0, 0, 0, 0,
102     0, 0, 0, 0, 0, 0, 0, 0,
103     0, 0, 0, 0, 0, 0, 0, 0,
104     0, 0, 0, 0, 0, 0, 0, 0,
105     0, 3, 0, 0, 1, 3, 3, 0,
106     0, 0, 3, 3, 0, 3, 0, 3,
107     1, 1, 1, 1, 1, 1, 1, 1,
108     1, 1, 0, 0, 3, 3, 3, 3,
109     0, 1, 1, 1, 1, 1, 1, 1,
110     1, 1, 1, 1, 1, 1, 1, 1,
111     1, 1, 1, 1, 1, 1, 1, 1,
112     1, 1, 1, 0, 0, 0, 3, 1,
113     0, 1, 1, 1, 1, 1, 1, 1,
114     1, 1, 1, 1, 1, 1, 1, 1,
115     1, 1, 1, 1, 1, 1, 1, 1,
116     1, 1, 1, 0, 3, 0, 3, 0
117 };
118 
119 
120 
121 
122 int
lexi(void)123 lexi(void)
124 {
125     int         unary_delim;	/* this is set to 1 if the current token
126 				 * forces a following operator to be unary */
127     static int  last_code;	/* the last token type returned */
128     static int  l_struct;	/* set to 1 if the last token was 'struct' */
129     int         code;		/* internal code to be returned */
130     char        qchar;		/* the delimiter character for a string */
131     int		i;
132 
133     e_token = s_token;		/* point to start of place to save token */
134     unary_delim = false;
135     ps.col_1 = ps.last_nl;	/* tell world that this token started in
136 				 * column 1 iff the last thing scanned was nl */
137     ps.last_nl = false;
138 
139     while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
140 	ps.col_1 = false;	/* leading blanks imply token is not in column
141 				 * 1 */
142 	if (++buf_ptr >= buf_end)
143 	    fill_buffer();
144     }
145 
146     if (buf_ptr[0] == 'L' && ((buf_ptr[1] == '"') || (buf_ptr[1] == '\'')))
147 	goto scan_na;
148 
149     /* Scan an alphanumeric token */
150     if (chartype[(int)*buf_ptr] == alphanum ||
151 	(buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
152 	/*
153 	 * we have a character or number
154 	 */
155 	char *j;	/* used for searching thru list of
156 			 * reserved words */
157 	if (isdigit(*buf_ptr) || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
158 	    int         seendot = 0,
159 	                seenexp = 0,
160 			seensfx = 0;
161 	    if (*buf_ptr == '0' &&
162 		    (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) {
163 		*e_token++ = *buf_ptr++;
164 		*e_token++ = *buf_ptr++;
165 		while (isxdigit(*buf_ptr)) {
166 		    CHECK_SIZE_TOKEN;
167 		    *e_token++ = *buf_ptr++;
168 		}
169 	    }
170 	    else
171 		while (1) {
172 		    if (*buf_ptr == '.') {
173 			if (seendot)
174 			    break;
175 			else
176 			    seendot++;
177 		    }
178 		    CHECK_SIZE_TOKEN;
179 		    *e_token++ = *buf_ptr++;
180 		    if (!isdigit(*buf_ptr) && *buf_ptr != '.') {
181 			if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp)
182 			    break;
183 			else {
184 			    seenexp++;
185 			    seendot++;
186 			    CHECK_SIZE_TOKEN;
187 			    *e_token++ = *buf_ptr++;
188 			    if (*buf_ptr == '+' || *buf_ptr == '-')
189 				*e_token++ = *buf_ptr++;
190 			}
191 		    }
192 		}
193 	    while (1) {
194 		if (!(seensfx & 1) &&
195 			(*buf_ptr == 'U' || *buf_ptr == 'u')) {
196 		    CHECK_SIZE_TOKEN;
197 		    *e_token++ = *buf_ptr++;
198 		    seensfx |= 1;
199 		    continue;
200 		}
201         	if (!(seensfx & 2) &&
202 			(*buf_ptr == 'L' || *buf_ptr == 'l')) {
203 		    CHECK_SIZE_TOKEN;
204 		    if (buf_ptr[1] == buf_ptr[0])
205 		        *e_token++ = *buf_ptr++;
206 		    *e_token++ = *buf_ptr++;
207 		    seensfx |= 2;
208 		    continue;
209 		}
210 		break;
211 	    }
212 	}
213 	else
214 	    while (chartype[(int)*buf_ptr] == alphanum) {	/* copy it over */
215 		CHECK_SIZE_TOKEN;
216 		*e_token++ = *buf_ptr++;
217 		if (buf_ptr >= buf_end)
218 		    fill_buffer();
219 	    }
220 	*e_token++ = '\0';
221 	while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
222 	    if (++buf_ptr >= buf_end)
223 		fill_buffer();
224 	}
225 	ps.its_a_keyword = false;
226 	ps.sizeof_keyword = false;
227 	if (l_struct) {		/* if last token was 'struct', then this token
228 				 * should be treated as a declaration */
229 	    l_struct = false;
230 	    last_code = ident;
231 	    ps.last_u_d = true;
232 	    return (decl);
233 	}
234 	ps.last_u_d = false;	/* Operator after indentifier is binary */
235 	last_code = ident;	/* Remember that this is the code we will
236 				 * return */
237 
238 	/*
239 	 * This loop will check if the token is a keyword.
240 	 */
241 	for (i = 0; i < nspecials; i++) {
242 	    char *p = s_token;	/* point at scanned token */
243 	    j = specials[i].rwd;
244 	    if (*j++ != *p++ || *j++ != *p++)
245 		continue;	/* This test depends on the fact that
246 				 * identifiers are always at least 1 character
247 				 * long (ie. the first two bytes of the
248 				 * identifier are always meaningful) */
249 	    if (p[-1] == 0)
250 		break;		/* If its a one-character identifier */
251 	    while (*p++ == *j)
252 		if (*j++ == 0)
253 		    goto found_keyword;	/* I wish that C had a multi-level
254 					 * break... */
255 	}
256 	if (i < nspecials) {		/* we have a keyword */
257     found_keyword:
258 	    ps.its_a_keyword = true;
259 	    ps.last_u_d = true;
260 	    switch (specials[i].rwcode) {
261 	    case 1:		/* it is a switch */
262 		return (swstmt);
263 	    case 2:		/* a case or default */
264 		return (casestmt);
265 
266 	    case 3:		/* a "struct" */
267 		if (ps.p_l_follow)
268 		    break;	/* inside parens: cast */
269 		l_struct = true;
270 
271 		/*
272 		 * Next time around, we will want to know that we have had a
273 		 * 'struct'
274 		 */
275 	    case 4:		/* one of the declaration keywords */
276 		if (ps.p_l_follow) {
277 		    ps.cast_mask |= 1 << ps.p_l_follow;
278 		    break;	/* inside parens: cast */
279 		}
280 		last_code = decl;
281 		return (decl);
282 
283 	    case 5:		/* if, while, for */
284 		return (sp_paren);
285 
286 	    case 6:		/* do, else */
287 		return (sp_nparen);
288 
289 	    case 7:
290 		ps.sizeof_keyword = true;
291 	    default:		/* all others are treated like any other
292 				 * identifier */
293 		return (ident);
294 	    }			/* end of switch */
295 	}			/* end of if (found_it) */
296 	if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) {
297 	    char *tp = buf_ptr;
298 	    while (tp < buf_end)
299 		if (*tp++ == ')' && (*tp == ';' || *tp == ','))
300 		    goto not_proc;
301 	    strlcpy(ps.procname, token, sizeof ps.procname);
302 	    ps.in_parameter_declaration = 1;
303 	    rparen_count = 1;
304     not_proc:;
305 	}
306 	/*
307 	 * The following hack attempts to guess whether or not the current
308 	 * token is in fact a declaration keyword -- one that has been
309 	 * typedefd
310 	 */
311 	if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_')
312 		&& !ps.p_l_follow
313 	        && !ps.block_init
314 		&& (ps.last_token == rparen || ps.last_token == semicolon ||
315 		    ps.last_token == decl ||
316 		    ps.last_token == lbrace || ps.last_token == rbrace)) {
317 	    ps.its_a_keyword = true;
318 	    ps.last_u_d = true;
319 	    last_code = decl;
320 	    return decl;
321 	}
322 	if (last_code == decl)	/* if this is a declared variable, then
323 				 * following sign is unary */
324 	    ps.last_u_d = true;	/* will make "int a -1" work */
325 	last_code = ident;
326 	return (ident);		/* the ident is not in the list */
327     }				/* end of procesing for alpanum character */
328 
329     /* Scan a non-alphanumeric token */
330 scan_na:
331     *e_token++ = *buf_ptr;	/* if it is only a one-character token, it is
332 				 * moved here */
333     *e_token = '\0';
334     if (++buf_ptr >= buf_end)
335 	fill_buffer();
336 
337     switch (*token) {
338     case '\n':
339 	unary_delim = ps.last_u_d;
340 	ps.last_nl = true;	/* remember that we just had a newline */
341 	code = (had_eof ? 0 : newline);
342 
343 	/*
344 	 * if data has been exausted, the newline is a dummy, and we should
345 	 * return code to stop
346 	 */
347 	break;
348 
349     case '\'':			/* start of quoted character */
350     case '"':			/* start of string */
351 	qchar = *token;
352 	if (troff) {
353 	    e_token[-1] = '`';
354 	    if (qchar == '"')
355 		*e_token++ = '`';
356 	    e_token = chfont(&bodyf, &stringf, e_token);
357 	}
358 	do {			/* copy the string */
359 	    while (1) {		/* move one character or [/<char>]<char> */
360 		if (*buf_ptr == '\n') {
361 		    printf("%d: Unterminated literal\n", line_no);
362 		    goto stop_lit;
363 		}
364 		CHECK_SIZE_TOKEN;	/* Only have to do this once in this loop,
365 					 * since CHECK_SIZE guarantees that there
366 					 * are at least 5 entries left */
367 		*e_token = *buf_ptr++;
368 		if (buf_ptr >= buf_end)
369 		    fill_buffer();
370 		if (*e_token == BACKSLASH) {	/* if escape, copy extra char */
371 		    if (*buf_ptr == '\n')	/* check for escaped newline */
372 			++line_no;
373 		    if (troff) {
374 			*++e_token = BACKSLASH;
375 			if (*buf_ptr == BACKSLASH)
376 			    *++e_token = BACKSLASH;
377 		    }
378 		    *++e_token = *buf_ptr++;
379 		    ++e_token;	/* we must increment this again because we
380 				 * copied two chars */
381 		    if (buf_ptr >= buf_end)
382 			fill_buffer();
383 		}
384 		else
385 		    break;	/* we copied one character */
386 	    }			/* end of while (1) */
387 	} while (*e_token++ != qchar);
388 	if (troff) {
389 	    e_token = chfont(&stringf, &bodyf, e_token - 1);
390 	    if (qchar == '"')
391 		*e_token++ = '\'';
392 	}
393 stop_lit:
394 	code = ident;
395 	break;
396 
397     case ('('):
398     case ('['):
399 	unary_delim = true;
400 	code = lparen;
401 	break;
402 
403     case (')'):
404     case (']'):
405 	code = rparen;
406 	break;
407 
408     case '#':
409 	unary_delim = ps.last_u_d;
410 	code = preesc;
411 	break;
412 
413     case '?':
414 	unary_delim = true;
415 	code = question;
416 	break;
417 
418     case (':'):
419 	code = colon;
420 	unary_delim = true;
421 	break;
422 
423     case (';'):
424 	unary_delim = true;
425 	code = semicolon;
426 	break;
427 
428     case ('{'):
429 	unary_delim = true;
430 
431 	/*
432 	 * if (ps.in_or_st) ps.block_init = 1;
433 	 */
434 	/* ?	code = ps.block_init ? lparen : lbrace; */
435 	code = lbrace;
436 	break;
437 
438     case ('}'):
439 	unary_delim = true;
440 	/* ?	code = ps.block_init ? rparen : rbrace; */
441 	code = rbrace;
442 	break;
443 
444     case 014:			/* a form feed */
445 	unary_delim = ps.last_u_d;
446 	ps.last_nl = true;	/* remember this so we can set 'ps.col_1'
447 				 * right */
448 	code = form_feed;
449 	break;
450 
451     case (','):
452 	unary_delim = true;
453 	code = comma;
454 	break;
455 
456     case '.':
457 	unary_delim = false;
458 	code = period;
459 	break;
460 
461     case '-':
462     case '+':			/* check for -, +, --, ++ */
463 	code = (ps.last_u_d ? unary_op : binary_op);
464 	unary_delim = true;
465 
466 	if (*buf_ptr == token[0]) {
467 	    /* check for doubled character */
468 	    *e_token++ = *buf_ptr++;
469 	    /* buffer overflow will be checked at end of loop */
470 	    if (last_code == ident || last_code == rparen) {
471 		code = (ps.last_u_d ? unary_op : postop);
472 		/* check for following ++ or -- */
473 		unary_delim = false;
474 	    }
475 	}
476 	else if (*buf_ptr == '=')
477 	    /* check for operator += */
478 	    *e_token++ = *buf_ptr++;
479 	else if (*buf_ptr == '>') {
480 	    /* check for operator -> */
481 	    *e_token++ = *buf_ptr++;
482 	    if (!pointer_as_binop) {
483 		unary_delim = false;
484 		code = unary_op;
485 		ps.want_blank = false;
486 	    }
487 	}
488 	break;			/* buffer overflow will be checked at end of
489 				 * switch */
490 
491     case '=':
492 	if (ps.in_or_st)
493 	    ps.block_init = 1;
494 #ifdef undef
495 	if (chartype[*buf_ptr] == opchar) {	/* we have two char assignment */
496 	    e_token[-1] = *buf_ptr++;
497 	    if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
498 		*e_token++ = *buf_ptr++;
499 	    *e_token++ = '=';	/* Flip =+ to += */
500 	    *e_token = 0;
501 	}
502 #else
503 	if (*buf_ptr == '=') {/* == */
504 	    *e_token++ = '=';	/* Flip =+ to += */
505 	    buf_ptr++;
506 	    *e_token = 0;
507 	}
508 #endif
509 	code = binary_op;
510 	unary_delim = true;
511 	break;
512 	/* can drop thru!!! */
513 
514     case '>':
515     case '<':
516     case '!':			/* ops like <, <<, <=, !=, etc */
517 	if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
518 	    *e_token++ = *buf_ptr;
519 	    if (++buf_ptr >= buf_end)
520 		fill_buffer();
521 	}
522 	if (*buf_ptr == '=')
523 	    *e_token++ = *buf_ptr++;
524 	code = (ps.last_u_d ? unary_op : binary_op);
525 	unary_delim = true;
526 	break;
527 
528     default:
529 	if (token[0] == '/' && *buf_ptr == '*') {
530 	    /* it is start of comment */
531 	    *e_token++ = '*';
532 
533 	    if (++buf_ptr >= buf_end)
534 		fill_buffer();
535 
536 	    code = comment;
537 	    unary_delim = ps.last_u_d;
538 	    break;
539 	}
540 	while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
541 	    /*
542 	     * handle ||, &&, etc, and also things as in int *****i
543 	     */
544 	    *e_token++ = *buf_ptr;
545 	    if (++buf_ptr >= buf_end)
546 		fill_buffer();
547 	}
548 	code = (ps.last_u_d ? unary_op : binary_op);
549 	unary_delim = true;
550 
551 
552     }				/* end of switch */
553     if (code != newline) {
554 	l_struct = false;
555 	last_code = code;
556     }
557     if (buf_ptr >= buf_end)	/* check for input buffer empty */
558 	fill_buffer();
559     ps.last_u_d = unary_delim;
560     *e_token = '\0';		/* null terminate the token */
561     return (code);
562 }
563 
564 /*
565  * Add the given keyword to the keyword table, using val as the keyword type
566  */
567 void
addkey(char * key,int val)568 addkey(char *key, int val)
569 {
570     struct templ *p;
571     int i;
572 
573     for (i = 0; i < nspecials; i++) {
574 	p = &specials[i];
575 	if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0)
576 	    return;
577     }
578 
579     if (specials == specialsinit) {
580 	/*
581 	 * Whoa. Must reallocate special table.
582 	 */
583 	nspecials = sizeof (specialsinit) / sizeof (specialsinit[0]);
584 	maxspecials = nspecials + (nspecials >> 2);
585 	specials = (struct templ *)malloc(maxspecials * sizeof specials[0]);
586 	if (specials == NULL)
587 	    err(1, NULL);
588 	memcpy(specials, specialsinit, sizeof specialsinit);
589     } else if (nspecials >= maxspecials) {
590 	int newspecials = maxspecials + (maxspecials >> 2);
591 	struct templ *specials2;
592 
593 	specials2 = realloc(specials, newspecials * sizeof specials[0]);
594 	if (specials2 == NULL)
595 	    err(1, NULL);
596 	specials = specials2;
597 	maxspecials = newspecials;
598     }
599 
600     p = &specials[nspecials];
601     p->rwd = key;
602     p->rwcode = val;
603     nspecials++;
604     return;
605 }
606