1 /** $MirOS: src/usr.bin/indent/lexi.c,v 1.4 2007/05/06 18:04:44 tg Exp $ */
2 /* $OpenBSD: lexi.c,v 1.12 2005/03/06 14:34:25 millert Exp $ */
3
4 /*
5 * Copyright (c) 1980, 1993
6 * The Regents of the University of California.
7 * Copyright (c) 1976 Board of Trustees of the University of Illinois.
8 * Copyright (c) 1985 Sun Microsystems, Inc.
9 * All rights reserved.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution.
19 * 3. Neither the name of the University nor the names of its contributors
20 * may be used to endorse or promote products derived from this software
21 * without specific prior written permission.
22 *
23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33 * SUCH DAMAGE.
34 */
35
36 /*
37 * Here we have the token scanner for indent. It scans off one token and puts
38 * it in the global variable "token". It returns a code, indicating the type
39 * of token scanned.
40 */
41
42 #include <sys/cdefs.h>
43 #include <stdio.h>
44 #include <ctype.h>
45 #include <stdlib.h>
46 #include <string.h>
47 #include <err.h>
48 #include "indent_globs.h"
49 #include "indent_codes.h"
50
51 __SCCSID("@(#)lexi.c 8.1 (Berkeley) 6/6/93");
52 __RCSID("$MirOS: src/usr.bin/indent/lexi.c,v 1.4 2007/05/06 18:04:44 tg Exp $");
53
/* chartype[] class 1: letters, digits, '_' and '$' (identifier/number text) */
#define alphanum 1
/* chartype[] class 3: characters that can be part of an operator */
#define opchar 3
56
/*
 * One entry of the reserved-word table.
 *
 * rwcode selects how lexi() classifies the word:
 *   1  "switch"                      -> swstmt
 *   2  "case" / "default"            -> casestmt
 *   3  "struct" / "union" / "enum"   -> declaration, and the next
 *                                       identifier is treated as a tag
 *   4  other declaration keywords    -> decl
 *   5  "if" / "while" / "for"        -> sp_paren
 *   6  "else" / "do"                 -> sp_nparen
 *   7  "sizeof"                      -> ident, with ps.sizeof_keyword set
 *   any other value (0 here)         -> ordinary identifier
 */
struct templ {
    char *rwd;			/* spelling of the reserved word */
    int rwcode;			/* classification, see above */
};

/*
 * Built-in reserved words.  addkey() copies this table to the heap the
 * first time a word is added at run time.
 */
struct templ specialsinit[] = {
    { "switch", 1 },
    { "case", 2 },
    { "break", 0 },
    { "struct", 3 },
    { "union", 3 },
    { "enum", 3 },
    { "default", 2 },
    { "int", 4 },
    { "char", 4 },
    { "float", 4 },
    { "double", 4 },
    { "long", 4 },
    { "short", 4 },
    { "typedef", 4 },		/* was misspelled "typdef" and never matched */
    { "unsigned", 4 },
    { "register", 4 },
    { "static", 4 },
    { "global", 4 },
    { "extern", 4 },
    { "void", 4 },
    { "goto", 0 },
    { "return", 0 },
    { "if", 5 },
    { "while", 5 },
    { "for", 5 },
    { "else", 6 },
    { "do", 6 },
    { "sizeof", 7 },
};

/* Table currently in use: specialsinit until addkey() replaces it. */
struct templ *specials = specialsinit;
/* Number of valid entries in specials[]. */
int nspecials = sizeof(specialsinit) / sizeof(specialsinit[0]);
/* Capacity of the heap copy of the table; only set by addkey(). */
int maxspecials;
96
/*
 * Lexical class of each 7-bit ASCII character, indexed by character code:
 *   0  neither of the classes below
 *   1  letter, digit, '_' or '$'
 *   3  character that can be part of an operator
 * Each row covers eight consecutive codes; the trailing note lists them.
 */
char chartype[128] = {
    0, 0, 0, 0, 0, 0, 0, 0,	/* 000 - 007: control characters */
    0, 0, 0, 0, 0, 0, 0, 0,	/* 010 - 017: control characters */
    0, 0, 0, 0, 0, 0, 0, 0,	/* 020 - 027: control characters */
    0, 0, 0, 0, 0, 0, 0, 0,	/* 030 - 037: control characters */
    0, 3, 0, 0, 1, 3, 3, 0,	/* space ! " # $ % & ' */
    0, 0, 3, 3, 0, 3, 0, 3,	/* ( ) * + , - . / */
    1, 1, 1, 1, 1, 1, 1, 1,	/* 0 1 2 3 4 5 6 7 */
    1, 1, 0, 0, 3, 3, 3, 3,	/* 8 9 : ; < = > ? */
    0, 1, 1, 1, 1, 1, 1, 1,	/* @ A B C D E F G */
    1, 1, 1, 1, 1, 1, 1, 1,	/* H I J K L M N O */
    1, 1, 1, 1, 1, 1, 1, 1,	/* P Q R S T U V W */
    1, 1, 1, 0, 0, 0, 3, 1,	/* X Y Z [ \ ] ^ _ */
    0, 1, 1, 1, 1, 1, 1, 1,	/* ` a b c d e f g */
    1, 1, 1, 1, 1, 1, 1, 1,	/* h i j k l m n o */
    1, 1, 1, 1, 1, 1, 1, 1,	/* p q r s t u v w */
    1, 1, 1, 0, 3, 0, 3, 0	/* x y z { | } ~ DEL */
};
118
119
120
121
122 int
lexi(void)123 lexi(void)
124 {
125 int unary_delim; /* this is set to 1 if the current token
126 * forces a following operator to be unary */
127 static int last_code; /* the last token type returned */
128 static int l_struct; /* set to 1 if the last token was 'struct' */
129 int code; /* internal code to be returned */
130 char qchar; /* the delimiter character for a string */
131 int i;
132
133 e_token = s_token; /* point to start of place to save token */
134 unary_delim = false;
135 ps.col_1 = ps.last_nl; /* tell world that this token started in
136 * column 1 iff the last thing scanned was nl */
137 ps.last_nl = false;
138
139 while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */
140 ps.col_1 = false; /* leading blanks imply token is not in column
141 * 1 */
142 if (++buf_ptr >= buf_end)
143 fill_buffer();
144 }
145
146 if (buf_ptr[0] == 'L' && ((buf_ptr[1] == '"') || (buf_ptr[1] == '\'')))
147 goto scan_na;
148
149 /* Scan an alphanumeric token */
150 if (chartype[(int)*buf_ptr] == alphanum ||
151 (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
152 /*
153 * we have a character or number
154 */
155 char *j; /* used for searching thru list of
156 * reserved words */
157 if (isdigit(*buf_ptr) || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
158 int seendot = 0,
159 seenexp = 0,
160 seensfx = 0;
161 if (*buf_ptr == '0' &&
162 (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) {
163 *e_token++ = *buf_ptr++;
164 *e_token++ = *buf_ptr++;
165 while (isxdigit(*buf_ptr)) {
166 CHECK_SIZE_TOKEN;
167 *e_token++ = *buf_ptr++;
168 }
169 }
170 else
171 while (1) {
172 if (*buf_ptr == '.') {
173 if (seendot)
174 break;
175 else
176 seendot++;
177 }
178 CHECK_SIZE_TOKEN;
179 *e_token++ = *buf_ptr++;
180 if (!isdigit(*buf_ptr) && *buf_ptr != '.') {
181 if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp)
182 break;
183 else {
184 seenexp++;
185 seendot++;
186 CHECK_SIZE_TOKEN;
187 *e_token++ = *buf_ptr++;
188 if (*buf_ptr == '+' || *buf_ptr == '-')
189 *e_token++ = *buf_ptr++;
190 }
191 }
192 }
193 while (1) {
194 if (!(seensfx & 1) &&
195 (*buf_ptr == 'U' || *buf_ptr == 'u')) {
196 CHECK_SIZE_TOKEN;
197 *e_token++ = *buf_ptr++;
198 seensfx |= 1;
199 continue;
200 }
201 if (!(seensfx & 2) &&
202 (*buf_ptr == 'L' || *buf_ptr == 'l')) {
203 CHECK_SIZE_TOKEN;
204 if (buf_ptr[1] == buf_ptr[0])
205 *e_token++ = *buf_ptr++;
206 *e_token++ = *buf_ptr++;
207 seensfx |= 2;
208 continue;
209 }
210 break;
211 }
212 }
213 else
214 while (chartype[(int)*buf_ptr] == alphanum) { /* copy it over */
215 CHECK_SIZE_TOKEN;
216 *e_token++ = *buf_ptr++;
217 if (buf_ptr >= buf_end)
218 fill_buffer();
219 }
220 *e_token++ = '\0';
221 while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */
222 if (++buf_ptr >= buf_end)
223 fill_buffer();
224 }
225 ps.its_a_keyword = false;
226 ps.sizeof_keyword = false;
227 if (l_struct) { /* if last token was 'struct', then this token
228 * should be treated as a declaration */
229 l_struct = false;
230 last_code = ident;
231 ps.last_u_d = true;
232 return (decl);
233 }
234 ps.last_u_d = false; /* Operator after indentifier is binary */
235 last_code = ident; /* Remember that this is the code we will
236 * return */
237
238 /*
239 * This loop will check if the token is a keyword.
240 */
241 for (i = 0; i < nspecials; i++) {
242 char *p = s_token; /* point at scanned token */
243 j = specials[i].rwd;
244 if (*j++ != *p++ || *j++ != *p++)
245 continue; /* This test depends on the fact that
246 * identifiers are always at least 1 character
247 * long (ie. the first two bytes of the
248 * identifier are always meaningful) */
249 if (p[-1] == 0)
250 break; /* If its a one-character identifier */
251 while (*p++ == *j)
252 if (*j++ == 0)
253 goto found_keyword; /* I wish that C had a multi-level
254 * break... */
255 }
256 if (i < nspecials) { /* we have a keyword */
257 found_keyword:
258 ps.its_a_keyword = true;
259 ps.last_u_d = true;
260 switch (specials[i].rwcode) {
261 case 1: /* it is a switch */
262 return (swstmt);
263 case 2: /* a case or default */
264 return (casestmt);
265
266 case 3: /* a "struct" */
267 if (ps.p_l_follow)
268 break; /* inside parens: cast */
269 l_struct = true;
270
271 /*
272 * Next time around, we will want to know that we have had a
273 * 'struct'
274 */
275 case 4: /* one of the declaration keywords */
276 if (ps.p_l_follow) {
277 ps.cast_mask |= 1 << ps.p_l_follow;
278 break; /* inside parens: cast */
279 }
280 last_code = decl;
281 return (decl);
282
283 case 5: /* if, while, for */
284 return (sp_paren);
285
286 case 6: /* do, else */
287 return (sp_nparen);
288
289 case 7:
290 ps.sizeof_keyword = true;
291 default: /* all others are treated like any other
292 * identifier */
293 return (ident);
294 } /* end of switch */
295 } /* end of if (found_it) */
296 if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) {
297 char *tp = buf_ptr;
298 while (tp < buf_end)
299 if (*tp++ == ')' && (*tp == ';' || *tp == ','))
300 goto not_proc;
301 strlcpy(ps.procname, token, sizeof ps.procname);
302 ps.in_parameter_declaration = 1;
303 rparen_count = 1;
304 not_proc:;
305 }
306 /*
307 * The following hack attempts to guess whether or not the current
308 * token is in fact a declaration keyword -- one that has been
309 * typedefd
310 */
311 if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_')
312 && !ps.p_l_follow
313 && !ps.block_init
314 && (ps.last_token == rparen || ps.last_token == semicolon ||
315 ps.last_token == decl ||
316 ps.last_token == lbrace || ps.last_token == rbrace)) {
317 ps.its_a_keyword = true;
318 ps.last_u_d = true;
319 last_code = decl;
320 return decl;
321 }
322 if (last_code == decl) /* if this is a declared variable, then
323 * following sign is unary */
324 ps.last_u_d = true; /* will make "int a -1" work */
325 last_code = ident;
326 return (ident); /* the ident is not in the list */
327 } /* end of procesing for alpanum character */
328
329 /* Scan a non-alphanumeric token */
330 scan_na:
331 *e_token++ = *buf_ptr; /* if it is only a one-character token, it is
332 * moved here */
333 *e_token = '\0';
334 if (++buf_ptr >= buf_end)
335 fill_buffer();
336
337 switch (*token) {
338 case '\n':
339 unary_delim = ps.last_u_d;
340 ps.last_nl = true; /* remember that we just had a newline */
341 code = (had_eof ? 0 : newline);
342
343 /*
344 * if data has been exausted, the newline is a dummy, and we should
345 * return code to stop
346 */
347 break;
348
349 case '\'': /* start of quoted character */
350 case '"': /* start of string */
351 qchar = *token;
352 if (troff) {
353 e_token[-1] = '`';
354 if (qchar == '"')
355 *e_token++ = '`';
356 e_token = chfont(&bodyf, &stringf, e_token);
357 }
358 do { /* copy the string */
359 while (1) { /* move one character or [/<char>]<char> */
360 if (*buf_ptr == '\n') {
361 printf("%d: Unterminated literal\n", line_no);
362 goto stop_lit;
363 }
364 CHECK_SIZE_TOKEN; /* Only have to do this once in this loop,
365 * since CHECK_SIZE guarantees that there
366 * are at least 5 entries left */
367 *e_token = *buf_ptr++;
368 if (buf_ptr >= buf_end)
369 fill_buffer();
370 if (*e_token == BACKSLASH) { /* if escape, copy extra char */
371 if (*buf_ptr == '\n') /* check for escaped newline */
372 ++line_no;
373 if (troff) {
374 *++e_token = BACKSLASH;
375 if (*buf_ptr == BACKSLASH)
376 *++e_token = BACKSLASH;
377 }
378 *++e_token = *buf_ptr++;
379 ++e_token; /* we must increment this again because we
380 * copied two chars */
381 if (buf_ptr >= buf_end)
382 fill_buffer();
383 }
384 else
385 break; /* we copied one character */
386 } /* end of while (1) */
387 } while (*e_token++ != qchar);
388 if (troff) {
389 e_token = chfont(&stringf, &bodyf, e_token - 1);
390 if (qchar == '"')
391 *e_token++ = '\'';
392 }
393 stop_lit:
394 code = ident;
395 break;
396
397 case ('('):
398 case ('['):
399 unary_delim = true;
400 code = lparen;
401 break;
402
403 case (')'):
404 case (']'):
405 code = rparen;
406 break;
407
408 case '#':
409 unary_delim = ps.last_u_d;
410 code = preesc;
411 break;
412
413 case '?':
414 unary_delim = true;
415 code = question;
416 break;
417
418 case (':'):
419 code = colon;
420 unary_delim = true;
421 break;
422
423 case (';'):
424 unary_delim = true;
425 code = semicolon;
426 break;
427
428 case ('{'):
429 unary_delim = true;
430
431 /*
432 * if (ps.in_or_st) ps.block_init = 1;
433 */
434 /* ? code = ps.block_init ? lparen : lbrace; */
435 code = lbrace;
436 break;
437
438 case ('}'):
439 unary_delim = true;
440 /* ? code = ps.block_init ? rparen : rbrace; */
441 code = rbrace;
442 break;
443
444 case 014: /* a form feed */
445 unary_delim = ps.last_u_d;
446 ps.last_nl = true; /* remember this so we can set 'ps.col_1'
447 * right */
448 code = form_feed;
449 break;
450
451 case (','):
452 unary_delim = true;
453 code = comma;
454 break;
455
456 case '.':
457 unary_delim = false;
458 code = period;
459 break;
460
461 case '-':
462 case '+': /* check for -, +, --, ++ */
463 code = (ps.last_u_d ? unary_op : binary_op);
464 unary_delim = true;
465
466 if (*buf_ptr == token[0]) {
467 /* check for doubled character */
468 *e_token++ = *buf_ptr++;
469 /* buffer overflow will be checked at end of loop */
470 if (last_code == ident || last_code == rparen) {
471 code = (ps.last_u_d ? unary_op : postop);
472 /* check for following ++ or -- */
473 unary_delim = false;
474 }
475 }
476 else if (*buf_ptr == '=')
477 /* check for operator += */
478 *e_token++ = *buf_ptr++;
479 else if (*buf_ptr == '>') {
480 /* check for operator -> */
481 *e_token++ = *buf_ptr++;
482 if (!pointer_as_binop) {
483 unary_delim = false;
484 code = unary_op;
485 ps.want_blank = false;
486 }
487 }
488 break; /* buffer overflow will be checked at end of
489 * switch */
490
491 case '=':
492 if (ps.in_or_st)
493 ps.block_init = 1;
494 #ifdef undef
495 if (chartype[*buf_ptr] == opchar) { /* we have two char assignment */
496 e_token[-1] = *buf_ptr++;
497 if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
498 *e_token++ = *buf_ptr++;
499 *e_token++ = '='; /* Flip =+ to += */
500 *e_token = 0;
501 }
502 #else
503 if (*buf_ptr == '=') {/* == */
504 *e_token++ = '='; /* Flip =+ to += */
505 buf_ptr++;
506 *e_token = 0;
507 }
508 #endif
509 code = binary_op;
510 unary_delim = true;
511 break;
512 /* can drop thru!!! */
513
514 case '>':
515 case '<':
516 case '!': /* ops like <, <<, <=, !=, etc */
517 if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
518 *e_token++ = *buf_ptr;
519 if (++buf_ptr >= buf_end)
520 fill_buffer();
521 }
522 if (*buf_ptr == '=')
523 *e_token++ = *buf_ptr++;
524 code = (ps.last_u_d ? unary_op : binary_op);
525 unary_delim = true;
526 break;
527
528 default:
529 if (token[0] == '/' && *buf_ptr == '*') {
530 /* it is start of comment */
531 *e_token++ = '*';
532
533 if (++buf_ptr >= buf_end)
534 fill_buffer();
535
536 code = comment;
537 unary_delim = ps.last_u_d;
538 break;
539 }
540 while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
541 /*
542 * handle ||, &&, etc, and also things as in int *****i
543 */
544 *e_token++ = *buf_ptr;
545 if (++buf_ptr >= buf_end)
546 fill_buffer();
547 }
548 code = (ps.last_u_d ? unary_op : binary_op);
549 unary_delim = true;
550
551
552 } /* end of switch */
553 if (code != newline) {
554 l_struct = false;
555 last_code = code;
556 }
557 if (buf_ptr >= buf_end) /* check for input buffer empty */
558 fill_buffer();
559 ps.last_u_d = unary_delim;
560 *e_token = '\0'; /* null terminate the token */
561 return (code);
562 }
563
564 /*
565 * Add the given keyword to the keyword table, using val as the keyword type
566 */
567 void
addkey(char * key,int val)568 addkey(char *key, int val)
569 {
570 struct templ *p;
571 int i;
572
573 for (i = 0; i < nspecials; i++) {
574 p = &specials[i];
575 if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0)
576 return;
577 }
578
579 if (specials == specialsinit) {
580 /*
581 * Whoa. Must reallocate special table.
582 */
583 nspecials = sizeof (specialsinit) / sizeof (specialsinit[0]);
584 maxspecials = nspecials + (nspecials >> 2);
585 specials = (struct templ *)malloc(maxspecials * sizeof specials[0]);
586 if (specials == NULL)
587 err(1, NULL);
588 memcpy(specials, specialsinit, sizeof specialsinit);
589 } else if (nspecials >= maxspecials) {
590 int newspecials = maxspecials + (maxspecials >> 2);
591 struct templ *specials2;
592
593 specials2 = realloc(specials, newspecials * sizeof specials[0]);
594 if (specials2 == NULL)
595 err(1, NULL);
596 specials = specials2;
597 maxspecials = newspecials;
598 }
599
600 p = &specials[nspecials];
601 p->rwd = key;
602 p->rwcode = val;
603 nspecials++;
604 return;
605 }
606