1 /*        $NetBSD: lexi.c,v 1.242 2023/12/03 21:44:42 rillig Exp $    */
2 
3 /*-
4  * SPDX-License-Identifier: BSD-4-Clause
5  *
6  * Copyright (c) 1985 Sun Microsystems, Inc.
7  * Copyright (c) 1980, 1993
8  *        The Regents of the University of California.  All rights reserved.
9  * All rights reserved.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  * 3. All advertising materials mentioning features or use of this software
20  *    must display the following acknowledgement:
21  *        This product includes software developed by the University of
22  *        California, Berkeley and its contributors.
23  * 4. Neither the name of the University nor the names of its contributors
24  *    may be used to endorse or promote products derived from this software
25  *    without specific prior written permission.
26  *
27  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
28  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
31  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
32  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
33  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
34  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
35  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
36  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
37  * SUCH DAMAGE.
38  */
39 
40 #include <sys/cdefs.h>
41 __RCSID("$NetBSD: lexi.c,v 1.242 2023/12/03 21:44:42 rillig Exp $");
42 
43 #include <stdlib.h>
44 #include <string.h>
45 
46 #include "indent.h"
47 
48 /* must be sorted alphabetically, is used in binary search */
49 static const struct keyword {
50           const char name[12];
51           lexer_symbol lsym;
52 } keywords[] = {
53           {"_Bool", lsym_type},
54           {"_Complex", lsym_modifier},
55           {"_Imaginary", lsym_modifier},
56           {"auto", lsym_modifier},
57           {"bool", lsym_type},
58           {"break", lsym_word},
59           {"case", lsym_case},
60           {"char", lsym_type},
61           {"complex", lsym_modifier},
62           {"const", lsym_modifier},
63           {"continue", lsym_word},
64           {"default", lsym_default},
65           {"do", lsym_do},
66           {"double", lsym_type},
67           {"else", lsym_else},
68           {"enum", lsym_tag},
69           {"extern", lsym_modifier},
70           {"float", lsym_type},
71           {"for", lsym_for},
72           {"goto", lsym_word},
73           {"if", lsym_if},
74           {"imaginary", lsym_modifier},
75           {"inline", lsym_modifier},
76           {"int", lsym_type},
77           {"long", lsym_type},
78           {"offsetof", lsym_offsetof},
79           {"register", lsym_modifier},
80           {"restrict", lsym_word},
81           {"return", lsym_return},
82           {"short", lsym_type},
83           {"signed", lsym_type},
84           {"sizeof", lsym_sizeof},
85           {"static", lsym_modifier},
86           {"struct", lsym_tag},
87           {"switch", lsym_switch},
88           {"typedef", lsym_typedef},
89           {"union", lsym_tag},
90           {"unsigned", lsym_type},
91           {"void", lsym_type},
92           {"volatile", lsym_modifier},
93           {"while", lsym_while}
94 };
95 
96 static struct {
97           const char **items;
98           unsigned int len;
99           unsigned int cap;
100 } typenames;
101 
102 /*-
103  * The transition table below was rewritten by hand from lx's output, given
104  * the following definitions. lx is Katherine Flavel's lexer generator.
105  *
106  * O  = /[0-7]/;        D  = /[0-9]/;          NZ = /[1-9]/;
107  * H  = /[a-f0-9]/i;    B  = /[0-1]/;          HP = /0x/i;
108  * BP = /0b/i;          E  = /e[+\-]?/i D+;    P  = /p[+\-]?/i D+;
109  * FS = /[fl]/i;        IS = /u/i /(l|L|ll|LL)/? | /(l|L|ll|LL)/ /u/i?;
110  *
111  * D+           E  FS? -> $float;
112  * D*    "." D+ E? FS? -> $float;
113  * D+    "."    E? FS? -> $float;    HP H+           IS? -> $int;
114  * HP H+        P  FS? -> $float;    NZ D*           IS? -> $int;
115  * HP H* "." H+ P  FS? -> $float;    "0" O*          IS? -> $int;
116  * HP H+ "."    P  FS  -> $float;    BP B+           IS? -> $int;
117  */
118 /* INDENT OFF */
119 static const unsigned char lex_number_state[][26] = {
120           /*                examples:
121                                            00
122                    s                      0xx
123                    t                    00xaa
124                    a     11       101100xxa..
125                    r   11ee0001101lbuuxx.a.pp
126                    t.01.e+008bLuxll0Ll.aa.p+0
127           states:  ABCDEFGHIJKLMNOPQRSTUVWXYZ */
128           [0] =   "uuiifuufiuuiiuiiiiiuiuuuuu",   /* (other) */
129           [1] =   "CEIDEHHHIJQ  U  Q  VUVVZZZ",   /* 0 */
130           [2] =   "DEIDEHHHIJQ  U  Q  VUVVZZZ",   /* 1 */
131           [3] =   "DEIDEHHHIJ   U     VUVVZZZ",   /* 2 3 4 5 6 7 */
132           [4] =   "DEJDEHHHJJ   U     VUVVZZZ",   /* 8 9 */
133           [5] =   "             U     VUVV   ",   /* A a C c D d */
134           [6] =   "  K          U     VUVV   ",   /* B b */
135           [7] =   "  FFF   FF   U     VUVV   ",   /* E e */
136           [8] =   "    f  f     U     VUVV  f",   /* F f */
137           [9] =   "  LLf  fL  PR   Li  L    f",   /* L */
138           [10] =  "  OOf  fO   S P O i O    f",   /* l */
139           [11] =  "                    FFX   ",   /* P p */
140           [12] =  "  MM    M  i  iiM   M     ",   /* U u */
141           [13] =  "  N                       ",   /* X x */
142           [14] =  "     G                 Y  ",   /* + - */
143           [15] =  "B EE    EE   T      W     ",   /* . */
144           /*       ABCDEFGHIJKLMNOPQRSTUVWXYZ */
145 };
146 /* INDENT ON */
147 
148 static const unsigned char lex_number_row[] = {
149           ['0'] = 1,
150           ['1'] = 2,
151           ['2'] = 3, ['3'] = 3, ['4'] = 3, ['5'] = 3, ['6'] = 3, ['7'] = 3,
152           ['8'] = 4, ['9'] = 4,
153           ['A'] = 5, ['a'] = 5, ['C'] = 5, ['c'] = 5, ['D'] = 5, ['d'] = 5,
154           ['B'] = 6, ['b'] = 6,
155           ['E'] = 7, ['e'] = 7,
156           ['F'] = 8, ['f'] = 8,
157           ['L'] = 9,
158           ['l'] = 10,
159           ['P'] = 11, ['p'] = 11,
160           ['U'] = 12, ['u'] = 12,
161           ['X'] = 13, ['x'] = 13,
162           ['+'] = 14, ['-'] = 14,
163           ['.'] = 15,
164 };
165 
166 
167 static bool
is_identifier_start(char ch)168 is_identifier_start(char ch)
169 {
170           return ch_isalpha(ch) || ch == '_' || ch == '$';
171 }
172 
173 static bool
is_identifier_part(char ch)174 is_identifier_part(char ch)
175 {
176           return ch_isalnum(ch) || ch == '_' || ch == '$';
177 }
178 
179 static void
token_add_char(char ch)180 token_add_char(char ch)
181 {
182           buf_add_char(&token, ch);
183 }
184 
185 static bool
skip_line_continuation(void)186 skip_line_continuation(void)
187 {
188           if (in.p[0] == '\\' && in.p[1] == '\n') {
189                     in.p++;
190                     inp_skip();
191                     in.token_end_line++;
192                     return true;
193           }
194           return false;
195 }
196 
197 static void
lex_number(void)198 lex_number(void)
199 {
200           for (unsigned char s = 'A'; s != 'f' && s != 'i' && s != 'u';) {
201                     unsigned char ch = (unsigned char)*in.p;
202                     if (skip_line_continuation())
203                               continue;
204                     if (ch >= array_length(lex_number_row)
205                         || lex_number_row[ch] == 0)
206                               break;
207 
208                     unsigned char row = lex_number_row[ch];
209                     if (lex_number_state[row][s - 'A'] == ' ') {
210                               // lex_number_state[0][s - 'A'] now indicates the type:
211                               // f = floating, i = integer, u = unknown
212                               return;
213                     }
214 
215                     s = lex_number_state[row][s - 'A'];
216                     token_add_char(inp_next());
217           }
218 }
219 
220 static void
lex_word(void)221 lex_word(void)
222 {
223           for (;;) {
224                     if (is_identifier_part(*in.p))
225                               token_add_char(*in.p++);
226                     else if (skip_line_continuation())
227                               continue;
228                     else
229                               return;
230           }
231 }
232 
233 static void
lex_char_or_string(void)234 lex_char_or_string(void)
235 {
236           for (char delim = token.s[token.len - 1];;) {
237                     if (*in.p == '\n') {
238                               diag(1, "Unterminated literal");
239                               return;
240                     }
241 
242                     token_add_char(*in.p++);
243                     if (token.s[token.len - 1] == delim)
244                               return;
245 
246                     if (token.s[token.len - 1] == '\\') {
247                               if (*in.p == '\n')
248                                         in.token_end_line++;
249                               token_add_char(inp_next());
250                     }
251           }
252 }
253 
254 /* Guess whether the current token is a declared type. */
255 static bool
probably_typename(void)256 probably_typename(void)
257 {
258           if (ps.prev_lsym == lsym_modifier)
259                     return true;
260           if (ps.in_init)
261                     return false;
262           if (ps.in_stmt_or_decl)       /* XXX: this condition looks incorrect */
263                     return false;
264           if (ps.prev_lsym == lsym_semicolon
265               || ps.prev_lsym == lsym_lbrace
266               || ps.prev_lsym == lsym_rbrace) {
267                     if (in.p[0] == '*' && in.p[1] != '=')
268                               return true;
269                     /* XXX: is_identifier_start */
270                     if (ch_isalpha(in.p[0]))
271                               return true;
272           }
273           return false;
274 }
275 
276 static int
bsearch_typenames(const char * key)277 bsearch_typenames(const char *key)
278 {
279           const char **arr = typenames.items;
280           unsigned lo = 0;
281           unsigned hi = typenames.len;
282 
283           while (lo < hi) {
284                     unsigned mid = (lo + hi) / 2;
285                     int cmp = strcmp(arr[mid], key);
286                     if (cmp < 0)
287                               lo = mid + 1;
288                     else if (cmp > 0)
289                               hi = mid;
290                     else
291                               return (int)mid;
292           }
293           return -1 - (int)lo;
294 }
295 
296 static bool
is_typename(void)297 is_typename(void)
298 {
299           if (ps.prev_lsym == lsym_tag)
300                     return true;
301           if (opt.auto_typedefs &&
302               token.len >= 2 && memcmp(token.s + token.len - 2, "_t", 2) == 0)
303                     return true;
304 
305           return bsearch_typenames(token.s) >= 0;
306 }
307 
308 void
register_typename(const char * name)309 register_typename(const char *name)
310 {
311           if (typenames.len >= typenames.cap) {
312                     typenames.cap = 16 + 2 * typenames.cap;
313                     typenames.items = nonnull(realloc(typenames.items,
314                               sizeof(typenames.items[0]) * typenames.cap));
315           }
316 
317           int pos = bsearch_typenames(name);
318           if (pos >= 0)
319                     return;             /* already in the list */
320 
321           pos = -1 - pos;
322           memmove(typenames.items + pos + 1, typenames.items + pos,
323               sizeof(typenames.items[0]) * (typenames.len++ - (unsigned)pos));
324           typenames.items[pos] = nonnull(strdup(name));
325 }
326 
327 static int
cmp_keyword_by_name(const void * key,const void * elem)328 cmp_keyword_by_name(const void *key, const void *elem)
329 {
330           return strcmp(key, ((const struct keyword *)elem)->name);
331 }
332 
333 /*
334  * Looking at the '(', guess whether this starts a function definition or a
335  * function declaration.
336  */
337 static bool
probably_function_definition(const char * p)338 probably_function_definition(const char *p)
339 {
340           // TODO: Don't look at characters in comments, see lsym_funcname.c.
341           int paren_level = 0;
342           for (; *p != '\n'; p++) {
343                     if (*p == '(')
344                               paren_level++;
345                     if (*p == ')' && --paren_level == 0) {
346                               p++;
347 
348                               while (*p != '\n'
349                                   && (ch_isspace(*p) || is_identifier_part(*p)))
350                                         p++;      /* '__dead' or '__unused' */
351 
352                               if (*p == '\n')     /* func(...) */
353                                         break;
354                               if (*p == ';')      /* func(...); */
355                                         return false;
356                               if (*p == ',')      /* double abs(), pi; */
357                                         return false;
358                               if (*p == '(')      /* func(...) __attribute__((...)) */
359                                         paren_level++;      /* func(...) __printflike(...)
360                                                              */
361                               else
362                                         break;    /* func(...) { ... */
363                     }
364 
365                     if (paren_level == 1 && p[0] == '*' && p[1] == ',')
366                               return false;
367           }
368 
369           /*
370            * To further reduce the cases where indent wrongly treats an
371            * incomplete function declaration as a function definition, thus
372            * adding a newline before the function name, it may be worth looking
373            * for parameter names, as these are often omitted in function
374            * declarations and only included in function definitions. Or just
375            * increase the lookahead to more than just the current line of input,
376            * until the next '{'.
377            */
378           return true;
379 }
380 
381 static lexer_symbol
lexi_alnum(void)382 lexi_alnum(void)
383 {
384           if (ch_isdigit(in.p[0]) ||
385               (in.p[0] == '.' && ch_isdigit(in.p[1]))) {
386                     lex_number();
387           } else if (is_identifier_start(in.p[0])) {
388                     lex_word();
389 
390                     if (token.len == 1 && token.s[0] == 'L' &&
391                         (in.p[0] == '"' || in.p[0] == '\'')) {
392                               token_add_char(*in.p++);
393                               lex_char_or_string();
394                               ps.next_unary = false;
395                               return lsym_word;
396                     }
397           } else
398                     return lsym_eof;    /* just as a placeholder */
399 
400           while (ch_isblank(*in.p))
401                     in.p++;
402 
403           ps.next_unary = ps.prev_lsym == lsym_tag
404               || ps.prev_lsym == lsym_typedef
405               || (ps.prev_lsym == lsym_modifier && *in.p == '*');
406 
407           if (ps.prev_lsym == lsym_tag && ps.paren.len == 0)
408                     return lsym_type;
409           if (ps.spaced_expr_psym == psym_for_exprs
410               && ps.prev_lsym == lsym_lparen && ps.paren.len == 1
411               && *in.p == '*') {
412                     ps.next_unary = true;
413                     return lsym_type;
414           }
415 
416           token_add_char('\0');         // Terminate in non-debug mode as well.
417           token.len--;
418           const struct keyword *kw = bsearch(token.s, keywords,
419               array_length(keywords), sizeof(keywords[0]), cmp_keyword_by_name);
420           lexer_symbol lsym = lsym_word;
421           if (kw != NULL) {
422                     lsym = kw->lsym;
423                     ps.next_unary = true;
424                     if (lsym == lsym_tag || lsym == lsym_type)
425                               goto found_typename;
426                     return lsym;
427           }
428 
429           if (is_typename()) {
430                     lsym = lsym_type;
431                     ps.next_unary = true;
432 found_typename:
433                     if (ps.prev_lsym != lsym_period
434                         && ps.prev_lsym != lsym_unary_op) {
435                               if (lsym == lsym_tag)
436                                         return lsym_tag;
437                               if (ps.paren.len == 0)
438                                         return lsym_type;
439                     }
440           }
441 
442           const char *p = in.p;
443           if (*p == ')')
444                     p++;
445           if (*p == '(' && ps.psyms.len < 3 && ps.ind_level == 0 &&
446               !ps.in_func_def_params && !ps.in_init) {
447 
448                     bool maybe_function_definition = *in.p == ')'
449                         ? ps.paren.len == 1 && ps.prev_lsym != lsym_unary_op
450                         : ps.paren.len == 0;
451                     if (maybe_function_definition
452                         && probably_function_definition(p)) {
453                               ps.line_has_func_def = true;
454                               if (ps.in_decl)
455                                         ps.in_func_def_params = true;
456                               return lsym_funcname;
457                     }
458 
459           } else if (ps.paren.len == 0 && probably_typename()) {
460                     ps.next_unary = true;
461                     return lsym_type;
462           }
463 
464           return lsym;
465 }
466 
467 static void
check_parenthesized_function_definition(void)468 check_parenthesized_function_definition(void)
469 {
470           const char *p = in.p;
471           while (ch_isblank(*p))
472                     p++;
473           if (is_identifier_start(*p))
474                     while (is_identifier_part(*p))
475                               p++;
476           while (ch_isblank(*p))
477                     p++;
478           if (*p == ')') {
479                     p++;
480                     while (ch_isblank(*p))
481                               p++;
482                     if (*p == '(' && probably_function_definition(p))
483                               ps.line_has_func_def = true;
484           }
485 }
486 
487 static bool
is_asterisk_unary(void)488 is_asterisk_unary(void)
489 {
490           const char *p = in.p;
491           while (*p == '*' || ch_isblank(*p))
492                     p++;
493           if (*p == ')')
494                     return true;
495           if (ps.next_unary || ps.in_func_def_params)
496                     return true;
497           if (ps.prev_lsym == lsym_word ||
498               ps.prev_lsym == lsym_rparen ||
499               ps.prev_lsym == lsym_rbracket)
500                     return false;
501           return ps.in_decl && ps.paren.len > 0;
502 }
503 
504 static bool
probably_in_function_definition(void)505 probably_in_function_definition(void)
506 {
507           for (const char *p = in.p; *p != '\n';) {
508                     if (ch_isspace(*p))
509                               p++;
510                     else if (is_identifier_start(*p)) {
511                               p++;
512                               while (is_identifier_part(*p))
513                                         p++;
514                     } else
515                               return *p == '(';
516           }
517           return false;
518 }
519 
520 static void
lex_asterisk_unary(void)521 lex_asterisk_unary(void)
522 {
523           while (*in.p == '*' || ch_isspace(*in.p)) {
524                     if (*in.p == '*')
525                               token_add_char('*');
526                     if (*in.p == '\n')
527                               in.token_end_line++;
528                     inp_skip();
529           }
530 
531           if (ps.in_decl && probably_in_function_definition())
532                     ps.line_has_func_def = true;
533 }
534 
535 static bool
skip(const char ** pp,const char * s)536 skip(const char **pp, const char *s)
537 {
538           size_t len = strlen(s);
539           while (ch_isblank(**pp))
540                     (*pp)++;
541           if (strncmp(*pp, s, len) == 0) {
542                     *pp += len;
543                     return true;
544           }
545           return false;
546 }
547 
548 static void
lex_indent_comment(void)549 lex_indent_comment(void)
550 {
551           const char *p = in.line.s;
552           if (skip(&p, "/*") && skip(&p, "INDENT")) {
553                     enum indent_enabled enabled;
554                     if (skip(&p, "ON") || *p == '*')
555                               enabled = indent_last_off_line;
556                     else if (skip(&p, "OFF"))
557                               enabled = indent_off;
558                     else
559                               return;
560                     if (skip(&p, "*/\n")) {
561                               if (lab.len > 0 || code.len > 0 || com.len > 0)
562                                         output_line();
563                               indent_enabled = enabled;
564                     }
565           }
566 }
567 
568 /* Reads the next token, placing it in the global variable "token". */
569 lexer_symbol
lexi(void)570 lexi(void)
571 {
572           buf_clear(&token);
573 
574           for (;;) {
575                     if (ch_isblank(*in.p))
576                               in.p++;
577                     else if (skip_line_continuation())
578                               continue;
579                     else
580                               break;
581           }
582           in.token_start_line = in.token_end_line;
583 
584           lexer_symbol alnum_lsym = lexi_alnum();
585           if (alnum_lsym != lsym_eof)
586                     return alnum_lsym;
587 
588           /* Scan a non-alphanumeric token */
589 
590           token_add_char(inp_next());
591 
592           lexer_symbol lsym;
593           bool next_unary;
594 
595           switch (token.s[token.len - 1]) {
596 
597           case '#':
598                     lsym = lsym_preprocessing;
599                     next_unary = ps.next_unary;
600                     break;
601 
602           case '\n':
603                     /* if data has been exhausted, the '\n' is a dummy. */
604                     lsym = had_eof ? lsym_eof : lsym_newline;
605                     next_unary = ps.next_unary;
606                     break;
607 
608           /* INDENT OFF */
609           case ')': lsym = lsym_rparen; next_unary = false; break;
610           case '[': lsym = lsym_lbracket;         next_unary = true;  break;
611           case ']': lsym = lsym_rbracket;         next_unary = false; break;
612           case '{': lsym = lsym_lbrace; next_unary = true;  break;
613           case '}': lsym = lsym_rbrace; next_unary = true;  break;
614           case '.': lsym = lsym_period; next_unary = false; break;
615           case '?': lsym = lsym_question;         next_unary = true;  break;
616           case ',': lsym = lsym_comma;  next_unary = true;  break;
617           case ';': lsym = lsym_semicolon;        next_unary = true;  break;
618           /* INDENT ON */
619 
620           case '(':
621                     if (in.p == in.line.s + 1)
622                               check_parenthesized_function_definition();
623                     lsym = lsym_lparen;
624                     next_unary = true;
625                     break;
626 
627           case '+':
628           case '-':
629                     lsym = ps.next_unary ? lsym_unary_op : lsym_binary_op;
630                     next_unary = true;
631 
632                     /* '++' or '--' */
633                     if (*in.p == token.s[token.len - 1]) {
634                               token_add_char(*in.p++);
635                               if (ps.prev_lsym == lsym_word ||
636                                   ps.prev_lsym == lsym_rparen ||
637                                   ps.prev_lsym == lsym_rbracket) {
638                                         lsym = ps.next_unary
639                                             ? lsym_unary_op : lsym_postfix_op;
640                                         next_unary = false;
641                               }
642 
643                     } else if (*in.p == '=') {    /* '+=' or '-=' */
644                               token_add_char(*in.p++);
645 
646                     } else if (*in.p == '>') {    /* '->' */
647                               token_add_char(*in.p++);
648                               lsym = lsym_unary_op;
649                               next_unary = false;
650                               ps.want_blank = false;
651                     }
652                     break;
653 
654           case ':':
655                     lsym = ps.quest_level > 0
656                         ? (ps.quest_level--, lsym_question_colon)
657                         : ps.in_var_decl ? lsym_other_colon : lsym_label_colon;
658                     next_unary = true;
659                     break;
660 
661           case '*':
662                     if (*in.p == '=') {
663                               token_add_char(*in.p++);
664                               lsym = lsym_binary_op;
665                     } else if (is_asterisk_unary()) {
666                               lex_asterisk_unary();
667                               lsym = lsym_unary_op;
668                     } else
669                               lsym = lsym_binary_op;
670                     next_unary = true;
671                     break;
672 
673           case '=':
674                     if (ps.in_var_decl)
675                               ps.in_init = true;
676                     if (*in.p == '=')
677                               token_add_char(*in.p++);
678                     lsym = lsym_binary_op;
679                     next_unary = true;
680                     break;
681 
682           case '>':
683           case '<':
684           case '!':           /* ops like <, <<, <=, !=, etc. */
685                     if (*in.p == '>' || *in.p == '<' || *in.p == '=')
686                               token_add_char(*in.p++);
687                     if (*in.p == '=')
688                               token_add_char(*in.p++);
689                     lsym = ps.next_unary ? lsym_unary_op : lsym_binary_op;
690                     next_unary = true;
691                     break;
692 
693           case '\'':
694           case '"':
695                     lex_char_or_string();
696                     lsym = lsym_word;
697                     next_unary = false;
698                     break;
699 
700           default:
701                     if (token.s[token.len - 1] == '/'
702                         && (*in.p == '*' || *in.p == '/')) {
703                               enum indent_enabled prev = indent_enabled;
704                               lex_indent_comment();
705                               if (prev == indent_on && indent_enabled == indent_off)
706                                         buf_clear(&out.indent_off_text);
707                               token_add_char(*in.p++);
708                               lsym = lsym_comment;
709                               next_unary = ps.next_unary;
710                               break;
711                     }
712 
713                     /* punctuation like '%', '&&', '/', '^', '||', '~' */
714                     lsym = ps.next_unary ? lsym_unary_op : lsym_binary_op;
715                     if (*in.p == token.s[token.len - 1])
716                               token_add_char(*in.p++), lsym = lsym_binary_op;
717                     if (*in.p == '=')
718                               token_add_char(*in.p++), lsym = lsym_binary_op;
719 
720                     next_unary = true;
721           }
722 
723           ps.next_unary = next_unary;
724 
725           return lsym;
726 }
727