1 /* CPP Library - lexical analysis.
2    Copyright (C) 2000-2022 Free Software Foundation, Inc.
3    Contributed by Per Bothner, 1994-95.
4    Based on CCCP program by Paul Rubin, June 1986
5    Adapted to ANSI C, Richard Stallman, Jan 1987
6    Broken out to separate file, Zack Weinberg, Mar 2000
7 
8 This program is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by the
10 Free Software Foundation; either version 3, or (at your option) any
11 later version.
12 
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16 GNU General Public License for more details.
17 
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING3.  If not see
20 <http://www.gnu.org/licenses/>.  */
21 
22 #include "config.h"
23 #include "system.h"
24 #include "cpplib.h"
25 #include "internal.h"
26 
/* How a token's spelling is obtained.  SPELL_OPERATOR is the category
   given to every OP() entry of the token table; the remaining
   categories are the ones named by its TK() entries.  */
enum spell_type
{
  SPELL_OPERATOR,       /* First enumerator, value 0.  */
  SPELL_IDENT,
  SPELL_LITERAL,
  SPELL_NONE
};
34 
35 struct token_spelling
36 {
37   enum spell_type category;
38   const unsigned char *name;
39 };
40 
41 static const unsigned char *const digraph_spellings[] =
42 { UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };
43 
44 #define OP(e, s) { SPELL_OPERATOR, UC s  },
45 #define TK(e, s) { SPELL_ ## s,    UC #e },
46 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
47 #undef OP
48 #undef TK
49 
50 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
51 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
52 
53 static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
54 static int skip_line_comment (cpp_reader *);
55 static void skip_whitespace (cpp_reader *, cppchar_t);
56 static void lex_string (cpp_reader *, cpp_token *, const uchar *);
57 static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
58 static void store_comment (cpp_reader *, cpp_token *);
59 static void create_literal (cpp_reader *, cpp_token *, const uchar *,
60                                   unsigned int, enum cpp_ttype);
61 static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
62 static int name_p (cpp_reader *, const cpp_string *);
63 static tokenrun *next_tokenrun (tokenrun *);
64 
65 static _cpp_buff *new_buff (size_t);
66 
67 
68 /* Utility routine:
69 
70    Compares, the token TOKEN to the NUL-terminated string STRING.
71    TOKEN must be a CPP_NAME.  Returns 1 for equal, 0 for unequal.  */
72 int
cpp_ideq(const cpp_token * token,const char * string)73 cpp_ideq (const cpp_token *token, const char *string)
74 {
75   if (token->type != CPP_NAME)
76     return 0;
77 
78   return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
79 }
80 
81 /* Record a note TYPE at byte POS into the current cleaned logical
82    line.  */
83 static void
add_line_note(cpp_buffer * buffer,const uchar * pos,unsigned int type)84 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
85 {
86   if (buffer->notes_used == buffer->notes_cap)
87     {
88       buffer->notes_cap = buffer->notes_cap * 2 + 200;
89       buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
90                                   buffer->notes_cap);
91     }
92 
93   buffer->notes[buffer->notes_used].pos = pos;
94   buffer->notes[buffer->notes_used].type = type;
95   buffer->notes_used++;
96 }
97 
98 
99 /* Fast path to find line special characters using optimized character
100    scanning algorithms.  Anything complicated falls back to the slow
101    path below.  Since this loop is very hot it's worth doing these kinds
102    of optimizations.
103 
104    One of the paths through the ifdefs should provide
105 
106      const uchar *search_line_fast (const uchar *s, const uchar *end);
107 
108    Between S and END, search for \n, \r, \\, ?.  Return a pointer to
109    the found character.
110 
111    Note that the last character of the buffer is *always* a newline,
112    as forced by _cpp_convert_input.  This fact can be used to avoid
113    explicitly looking for the end of the buffer.  */
114 
/* Configure only gives us an ifdef test; turn it into a value that can
   be used in ordinary expressions.  */
#if !defined (WORDS_BIGENDIAN)
#define WORDS_BIGENDIAN 0
#endif

/* We'd like the largest integer that fits into a register.  There's
   nothing in <stdint.h> that gives us that.  For most hosts this is
   unsigned long, but MS decided on an LLP64 model.  Thankfully when
   building with GCC we can ask for the "real" word size.  */
#if defined (__GNUC__)
typedef unsigned int word_type __attribute__((__mode__(__word__)));
#else
typedef unsigned long word_type;
#endif

/* The code below only handles word sizes of 4 or 8 bytes.  Any other
   size yields a negative array bound here, i.e. a compile-time
   error.  */
typedef char check_word_type_size
  [(sizeof (word_type) == 8 || sizeof (word_type) == 4) ? 1 : -1];
134 
135 /* Return X with the first N bytes forced to values that won't match one
136    of the interesting characters.  Note that NUL is not interesting.  */
137 
138 static inline word_type
acc_char_mask_misalign(word_type val,unsigned int n)139 acc_char_mask_misalign (word_type val, unsigned int n)
140 {
141   word_type mask = -1;
142   if (WORDS_BIGENDIAN)
143     mask >>= n * 8;
144   else
145     mask <<= n * 8;
146   return val & mask;
147 }
148 
149 /* Return X replicated to all byte positions within WORD_TYPE.  */
150 
151 static inline word_type
acc_char_replicate(uchar x)152 acc_char_replicate (uchar x)
153 {
154   word_type ret;
155 
156   ret = (x << 24) | (x << 16) | (x << 8) | x;
157   if (sizeof(word_type) == 8)
158     ret = (ret << 16 << 16) | ret;
159   return ret;
160 }
161 
162 /* Return non-zero if some byte of VAL is (probably) C.  */
163 
164 static inline word_type
acc_char_cmp(word_type val,word_type c)165 acc_char_cmp (word_type val, word_type c)
166 {
167 #if defined(__GNUC__) && defined(__alpha__)
168   /* We can get exact results using a compare-bytes instruction.
169      Get (val == c) via (0 >= (val ^ c)).  */
170   return __builtin_alpha_cmpbge (0, val ^ c);
171 #else
172   word_type magic = 0x7efefefeU;
173   if (sizeof(word_type) == 8)
174     magic = (magic << 16 << 16) | 0xfefefefeU;
175   magic |= 1;
176 
177   val ^= c;
178   return ((val + magic) ^ ~val) & ~magic;
179 #endif
180 }
181 
182 /* Given the result of acc_char_cmp is non-zero, return the index of
183    the found character.  If this was a false positive, return -1.  */
184 
185 static inline int
acc_char_index(word_type cmp ATTRIBUTE_UNUSED,word_type val ATTRIBUTE_UNUSED)186 acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
187                     word_type val ATTRIBUTE_UNUSED)
188 {
189 #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
190   /* The cmpbge instruction sets *bits* of the result corresponding to
191      matches in the bytes with no false positives.  */
192   return __builtin_ctzl (cmp);
193 #else
194   unsigned int i;
195 
196   /* ??? It would be nice to force unrolling here,
197      and have all of these constants folded.  */
198   for (i = 0; i < sizeof(word_type); ++i)
199     {
200       uchar c;
201       if (WORDS_BIGENDIAN)
202           c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
203       else
204           c = (val >> i * 8) & 0xff;
205 
206       if (c == '\n' || c == '\r' || c == '\\' || c == '?')
207           return i;
208     }
209 
210   return -1;
211 #endif
212 }
213 
214 /* A version of the fast scanner using bit fiddling techniques.
215 
216    For 32-bit words, one would normally perform 16 comparisons and
217    16 branches.  With this algorithm one performs 24 arithmetic
218    operations and one branch.  Whether this is faster with a 32-bit
219    word size is going to be somewhat system dependent.
220 
221    For 64-bit words, we eliminate twice the number of comparisons
222    and branches without increasing the number of arithmetic operations.
223    It's almost certainly going to be a win with 64-bit word size.  */
224 
225 static const uchar * search_line_acc_char (const uchar *, const uchar *)
226   ATTRIBUTE_UNUSED;
227 
228 static const uchar *
search_line_acc_char(const uchar * s,const uchar * end ATTRIBUTE_UNUSED)229 search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
230 {
231   const word_type repl_nl = acc_char_replicate ('\n');
232   const word_type repl_cr = acc_char_replicate ('\r');
233   const word_type repl_bs = acc_char_replicate ('\\');
234   const word_type repl_qm = acc_char_replicate ('?');
235 
236   unsigned int misalign;
237   const word_type *p;
238   word_type val, t;
239 
240   /* Align the buffer.  Mask out any bytes from before the beginning.  */
241   p = (word_type *)((uintptr_t)s & -sizeof(word_type));
242   val = *p;
243   misalign = (uintptr_t)s & (sizeof(word_type) - 1);
244   if (misalign)
245     val = acc_char_mask_misalign (val, misalign);
246 
247   /* Main loop.  */
248   while (1)
249     {
250       t  = acc_char_cmp (val, repl_nl);
251       t |= acc_char_cmp (val, repl_cr);
252       t |= acc_char_cmp (val, repl_bs);
253       t |= acc_char_cmp (val, repl_qm);
254 
255       if (__builtin_expect (t != 0, 0))
256           {
257             int i = acc_char_index (t, val);
258             if (i >= 0)
259               return (const uchar *)p + i;
260           }
261 
262       val = *++p;
263     }
264 }
265 
266 /* Disable on Solaris 2/x86 until the following problem can be properly
267    autoconfed:
268 
269    The Solaris 10+ assembler tags objects with the instruction set
270    extensions used, so SSE4.2 executables cannot run on machines that
271    don't support that extension.  */
272 
273 #if (GCC_VERSION >= 4005) && (__GNUC__ >= 5 || !defined(__PIC__)) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))
274 
/* Replicated character data to be shared between implementations.
   Recall that outside of a context with vector support we can't
   define compatible vector types, therefore these are all defined
   in terms of raw characters.  Row order: \n, \r, \\, ?.  */
#define REPL16(C) { C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, C }
static const char repl_chars[4][16] __attribute__((aligned(16))) = {
  REPL16 ('\n'),
  REPL16 ('\r'),
  REPL16 ('\\'),
  REPL16 ('?'),
};
#undef REPL16
289 
290 /* A version of the fast scanner using MMX vectorized byte compare insns.
291 
292    This uses the PMOVMSKB instruction which was introduced with "MMX2",
293    which was packaged into SSE1; it is also present in the AMD MMX
294    extension.  Mark the function as using "sse" so that we emit a real
295    "emms" instruction, rather than the 3dNOW "femms" instruction.  */
296 
297 static const uchar *
298 #ifndef __SSE__
299 __attribute__((__target__("sse")))
300 #endif
search_line_mmx(const uchar * s,const uchar * end ATTRIBUTE_UNUSED)301 search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
302 {
303   typedef char v8qi __attribute__ ((__vector_size__ (8)));
304   typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
305 
306   const v8qi repl_nl = *(const v8qi *)repl_chars[0];
307   const v8qi repl_cr = *(const v8qi *)repl_chars[1];
308   const v8qi repl_bs = *(const v8qi *)repl_chars[2];
309   const v8qi repl_qm = *(const v8qi *)repl_chars[3];
310 
311   unsigned int misalign, found, mask;
312   const v8qi *p;
313   v8qi data, t, c;
314 
315   /* Align the source pointer.  While MMX doesn't generate unaligned data
316      faults, this allows us to safely scan to the end of the buffer without
317      reading beyond the end of the last page.  */
318   misalign = (uintptr_t)s & 7;
319   p = (const v8qi *)((uintptr_t)s & -8);
320   data = *p;
321 
322   /* Create a mask for the bytes that are valid within the first
323      16-byte block.  The Idea here is that the AND with the mask
324      within the loop is "free", since we need some AND or TEST
325      insn in order to set the flags for the branch anyway.  */
326   mask = -1u << misalign;
327 
328   /* Main loop processing 8 bytes at a time.  */
329   goto start;
330   do
331     {
332       data = *++p;
333       mask = -1;
334 
335     start:
336       t = __builtin_ia32_pcmpeqb(data, repl_nl);
337       c = __builtin_ia32_pcmpeqb(data, repl_cr);
338       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
339       c = __builtin_ia32_pcmpeqb(data, repl_bs);
340       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
341       c = __builtin_ia32_pcmpeqb(data, repl_qm);
342       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
343       found = __builtin_ia32_pmovmskb (t);
344       found &= mask;
345     }
346   while (!found);
347 
348   __builtin_ia32_emms ();
349 
350   /* FOUND contains 1 in bits for which we matched a relevant
351      character.  Conversion to the byte index is trivial.  */
352   found = __builtin_ctz(found);
353   return (const uchar *)p + found;
354 }
355 
356 /* A version of the fast scanner using SSE2 vectorized byte compare insns.  */
357 
358 static const uchar *
359 #ifndef __SSE2__
360 __attribute__((__target__("sse2")))
361 #endif
search_line_sse2(const uchar * s,const uchar * end ATTRIBUTE_UNUSED)362 search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
363 {
364   typedef char v16qi __attribute__ ((__vector_size__ (16)));
365 
366   const v16qi repl_nl = *(const v16qi *)repl_chars[0];
367   const v16qi repl_cr = *(const v16qi *)repl_chars[1];
368   const v16qi repl_bs = *(const v16qi *)repl_chars[2];
369   const v16qi repl_qm = *(const v16qi *)repl_chars[3];
370 
371   unsigned int misalign, found, mask;
372   const v16qi *p;
373   v16qi data, t;
374 
375   /* Align the source pointer.  */
376   misalign = (uintptr_t)s & 15;
377   p = (const v16qi *)((uintptr_t)s & -16);
378   data = *p;
379 
380   /* Create a mask for the bytes that are valid within the first
381      16-byte block.  The Idea here is that the AND with the mask
382      within the loop is "free", since we need some AND or TEST
383      insn in order to set the flags for the branch anyway.  */
384   mask = -1u << misalign;
385 
386   /* Main loop processing 16 bytes at a time.  */
387   goto start;
388   do
389     {
390       data = *++p;
391       mask = -1;
392 
393     start:
394       t  = data == repl_nl;
395       t |= data == repl_cr;
396       t |= data == repl_bs;
397       t |= data == repl_qm;
398       found = __builtin_ia32_pmovmskb128 (t);
399       found &= mask;
400     }
401   while (!found);
402 
403   /* FOUND contains 1 in bits for which we matched a relevant
404      character.  Conversion to the byte index is trivial.  */
405   found = __builtin_ctz(found);
406   return (const uchar *)p + found;
407 }
408 
#ifdef HAVE_SSE4
/* A version of the fast scanner using SSE 4.2 vectorized string insns.
   Starting at S, return a pointer to the first \n, \r, \\ or ?.  END
   is only used to decide whether an unaligned 16-byte read at S is
   safe.  */

static const uchar *
#ifndef __SSE4_2__
__attribute__((__target__("sse4.2")))
#endif
search_line_sse42 (const uchar *s, const uchar *end)
{
  typedef char v16qi __attribute__ ((__vector_size__ (16)));
  static const v16qi search = { '\n', '\r', '?', '\\' };

  const uintptr_t addr = (uintptr_t) s;
  uintptr_t index;

  /* Check for unaligned input.  */
  if ((addr & 15) != 0)
    {
      v16qi first;

      /* With fewer than 16 bytes left in the buffer, and fewer than 16
         bytes left on the page, reading 16 bytes at this point might
         generate a spurious page fault.  Defer to the SSE2
         implementation, which already handles alignment.  */
      if (__builtin_expect (end - s < 16, 0)
          && __builtin_expect ((addr & 0xfff) > 0xff0, 0))
        return search_line_sse2 (s, end);

      /* ??? The builtin doesn't understand that the PCMPESTRI read from
         memory need not be aligned.  */
      first = __builtin_ia32_loaddqu ((const char *) s);
      index = __builtin_ia32_pcmpestri128 (search, 4, first, 16, 0);

      if (__builtin_expect (index < 16, 0))
        goto found;

      /* Advance the pointer to an aligned address.  We will re-scan a
         few bytes, but we no longer need care for reading past the
         end of a page, since we're guaranteed a match.  */
      s = (const uchar *) ((addr + 15) & -16);
    }

  /* Main loop, processing 16 bytes at a time.  */
#ifdef __GCC_ASM_FLAG_OUTPUTS__
  for (;; s += 16)
    {
      char carry;

      /* By using inline assembly instead of the builtin,
         we can use the result, as well as the flags set.  */
      __asm ("%vpcmpestri\t$0, %2, %3"
             : "=c"(index), "=@ccc"(carry)
             : "m"(*s), "x"(search), "a"(4), "d"(16));
      if (carry)
        break;
    }
#else
  /* The asm loop below adds 16 before each compare.  */
  s -= 16;
  /* By doing the whole loop in inline assembly,
     we can make proper use of the flags set.  */
  __asm (      ".balign 16\n"
          "0:       add $16, %1\n"
          "         %vpcmpestri\t$0, (%1), %2\n"
          "         jnc 0b"
          : "=&c"(index), "+r"(s)
          : "x"(search), "a"(4), "d"(16));
#endif

 found:
  return s + index;
}

#else
/* Work around out-dated assemblers without sse4 support.  */
#define search_line_sse42 search_line_sse2
#endif
489 
490 /* Check the CPU capabilities.  */
491 
492 #include "../gcc/config/i386/cpuid.h"
493 
494 typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
495 static search_line_fast_type search_line_fast;
496 
497 #define HAVE_init_vectorized_lexer 1
498 static inline void
init_vectorized_lexer(void)499 init_vectorized_lexer (void)
500 {
501   unsigned dummy, ecx = 0, edx = 0;
502   search_line_fast_type impl = search_line_acc_char;
503   int minimum = 0;
504 
505 #if defined(__SSE4_2__)
506   minimum = 3;
507 #elif defined(__SSE2__)
508   minimum = 2;
509 #elif defined(__SSE__)
510   minimum = 1;
511 #endif
512 
513   if (minimum == 3)
514     impl = search_line_sse42;
515   else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
516     {
517       if (minimum == 3 || (ecx & bit_SSE4_2))
518         impl = search_line_sse42;
519       else if (minimum == 2 || (edx & bit_SSE2))
520           impl = search_line_sse2;
521       else if (minimum == 1 || (edx & bit_SSE))
522           impl = search_line_mmx;
523     }
524   else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
525     {
526       if (minimum == 1
527             || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV))
528           impl = search_line_mmx;
529     }
530 
531   search_line_fast = impl;
532 }
533 
534 #elif (GCC_VERSION >= 4005) && defined(_ARCH_PWR8) && defined(__ALTIVEC__)
535 
536 /* A vection of the fast scanner using AltiVec vectorized byte compares
537    and VSX unaligned loads (when VSX is available).  This is otherwise
538    the same as the AltiVec version.  */
539 
540 ATTRIBUTE_NO_SANITIZE_UNDEFINED
541 static const uchar *
search_line_fast(const uchar * s,const uchar * end ATTRIBUTE_UNUSED)542 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
543 {
544   typedef __attribute__((altivec(vector))) unsigned char vc;
545 
546   const vc repl_nl = {
547     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
548     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
549   };
550   const vc repl_cr = {
551     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
552     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
553   };
554   const vc repl_bs = {
555     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
556     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
557   };
558   const vc repl_qm = {
559     '?', '?', '?', '?', '?', '?', '?', '?',
560     '?', '?', '?', '?', '?', '?', '?', '?',
561   };
562   const vc zero = { 0 };
563 
564   vc data, t;
565 
566   /* Main loop processing 16 bytes at a time.  */
567   do
568     {
569       vc m_nl, m_cr, m_bs, m_qm;
570 
571       data = __builtin_vec_vsx_ld (0, s);
572       s += 16;
573 
574       m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
575       m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
576       m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
577       m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
578       t = (m_nl | m_cr) | (m_bs | m_qm);
579 
580       /* T now contains 0xff in bytes for which we matched one of the relevant
581            characters.  We want to exit the loop if any byte in T is non-zero.
582            Below is the expansion of vec_any_ne(t, zero).  */
583     }
584   while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
585 
586   /* Restore s to to point to the 16 bytes we just processed.  */
587   s -= 16;
588 
589   {
590 #define N  (sizeof(vc) / sizeof(long))
591 
592     union {
593       vc v;
594       /* Statically assert that N is 2 or 4.  */
595       unsigned long l[(N == 2 || N == 4) ? N : -1];
596     } u;
597     unsigned long l, i = 0;
598 
599     u.v = t;
600 
601     /* Find the first word of T that is non-zero.  */
602     switch (N)
603       {
604       case 4:
605           l = u.l[i++];
606           if (l != 0)
607             break;
608           s += sizeof(unsigned long);
609           l = u.l[i++];
610           if (l != 0)
611             break;
612           s += sizeof(unsigned long);
613           /* FALLTHRU */
614       case 2:
615           l = u.l[i++];
616           if (l != 0)
617             break;
618           s += sizeof(unsigned long);
619           l = u.l[i];
620       }
621 
622     /* L now contains 0xff in bytes for which we matched one of the
623        relevant characters.  We can find the byte index by finding
624        its bit index and dividing by 8.  */
625 #ifdef __BIG_ENDIAN__
626     l = __builtin_clzl(l) >> 3;
627 #else
628     l = __builtin_ctzl(l) >> 3;
629 #endif
630     return s + l;
631 
632 #undef N
633   }
634 }
635 
636 #elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__) && defined (__BIG_ENDIAN__)
637 
638 /* A vection of the fast scanner using AltiVec vectorized byte compares.
639    This cannot be used for little endian because vec_lvsl/lvsr are
640    deprecated for little endian and the code won't work properly.  */
641 /* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
642    so we can't compile this function without -maltivec on the command line
643    (or implied by some other switch).  */
644 
645 static const uchar *
search_line_fast(const uchar * s,const uchar * end ATTRIBUTE_UNUSED)646 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
647 {
648   typedef __attribute__((altivec(vector))) unsigned char vc;
649 
650   const vc repl_nl = {
651     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
652     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
653   };
654   const vc repl_cr = {
655     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
656     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
657   };
658   const vc repl_bs = {
659     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
660     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
661   };
662   const vc repl_qm = {
663     '?', '?', '?', '?', '?', '?', '?', '?',
664     '?', '?', '?', '?', '?', '?', '?', '?',
665   };
666   const vc ones = {
667     -1, -1, -1, -1, -1, -1, -1, -1,
668     -1, -1, -1, -1, -1, -1, -1, -1,
669   };
670   const vc zero = { 0 };
671 
672   vc data, mask, t;
673 
674   /* Altivec loads automatically mask addresses with -16.  This lets us
675      issue the first load as early as possible.  */
676   data = __builtin_vec_ld(0, (const vc *)s);
677 
678   /* Discard bytes before the beginning of the buffer.  Do this by
679      beginning with all ones and shifting in zeros according to the
680      mis-alignment.  The LVSR instruction pulls the exact shift we
681      want from the address.  */
682   mask = __builtin_vec_lvsr(0, s);
683   mask = __builtin_vec_perm(zero, ones, mask);
684   data &= mask;
685 
686   /* While altivec loads mask addresses, we still need to align S so
687      that the offset we compute at the end is correct.  */
688   s = (const uchar *)((uintptr_t)s & -16);
689 
690   /* Main loop processing 16 bytes at a time.  */
691   goto start;
692   do
693     {
694       vc m_nl, m_cr, m_bs, m_qm;
695 
696       s += 16;
697       data = __builtin_vec_ld(0, (const vc *)s);
698 
699     start:
700       m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
701       m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
702       m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
703       m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
704       t = (m_nl | m_cr) | (m_bs | m_qm);
705 
706       /* T now contains 0xff in bytes for which we matched one of the relevant
707            characters.  We want to exit the loop if any byte in T is non-zero.
708            Below is the expansion of vec_any_ne(t, zero).  */
709     }
710   while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
711 
712   {
713 #define N  (sizeof(vc) / sizeof(long))
714 
715     union {
716       vc v;
717       /* Statically assert that N is 2 or 4.  */
718       unsigned long l[(N == 2 || N == 4) ? N : -1];
719     } u;
720     unsigned long l, i = 0;
721 
722     u.v = t;
723 
724     /* Find the first word of T that is non-zero.  */
725     switch (N)
726       {
727       case 4:
728           l = u.l[i++];
729           if (l != 0)
730             break;
731           s += sizeof(unsigned long);
732           l = u.l[i++];
733           if (l != 0)
734             break;
735           s += sizeof(unsigned long);
736           /* FALLTHROUGH */
737       case 2:
738           l = u.l[i++];
739           if (l != 0)
740             break;
741           s += sizeof(unsigned long);
742           l = u.l[i];
743       }
744 
745     /* L now contains 0xff in bytes for which we matched one of the
746        relevant characters.  We can find the byte index by finding
747        its bit index and dividing by 8.  */
748     l = __builtin_clzl(l) >> 3;
749     return s + l;
750 
751 #undef N
752   }
753 }
754 
755 #elif defined (__ARM_NEON) && defined (__ARM_64BIT_STATE)
756 #include "arm_neon.h"
757 
758 /* This doesn't have to be the exact page size, but no system may use
759    a size smaller than this.  ARMv8 requires a minimum page size of
760    4k.  The impact of being conservative here is a small number of
761    cases will take the slightly slower entry path into the main
762    loop.  */
763 
764 #define AARCH64_MIN_PAGE_SIZE 4096
765 
766 static const uchar *
search_line_fast(const uchar * s,const uchar * end ATTRIBUTE_UNUSED)767 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
768 {
769   const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
770   const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
771   const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
772   const uint8x16_t repl_qm = vdupq_n_u8 ('?');
773   const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
774 
775 #ifdef __ARM_BIG_ENDIAN
776   const int16x8_t shift = {8, 8, 8, 8, 0, 0, 0, 0};
777 #else
778   const int16x8_t shift = {0, 0, 0, 0, 8, 8, 8, 8};
779 #endif
780 
781   unsigned int found;
782   const uint8_t *p;
783   uint8x16_t data;
784   uint8x16_t t;
785   uint16x8_t m;
786   uint8x16_t u, v, w;
787 
788   /* Align the source pointer.  */
789   p = (const uint8_t *)((uintptr_t)s & -16);
790 
791   /* Assuming random string start positions, with a 4k page size we'll take
792      the slow path about 0.37% of the time.  */
793   if (__builtin_expect ((AARCH64_MIN_PAGE_SIZE
794                                - (((uintptr_t) s) & (AARCH64_MIN_PAGE_SIZE - 1)))
795                               < 16, 0))
796     {
797       /* Slow path: the string starts near a possible page boundary.  */
798       uint32_t misalign, mask;
799 
800       misalign = (uintptr_t)s & 15;
801       mask = (-1u << misalign) & 0xffff;
802       data = vld1q_u8 (p);
803       t = vceqq_u8 (data, repl_nl);
804       u = vceqq_u8 (data, repl_cr);
805       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
806       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
807       t = vorrq_u8 (v, w);
808       t = vandq_u8 (t, xmask);
809       m = vpaddlq_u8 (t);
810       m = vshlq_u16 (m, shift);
811       found = vaddvq_u16 (m);
812       found &= mask;
813       if (found)
814           return (const uchar*)p + __builtin_ctz (found);
815     }
816   else
817     {
818       data = vld1q_u8 ((const uint8_t *) s);
819       t = vceqq_u8 (data, repl_nl);
820       u = vceqq_u8 (data, repl_cr);
821       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
822       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
823       t = vorrq_u8 (v, w);
824       if (__builtin_expect (vpaddd_u64 ((uint64x2_t)t) != 0, 0))
825           goto done;
826     }
827 
828   do
829     {
830       p += 16;
831       data = vld1q_u8 (p);
832       t = vceqq_u8 (data, repl_nl);
833       u = vceqq_u8 (data, repl_cr);
834       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
835       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
836       t = vorrq_u8 (v, w);
837     } while (!vpaddd_u64 ((uint64x2_t)t));
838 
839 done:
840   /* Now that we've found the terminating substring, work out precisely where
841      we need to stop.  */
842   t = vandq_u8 (t, xmask);
843   m = vpaddlq_u8 (t);
844   m = vshlq_u16 (m, shift);
845   found = vaddvq_u16 (m);
846   return (((((uintptr_t) p) < (uintptr_t) s) ? s : (const uchar *)p)
847             + __builtin_ctz (found));
848 }
849 
850 #elif defined (__ARM_NEON)
851 #include "arm_neon.h"
852 
/* NEON version of the line scanner.  Return a pointer to the first byte
   at or after S that is '\n', '\r', '\\' or '?'.  END is not consulted:
   the loop only terminates once a match is found, so the caller is
   presumably expected to guarantee that one exists (NOTE(review):
   confirm against the code that sets up the buffer).  */
static const uchar *
search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  /* One vector per character of interest, replicated into all 16 lanes.  */
  const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
  const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
  const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
  const uint8x16_t repl_qm = vdupq_n_u8 ('?');
  /* Gives each byte within an 8-byte half its own bit (0x01 ... 0x80),
     so that adding up the bytes of a half yields a bitmap of the bytes
     that matched.  NOTE(review): the lane order assumes a little-endian
     layout of the 64-bit constant -- confirm for big-endian targets.  */
  const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);

  unsigned int misalign, found, mask;
  const uint8_t *p;
  uint8x16_t data;

  /* Align the source pointer.  The first load may therefore include up
     to 15 bytes that precede S; MASK below discards them.  */
  misalign = (uintptr_t)s & 15;
  p = (const uint8_t *)((uintptr_t)s & -16);
  data = vld1q_u8 (p);

  /* Create a mask for the bytes that are valid within the first
     16-byte block.  The Idea here is that the AND with the mask
     within the loop is "free", since we need some AND or TEST
     insn in order to set the flags for the branch anyway.  */
  mask = (-1u << misalign) & 0xffff;

  /* Main loop, processing 16 bytes at a time.  Jump into the middle of
     the loop body so that the block loaded above is examined before P
     is advanced.  */
  goto start;

  do
    {
      uint8x8_t l;
      uint16x4_t m;
      uint32x2_t n;
      uint8x16_t t, u, v, w;

      p += 16;
      data = vld1q_u8 (p);
      /* Every byte of a block after the first one is valid.  */
      mask = 0xffff;

    start:
      /* Compare DATA against each of the four characters and merge the
         per-lane results.  */
      t = vceqq_u8 (data, repl_nl);
      u = vceqq_u8 (data, repl_cr);
      v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
      w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
      /* Keep one distinct bit per matching byte, then add the bytes of
         each half together with three rounds of pairwise addition; the
         two lanes of N end up holding the bitmaps of the low and the
         high 8 bytes respectively.  */
      t = vandq_u8 (vorrq_u8 (v, w), xmask);
      l = vpadd_u8 (vget_low_u8 (t), vget_high_u8 (t));
      m = vpaddl_u8 (l);
      n = vpaddl_u16 (m);

      /* Viewing N as a single 64-bit value, a right shift by 24 moves
         the high-half bitmap to bits 8-15; OR-ing it with the low-half
         bitmap gives a 16-bit match mask, one bit per byte of DATA.  */
      found = vget_lane_u32 ((uint32x2_t) vorr_u64 ((uint64x1_t) n,
              vshr_n_u64 ((uint64x1_t) n, 24)), 0);
      found &= mask;
    }
  while (!found);

  /* FOUND contains 1 in bits for which we matched a relevant
     character.  Conversion to the byte index is trivial.  */
  found = __builtin_ctz (found);
  return (const uchar *)p + found;
}
912 
913 #else
914 
915 /* We only have one accelerated alternative.  Use a direct call so that
916    we encourage inlining.  */
917 
918 #define search_line_fast  search_line_acc_char
919 
920 #endif
921 
/* Initialize the lexer if needed.  When HAVE_init_vectorized_lexer is
   defined this calls init_vectorized_lexer (); otherwise it does
   nothing.  */

void
_cpp_init_lexer (void)
{
#ifdef HAVE_init_vectorized_lexer
  init_vectorized_lexer ();
#endif
}
931 
/* Returns with a logical line that contains no escaped newlines or
   trigraphs.  This is a time-critical inner loop.

   The line starts at PFILE->buffer->next_line.  On return buffer->cur
   and buffer->line_base point at that start, the byte that ends the
   logical line has been overwritten with '\n', buffer->next_line points
   one past the last input byte consumed, and the buffer's line notes
   have been reset and refilled: one note per trigraph or escaped
   newline found, followed by a sentinel note.  */
void
_cpp_clean_line (cpp_reader *pfile)
{
  cpp_buffer *buffer;
  const uchar *s;
  uchar c, *d, *p;

  /* Discard the notes of the previous line and start the new line where
     the previous one ended.  */
  buffer = pfile->buffer;
  buffer->cur_note = buffer->notes_used = 0;
  buffer->cur = buffer->line_base = buffer->next_line;
  buffer->need_line = false;
  s = buffer->next_line;

  if (!buffer->from_stage3)
    {
      /* Most recent backslash seen on the fast path, or NULL if none.  */
      const uchar *pbackslash = NULL;

      /* Fast path.  This is the common case of an un-escaped line with
         no trigraphs.  The primary win here is by not writing any
         data back to memory until we have to.  */
      while (1)
        {
          /* Perform an optimized search for \n, \r, \\, ?.  */
          s = search_line_fast (s, buffer->rlimit);

          c = *s;
          if (c == '\\')
            {
              /* Record the location of the backslash and continue.  */
              pbackslash = s++;
            }
          else if (__builtin_expect (c == '?', 0))
            {
              if (__builtin_expect (s[1] == '?', false)
                   && _cpp_trigraph_map[s[2]])
                {
                  /* Have a trigraph.  We may or may not have to convert
                     it.  Add a line note regardless, for -Wtrigraphs.  */
                  add_line_note (buffer, s, s[2]);
                  if (CPP_OPTION (pfile, trigraphs))
                    {
                      /* We do, and that means we have to switch to the
                         slow path.  The replacement character goes
                         where the first '?' was; S is left on the last
                         byte of the trigraph because the slow path
                         pre-increments both S and D.  */
                      d = (uchar *) s;
                      *d = _cpp_trigraph_map[s[2]];
                      s += 2;
                      goto slow_path;
                    }
                }
              /* Not a trigraph.  Continue on fast-path.  */
              s++;
            }
          else
            break;
        }

      /* This must be \r or \n.  We're either done, or we'll be forced
         to write back to the buffer and continue on the slow path.  */
      d = (uchar *) s;

      if (__builtin_expect (s == buffer->rlimit, false))
        goto done;

      /* DOS line ending? */
      if (__builtin_expect (c == '\r', false) && s[1] == '\n')
        {
          s++;
          if (s == buffer->rlimit)
            goto done;
        }

      /* Without a backslash on the line there can be no escaped
         newline.  */
      if (__builtin_expect (pbackslash == NULL, true))
        goto done;

      /* Check for escaped newline.  Step back from the line terminator
         over any horizontal whitespace; the newline is escaped only if
         the character before that is the last backslash we saw.  */
      p = d;
      while (is_nvspace (p[-1]))
        p--;
      if (p - 1 != pbackslash)
        goto done;

      /* Have an escaped newline; process it and proceed to
         the slow path.  The note type records whether whitespace
         separated the backslash from the newline.  D is set to P - 2
         so that the pre-incrementing copy below overwrites the
         backslash itself, and next_line temporarily remembers where the
         splice happened.  */
      add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
      d = p - 2;
      buffer->next_line = p - 1;

    slow_path:
      /* Copy the rest of the logical line down over the removed
         characters, one byte at a time.  */
      while (1)
        {
          c = *++s;
          *++d = c;

          if (c == '\n' || c == '\r')
            {
              /* Handle DOS line endings.  */
              if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
                s++;
              if (s == buffer->rlimit)
                break;

              /* Escaped?  Scan back over horizontal whitespace, but not
                 past the previous splice point.  */
              p = d;
              while (p != buffer->next_line && is_nvspace (p[-1]))
                p--;
              if (p == buffer->next_line || p[-1] != '\\')
                break;

              add_line_note (buffer, p - 1, p != d ? ' ': '\\');
              d = p - 2;
              buffer->next_line = p - 1;
            }
          else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
            {
              /* Add a note regardless, for the benefit of -Wtrigraphs.  */
              add_line_note (buffer, d, s[2]);
              if (CPP_OPTION (pfile, trigraphs))
                {
                  *d = _cpp_trigraph_map[s[2]];
                  s += 2;
                }
            }
        }
    }
  else
    {
      /* For a from_stage3 buffer no trigraph or escaped-newline
         processing is done; just find the end of the line.  */
      while (*s != '\n' && *s != '\r')
        s++;
      d = (uchar *) s;

      /* Handle DOS line endings.  */
      if (*s == '\r' && s + 1 != buffer->rlimit && s[1] == '\n')
        s++;
    }

 done:
  /* Terminate the cleaned line at D; S is the last input byte consumed.  */
  *d = '\n';
  /* A sentinel note that should never be processed.  */
  add_line_note (buffer, d + 1, '\n');
  buffer->next_line = s + 1;
}
1075 
1076 /* Return true if the trigraph indicated by NOTE should be warned
1077    about in a comment.  */
1078 static bool
warn_in_comment(cpp_reader * pfile,_cpp_line_note * note)1079 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
1080 {
1081   const uchar *p;
1082 
1083   /* Within comments we don't warn about trigraphs, unless the
1084      trigraph forms an escaped newline, as that may change
1085      behavior.  */
1086   if (note->type != '/')
1087     return false;
1088 
1089   /* If -trigraphs, then this was an escaped newline iff the next note
1090      is coincident.  */
1091   if (CPP_OPTION (pfile, trigraphs))
1092     return note[1].pos == note->pos;
1093 
1094   /* Otherwise, see if this forms an escaped newline.  */
1095   p = note->pos + 3;
1096   while (is_nvspace (*p))
1097     p++;
1098 
1099   /* There might have been escaped newlines between the trigraph and the
1100      newline we found.  Hence the position test.  */
1101   return (*p == '\n' && p < note[1].pos);
1102 }
1103 
1104 /* Process the notes created by add_line_note as far as the current
1105    location.  */
1106 void
_cpp_process_line_notes(cpp_reader * pfile,int in_comment)1107 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
1108 {
1109   cpp_buffer *buffer = pfile->buffer;
1110 
1111   for (;;)
1112     {
1113       _cpp_line_note *note = &buffer->notes[buffer->cur_note];
1114       unsigned int col;
1115 
1116       if (note->pos > buffer->cur)
1117           break;
1118 
1119       buffer->cur_note++;
1120       col = CPP_BUF_COLUMN (buffer, note->pos + 1);
1121 
1122       if (note->type == '\\' || note->type == ' ')
1123           {
1124             if (note->type == ' ' && !in_comment)
1125               cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
1126                                          "backslash and newline separated by space");
1127 
1128             if (buffer->next_line > buffer->rlimit)
1129               {
1130                 cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
1131                                            "backslash-newline at end of file");
1132                 /* Prevent "no newline at end of file" warning.  */
1133                 buffer->next_line = buffer->rlimit;
1134               }
1135 
1136             buffer->line_base = note->pos;
1137             CPP_INCREMENT_LINE (pfile, 0);
1138           }
1139       else if (_cpp_trigraph_map[note->type])
1140           {
1141             if (CPP_OPTION (pfile, warn_trigraphs)
1142                 && (!in_comment || warn_in_comment (pfile, note)))
1143               {
1144                 if (CPP_OPTION (pfile, trigraphs))
1145                     cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
1146                                        pfile->line_table->highest_line, col,
1147                                                "trigraph ??%c converted to %c",
1148                                                note->type,
1149                                                (int) _cpp_trigraph_map[note->type]);
1150                 else
1151                     {
1152                       cpp_warning_with_line
1153                         (pfile, CPP_W_TRIGRAPHS,
1154                      pfile->line_table->highest_line, col,
1155                          "trigraph ??%c ignored, use -trigraphs to enable",
1156                          note->type);
1157                     }
1158               }
1159           }
1160       else if (note->type == 0)
1161           /* Already processed in lex_raw_string.  */;
1162       else
1163           abort ();
1164     }
1165 }
1166 
1167 namespace bidi {
1168   enum class kind {
1169     NONE, LRE, RLE, LRO, RLO, LRI, RLI, FSI, PDF, PDI, LTR, RTL
1170   };
1171 
1172   /* All the UTF-8 encodings of bidi characters start with E2.  */
1173   constexpr uchar utf8_start = 0xe2;
1174 
1175   struct context
1176   {
contextbidi::context1177     context () {}
contextbidi::context1178     context (location_t loc, kind k, bool pdf, bool ucn)
1179     : m_loc (loc), m_kind (k), m_pdf (pdf), m_ucn (ucn)
1180     {
1181     }
1182 
get_pop_kindbidi::context1183     kind get_pop_kind () const
1184     {
1185       return m_pdf ? kind::PDF : kind::PDI;
1186     }
ucn_pbidi::context1187     bool ucn_p () const
1188     {
1189       return m_ucn;
1190     }
1191 
1192     location_t m_loc;
1193     kind m_kind;
1194     unsigned m_pdf : 1;
1195     unsigned m_ucn : 1;
1196   };
1197 
1198   /* A vector holding currently open bidi contexts.  We use a char for
1199      each context, its LSB is 1 if it represents a PDF context, 0 if it
1200      represents a PDI context.  The next bit is 1 if this context was open
1201      by a bidi character written as a UCN, and 0 when it was UTF-8.  */
1202   semi_embedded_vec <context, 16> vec;
1203 
1204   /* Close the whole comment/identifier/string literal/character constant
1205      context.  */
on_close()1206   void on_close ()
1207   {
1208     vec.truncate (0);
1209   }
1210 
1211   /* Pop the last element in the vector.  */
pop()1212   void pop ()
1213   {
1214     unsigned int len = vec.count ();
1215     gcc_checking_assert (len > 0);
1216     vec.truncate (len - 1);
1217   }
1218 
1219   /* Return the pop kind of the context of the Ith element.  */
pop_kind_at(unsigned int i)1220   kind pop_kind_at (unsigned int i)
1221   {
1222     return vec[i].get_pop_kind ();
1223   }
1224 
1225   /* Return the pop kind of the context that is currently opened.  */
current_ctx()1226   kind current_ctx ()
1227   {
1228     unsigned int len = vec.count ();
1229     if (len == 0)
1230       return kind::NONE;
1231     return vec[len - 1].get_pop_kind ();
1232   }
1233 
1234   /* Return true if the current context comes from a UCN origin, that is,
1235      the bidi char which started this bidi context was written as a UCN.  */
current_ctx_ucn_p()1236   bool current_ctx_ucn_p ()
1237   {
1238     unsigned int len = vec.count ();
1239     gcc_checking_assert (len > 0);
1240     return vec[len - 1].m_ucn;
1241   }
1242 
current_ctx_loc()1243   location_t current_ctx_loc ()
1244   {
1245     unsigned int len = vec.count ();
1246     gcc_checking_assert (len > 0);
1247     return vec[len - 1].m_loc;
1248   }
1249 
1250   /* We've read a bidi char, update the current vector as necessary.
1251      LOC is only valid when K is not kind::NONE.  */
on_char(kind k,bool ucn_p,location_t loc)1252   void on_char (kind k, bool ucn_p, location_t loc)
1253   {
1254     switch (k)
1255       {
1256       case kind::LRE:
1257       case kind::RLE:
1258       case kind::LRO:
1259       case kind::RLO:
1260           vec.push (context (loc, k, true, ucn_p));
1261           break;
1262       case kind::LRI:
1263       case kind::RLI:
1264       case kind::FSI:
1265           vec.push (context (loc, k, false, ucn_p));
1266           break;
1267       /* PDF terminates the scope of the last LRE, RLE, LRO, or RLO
1268            whose scope has not yet been terminated.  */
1269       case kind::PDF:
1270           if (current_ctx () == kind::PDF)
1271             pop ();
1272           break;
1273       /* PDI terminates the scope of the last LRI, RLI, or FSI whose
1274            scope has not yet been terminated, as well as the scopes of
1275            any subsequent LREs, RLEs, LROs, or RLOs whose scopes have not
1276            yet been terminated.  */
1277       case kind::PDI:
1278           for (int i = vec.count () - 1; i >= 0; --i)
1279             if (pop_kind_at (i) == kind::PDI)
1280               {
1281                 vec.truncate (i);
1282                 break;
1283               }
1284           break;
1285       case kind::LTR:
1286       case kind::RTL:
1287           /* These aren't popped by a PDF/PDI.  */
1288           break;
1289       ATTR_LIKELY case kind::NONE:
1290           break;
1291       default:
1292           abort ();
1293       }
1294   }
1295 
1296   /* Return a descriptive string for K.  */
to_str(kind k)1297   const char *to_str (kind k)
1298   {
1299     switch (k)
1300       {
1301       case kind::LRE:
1302           return "U+202A (LEFT-TO-RIGHT EMBEDDING)";
1303       case kind::RLE:
1304           return "U+202B (RIGHT-TO-LEFT EMBEDDING)";
1305       case kind::LRO:
1306           return "U+202D (LEFT-TO-RIGHT OVERRIDE)";
1307       case kind::RLO:
1308           return "U+202E (RIGHT-TO-LEFT OVERRIDE)";
1309       case kind::LRI:
1310           return "U+2066 (LEFT-TO-RIGHT ISOLATE)";
1311       case kind::RLI:
1312           return "U+2067 (RIGHT-TO-LEFT ISOLATE)";
1313       case kind::FSI:
1314           return "U+2068 (FIRST STRONG ISOLATE)";
1315       case kind::PDF:
1316           return "U+202C (POP DIRECTIONAL FORMATTING)";
1317       case kind::PDI:
1318           return "U+2069 (POP DIRECTIONAL ISOLATE)";
1319       case kind::LTR:
1320           return "U+200E (LEFT-TO-RIGHT MARK)";
1321       case kind::RTL:
1322           return "U+200F (RIGHT-TO-LEFT MARK)";
1323       default:
1324           abort ();
1325       }
1326   }
1327 }
1328 
1329 /* Get location_t for the range of bytes [START, START + NUM_BYTES)
1330    within the current line in FILE, with the caret at START.  */
1331 
1332 static location_t
get_location_for_byte_range_in_cur_line(cpp_reader * pfile,const unsigned char * const start,size_t num_bytes)1333 get_location_for_byte_range_in_cur_line (cpp_reader *pfile,
1334                                                    const unsigned char *const start,
1335                                                    size_t num_bytes)
1336 {
1337   gcc_checking_assert (num_bytes > 0);
1338 
1339   /* CPP_BUF_COLUMN and linemap_position_for_column both refer
1340      to offsets in bytes, but CPP_BUF_COLUMN is 0-based,
1341      whereas linemap_position_for_column is 1-based.  */
1342 
1343   /* Get 0-based offsets within the line.  */
1344   size_t start_offset = CPP_BUF_COLUMN (pfile->buffer, start);
1345   size_t end_offset = start_offset + num_bytes - 1;
1346 
1347   /* Now convert to location_t, where "columns" are 1-based byte offsets.  */
1348   location_t start_loc = linemap_position_for_column (pfile->line_table,
1349                                                                   start_offset + 1);
1350   location_t end_loc = linemap_position_for_column (pfile->line_table,
1351                                                                  end_offset + 1);
1352 
1353   if (start_loc == end_loc)
1354     return start_loc;
1355 
1356   source_range src_range;
1357   src_range.m_start = start_loc;
1358   src_range.m_finish = end_loc;
1359   location_t combined_loc = COMBINE_LOCATION_DATA (pfile->line_table,
1360                                                                start_loc,
1361                                                                src_range,
1362                                                                NULL);
1363   return combined_loc;
1364 }
1365 
1366 /* Parse a sequence of 3 bytes starting with P and return its bidi code.  */
1367 
1368 static bidi::kind
get_bidi_utf8_1(const unsigned char * const p)1369 get_bidi_utf8_1 (const unsigned char *const p)
1370 {
1371   gcc_checking_assert (p[0] == bidi::utf8_start);
1372 
1373   if (p[1] == 0x80)
1374     switch (p[2])
1375       {
1376       case 0xaa:
1377           return bidi::kind::LRE;
1378       case 0xab:
1379           return bidi::kind::RLE;
1380       case 0xac:
1381           return bidi::kind::PDF;
1382       case 0xad:
1383           return bidi::kind::LRO;
1384       case 0xae:
1385           return bidi::kind::RLO;
1386       case 0x8e:
1387           return bidi::kind::LTR;
1388       case 0x8f:
1389           return bidi::kind::RTL;
1390       default:
1391           break;
1392       }
1393   else if (p[1] == 0x81)
1394     switch (p[2])
1395       {
1396       case 0xa6:
1397           return bidi::kind::LRI;
1398       case 0xa7:
1399           return bidi::kind::RLI;
1400       case 0xa8:
1401           return bidi::kind::FSI;
1402       case 0xa9:
1403           return bidi::kind::PDI;
1404       default:
1405           break;
1406       }
1407 
1408   return bidi::kind::NONE;
1409 }
1410 
1411 /* Parse a sequence of 3 bytes starting with P and return its bidi code.
1412    If the kind is not NONE, write the location to *OUT.*/
1413 
1414 static bidi::kind
get_bidi_utf8(cpp_reader * pfile,const unsigned char * const p,location_t * out)1415 get_bidi_utf8 (cpp_reader *pfile, const unsigned char *const p, location_t *out)
1416 {
1417   bidi::kind result = get_bidi_utf8_1 (p);
1418   if (result != bidi::kind::NONE)
1419     {
1420       /* We have a sequence of 3 bytes starting at P.  */
1421       *out = get_location_for_byte_range_in_cur_line (pfile, p, 3);
1422     }
1423   return result;
1424 }
1425 
1426 /* Parse a UCN where P points just past \u or \U and return its bidi code.  */
1427 
1428 static bidi::kind
get_bidi_ucn_1(const unsigned char * p,bool is_U)1429 get_bidi_ucn_1 (const unsigned char *p, bool is_U)
1430 {
1431   /* 6.4.3 Universal Character Names
1432       \u hex-quad
1433       \U hex-quad hex-quad
1434      where \unnnn means \U0000nnnn.  */
1435 
1436   if (is_U)
1437     {
1438       if (p[0] != '0' || p[1] != '0' || p[2] != '0' || p[3] != '0')
1439           return bidi::kind::NONE;
1440       /* Skip 4B so we can treat \u and \U the same below.  */
1441       p += 4;
1442     }
1443 
1444   /* All code points we are looking for start with 20xx.  */
1445   if (p[0] != '2' || p[1] != '0')
1446     return bidi::kind::NONE;
1447   else if (p[2] == '2')
1448     switch (p[3])
1449       {
1450       case 'a':
1451       case 'A':
1452           return bidi::kind::LRE;
1453       case 'b':
1454       case 'B':
1455           return bidi::kind::RLE;
1456       case 'c':
1457       case 'C':
1458           return bidi::kind::PDF;
1459       case 'd':
1460       case 'D':
1461           return bidi::kind::LRO;
1462       case 'e':
1463       case 'E':
1464           return bidi::kind::RLO;
1465       default:
1466           break;
1467       }
1468   else if (p[2] == '6')
1469     switch (p[3])
1470       {
1471       case '6':
1472           return bidi::kind::LRI;
1473       case '7':
1474           return bidi::kind::RLI;
1475       case '8':
1476           return bidi::kind::FSI;
1477       case '9':
1478           return bidi::kind::PDI;
1479       default:
1480           break;
1481       }
1482   else if (p[2] == '0')
1483     switch (p[3])
1484       {
1485       case 'e':
1486       case 'E':
1487           return bidi::kind::LTR;
1488       case 'f':
1489       case 'F':
1490           return bidi::kind::RTL;
1491       default:
1492           break;
1493       }
1494 
1495   return bidi::kind::NONE;
1496 }
1497 
1498 /* Parse a UCN where P points just past \u or \U and return its bidi code.
1499    If the kind is not NONE, write the location to *OUT.*/
1500 
1501 static bidi::kind
get_bidi_ucn(cpp_reader * pfile,const unsigned char * p,bool is_U,location_t * out)1502 get_bidi_ucn (cpp_reader *pfile,  const unsigned char *p, bool is_U,
1503                 location_t *out)
1504 {
1505   bidi::kind result = get_bidi_ucn_1 (p, is_U);
1506   if (result != bidi::kind::NONE)
1507     {
1508       const unsigned char *start = p - 2;
1509       size_t num_bytes = 2 + (is_U ? 8 : 4);
1510       *out = get_location_for_byte_range_in_cur_line (pfile, start, num_bytes);
1511     }
1512   return result;
1513 }
1514 
1515 /* Subclass of rich_location for reporting on unpaired UTF-8
1516    bidirectional control character(s).
1517    Escape the source lines on output, and show all unclosed
1518    bidi context, labelling everything.  */
1519 
1520 class unpaired_bidi_rich_location : public rich_location
1521 {
1522  public:
1523   class custom_range_label : public range_label
1524   {
1525    public:
get_text(unsigned range_idx) const1526      label_text get_text (unsigned range_idx) const FINAL OVERRIDE
1527      {
1528        /* range 0 is the primary location; each subsequent range i + 1
1529             is for bidi::vec[i].  */
1530        if (range_idx > 0)
1531            {
1532              const bidi::context &ctxt (bidi::vec[range_idx - 1]);
1533              return label_text::borrow (bidi::to_str (ctxt.m_kind));
1534            }
1535        else
1536            return label_text::borrow (_("end of bidirectional context"));
1537      }
1538   };
1539 
unpaired_bidi_rich_location(cpp_reader * pfile,location_t loc)1540   unpaired_bidi_rich_location (cpp_reader *pfile, location_t loc)
1541   : rich_location (pfile->line_table, loc, &m_custom_label)
1542   {
1543     set_escape_on_output (true);
1544     for (unsigned i = 0; i < bidi::vec.count (); i++)
1545       add_range (bidi::vec[i].m_loc,
1546                      SHOW_RANGE_WITHOUT_CARET,
1547                      &m_custom_label);
1548   }
1549 
1550  private:
1551    custom_range_label m_custom_label;
1552 };
1553 
1554 /* We're closing a bidi context, that is, we've encountered a newline,
1555    are closing a C-style comment, or are at the end of a string literal,
1556    character constant, or identifier.  Warn if this context was not
1557    properly terminated by a PDI or PDF.  P points to the last character
1558    in this context.  */
1559 
1560 static void
maybe_warn_bidi_on_close(cpp_reader * pfile,const uchar * p)1561 maybe_warn_bidi_on_close (cpp_reader *pfile, const uchar *p)
1562 {
1563   const auto warn_bidi = CPP_OPTION (pfile, cpp_warn_bidirectional);
1564   if (bidi::vec.count () > 0
1565       && (warn_bidi & bidirectional_unpaired
1566             && (!bidi::current_ctx_ucn_p ()
1567                 || (warn_bidi & bidirectional_ucn))))
1568     {
1569       const location_t loc
1570           = linemap_position_for_column (pfile->line_table,
1571                                                CPP_BUF_COLUMN (pfile->buffer, p));
1572       unpaired_bidi_rich_location rich_loc (pfile, loc);
1573       /* cpp_callbacks doesn't yet have a way to handle singular vs plural
1574            forms of a diagnostic, so fake it for now.  */
1575       if (bidi::vec.count () > 1)
1576           cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1577                               "unpaired UTF-8 bidirectional control characters "
1578                               "detected");
1579       else
1580           cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1581                               "unpaired UTF-8 bidirectional control character "
1582                               "detected");
1583     }
1584   /* We're done with this context.  */
1585   bidi::on_close ();
1586 }
1587 
1588 /* We're at the beginning or in the middle of an identifier/comment/string
1589    literal/character constant.  Warn if we've encountered a bidi character.
1590    KIND says which bidi control character it was; UCN_P is true iff this bidi
1591    control character was written as a UCN.  LOC is the location of the
1592    character, but is only valid if KIND != bidi::kind::NONE.  */
1593 
1594 static void
maybe_warn_bidi_on_char(cpp_reader * pfile,bidi::kind kind,bool ucn_p,location_t loc)1595 maybe_warn_bidi_on_char (cpp_reader *pfile, bidi::kind kind,
1596                                bool ucn_p, location_t loc)
1597 {
1598   if (__builtin_expect (kind == bidi::kind::NONE, 1))
1599     return;
1600 
1601   const auto warn_bidi = CPP_OPTION (pfile, cpp_warn_bidirectional);
1602 
1603   if (warn_bidi & (bidirectional_unpaired|bidirectional_any))
1604     {
1605       rich_location rich_loc (pfile->line_table, loc);
1606       rich_loc.set_escape_on_output (true);
1607 
1608       /* It seems excessive to warn about a PDI/PDF that is closing
1609            an opened context because we've already warned about the
1610            opening character.  Except warn when we have a UCN x UTF-8
1611            mismatch, if UCN checking is enabled.  */
1612       if (kind == bidi::current_ctx ())
1613           {
1614             if (warn_bidi == (bidirectional_unpaired|bidirectional_ucn)
1615                 && bidi::current_ctx_ucn_p () != ucn_p)
1616               {
1617                 rich_loc.add_range (bidi::current_ctx_loc ());
1618                 cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1619                                     "UTF-8 vs UCN mismatch when closing "
1620                                     "a context by \"%s\"", bidi::to_str (kind));
1621               }
1622           }
1623       else if (warn_bidi & bidirectional_any
1624                  && (!ucn_p || (warn_bidi & bidirectional_ucn)))
1625           {
1626             if (kind == bidi::kind::PDF || kind == bidi::kind::PDI)
1627               cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1628                                   "\"%s\" is closing an unopened context",
1629                                   bidi::to_str (kind));
1630             else
1631               cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1632                                   "found problematic Unicode character \"%s\"",
1633                                   bidi::to_str (kind));
1634           }
1635     }
1636   /* We're done with this context.  */
1637   bidi::on_char (kind, ucn_p, loc);
1638 }
1639 
/* Skip a C-style block comment.  We find the end of the comment by
   seeing if an asterisk is before every '/' we encounter.  Returns
   nonzero if comment terminated by EOF, zero otherwise.

   Buffer->cur points to the initial asterisk of the comment.  On a
   normal return buffer->cur points just past the closing '/'.  When
   PFILE->warn_bidi_p () is true, bidirectional control characters
   inside the comment are tracked and diagnosed as well.  */
bool
_cpp_skip_block_comment (cpp_reader *pfile)
{
  cpp_buffer *buffer = pfile->buffer;
  const uchar *cur = buffer->cur;
  uchar c;
  const bool warn_bidi_p = pfile->warn_bidi_p ();

  /* Step over the opening '*'.  If a '/' follows immediately, step over
     it too, so that the '*' of the opener cannot also be taken for the
     '*' of the terminator.  */
  cur++;
  if (*cur == '/')
    cur++;

  for (;;)
    {
      /* People like decorating comments with '*', so check for '/'
         instead for efficiency.  */
      c = *cur++;

      if (c == '/')
        {
          /* CUR is already past the '/', so cur[-2] is the character
             that preceded it.  */
          if (cur[-2] == '*')
            {
              if (warn_bidi_p)
                maybe_warn_bidi_on_close (pfile, cur);
              break;
            }

          /* Warn about potential nested comments, but not if the '/'
             comes immediately before the true comment delimiter.
             Don't bother to get it right across escaped newlines.  */
          if (CPP_OPTION (pfile, warn_comments)
              && cur[0] == '*' && cur[1] != '/')
            {
              /* CPP_BUF_COL reads buffer->cur, so publish the local
                 cursor first.  */
              buffer->cur = cur;
              cpp_warning_with_line (pfile, CPP_W_COMMENTS,
                                     pfile->line_table->highest_line,
                                     CPP_BUF_COL (buffer),
                                     "\"/*\" within comment");
            }
        }
      else if (c == '\n')
        {
          unsigned int cols;
          /* Leave buffer->cur on the newline itself while the notes of
             the finished line are processed.  */
          buffer->cur = cur - 1;
          if (warn_bidi_p)
            maybe_warn_bidi_on_close (pfile, cur);
          _cpp_process_line_notes (pfile, true);
          /* No more input: the comment is unterminated.  */
          if (buffer->next_line >= buffer->rlimit)
            return true;
          _cpp_clean_line (pfile);

          cols = buffer->next_line - buffer->line_base;
          CPP_INCREMENT_LINE (pfile, cols);

          /* Resume scanning at the start of the freshly cleaned line.  */
          cur = buffer->cur;
        }
      /* If this is a beginning of a UTF-8 encoding, it might be
         a bidirectional control character.  */
      else if (__builtin_expect (c == bidi::utf8_start, 0) && warn_bidi_p)
        {
          location_t loc;
          bidi::kind kind = get_bidi_utf8 (pfile, cur - 1, &loc);
          maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
        }
    }

  buffer->cur = cur;
  _cpp_process_line_notes (pfile, true);
  return false;
}
1715 
1716 /* Skip a C++ line comment, leaving buffer->cur pointing to the
1717    terminating newline.  Handles escaped newlines.  Returns nonzero
1718    if a multiline comment.  */
1719 static int
skip_line_comment(cpp_reader * pfile)1720 skip_line_comment (cpp_reader *pfile)
1721 {
1722   cpp_buffer *buffer = pfile->buffer;
1723   location_t orig_line = pfile->line_table->highest_line;
1724   const bool warn_bidi_p = pfile->warn_bidi_p ();
1725 
1726   if (!warn_bidi_p)
1727     while (*buffer->cur != '\n')
1728       buffer->cur++;
1729   else
1730     {
1731       while (*buffer->cur != '\n'
1732                && *buffer->cur != bidi::utf8_start)
1733           buffer->cur++;
1734       if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0))
1735           {
1736             while (*buffer->cur != '\n')
1737               {
1738                 if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0))
1739                     {
1740                       location_t loc;
1741                       bidi::kind kind = get_bidi_utf8 (pfile, buffer->cur, &loc);
1742                       maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
1743                     }
1744                 buffer->cur++;
1745               }
1746             maybe_warn_bidi_on_close (pfile, buffer->cur);
1747           }
1748     }
1749 
1750   _cpp_process_line_notes (pfile, true);
1751   return orig_line != pfile->line_table->highest_line;
1752 }
1753 
1754 /* Skips whitespace, saving the next non-whitespace character.  */
1755 static void
skip_whitespace(cpp_reader * pfile,cppchar_t c)1756 skip_whitespace (cpp_reader *pfile, cppchar_t c)
1757 {
1758   cpp_buffer *buffer = pfile->buffer;
1759   bool saw_NUL = false;
1760 
1761   do
1762     {
1763       /* Horizontal space always OK.  */
1764       if (c == ' ' || c == '\t')
1765           ;
1766       /* Just \f \v or \0 left.  */
1767       else if (c == '\0')
1768           saw_NUL = true;
1769       else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
1770           cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
1771                                    CPP_BUF_COL (buffer),
1772                                    "%s in preprocessing directive",
1773                                    c == '\f' ? "form feed" : "vertical tab");
1774 
1775       c = *buffer->cur++;
1776     }
1777   /* We only want non-vertical space, i.e. ' ' \t \f \v \0.  */
1778   while (is_nvspace (c));
1779 
1780   if (saw_NUL)
1781     {
1782       encoding_rich_location rich_loc (pfile);
1783       cpp_error_at (pfile, CPP_DL_WARNING, &rich_loc,
1784                         "null character(s) ignored");
1785     }
1786 
1787   buffer->cur--;
1788 }
1789 
1790 /* See if the characters of a number token are valid in a name (no
1791    '.', '+' or '-').  */
1792 static int
name_p(cpp_reader * pfile,const cpp_string * string)1793 name_p (cpp_reader *pfile, const cpp_string *string)
1794 {
1795   unsigned int i;
1796 
1797   for (i = 0; i < string->len; i++)
1798     if (!is_idchar (string->text[i]))
1799       return 0;
1800 
1801   return 1;
1802 }
1803 
1804 /* After parsing an identifier or other sequence, produce a warning about
1805    sequences not in NFC/NFKC.  */
1806 static void
warn_about_normalization(cpp_reader * pfile,const cpp_token * token,const struct normalize_state * s)1807 warn_about_normalization (cpp_reader *pfile,
1808                                 const cpp_token *token,
1809                                 const struct normalize_state *s)
1810 {
1811   if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
1812       && !pfile->state.skipping)
1813     {
1814       location_t loc = token->src_loc;
1815 
1816       /* If possible, create a location range for the token.  */
1817       if (loc >= RESERVED_LOCATION_COUNT
1818             && token->type != CPP_EOF
1819             /* There must be no line notes to process.  */
1820             && (!(pfile->buffer->cur
1821                     >= pfile->buffer->notes[pfile->buffer->cur_note].pos
1822                     && !pfile->overlaid_buffer)))
1823           {
1824             source_range tok_range;
1825             tok_range.m_start = loc;
1826             tok_range.m_finish
1827               = linemap_position_for_column (pfile->line_table,
1828                                                      CPP_BUF_COLUMN (pfile->buffer,
1829                                                                          pfile->buffer->cur));
1830             loc = COMBINE_LOCATION_DATA (pfile->line_table,
1831                                                loc, tok_range, NULL);
1832           }
1833 
1834       encoding_rich_location rich_loc (pfile, loc);
1835 
1836       /* Make sure that the token is printed using UCNs, even
1837            if we'd otherwise happily print UTF-8.  */
1838       unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
1839       size_t sz;
1840 
1841       sz = cpp_spell_token (pfile, token, buf, false) - buf;
1842       if (NORMALIZE_STATE_RESULT (s) == normalized_C)
1843           cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
1844                               "`%.*s' is not in NFKC", (int) sz, buf);
1845       else if (CPP_OPTION (pfile, cplusplus))
1846           cpp_pedwarning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
1847                                           "`%.*s' is not in NFC", (int) sz, buf);
1848       else
1849           cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
1850                               "`%.*s' is not in NFC", (int) sz, buf);
1851       free (buf);
1852     }
1853 }
1854 
1855 static const cppchar_t utf8_signifier = 0xC0;
1856 
1857 /* Returns TRUE if the sequence starting at buffer->cur is valid in
1858    an identifier.  FIRST is TRUE if this starts an identifier.  */
1859 
1860 static bool
forms_identifier_p(cpp_reader * pfile,int first,struct normalize_state * state)1861 forms_identifier_p (cpp_reader *pfile, int first,
1862                         struct normalize_state *state)
1863 {
1864   cpp_buffer *buffer = pfile->buffer;
1865   const bool warn_bidi_p = pfile->warn_bidi_p ();
1866 
1867   if (*buffer->cur == '$')
1868     {
1869       if (!CPP_OPTION (pfile, dollars_in_ident))
1870           return false;
1871 
1872       buffer->cur++;
1873       if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
1874           {
1875             CPP_OPTION (pfile, warn_dollars) = 0;
1876             cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
1877           }
1878 
1879       return true;
1880     }
1881 
1882   /* Is this a syntactically valid UCN or a valid UTF-8 char?  */
1883   if (CPP_OPTION (pfile, extended_identifiers))
1884     {
1885       cppchar_t s;
1886       if (*buffer->cur >= utf8_signifier)
1887           {
1888             if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0)
1889                 && warn_bidi_p)
1890               {
1891                 location_t loc;
1892                 bidi::kind kind = get_bidi_utf8 (pfile, buffer->cur, &loc);
1893                 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
1894               }
1895             if (_cpp_valid_utf8 (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
1896                                      state, &s))
1897               return true;
1898           }
1899       else if (*buffer->cur == '\\'
1900                  && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
1901           {
1902             buffer->cur += 2;
1903             if (warn_bidi_p)
1904               {
1905                 location_t loc;
1906                 bidi::kind kind = get_bidi_ucn (pfile,
1907                                                         buffer->cur,
1908                                                         buffer->cur[-1] == 'U',
1909                                                         &loc);
1910                 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/true, loc);
1911               }
1912             if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
1913                                     state, &s, NULL, NULL))
1914               return true;
1915             buffer->cur -= 2;
1916           }
1917     }
1918 
1919   return false;
1920 }
1921 
1922 /* Helper function to issue error about improper __VA_OPT__ use.  */
1923 static void
maybe_va_opt_error(cpp_reader * pfile)1924 maybe_va_opt_error (cpp_reader *pfile)
1925 {
1926   if (CPP_PEDANTIC (pfile) && !CPP_OPTION (pfile, va_opt))
1927     {
1928       /* __VA_OPT__ should not be accepted at all, but allow it in
1929            system headers.  */
1930       if (!_cpp_in_system_header (pfile))
1931           cpp_error (pfile, CPP_DL_PEDWARN,
1932                        "__VA_OPT__ is not available until C++20");
1933     }
1934   else if (!pfile->state.va_args_ok)
1935     {
1936       /* __VA_OPT__ should only appear in the replacement list of a
1937            variadic macro.  */
1938       cpp_error (pfile, CPP_DL_PEDWARN,
1939                      "__VA_OPT__ can only appear in the expansion"
1940                      " of a C++20 variadic macro");
1941     }
1942 }
1943 
1944 /* Helper function to get the cpp_hashnode of the identifier BASE.  */
1945 static cpp_hashnode *
lex_identifier_intern(cpp_reader * pfile,const uchar * base)1946 lex_identifier_intern (cpp_reader *pfile, const uchar *base)
1947 {
1948   cpp_hashnode *result;
1949   const uchar *cur;
1950   unsigned int len;
1951   unsigned int hash = HT_HASHSTEP (0, *base);
1952 
1953   cur = base + 1;
1954   while (ISIDNUM (*cur))
1955     {
1956       hash = HT_HASHSTEP (hash, *cur);
1957       cur++;
1958     }
1959   len = cur - base;
1960   hash = HT_HASHFINISH (hash, len);
1961   result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1962                                                         base, len, hash, HT_ALLOC));
1963 
1964   /* Rarely, identifiers require diagnostics when lexed.  */
1965   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1966                               && !pfile->state.skipping, 0))
1967     {
1968       /* It is allowed to poison the same identifier twice.  */
1969       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1970           cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1971                        NODE_NAME (result));
1972 
1973       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1974            replacement list of a variadic macro.  */
1975       if (result == pfile->spec_nodes.n__VA_ARGS__
1976             && !pfile->state.va_args_ok)
1977           {
1978             if (CPP_OPTION (pfile, cplusplus))
1979               cpp_error (pfile, CPP_DL_PEDWARN,
1980                            "__VA_ARGS__ can only appear in the expansion"
1981                            " of a C++11 variadic macro");
1982             else
1983               cpp_error (pfile, CPP_DL_PEDWARN,
1984                            "__VA_ARGS__ can only appear in the expansion"
1985                            " of a C99 variadic macro");
1986           }
1987 
1988       if (result == pfile->spec_nodes.n__VA_OPT__)
1989           maybe_va_opt_error (pfile);
1990 
1991       /* For -Wc++-compat, warn about use of C++ named operators.  */
1992       if (result->flags & NODE_WARN_OPERATOR)
1993           cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1994                          "identifier \"%s\" is a special operator name in C++",
1995                          NODE_NAME (result));
1996     }
1997 
1998   return result;
1999 }
2000 
2001 /* Get the cpp_hashnode of an identifier specified by NAME in
2002    the current cpp_reader object.  If none is found, NULL is returned.  */
2003 cpp_hashnode *
_cpp_lex_identifier(cpp_reader * pfile,const char * name)2004 _cpp_lex_identifier (cpp_reader *pfile, const char *name)
2005 {
2006   cpp_hashnode *result;
2007   result = lex_identifier_intern (pfile, (uchar *) name);
2008   return result;
2009 }
2010 
2011 /* Lex an identifier starting at BUFFER->CUR - 1.  */
2012 static cpp_hashnode *
lex_identifier(cpp_reader * pfile,const uchar * base,bool starts_ucn,struct normalize_state * nst,cpp_hashnode ** spelling)2013 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
2014                     struct normalize_state *nst, cpp_hashnode **spelling)
2015 {
2016   cpp_hashnode *result;
2017   const uchar *cur;
2018   unsigned int len;
2019   unsigned int hash = HT_HASHSTEP (0, *base);
2020   const bool warn_bidi_p = pfile->warn_bidi_p ();
2021 
2022   cur = pfile->buffer->cur;
2023   if (! starts_ucn)
2024     {
2025       while (ISIDNUM (*cur))
2026           {
2027             hash = HT_HASHSTEP (hash, *cur);
2028             cur++;
2029           }
2030       NORMALIZE_STATE_UPDATE_IDNUM (nst, *(cur - 1));
2031     }
2032   pfile->buffer->cur = cur;
2033   if (starts_ucn || forms_identifier_p (pfile, false, nst))
2034     {
2035       /* Slower version for identifiers containing UCNs
2036            or extended chars (including $).  */
2037       do {
2038           while (ISIDNUM (*pfile->buffer->cur))
2039             {
2040               NORMALIZE_STATE_UPDATE_IDNUM (nst, *pfile->buffer->cur);
2041               pfile->buffer->cur++;
2042             }
2043       } while (forms_identifier_p (pfile, false, nst));
2044       if (warn_bidi_p)
2045           maybe_warn_bidi_on_close (pfile, pfile->buffer->cur);
2046       result = _cpp_interpret_identifier (pfile, base,
2047                                                     pfile->buffer->cur - base);
2048       *spelling = cpp_lookup (pfile, base, pfile->buffer->cur - base);
2049     }
2050   else
2051     {
2052       len = cur - base;
2053       hash = HT_HASHFINISH (hash, len);
2054 
2055       result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
2056                                                               base, len, hash, HT_ALLOC));
2057       *spelling = result;
2058     }
2059 
2060   /* Rarely, identifiers require diagnostics when lexed.  */
2061   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
2062                               && !pfile->state.skipping, 0))
2063     {
2064       /* It is allowed to poison the same identifier twice.  */
2065       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
2066           cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
2067                        NODE_NAME (result));
2068 
2069       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
2070            replacement list of a variadic macro.  */
2071       if (result == pfile->spec_nodes.n__VA_ARGS__
2072             && !pfile->state.va_args_ok)
2073           {
2074             if (CPP_OPTION (pfile, cplusplus))
2075               cpp_error (pfile, CPP_DL_PEDWARN,
2076                            "__VA_ARGS__ can only appear in the expansion"
2077                            " of a C++11 variadic macro");
2078             else
2079               cpp_error (pfile, CPP_DL_PEDWARN,
2080                            "__VA_ARGS__ can only appear in the expansion"
2081                            " of a C99 variadic macro");
2082           }
2083 
2084       /* __VA_OPT__ should only appear in the replacement list of a
2085            variadic macro.  */
2086       if (result == pfile->spec_nodes.n__VA_OPT__)
2087           maybe_va_opt_error (pfile);
2088 
2089       /* For -Wc++-compat, warn about use of C++ named operators.  */
2090       if (result->flags & NODE_WARN_OPERATOR)
2091           cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
2092                          "identifier \"%s\" is a special operator name in C++",
2093                          NODE_NAME (result));
2094     }
2095 
2096   return result;
2097 }
2098 
/* Lex a number to NUMBER starting at BUFFER->CUR - 1, i.e. the first
   character of the number has already been consumed.  NST is the
   normalization state, updated for every character scanned and
   passed on to forms_identifier_p.

   On return BUFFER->CUR is just past the number, and NUMBER holds a
   NUL-terminated copy of the spelling in storage obtained from
   _cpp_unaligned_alloc; NUMBER->len does not count the terminator.
   An error is reported for adjacent digit separators.  */
static void
lex_number (cpp_reader *pfile, cpp_string *number,
            struct normalize_state *nst)
{
  const uchar *cur;
  const uchar *base;
  uchar *dest;

  base = pfile->buffer->cur - 1;
  /* Each iteration scans one run of ordinary number characters; the
     loop repeats for as long as the run is followed by something
     forms_identifier_p accepts (and steps over).  */
  do
    {
      /* First position in this run where a digit separator directly
         follows another one, or NULL if that has not happened.  */
      const uchar *adj_digit_sep = NULL;
      cur = pfile->buffer->cur;

      /* N.B. ISIDNUM does not include $.  */
      while (ISIDNUM (*cur)
             || (*cur == '.' && !DIGIT_SEP (cur[-1]))
             || DIGIT_SEP (*cur)
             || (VALID_SIGN (*cur, cur[-1]) && !DIGIT_SEP (cur[-2])))
        {
          NORMALIZE_STATE_UPDATE_IDNUM (nst, *cur);
          /* Adjacent digit separators do not form part of the pp-number syntax.
             However, they can safely be diagnosed here as an error, since '' is
             not a valid preprocessing token.  */
          if (DIGIT_SEP (*cur) && DIGIT_SEP (cur[-1]) && !adj_digit_sep)
            adj_digit_sep = cur;
          cur++;
        }
      /* A number can't end with a digit separator.  */
      while (cur > pfile->buffer->cur && DIGIT_SEP (cur[-1]))
        --cur;
      /* Only diagnose adjacent separators that are still part of the
         number after trailing separators have been dropped.  */
      if (adj_digit_sep && adj_digit_sep < cur)
        cpp_error (pfile, CPP_DL_ERROR, "adjacent digit separators");

      pfile->buffer->cur = cur;
    }
  while (forms_identifier_p (pfile, false, nst));

  /* Copy the spelling out of the line buffer and NUL-terminate it.  */
  number->len = cur - base;
  dest = _cpp_unaligned_alloc (pfile, number->len + 1);
  memcpy (dest, base, number->len);
  dest[number->len] = '\0';
  number->text = dest;
}
2144 
2145 /* Create a token of type TYPE with a literal spelling.  */
2146 static void
create_literal(cpp_reader * pfile,cpp_token * token,const uchar * base,unsigned int len,enum cpp_ttype type)2147 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
2148                     unsigned int len, enum cpp_ttype type)
2149 {
2150   token->type = type;
2151   token->val.str.len = len;
2152   token->val.str.text = cpp_alloc_token_string (pfile, base, len);
2153 }
2154 
2155 const uchar *
cpp_alloc_token_string(cpp_reader * pfile,const unsigned char * ptr,unsigned len)2156 cpp_alloc_token_string (cpp_reader *pfile,
2157                               const unsigned char *ptr, unsigned len)
2158 {
2159   uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
2160 
2161   dest[len] = 0;
2162   memcpy (dest, ptr, len);
2163   return dest;
2164 }
2165 
2166 /* A pair of raw buffer pointers.  The currently open one is [1], the
2167    first one is [0].  Used for string literal lexing.  */
2168 struct lit_accum {
2169   _cpp_buff *first;
2170   _cpp_buff *last;
2171   const uchar *rpos;
2172   size_t accum;
2173 
lit_accumlit_accum2174   lit_accum ()
2175     : first (NULL), last (NULL), rpos (0), accum (0)
2176   {
2177   }
2178 
2179   void append (cpp_reader *, const uchar *, size_t);
2180 
2181   void read_begin (cpp_reader *);
reading_plit_accum2182   bool reading_p () const
2183   {
2184     return rpos != NULL;
2185   }
read_charlit_accum2186   char read_char ()
2187   {
2188     char c = *rpos++;
2189     if (rpos == BUFF_FRONT (last))
2190       rpos = NULL;
2191     return c;
2192   }
2193 };
2194 
2195 /* Subroutine of lex_raw_string: Append LEN chars from BASE to the buffer
2196    sequence from *FIRST_BUFF_P to LAST_BUFF_P.  */
2197 
2198 void
append(cpp_reader * pfile,const uchar * base,size_t len)2199 lit_accum::append (cpp_reader *pfile, const uchar *base, size_t len)
2200 {
2201   if (!last)
2202     /* Starting.  */
2203     first = last = _cpp_get_buff (pfile, len);
2204   else if (len > BUFF_ROOM (last))
2205     {
2206       /* There is insufficient room in the buffer.  Copy what we can,
2207            and then either extend or create a new one.  */
2208       size_t room = BUFF_ROOM (last);
2209       memcpy (BUFF_FRONT (last), base, room);
2210       BUFF_FRONT (last) += room;
2211       base += room;
2212       len -= room;
2213       accum += room;
2214 
2215       gcc_checking_assert (!rpos);
2216 
2217       last = _cpp_append_extend_buff (pfile, last, len);
2218     }
2219 
2220   memcpy (BUFF_FRONT (last), base, len);
2221   BUFF_FRONT (last) += len;
2222   accum += len;
2223 }
2224 
2225 void
read_begin(cpp_reader * pfile)2226 lit_accum::read_begin (cpp_reader *pfile)
2227 {
2228   /* We never accumulate more than 4 chars to read.  */
2229   if (BUFF_ROOM (last) < 4)
2230 
2231     last = _cpp_append_extend_buff (pfile, last, 4);
2232   rpos = BUFF_FRONT (last);
2233 }
2234 
2235 /* Returns true if a macro has been defined.
2236    This might not work if compile with -save-temps,
2237    or preprocess separately from compilation.  */
2238 
2239 static bool
is_macro(cpp_reader * pfile,const uchar * base)2240 is_macro(cpp_reader *pfile, const uchar *base)
2241 {
2242   const uchar *cur = base;
2243   if (! ISIDST (*cur))
2244     return false;
2245   unsigned int hash = HT_HASHSTEP (0, *cur);
2246   ++cur;
2247   while (ISIDNUM (*cur))
2248     {
2249       hash = HT_HASHSTEP (hash, *cur);
2250       ++cur;
2251     }
2252   hash = HT_HASHFINISH (hash, cur - base);
2253 
2254   cpp_hashnode *result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
2255                                                   base, cur - base, hash, HT_NO_INSERT));
2256 
2257   return result && cpp_macro_p (result);
2258 }
2259 
2260 /* Returns true if a literal suffix does not have the expected form
2261    and is defined as a macro.  */
2262 
2263 static bool
is_macro_not_literal_suffix(cpp_reader * pfile,const uchar * base)2264 is_macro_not_literal_suffix(cpp_reader *pfile, const uchar *base)
2265 {
2266   /* User-defined literals outside of namespace std must start with a single
2267      underscore, so assume anything of that form really is a UDL suffix.
2268      We don't need to worry about UDLs defined inside namespace std because
2269      their names are reserved, so cannot be used as macro names in valid
2270      programs.  */
2271   if (base[0] == '_' && base[1] != '_')
2272     return false;
2273   return is_macro (pfile, base);
2274 }
2275 
/* Lexes a raw string.  The stored string contains the spelling,
   including double quotes, delimiter string, '(' and ')', any leading
   'L', 'u', 'U' or 'u8' and 'R' modifier.  The created token contains
   the type of the literal, or CPP_OTHER if it was not properly
   terminated.

   BASE is the start of the token.  Updates pfile->buffer->cur to just
   after the lexed string.

   The spelling is NUL-terminated, but it is not guaranteed that this
   is the first NUL since embedded NULs are preserved.

   A raw string may span several source lines; completed lines are
   collected in a lit_accum and further lines are fetched with
   _cpp_get_fresh_line.  If the file ends before the string does, an
   error is reported, TOKEN becomes a CPP_EOF token and the buffer is
   popped.  */

static void
lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
{
  const uchar *pos = base;
  const bool warn_bidi_p = pfile->warn_bidi_p ();

  /* 'tis a pity this information isn't passed down from the lexer's
     initial categorization of the token.  */
  enum cpp_ttype type = CPP_STRING;

  if (*pos == 'L')
    {
      type = CPP_WSTRING;
      pos++;
    }
  else if (*pos == 'U')
    {
      type = CPP_STRING32;
      pos++;
    }
  else if (*pos == 'u')
    {
      if (pos[1] == '8')
        {
          type = CPP_UTF8STRING;
          pos++;
        }
      else
        type = CPP_STRING16;
      pos++;
    }

  gcc_checking_assert (pos[0] == 'R' && pos[1] == '"');
  pos += 2;

  _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];

  /* Skip notes before the ".  */
  while (note->pos < pos)
    ++note;

  lit_accum accum;

  /* The delimiter characters followed by a closing '"'; this is what
     has to follow a ')' for the string to end.  */
  uchar prefix[17];
  unsigned prefix_len = 0;
  /* PHASE_PREFIX while the delimiter is being read, PHASE_NONE inside
     the string body.  A value >= 0 is the index into PREFIX of the
     next character to match after a ')' has been seen.  */
  enum Phase
  {
   PHASE_PREFIX = -2,
   PHASE_NONE = -1,
   PHASE_SUFFIX = 0
  } phase = PHASE_PREFIX;

  for (;;)
    {
      gcc_checking_assert (note->pos >= pos);

      /* Undo any escaped newlines and trigraphs.  */
      if (!accum.reading_p () && note->pos == pos)
        switch (note->type)
          {
          case '\\':
          case ' ':
            /* Restore backslash followed by newline.  */
            accum.append (pfile, base, pos - base);
            base = pos;
            accum.read_begin (pfile);
            accum.append (pfile, UC"\\", 1);

          after_backslash:
            if (note->type == ' ')
              /* GNU backslash whitespace newline extension.  FIXME
                 could be any sequence of non-vertical space.  When we
                 can properly restore any such sequence, we should
                 mark this note as handled so _cpp_process_line_notes
                 doesn't warn.  */
              accum.append (pfile, UC" ", 1);

            accum.append (pfile, UC"\n", 1);
            note++;
            break;

          case '\n':
            /* This can happen for ??/<NEWLINE> when trigraphs are not
               being interpreted.  */
            gcc_checking_assert (!CPP_OPTION (pfile, trigraphs));
            note->type = 0;
            note++;
            break;

          default:
            gcc_checking_assert (_cpp_trigraph_map[note->type]);

            /* Don't warn about this trigraph in
               _cpp_process_line_notes, since trigraphs show up as
               trigraphs in raw strings.  */
            uchar type = note->type;
            note->type = 0;

            if (CPP_OPTION (pfile, trigraphs))
              {
                accum.append (pfile, base, pos - base);
                base = pos;
                accum.read_begin (pfile);
                accum.append (pfile, UC"??", 2);
                accum.append (pfile, &type, 1);

                /* ??/ followed by newline gets two line notes, one for
                   the trigraph and one for the backslash/newline.  */
                if (type == '/' && note[1].pos == pos)
                  {
                    note++;
                    gcc_assert (note->type == '\\' || note->type == ' ');
                    goto after_backslash;
                  }
                /* Skip the replacement character.  */
                base = ++pos;
              }

            note++;
            break;
          }

      /* Now get a char to process.  Either from an expanded note, or
         from the line buffer.  */
      bool read_note = accum.reading_p ();
      char c = read_note ? accum.read_char () : *pos++;

      if (phase == PHASE_PREFIX)
        {
          if (c == '(')
            {
              /* Done.  */
              phase = PHASE_NONE;
              prefix[prefix_len++] = '"';
            }
          else if (prefix_len < 16
                   /* Prefix chars are any of the basic character set,
                      [lex.charset] except for '
                      ()\\\t\v\f\n'. Optimized for a contiguous
                      alphabet.  */
                   /* Unlike a switch, this collapses down to one or
                      two shift and bitmask operations on an ASCII
                      system, with an outlier or two.   */
                   && (('Z' - 'A' == 25
                        ? ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
                        : ISIDST (c))
                       || (c >= '0' && c <= '9')
                       || c == '_' || c == '{' || c == '}'
                       || c == '[' || c == ']' || c == '#'
                       || c == '<' || c == '>' || c == '%'
                       || c == ':' || c == ';' || c == '.' || c == '?'
                       || c == '*' || c == '+' || c == '-' || c == '/'
                       || c == '^' || c == '&' || c == '|' || c == '~'
                       || c == '!' || c == '=' || c == ','
                       || c == '"' || c == '\''))
            prefix[prefix_len++] = c;
          else
            {
              /* Something is wrong.  */
              int col = CPP_BUF_COLUMN (pfile->buffer, pos) + read_note;
              if (prefix_len == 16)
                cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
                                     col, "raw string delimiter longer "
                                     "than 16 characters");
              else if (c == '\n')
                cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
                                     col, "invalid new-line in raw "
                                     "string delimiter");
              else
                cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
                                     col, "invalid character '%c' in "
                                     "raw string delimiter", c);
              type = CPP_OTHER;
              phase = PHASE_NONE;
              /* Continue until we get a close quote, that's probably
                 the best failure mode.  */
              prefix_len = 0;
            }
          /* A newline still has to go through the line handling
             below; every other character is finished with.  */
          if (c != '\n')
            continue;
        }

      /* A ')' has been seen: check whether the delimiter and closing
         quote follow.  */
      if (phase != PHASE_NONE)
        {
          if (prefix[phase] != c)
            phase = PHASE_NONE;
          else if (unsigned (phase + 1) == prefix_len)
            /* The whole closing sequence matched; the string ends.  */
            break;
          else
            {
              phase = Phase (phase + 1);
              continue;
            }
        }

      if (!prefix_len && c == '"')
        /* Failure mode lexing.  */
        goto out;
      else if (prefix_len && c == ')')
        phase = PHASE_SUFFIX;
      else if (!read_note && c == '\n')
        {
          /* End of the current line.  Step back onto the newline.  */
          pos--;
          pfile->buffer->cur = pos;
          if (pfile->state.in_directive
              || (pfile->state.parsing_args
                  && pfile->buffer->next_line >= pfile->buffer->rlimit))
            {
              cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
                                   "unterminated raw string");
              type = CPP_OTHER;
              goto out;
            }

          /* Save this line, including its newline, before moving on
             to the next one.  */
          accum.append (pfile, base, pos - base + 1);
          _cpp_process_line_notes (pfile, false);

          if (pfile->buffer->next_line < pfile->buffer->rlimit)
            CPP_INCREMENT_LINE (pfile, 0);
          pfile->buffer->need_line = true;

          if (!_cpp_get_fresh_line (pfile))
            {
              /* We ran out of file and failed to get a line.  */
              location_t src_loc = token->src_loc;
              token->type = CPP_EOF;
              /* Tell the compiler the line number of the EOF token.  */
              token->src_loc = pfile->line_table->highest_line;
              token->flags = BOL;
              if (accum.first)
                _cpp_release_buff (pfile, accum.first);
              cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
                                   "unterminated raw string");
              /* Now pop the buffer that _cpp_get_fresh_line did not.  */
              _cpp_pop_buffer (pfile);
              return;
            }

          /* Continue at the start of the fresh line, with its notes.  */
          pos = base = pfile->buffer->cur;
          note = &pfile->buffer->notes[pfile->buffer->cur_note];
        }
      else if (__builtin_expect ((unsigned char) c == bidi::utf8_start, 0)
               && warn_bidi_p)
        {
          location_t loc;
          bidi::kind kind = get_bidi_utf8 (pfile, pos - 1, &loc);
          maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
        }
    }

  if (warn_bidi_p)
    maybe_warn_bidi_on_close (pfile, pos);

  if (CPP_OPTION (pfile, user_literals))
    {
      /* If a string format macro, say from inttypes.h, is placed touching
         a string literal it could be parsed as a C++11 user-defined string
         literal thus breaking the program.  */
      if (is_macro_not_literal_suffix (pfile, pos))
        {
          /* Raise a warning, but do not consume subsequent tokens.  */
          if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
            cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
                                   token->src_loc, 0,
                                   "invalid suffix on literal; C++11 requires "
                                   "a space between literal and string macro");
        }
      /* Grab user defined literal suffix.  */
      else if (ISIDST (*pos))
        {
          type = cpp_userdef_string_add_type (type);
          ++pos;

          while (ISIDNUM (*pos))
            ++pos;
        }
    }

 out:
  pfile->buffer->cur = pos;
  if (!accum.accum)
    /* Nothing was accumulated: the spelling is the bytes from BASE
       to POS.  */
    create_literal (pfile, token, base, pos - base, type);
  else
    {
      /* Concatenate the accumulated buffers and the not yet
         accumulated bytes from BASE to POS.  */
      size_t extra_len = pos - base;
      uchar *dest = _cpp_unaligned_alloc (pfile, accum.accum + extra_len + 1);

      token->type = type;
      token->val.str.len = accum.accum + extra_len;
      token->val.str.text = dest;
      for (_cpp_buff *buf = accum.first; buf; buf = buf->next)
        {
          size_t len = BUFF_FRONT (buf) - buf->base;
          memcpy (dest, buf->base, len);
          dest += len;
        }
      _cpp_release_buff (pfile, accum.first);
      memcpy (dest, base, extra_len);
      dest[extra_len] = '\0';
    }
}
2589 
2590 /* Lexes a string, character constant, or angle-bracketed header file
2591    name.  The stored string contains the spelling, including opening
2592    quote and any leading 'L', 'u', 'U' or 'u8' and optional
2593    'R' modifier.  It returns the type of the literal, or CPP_OTHER
2594    if it was not properly terminated, or CPP_LESS for an unterminated
2595    header name which must be relexed as normal tokens.
2596 
2597    The spelling is NUL-terminated, but it is not guaranteed that this
2598    is the first NUL since embedded NULs are preserved.  */
2599 static void
lex_string(cpp_reader * pfile,cpp_token * token,const uchar * base)2600 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
2601 {
2602   bool saw_NUL = false;
2603   const uchar *cur;
2604   cppchar_t terminator;
2605   enum cpp_ttype type;
2606 
2607   cur = base;
2608   terminator = *cur++;
2609   if (terminator == 'L' || terminator == 'U')
2610     terminator = *cur++;
2611   else if (terminator == 'u')
2612     {
2613       terminator = *cur++;
2614       if (terminator == '8')
2615           terminator = *cur++;
2616     }
2617   if (terminator == 'R')
2618     {
2619       lex_raw_string (pfile, token, base);
2620       return;
2621     }
2622   if (terminator == '"')
2623     type = (*base == 'L' ? CPP_WSTRING :
2624               *base == 'U' ? CPP_STRING32 :
2625               *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
2626                                : CPP_STRING);
2627   else if (terminator == '\'')
2628     type = (*base == 'L' ? CPP_WCHAR :
2629               *base == 'U' ? CPP_CHAR32 :
2630               *base == 'u' ? (base[1] == '8' ? CPP_UTF8CHAR : CPP_CHAR16)
2631                                : CPP_CHAR);
2632   else
2633     terminator = '>', type = CPP_HEADER_NAME;
2634 
2635   const bool warn_bidi_p = pfile->warn_bidi_p ();
2636   for (;;)
2637     {
2638       cppchar_t c = *cur++;
2639 
2640       /* In #include-style directives, terminators are not escapable.  */
2641       if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
2642           {
2643             if ((cur[0] == 'u' || cur[0] == 'U') && warn_bidi_p)
2644               {
2645                 location_t loc;
2646                 bidi::kind kind = get_bidi_ucn (pfile, cur + 1, cur[0] == 'U',
2647                                                         &loc);
2648                 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/true, loc);
2649               }
2650             cur++;
2651           }
2652       else if (c == terminator)
2653           {
2654             if (warn_bidi_p)
2655               maybe_warn_bidi_on_close (pfile, cur - 1);
2656             break;
2657           }
2658       else if (c == '\n')
2659           {
2660             cur--;
2661             /* Unmatched quotes always yield undefined behavior, but
2662                greedy lexing means that what appears to be an unterminated
2663                header name may actually be a legitimate sequence of tokens.  */
2664             if (terminator == '>')
2665               {
2666                 token->type = CPP_LESS;
2667                 return;
2668               }
2669             type = CPP_OTHER;
2670             break;
2671           }
2672       else if (c == '\0')
2673           saw_NUL = true;
2674       else if (__builtin_expect (c == bidi::utf8_start, 0) && warn_bidi_p)
2675           {
2676             location_t loc;
2677             bidi::kind kind = get_bidi_utf8 (pfile, cur - 1, &loc);
2678             maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
2679           }
2680     }
2681 
2682   if (saw_NUL && !pfile->state.skipping)
2683     cpp_error (pfile, CPP_DL_WARNING,
2684                  "null character(s) preserved in literal");
2685 
2686   if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
2687     cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
2688                  (int) terminator);
2689 
2690   if (CPP_OPTION (pfile, user_literals))
2691     {
2692       /* If a string format macro, say from inttypes.h, is placed touching
2693            a string literal it could be parsed as a C++11 user-defined string
2694            literal thus breaking the program.  */
2695       if (is_macro_not_literal_suffix (pfile, cur))
2696           {
2697             /* Raise a warning, but do not consume subsequent tokens.  */
2698             if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
2699               cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
2700                                            token->src_loc, 0,
2701                                            "invalid suffix on literal; C++11 requires "
2702                                            "a space between literal and string macro");
2703           }
2704       /* Grab user defined literal suffix.  */
2705       else if (ISIDST (*cur))
2706           {
2707             type = cpp_userdef_char_add_type (type);
2708             type = cpp_userdef_string_add_type (type);
2709           ++cur;
2710 
2711             while (ISIDNUM (*cur))
2712               ++cur;
2713           }
2714     }
2715   else if (CPP_OPTION (pfile, cpp_warn_cxx11_compat)
2716              && is_macro (pfile, cur)
2717              && !pfile->state.skipping)
2718     cpp_warning_with_line (pfile, CPP_W_CXX11_COMPAT,
2719                                  token->src_loc, 0, "C++11 requires a space "
2720                                  "between string literal and macro");
2721 
2722   pfile->buffer->cur = cur;
2723   create_literal (pfile, token, base, cur - base, type);
2724 }
2725 
2726 /* Return the comment table. The client may not make any assumption
2727    about the ordering of the table.  */
2728 cpp_comment_table *
cpp_get_comments(cpp_reader * pfile)2729 cpp_get_comments (cpp_reader *pfile)
2730 {
2731   return &pfile->comments;
2732 }
2733 
2734 /* Append a comment to the end of the comment table. */
2735 static void
store_comment(cpp_reader * pfile,cpp_token * token)2736 store_comment (cpp_reader *pfile, cpp_token *token)
2737 {
2738   int len;
2739 
2740   if (pfile->comments.allocated == 0)
2741     {
2742       pfile->comments.allocated = 256;
2743       pfile->comments.entries = (cpp_comment *) xmalloc
2744           (pfile->comments.allocated * sizeof (cpp_comment));
2745     }
2746 
2747   if (pfile->comments.count == pfile->comments.allocated)
2748     {
2749       pfile->comments.allocated *= 2;
2750       pfile->comments.entries = (cpp_comment *) xrealloc
2751           (pfile->comments.entries,
2752            pfile->comments.allocated * sizeof (cpp_comment));
2753     }
2754 
2755   len = token->val.str.len;
2756 
2757   /* Copy comment. Note, token may not be NULL terminated. */
2758   pfile->comments.entries[pfile->comments.count].comment =
2759     (char *) xmalloc (sizeof (char) * (len + 1));
2760   memcpy (pfile->comments.entries[pfile->comments.count].comment,
2761             token->val.str.text, len);
2762   pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
2763 
2764   /* Set source location. */
2765   pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
2766 
2767   /* Increment the count of entries in the comment table. */
2768   pfile->comments.count++;
2769 }
2770 
2771 /* The stored comment includes the comment start and any terminator.  */
2772 static void
save_comment(cpp_reader * pfile,cpp_token * token,const unsigned char * from,cppchar_t type)2773 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
2774                 cppchar_t type)
2775 {
2776   unsigned char *buffer;
2777   unsigned int len, clen, i;
2778   int convert_to_c = (pfile->state.in_directive || pfile->state.parsing_args)
2779     && type == '/';
2780 
2781   len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'.  */
2782 
2783   /* C++ comments probably (not definitely) have moved past a new
2784      line, which we don't want to save in the comment.  */
2785   if (is_vspace (pfile->buffer->cur[-1]))
2786     len--;
2787 
2788   /* If we are currently in a directive or in argument parsing, then
2789      we need to store all C++ comments as C comments internally, and
2790      so we need to allocate a little extra space in that case.
2791 
2792      Note that the only time we encounter a directive here is
2793      when we are saving comments in a "#define".  */
2794   clen = convert_to_c ? len + 2 : len;
2795 
2796   buffer = _cpp_unaligned_alloc (pfile, clen);
2797 
2798   token->type = CPP_COMMENT;
2799   token->val.str.len = clen;
2800   token->val.str.text = buffer;
2801 
2802   buffer[0] = '/';
2803   memcpy (buffer + 1, from, len - 1);
2804 
2805   /* Finish conversion to a C comment, if necessary.  */
2806   if (convert_to_c)
2807     {
2808       buffer[1] = '*';
2809       buffer[clen - 2] = '*';
2810       buffer[clen - 1] = '/';
2811       /* As there can be in a C++ comments illegal sequences for C comments
2812          we need to filter them out.  */
2813       for (i = 2; i < (clen - 2); i++)
2814         if (buffer[i] == '/' && (buffer[i - 1] == '*' || buffer[i + 1] == '*'))
2815           buffer[i] = '|';
2816     }
2817 
2818   /* Finally store this comment for use by clients of libcpp. */
2819   store_comment (pfile, token);
2820 }
2821 
/* Returns true if comment at COMMENT_START is a recognized FALLTHROUGH
   comment.

   COMMENT_START points at the second character of the comment opener:
   '*' for a C block comment, anything else is treated as a C++ line
   comment.  The end of the lexed comment is taken from
   PFILE->buffer->cur.  Which spellings are accepted depends on the
   cpp_warn_implicit_fallthrough option: 0 and values above 4 accept
   nothing, 1 accepts every comment, 2 searches for a loose
   case-insensitive pattern anywhere in the comment, and 3 and 4
   require the whole comment to match one of the forms listed below.  */

static bool
fallthrough_comment_p (cpp_reader *pfile, const unsigned char *comment_start)
{
  /* FROM is the scan cursor; it starts at the first character of the
     comment's contents.  */
  const unsigned char *from = comment_start + 1;

  switch (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough))
    {
      /* For both -Wimplicit-fallthrough=0 and -Wimplicit-fallthrough=5 we
         don't recognize any comments.  The latter only checks attributes,
         the former doesn't warn.  */
    case 0:
    default:
      return false;
      /* -Wimplicit-fallthrough=1 considers any comment, no matter what
         content it has.  */
    case 1:
      return true;
    case 2:
      /* -Wimplicit-fallthrough=2 looks for (case insensitive)
         .*falls?[ \t-]*thr(u|ough).* regex.  The loop tries every
         start position that still leaves room for "fallthru".  */
      for (; (size_t) (pfile->buffer->cur - from) >= sizeof "fallthru" - 1;
           from++)
        {
          /* Is there anything like strpbrk with upper boundary, or
             memchr looking for 2 characters rather than just one?  */
          if (from[0] != 'f' && from[0] != 'F')
            continue;
          if (from[1] != 'a' && from[1] != 'A')
            continue;
          if (from[2] != 'l' && from[2] != 'L')
            continue;
          if (from[3] != 'l' && from[3] != 'L')
            continue;
          /* "fall" matched.  FROM is advanced in place, so a failure
             below resumes the search after the text consumed here.  */
          from += sizeof "fall" - 1;
          if (from[0] == 's' || from[0] == 'S')
            from++;
          while (*from == ' ' || *from == '\t' || *from == '-')
            from++;
          if (from[0] != 't' && from[0] != 'T')
            continue;
          if (from[1] != 'h' && from[1] != 'H')
            continue;
          if (from[2] != 'r' && from[2] != 'R')
            continue;
          /* "thr" matched; accept either "u" or "ough" after it.  */
          if (from[3] == 'u' || from[3] == 'U')
            return true;
          if (from[3] != 'o' && from[3] != 'O')
            continue;
          if (from[4] != 'u' && from[4] != 'U')
            continue;
          if (from[5] != 'g' && from[5] != 'G')
            continue;
          if (from[6] != 'h' && from[6] != 'H')
            continue;
          return true;
        }
      return false;
      /* Levels 3 and 4 use the whole-comment matchers that follow the
         switch.  */
    case 3:
    case 4:
      break;
    }

  /* Each branch below advances FROM past what it recognized; the test
     at the end of the function then requires FROM to sit exactly at
     the end of the comment.  */

  /* Whole comment contents:
     -fallthrough
     @fallthrough@
   */
  if (*from == '-' || *from == '@')
    {
      size_t len = sizeof "fallthrough" - 1;
      if ((size_t) (pfile->buffer->cur - from - 1) < len)
        return false;
      if (memcmp (from + 1, "fallthrough", len))
        return false;
      if (*from == '@')
        {
          /* The '@' form needs a closing '@' as well.  */
          if (from[len + 1] != '@')
            return false;
          len++;
        }
      from += 1 + len;
    }
  /* Whole comment contents (regex):
     lint -fallthrough[ \t]*
   */
  else if (*from == 'l')
    {
      size_t len = sizeof "int -fallthrough" - 1;
      if ((size_t) (pfile->buffer->cur - from - 1) < len)
        return false;
      if (memcmp (from + 1, "int -fallthrough", len))
        return false;
      from += 1 + len;
      while (*from == ' ' || *from == '\t')
        from++;
    }
  /* Whole comment contents (regex):
     [ \t]*FALLTHR(U|OUGH)[ \t]*
   */
  else if (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough) == 4)
    {
      while (*from == ' ' || *from == '\t')
        from++;
      if ((size_t) (pfile->buffer->cur - from)  < sizeof "FALLTHRU" - 1)
        return false;
      if (memcmp (from, "FALLTHR", sizeof "FALLTHR" - 1))
        return false;
      from += sizeof "FALLTHR" - 1;
      if (*from == 'U')
        from++;
      else if ((size_t) (pfile->buffer->cur - from)  < sizeof "OUGH" - 1)
        return false;
      else if (memcmp (from, "OUGH", sizeof "OUGH" - 1))
        return false;
      else
        from += sizeof "OUGH" - 1;
      while (*from == ' ' || *from == '\t')
        from++;
    }
  /* Whole comment contents (regex):
     [ \t.!]*(ELSE,? |INTENTIONAL(LY)? )?FALL(S | |-)?THR(OUGH|U)[ \t.!]*(-[^\n\r]*)?
     [ \t.!]*(Else,? |Intentional(ly)? )?Fall((s | |-)[Tt]|t)hr(ough|u)[ \t.!]*(-[^\n\r]*)?
     [ \t.!]*([Ee]lse,? |[Ii]ntentional(ly)? )?fall(s | |-)?thr(ough|u)[ \t.!]*(-[^\n\r]*)?
   */
  else
    {
      while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
        from++;
      /* F is the first letter of the word being matched; ALL_UPPER is
         set once a word has been seen spelled entirely in capitals,
         after which the remaining words must be capitals too.  */
      unsigned char f = *from;
      bool all_upper = false;
      if (f == 'E' || f == 'e')
        {
          /* Optional "else" prefix.  */
          if ((size_t) (pfile->buffer->cur - from)
              < sizeof "else fallthru" - 1)
            return false;
          if (f == 'E' && memcmp (from + 1, "LSE", sizeof "LSE" - 1) == 0)
            all_upper = true;
          else if (memcmp (from + 1, "lse", sizeof "lse" - 1))
            return false;
          from += sizeof "else" - 1;
          if (*from == ',')
            from++;
          if (*from != ' ')
            return false;
          from++;
          /* Reject case combinations the regexes do not allow:
             "ELSE f..." and "else F...".  */
          if (all_upper && *from == 'f')
            return false;
          if (f == 'e' && *from == 'F')
            return false;
          f = *from;
        }
      else if (f == 'I' || f == 'i')
        {
          /* Optional "intentional" / "intentionally" prefix.  */
          if ((size_t) (pfile->buffer->cur - from)
              < sizeof "intentional fallthru" - 1)
            return false;
          if (f == 'I' && memcmp (from + 1, "NTENTIONAL",
                                  sizeof "NTENTIONAL" - 1) == 0)
            all_upper = true;
          else if (memcmp (from + 1, "ntentional",
                           sizeof "ntentional" - 1))
            return false;
          from += sizeof "intentional" - 1;
          if (*from == ' ')
            {
              from++;
              if (all_upper && *from == 'f')
                return false;
            }
          else if (all_upper)
            {
              /* "LY " must be followed by a capital 'F'; only the
                 "LY " part is consumed.  */
              if (memcmp (from, "LY F", sizeof "LY F" - 1))
                return false;
              from += sizeof "LY " - 1;
            }
          else
            {
              if (memcmp (from, "ly ", sizeof "ly " - 1))
                return false;
              from += sizeof "ly " - 1;
            }
          if (f == 'i' && *from == 'F')
            return false;
          f = *from;
        }
      /* The word "fall" itself.  */
      if (f != 'F' && f != 'f')
        return false;
      if ((size_t) (pfile->buffer->cur - from) < sizeof "fallthru" - 1)
        return false;
      if (f == 'F' && memcmp (from + 1, "ALL", sizeof "ALL" - 1) == 0)
        all_upper = true;
      else if (all_upper)
        return false;
      else if (memcmp (from + 1, "all", sizeof "all" - 1))
        return false;
      from += sizeof "fall" - 1;
      /* Optional "s ", " " or "-" between "fall" and "thr...".  */
      if (*from == (all_upper ? 'S' : 's') && from[1] == ' ')
        from += 2;
      else if (*from == ' ' || *from == '-')
        from++;
      else if (*from != (all_upper ? 'T' : 't'))
        return false;
      /* Accept 'T' only when "fall" began with a capital 'F', and 't'
         only when the text is not all upper case.  */
      if ((f == 'f' || *from != 'T') && (all_upper || *from != 't'))
        return false;
      if ((size_t) (pfile->buffer->cur - from) < sizeof "thru" - 1)
        return false;
      if (memcmp (from + 1, all_upper ? "HRU" : "hru", sizeof "hru" - 1))
        {
          /* Not "thru", so it has to be "through".  */
          if ((size_t) (pfile->buffer->cur - from) < sizeof "through" - 1)
            return false;
          if (memcmp (from + 1, all_upper ? "HROUGH" : "hrough",
                      sizeof "hrough" - 1))
            return false;
          from += sizeof "through" - 1;
        }
      else
        from += sizeof "thru" - 1;
      while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
        from++;
      /* An optional '-' introduces free-form text that runs up to the
         end of the comment or line.  */
      if (*from == '-')
        {
          from++;
          if (*comment_start == '*')
            {
              /* Block comment: stop at NUL, newline, carriage return,
                 or a '*' that is followed by '/'; any other '*' is
                 stepped over.  */
              do
                {
                  while (*from && *from != '*'
                         && *from != '\n' && *from != '\r')
                    from++;
                  if (*from != '*' || from[1] == '/')
                    break;
                  from++;
                }
              while (1);
            }
          else
            while (*from && *from != '\n' && *from != '\r')
              from++;
        }
    }
  /* C block comment: the match must end right at the closing
     delimiter.  */
  if (*comment_start == '*')
    {
      if (*from != '*' || from[1] != '/')
        return false;
    }
  /* C++ line comment: the match must end right at the newline.  */
  else if (*from != '\n')
    return false;

  return true;
}
3076 
3077 /* Allocate COUNT tokens for RUN.  */
3078 void
_cpp_init_tokenrun(tokenrun * run,unsigned int count)3079 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
3080 {
3081   run->base = XNEWVEC (cpp_token, count);
3082   run->limit = run->base + count;
3083   run->next = NULL;
3084 }
3085 
3086 /* Returns the next tokenrun, or creates one if there is none.  */
3087 static tokenrun *
next_tokenrun(tokenrun * run)3088 next_tokenrun (tokenrun *run)
3089 {
3090   if (run->next == NULL)
3091     {
3092       run->next = XNEW (tokenrun);
3093       run->next->prev = run;
3094       _cpp_init_tokenrun (run->next, 250);
3095     }
3096 
3097   return run->next;
3098 }
3099 
3100 /* Return the number of not yet processed token in a given
3101    context.  */
3102 int
_cpp_remaining_tokens_num_in_context(cpp_context * context)3103 _cpp_remaining_tokens_num_in_context (cpp_context *context)
3104 {
3105   if (context->tokens_kind == TOKENS_KIND_DIRECT)
3106     return (LAST (context).token - FIRST (context).token);
3107   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
3108              || context->tokens_kind == TOKENS_KIND_EXTENDED)
3109     return (LAST (context).ptoken - FIRST (context).ptoken);
3110   else
3111       abort ();
3112 }
3113 
3114 /* Returns the token present at index INDEX in a given context.  If
3115    INDEX is zero, the next token to be processed is returned.  */
3116 static const cpp_token*
_cpp_token_from_context_at(cpp_context * context,int index)3117 _cpp_token_from_context_at (cpp_context *context, int index)
3118 {
3119   if (context->tokens_kind == TOKENS_KIND_DIRECT)
3120     return &(FIRST (context).token[index]);
3121   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
3122              || context->tokens_kind == TOKENS_KIND_EXTENDED)
3123     return FIRST (context).ptoken[index];
3124  else
3125    abort ();
3126 }
3127 
3128 /* Look ahead in the input stream.  */
3129 const cpp_token *
cpp_peek_token(cpp_reader * pfile,int index)3130 cpp_peek_token (cpp_reader *pfile, int index)
3131 {
3132   cpp_context *context = pfile->context;
3133   const cpp_token *peektok;
3134   int count;
3135 
3136   /* First, scan through any pending cpp_context objects.  */
3137   while (context->prev)
3138     {
3139       ptrdiff_t sz = _cpp_remaining_tokens_num_in_context (context);
3140 
3141       if (index < (int) sz)
3142         return _cpp_token_from_context_at (context, index);
3143       index -= (int) sz;
3144       context = context->prev;
3145     }
3146 
3147   /* We will have to read some new tokens after all (and do so
3148      without invalidating preceding tokens).  */
3149   count = index;
3150   pfile->keep_tokens++;
3151 
3152   /* For peeked tokens temporarily disable line_change reporting,
3153      until the tokens are parsed for real.  */
3154   void (*line_change) (cpp_reader *, const cpp_token *, int)
3155     = pfile->cb.line_change;
3156   pfile->cb.line_change = NULL;
3157 
3158   do
3159     {
3160       peektok = _cpp_lex_token (pfile);
3161       if (peektok->type == CPP_EOF)
3162           {
3163             index--;
3164             break;
3165           }
3166       else if (peektok->type == CPP_PRAGMA)
3167           {
3168             /* Don't peek past a pragma.  */
3169             if (peektok == &pfile->directive_result)
3170               /* Save the pragma in the buffer.  */
3171               *pfile->cur_token++ = *peektok;
3172             index--;
3173             break;
3174           }
3175     }
3176   while (index--);
3177 
3178   _cpp_backup_tokens_direct (pfile, count - index);
3179   pfile->keep_tokens--;
3180   pfile->cb.line_change = line_change;
3181 
3182   return peektok;
3183 }
3184 
/* Allocate a single token that is invalidated at the same time as the
   rest of the tokens on the line.  Has its line and col set to the
   same as the last lexed token, so that diagnostics appear in the
   right place.

   The new token takes the slot at PFILE->cur_token.  Lookahead tokens
   already stored from that slot onwards are moved up by one position
   first, spilling into the next token run when necessary.  */
cpp_token *
_cpp_temp_token (cpp_reader *pfile)
{
  cpp_token *old, *result;
  /* SZ is the number of slots from cur_token to the end of the current
     run; LA is the number of pending lookahead tokens.  */
  ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
  ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;

  /* The token just before cur_token supplies the source location.  */
  old = pfile->cur_token - 1;
  /* Any pre-existing lookaheads must not be clobbered.  */
  if (la)
    {
      if (sz <= la)
        {
          /* The lookaheads reach the end of this run, so shifting them
             pushes one token over into the next run.  */
          tokenrun *next = next_tokenrun (pfile->cur_run);

          /* Lookaheads already held by the next run move up one slot
             to free its first entry.  */
          if (sz < la)
            memmove (next->base + 1, next->base,
                     (la - sz) * sizeof (cpp_token));

          /* The last token of this run becomes the first of the
             next.  */
          next->base[0] = pfile->cur_run->limit[-1];
        }

      /* Shift the lookaheads that stay in this run up by one slot.  */
      if (sz > 1)
        memmove (pfile->cur_token + 1, pfile->cur_token,
                 MIN (la, sz - 1) * sizeof (cpp_token));
    }

  /* No slot left in this run: continue at the start of the next.  */
  if (!sz && pfile->cur_token == pfile->cur_run->limit)
    {
      pfile->cur_run = next_tokenrun (pfile->cur_run);
      pfile->cur_token = pfile->cur_run->base;
    }

  result = pfile->cur_token++;
  result->src_loc = old->src_loc;
  return result;
}
3226 
/* We're at the beginning of a logical line (so not in
   directives-mode) and RESULT is a CPP_NAME with NODE_MODULE set.  See
   if we should enter deferred_pragma mode to tokenize the rest of the
   line as a module control-line.

   Up to two further tokens are lexed to decide and are then backed up
   again.  If the line is a module control-line, the keyword tokens are
   marked NO_EXPAND and mapped to their alternate identifier nodes, and
   PFILE is left in deferred-pragma mode.  Otherwise the lexer state
   changed here is restored.  */

static void
cpp_maybe_module_directive (cpp_reader *pfile, cpp_token *result)
{
  unsigned backup = 0; /* Tokens we peeked.  */
  cpp_hashnode *node = result->val.node.node;
  cpp_token *peek = result;
  /* KEYWORD is the module/import/__import token: RESULT itself, or the
     token following a leading 'export'.  */
  cpp_token *keyword = peek;
  cpp_hashnode *(&n_modules)[spec_nodes::M_HWM][2] = pfile->spec_nodes.n_modules;
  /* Nonzero for the import forms; it is later stored in
     state.directive_file_token.  */
  int header_count = 0;

  /* Make sure the incoming state is as we expect it.  This way we
     can restore it using constants.  */
  gcc_checking_assert (!pfile->state.in_deferred_pragma
                       && !pfile->state.skipping
                       && !pfile->state.parsing_args
                       && !pfile->state.angled_headers
                       && (pfile->state.save_comments
                           == !CPP_OPTION (pfile, discard_comments)));

  /* Enter directives mode sufficiently for peeking.  We don't have
     to actually set in_directive.  */
  pfile->state.in_deferred_pragma = true;

  /* These two fields are needed to process tokenization in deferred
     pragma mode.  They are not used outside deferred pragma mode or
     directives mode.  */
  pfile->state.pragma_allow_expansion = true;
  pfile->directive_line = result->src_loc;

  /* Saving comments is incompatible with directives mode.   */
  pfile->state.save_comments = 0;

  /* A leading 'export' must be followed by another module keyword.  */
  if (node == n_modules[spec_nodes::M_EXPORT][0])
    {
      peek = _cpp_lex_direct (pfile);
      keyword = peek;
      backup++;
      if (keyword->type != CPP_NAME)
        goto not_module;
      node = keyword->val.node.node;
      if (!(node->flags & NODE_MODULE))
        goto not_module;
    }

  if (node == n_modules[spec_nodes::M__IMPORT][0])
    /* __import  */
    header_count = backup + 2 + 16;
  else if (node == n_modules[spec_nodes::M_IMPORT][0])
    /* import  */
    header_count = backup + 2 + (CPP_OPTION (pfile, preprocessed) ? 16 : 0);
  else if (node == n_modules[spec_nodes::M_MODULE][0])
    ; /* module  */
  else
    goto not_module;

  /* We've seen [export] {module|import|__import}.  Check the next token.  */
  if (header_count)
    /* After '{,__}import' a header name may appear.  */
    pfile->state.angled_headers = true;
  peek = _cpp_lex_direct (pfile);
  backup++;

  /* ... import followed by identifier, ':', '<' or
     header-name preprocessing tokens, or module
     followed by cpp-identifier, ':' or ';' preprocessing
     tokens.  C++ keywords are not yet relevant.  */
  if (peek->type == CPP_NAME
      || peek->type == CPP_COLON
      ||  (header_count
           ? (peek->type == CPP_LESS
              || (peek->type == CPP_STRING && peek->val.str.text[0] != 'R')
              || peek->type == CPP_HEADER_NAME)
           : peek->type == CPP_SEMICOLON))
    {
      /* In preprocessed input nothing on the control-line is
         expanded.  */
      pfile->state.pragma_allow_expansion = !CPP_OPTION (pfile, preprocessed);
      if (!pfile->state.pragma_allow_expansion)
        pfile->state.prevent_expansion++;

      /* Only the 'module' form is diagnosed inside an included
         file.  */
      if (!header_count && linemap_included_from
          (LINEMAPS_LAST_ORDINARY_MAP (pfile->line_table)))
        cpp_error_with_line (pfile, CPP_DL_ERROR, keyword->src_loc, 0,
                             "module control-line cannot be in included file");

      /* The first one or two tokens cannot be macro names.  */
      for (int ix = backup; ix--;)
        {
          cpp_token *tok = ix ? keyword : result;
          cpp_hashnode *node = tok->val.node.node;

          /* Don't attempt to expand the token.  */
          tok->flags |= NO_EXPAND;
          if (_cpp_defined_macro_p (node)
              && _cpp_maybe_notify_macro_use (pfile, node, tok->src_loc)
              && !cpp_fun_like_macro_p (node))
            cpp_error_with_line (pfile, CPP_DL_ERROR, tok->src_loc, 0,
                                 "module control-line \"%s\" cannot be"
                                 " an object-like macro",
                                 NODE_NAME (node));
        }

      /* Map to underbar variants.  */
      keyword->val.node.node = n_modules[header_count
                                         ? spec_nodes::M_IMPORT
                                         : spec_nodes::M_MODULE][1];
      /* More than one token peeked means RESULT was 'export'.  */
      if (backup != 1)
        result->val.node.node = n_modules[spec_nodes::M_EXPORT][1];

      /* Maybe tell the tokenizer we expect a header-name down the
         road.  */
      pfile->state.directive_file_token = header_count;
    }
  else
    {
    not_module:
      /* Drop out of directive mode.  */
      /* We asserted save_comments had this value upon entry.  */
      pfile->state.save_comments
        = !CPP_OPTION (pfile, discard_comments);
      pfile->state.in_deferred_pragma = false;
      /* Do not let this remain on.  */
      pfile->state.angled_headers = false;
    }

  /* In either case we want to backup the peeked tokens.  */
  if (backup)
    {
      /* If we saw EOL, we should drop it, because this isn't a module
         control-line after all.  */
      bool eol = peek->type == CPP_PRAGMA_EOL;
      if (!eol || backup > 1)
        {
          /* Put the peeked tokens back.  */
          _cpp_backup_tokens_direct (pfile, backup);
          /* But if the last one was an EOL, forget it.  */
          if (eol)
            pfile->lookaheads--;
        }
    }
}
3371 
3372 /* Lex a token into RESULT (external interface).  Takes care of issues
3373    like directive handling, token lookahead, multiple include
3374    optimization and skipping.  */
3375 const cpp_token *
_cpp_lex_token(cpp_reader * pfile)3376 _cpp_lex_token (cpp_reader *pfile)
3377 {
3378   cpp_token *result;
3379 
3380   for (;;)
3381     {
3382       if (pfile->cur_token == pfile->cur_run->limit)
3383           {
3384             pfile->cur_run = next_tokenrun (pfile->cur_run);
3385             pfile->cur_token = pfile->cur_run->base;
3386           }
3387       /* We assume that the current token is somewhere in the current
3388            run.  */
3389       if (pfile->cur_token < pfile->cur_run->base
3390             || pfile->cur_token >= pfile->cur_run->limit)
3391           abort ();
3392 
3393       if (pfile->lookaheads)
3394           {
3395             pfile->lookaheads--;
3396             result = pfile->cur_token++;
3397           }
3398       else
3399           result = _cpp_lex_direct (pfile);
3400 
3401       if (result->flags & BOL)
3402           {
3403             /* Is this a directive.  If _cpp_handle_directive returns
3404                false, it is an assembler #.  */
3405             if (result->type == CPP_HASH
3406                 /* 6.10.3 p 11: Directives in a list of macro arguments
3407                      gives undefined behavior.  This implementation
3408                      handles the directive as normal.  */
3409                 && pfile->state.parsing_args != 1)
3410               {
3411                 if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
3412                     {
3413                       if (pfile->directive_result.type == CPP_PADDING)
3414                         continue;
3415                       result = &pfile->directive_result;
3416                     }
3417               }
3418             else if (pfile->state.in_deferred_pragma)
3419               result = &pfile->directive_result;
3420             else if (result->type == CPP_NAME
3421                        && (result->val.node.node->flags & NODE_MODULE)
3422                        && !pfile->state.skipping
3423                        /* Unlike regular directives, we do not deal with
3424                           tokenizing module directives as macro arguments.
3425                           That's not permitted.  */
3426                        && !pfile->state.parsing_args)
3427               {
3428                 /* P1857.  Before macro expansion, At start of logical
3429                      line ... */
3430                 /* We don't have to consider lookaheads at this point.  */
3431                 gcc_checking_assert (!pfile->lookaheads);
3432 
3433                 cpp_maybe_module_directive (pfile, result);
3434               }
3435 
3436             if (pfile->cb.line_change && !pfile->state.skipping)
3437               pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
3438           }
3439 
3440       /* We don't skip tokens in directives.  */
3441       if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
3442           break;
3443 
3444       /* Outside a directive, invalidate controlling macros.  At file
3445            EOF, _cpp_lex_direct takes care of popping the buffer, so we never
3446            get here and MI optimization works.  */
3447       pfile->mi_valid = false;
3448 
3449       if (!pfile->state.skipping || result->type == CPP_EOF)
3450           break;
3451     }
3452 
3453   return result;
3454 }
3455 
3456 /* Returns true if a fresh line has been loaded.  */
3457 bool
_cpp_get_fresh_line(cpp_reader * pfile)3458 _cpp_get_fresh_line (cpp_reader *pfile)
3459 {
3460   /* We can't get a new line until we leave the current directive.  */
3461   if (pfile->state.in_directive)
3462     return false;
3463 
3464   for (;;)
3465     {
3466       cpp_buffer *buffer = pfile->buffer;
3467 
3468       if (!buffer->need_line)
3469           return true;
3470 
3471       if (buffer->next_line < buffer->rlimit)
3472           {
3473             _cpp_clean_line (pfile);
3474             return true;
3475           }
3476 
3477       /* First, get out of parsing arguments state.  */
3478       if (pfile->state.parsing_args)
3479           return false;
3480 
3481       /* End of buffer.  Non-empty files should end in a newline.  */
3482       if (buffer->buf != buffer->rlimit
3483             && buffer->next_line > buffer->rlimit
3484             && !buffer->from_stage3)
3485           {
3486             /* Clip to buffer size.  */
3487             buffer->next_line = buffer->rlimit;
3488           }
3489 
3490       if (buffer->prev && !buffer->return_at_eof)
3491           _cpp_pop_buffer (pfile);
3492       else
3493           {
3494             /* End of translation.  Do not pop the buffer yet. Increment
3495                line number so that the EOF token is on a line of its own
3496                (_cpp_lex_direct doesn't increment in that case, because
3497                it's hard for it to distinguish this special case). */
3498             CPP_INCREMENT_LINE (pfile, 0);
3499             return false;
3500           }
3501     }
3502 }
3503 
/* Helper for lexing two-character operators.  Sets result->type to
   ELSE_TYPE; if the next unread character (*buffer->cur) is CHAR,
   consumes it and sets result->type to THEN_TYPE instead.  Expects
   variables named `buffer' and `result' to be in scope at the point
   of use.  Arguments are parenthesized so that arbitrary expressions
   can be passed safely.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)          \
  do                                                    \
    {                                                   \
      result->type = (ELSE_TYPE);                       \
      if (*buffer->cur == (CHAR))                       \
        {                                               \
          buffer->cur++;                                \
          result->type = (THEN_TYPE);                   \
        }                                               \
    }                                                   \
  while (0)
3512 
3513 /* Lex a token into pfile->cur_token, which is also incremented, to
3514    get diagnostics pointing to the correct location.
3515 
3516    Does not handle issues such as token lookahead, multiple-include
3517    optimization, directives, skipping etc.  This function is only
3518    suitable for use by _cpp_lex_token, and in special cases like
3519    lex_expansion_token which doesn't care for any of these issues.
3520 
3521    When meeting a newline, returns CPP_EOF if parsing a directive,
3522    otherwise returns to the start of the token buffer if permissible.
3523    Returns the location of the lexed token.  */
3524 cpp_token *
_cpp_lex_direct(cpp_reader * pfile)3525 _cpp_lex_direct (cpp_reader *pfile)
3526 {
3527   cppchar_t c;
3528   cpp_buffer *buffer;
3529   const unsigned char *comment_start;
3530   bool fallthrough_comment = false;
3531   cpp_token *result = pfile->cur_token++;
3532 
3533  fresh_line:
3534   result->flags = 0;
3535   buffer = pfile->buffer;
3536   if (buffer->need_line)
3537     {
3538       if (pfile->state.in_deferred_pragma)
3539           {
3540             /* This can happen in cases like:
3541                #define loop(x) whatever
3542                #pragma omp loop
3543                where when trying to expand loop we need to peek
3544                next token after loop, but aren't still in_deferred_pragma
3545                mode but are in in_directive mode, so buffer->need_line
3546                is set, a CPP_EOF is peeked.  */
3547             result->type = CPP_PRAGMA_EOL;
3548             pfile->state.in_deferred_pragma = false;
3549             if (!pfile->state.pragma_allow_expansion)
3550               pfile->state.prevent_expansion--;
3551             return result;
3552           }
3553       if (!_cpp_get_fresh_line (pfile))
3554           {
3555             result->type = CPP_EOF;
3556             /* Not a real EOF in a directive or arg parsing -- we refuse
3557                to advance to the next file now, and will once we're out
3558                of those modes.  */
3559             if (!pfile->state.in_directive && !pfile->state.parsing_args)
3560               {
3561                 /* Tell the compiler the line number of the EOF token.  */
3562                 result->src_loc = pfile->line_table->highest_line;
3563                 result->flags = BOL;
3564                 /* Now pop the buffer that _cpp_get_fresh_line did not.  */
3565                 _cpp_pop_buffer (pfile);
3566               }
3567             return result;
3568           }
3569       if (buffer != pfile->buffer)
3570           fallthrough_comment = false;
3571       if (!pfile->keep_tokens)
3572           {
3573             pfile->cur_run = &pfile->base_run;
3574             result = pfile->base_run.base;
3575             pfile->cur_token = result + 1;
3576           }
3577       result->flags = BOL;
3578       if (pfile->state.parsing_args == 2)
3579           result->flags |= PREV_WHITE;
3580     }
3581   buffer = pfile->buffer;
3582  update_tokens_line:
3583   result->src_loc = pfile->line_table->highest_line;
3584 
3585  skipped_white:
3586   if (buffer->cur >= buffer->notes[buffer->cur_note].pos
3587       && !pfile->overlaid_buffer)
3588     {
3589       _cpp_process_line_notes (pfile, false);
3590       result->src_loc = pfile->line_table->highest_line;
3591     }
3592   c = *buffer->cur++;
3593 
3594   if (pfile->forced_token_location)
3595     result->src_loc = pfile->forced_token_location;
3596   else
3597     result->src_loc = linemap_position_for_column (pfile->line_table,
3598                                                     CPP_BUF_COLUMN (buffer, buffer->cur));
3599 
3600   switch (c)
3601     {
3602     case ' ': case '\t': case '\f': case '\v': case '\0':
3603       result->flags |= PREV_WHITE;
3604       skip_whitespace (pfile, c);
3605       goto skipped_white;
3606 
3607     case '\n':
3608       /* Increment the line, unless this is the last line ...  */
3609       if (buffer->cur < buffer->rlimit
3610             /* ... or this is a #include, (where _cpp_stack_file needs to
3611                unwind by one line) ...  */
3612             || (pfile->state.in_directive > 1
3613                 /* ... except traditional-cpp increments this elsewhere.  */
3614                 && !CPP_OPTION (pfile, traditional)))
3615           CPP_INCREMENT_LINE (pfile, 0);
3616       buffer->need_line = true;
3617       if (pfile->state.in_deferred_pragma)
3618           {
3619             /* Produce the PRAGMA_EOL on this line.  File reading
3620                ensures there is always a \n at end of the buffer, thus
3621                in a deferred pragma we always see CPP_PRAGMA_EOL before
3622                any CPP_EOF.  */
3623             result->type = CPP_PRAGMA_EOL;
3624             result->flags &= ~PREV_WHITE;
3625             pfile->state.in_deferred_pragma = false;
3626             if (!pfile->state.pragma_allow_expansion)
3627               pfile->state.prevent_expansion--;
3628             return result;
3629           }
3630       goto fresh_line;
3631 
3632     case '0': case '1': case '2': case '3': case '4':
3633     case '5': case '6': case '7': case '8': case '9':
3634       {
3635           struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3636           result->type = CPP_NUMBER;
3637           lex_number (pfile, &result->val.str, &nst);
3638           warn_about_normalization (pfile, result, &nst);
3639           break;
3640       }
3641 
3642     case 'L':
3643     case 'u':
3644     case 'U':
3645     case 'R':
3646       /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
3647            wide strings or raw strings.  */
3648       if (c == 'L' || CPP_OPTION (pfile, rliterals)
3649             || (c != 'R' && CPP_OPTION (pfile, uliterals)))
3650           {
3651             if ((*buffer->cur == '\'' && c != 'R')
3652                 || *buffer->cur == '"'
3653                 || (*buffer->cur == 'R'
3654                       && c != 'R'
3655                       && buffer->cur[1] == '"'
3656                       && CPP_OPTION (pfile, rliterals))
3657                 || (*buffer->cur == '8'
3658                       && c == 'u'
3659                       && ((buffer->cur[1] == '"' || (buffer->cur[1] == '\''
3660                                         && CPP_OPTION (pfile, utf8_char_literals)))
3661                           || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
3662                                 && CPP_OPTION (pfile, rliterals)))))
3663               {
3664                 lex_string (pfile, result, buffer->cur - 1);
3665                 break;
3666               }
3667           }
3668       /* Fall through.  */
3669 
3670     case '_':
3671     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
3672     case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
3673     case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
3674     case 's': case 't':           case 'v': case 'w': case 'x':
3675     case 'y': case 'z':
3676     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
3677     case 'G': case 'H': case 'I': case 'J': case 'K':
3678     case 'M': case 'N': case 'O': case 'P': case 'Q':
3679     case 'S': case 'T':           case 'V': case 'W': case 'X':
3680     case 'Y': case 'Z':
3681       result->type = CPP_NAME;
3682       {
3683           struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3684           result->val.node.node = lex_identifier (pfile, buffer->cur - 1, false,
3685                                                             &nst,
3686                                                             &result->val.node.spelling);
3687           warn_about_normalization (pfile, result, &nst);
3688       }
3689 
3690       /* Convert named operators to their proper types.  */
3691       if (result->val.node.node->flags & NODE_OPERATOR)
3692           {
3693             result->flags |= NAMED_OP;
3694             result->type = (enum cpp_ttype) result->val.node.node->directive_index;
3695           }
3696 
3697       /* Signal FALLTHROUGH comment followed by another token.  */
3698       if (fallthrough_comment)
3699           result->flags |= PREV_FALLTHROUGH;
3700       break;
3701 
3702     case '\'':
3703     case '"':
3704       lex_string (pfile, result, buffer->cur - 1);
3705       break;
3706 
3707     case '/':
3708       /* A potential block or line comment.  */
3709       comment_start = buffer->cur;
3710       c = *buffer->cur;
3711 
3712       if (c == '*')
3713           {
3714             if (_cpp_skip_block_comment (pfile))
3715               cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
3716           }
3717       else if (c == '/' && ! CPP_OPTION (pfile, traditional))
3718           {
3719             /* Don't warn for system headers.  */
3720             if (_cpp_in_system_header (pfile))
3721               ;
3722             /* Warn about comments if pedantically GNUC89, and not
3723                in system headers.  */
3724             else if (CPP_OPTION (pfile, lang) == CLK_GNUC89
3725                        && CPP_PEDANTIC (pfile)
3726                        && ! buffer->warned_cplusplus_comments)
3727               {
3728                 if (cpp_error (pfile, CPP_DL_PEDWARN,
3729                                    "C++ style comments are not allowed in ISO C90"))
3730                     cpp_error (pfile, CPP_DL_NOTE,
3731                                  "(this will be reported only once per input file)");
3732                 buffer->warned_cplusplus_comments = 1;
3733               }
3734             /* Or if specifically desired via -Wc90-c99-compat.  */
3735             else if (CPP_OPTION (pfile, cpp_warn_c90_c99_compat) > 0
3736                        && ! CPP_OPTION (pfile, cplusplus)
3737                        && ! buffer->warned_cplusplus_comments)
3738               {
3739                 if (cpp_error (pfile, CPP_DL_WARNING,
3740                                    "C++ style comments are incompatible with C90"))
3741                     cpp_error (pfile, CPP_DL_NOTE,
3742                                  "(this will be reported only once per input file)");
3743                 buffer->warned_cplusplus_comments = 1;
3744               }
3745             /* In C89/C94, C++ style comments are forbidden.  */
3746             else if ((CPP_OPTION (pfile, lang) == CLK_STDC89
3747                         || CPP_OPTION (pfile, lang) == CLK_STDC94))
3748               {
3749                 /* But don't be confused about valid code such as
3750                    - // immediately followed by *,
3751                      - // in a preprocessing directive,
3752                      - // in an #if 0 block.  */
3753                 if (buffer->cur[1] == '*'
3754                       || pfile->state.in_directive
3755                       || pfile->state.skipping)
3756                     {
3757                       result->type = CPP_DIV;
3758                       break;
3759                     }
3760                 else if (! buffer->warned_cplusplus_comments)
3761                     {
3762                       if (cpp_error (pfile, CPP_DL_ERROR,
3763                                          "C++ style comments are not allowed in "
3764                                          "ISO C90"))
3765                         cpp_error (pfile, CPP_DL_NOTE,
3766                                      "(this will be reported only once per input "
3767                                      "file)");
3768                       buffer->warned_cplusplus_comments = 1;
3769                     }
3770               }
3771             if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
3772               cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
3773           }
3774       else if (c == '=')
3775           {
3776             buffer->cur++;
3777             result->type = CPP_DIV_EQ;
3778             break;
3779           }
3780       else
3781           {
3782             result->type = CPP_DIV;
3783             break;
3784           }
3785 
3786       if (fallthrough_comment_p (pfile, comment_start))
3787           fallthrough_comment = true;
3788 
3789       if (pfile->cb.comment)
3790           {
3791             size_t len = pfile->buffer->cur - comment_start;
3792             pfile->cb.comment (pfile, result->src_loc, comment_start - 1,
3793                                    len + 1);
3794           }
3795 
3796       if (!pfile->state.save_comments)
3797           {
3798             result->flags |= PREV_WHITE;
3799             goto update_tokens_line;
3800           }
3801 
3802       if (fallthrough_comment)
3803           result->flags |= PREV_FALLTHROUGH;
3804 
3805       /* Save the comment as a token in its own right.  */
3806       save_comment (pfile, result, comment_start, c);
3807       break;
3808 
3809     case '<':
3810       if (pfile->state.angled_headers)
3811           {
3812             lex_string (pfile, result, buffer->cur - 1);
3813             if (result->type != CPP_LESS)
3814               break;
3815           }
3816 
3817       result->type = CPP_LESS;
3818       if (*buffer->cur == '=')
3819           {
3820             buffer->cur++, result->type = CPP_LESS_EQ;
3821             if (*buffer->cur == '>'
3822                 && CPP_OPTION (pfile, cplusplus)
3823                 && CPP_OPTION (pfile, lang) >= CLK_GNUCXX20)
3824               buffer->cur++, result->type = CPP_SPACESHIP;
3825           }
3826       else if (*buffer->cur == '<')
3827           {
3828             buffer->cur++;
3829             IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
3830           }
3831       else if (CPP_OPTION (pfile, digraphs))
3832           {
3833             if (*buffer->cur == ':')
3834               {
3835                 /* C++11 [2.5/3 lex.pptoken], "Otherwise, if the next
3836                      three characters are <:: and the subsequent character
3837                      is neither : nor >, the < is treated as a preprocessor
3838                      token by itself".  */
3839                 if (CPP_OPTION (pfile, cplusplus)
3840                       && CPP_OPTION (pfile, lang) != CLK_CXX98
3841                       && CPP_OPTION (pfile, lang) != CLK_GNUCXX
3842                       && buffer->cur[1] == ':'
3843                       && buffer->cur[2] != ':' && buffer->cur[2] != '>')
3844                     break;
3845 
3846                 buffer->cur++;
3847                 result->flags |= DIGRAPH;
3848                 result->type = CPP_OPEN_SQUARE;
3849               }
3850             else if (*buffer->cur == '%')
3851               {
3852                 buffer->cur++;
3853                 result->flags |= DIGRAPH;
3854                 result->type = CPP_OPEN_BRACE;
3855               }
3856           }
3857       break;
3858 
3859     case '>':
3860       result->type = CPP_GREATER;
3861       if (*buffer->cur == '=')
3862           buffer->cur++, result->type = CPP_GREATER_EQ;
3863       else if (*buffer->cur == '>')
3864           {
3865             buffer->cur++;
3866             IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
3867           }
3868       break;
3869 
3870     case '%':
3871       result->type = CPP_MOD;
3872       if (*buffer->cur == '=')
3873           buffer->cur++, result->type = CPP_MOD_EQ;
3874       else if (CPP_OPTION (pfile, digraphs))
3875           {
3876             if (*buffer->cur == ':')
3877               {
3878                 buffer->cur++;
3879                 result->flags |= DIGRAPH;
3880                 result->type = CPP_HASH;
3881                 if (*buffer->cur == '%' && buffer->cur[1] == ':')
3882                     buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
3883               }
3884             else if (*buffer->cur == '>')
3885               {
3886                 buffer->cur++;
3887                 result->flags |= DIGRAPH;
3888                 result->type = CPP_CLOSE_BRACE;
3889               }
3890           }
3891       break;
3892 
3893     case '.':
3894       result->type = CPP_DOT;
3895       if (ISDIGIT (*buffer->cur))
3896           {
3897             struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3898             result->type = CPP_NUMBER;
3899             lex_number (pfile, &result->val.str, &nst);
3900             warn_about_normalization (pfile, result, &nst);
3901           }
3902       else if (*buffer->cur == '.' && buffer->cur[1] == '.')
3903           buffer->cur += 2, result->type = CPP_ELLIPSIS;
3904       else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
3905           buffer->cur++, result->type = CPP_DOT_STAR;
3906       break;
3907 
3908     case '+':
3909       result->type = CPP_PLUS;
3910       if (*buffer->cur == '+')
3911           buffer->cur++, result->type = CPP_PLUS_PLUS;
3912       else if (*buffer->cur == '=')
3913           buffer->cur++, result->type = CPP_PLUS_EQ;
3914       break;
3915 
3916     case '-':
3917       result->type = CPP_MINUS;
3918       if (*buffer->cur == '>')
3919           {
3920             buffer->cur++;
3921             result->type = CPP_DEREF;
3922             if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
3923               buffer->cur++, result->type = CPP_DEREF_STAR;
3924           }
3925       else if (*buffer->cur == '-')
3926           buffer->cur++, result->type = CPP_MINUS_MINUS;
3927       else if (*buffer->cur == '=')
3928           buffer->cur++, result->type = CPP_MINUS_EQ;
3929       break;
3930 
3931     case '&':
3932       result->type = CPP_AND;
3933       if (*buffer->cur == '&')
3934           buffer->cur++, result->type = CPP_AND_AND;
3935       else if (*buffer->cur == '=')
3936           buffer->cur++, result->type = CPP_AND_EQ;
3937       break;
3938 
3939     case '|':
3940       result->type = CPP_OR;
3941       if (*buffer->cur == '|')
3942           buffer->cur++, result->type = CPP_OR_OR;
3943       else if (*buffer->cur == '=')
3944           buffer->cur++, result->type = CPP_OR_EQ;
3945       break;
3946 
3947     case ':':
3948       result->type = CPP_COLON;
3949       if (*buffer->cur == ':')
3950           {
3951             if (CPP_OPTION (pfile, scope))
3952               buffer->cur++, result->type = CPP_SCOPE;
3953             else
3954               result->flags |= COLON_SCOPE;
3955           }
3956       else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
3957           {
3958             buffer->cur++;
3959             result->flags |= DIGRAPH;
3960             result->type = CPP_CLOSE_SQUARE;
3961           }
3962       break;
3963 
3964     case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
3965     case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
3966     case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
3967     case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
3968     case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;
3969 
3970     case '?': result->type = CPP_QUERY; break;
3971     case '~': result->type = CPP_COMPL; break;
3972     case ',': result->type = CPP_COMMA; break;
3973     case '(': result->type = CPP_OPEN_PAREN; break;
3974     case ')': result->type = CPP_CLOSE_PAREN; break;
3975     case '[': result->type = CPP_OPEN_SQUARE; break;
3976     case ']': result->type = CPP_CLOSE_SQUARE; break;
3977     case '{': result->type = CPP_OPEN_BRACE; break;
3978     case '}': result->type = CPP_CLOSE_BRACE; break;
3979     case ';': result->type = CPP_SEMICOLON; break;
3980 
3981       /* @ is a punctuator in Objective-C.  */
3982     case '@': result->type = CPP_ATSIGN; break;
3983 
3984     default:
3985       {
3986           const uchar *base = --buffer->cur;
3987 
3988           /* Check for an extended identifier ($ or UCN or UTF-8).  */
3989           struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3990           if (forms_identifier_p (pfile, true, &nst))
3991             {
3992               result->type = CPP_NAME;
3993               result->val.node.node = lex_identifier (pfile, base, true, &nst,
3994                                                                 &result->val.node.spelling);
3995               warn_about_normalization (pfile, result, &nst);
3996               break;
3997             }
3998 
3999           /* Otherwise this will form a CPP_OTHER token.  Parse valid UTF-8 as a
4000              single token.  */
4001           buffer->cur++;
4002           if (c >= utf8_signifier)
4003             {
4004               const uchar *pstr = base;
4005               cppchar_t s;
4006               if (_cpp_valid_utf8 (pfile, &pstr, buffer->rlimit, 0, NULL, &s))
4007                 buffer->cur = pstr;
4008             }
4009           create_literal (pfile, result, base, buffer->cur - base, CPP_OTHER);
4010           break;
4011       }
4012 
4013     }
4014 
4015   /* Potentially convert the location of the token to a range.  */
4016   if (result->src_loc >= RESERVED_LOCATION_COUNT
4017       && result->type != CPP_EOF)
4018     {
4019       /* Ensure that any line notes are processed, so that we have the
4020            correct physical line/column for the end-point of the token even
4021            when a logical line is split via one or more backslashes.  */
4022       if (buffer->cur >= buffer->notes[buffer->cur_note].pos
4023             && !pfile->overlaid_buffer)
4024           _cpp_process_line_notes (pfile, false);
4025 
4026       source_range tok_range;
4027       tok_range.m_start = result->src_loc;
4028       tok_range.m_finish
4029           = linemap_position_for_column (pfile->line_table,
4030                                                CPP_BUF_COLUMN (buffer, buffer->cur));
4031 
4032       result->src_loc = COMBINE_LOCATION_DATA (pfile->line_table,
4033                                                          result->src_loc,
4034                                                          tok_range, NULL);
4035     }
4036 
4037   return result;
4038 }
4039 
4040 /* An upper bound on the number of bytes needed to spell TOKEN.
4041    Does not include preceding whitespace.  */
4042 unsigned int
cpp_token_len(const cpp_token * token)4043 cpp_token_len (const cpp_token *token)
4044 {
4045   unsigned int len;
4046 
4047   switch (TOKEN_SPELL (token))
4048     {
4049     default:                  len = 6;                                break;
4050     case SPELL_LITERAL:       len = token->val.str.len;               break;
4051     case SPELL_IDENT:         len = NODE_LEN (token->val.node.node) * 10;       break;
4052     }
4053 
4054   return len;
4055 }
4056 
/* Parse one UTF-8 sequence starting at NAME and place the matching
   \U escape (backslash, 'U', then eight lowercase hex digits) in
   BUFFER.  Returns the number of bytes read out of NAME.  (There are
   always 10 bytes written to BUFFER; no terminating NUL is added.)

   The sequence length is the number of leading 1 bits in the first
   byte.  Aborts if a following byte is not of the form 10xxxxxx.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  int j;
  int ucn_len = 0;
  int ucn_len_c;
  unsigned t;
  unsigned long utf32;

  /* Compute the length of the UTF-8 sequence.  */
  for (t = *name; t & 0x80; t <<= 1)
    ucn_len++;

  /* The payload bits of the lead byte are those below the length
     marker.  */
  utf32 = *name & (0x7F >> ucn_len);
  for (ucn_len_c = 1; ucn_len_c < ucn_len; ucn_len_c++)
    {
      /* Each continuation byte contributes its low six bits.  */
      utf32 = (utf32 << 6) | (*++name & 0x3F);

      /* Ill-formed UTF-8.  */
      if ((*name & ~0x3F) != 0x80)
        abort ();
    }

  *buffer++ = '\\';
  *buffer++ = 'U';
  /* Emit the eight hex digits, most significant nibble first.  */
  for (j = 7; j >= 0; j--)
    *buffer++ = hex_digits[(utf32 >> (4 * j)) & 0xF];
  return ucn_len;
}
4090 
4091 /* Given a token TYPE corresponding to a digraph, return a pointer to
4092    the spelling of the digraph.  */
4093 static const unsigned char *
cpp_digraph2name(enum cpp_ttype type)4094 cpp_digraph2name (enum cpp_ttype type)
4095 {
4096   return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
4097 }
4098 
4099 /* Write the spelling of an identifier IDENT, using UCNs, to BUFFER.
4100    The buffer must already contain the enough space to hold the
4101    token's spelling.  Returns a pointer to the character after the
4102    last character written.  */
4103 unsigned char *
_cpp_spell_ident_ucns(unsigned char * buffer,cpp_hashnode * ident)4104 _cpp_spell_ident_ucns (unsigned char *buffer, cpp_hashnode *ident)
4105 {
4106   size_t i;
4107   const unsigned char *name = NODE_NAME (ident);
4108 
4109   for (i = 0; i < NODE_LEN (ident); i++)
4110     if (name[i] & ~0x7F)
4111       {
4112           i += utf8_to_ucn (buffer, name + i) - 1;
4113           buffer += 10;
4114       }
4115     else
4116       *buffer++ = name[i];
4117 
4118   return buffer;
4119 }
4120 
4121 /* Write the spelling of a token TOKEN to BUFFER.  The buffer must
4122    already contain the enough space to hold the token's spelling.
4123    Returns a pointer to the character after the last character written.
4124    FORSTRING is true if this is to be the spelling after translation
4125    phase 1 (with the original spelling of extended identifiers), false
4126    if extended identifiers should always be written using UCNs (there is
4127    no option for always writing them in the internal UTF-8 form).
4128    FIXME: Would be nice if we didn't need the PFILE argument.  */
4129 unsigned char *
cpp_spell_token(cpp_reader * pfile,const cpp_token * token,unsigned char * buffer,bool forstring)4130 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
4131                      unsigned char *buffer, bool forstring)
4132 {
4133   switch (TOKEN_SPELL (token))
4134     {
4135     case SPELL_OPERATOR:
4136       {
4137           const unsigned char *spelling;
4138           unsigned char c;
4139 
4140           if (token->flags & DIGRAPH)
4141             spelling = cpp_digraph2name (token->type);
4142           else if (token->flags & NAMED_OP)
4143             goto spell_ident;
4144           else
4145             spelling = TOKEN_NAME (token);
4146 
4147           while ((c = *spelling++) != '\0')
4148             *buffer++ = c;
4149       }
4150       break;
4151 
4152     spell_ident:
4153     case SPELL_IDENT:
4154       if (forstring)
4155           {
4156             memcpy (buffer, NODE_NAME (token->val.node.spelling),
4157                       NODE_LEN (token->val.node.spelling));
4158             buffer += NODE_LEN (token->val.node.spelling);
4159           }
4160       else
4161           buffer = _cpp_spell_ident_ucns (buffer, token->val.node.node);
4162       break;
4163 
4164     case SPELL_LITERAL:
4165       memcpy (buffer, token->val.str.text, token->val.str.len);
4166       buffer += token->val.str.len;
4167       break;
4168 
4169     case SPELL_NONE:
4170       cpp_error (pfile, CPP_DL_ICE,
4171                      "unspellable token %s", TOKEN_NAME (token));
4172       break;
4173     }
4174 
4175   return buffer;
4176 }
4177 
4178 /* Returns TOKEN spelt as a null-terminated string.  The string is
4179    freed when the reader is destroyed.  Useful for diagnostics.  */
4180 unsigned char *
cpp_token_as_text(cpp_reader * pfile,const cpp_token * token)4181 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
4182 {
4183   unsigned int len = cpp_token_len (token) + 1;
4184   unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
4185 
4186   end = cpp_spell_token (pfile, token, start, false);
4187   end[0] = '\0';
4188 
4189   return start;
4190 }
4191 
4192 /* Returns a pointer to a string which spells the token defined by
4193    TYPE and FLAGS.  Used by C front ends, which really should move to
4194    using cpp_token_as_text.  */
4195 const char *
cpp_type2name(enum cpp_ttype type,unsigned char flags)4196 cpp_type2name (enum cpp_ttype type, unsigned char flags)
4197 {
4198   if (flags & DIGRAPH)
4199     return (const char *) cpp_digraph2name (type);
4200   else if (flags & NAMED_OP)
4201     return cpp_named_operator2name (type);
4202 
4203   return (const char *) token_spellings[type].name;
4204 }
4205 
4206 /* Writes the spelling of token to FP, without any preceding space.
4207    Separated from cpp_spell_token for efficiency - to avoid stdio
4208    double-buffering.  */
4209 void
cpp_output_token(const cpp_token * token,FILE * fp)4210 cpp_output_token (const cpp_token *token, FILE *fp)
4211 {
4212   switch (TOKEN_SPELL (token))
4213     {
4214     case SPELL_OPERATOR:
4215       {
4216           const unsigned char *spelling;
4217           int c;
4218 
4219           if (token->flags & DIGRAPH)
4220             spelling = cpp_digraph2name (token->type);
4221           else if (token->flags & NAMED_OP)
4222             goto spell_ident;
4223           else
4224             spelling = TOKEN_NAME (token);
4225 
4226           c = *spelling;
4227           do
4228             putc (c, fp);
4229           while ((c = *++spelling) != '\0');
4230       }
4231       break;
4232 
4233     spell_ident:
4234     case SPELL_IDENT:
4235       {
4236           size_t i;
4237           const unsigned char * name = NODE_NAME (token->val.node.node);
4238 
4239           for (i = 0; i < NODE_LEN (token->val.node.node); i++)
4240             if (name[i] & ~0x7F)
4241               {
4242                 unsigned char buffer[10];
4243                 i += utf8_to_ucn (buffer, name + i) - 1;
4244                 fwrite (buffer, 1, 10, fp);
4245               }
4246             else
4247               fputc (NODE_NAME (token->val.node.node)[i], fp);
4248       }
4249       break;
4250 
4251     case SPELL_LITERAL:
4252       if (token->type == CPP_HEADER_NAME)
4253           fputc ('"', fp);
4254       fwrite (token->val.str.text, 1, token->val.str.len, fp);
4255       if (token->type == CPP_HEADER_NAME)
4256           fputc ('"', fp);
4257       break;
4258 
4259     case SPELL_NONE:
4260       /* An error, most probably.  */
4261       break;
4262     }
4263 }
4264 
4265 /* Compare two tokens.  */
4266 int
_cpp_equiv_tokens(const cpp_token * a,const cpp_token * b)4267 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
4268 {
4269   if (a->type == b->type && a->flags == b->flags)
4270     switch (TOKEN_SPELL (a))
4271       {
4272       default:                          /* Keep compiler happy.  */
4273       case SPELL_OPERATOR:
4274           /* token_no is used to track where multiple consecutive ##
4275              tokens were originally located.  */
4276           return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
4277       case SPELL_NONE:
4278           return (a->type != CPP_MACRO_ARG
4279                     || (a->val.macro_arg.arg_no == b->val.macro_arg.arg_no
4280                         && a->val.macro_arg.spelling == b->val.macro_arg.spelling));
4281       case SPELL_IDENT:
4282           return (a->val.node.node == b->val.node.node
4283                     && a->val.node.spelling == b->val.node.spelling);
4284       case SPELL_LITERAL:
4285           return (a->val.str.len == b->val.str.len
4286                     && !memcmp (a->val.str.text, b->val.str.text,
4287                                   a->val.str.len));
4288       }
4289 
4290   return 0;
4291 }
4292 
4293 /* Returns nonzero if a space should be inserted to avoid an
4294    accidental token paste for output.  For simplicity, it is
4295    conservative, and occasionally advises a space where one is not
4296    needed, e.g. "." and ".2".  */
4297 int
cpp_avoid_paste(cpp_reader * pfile,const cpp_token * token1,const cpp_token * token2)4298 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
4299                      const cpp_token *token2)
4300 {
4301   enum cpp_ttype a = token1->type, b = token2->type;
4302   cppchar_t c;
4303 
4304   if (token1->flags & NAMED_OP)
4305     a = CPP_NAME;
4306   if (token2->flags & NAMED_OP)
4307     b = CPP_NAME;
4308 
4309   c = EOF;
4310   if (token2->flags & DIGRAPH)
4311     c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
4312   else if (token_spellings[b].category == SPELL_OPERATOR)
4313     c = token_spellings[b].name[0];
4314 
4315   /* Quickly get everything that can paste with an '='.  */
4316   if ((int) a <= (int) CPP_LAST_EQ && c == '=')
4317     return 1;
4318 
4319   switch (a)
4320     {
4321     case CPP_GREATER:         return c == '>';
4322     case CPP_LESS:  return c == '<' || c == '%' || c == ':';
4323     case CPP_PLUS:  return c == '+';
4324     case CPP_MINUS: return c == '-' || c == '>';
4325     case CPP_DIV:   return c == '/' || c == '*'; /* Comments.  */
4326     case CPP_MOD:   return c == ':' || c == '>';
4327     case CPP_AND:   return c == '&';
4328     case CPP_OR:    return c == '|';
4329     case CPP_COLON: return c == ':' || c == '>';
4330     case CPP_DEREF: return c == '*';
4331     case CPP_DOT:   return c == '.' || c == '%' || b == CPP_NUMBER;
4332     case CPP_HASH:  return c == '#' || c == '%'; /* Digraph form.  */
4333     case CPP_PRAGMA:
4334     case CPP_NAME:  return ((b == CPP_NUMBER
4335                                          && name_p (pfile, &token2->val.str))
4336                                         || b == CPP_NAME
4337                                         || b == CPP_CHAR || b == CPP_STRING); /* L */
4338     case CPP_NUMBER:          return (b == CPP_NUMBER || b == CPP_NAME
4339                                         || b == CPP_CHAR
4340                                         || c == '.' || c == '+' || c == '-');
4341                                               /* UCNs */
4342     case CPP_OTHER: return ((token1->val.str.text[0] == '\\'
4343                                          && b == CPP_NAME)
4344                                         || (CPP_OPTION (pfile, objc)
4345                                             && token1->val.str.text[0] == '@'
4346                                             && (b == CPP_NAME || b == CPP_STRING)));
4347     case CPP_LESS_EQ:         return c == '>';
4348     case CPP_STRING:
4349     case CPP_WSTRING:
4350     case CPP_UTF8STRING:
4351     case CPP_STRING16:
4352     case CPP_STRING32:        return (CPP_OPTION (pfile, user_literals)
4353                                         && (b == CPP_NAME
4354                                             || (TOKEN_SPELL (token2) == SPELL_LITERAL
4355                                                   && ISIDST (token2->val.str.text[0]))));
4356 
4357     default:                  break;
4358     }
4359 
4360   return 0;
4361 }
4362 
4363 /* Output all the remaining tokens on the current line, and a newline
4364    character, to FP.  Leading whitespace is removed.  If there are
4365    macros, special token padding is not performed.  */
4366 void
cpp_output_line(cpp_reader * pfile,FILE * fp)4367 cpp_output_line (cpp_reader *pfile, FILE *fp)
4368 {
4369   const cpp_token *token;
4370 
4371   token = cpp_get_token (pfile);
4372   while (token->type != CPP_EOF)
4373     {
4374       cpp_output_token (token, fp);
4375       token = cpp_get_token (pfile);
4376       if (token->flags & PREV_WHITE)
4377           putc (' ', fp);
4378     }
4379 
4380   putc ('\n', fp);
4381 }
4382 
4383 /* Return a string representation of all the remaining tokens on the
4384    current line.  The result is allocated using xmalloc and must be
4385    freed by the caller.  */
4386 unsigned char *
cpp_output_line_to_string(cpp_reader * pfile,const unsigned char * dir_name)4387 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
4388 {
4389   const cpp_token *token;
4390   unsigned int out = dir_name ? ustrlen (dir_name) : 0;
4391   unsigned int alloced = 120 + out;
4392   unsigned char *result = (unsigned char *) xmalloc (alloced);
4393 
4394   /* If DIR_NAME is empty, there are no initial contents.  */
4395   if (dir_name)
4396     {
4397       sprintf ((char *) result, "#%s ", dir_name);
4398       out += 2;
4399     }
4400 
4401   token = cpp_get_token (pfile);
4402   while (token->type != CPP_EOF)
4403     {
4404       unsigned char *last;
4405       /* Include room for a possible space and the terminating nul.  */
4406       unsigned int len = cpp_token_len (token) + 2;
4407 
4408       if (out + len > alloced)
4409           {
4410             alloced *= 2;
4411             if (out + len > alloced)
4412               alloced = out + len;
4413             result = (unsigned char *) xrealloc (result, alloced);
4414           }
4415 
4416       last = cpp_spell_token (pfile, token, &result[out], 0);
4417       out = last - result;
4418 
4419       token = cpp_get_token (pfile);
4420       if (token->flags & PREV_WHITE)
4421           result[out++] = ' ';
4422     }
4423 
4424   result[out] = '\0';
4425   return result;
4426 }
4427 
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest buffer new_buff will create.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that
   will be handed out for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   extending BUFF: MIN_EXTRA plus twice the distance from BUFF's cur
   to its limit.  Every macro argument is parenthesized so that
   arbitrary expressions may be passed safely.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
  #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
4442 
4443 /* Create a new allocation buffer.  Place the control block at the end
4444    of the buffer, so that buffer overflows will cause immediate chaos.  */
4445 static _cpp_buff *
new_buff(size_t len)4446 new_buff (size_t len)
4447 {
4448   _cpp_buff *result;
4449   unsigned char *base;
4450 
4451   if (len < MIN_BUFF_SIZE)
4452     len = MIN_BUFF_SIZE;
4453   len = CPP_ALIGN (len);
4454 
4455 #ifdef ENABLE_VALGRIND_ANNOTATIONS
4456   /* Valgrind warns about uses of interior pointers, so put _cpp_buff
4457      struct first.  */
4458   size_t slen = CPP_ALIGN2 (sizeof (_cpp_buff), 2 * DEFAULT_ALIGNMENT);
4459   base = XNEWVEC (unsigned char, len + slen);
4460   result = (_cpp_buff *) base;
4461   base += slen;
4462 #else
4463   base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
4464   result = (_cpp_buff *) (base + len);
4465 #endif
4466   result->base = base;
4467   result->cur = base;
4468   result->limit = base + len;
4469   result->next = NULL;
4470   return result;
4471 }
4472 
4473 /* Place a chain of unwanted allocation buffers on the free list.  */
4474 void
_cpp_release_buff(cpp_reader * pfile,_cpp_buff * buff)4475 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
4476 {
4477   _cpp_buff *end = buff;
4478 
4479   while (end->next)
4480     end = end->next;
4481   end->next = pfile->free_buffs;
4482   pfile->free_buffs = buff;
4483 }
4484 
4485 /* Return a free buffer of size at least MIN_SIZE.  */
4486 _cpp_buff *
_cpp_get_buff(cpp_reader * pfile,size_t min_size)4487 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
4488 {
4489   _cpp_buff *result, **p;
4490 
4491   for (p = &pfile->free_buffs;; p = &(*p)->next)
4492     {
4493       size_t size;
4494 
4495       if (*p == NULL)
4496           return new_buff (min_size);
4497       result = *p;
4498       size = result->limit - result->base;
4499       /* Return a buffer that's big enough, but don't waste one that's
4500          way too big.  */
4501       if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
4502           break;
4503     }
4504 
4505   *p = result->next;
4506   result->next = NULL;
4507   result->cur = result->base;
4508   return result;
4509 }
4510 
4511 /* Creates a new buffer with enough space to hold the uncommitted
4512    remaining bytes of BUFF, and at least MIN_EXTRA more bytes.  Copies
4513    the excess bytes to the new buffer.  Chains the new buffer after
4514    BUFF, and returns the new buffer.  */
4515 _cpp_buff *
_cpp_append_extend_buff(cpp_reader * pfile,_cpp_buff * buff,size_t min_extra)4516 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
4517 {
4518   size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
4519   _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
4520 
4521   buff->next = new_buff;
4522   memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
4523   return new_buff;
4524 }
4525 
4526 /* Creates a new buffer with enough space to hold the uncommitted
4527    remaining bytes of the buffer pointed to by BUFF, and at least
4528    MIN_EXTRA more bytes.  Copies the excess bytes to the new buffer.
4529    Chains the new buffer before the buffer pointed to by BUFF, and
4530    updates the pointer to point to the new buffer.  */
4531 void
_cpp_extend_buff(cpp_reader * pfile,_cpp_buff ** pbuff,size_t min_extra)4532 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
4533 {
4534   _cpp_buff *new_buff, *old_buff = *pbuff;
4535   size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
4536 
4537   new_buff = _cpp_get_buff (pfile, size);
4538   memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
4539   new_buff->next = old_buff;
4540   *pbuff = new_buff;
4541 }
4542 
4543 /* Free a chain of buffers starting at BUFF.  */
4544 void
_cpp_free_buff(_cpp_buff * buff)4545 _cpp_free_buff (_cpp_buff *buff)
4546 {
4547   _cpp_buff *next;
4548 
4549   for (; buff; buff = next)
4550     {
4551       next = buff->next;
4552 #ifdef ENABLE_VALGRIND_ANNOTATIONS
4553       free (buff);
4554 #else
4555       free (buff->base);
4556 #endif
4557     }
4558 }
4559 
4560 /* Allocate permanent, unaligned storage of length LEN.  */
4561 unsigned char *
_cpp_unaligned_alloc(cpp_reader * pfile,size_t len)4562 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
4563 {
4564   _cpp_buff *buff = pfile->u_buff;
4565   unsigned char *result = buff->cur;
4566 
4567   if (len > (size_t) (buff->limit - result))
4568     {
4569       buff = _cpp_get_buff (pfile, len);
4570       buff->next = pfile->u_buff;
4571       pfile->u_buff = buff;
4572       result = buff->cur;
4573     }
4574 
4575   buff->cur = result + len;
4576   return result;
4577 }
4578 
4579 /* Allocate permanent, unaligned storage of length LEN from a_buff.
4580    That buffer is used for growing allocations when saving macro
4581    replacement lists in a #define, and when parsing an answer to an
4582    assertion in #assert, #unassert or #if (and therefore possibly
4583    whilst expanding macros).  It therefore must not be used by any
4584    code that they might call: specifically the lexer and the guts of
4585    the macro expander.
4586 
4587    All existing other uses clearly fit this restriction: storing
4588    registered pragmas during initialization.  */
4589 unsigned char *
_cpp_aligned_alloc(cpp_reader * pfile,size_t len)4590 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
4591 {
4592   _cpp_buff *buff = pfile->a_buff;
4593   unsigned char *result = buff->cur;
4594 
4595   if (len > (size_t) (buff->limit - result))
4596     {
4597       buff = _cpp_get_buff (pfile, len);
4598       buff->next = pfile->a_buff;
4599       pfile->a_buff = buff;
4600       result = buff->cur;
4601     }
4602 
4603   buff->cur = result + len;
4604   return result;
4605 }
4606 
4607 /* Commit or allocate storage from a buffer.  */
4608 
4609 void *
_cpp_commit_buff(cpp_reader * pfile,size_t size)4610 _cpp_commit_buff (cpp_reader *pfile, size_t size)
4611 {
4612   void *ptr = BUFF_FRONT (pfile->a_buff);
4613 
4614   if (pfile->hash_table->alloc_subobject)
4615     {
4616       void *copy = pfile->hash_table->alloc_subobject (size);
4617       memcpy (copy, ptr, size);
4618       ptr = copy;
4619     }
4620   else
4621     BUFF_FRONT (pfile->a_buff) += size;
4622 
4623   return ptr;
4624 }
4625 
4626 /* Say which field of TOK is in use.  */
4627 
4628 enum cpp_token_fld_kind
cpp_token_val_index(const cpp_token * tok)4629 cpp_token_val_index (const cpp_token *tok)
4630 {
4631   switch (TOKEN_SPELL (tok))
4632     {
4633     case SPELL_IDENT:
4634       return CPP_TOKEN_FLD_NODE;
4635     case SPELL_LITERAL:
4636       return CPP_TOKEN_FLD_STR;
4637     case SPELL_OPERATOR:
4638       /* Operands which were originally spelled as ident keep around
4639          the node for the exact spelling.  */
4640       if (tok->flags & NAMED_OP)
4641           return CPP_TOKEN_FLD_NODE;
4642       else if (tok->type == CPP_PASTE)
4643           return CPP_TOKEN_FLD_TOKEN_NO;
4644       else
4645           return CPP_TOKEN_FLD_NONE;
4646     case SPELL_NONE:
4647       if (tok->type == CPP_MACRO_ARG)
4648           return CPP_TOKEN_FLD_ARG_NO;
4649       else if (tok->type == CPP_PADDING)
4650           return CPP_TOKEN_FLD_SOURCE;
4651       else if (tok->type == CPP_PRAGMA)
4652           return CPP_TOKEN_FLD_PRAGMA;
4653       /* fall through */
4654     default:
4655       return CPP_TOKEN_FLD_NONE;
4656     }
4657 }
4658 
4659 /* All tokens lexed in R after calling this function will be forced to
4660    have their location_t to be P, until
4661    cpp_stop_forcing_token_locations is called for R.  */
4662 
4663 void
cpp_force_token_locations(cpp_reader * r,location_t loc)4664 cpp_force_token_locations (cpp_reader *r, location_t loc)
4665 {
4666   r->forced_token_location = loc;
4667 }
4668 
4669 /* Go back to assigning locations naturally for lexed tokens.  */
4670 
4671 void
cpp_stop_forcing_token_locations(cpp_reader * r)4672 cpp_stop_forcing_token_locations (cpp_reader *r)
4673 {
4674   r->forced_token_location = 0;
4675 }
4676 
/* PEEK points at a backslash.  While that backslash escapes a line
   ending ("\n", "\r" or "\r\n"), look past it, repeating for any
   further escaped line endings that follow directly.  If the position
   after an escaped line ending would be at or beyond LIMIT, don't
   advance.  Returns the resulting position, which is PEEK itself when
   the backslash does not escape a line ending.  */

static const unsigned char *
do_peek_backslash (const unsigned char *peek, const unsigned char *limit)
{
  for (;;)
    {
      const unsigned char *after;

      if (__builtin_expect (peek[1] == '\n', true))
        after = peek + 2;
      else if (__builtin_expect (peek[1] == '\r', false))
        /* A lone CR, or a CR LF pair.  */
        after = peek + (peek[2] == '\n' ? 3 : 2);
      else
        return peek;

      if (__builtin_expect (after >= limit, false))
        return peek;

      peek = after;

      /* The user might be perverse and chain escaped newlines.  */
      if (*peek != '\\')
        return peek;
    }
}
4706 
/* Return the position of the next character to examine at PEEK: if
   PEEK is looking at a backslash, let do_peek_backslash skip any
   escaped line endings; otherwise PEEK is returned unchanged.  */
static const unsigned char *
do_peek_next (const unsigned char *peek, const unsigned char *limit)
{
  return (__builtin_expect (*peek == '\\', false)
          ? do_peek_backslash (peek, limit)
          : peek);
}
4714 
/* Step back one character from PEEK without going below BOUND, and
   return a pointer to that character.  Returns NULL if PEEK is already
   at BOUND.  If the character stepped onto is a line ending ("\n",
   "\r" or the "\n" of a "\r\n" pair) that is immediately preceded by a
   backslash, the escaped line ending is skipped and the search
   continues from the backslash, so the result is the character before
   the backslash (or NULL if nothing precedes it).  An unescaped line
   ending is returned as is.  */
static const unsigned char *
do_peek_prev (const unsigned char *peek, const unsigned char *bound)
{
  if (peek == bound)
    return NULL;

  unsigned char c = *--peek;
  /* Both newline and carriage return terminate lines; note that this
     must test for '\r', not the letter 'r'.  */
  if (__builtin_expect (c == '\n', false)
      || __builtin_expect (c == '\r', false))
    {
      if (peek == bound)
        return peek;

      /* Offset from PEEK of the character before the line ending.  */
      int ix = -1;
      if (c == '\n' && peek[ix] == '\r')
        {
          /* Windows "\r\n": look before the '\r' as well, unless it
             is the first character available.  */
          if (peek + ix == bound)
            return peek;
          ix--;
        }

      if (peek[ix] == '\\')
        return do_peek_prev (peek + ix, bound);

      return peek;
    }
  else
    return peek;
}
4743 
/* PEEK[-1] is expected to be the first character of identifier MATCH.
   Check that the remaining characters of MATCH follow at PEEK
   (allowing escaped line endings between them) and that the
   identifier ends there.  On success, skip trailing spaces, tabs and
   escaped line endings and return the resulting position; otherwise
   return NULL.  */

static const unsigned char *
do_peek_ident (const char *match, const unsigned char *peek,
               const unsigned char *limit)
{
  const char *want;

  /* MATCH[0] has already been seen by the caller.  */
  for (want = match + 1; *want != '\0'; want++)
    {
      if (*peek != *want)
        {
          /* Retry after stepping over a possible escaped newline.  */
          peek = do_peek_next (peek, limit);
          if (*peek != *want)
            return NULL;
        }
      peek++;
    }

  /* Must now not be looking at an identifier char.  */
  peek = do_peek_next (peek, limit);
  if (ISIDNUM (*peek))
    return NULL;

  /* Skip control-line whitespace.  */
  for (;;)
    {
      while (*peek == ' ' || *peek == '\t')
        peek++;

      if (__builtin_expect (*peek != '\\', true))
        break;

      peek = do_peek_backslash (peek, limit);
      /* Still on a backslash: it did not escape a line ending.  */
      if (*peek == '\\')
        break;
    }

  return peek;
}
4777 
4778 /* Are we looking at a module control line starting as PEEK - 1?  */
4779 
4780 static bool
do_peek_module(cpp_reader * pfile,unsigned char c,const unsigned char * peek,const unsigned char * limit)4781 do_peek_module (cpp_reader *pfile, unsigned char c,
4782                     const unsigned char *peek, const unsigned char *limit)
4783 {
4784   bool import = false;
4785 
4786   if (__builtin_expect (c == 'e', false))
4787     {
4788       if (!((peek[0] == 'x' || peek[0] == '\\')
4789               && (peek = do_peek_ident ("export", peek, limit))))
4790           return false;
4791 
4792       /* export, peek for import or module.  No need to peek __import
4793            here.  */
4794       if (peek[0] == 'i')
4795           {
4796             if (!((peek[1] == 'm' || peek[1] == '\\')
4797                     && (peek = do_peek_ident ("import", peek + 1, limit))))
4798               return false;
4799             import = true;
4800           }
4801       else if (peek[0] == 'm')
4802           {
4803             if (!((peek[1] == 'o' || peek[1] == '\\')
4804                     && (peek = do_peek_ident ("module", peek + 1, limit))))
4805               return false;
4806           }
4807       else
4808           return false;
4809     }
4810   else if (__builtin_expect (c == 'i', false))
4811     {
4812       if (!((peek[0] == 'm' || peek[0] == '\\')
4813               && (peek = do_peek_ident ("import", peek, limit))))
4814           return false;
4815       import = true;
4816     }
4817   else if (__builtin_expect (c == '_', false))
4818     {
4819       /* Needed for translated includes.   */
4820       if (!((peek[0] == '_' || peek[0] == '\\')
4821               && (peek = do_peek_ident ("__import", peek, limit))))
4822           return false;
4823       import = true;
4824     }
4825   else if (__builtin_expect (c == 'm', false))
4826     {
4827       if (!((peek[0] == 'o' || peek[0] == '\\')
4828               && (peek = do_peek_ident ("module", peek, limit))))
4829           return false;
4830     }
4831   else
4832     return false;
4833 
4834   /* Peek the next character to see if it's good enough.  We'll be at
4835      the first non-whitespace char, including skipping an escaped
4836      newline.  */
4837   /* ... import followed by identifier, ':', '<' or header-name
4838      preprocessing tokens, or module followed by identifier, ':' or
4839      ';' preprocessing tokens.  */
4840   unsigned char p = *peek++;
4841 
4842   /* A character literal is ... single quotes, ... optionally preceded
4843      by u8, u, U, or L */
4844   /* A string-literal is a ... double quotes, optionally prefixed by
4845      R, u8, u8R, u, uR, U, UR, L, or LR */
4846   if (p == 'u')
4847     {
4848       peek = do_peek_next (peek, limit);
4849       if (*peek == '8')
4850           {
4851             peek++;
4852             goto peek_u8;
4853           }
4854       goto peek_u;
4855     }
4856   else if (p == 'U' || p == 'L')
4857     {
4858     peek_u8:
4859       peek = do_peek_next (peek, limit);
4860     peek_u:
4861       if (*peek == '\"' || *peek == '\'')
4862           return false;
4863 
4864       if (*peek == 'R')
4865           goto peek_R;
4866       /* Identifier. Ok.  */
4867     }
4868   else if (p == 'R')
4869     {
4870     peek_R:
4871       if (CPP_OPTION (pfile, rliterals))
4872           {
4873             peek = do_peek_next (peek, limit);
4874             if (*peek == '\"')
4875               return false;
4876           }
4877       /* Identifier. Ok.  */
4878     }
4879   else if ('Z' - 'A' == 25
4880              ? ((p >= 'A' && p <= 'Z') || (p >= 'a' && p <= 'z') || p == '_')
4881              : ISIDST (p))
4882     {
4883       /* Identifier.  Ok. */
4884     }
4885   else if (p == '<')
4886     {
4887       /* Maybe angle header, ok for import.  Reject
4888            '<=', '<<' digraph:'<:'.  */
4889       if (!import)
4890           return false;
4891       peek = do_peek_next (peek, limit);
4892       if (*peek == '=' || *peek == '<'
4893             || (*peek == ':' && CPP_OPTION (pfile, digraphs)))
4894           return false;
4895     }
4896   else if (p == ';')
4897     {
4898       /* SEMICOLON, ok for module.  */
4899       if (import)
4900           return false;
4901     }
4902   else if (p == '"')
4903     {
4904       /* STRING, ok for import.  */
4905       if (!import)
4906           return false;
4907     }
4908   else if (p == ':')
4909     {
4910       /* Maybe COLON, ok.  Reject '::', digraph:':>'.  */
4911       peek = do_peek_next (peek, limit);
4912       if (*peek == ':' || (*peek == '>' && CPP_OPTION (pfile, digraphs)))
4913           return false;
4914     }
4915   else
4916     /* FIXME: Detect a unicode character, excluding those not
4917        permitted as the initial character. [lex.name]/1.  I presume
4918        we need to check the \[uU] spellings, and directly using
4919        Unicode in say UTF8 form?  Or perhaps we do the phase-1
4920        conversion of UTF8 to universal-character-names?  */
4921     return false;
4922 
4923   return true;
4924 }
4925 
4926 /* Directives-only scanning.  Somewhat more relaxed than correct
4927    parsing -- some ill-formed programs will not be rejected.  */
4928 
4929 void
cpp_directive_only_process(cpp_reader * pfile,void * data,void (* cb)(cpp_reader *,CPP_DO_task,void *,...))4930 cpp_directive_only_process (cpp_reader *pfile,
4931                                   void *data,
4932                                   void (*cb) (cpp_reader *, CPP_DO_task, void *, ...))
4933 {
4934   bool module_p = CPP_OPTION (pfile, module_directives);
4935 
4936   do
4937     {
4938     restart:
4939       /* Buffer initialization, but no line cleaning. */
4940       cpp_buffer *buffer = pfile->buffer;
4941       buffer->cur_note = buffer->notes_used = 0;
4942       buffer->cur = buffer->line_base = buffer->next_line;
4943       buffer->need_line = false;
4944       /* Files always end in a newline or carriage return.  We rely on this for
4945            character peeking safety.  */
4946       gcc_assert (buffer->rlimit[0] == '\n' || buffer->rlimit[0] == '\r');
4947 
4948       const unsigned char *base = buffer->cur;
4949       unsigned line_count = 0;
4950       const unsigned char *line_start = base;
4951 
4952       bool bol = true;
4953       bool raw = false;
4954 
4955       const unsigned char *lwm = base;
4956       for (const unsigned char *pos = base, *limit = buffer->rlimit;
4957              pos < limit;)
4958           {
4959             unsigned char c = *pos++;
4960             /* This matches the switch in _cpp_lex_direct.  */
4961             switch (c)
4962               {
4963               case ' ': case '\t': case '\f': case '\v':
4964                 /* Whitespace, do nothing.  */
4965                 break;
4966 
4967               case '\r': /* MAC line ending, or Windows \r\n  */
4968                 if (*pos == '\n')
4969                     pos++;
4970                 /* FALLTHROUGH */
4971 
4972               case '\n':
4973                 bol = true;
4974 
4975               next_line:
4976                 CPP_INCREMENT_LINE (pfile, 0);
4977                 line_count++;
4978                 line_start = pos;
4979                 break;
4980 
4981               case '\\':
4982                 /* <backslash><newline> is removed, and doesn't undo any
4983                      preceeding escape or whatnot.  */
4984                 if (*pos == '\n')
4985                     {
4986                       pos++;
4987                       goto next_line;
4988                     }
4989                 else if (*pos == '\r')
4990                     {
4991                       if (pos[1] == '\n')
4992                         pos++;
4993                       pos++;
4994                       goto next_line;
4995                     }
4996                 goto dflt;
4997 
4998               case '#':
4999                 if (bol)
5000                     {
5001                       /* Line directive.  */
5002                       if (pos - 1 > base && !pfile->state.skipping)
5003                         cb (pfile, CPP_DO_print, data,
5004                               line_count, base, pos - 1 - base);
5005 
5006                       /* Prep things for directive handling. */
5007                       buffer->next_line = pos;
5008                       buffer->need_line = true;
5009                       bool ok = _cpp_get_fresh_line (pfile);
5010                       gcc_checking_assert (ok);
5011 
5012                       /* Ensure proper column numbering for generated
5013                          error messages. */
5014                       buffer->line_base -= pos - line_start;
5015 
5016                       _cpp_handle_directive (pfile, line_start + 1 != pos);
5017 
5018                       /* Sanitize the line settings.  Duplicate #include's can
5019                          mess things up. */
5020                       // FIXME: Necessary?
5021                       pfile->line_table->highest_location
5022                         = pfile->line_table->highest_line;
5023 
5024                       if (!pfile->state.skipping
5025                           && pfile->buffer->next_line < pfile->buffer->rlimit)
5026                         cb (pfile, CPP_DO_location, data,
5027                               pfile->line_table->highest_line);
5028 
5029                       goto restart;
5030                     }
5031                 goto dflt;
5032 
5033               case '/':
5034                 {
5035                     const unsigned char *peek = do_peek_next (pos, limit);
5036                     if (!(*peek == '/' || *peek == '*'))
5037                       goto dflt;
5038 
5039                     /* Line or block comment  */
5040                     bool is_block = *peek == '*';
5041                     bool star = false;
5042                     bool esc = false;
5043                     location_t sloc
5044                       = linemap_position_for_column (pfile->line_table,
5045                                                              pos - line_start);
5046 
5047                     while (pos < limit)
5048                       {
5049                         char c = *pos++;
5050                         switch (c)
5051                           {
5052                           case '\\':
5053                               esc = true;
5054                               break;
5055 
5056                           case '\r':
5057                               if (*pos == '\n')
5058                                 pos++;
5059                               /* FALLTHROUGH  */
5060 
5061                           case '\n':
5062                               {
5063                                 CPP_INCREMENT_LINE (pfile, 0);
5064                                 line_count++;
5065                                 line_start = pos;
5066                                 if (!esc && !is_block)
5067                                   {
5068                                     bol = true;
5069                                     goto done_comment;
5070                                   }
5071                               }
5072                               if (!esc)
5073                                 star = false;
5074                               esc = false;
5075                               break;
5076 
5077                           case '*':
5078                               if (pos > peek)
5079                                 star = is_block;
5080                               esc = false;
5081                               break;
5082 
5083                           case '/':
5084                               if (star)
5085                                 goto done_comment;
5086                               /* FALLTHROUGH  */
5087 
5088                           default:
5089                               star = false;
5090                               esc = false;
5091                               break;
5092                           }
5093                       }
5094                     if (pos < limit || is_block)
5095                       cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
5096                                                "unterminated comment");
5097                 done_comment:
5098                     lwm = pos;
5099                     break;
5100                 }
5101 
5102               case '\'':
5103                 if (!CPP_OPTION (pfile, digit_separators))
5104                     goto delimited_string;
5105 
5106                 /* Possibly a number punctuator.  */
5107                 if (!ISIDNUM (*do_peek_next (pos, limit)))
5108                     goto delimited_string;
5109 
5110                 goto quote_peek;
5111 
5112               case '\"':
5113                 if (!CPP_OPTION (pfile, rliterals))
5114                     goto delimited_string;
5115 
5116               quote_peek:
5117                 {
5118                     /* For ' see if it's a number punctuator
5119                        \.?<digit>(<digit>|<identifier-nondigit>
5120                        |'<digit>|'<nondigit>|[eEpP]<sign>|\.)* */
5121                     /* For " see if it's a raw string
5122                        {U,L,u,u8}R.  This includes CPP_NUMBER detection,
5123                        because that could be 0e+R.  */
5124                     const unsigned char *peek = pos - 1;
5125                     bool quote_first = c == '"';
5126                     bool quote_eight = false;
5127                     bool maybe_number_start = false;
5128                     bool want_number = false;
5129 
5130                     while ((peek = do_peek_prev (peek, lwm)))
5131                       {
5132                         unsigned char p = *peek;
5133                         if (quote_first)
5134                           {
5135                               if (!raw)
5136                                 {
5137                                   if (p != 'R')
5138                                     break;
5139                                   raw = true;
5140                                   continue;
5141                                 }
5142 
5143                               quote_first = false;
5144                               if (p == 'L' || p == 'U' || p == 'u')
5145                                 ;
5146                               else if (p == '8')
5147                                 quote_eight = true;
5148                               else
5149                                 goto second_raw;
5150                           }
5151                         else if (quote_eight)
5152                           {
5153                               if (p != 'u')
5154                                 {
5155                                   raw = false;
5156                                   break;
5157                                 }
5158                               quote_eight = false;
5159                           }
5160                         else if (c == '"')
5161                           {
5162                           second_raw:;
5163                               if (!want_number && ISIDNUM (p))
5164                                 {
5165                                   raw = false;
5166                                   break;
5167                                 }
5168                           }
5169 
5170                         if (ISDIGIT (p))
5171                           maybe_number_start = true;
5172                         else if (p == '.')
5173                           want_number = true;
5174                         else if (ISIDNUM (p))
5175                           maybe_number_start = false;
5176                         else if (p == '+' || p == '-')
5177                           {
5178                               if (const unsigned char *peek_prev
5179                                   = do_peek_prev (peek, lwm))
5180                                 {
5181                                   p = *peek_prev;
5182                                   if (p == 'e' || p == 'E'
5183                                         || p == 'p' || p == 'P')
5184                                     {
5185                                         want_number = true;
5186                                         maybe_number_start = false;
5187                                     }
5188                                   else
5189                                     break;
5190                                 }
5191                               else
5192                                 break;
5193                           }
5194                         else if (p == '\'' || p == '\"')
5195                           {
                              /* If this is lwm, this must be the end of a
                                 previous string.  So this is a trailing
                                 literal type, (a) if those are allowed,
                                 and (b) neither maybe_number_start nor
                                 want_number is set.  Otherwise this must
                                 be a CPP_NUMBER because we've met another
                                 ', and we'd have checked that in its own
                                 right.  */
5203                               if (peek == lwm && CPP_OPTION (pfile, uliterals))
5204                                 {
5205                                   if  (!maybe_number_start && !want_number)
5206                                     /* Must be a literal type.  */
5207                                     raw = false;
5208                                 }
5209                               else if (p == '\''
5210                                          && CPP_OPTION (pfile, digit_separators))
5211                                 maybe_number_start = true;
5212                               break;
5213                           }
5214                         else if (c == '\'')
5215                           break;
5216                         else if (!quote_first && !quote_eight)
5217                           break;
5218                       }
5219 
5220                     if (maybe_number_start)
5221                       {
5222                         if (c == '\'')
5223                           /* A CPP NUMBER.  */
5224                           goto dflt;
5225                         raw = false;
5226                       }
5227 
5228                     goto delimited_string;
5229                 }
5230 
5231               delimited_string:
5232                 {
5233                     /* (Possibly raw) string or char literal.  */
5234                     unsigned char end = c;
5235                     int delim_len = -1;
5236                     const unsigned char *delim = NULL;
5237                     location_t sloc = linemap_position_for_column (pfile->line_table,
5238                                                                              pos - line_start);
5239                     int esc = 0;
5240 
5241                     if (raw)
5242                       {
5243                         /* There can be no line breaks in the delimiter.  */
5244                         delim = pos;
5245                         for (delim_len = 0; (c = *pos++) != '('; delim_len++)
5246                           {
5247                               if (delim_len == 16)
5248                                 {
5249                                   cpp_error_with_line (pfile, CPP_DL_ERROR,
5250                                                              sloc, 0,
5251                                                              "raw string delimiter"
5252                                                              " longer than %d"
5253                                                              " characters",
5254                                                              delim_len);
5255                                   raw = false;
5256                                   pos = delim;
5257                                   break;
5258                                 }
5259                               if (strchr (") \\\t\v\f\n", c))
5260                                 {
5261                                   cpp_error_with_line (pfile, CPP_DL_ERROR,
5262                                                              sloc, 0,
5263                                                              "invalid character '%c'"
5264                                                              " in raw string"
5265                                                              " delimiter", c);
5266                                   raw = false;
5267                                   pos = delim;
5268                                   break;
5269                                 }
5270                               if (pos >= limit)
5271                                 goto bad_string;
5272                           }
5273                       }
5274 
5275                     while (pos < limit)
5276                       {
5277                         char c = *pos++;
5278                         switch (c)
5279                           {
5280                           case '\\':
5281                               if (!raw)
5282                                 esc++;
5283                               break;
5284 
5285                           case '\r':
5286                               if (*pos == '\n')
5287                                 pos++;
5288                               /* FALLTHROUGH  */
5289 
5290                           case '\n':
5291                               {
5292                                 CPP_INCREMENT_LINE (pfile, 0);
5293                                 line_count++;
5294                                 line_start = pos;
5295                               }
5296                               if (esc)
5297                                 esc--;
5298                               break;
5299 
5300                           case ')':
5301                               if (raw
5302                                   && pos + delim_len + 1 < limit
5303                                   && pos[delim_len] == end
5304                                   && !memcmp (delim, pos, delim_len))
5305                                 {
5306                                   pos += delim_len + 1;
5307                                   raw = false;
5308                                   goto done_string;
5309                                 }
5310                               break;
5311 
5312                           default:
5313                               if (!raw && !(esc & 1) && c == end)
5314                                 goto done_string;
5315                               esc = 0;
5316                               break;
5317                           }
5318                       }
5319                 bad_string:
5320                     cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
5321                                              "unterminated literal");
5322 
5323                 done_string:
5324                     raw = false;
5325                     lwm = pos - 1;
5326                 }
5327                 goto dflt;
5328 
5329               case '_':
5330               case 'e':
5331               case 'i':
5332               case 'm':
5333                 if (bol && module_p && !pfile->state.skipping
5334                       && do_peek_module (pfile, c, pos, limit))
5335                     {
5336                       /* We've seen the start of a module control line.
5337                          Start up the tokenizer.  */
5338                       pos--; /* Backup over the first character.  */
5339 
5340                       /* Backup over whitespace to start of line.  */
5341                       while (pos > line_start
5342                                && (pos[-1] == ' ' || pos[-1] == '\t'))
5343                         pos--;
5344 
5345                       if (pos > base)
5346                         cb (pfile, CPP_DO_print, data, line_count, base, pos - base);
5347 
5348                       /* Prep things for directive handling. */
5349                       buffer->next_line = pos;
5350                       buffer->need_line = true;
5351 
5352                       /* Now get tokens until the PRAGMA_EOL.  */
5353                       do
5354                         {
5355                           location_t spelling;
5356                           const cpp_token *tok
5357                               = cpp_get_token_with_location (pfile, &spelling);
5358 
5359                           gcc_assert (pfile->state.in_deferred_pragma
5360                                           || tok->type == CPP_PRAGMA_EOL);
5361                           cb (pfile, CPP_DO_token, data, tok, spelling);
5362                         }
5363                       while (pfile->state.in_deferred_pragma);
5364 
5365                       if (pfile->buffer->next_line < pfile->buffer->rlimit)
5366                         cb (pfile, CPP_DO_location, data,
5367                               pfile->line_table->highest_line);
5368 
5369                       pfile->mi_valid = false;
5370                       goto restart;
5371                     }
5372                 goto dflt;
5373 
5374               default:
5375               dflt:
5376                 bol = false;
5377                 pfile->mi_valid = false;
5378                 break;
5379               }
5380           }
5381 
5382       if (buffer->rlimit > base && !pfile->state.skipping)
5383           {
5384             const unsigned char *limit = buffer->rlimit;
5385             /* If the file was not newline terminated, add rlimit, which is
5386                guaranteed to point to a newline, to the end of our range.  */
5387             if (limit[-1] != '\n')
5388               {
5389                 limit++;
5390                 CPP_INCREMENT_LINE (pfile, 0);
5391                 line_count++;
5392               }
5393             cb (pfile, CPP_DO_print, data, line_count, base, limit - base);
5394           }
5395 
5396       _cpp_pop_buffer (pfile);
5397     }
5398   while (pfile->buffer);
5399 }
5400