1 /* This is the Assembler Pre-Processor
2    Copyright 1987, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998,
3    1999, 2000, 2001, 2002, 2003, 2006, 2007
4    Free Software Foundation, Inc.
5 
6    This file is part of GAS, the GNU Assembler.
7 
8    GAS is free software; you can redistribute it and/or modify
9    it under the terms of the GNU General Public License as published by
10    the Free Software Foundation; either version 2, or (at your option)
11    any later version.
12 
13    GAS is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16    GNU General Public License for more details.
17 
18    You should have received a copy of the GNU General Public License
19    along with GAS; see the file COPYING.  If not, write to the Free
20    Software Foundation, 51 Franklin Street - Fifth Floor, Boston, MA
21    02110-1301, USA.  */
22 
23 /* Modified by Allen Wirfs-Brock, Instantiations Inc 2/90.  */
24 /* App, the assembler pre-processor.  This pre-processor strips out
25    excess spaces, turns single-quoted characters into a decimal
26    constant, and turns the # in # <number> <filename> <garbage> into a
27    .linefile.  This needs better error-handling.  */
28 
29 #include "as.h"
30 
/* When the compiler does not announce ISO C (__STDC__ != 1), make
   `const' expand to nothing unless it is already a macro, so the
   const-qualified declarations below are still accepted.  */
#if (__STDC__ != 1)
#ifndef const
#define const  /* empty */
#endif
#endif

#ifdef TC_M68K
/* Whether we are scrubbing in m68k MRI mode.  This is different from
   flag_m68k_mri, because the two flags will be affected by the .mri
   pseudo-op at different times.  Set by do_scrub_begin.  */
static int scrub_m68k_mri;

/* The pseudo-op which switches in and out of MRI mode.  See the
   comment in do_scrub_chars.  The trailing '0' also stands for '1':
   the recogniser in do_scrub_chars accepts either digit there.  */
static const char mri_pseudo[] = ".mri 0";
#else
/* Without m68k support MRI scrubbing never applies; a constant zero
   lets every `if (scrub_m68k_mri)' test be compiled away.  */
#define scrub_m68k_mri 0
#endif

#if defined TC_ARM && defined OBJ_ELF
/* The pseudo-op for which we need to special-case `@' characters.
   See the comment in do_scrub_chars.  */
static const char   symver_pseudo[] = ".symver";
/* Next character of symver_pseudo that do_scrub_chars expects to see,
   or NULL while no match is in progress.  */
static const char * symver_state;
#endif
56 
/* Character classification table, indexed by byte value (0..255).
   Each entry is one of the LEX_IS_* codes below; entries that
   do_scrub_begin never assigns stay 0.  */
static char lex[256];
/* Characters that are always classified as LEX_IS_SYMBOL_COMPONENT
   (do_scrub_begin additionally marks every byte 128..255 that way).  */
static const char symbol_chars[] =
"$._ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";

/* Classification codes stored in lex[].  The value 7 is not used.  */
#define LEX_IS_SYMBOL_COMPONENT		1
#define LEX_IS_WHITESPACE		2
#define LEX_IS_LINE_SEPARATOR		3
#define LEX_IS_COMMENT_START		4
#define LEX_IS_LINE_COMMENT_START	5
#define	LEX_IS_TWOCHAR_COMMENT_1ST	6
#define	LEX_IS_STRINGQUOTE		8
#define	LEX_IS_COLON			9
#define	LEX_IS_NEWLINE			10
#define	LEX_IS_ONECHAR_QUOTE		11
#ifdef TC_V850
/* First '-' of a possible "--" comment introducer.  */
#define LEX_IS_DOUBLEDASH_1ST		12
#endif
#ifdef TC_M32R
#define DOUBLEBAR_PARALLEL
#endif
#ifdef DOUBLEBAR_PARALLEL
/* First '|' of a possible "||" parallel-instruction separator.  */
#define LEX_IS_DOUBLEBAR_1ST		13
#endif
/* Defined unconditionally; do_scrub_begin only assigns it when the
   target supplies tc_parallel_separator_chars.  */
#define LEX_IS_PARALLEL_SEPARATOR	14

/* Predicates on lex[].  The argument is used directly as an array
   index, so it must be in the range 0..255; in particular callers must
   rule out EOF before using them.  */
#define IS_SYMBOL_COMPONENT(c)		(lex[c] == LEX_IS_SYMBOL_COMPONENT)
#define IS_WHITESPACE(c)		(lex[c] == LEX_IS_WHITESPACE)
#define IS_LINE_SEPARATOR(c)		(lex[c] == LEX_IS_LINE_SEPARATOR)
#define IS_PARALLEL_SEPARATOR(c)	(lex[c] == LEX_IS_PARALLEL_SEPARATOR)
#define IS_COMMENT(c)			(lex[c] == LEX_IS_COMMENT_START)
#define IS_LINE_COMMENT(c)		(lex[c] == LEX_IS_LINE_COMMENT_START)
#define	IS_NEWLINE(c)			(lex[c] == LEX_IS_NEWLINE)

/* Translates the character following a backslash in a one-character
   quote; defined further down.  */
static int process_escape (int);
90 
91 /* FIXME-soon: The entire lexer/parser thingy should be
92    built statically at compile time rather than dynamically
93    each and every time the assembler is run.  xoxorich.  */
94 
95 void
do_scrub_begin(int m68k_mri ATTRIBUTE_UNUSED)96 do_scrub_begin (int m68k_mri ATTRIBUTE_UNUSED)
97 {
98   const char *p;
99   int c;
100 
101   lex[' '] = LEX_IS_WHITESPACE;
102   lex['\t'] = LEX_IS_WHITESPACE;
103   lex['\r'] = LEX_IS_WHITESPACE;
104   lex['\n'] = LEX_IS_NEWLINE;
105   lex[':'] = LEX_IS_COLON;
106 
107 #ifdef TC_M68K
108   scrub_m68k_mri = m68k_mri;
109 
110   if (! m68k_mri)
111 #endif
112     {
113       lex['"'] = LEX_IS_STRINGQUOTE;
114 
115 #if ! defined (TC_HPPA) && ! defined (TC_I370)
116       /* I370 uses single-quotes to delimit integer, float constants.  */
117       lex['\''] = LEX_IS_ONECHAR_QUOTE;
118 #endif
119 
120 #ifdef SINGLE_QUOTE_STRINGS
121       lex['\''] = LEX_IS_STRINGQUOTE;
122 #endif
123     }
124 
125   /* Note: if any other character can be LEX_IS_STRINGQUOTE, the loop
126      in state 5 of do_scrub_chars must be changed.  */
127 
128   /* Note that these override the previous defaults, e.g. if ';' is a
129      comment char, then it isn't a line separator.  */
130   for (p = symbol_chars; *p; ++p)
131     lex[(unsigned char) *p] = LEX_IS_SYMBOL_COMPONENT;
132 
133   for (c = 128; c < 256; ++c)
134     lex[c] = LEX_IS_SYMBOL_COMPONENT;
135 
136 #ifdef tc_symbol_chars
137   /* This macro permits the processor to specify all characters which
138      may appears in an operand.  This will prevent the scrubber from
139      discarding meaningful whitespace in certain cases.  The i386
140      backend uses this to support prefixes, which can confuse the
141      scrubber as to whether it is parsing operands or opcodes.  */
142   for (p = tc_symbol_chars; *p; ++p)
143     lex[(unsigned char) *p] = LEX_IS_SYMBOL_COMPONENT;
144 #endif
145 
146   /* The m68k backend wants to be able to change comment_chars.  */
147 #ifndef tc_comment_chars
148 #define tc_comment_chars comment_chars
149 #endif
150   for (p = tc_comment_chars; *p; p++)
151     lex[(unsigned char) *p] = LEX_IS_COMMENT_START;
152 
153   for (p = line_comment_chars; *p; p++)
154     lex[(unsigned char) *p] = LEX_IS_LINE_COMMENT_START;
155 
156   for (p = line_separator_chars; *p; p++)
157     lex[(unsigned char) *p] = LEX_IS_LINE_SEPARATOR;
158 
159 #ifdef tc_parallel_separator_chars
160   /* This macro permits the processor to specify all characters which
161      separate parallel insns on the same line.  */
162   for (p = tc_parallel_separator_chars; *p; p++)
163     lex[(unsigned char) *p] = LEX_IS_PARALLEL_SEPARATOR;
164 #endif
165 
166   /* Only allow slash-star comments if slash is not in use.
167      FIXME: This isn't right.  We should always permit them.  */
168   if (lex['/'] == 0)
169     lex['/'] = LEX_IS_TWOCHAR_COMMENT_1ST;
170 
171 #ifdef TC_M68K
172   if (m68k_mri)
173     {
174       lex['\''] = LEX_IS_STRINGQUOTE;
175       lex[';'] = LEX_IS_COMMENT_START;
176       lex['*'] = LEX_IS_LINE_COMMENT_START;
177       /* The MRI documentation says '!' is LEX_IS_COMMENT_START, but
178 	 then it can't be used in an expression.  */
179       lex['!'] = LEX_IS_LINE_COMMENT_START;
180     }
181 #endif
182 
183 #ifdef TC_V850
184   lex['-'] = LEX_IS_DOUBLEDASH_1ST;
185 #endif
186 #ifdef DOUBLEBAR_PARALLEL
187   lex['|'] = LEX_IS_DOUBLEBAR_1ST;
188 #endif
189 #ifdef TC_D30V
190   /* Must do this is we want VLIW instruction with "->" or "<-".  */
191   lex['-'] = LEX_IS_SYMBOL_COMPONENT;
192 #endif
193 }
194 
/* Saved state of the scrubber.  These persist between calls to
   do_scrub_chars so that it can return at any point and later resume;
   app_push and app_pop save and restore them around a nested input.  */

/* Current state of the do_scrub_chars state machine (see the state
   list at the top of that function).  */
static int state;
/* State to return to once a temporary state (string, comment, queued
   output) has finished.  */
static int old_state;
/* Next character of the text being emitted while in state -1.  */
static char *out_string;
/* Scratch buffer for the decimal text of a one-character quote;
   out_string may point into it.  */
static char out_buf[20];
/* Number of newlines swallowed (inside comments or continued strings)
   that still have to be put back into the output.  */
static int add_newlines;
/* When not NULL, input that do_scrub_chars should consume before
   asking for more, together with its length.  */
static char *saved_input;
static int saved_input_len;
/* Buffer handed to the GET callback of do_scrub_chars.  */
static char input_buffer[32 * 1024];
/* Next character of mri_pseudo expected by the .mri recogniser in
   do_scrub_chars, or NULL while no match is in progress.  In the code
   visible here it is only used under TC_M68K.  */
static const char *mri_state;
/* Most recent character accepted by the .mri recogniser; once the
   whole pseudo-op has matched it is '0' or '1'.  */
static char mri_last_ch;
206 
207 /* Data structure for saving the state of app across #include's.  Note that
208    app is called asynchronously to the parsing of the .include's, so our
209    state at the time .include is interpreted is completely unrelated.
210    That's why we have to save it all.  */
211 
/* Snapshot of the file-scope scrubber variables, filled in by app_push
   and consumed by app_pop.  Each member mirrors the static variable of
   the same name.  */
struct app_save
{
  int          state;
  int          old_state;
  char *       out_string;
  char         out_buf[sizeof (out_buf)];
  int          add_newlines;
  /* Heap copy of the pending input (not a pointer into input_buffer),
     or NULL when there was none.  */
  char *       saved_input;
  /* Only meaningful when saved_input is not NULL.  */
  int          saved_input_len;
#ifdef TC_M68K
  int          scrub_m68k_mri;
#endif
  const char * mri_state;
  char         mri_last_ch;
#if defined TC_ARM && defined OBJ_ELF
  const char * symver_state;
#endif
};
230 
231 char *
app_push(void)232 app_push (void)
233 {
234   register struct app_save *saved;
235 
236   saved = (struct app_save *) xmalloc (sizeof (*saved));
237   saved->state = state;
238   saved->old_state = old_state;
239   saved->out_string = out_string;
240   memcpy (saved->out_buf, out_buf, sizeof (out_buf));
241   saved->add_newlines = add_newlines;
242   if (saved_input == NULL)
243     saved->saved_input = NULL;
244   else
245     {
246       saved->saved_input = xmalloc (saved_input_len);
247       memcpy (saved->saved_input, saved_input, saved_input_len);
248       saved->saved_input_len = saved_input_len;
249     }
250 #ifdef TC_M68K
251   saved->scrub_m68k_mri = scrub_m68k_mri;
252 #endif
253   saved->mri_state = mri_state;
254   saved->mri_last_ch = mri_last_ch;
255 #if defined TC_ARM && defined OBJ_ELF
256   saved->symver_state = symver_state;
257 #endif
258 
259   /* do_scrub_begin() is not useful, just wastes time.  */
260 
261   state = 0;
262   saved_input = NULL;
263 
264   return (char *) saved;
265 }
266 
267 void
app_pop(char * arg)268 app_pop (char *arg)
269 {
270   register struct app_save *saved = (struct app_save *) arg;
271 
272   /* There is no do_scrub_end ().  */
273   state = saved->state;
274   old_state = saved->old_state;
275   out_string = saved->out_string;
276   memcpy (out_buf, saved->out_buf, sizeof (out_buf));
277   add_newlines = saved->add_newlines;
278   if (saved->saved_input == NULL)
279     saved_input = NULL;
280   else
281     {
282       assert (saved->saved_input_len <= (int) (sizeof input_buffer));
283       memcpy (input_buffer, saved->saved_input, saved->saved_input_len);
284       saved_input = input_buffer;
285       saved_input_len = saved->saved_input_len;
286       free (saved->saved_input);
287     }
288 #ifdef TC_M68K
289   scrub_m68k_mri = saved->scrub_m68k_mri;
290 #endif
291   mri_state = saved->mri_state;
292   mri_last_ch = saved->mri_last_ch;
293 #if defined TC_ARM && defined OBJ_ELF
294   symver_state = saved->symver_state;
295 #endif
296 
297   free (arg);
298 }
299 
/* Return the character denoted by a backslash escape, where CH is the
   character that followed the backslash.  Only \b, \f, \n, \r and \t
   are translated; every other character, the two quote characters
   included, is returned unchanged.

   @@ This assumes that \n &c are the same on host and target.  This is
   not necessarily true.  */

static int
process_escape (int ch)
{
  /* Each row is an escape letter followed by the character it
     stands for.  */
  static const char translations[][2] =
    {
      { 'b', '\b' },
      { 'f', '\f' },
      { 'n', '\n' },
      { 'r', '\r' },
      { 't', '\t' }
    };
  unsigned int i;

  for (i = 0; i < sizeof (translations) / sizeof (translations[0]); i++)
    {
      if (ch == translations[i][0])
        return translations[i][1];
    }

  /* Not a recognised escape letter: the character stands for itself.  */
  return ch;
}
326 
327 /* This function is called to process input characters.  The GET
328    parameter is used to retrieve more input characters.  GET should
329    set its parameter to point to a buffer, and return the length of
330    the buffer; it should return 0 at end of file.  The scrubbed output
331    characters are put into the buffer starting at TOSTART; the TOSTART
332    buffer is TOLEN bytes in length.  The function returns the number
333    of scrubbed characters put into TOSTART.  This will be TOLEN unless
334    end of file was seen.  This function is arranged as a state
335    machine, and saves its state so that it may return at any point.
336    This is the way the old code used to work.  */
337 
338 int
do_scrub_chars(int (* get)(char *,int),char * tostart,int tolen)339 do_scrub_chars (int (*get) (char *, int), char *tostart, int tolen)
340 {
341   char *to = tostart;
342   char *toend = tostart + tolen;
343   char *from;
344   char *fromend;
345   int fromlen;
346   register int ch, ch2 = 0;
347   /* Character that started the string we're working on.  */
348   static char quotechar;
349 
350   /*State 0: beginning of normal line
351 	  1: After first whitespace on line (flush more white)
352 	  2: After first non-white (opcode) on line (keep 1white)
353 	  3: after second white on line (into operands) (flush white)
354 	  4: after putting out a .linefile, put out digits
355 	  5: parsing a string, then go to old-state
356 	  6: putting out \ escape in a "d string.
357 	  7: no longer used
358 	  8: no longer used
359 	  9: After seeing symbol char in state 3 (keep 1white after symchar)
360 	 10: After seeing whitespace in state 9 (keep white before symchar)
361 	 11: After seeing a symbol character in state 0 (eg a label definition)
362 	 -1: output string in out_string and go to the state in old_state
363 	 -2: flush text until a '*' '/' is seen, then go to state old_state
364 #ifdef TC_V850
365 	 12: After seeing a dash, looking for a second dash as a start
366 	     of comment.
367 #endif
368 #ifdef DOUBLEBAR_PARALLEL
369 	 13: After seeing a vertical bar, looking for a second
370 	     vertical bar as a parallel expression separator.
371 #endif
372 #ifdef TC_IA64
373 	 14: After seeing a `(' at state 0, looking for a `)' as
374 	     predicate.
375 	 15: After seeing a `(' at state 1, looking for a `)' as
376 	     predicate.
377 #endif
378 #ifdef TC_Z80
379 	 16: After seeing an 'a' or an 'A' at the start of a symbol
380 	 17: After seeing an 'f' or an 'F' in state 16
381 #endif
382 	  */
383 
384   /* I added states 9 and 10 because the MIPS ECOFF assembler uses
385      constructs like ``.loc 1 20''.  This was turning into ``.loc
386      120''.  States 9 and 10 ensure that a space is never dropped in
387      between characters which could appear in an identifier.  Ian
388      Taylor, ian@cygnus.com.
389 
390      I added state 11 so that something like "Lfoo add %r25,%r26,%r27" works
391      correctly on the PA (and any other target where colons are optional).
392      Jeff Law, law@cs.utah.edu.
393 
394      I added state 13 so that something like "cmp r1, r2 || trap #1" does not
395      get squashed into "cmp r1,r2||trap#1", with the all important space
396      between the 'trap' and the '#1' being eliminated.  nickc@cygnus.com  */
397 
398   /* This macro gets the next input character.  */
399 
400 #define GET()							\
401   (from < fromend						\
402    ? * (unsigned char *) (from++)				\
403    : (saved_input = NULL,					\
404       fromlen = (*get) (input_buffer, sizeof input_buffer),	\
405       from = input_buffer,					\
406       fromend = from + fromlen,					\
407       (fromlen == 0						\
408        ? EOF							\
409        : * (unsigned char *) (from++))))
410 
411   /* This macro pushes a character back on the input stream.  */
412 
413 #define UNGET(uch) (*--from = (uch))
414 
415   /* This macro puts a character into the output buffer.  If this
416      character fills the output buffer, this macro jumps to the label
417      TOFULL.  We use this rather ugly approach because we need to
418      handle two different termination conditions: EOF on the input
419      stream, and a full output buffer.  It would be simpler if we
420      always read in the entire input stream before processing it, but
421      I don't want to make such a significant change to the assembler's
422      memory usage.  */
423 
424 #define PUT(pch)				\
425   do						\
426     {						\
427       *to++ = (pch);				\
428       if (to >= toend)				\
429 	goto tofull;				\
430     }						\
431   while (0)
432 
433   if (saved_input != NULL)
434     {
435       from = saved_input;
436       fromend = from + saved_input_len;
437     }
438   else
439     {
440       fromlen = (*get) (input_buffer, sizeof input_buffer);
441       if (fromlen == 0)
442 	return 0;
443       from = input_buffer;
444       fromend = from + fromlen;
445     }
446 
447   while (1)
448     {
449       /* The cases in this switch end with continue, in order to
450 	 branch back to the top of this while loop and generate the
451 	 next output character in the appropriate state.  */
452       switch (state)
453 	{
454 	case -1:
455 	  ch = *out_string++;
456 	  if (*out_string == '\0')
457 	    {
458 	      state = old_state;
459 	      old_state = 3;
460 	    }
461 	  PUT (ch);
462 	  continue;
463 
464 	case -2:
465 	  for (;;)
466 	    {
467 	      do
468 		{
469 		  ch = GET ();
470 
471 		  if (ch == EOF)
472 		    {
473 		      as_warn (_("end of file in comment"));
474 		      goto fromeof;
475 		    }
476 
477 		  if (ch == '\n')
478 		    PUT ('\n');
479 		}
480 	      while (ch != '*');
481 
482 	      while ((ch = GET ()) == '*')
483 		;
484 
485 	      if (ch == EOF)
486 		{
487 		  as_warn (_("end of file in comment"));
488 		  goto fromeof;
489 		}
490 
491 	      if (ch == '/')
492 		break;
493 
494 	      UNGET (ch);
495 	    }
496 
497 	  state = old_state;
498 	  UNGET (' ');
499 	  continue;
500 
501 	case 4:
502 	  ch = GET ();
503 	  if (ch == EOF)
504 	    goto fromeof;
505 	  else if (ch >= '0' && ch <= '9')
506 	    PUT (ch);
507 	  else
508 	    {
509 	      while (ch != EOF && IS_WHITESPACE (ch))
510 		ch = GET ();
511 	      if (ch == '"')
512 		{
513 		  quotechar = ch;
514 		  state = 5;
515 		  old_state = 3;
516 		  PUT (ch);
517 		}
518 	      else
519 		{
520 		  while (ch != EOF && ch != '\n')
521 		    ch = GET ();
522 		  state = 0;
523 		  PUT (ch);
524 		}
525 	    }
526 	  continue;
527 
528 	case 5:
529 	  /* We are going to copy everything up to a quote character,
530 	     with special handling for a backslash.  We try to
531 	     optimize the copying in the simple case without using the
532 	     GET and PUT macros.  */
533 	  {
534 	    char *s;
535 	    int len;
536 
537 	    for (s = from; s < fromend; s++)
538 	      {
539 		ch = *s;
540 		if (ch == '\\'
541 		    || ch == quotechar
542 		    || ch == '\n')
543 		  break;
544 	      }
545 	    len = s - from;
546 	    if (len > toend - to)
547 	      len = toend - to;
548 	    if (len > 0)
549 	      {
550 		memcpy (to, from, len);
551 		to += len;
552 		from += len;
553 		if (to >= toend)
554 		  goto tofull;
555 	      }
556 	  }
557 
558 	  ch = GET ();
559 	  if (ch == EOF)
560 	    {
561 	      as_warn (_("end of file in string; '%c' inserted"), quotechar);
562 	      state = old_state;
563 	      UNGET ('\n');
564 	      PUT (quotechar);
565 	    }
566 	  else if (ch == quotechar)
567 	    {
568 	      state = old_state;
569 	      PUT (ch);
570 	    }
571 #ifndef NO_STRING_ESCAPES
572 	  else if (ch == '\\')
573 	    {
574 	      state = 6;
575 	      PUT (ch);
576 	    }
577 #endif
578 	  else if (scrub_m68k_mri && ch == '\n')
579 	    {
580 	      /* Just quietly terminate the string.  This permits lines like
581 		   bne	label	loop if we haven't reach end yet.  */
582 	      state = old_state;
583 	      UNGET (ch);
584 	      PUT ('\'');
585 	    }
586 	  else
587 	    {
588 	      PUT (ch);
589 	    }
590 	  continue;
591 
592 	case 6:
593 	  state = 5;
594 	  ch = GET ();
595 	  switch (ch)
596 	    {
597 	      /* Handle strings broken across lines, by turning '\n' into
598 		 '\\' and 'n'.  */
599 	    case '\n':
600 	      UNGET ('n');
601 	      add_newlines++;
602 	      PUT ('\\');
603 	      continue;
604 
605 	    case EOF:
606 	      as_warn (_("end of file in string; '%c' inserted"), quotechar);
607 	      PUT (quotechar);
608 	      continue;
609 
610 	    case '"':
611 	    case '\\':
612 	    case 'b':
613 	    case 'f':
614 	    case 'n':
615 	    case 'r':
616 	    case 't':
617 	    case 'v':
618 	    case 'x':
619 	    case 'X':
620 	    case '0':
621 	    case '1':
622 	    case '2':
623 	    case '3':
624 	    case '4':
625 	    case '5':
626 	    case '6':
627 	    case '7':
628 	      break;
629 
630 	    default:
631 #ifdef ONLY_STANDARD_ESCAPES
632 	      as_warn (_("unknown escape '\\%c' in string; ignored"), ch);
633 #endif
634 	      break;
635 	    }
636 	  PUT (ch);
637 	  continue;
638 
639 #ifdef DOUBLEBAR_PARALLEL
640 	case 13:
641 	  ch = GET ();
642 	  if (ch != '|')
643 	    abort ();
644 
645 	  /* Reset back to state 1 and pretend that we are parsing a
646 	     line from just after the first white space.  */
647 	  state = 1;
648 	  PUT ('|');
649 	  continue;
650 #endif
651 #ifdef TC_Z80
652 	case 16:
653 	  /* We have seen an 'a' at the start of a symbol, look for an 'f'.  */
654 	  ch = GET ();
655 	  if (ch == 'f' || ch == 'F')
656 	    {
657 	      state = 17;
658 	      PUT (ch);
659 	    }
660 	  else
661 	    {
662 	      state = 9;
663 	      break;
664 	    }
665 	case 17:
666 	  /* We have seen "af" at the start of a symbol,
667 	     a ' here is a part of that symbol.  */
668 	  ch = GET ();
669 	  state = 9;
670 	  if (ch == '\'')
671 	    /* Change to avoid warning about unclosed string.  */
672 	    PUT ('`');
673 	  else
674 	    UNGET (ch);
675 	  break;
676 #endif
677 	}
678 
679       /* OK, we are somewhere in states 0 through 4 or 9 through 11.  */
680 
681       /* flushchar: */
682       ch = GET ();
683 
684 #ifdef TC_IA64
685       if (ch == '(' && (state == 0 || state == 1))
686 	{
687 	  state += 14;
688 	  PUT (ch);
689 	  continue;
690 	}
691       else if (state == 14 || state == 15)
692 	{
693 	  if (ch == ')')
694 	    {
695 	      state -= 14;
696 	      PUT (ch);
697 	      ch = GET ();
698 	    }
699 	  else
700 	    {
701 	      PUT (ch);
702 	      continue;
703 	    }
704 	}
705 #endif
706 
707     recycle:
708 
709 #if defined TC_ARM && defined OBJ_ELF
710       /* We need to watch out for .symver directives.  See the comment later
711 	 in this function.  */
712       if (symver_state == NULL)
713 	{
714 	  if ((state == 0 || state == 1) && ch == symver_pseudo[0])
715 	    symver_state = symver_pseudo + 1;
716 	}
717       else
718 	{
719 	  /* We advance to the next state if we find the right
720 	     character.  */
721 	  if (ch != '\0' && (*symver_state == ch))
722 	    ++symver_state;
723 	  else if (*symver_state != '\0')
724 	    /* We did not get the expected character, or we didn't
725 	       get a valid terminating character after seeing the
726 	       entire pseudo-op, so we must go back to the beginning.  */
727 	    symver_state = NULL;
728 	  else
729 	    {
730 	      /* We've read the entire pseudo-op.  If this is the end
731 		 of the line, go back to the beginning.  */
732 	      if (IS_NEWLINE (ch))
733 		symver_state = NULL;
734 	    }
735 	}
736 #endif /* TC_ARM && OBJ_ELF */
737 
738 #ifdef TC_M68K
739       /* We want to have pseudo-ops which control whether we are in
740 	 MRI mode or not.  Unfortunately, since m68k MRI mode affects
741 	 the scrubber, that means that we need a special purpose
742 	 recognizer here.  */
743       if (mri_state == NULL)
744 	{
745 	  if ((state == 0 || state == 1)
746 	      && ch == mri_pseudo[0])
747 	    mri_state = mri_pseudo + 1;
748 	}
749       else
750 	{
751 	  /* We advance to the next state if we find the right
752 	     character, or if we need a space character and we get any
753 	     whitespace character, or if we need a '0' and we get a
754 	     '1' (this is so that we only need one state to handle
755 	     ``.mri 0'' and ``.mri 1'').  */
756 	  if (ch != '\0'
757 	      && (*mri_state == ch
758 		  || (*mri_state == ' '
759 		      && lex[ch] == LEX_IS_WHITESPACE)
760 		  || (*mri_state == '0'
761 		      && ch == '1')))
762 	    {
763 	      mri_last_ch = ch;
764 	      ++mri_state;
765 	    }
766 	  else if (*mri_state != '\0'
767 		   || (lex[ch] != LEX_IS_WHITESPACE
768 		       && lex[ch] != LEX_IS_NEWLINE))
769 	    {
770 	      /* We did not get the expected character, or we didn't
771 		 get a valid terminating character after seeing the
772 		 entire pseudo-op, so we must go back to the
773 		 beginning.  */
774 	      mri_state = NULL;
775 	    }
776 	  else
777 	    {
	      /* We've read the entire pseudo-op.  mri_last_ch is
		 either '0' or '1' indicating whether to enter or
		 leave MRI mode.  */
781 	      do_scrub_begin (mri_last_ch == '1');
782 	      mri_state = NULL;
783 
784 	      /* We continue handling the character as usual.  The
785 		 main gas reader must also handle the .mri pseudo-op
786 		 to control expression parsing and the like.  */
787 	    }
788 	}
789 #endif
790 
791       if (ch == EOF)
792 	{
793 	  if (state != 0)
794 	    {
795 	      as_warn (_("end of file not at end of a line; newline inserted"));
796 	      state = 0;
797 	      PUT ('\n');
798 	    }
799 	  goto fromeof;
800 	}
801 
802       switch (lex[ch])
803 	{
804 	case LEX_IS_WHITESPACE:
805 	  do
806 	    {
807 	      ch = GET ();
808 	    }
809 	  while (ch != EOF && IS_WHITESPACE (ch));
810 	  if (ch == EOF)
811 	    goto fromeof;
812 
813 	  if (state == 0)
814 	    {
815 	      /* Preserve a single whitespace character at the
816 		 beginning of a line.  */
817 	      state = 1;
818 	      UNGET (ch);
819 	      PUT (' ');
820 	      break;
821 	    }
822 
823 #ifdef KEEP_WHITE_AROUND_COLON
824 	  if (lex[ch] == LEX_IS_COLON)
825 	    {
826 	      /* Only keep this white if there's no white *after* the
827 		 colon.  */
828 	      ch2 = GET ();
829 	      UNGET (ch2);
830 	      if (!IS_WHITESPACE (ch2))
831 		{
832 		  state = 9;
833 		  UNGET (ch);
834 		  PUT (' ');
835 		  break;
836 		}
837 	    }
838 #endif
839 	  if (IS_COMMENT (ch)
840 	      || ch == '/'
841 	      || IS_LINE_SEPARATOR (ch)
842 	      || IS_PARALLEL_SEPARATOR (ch))
843 	    {
844 	      if (scrub_m68k_mri)
845 		{
846 		  /* In MRI mode, we keep these spaces.  */
847 		  UNGET (ch);
848 		  PUT (' ');
849 		  break;
850 		}
851 	      goto recycle;
852 	    }
853 
854 	  /* If we're in state 2 or 11, we've seen a non-white
855 	     character followed by whitespace.  If the next character
856 	     is ':', this is whitespace after a label name which we
857 	     normally must ignore.  In MRI mode, though, spaces are
858 	     not permitted between the label and the colon.  */
859 	  if ((state == 2 || state == 11)
860 	      && lex[ch] == LEX_IS_COLON
861 	      && ! scrub_m68k_mri)
862 	    {
863 	      state = 1;
864 	      PUT (ch);
865 	      break;
866 	    }
867 
868 	  switch (state)
869 	    {
870 	    case 1:
871 	      /* We can arrive here if we leave a leading whitespace
872 		 character at the beginning of a line.  */
873 	      goto recycle;
874 	    case 2:
875 	      state = 3;
876 	      if (to + 1 < toend)
877 		{
878 		  /* Optimize common case by skipping UNGET/GET.  */
879 		  PUT (' ');	/* Sp after opco */
880 		  goto recycle;
881 		}
882 	      UNGET (ch);
883 	      PUT (' ');
884 	      break;
885 	    case 3:
886 	      if (scrub_m68k_mri)
887 		{
888 		  /* In MRI mode, we keep these spaces.  */
889 		  UNGET (ch);
890 		  PUT (' ');
891 		  break;
892 		}
893 	      goto recycle;	/* Sp in operands */
894 	    case 9:
895 	    case 10:
896 	      if (scrub_m68k_mri)
897 		{
898 		  /* In MRI mode, we keep these spaces.  */
899 		  state = 3;
900 		  UNGET (ch);
901 		  PUT (' ');
902 		  break;
903 		}
904 	      state = 10;	/* Sp after symbol char */
905 	      goto recycle;
906 	    case 11:
907 	      if (LABELS_WITHOUT_COLONS || flag_m68k_mri)
908 		state = 1;
909 	      else
910 		{
911 		  /* We know that ch is not ':', since we tested that
912 		     case above.  Therefore this is not a label, so it
913 		     must be the opcode, and we've just seen the
914 		     whitespace after it.  */
915 		  state = 3;
916 		}
917 	      UNGET (ch);
918 	      PUT (' ');	/* Sp after label definition.  */
919 	      break;
920 	    default:
921 	      BAD_CASE (state);
922 	    }
923 	  break;
924 
925 	case LEX_IS_TWOCHAR_COMMENT_1ST:
926 	  ch2 = GET ();
927 	  if (ch2 == '*')
928 	    {
929 	      for (;;)
930 		{
931 		  do
932 		    {
933 		      ch2 = GET ();
934 		      if (ch2 != EOF && IS_NEWLINE (ch2))
935 			add_newlines++;
936 		    }
937 		  while (ch2 != EOF && ch2 != '*');
938 
939 		  while (ch2 == '*')
940 		    ch2 = GET ();
941 
942 		  if (ch2 == EOF || ch2 == '/')
943 		    break;
944 
945 		  /* This UNGET will ensure that we count newlines
946 		     correctly.  */
947 		  UNGET (ch2);
948 		}
949 
950 	      if (ch2 == EOF)
951 		as_warn (_("end of file in multiline comment"));
952 
953 	      ch = ' ';
954 	      goto recycle;
955 	    }
956 #ifdef DOUBLESLASH_LINE_COMMENTS
957 	  else if (ch2 == '/')
958 	    {
959 	      do
960 		{
961 		  ch = GET ();
962 		}
963 	      while (ch != EOF && !IS_NEWLINE (ch));
964 	      if (ch == EOF)
965 		as_warn ("end of file in comment; newline inserted");
966 	      state = 0;
967 	      PUT ('\n');
968 	      break;
969 	    }
970 #endif
971 	  else
972 	    {
973 	      if (ch2 != EOF)
974 		UNGET (ch2);
975 	      if (state == 9 || state == 10)
976 		state = 3;
977 	      PUT (ch);
978 	    }
979 	  break;
980 
981 	case LEX_IS_STRINGQUOTE:
982 	  quotechar = ch;
983 	  if (state == 10)
984 	    {
985 	      /* Preserve the whitespace in foo "bar".  */
986 	      UNGET (ch);
987 	      state = 3;
988 	      PUT (' ');
989 
990 	      /* PUT didn't jump out.  We could just break, but we
991 		 know what will happen, so optimize a bit.  */
992 	      ch = GET ();
993 	      old_state = 3;
994 	    }
995 	  else if (state == 9)
996 	    old_state = 3;
997 	  else
998 	    old_state = state;
999 	  state = 5;
1000 	  PUT (ch);
1001 	  break;
1002 
1003 #ifndef IEEE_STYLE
1004 	case LEX_IS_ONECHAR_QUOTE:
1005 	  if (state == 10)
1006 	    {
1007 	      /* Preserve the whitespace in foo 'b'.  */
1008 	      UNGET (ch);
1009 	      state = 3;
1010 	      PUT (' ');
1011 	      break;
1012 	    }
1013 	  ch = GET ();
1014 	  if (ch == EOF)
1015 	    {
1016 	      as_warn (_("end of file after a one-character quote; \\0 inserted"));
1017 	      ch = 0;
1018 	    }
1019 	  if (ch == '\\')
1020 	    {
1021 	      ch = GET ();
1022 	      if (ch == EOF)
1023 		{
1024 		  as_warn (_("end of file in escape character"));
1025 		  ch = '\\';
1026 		}
1027 	      else
1028 		ch = process_escape (ch);
1029 	    }
1030 	  sprintf (out_buf, "%d", (int) (unsigned char) ch);
1031 
1032 	  /* None of these 'x constants for us.  We want 'x'.  */
1033 	  if ((ch = GET ()) != '\'')
1034 	    {
1035 #ifdef REQUIRE_CHAR_CLOSE_QUOTE
1036 	      as_warn (_("missing close quote; (assumed)"));
1037 #else
1038 	      if (ch != EOF)
1039 		UNGET (ch);
1040 #endif
1041 	    }
1042 	  if (strlen (out_buf) == 1)
1043 	    {
1044 	      PUT (out_buf[0]);
1045 	      break;
1046 	    }
1047 	  if (state == 9)
1048 	    old_state = 3;
1049 	  else
1050 	    old_state = state;
1051 	  state = -1;
1052 	  out_string = out_buf;
1053 	  PUT (*out_string++);
1054 	  break;
1055 #endif
1056 
1057 	case LEX_IS_COLON:
1058 #ifdef KEEP_WHITE_AROUND_COLON
1059 	  state = 9;
1060 #else
1061 	  if (state == 9 || state == 10)
1062 	    state = 3;
1063 	  else if (state != 3)
1064 	    state = 1;
1065 #endif
1066 	  PUT (ch);
1067 	  break;
1068 
1069 	case LEX_IS_NEWLINE:
1070 	  /* Roll out a bunch of newlines from inside comments, etc.  */
1071 	  if (add_newlines)
1072 	    {
1073 	      --add_newlines;
1074 	      UNGET (ch);
1075 	    }
1076 	  /* Fall through.  */
1077 
1078 	case LEX_IS_LINE_SEPARATOR:
1079 	  state = 0;
1080 	  PUT (ch);
1081 	  break;
1082 
1083 	case LEX_IS_PARALLEL_SEPARATOR:
1084 	  state = 1;
1085 	  PUT (ch);
1086 	  break;
1087 
1088 #ifdef TC_V850
1089 	case LEX_IS_DOUBLEDASH_1ST:
1090 	  ch2 = GET ();
1091 	  if (ch2 != '-')
1092 	    {
1093 	      UNGET (ch2);
1094 	      goto de_fault;
1095 	    }
1096 	  /* Read and skip to end of line.  */
1097 	  do
1098 	    {
1099 	      ch = GET ();
1100 	    }
1101 	  while (ch != EOF && ch != '\n');
1102 
1103 	  if (ch == EOF)
1104 	    as_warn (_("end of file in comment; newline inserted"));
1105 
1106 	  state = 0;
1107 	  PUT ('\n');
1108 	  break;
1109 #endif
1110 #ifdef DOUBLEBAR_PARALLEL
1111 	case LEX_IS_DOUBLEBAR_1ST:
1112 	  ch2 = GET ();
1113 	  UNGET (ch2);
1114 	  if (ch2 != '|')
1115 	    goto de_fault;
1116 
1117 	  /* Handle '||' in two states as invoking PUT twice might
1118 	     result in the first one jumping out of this loop.  We'd
1119 	     then lose track of the state and one '|' char.  */
1120 	  state = 13;
1121 	  PUT ('|');
1122 	  break;
1123 #endif
1124 	case LEX_IS_LINE_COMMENT_START:
1125 	  /* FIXME-someday: The two character comment stuff was badly
1126 	     thought out.  On i386, we want '/' as line comment start
1127 	     AND we want C style comments.  hence this hack.  The
1128 	     whole lexical process should be reworked.  xoxorich.  */
1129 	  if (ch == '/')
1130 	    {
1131 	      ch2 = GET ();
1132 	      if (ch2 == '*')
1133 		{
1134 		  old_state = 3;
1135 		  state = -2;
1136 		  break;
1137 		}
1138 	      else
1139 		{
1140 		  UNGET (ch2);
1141 		}
1142 	    }
1143 
1144 	  if (state == 0 || state == 1)	/* Only comment at start of line.  */
1145 	    {
1146 	      int startch;
1147 
1148 	      startch = ch;
1149 
1150 	      do
1151 		{
1152 		  ch = GET ();
1153 		}
1154 	      while (ch != EOF && IS_WHITESPACE (ch));
1155 
1156 	      if (ch == EOF)
1157 		{
1158 		  as_warn (_("end of file in comment; newline inserted"));
1159 		  PUT ('\n');
1160 		  break;
1161 		}
1162 
1163 	      if (ch < '0' || ch > '9' || state != 0 || startch != '#')
1164 		{
1165 		  /* Not a cpp line.  */
1166 		  while (ch != EOF && !IS_NEWLINE (ch))
1167 		    ch = GET ();
1168 		  if (ch == EOF)
1169 		    as_warn (_("end of file in comment; newline inserted"));
1170 		  state = 0;
1171 		  PUT ('\n');
1172 		  break;
1173 		}
1174 	      /* Looks like `# 123 "filename"' from cpp.  */
1175 	      UNGET (ch);
1176 	      old_state = 4;
1177 	      state = -1;
1178 	      if (scrub_m68k_mri)
1179 		out_string = "\tlinefile ";
1180 	      else
1181 		out_string = "\t.linefile ";
1182 	      PUT (*out_string++);
1183 	      break;
1184 	    }
1185 
1186 #ifdef TC_D10V
1187 	  /* All insns end in a char for which LEX_IS_SYMBOL_COMPONENT is true.
1188 	     Trap is the only short insn that has a first operand that is
1189 	     neither register nor label.
1190 	     We must prevent exef0f ||trap #1 to degenerate to exef0f ||trap#1 .
1191 	     We can't make '#' LEX_IS_SYMBOL_COMPONENT because it is
1192 	     already LEX_IS_LINE_COMMENT_START.  However, it is the
1193 	     only character in line_comment_chars for d10v, hence we
1194 	     can recognize it as such.  */
1195 	  /* An alternative approach would be to reset the state to 1 when
1196 	     we see '||', '<'- or '->', but that seems to be overkill.  */
1197 	  if (state == 10)
1198 	    PUT (' ');
1199 #endif
1200 	  /* We have a line comment character which is not at the
1201 	     start of a line.  If this is also a normal comment
1202 	     character, fall through.  Otherwise treat it as a default
1203 	     character.  */
1204 	  if (strchr (tc_comment_chars, ch) == NULL
1205 	      && (! scrub_m68k_mri
1206 		  || (ch != '!' && ch != '*')))
1207 	    goto de_fault;
1208 	  if (scrub_m68k_mri
1209 	      && (ch == '!' || ch == '*' || ch == '#')
1210 	      && state != 1
1211 	      && state != 10)
1212 	    goto de_fault;
1213 	  /* Fall through.  */
1214 	case LEX_IS_COMMENT_START:
1215 #if defined TC_ARM && defined OBJ_ELF
1216 	  /* On the ARM, `@' is the comment character.
1217 	     Unfortunately this is also a special character in ELF .symver
1218 	     directives (and .type, though we deal with those another way).
1219 	     So we check if this line is such a directive, and treat
1220 	     the character as default if so.  This is a hack.  */
1221 	  if ((symver_state != NULL) && (*symver_state == 0))
1222 	    goto de_fault;
1223 #endif
1224 
1225 #ifdef TC_ARM
1226 	  /* For the ARM, care is needed not to damage occurrences of \@
1227 	     by stripping the @ onwards.  Yuck.  */
1228 	  if (to > tostart && *(to - 1) == '\\')
1229 	    /* Do not treat the @ as a start-of-comment.  */
1230 	    goto de_fault;
1231 #endif
1232 
1233 #ifdef WARN_COMMENTS
1234 	  if (!found_comment)
1235 	    as_where (&found_comment_file, &found_comment);
1236 #endif
1237 	  do
1238 	    {
1239 	      ch = GET ();
1240 	    }
1241 	  while (ch != EOF && !IS_NEWLINE (ch));
1242 	  if (ch == EOF)
1243 	    as_warn (_("end of file in comment; newline inserted"));
1244 	  state = 0;
1245 	  PUT ('\n');
1246 	  break;
1247 
1248 	case LEX_IS_SYMBOL_COMPONENT:
1249 	  if (state == 10)
1250 	    {
1251 	      /* This is a symbol character following another symbol
1252 		 character, with whitespace in between.  We skipped
1253 		 the whitespace earlier, so output it now.  */
1254 	      UNGET (ch);
1255 	      state = 3;
1256 	      PUT (' ');
1257 	      break;
1258 	    }
1259 
1260 #ifdef TC_Z80
1261 	  /* "af'" is a symbol containing '\''.  */
1262 	  if (state == 3 && (ch == 'a' || ch == 'A'))
1263 	    {
1264 	      state = 16;
1265 	      PUT (ch);
1266 	      ch = GET ();
1267 	      if (ch == 'f' || ch == 'F')
1268 		{
1269 		  state = 17;
1270 		  PUT (ch);
1271 		  break;
1272 		}
1273 	      else
1274 		{
1275 		  state = 9;
1276 		  if (!IS_SYMBOL_COMPONENT (ch))
1277 		    {
1278 		      UNGET (ch);
1279 		      break;
1280 		    }
1281 		}
1282 	    }
1283 #endif
1284 	  if (state == 3)
1285 	    state = 9;
1286 
1287 	  /* This is a common case.  Quickly copy CH and all the
1288 	     following symbol component or normal characters.  */
1289 	  if (to + 1 < toend
1290 	      && mri_state == NULL
1291 #if defined TC_ARM && defined OBJ_ELF
1292 	      && symver_state == NULL
1293 #endif
1294 	      )
1295 	    {
1296 	      char *s;
1297 	      int len;
1298 
1299 	      for (s = from; s < fromend; s++)
1300 		{
1301 		  int type;
1302 
1303 		  ch2 = *(unsigned char *) s;
1304 		  type = lex[ch2];
1305 		  if (type != 0
1306 		      && type != LEX_IS_SYMBOL_COMPONENT)
1307 		    break;
1308 		}
1309 
1310 	      if (s > from)
1311 		/* Handle the last character normally, for
1312 		   simplicity.  */
1313 		--s;
1314 
1315 	      len = s - from;
1316 
1317 	      if (len > (toend - to) - 1)
1318 		len = (toend - to) - 1;
1319 
1320 	      if (len > 0)
1321 		{
1322 		  PUT (ch);
1323 		  memcpy (to, from, len);
1324 		  to += len;
1325 		  from += len;
1326 		  if (to >= toend)
1327 		    goto tofull;
1328 		  ch = GET ();
1329 		}
1330 	    }
1331 
1332 	  /* Fall through.  */
1333 	default:
1334 	de_fault:
1335 	  /* Some relatively `normal' character.  */
1336 	  if (state == 0)
1337 	    {
1338 	      state = 11;	/* Now seeing label definition.  */
1339 	    }
1340 	  else if (state == 1)
1341 	    {
1342 	      state = 2;	/* Ditto.  */
1343 	    }
1344 	  else if (state == 9)
1345 	    {
1346 	      if (!IS_SYMBOL_COMPONENT (ch))
1347 		state = 3;
1348 	    }
1349 	  else if (state == 10)
1350 	    {
1351 	      if (ch == '\\')
1352 		{
1353 		  /* Special handling for backslash: a backslash may
1354 		     be the beginning of a formal parameter (of a
1355 		     macro) following another symbol character, with
1356 		     whitespace in between.  If that is the case, we
1357 		     output a space before the parameter.  Strictly
1358 		     speaking, correct handling depends upon what the
1359 		     macro parameter expands into; if the parameter
1360 		     expands into something which does not start with
1361 		     an operand character, then we don't want to keep
1362 		     the space.  We don't have enough information to
1363 		     make the right choice, so here we are making the
1364 		     choice which is more likely to be correct.  */
1365 		  if (to + 1 >= toend)
1366 		    {
1367 		      /* If we're near the end of the buffer, save the
1368 		         character for the next time round.  Otherwise
1369 		         we'll lose our state.  */
1370 		      UNGET (ch);
1371 		      goto tofull;
1372 		    }
1373 		  *to++ = ' ';
1374 		}
1375 
1376 	      state = 3;
1377 	    }
1378 	  PUT (ch);
1379 	  break;
1380 	}
1381     }
1382 
1383   /*NOTREACHED*/
1384 
1385  fromeof:
1386   /* We have reached the end of the input.  */
1387   return to - tostart;
1388 
1389  tofull:
1390   /* The output buffer is full.  Save any input we have not yet
1391      processed.  */
1392   if (fromend > from)
1393     {
1394       saved_input = from;
1395       saved_input_len = fromend - from;
1396     }
1397   else
1398     saved_input = NULL;
1399 
1400   return to - tostart;
1401 }
1402 
1403