1 /*
2  * $LynxId: SGML.c,v 1.148 2012/02/10 18:32:26 tom Exp $
3  *
4  *			General SGML Parser code		SGML.c
5  *			========================
6  *
7  *	This module implements an HTStream object.  To parse an
8  *	SGML file, create this object which is a parser.  The object
9  *	is (currently) created by being passed a DTD structure,
10  *	and a target HTStructured object at which to throw the parsed stuff.
11  *
12  *	 6 Feb 93  Binary searches used. Interface modified.
13  */
14 
15 #define HTSTREAM_INTERNAL 1
16 
17 #include <HTUtils.h>
18 
19 #include <SGML.h>
20 #include <HTMLDTD.h>
21 #include <HTAccess.h>
22 #include <HTCJK.h>		/* FIXME: this doesn't belong in SGML.c */
23 #include <UCMap.h>
24 #include <UCDefs.h>
25 #include <UCAux.h>
26 
27 #include <HTChunk.h>
28 #include <HTUtils.h>
29 
30 #include <LYCharSets.h>
31 #include <LYCharVals.h>		/* S/390 -- gil -- 0635 */
32 #include <LYGlobalDefs.h>
33 #include <LYStrings.h>
34 #include <LYLeaks.h>
35 #include <LYUtils.h>
36 
37 #ifdef USE_COLOR_STYLE
38 # include <LYStyle.h>
39 #endif
40 #ifdef USE_PRETTYSRC
41 # include <LYPrettySrc.h>
42 #endif
43 
44 #define AssumeCP1252(context) \
45 	(((context)->inUCLYhndl == LATIN1 \
46 	  || (context)->inUCLYhndl == US_ASCII) \
47 	 && html5_charsets)
48 
49 #define INVALID (-1)
50 
51 static int sgml_offset;
52 
53 #ifdef USE_PRETTYSRC
54 
55 static char *entity_string;	/* this is used for printing entity name.
56 
				   Unconditionally added since redundant assignments don't hurt much */
58 
/*
 * Character sink that discards whatever it is given.  handle_entity() passes
 * it to UCPutUtf8_charstring() in pretty-source view, where only that
 * function's result is wanted and no output should be produced.
 */
static void fake_put_character(void *p GCC_UNUSED,
			       char c GCC_UNUSED)
{
    /* intentionally empty */
}
63 
64 #define START TRUE
65 #define STOP FALSE
66 
/*
 * Set psrc_convert_string, then output x with PUTS().  Written as a single
 * comma expression so the macro stays one statement when used as the body of
 * an unbraced if/else.
 */
#define PUTS_TR(x) (psrc_convert_string = TRUE, PUTS(x))
68 
69 #endif
70 
71 /* my_casecomp() - optimized by the first character, NOT_ASCII ok */
72 #define my_casecomp(a,b)  ((TOUPPER(*a) == TOUPPER(*b)) ? \
73 			AS_casecomp(a,b) : \
74 			(TOASCII(TOUPPER(*a)) - TOASCII(TOUPPER(*b))))
75 
76  /* will use partially inlined version */
77 #define orig_HTChunkPutUtf8Char HTChunkPutUtf8Char
78 #undef HTChunkPutUtf8Char
79 
80 /* ...used for comments and attributes value like href... */
81 #define HTChunkPutUtf8Char(ch,x) \
82     { \
83     if ((TOASCII(x) < 128)  && (ch->size < ch->allocated)) \
84 	ch->data[ch->size++] = (char)x; \
85     else \
86 	orig_HTChunkPutUtf8Char(ch,x); \
87     }
88 
89 #define PUTS(str) ((*context->actions->put_string)(context->target, str))
90 #define PUTC(ch)  ((*context->actions->put_character)(context->target, (char) ch))
91 #define PUTUTF8(code) (UCPutUtf8_charstring((HTStream *)context->target, \
92 		      (putc_func_t*)(context->actions->put_character), code))
93 
94 #ifdef USE_PRETTYSRC
95 #define PRETTYSRC_PUTC(c) if (psrc_view) PUTC(c)
96 #else
97 #define PRETTYSRC_PUTC(c)	/* nothing */
98 #endif
99 
100 /*the following macros are used for pretty source view. */
101 #define IS_C(attr) (attr.type == HTMLA_CLASS)
102 
103 HTCJKlang HTCJK = NOCJK;	/* CJK enum value.              */
104 BOOL HTPassEightBitRaw = FALSE;	/* Pass 161-172,174-255 raw.    */
105 BOOL HTPassEightBitNum = FALSE;	/* Pass ^ numeric entities raw. */
106 BOOL HTPassHighCtrlRaw = FALSE;	/* Pass 127-160,173,&#127; raw. */
107 BOOL HTPassHighCtrlNum = FALSE;	/* Pass &#128;-&#159; raw.      */
108 
109 /*	The State (context) of the parser
110  *
111  *	This is passed with each call to make the parser reentrant
112  *
113  */
114 
115 #define MAX_ATTRIBUTES 36	/* Max number of attributes per element */
116 
117 /*		Element Stack
118  *		-------------
119  *	This allows us to return down the stack reselecting styles.
120  *	As we return, attribute values will be garbage in general.
121  */
/*
 * One entry of the element stack: a singly linked list whose head is kept
 * in HTStream.element_stack.  Entries are obtained with pool_alloc() and
 * released with pool_free().
 */
typedef struct _HTElement HTElement;
struct _HTElement {
    HTElement *next;		/* Previously nested element or 0 */
    HTTag *tag;			/* The tag at this level  */
};
127 
/*
 * States of the parser, stored in HTStream.state.  S_text is pinned to 0;
 * the remaining enumerators follow in the order listed.  state_name() maps
 * each value to its spelling when tracing is compiled in.
 */
typedef enum {
    S_text = 0
    ,S_attr
    ,S_attr_gap
    ,S_comment
    ,S_cro
    ,S_doctype
    ,S_dollar
    ,S_dollar_dq
    ,S_dollar_paren
    ,S_dollar_paren_dq
    ,S_dollar_paren_sq
    ,S_dollar_sq
    ,S_dquoted
    ,S_end
    ,S_entity
    ,S_equals
    ,S_ero
    ,S_esc
    ,S_esc_dq
    ,S_esc_sq
    ,S_exclamation
    ,S_in_kanji
    ,S_incro
    ,S_junk_tag
    ,S_litteral
    ,S_marked
    ,S_nonascii_text
    ,S_nonascii_text_dq
    ,S_nonascii_text_sq
    ,S_paren
    ,S_paren_dq
    ,S_paren_sq
    ,S_pcdata
    ,S_pi
    ,S_script
    ,S_sgmlatt
    ,S_sgmlele
    ,S_sgmlent
    ,S_squoted
    ,S_tag
    ,S_tag_gap
    ,S_tagname_slash
    ,S_value
} sgml_state;
173 
174 /*	Internal Context Data Structure
175  *	-------------------------------
176  */
/*
 * Parser context.  The handlers in this file receive it as their 'context'
 * argument and keep all per-parse state in it.
 */
struct _HTStream {

    const HTStreamClass *isa;	/* inherited from HTStream */

    const SGML_dtd *dtd;	/* dtd->tags is indexed by tag number */
    const HTStructuredClass *actions;	/* target class  */
    HTStructured *target;	/* target object */

    HTTag *current_tag;		/* tag whose attributes are looked up */
    HTTag *slashedtag;
    const HTTag *unknown_tag;	/* attributes of this tag are ignored */
    BOOL extended_html;		/* xhtml */
    BOOL strict_xml;		/* xml */
    BOOL inSELECT;		/* cleared when a stacked SELECT is closed */
    BOOL no_lynx_specialcodes;	/* suppresses put_special_unicodes() output */
    int current_attribute_number;	/* index into present[]/value[], or INVALID */
    HTChunk *string;		/* text handed to the handle_*() functions */
    int leading_spaces;
    int trailing_spaces;
    HTElement *element_stack;	/* innermost open element, or 0 */
    sgml_state state;
    unsigned char kanji_buf;
#ifdef CALLERDATA
    void *callerData;
#endif				/* CALLERDATA */
    BOOL present[MAX_ATTRIBUTES];	/* Flags: attribute is present? */
    char *value[MAX_ATTRIBUTES];	/* NULL, or strings alloc'd with StrAllocCopy_extra() */

    BOOL lead_exclamation;
    BOOL first_dash;
    BOOL end_comment;
    BOOL doctype_bracket;
    BOOL first_bracket;
    BOOL second_bracket;
    BOOL isHex;			/* put_pretty_number() shows "&#x" instead of "&#" */

    HTParentAnchor *node_anchor;	/* anchor queried for charset information */
    LYUCcharset *inUCI;		/* pointer to anchor UCInfo */
    int inUCLYhndl;		/* charset we are fed       */
    LYUCcharset *outUCI;	/* anchor UCInfo for target */
    int outUCLYhndl;		/* charset for target       */
    char utf_count;
    UCode_t utf_char;
    char utf_buf[8];
    char *utf_buf_p;
    UCTransParams T;		/* in->out parameters filled by UCSetTransParams() */
    int current_tag_charset;	/* charset to pass attributes */

    char *recover;
    int recover_index;
    char *include;		/* passed to start/end_element; INCLUDE sections are appended */
    char *active_include;
    int include_index;
    char *url;			/* filled in through LYCheckForCSI() */
    char *csi;			/* filled in through LYDoCSI() */
    int csi_index;
#ifdef USE_PRETTYSRC
    BOOL cur_attr_is_href;	/* current attribute has type HTMLA_HREF */
    BOOL cur_attr_is_name;	/* current attribute has type HTMLA_ANAME */
#endif
};
238 
239 #ifdef NO_LYNX_TRACE
240 #define state_name(n) "state"
241 #else
/*
 * Return the spelling of a parser state (e.g. "S_text") for diagnostics.
 * Values that are not sgml_state enumerators yield "?".
 */
static const char *state_name(sgml_state n)
{
    /* stringify the enumerator so the label always matches its name */
#define STATE_LABEL(state) case state: return #state

    /* *INDENT-OFF* */
    switch (n) {
	STATE_LABEL(S_text);
	STATE_LABEL(S_attr);
	STATE_LABEL(S_attr_gap);
	STATE_LABEL(S_comment);
	STATE_LABEL(S_cro);
	STATE_LABEL(S_doctype);
	STATE_LABEL(S_dollar);
	STATE_LABEL(S_dollar_dq);
	STATE_LABEL(S_dollar_paren);
	STATE_LABEL(S_dollar_paren_dq);
	STATE_LABEL(S_dollar_paren_sq);
	STATE_LABEL(S_dollar_sq);
	STATE_LABEL(S_dquoted);
	STATE_LABEL(S_end);
	STATE_LABEL(S_entity);
	STATE_LABEL(S_equals);
	STATE_LABEL(S_ero);
	STATE_LABEL(S_esc);
	STATE_LABEL(S_esc_dq);
	STATE_LABEL(S_esc_sq);
	STATE_LABEL(S_exclamation);
	STATE_LABEL(S_in_kanji);
	STATE_LABEL(S_incro);
	STATE_LABEL(S_junk_tag);
	STATE_LABEL(S_litteral);
	STATE_LABEL(S_marked);
	STATE_LABEL(S_nonascii_text);
	STATE_LABEL(S_nonascii_text_dq);
	STATE_LABEL(S_nonascii_text_sq);
	STATE_LABEL(S_paren);
	STATE_LABEL(S_paren_dq);
	STATE_LABEL(S_paren_sq);
	STATE_LABEL(S_pcdata);
	STATE_LABEL(S_pi);
	STATE_LABEL(S_script);
	STATE_LABEL(S_sgmlatt);
	STATE_LABEL(S_sgmlele);
	STATE_LABEL(S_sgmlent);
	STATE_LABEL(S_squoted);
	STATE_LABEL(S_tag);
	STATE_LABEL(S_tag_gap);
	STATE_LABEL(S_tagname_slash);
	STATE_LABEL(S_value);
    }
    /* *INDENT-ON* */

#undef STATE_LABEL

    return "?";
}
295 #endif
296 
297 /* storage for Element Stack */
298 #define DEPTH 10
299 static HTElement pool[DEPTH];
300 static int depth = 0;
301 
pool_alloc(void)302 static HTElement *pool_alloc(void)
303 {
304     depth++;
305     if (depth > DEPTH)
306 	return (HTElement *) malloc(sizeof(HTElement));
307     return (pool + depth - 1);
308 }
309 
/*
 * Give back the most recently allocated element-stack entry.  Entries
 * handed out while depth exceeded DEPTH came from the heap, so only those
 * are passed to FREE(); the others live in the static pool[].
 */
static void pool_free(HTElement * e)
{
    int heap_entry = (depth > DEPTH);

    depth--;
    if (heap_entry)
	FREE(e);
}
317 
318 #ifdef USE_PRETTYSRC
319 
/*
 * Replay the tag specifications configured for a lexeme to the target.
 *
 * context	parser whose target receives the start/end element calls
 * lexeme	index into lexeme_start[] / lexeme_end[]
 * start	non-zero selects the lexeme_start[] list, zero lexeme_end[]
 *
 * Each list entry either opens (ts->start set) or closes an element.
 */
static void HTMLSRC_apply_markup(HTStream *context,
				 HTlexeme lexeme,
				 int start)
{
    HT_tagspec *spec;

    for (spec = (start ? lexeme_start : lexeme_end)[lexeme];
	 spec != 0;
	 spec = spec->next) {
	if (spec->start) {
#ifdef USE_COLOR_STYLE
	    /* force the style and class configured for this entry */
	    current_tag_style = spec->style;
	    force_current_tag_style = TRUE;
	    forced_classname = spec->class_name;
	    force_classname = TRUE;
#endif
	    CTRACE((tfp, "SRCSTART %d\n", (int) lexeme));
	    (*context->actions->start_element) (context->target,
						(int) spec->element,
						spec->present,
						(STRING2PTR) spec->value,
						context->current_tag_charset,
						&context->include);
	} else {
	    CTRACE((tfp, "SRCSTOP %d\n", (int) lexeme));
	    (*context->actions->end_element) (context->target,
					      (int) spec->element,
					      &context->include);
	}
    }
}
350 
351 #define PSRCSTART(x)	HTMLSRC_apply_markup(context,HTL_##x,START)
352 #define PSRCSTOP(x)   HTMLSRC_apply_markup(context,HTL_##x,STOP)
353 
354 #define attr_is_href context->cur_attr_is_href
355 #define attr_is_name context->cur_attr_is_name
356 #endif
357 
/*
 * (Re)compute the input->output character translation setup.
 *
 * context	parser whose T and current_tag_charset are updated
 * anchor	anchor queried (and, for defaults, updated) for charset info
 * chndl	if negative, the output charset is first settled from
 *		defaults and stored in context->outUCI / outUCLYhndl;
 *		otherwise the values already in the context are used
 */
static void set_chartrans_handling(HTStream *context,
				   HTParentAnchor *anchor,
				   int chndl)
{
    if (chndl < 0) {
	/*
	 * Nothing was set for the parser in earlier stages.  Fall back, in
	 * order, to the structured stage, the HText stage, and finally the
	 * current display character set.
	 */
	chndl = HTAnchor_getUCLYhndl(anchor, UCT_STAGE_STRUCTURED);
	if (chndl < 0) {
	    chndl = HTAnchor_getUCLYhndl(anchor, UCT_STAGE_HTEXT);
	}
	if (chndl < 0) {
	    chndl = current_char_set;
	}
	/*
	 * Record the choice for both stages with the default lock level (a
	 * value set earlier with a higher lock level is not changed).
	 */
	HTAnchor_setUCInfoStage(anchor, chndl,
				UCT_STAGE_HTEXT, UCT_SETBY_DEFAULT);
	HTAnchor_setUCInfoStage(anchor, chndl,
				UCT_STAGE_STRUCTURED, UCT_SETBY_DEFAULT);
	/*
	 * Read back what the structured stage ended up with.
	 * NOTE(review): the handle is fetched via context->node_anchor, the
	 * info via the anchor argument -- presumably the same object; confirm.
	 */
	context->outUCI = HTAnchor_getUCInfoStage(anchor,
						  UCT_STAGE_STRUCTURED);
	context->outUCLYhndl = HTAnchor_getUCLYhndl(context->node_anchor,
						    UCT_STAGE_STRUCTURED);
    }

    /*
     * Set the in->out transformation parameters.
     */
    UCSetTransParams(&context->T,
		     context->inUCLYhndl, context->inUCI,
		     context->outUCLYhndl, context->outUCI);

    /*
     * Choose the charset reported with each start tag (attribute values).
     */
#ifndef EXP_JAPANESEUTF8_SUPPORT
    if (IS_CJK_TTY) {
	context->current_tag_charset = -1;
    } else
#endif
    if (context->T.transp ||
	context->T.decode_utf8 ||
	context->T.do_8bitraw ||
	context->T.use_raw_char_in) {
	/* input passes through in its own charset */
	context->current_tag_charset = context->inUCLYhndl;
    } else if (context->T.output_utf8 ||
	       context->T.trans_from_uni) {
	context->current_tag_charset = UCGetLYhndl_byMIME("utf-8");
    } else {
	context->current_tag_charset = LATIN1;
    }
}
429 
/*
 * Pick up a change of the parser-stage charset recorded on the node anchor.
 * If the handle differs from the one in use, is valid (>= 0) and has charset
 * info attached, the context's input/output charset fields are refreshed and
 * set_chartrans_handling() is re-run.  Otherwise nothing happens, e.g. when
 * a META asked for an unknown charset.
 */
static void change_chartrans_handling(HTStream *context)
{
    int parser_hndl = HTAnchor_getUCLYhndl(context->node_anchor,
					   UCT_STAGE_PARSER);
    LYUCcharset *parser_UCI;
    LYUCcharset *target_UCI;
    int target_hndl;

    if (parser_hndl == context->inUCLYhndl || parser_hndl < 0)
	return;			/* unchanged, or no usable charset */

    parser_UCI = HTAnchor_getUCInfoStage(context->node_anchor,
					 UCT_STAGE_PARSER);
    if (!parser_UCI)
	return;

    target_UCI = HTAnchor_getUCInfoStage(context->node_anchor,
					 UCT_STAGE_STRUCTURED);
    target_hndl = HTAnchor_getUCLYhndl(context->node_anchor,
				       UCT_STAGE_STRUCTURED);

    context->inUCI = parser_UCI;
    context->inUCLYhndl = parser_hndl;
    context->outUCI = target_UCI;
    context->outUCLYhndl = target_hndl;
    set_chartrans_handling(context, context->node_anchor, target_hndl);
}
457 
458 #ifdef USE_COLOR_STYLE
459 #include <AttrList.h>
460 static int current_is_class = 0;
461 #endif
462 
463 /*	Handle Attribute
464  *	----------------
465  */
466 /* PUBLIC const char * SGML_default = "";   ?? */
467 
/*
 * Look up attribute name s for the current tag and remember its index in
 * context->current_attribute_number (INVALID if the tag has no such
 * attribute).  The lookup is a binary search using my_casecomp(), so it
 * relies on tag->attributes being ordered consistently with that comparison.
 *
 * Outside pretty-source view a hit also marks the attribute present and
 * clears its stored value.  In pretty-source view only the href/name flags
 * are updated.  Attributes of the unknown tag are not looked up at all.
 */
static void handle_attribute_name(HTStream *context, const char *s)
{
    HTTag *tag = context->current_tag;
    const attr *table = tag->attributes;
    int lo, hi;

#ifdef USE_PRETTYSRC
    if (psrc_view) {
	attr_is_href = FALSE;
	attr_is_name = FALSE;
    }
#endif
    /*
     * Ignore unknown tag.
     */
    if (tag == context->unknown_tag) {
#ifdef USE_PRETTYSRC
	if (psrc_view)
	    context->current_attribute_number = 1;	/* anything !=INVALID */
#endif
	return;
    }

    lo = 0;
    hi = tag->number_of_attributes;
    while (lo < hi) {
	int mid = lo + (hi - lo) / 2;
	int order = my_casecomp(table[mid].name, s);

	if (order < 0) {
	    lo = mid + 1;
	    continue;
	}
	if (order > 0) {
	    hi = mid;
	    continue;
	}

	/* exact match */
	context->current_attribute_number = mid;
#ifdef USE_PRETTYSRC
	if (psrc_view) {
	    attr_is_name = (BOOL) (table[mid].type == HTMLA_ANAME);
	    attr_is_href = (BOOL) (table[mid].type == HTMLA_HREF);
	} else
#endif
	{
	    context->present[mid] = YES;
	    Clear_extra(context->value[mid]);
#ifdef USE_COLOR_STYLE
#   ifdef USE_PRETTYSRC
	    current_is_class = IS_C(table[mid]);
#   else
	    current_is_class = (!strcasecomp("class", s));
#   endif
	    CTRACE((tfp, "SGML: found attribute %s, %d\n", s, current_is_class));
#endif
	}
	return;
    }

    CTRACE((tfp, "SGML: Unknown attribute %s for tag %s\n",
	    s, NonNull(context->current_tag->name)));
    context->current_attribute_number = INVALID;	/* Invalid */
}
528 
529 /*	Handle attribute value
530  *	----------------------
531  */
/*
 * Store s as the value of the attribute selected by the preceding
 * handle_attribute_name() call, or trace and drop it when no attribute is
 * selected.  The selection is always reset afterwards, so one attribute
 * cannot receive two values.
 */
static void handle_attribute_value(HTStream *context, const char *s)
{
    int slot = context->current_attribute_number;

    if (slot == INVALID) {
	CTRACE((tfp, "SGML: Attribute value %s ***ignored\n", s));
    } else {
	StrAllocCopy_extra(context->value[slot], s);
#ifdef USE_COLOR_STYLE
	if (!current_is_class) {
	    CTRACE((tfp, "SGML: attribute value is '%s'\n", s));
	} else {
	    /* keep a copy of the class name for the style code */
	    StrNCpy(class_string, s, TEMPSTRINGSIZE);
	    CTRACE((tfp, "SGML: class is '%s'\n", s));
	}
#endif
    }
    context->current_attribute_number = INVALID;	/* can't have two assignments! */
}
549 
550 /*
551  *  Translate some Unicodes to Lynx special codes and output them.
 *  Special codes are those whose output depends on parsing.
553  *
554  *  Additional issue, like handling bidirectional text if necessary
555  *  may be called from here:  zwnj (8204), zwj (8205), lrm (8206), rlm (8207)
556  *  - currently they are ignored in SGML.c and LYCharUtils.c
557  *  but also in UCdomap.c because they are non printable...
558  *
559  */
put_special_unicodes(HTStream * context,UCode_t code)560 static BOOL put_special_unicodes(HTStream *context, UCode_t code)
561 {
562     /* (Tgf_nolyspcl) */
563     if (context->no_lynx_specialcodes) {
564 	/*
565 	 * We were asked by a "DTD" flag to not generate lynx specials.  - kw
566 	 */
567 	return NO;
568     }
569 
570     if (code == CH_NBSP) {	/* S/390 -- gil -- 0657 */
571 	/*
572 	 * Use Lynx special character for nbsp.
573 	 */
574 #ifdef USE_PRETTYSRC
575 	if (!psrc_view)
576 #endif
577 	    PUTC(HT_NON_BREAK_SPACE);
578     } else if (code == CH_SHY) {
579 	/*
580 	 * Use Lynx special character for shy.
581 	 */
582 #ifdef USE_PRETTYSRC
583 	if (!psrc_view)
584 #endif
585 	    PUTC(LY_SOFT_HYPHEN);
586     } else if (code == 8194 || code == 8201) {
587 	/*
588 	 * Use Lynx special character for ensp or thinsp.
589 	 *
590 	 * Originally, Lynx use space '32' as word delimiter and omits this
591 	 * space at end of line if word is wrapped to the next line.  There are
592 	 * several other spaces in the Unicode repertoire and we should teach
593 	 * Lynx to understand them, not only as regular characters but in the
594 	 * context of line wrapping.  Unfortunately, if we use HT_EN_SPACE we
595 	 * override the chartrans tables for those spaces with a single '32'
596 	 * for all (but do line wrapping more fancy).
597 	 *
598 	 * We may treat emsp as one or two ensp (below).
599 	 */
600 #ifdef USE_PRETTYSRC
601 	if (!psrc_view)
602 #endif
603 	    PUTC(HT_EN_SPACE);
604     } else if (code == 8195) {
605 	/*
606 	 * Use Lynx special character for emsp.
607 	 */
608 #ifdef USE_PRETTYSRC
609 	if (!psrc_view) {
610 #endif
611 	    /* PUTC(HT_EN_SPACE);  let's stay with a single space :) */
612 	    PUTC(HT_EN_SPACE);
613 #ifdef USE_PRETTYSRC
614 	}
615 #endif
616     } else {
617 	/*
618 	 * Return NO if nothing done.
619 	 */
620 	return NO;
621     }
622     /*
623      * We have handled it.
624      */
625     return YES;
626 }
627 
628 #ifdef USE_PRETTYSRC
/*
 * Pretty-source view: show a named entity verbatim, wrapped in the markup
 * configured for the "entity" lexeme.  Writes '&', entity_string and, when
 * term is non-zero, the terminating character.
 */
static void put_pretty_entity(HTStream *context, int term)
{
    PSRCSTART(entity);

    PUTC('&');
    PUTS(entity_string);
    if (term != 0) {
	PUTC((char) term);
    }

    PSRCSTOP(entity);
}
638 
/*
 * Pretty-source view: show a numeric character reference verbatim, wrapped
 * in the markup configured for the "entity" lexeme.  The prefix is "&#x"
 * when context->isHex is set and "&#" otherwise; a ';' is always appended.
 */
static void put_pretty_number(HTStream *context)
{
    PSRCSTART(entity);

    if (context->isHex) {
	PUTS("&#x");
    } else {
	PUTS("&#");
    }
    PUTS(entity_string);
    PUTC(';');

    PSRCSTOP(entity);
}
647 #endif /* USE_PRETTYSRC */
648 
649 /*	Handle entity
650  *	-------------
651  *
652  * On entry,
653  *	s	contains the entity name zero terminated
654  * Bugs:
655  *	If the entity name is unknown, the terminator is treated as
656  *	a printable non-special character in all cases, even if it is '<'
657  * Bug-fix:
658  *	Modified SGML_character() so we only come here with terminator
659  *	as '\0' and check a FoundEntity flag. -- Foteos Macrides
660  *
661  * Modified more (for use with Lynx character translation code):
662  */
663 static char replace_buf[64];	/* buffer for replacement strings */
664 static BOOL FoundEntity = FALSE;
665 
/*
 * Emit the replacement for the named entity held in context->string.
 *
 * On entry,
 *	context->string->data	entity name, zero terminated
 *	term			character that ended the name, or '\0'
 *
 * The name is mapped to a Unicode value and the first strategy that applies
 * is used: a Lynx special code, a single character from the chartrans
 * tables, a replacement string, UTF-8 output, plain ASCII.  zwnj, zwj, lrm
 * and rlm are dropped if none of those applied.  In all these cases
 * FoundEntity is set to TRUE.  Otherwise FoundEntity stays FALSE and the
 * entity is echoed as text: '&', the name, then term if it is not '\0'.
 *
 * In pretty-source view a recognized entity is shown verbatim through
 * put_pretty_entity() instead of being translated.
 */
static void handle_entity(HTStream *context, int term)
{
    UCode_t code;
    long uck = -1;
    const char *s = context->string->data;

    /*
     * Handle all entities normally.  - FM
     */
    FoundEntity = FALSE;
    if ((code = HTMLGetEntityUCValue(s)) != 0) {
	/*
	 * We got a Unicode value for the entity name.  Check for special
	 * Unicodes.  - FM
	 */
	if (put_special_unicodes(context, code)) {
#ifdef USE_PRETTYSRC
	    if (psrc_view) {
		put_pretty_entity(context, term);
	    }
#endif
	    FoundEntity = TRUE;
	    return;
	}
	/*
	 * Seek a translation from the chartrans tables.
	 */
	if ((uck = UCTransUniChar(code, context->outUCLYhndl)) >= 32 &&
/* =============== work in ASCII below here ===============  S/390 -- gil -- 0672 */
	    uck < 256 &&
	    (uck < 127 ||
	     uck >= LYlowest_eightbit[context->outUCLYhndl])) {
#ifdef USE_PRETTYSRC
	    if (psrc_view) {
		put_pretty_entity(context, term);
	    } else
#endif
		PUTC(FROMASCII((char) uck));
	    FoundEntity = TRUE;
	    return;
	} else if ((uck == -4 ||
		    (context->T.repl_translated_C0 &&
		     uck > 0 && uck < 32)) &&
	    /*
	     * Not found; look for replacement string.  The assignment is
	     * parenthesized so that uck receives the function's result
	     * rather than the outcome of the comparison.
	     */
		   (uck = UCTransUniCharStr(replace_buf, 60, code,
					    context->outUCLYhndl, 0)) >= 0) {
#ifdef USE_PRETTYSRC
	    if (psrc_view) {
		put_pretty_entity(context, term);
	    } else
#endif
		PUTS(replace_buf);
	    FoundEntity = TRUE;
	    return;
	}
	/*
	 * If we're displaying UTF-8, try that now.  - FM
	 */
#ifndef USE_PRETTYSRC
	if (context->T.output_utf8 && PUTUTF8(code)) {
	    FoundEntity = TRUE;
	    return;
	}
#else
	/*
	 * In pretty-source view the conversion is run against a sink that
	 * discards its output, so only its result is used.
	 */
	if (context->T.output_utf8 && (psrc_view
				       ? (UCPutUtf8_charstring((HTStream *) context->target,
							       (putc_func_t *) (fake_put_character),
							       code))
				       : PUTUTF8(code))) {

	    if (psrc_view) {
		put_pretty_entity(context, term);
	    }

	    FoundEntity = TRUE;
	    return;
	}
#endif
	/*
	 * If it's safe ASCII, use it.  - FM
	 */
	if (code >= 32 && code < 127) {
#ifdef USE_PRETTYSRC
	    if (psrc_view) {
		put_pretty_entity(context, term);
	    } else
#endif

		PUTC(FROMASCII((char) code));
	    FoundEntity = TRUE;
	    return;
	}
/* =============== work in ASCII above here ===============  S/390 -- gil -- 0682 */
	/*
	 * Ignore zwnj (8204), zwj (8205), lrm (8206) and rlm (8207), if we
	 * get to here.  Note that zwnj may have been handled as <WBR> by the
	 * calling function.  - FM
	 */
	if (!strcmp(s, "zwnj") ||
	    !strcmp(s, "zwj") ||
	    !strcmp(s, "lrm") ||
	    !strcmp(s, "rlm")) {
	    CTRACE((tfp, "handle_entity: Ignoring '%s'.\n", s));
#ifdef USE_PRETTYSRC
	    if (psrc_view) {
		put_pretty_entity(context, term);
	    }
#endif
	    FoundEntity = TRUE;
	    return;
	}
    }

    /*
     * If entity string not found, display as text.
     */
#ifdef USE_PRETTYSRC
    if (psrc_view)
	PSRCSTART(badseq);
#endif
    CTRACE((tfp, "SGML: Unknown entity '%s' %" PRI_UCode_t " %ld\n", s, code, uck));	/* S/390 -- gil -- 0695 */
    PUTC('&');
    PUTS(s);
    if (term != '\0')
	PUTC(term);
#ifdef USE_PRETTYSRC
    if (psrc_view)
	PSRCSTOP(badseq);
#endif
}
809 
810 /*	Handle comment
811  *	--------------
812  */
/*
 * Act on a completed comment held in context->string.  A comment starting
 * with "!--#" is passed to LYDoCSI(), provided no CSI result is pending
 * (context->csi is NULL) and LYCheckForCSI() returns TRUE for the document;
 * every other comment goes to LYCommentHacks().
 */
static void handle_comment(HTStream *context)
{
    const char *text = context->string->data;
    int is_csi = 0;

    CTRACE((tfp, "SGML Comment:\n<%s>\n", text));

    if (context->csi == NULL && StrNCmp(text, "!--#", 4) == 0) {
	/* only consult the anchor once the cheap tests have passed */
	if (LYCheckForCSI(context->node_anchor, &context->url) == TRUE)
	    is_csi = 1;
    }

    if (is_csi)
	LYDoCSI(context->url, text, &context->csi);
    else
	LYCommentHacks(context->node_anchor, context->string->data);
}
829 
830 /*	Handle identifier
831  *	-----------------
832  */
/*
 * Called for a completed identifier held in context->string.  The text is
 * only written to the trace log.
 */
static void handle_identifier(HTStream *context)
{
    const char *text = context->string->data;

    CTRACE((tfp, "SGML Identifier:\n<%s>\n", text));
}
841 
842 /*	Handle doctype
843  *	--------------
844  */
/*
 * Called for a completed DOCTYPE declaration held in context->string.
 * A declaration containing "DTD XHTML " turns on context->extended_html.
 */
static void handle_doctype(HTStream *context)
{
    const char *text = context->string->data;

    CTRACE((tfp, "SGML Doctype:\n<%s>\n", text));

    if (strstr(text, "DTD XHTML ") == NULL)
	return;

    CTRACE((tfp, "...processing extended HTML\n"));
    context->extended_html = TRUE;
}
857 
858 /*	Handle marked
859  *	-------------
860  */
/*
 * Act on a completed marked section held in context->string.
 *
 * "![INCLUDE[" sections have their text appended to context->include;
 * "![CDATA[" sections are handed to the target unchanged via put_block.
 * Any other marked section is only traced.
 *
 * In both handled cases the last three bytes of the chunk are not treated
 * as content (presumably the closing "]]" and the terminating NUL --
 * confirm against the caller).
 */
static void handle_marked(HTStream *context)
{
    HTChunk *chunk = context->string;

    CTRACE((tfp, "SGML Marked Section:\n<%s>\n", chunk->data));

    if (!StrNCmp(chunk->data, "![INCLUDE[", 10)) {
	chunk->data[chunk->size - 3] = '\0';
	StrAllocCat(context->include, chunk->data + 10);
	/* @@@ This needs to take charset into account! @@@
	   the wrong assumptions will be made about the data's
	   charset once it is in include - kw */

    } else if (!StrNCmp(chunk->data, "![CDATA[", 8)) {
	/*
	 * 8 prefix bytes plus 3 trailing bytes make 11.  A shorter chunk
	 * has no content and would yield a negative block length, so it is
	 * skipped.
	 */
	if (chunk->size >= 11) {
	    (*context->actions->put_block) (context->target,
					    chunk->data + 8,
					    chunk->size - 11);
	}
    }
}
882 
883 /*	Handle processing instruction
884  *	-----------------------------
885  */
/*
 * Act on a completed processing instruction held in context->string.
 *
 * Only the XML declaration ("?xml ...") is acted upon: it turns on
 * context->strict_xml and, unless UTF-8 decoding is already active, enables
 * it when the declaration names the encoding "utf-8".  The encoding name is
 * matched without regard to case and may be quoted with either '"' or '\'',
 * as XML permits.
 */
static void handle_processing_instruction(HTStream *context)
{
    const char *s = context->string->data;

    CTRACE((tfp, "SGML Processing instruction:\n<%s>\n", s));

    if (!StrNCmp(s, "?xml ", 5)) {
	context->strict_xml = TRUE;
	/*
	 * Switch to UTF-8 if the encoding is explicitly "utf-8".
	 */
	if (!context->T.decode_utf8) {
	    const char *t = strstr(s, "encoding=");

	    if (t != NULL) {
		static const char lower[] = "utf-8";
		static const char upper[] = "UTF-8";
		int i;

		t += 9;
		if (*t == '"' || *t == '\'')
		    ++t;
		/*
		 * Compare against both spellings; a NUL in the input matches
		 * neither, so the scan cannot run past the end of the string.
		 */
		for (i = 0; i < 5; i++) {
		    if (t[i] != lower[i] && t[i] != upper[i])
			break;
		}
		if (i == 5) {
		    CTRACE((tfp, "...Use UTF-8 for XML\n"));
		    context->T.decode_utf8 = TRUE;
		}
	    }
	}
    }
}
917 
918 /*	Handle sgmlent
919  *	--------------
920  */
/*
 * Called for a completed entity declaration held in context->string.  The
 * text is only written to the trace log.
 */
static void handle_sgmlent(HTStream *context)
{
    const char *text = context->string->data;

    CTRACE((tfp, "SGML Entity Declaration:\n<%s>\n", text));
}
929 
930 /*	Handle sgmlent
931  *	--------------
932  */
handle_sgmlele(HTStream * context)933 static void handle_sgmlele(HTStream *context)
934 {
935     const char *s = context->string->data;
936 
937     CTRACE((tfp, "SGML Element Declaration:\n<%s>\n", s));
938 
939     return;
940 }
941 
942 /*	Handle sgmlatt
943  *	--------------
944  */
/*
 * Called for a completed attribute declaration held in context->string.
 * The text is only written to the trace log.
 */
static void handle_sgmlatt(HTStream *context)
{
    const char *text = context->string->data;

    CTRACE((tfp, "SGML Attribute Declaration:\n<%s>\n", text));
}
953 
954 /*
955  * Convenience macros - tags (elements) are identified sometimes by an int or
956  * enum value ('TAGNUM'), sometimes by a pointer to HTTag ('TAGP').  - kw
957  */
958 #define TAGNUM_OF_TAGP(t) (HTMLElement) (t - context->dtd->tags)
959 #define TAGP_OF_TAGNUM(e) (context->dtd->tags + e)
960 
961 /*
962  * The following implement special knowledge about OBJECT.  As long as
963  * HTML_OBJECT is the only tag for which an alternative variant exist, they can
964  * be simple macros.  - kw
965  */
966 /* does 'TAGNUM' e have an alternative (variant) parsing mode? */
967 #define HAS_ALT_TAGNUM(e) (e == HTML_OBJECT)
968 
969 /* return 'TAGNUM' of the alternative mode for 'TAGNUM' e, if any. */
970 #define ALT_TAGNUM(e) ((e == HTML_OBJECT) ? HTML_ALT_OBJECT : e)
971 
972 /* return 'TAGNUM' of the normal mode for 'TAGNUM' e which may be alt. */
973 #define NORMAL_TAGNUM(e) (((int)(e) >= HTML_ELEMENTS) ? HTML_OBJECT : (HTMLElement)e)
974 
975 /* More convenience stuff. - kw */
976 #define ALT_TAGP_OF_TAGNUM(e) TAGP_OF_TAGNUM(ALT_TAGNUM(e))
977 #define NORMAL_TAGP_OF_TAGNUM(e) TAGP_OF_TAGNUM(NORMAL_TAGNUM(e))
978 
979 #define ALT_TAGP(t) ALT_TAGP_OF_TAGNUM(TAGNUM_OF_TAGP(t))
980 #define NORMAL_TAGP(t) NORMAL_TAGP_OF_TAGNUM(TAGNUM_OF_TAGP(t))
981 
element_valid_within(HTTag * new_tag,HTTag * stacked_tag,int direct)982 static BOOL element_valid_within(HTTag * new_tag, HTTag * stacked_tag, int direct)
983 {
984     BOOL result = YES;
985     TagClass usecontains, usecontained;
986 
987     if (stacked_tag && new_tag) {
988 	usecontains = (direct ? stacked_tag->contains : stacked_tag->icontains);
989 	usecontained = (direct ? new_tag->contained : new_tag->icontained);
990 	if (new_tag == stacked_tag) {
991 	    result = (BOOL) ((Tgc_same & usecontains) &&
992 			     (Tgc_same & usecontained));
993 	} else {
994 	    result = (BOOL) ((new_tag->tagclass & usecontains) &&
995 			     (stacked_tag->tagclass & usecontained));
996 	}
997     }
998     return result;
999 }
1000 
/*
 * Result of can_close(): whether an open (stacked) element may be closed on
 * behalf of another tag.
 */
typedef enum {
    close_NO = 0,		/* the stacked element must stay open */
    close_error = 1,		/* may be closed; traced as "***forced by" */
    close_valid = 2		/* may be closed; traced as "supplied," */
} canclose_t;
1006 
can_close(HTTag * new_tag,HTTag * stacked_tag)1007 static canclose_t can_close(HTTag * new_tag, HTTag * stacked_tag)
1008 {
1009     canclose_t result;
1010 
1011     if (!stacked_tag) {
1012 	result = close_NO;
1013     } else if (stacked_tag->flags & Tgf_endO) {
1014 	result = close_valid;
1015     } else if (new_tag == stacked_tag) {
1016 	result = ((Tgc_same & new_tag->canclose)
1017 		  ? close_error
1018 		  : close_NO);
1019     } else {
1020 	result = ((stacked_tag->tagclass & new_tag->canclose)
1021 		  ? close_error
1022 		  : close_NO);
1023     }
1024     return result;
1025 }
1026 
/*
 * Close the element on top of the element stack, if there is one.
 *
 * The target's end_element action is invoked for the normal-mode tag number
 * of that element (not when viewing prettified source), the element is
 * popped and returned to the pool, and no_lynx_specialcodes is recomputed
 * from the new top of the stack.  Closing a SELECT also clears inSELECT.
 */
static void do_close_stacked(HTStream *context)
{
    HTElement *top = context->element_stack;
    HTMLElement tagnum;

    if (top == NULL)
	return;			/* nothing is open */

    if (context->inSELECT && strcasecomp(top->tag->name, "SELECT") == 0)
	context->inSELECT = FALSE;

    tagnum = NORMAL_TAGNUM(TAGNUM_OF_TAGP(top->tag));
#ifdef USE_PRETTYSRC
    if (!psrc_view)		/* the target is not called when viewing psrc */
#endif
    {
	(*context->actions->end_element) (context->target,
					  (int) tagnum,
					  &context->include);
    }

    /* pop the element and give it back to the pool */
    context->element_stack = top->next;
    pool_free(top);

    context->no_lynx_specialcodes =
	(BOOL) (context->element_stack
		? (context->element_stack->tag->flags & Tgf_nolyspcl)
		: NO);
}
1051 
/*
 * Look for 'old_tag', or its alternative variant, on the element stack.
 *
 * Returns the 1-based position of the first match counted from the top of
 * the stack, or 0 if neither is on the stack.
 */
static int is_on_stack(HTStream *context, HTTag * old_tag)
{
    HTElement *entry;
    int depth = 0;

    for (entry = context->element_stack; entry != NULL; entry = entry->next) {
	++depth;
	if (entry->tag == old_tag || entry->tag == ALT_TAGP(old_tag))
	    return depth;
    }
    return 0;
}
1064 
1065 /*	End element
1066  *	-----------
1067  */
end_element(HTStream * context,HTTag * old_tag)1068 static void end_element(HTStream *context, HTTag * old_tag)
1069 {
1070     BOOL extra_action_taken = NO;
1071     canclose_t canclose_check = close_valid;
1072     int stackpos = is_on_stack(context, old_tag);
1073 
1074     if (!Old_DTD) {
1075 	while (canclose_check != close_NO &&
1076 	       context->element_stack &&
1077 	       (stackpos > 1 || (!extra_action_taken && stackpos == 0))) {
1078 	    if (stackpos == 0 && (old_tag->flags & Tgf_startO) &&
1079 		element_valid_within(old_tag, context->element_stack->tag, YES)) {
1080 		CTRACE((tfp, "SGML: </%s> ignored\n", old_tag->name));
1081 		return;
1082 	    }
1083 	    canclose_check = can_close(old_tag, context->element_stack->tag);
1084 	    if (canclose_check != close_NO) {
1085 		CTRACE((tfp, "SGML: End </%s> \t<- %s end </%s>\n",
1086 			context->element_stack->tag->name,
1087 			((canclose_check == close_valid)
1088 			 ? "supplied,"
1089 			 : "***forced by"),
1090 			old_tag->name));
1091 		do_close_stacked(context);
1092 		extra_action_taken = YES;
1093 		stackpos = is_on_stack(context, old_tag);
1094 	    }
1095 	}
1096 
1097 	if (stackpos == 0 && old_tag->contents != SGML_EMPTY) {
1098 	    CTRACE((tfp, "SGML: Still open %s, ***no open %s for </%s>\n",
1099 		    context->element_stack ?
1100 		    context->element_stack->tag->name : "none",
1101 		    old_tag->name,
1102 		    old_tag->name));
1103 	    return;
1104 	}
1105 	if (stackpos > 1) {
1106 	    CTRACE((tfp,
1107 		    "SGML: Nesting <%s>...<%s> \t<- ***invalid end </%s>\n",
1108 		    old_tag->name,
1109 		    context->element_stack ?
1110 		    context->element_stack->tag->name : "none",
1111 		    old_tag->name));
1112 	    return;
1113 	}
1114     }
1115     /* Now let the non-extended code deal with the rest. - kw */
1116 
1117     /*
1118      * If we are in a SELECT block, ignore anything but a SELECT end tag.  - FM
1119      */
1120     if (context->inSELECT) {
1121 	if (!strcasecomp(old_tag->name, "SELECT")) {
1122 	    /*
1123 	     * Turn off the inSELECT flag and fall through.  - FM
1124 	     */
1125 	    context->inSELECT = FALSE;
1126 	} else {
1127 	    /*
1128 	     * Ignore the end tag.  - FM
1129 	     */
1130 	    CTRACE((tfp, "SGML: ***Ignoring end tag </%s> in SELECT block.\n",
1131 		    old_tag->name));
1132 	    return;
1133 	}
1134     }
1135     /*
1136      * Handle the end tag.  - FM
1137      */
1138     CTRACE((tfp, "SGML: End </%s>\n", old_tag->name));
1139     if (old_tag->contents == SGML_EMPTY) {
1140 	CTRACE((tfp, "SGML: ***Illegal end tag </%s> found.\n",
1141 		old_tag->name));
1142 	return;
1143     }
1144 #ifdef WIND_DOWN_STACK
1145     while (context->element_stack)	/* Loop is error path only */
1146 #else
1147     if (context->element_stack)	/* Substitute and remove one stack element */
1148 #endif /* WIND_DOWN_STACK */
1149     {
1150 	int status = HT_OK;
1151 	HTMLElement e;
1152 	HTElement *N = context->element_stack;
1153 	HTTag *t = (N->tag != old_tag) ? NORMAL_TAGP(N->tag) : N->tag;
1154 
1155 	if (old_tag != t) {	/* Mismatch: syntax error */
1156 	    if (context->element_stack->next) {		/* This is not the last level */
1157 		CTRACE((tfp,
1158 			"SGML: Found </%s> when expecting </%s>. </%s> ***assumed.\n",
1159 			old_tag->name, t->name, t->name));
1160 	    } else {		/* last level */
1161 		CTRACE((tfp,
1162 			"SGML: Found </%s> when expecting </%s>. </%s> ***Ignored.\n",
1163 			old_tag->name, t->name, old_tag->name));
1164 		return;		/* Ignore */
1165 	    }
1166 	}
1167 
1168 	e = NORMAL_TAGNUM(TAGNUM_OF_TAGP(t));
1169 	CTRACE2(TRACE_SGML, (tfp, "tagnum(%p) = %d\n", (void *) t, (int) e));
1170 #ifdef USE_PRETTYSRC
1171 	if (!psrc_view)		/* Don't actually pass call on if viewing psrc - kw */
1172 #endif
1173 	    status = (*context->actions->end_element) (context->target,
1174 						       (int) e,
1175 						       &context->include);
1176 	if (status == HT_PARSER_REOPEN_ELT) {
1177 	    CTRACE((tfp, "SGML: Restart <%s>\n", t->name));
1178 	    (*context->actions->start_element) (context->target,
1179 						(int) e,
1180 						NULL,
1181 						NULL,
1182 						context->current_tag_charset,
1183 						&context->include);
1184 	} else if (status == HT_PARSER_OTHER_CONTENT) {
1185 	    CTRACE((tfp, "SGML: Continue with other content model for <%s>\n", t->name));
1186 	    context->element_stack->tag = ALT_TAGP_OF_TAGNUM(e);
1187 	} else {
1188 	    context->element_stack = N->next;	/* Remove from stack */
1189 	    pool_free(N);
1190 	}
1191 	context->no_lynx_specialcodes =
1192 	    (BOOL) (context->element_stack
1193 		    ? (context->element_stack->tag->flags & Tgf_nolyspcl)
1194 		    : NO);
1195 #ifdef WIND_DOWN_STACK
1196 	if (old_tag == t)
1197 	    return;		/* Correct sequence */
1198 #else
1199 	return;
1200 #endif /* WIND_DOWN_STACK */
1201 
1202 	/* Syntax error path only */
1203 
1204     }
1205     CTRACE((tfp, "SGML: Extra end tag </%s> found and ignored.\n",
1206 	    old_tag->name));
1207 }
1208 
1209 /*	Start a element
1210 */
start_element(HTStream * context)1211 static void start_element(HTStream *context)
1212 {
1213     int status;
1214     HTTag *new_tag = context->current_tag;
1215     HTMLElement e = TAGNUM_OF_TAGP(new_tag);
1216     BOOL ok = FALSE;
1217 
1218     BOOL valid = YES;
1219     BOOL direct_container = YES;
1220     BOOL extra_action_taken = NO;
1221     canclose_t canclose_check = close_valid;
1222 
1223     if (!Old_DTD) {
1224 	while (context->element_stack &&
1225 	       (canclose_check == close_valid ||
1226 		(canclose_check == close_error &&
1227 		 new_tag == context->element_stack->tag)) &&
1228 	       !(valid = element_valid_within(new_tag,
1229 					      context->element_stack->tag,
1230 					      direct_container))) {
1231 	    canclose_check = can_close(new_tag, context->element_stack->tag);
1232 	    if (canclose_check != close_NO) {
1233 		CTRACE((tfp, "SGML: End </%s> \t<- %s start <%s>\n",
1234 			context->element_stack->tag->name,
1235 			((canclose_check == close_valid)
1236 			 ? "supplied,"
1237 			 : "***forced by"),
1238 			new_tag->name));
1239 		do_close_stacked(context);
1240 		extra_action_taken = YES;
1241 		if (canclose_check == close_error)
1242 		    direct_container = NO;
1243 	    } else {
1244 		CTRACE((tfp,
1245 			"SGML: Still open %s \t<- ***invalid start <%s>\n",
1246 			context->element_stack->tag->name,
1247 			new_tag->name));
1248 	    }
1249 	}
1250 	if (context->element_stack && !valid &&
1251 	    (context->element_stack->tag->flags & Tgf_strict) &&
1252 	    !(valid = element_valid_within(new_tag,
1253 					   context->element_stack->tag,
1254 					   direct_container))) {
1255 	    CTRACE((tfp, "SGML: Still open %s \t<- ***ignoring start <%s>\n",
1256 		    context->element_stack->tag->name,
1257 		    new_tag->name));
1258 	    return;
1259 	}
1260 
1261 	if (context->element_stack &&
1262 	    !extra_action_taken &&
1263 	    (canclose_check == close_NO) &&
1264 	    !valid && (new_tag->flags & Tgf_mafse)) {
1265 	    BOOL has_attributes = NO;
1266 	    int i = 0;
1267 
1268 	    for (; i < new_tag->number_of_attributes && !has_attributes; i++)
1269 		has_attributes = context->present[i];
1270 	    if (!has_attributes) {
1271 		CTRACE((tfp,
1272 			"SGML: Still open %s, ***converting invalid <%s> to </%s>\n",
1273 			context->element_stack->tag->name,
1274 			new_tag->name,
1275 			new_tag->name));
1276 		end_element(context, new_tag);
1277 		return;
1278 	    }
1279 	}
1280 
1281 	if (context->element_stack &&
1282 	    (canclose_check == close_error) &&
1283 	    !element_valid_within(new_tag,
1284 				  context->element_stack->tag,
1285 				  direct_container)) {
1286 	    CTRACE((tfp, "SGML: Still open %s \t<- ***invalid start <%s>\n",
1287 		    context->element_stack->tag->name,
1288 		    new_tag->name));
1289 	}
1290     }
1291     /* Fall through to the non-extended code - kw */
1292 
1293     /*
1294      * If we are not in a SELECT block, check if this is a SELECT start tag.
1295      * Otherwise (i.e., we are in a SELECT block) accept only OPTION as valid,
1296      * terminate the SELECT block if it is any other form-related element, and
1297      * otherwise ignore it.  - FM
1298      */
1299     if (!context->inSELECT) {
1300 	/*
1301 	 * We are not in a SELECT block, so check if this starts one.  - FM
1302 	 * (frequent case!)
1303 	 */
1304 	/* my_casecomp() - optimized by the first character */
1305 	if (!my_casecomp(new_tag->name, "SELECT")) {
1306 	    /*
1307 	     * Set the inSELECT flag and fall through.  - FM
1308 	     */
1309 	    context->inSELECT = TRUE;
1310 	}
1311     } else {
1312 	/*
1313 	 * We are in a SELECT block.  - FM
1314 	 */
1315 	if (strcasecomp(new_tag->name, "OPTION")) {
1316 	    /*
1317 	     * Ugh, it is not an OPTION.  - FM
1318 	     */
1319 	    switch (e) {
1320 	    case HTML_INPUT:
1321 	    case HTML_TEXTAREA:
1322 	    case HTML_SELECT:
1323 	    case HTML_BUTTON:
1324 	    case HTML_FIELDSET:
1325 	    case HTML_LABEL:
1326 	    case HTML_LEGEND:
1327 	    case HTML_FORM:
1328 		ok = TRUE;
1329 		break;
1330 	    default:
1331 		break;
1332 	    }
1333 	    if (ok) {
1334 		/*
1335 		 * It is another form-related start tag, so terminate the
1336 		 * current SELECT block and fall through.  - FM
1337 		 */
1338 		CTRACE((tfp,
1339 			"SGML: ***Faking SELECT end tag before <%s> start tag.\n",
1340 			new_tag->name));
1341 		end_element(context, SGMLFindTag(context->dtd, "SELECT"));
1342 	    } else {
1343 		/*
1344 		 * Ignore the start tag.  - FM
1345 		 */
1346 		CTRACE((tfp,
1347 			"SGML: ***Ignoring start tag <%s> in SELECT block.\n",
1348 			new_tag->name));
1349 		return;
1350 	    }
1351 	}
1352     }
1353     /*
1354      * Handle the start tag.  - FM
1355      */
1356     CTRACE((tfp, "SGML: Start <%s>\n", new_tag->name));
1357     status = (*context->actions->start_element) (context->target,
1358 						 (int) TAGNUM_OF_TAGP(new_tag),
1359 						 context->present,
1360 						 (STRING2PTR) context->value,	/* coerce type for think c */
1361 						 context->current_tag_charset,
1362 						 &context->include);
1363     if (status == HT_PARSER_OTHER_CONTENT)
1364 	new_tag = ALT_TAGP(new_tag);	/* this is only returned for OBJECT */
1365     if (new_tag->contents != SGML_EMPTY) {	/* i.e., tag not empty */
1366 	HTElement *N = pool_alloc();
1367 
1368 	if (N == NULL)
1369 	    outofmem(__FILE__, "start_element");
1370 
1371 	assert(N != NULL);
1372 
1373 	N->next = context->element_stack;
1374 	N->tag = new_tag;
1375 	context->element_stack = N;
1376 	context->no_lynx_specialcodes = (BOOLEAN) (new_tag->flags & Tgf_nolyspcl);
1377 
1378     } else if (e == HTML_META) {
1379 	/*
1380 	 * Check for result of META tag.  - KW & FM
1381 	 */
1382 	change_chartrans_handling(context);
1383     }
1384 }
1385 
1386 /*		Find Tag in DTD tag list
1387  *		------------------------
1388  *
1389  * On entry,
1390  *	dtd	points to dtd structure including valid tag list
1391  *	string	points to name of tag in question
1392  *
1393  * On exit,
1394  *	returns:
1395  *		NULL		tag not found
1396  *		else		address of tag structure in dtd
1397  */
SGMLFindTag(const SGML_dtd * dtd,const char * s)1398 HTTag *SGMLFindTag(const SGML_dtd * dtd,
1399 		   const char *s)
1400 {
1401     int high, low, i, diff;
1402     static HTTag *last[64] =
1403     {NULL};			/*optimize using the previous results */
1404     HTTag **res = last + (UCH(*s) % 64);	/*pointer arithmetic */
1405 
1406     if (*res) {
1407 	if ((*res)->name == NULL)
1408 	    return NULL;
1409 	if (!strcasecomp((*res)->name, s))
1410 	    return *res;
1411     }
1412 
1413     for (low = 0, high = dtd->number_of_tags;
1414 	 high > low;
1415 	 diff < 0 ? (low = i + 1) : (high = i)) {	/* Binary search */
1416 	i = (low + (high - low) / 2);
1417 	/* my_casecomp() - optimized by the first character, NOT_ASCII ok */
1418 	diff = my_casecomp(dtd->tags[i].name, s);	/* Case insensitive */
1419 	if (diff == 0) {	/* success: found it */
1420 	    *res = &dtd->tags[i];
1421 	    return *res;
1422 	}
1423     }
1424     if (IsNmStart(*s)) {
1425 	/*
1426 	 * Unrecognized, but may be valid.  - KW
1427 	 */
1428 	return &HTTag_unrecognized;
1429     }
1430     return NULL;
1431 }
1432 
1433 /*________________________________________________________________________
1434  *			Public Methods
1435  */
1436 
1437 /*	Could check that we are back to bottom of stack! @@  */
1438 /*	Do check! - FM					     */
1439 /*							     */
SGML_free(HTStream * context)1440 static void SGML_free(HTStream *context)
1441 {
1442     int i;
1443     HTElement *cur;
1444     HTTag *t;
1445 
1446     /*
1447      * Free the buffers.  - FM
1448      */
1449     FREE(context->recover);
1450     FREE(context->url);
1451     FREE(context->csi);
1452     FREE(context->include);
1453     FREE(context->active_include);
1454 
1455     /*
1456      * Wind down stack if any elements are open.  - FM
1457      */
1458     while (context->element_stack) {
1459 	cur = context->element_stack;
1460 	t = cur->tag;
1461 	context->element_stack = cur->next;	/* Remove from stack */
1462 	pool_free(cur);
1463 #ifdef USE_PRETTYSRC
1464 	if (!psrc_view)		/* Don't actually call on target if viewing psrc - kw */
1465 #endif
1466 	    (*context->actions->end_element)
1467 		(context->target,
1468 		 (int) NORMAL_TAGNUM(TAGNUM_OF_TAGP(t)),
1469 		 &context->include);
1470 	FREE(context->include);
1471     }
1472 
1473     /*
1474      * Finish off the target.  - FM
1475      */
1476     (*context->actions->_free) (context->target);
1477 
1478     /*
1479      * Free the strings and context structure.  - FM
1480      */
1481     HTChunkFree(context->string);
1482     for (i = 0; i < MAX_ATTRIBUTES; i++)
1483 	FREE_extra(context->value[i]);
1484     FREE(context);
1485 
1486 #ifdef USE_PRETTYSRC
1487     sgml_in_psrc_was_initialized = FALSE;
1488 #endif
1489 }
1490 
/*
 * Abandon parsing and dispose of the parser object.
 *
 * The target's _abort action is called with 'e' first.  Then the buffers,
 * any elements left on the stack (without calling the target for them), the
 * strings and the context structure itself are freed.
 */
static void SGML_abort(HTStream *context, HTError e)
{
    int n;

    /*
     * Abort the target.
     */
    (*context->actions->_abort) (context->target, e);

    /*
     * Free the buffers.
     */
    FREE(context->recover);
    FREE(context->include);
    FREE(context->active_include);
    FREE(context->url);
    FREE(context->csi);

    /*
     * Free stack memory if any elements were left open.
     */
    while (context->element_stack != NULL) {
	HTElement *top = context->element_stack;

	context->element_stack = top->next;	/* Remove from stack */
	pool_free(top);
    }

    /*
     * Free the strings and context structure.
     */
    HTChunkFree(context->string);
    for (n = 0; n < MAX_ATTRIBUTES; ++n)
	FREE_extra(context->value[n]);
    FREE(context);

#ifdef USE_PRETTYSRC
    sgml_in_psrc_was_initialized = FALSE;
#endif
}
1531 
1532 /*	Read and write user callback handle
1533  *	-----------------------------------
1534  *
1535  *   The callbacks from the SGML parser have an SGML context parameter.
1536  *   These calls allow the caller to associate his own context with a
1537  *   particular SGML context.
1538  */
1539 
1540 #ifdef CALLERDATA
/*
 * Return the caller's handle stored in this SGML context.
 */
void *SGML_callerData(HTStream *context)
{
    void *handle = context->callerData;

    return handle;
}
1545 
/*
 * Store the caller's handle 'data' in this SGML context, replacing any
 * handle stored before.
 */
void SGML_setCallerData(HTStream *context, void *data)
{
    context->callerData = data;
}
1550 #endif /* CALLERDATA */
1551 
1552 #ifdef USE_PRETTYSRC
/*
 * Change the case of the tag name held in 'string' according to
 * tagname_transform: 0 converts it to lower case, 1 leaves it alone, and any
 * other value converts it to upper case.  Nothing is done when
 * context->strict_xml is set.
 */
static void transform_tag(HTStream *context, HTChunk *string)
{
    if (context->strict_xml)
	return;

    switch (tagname_transform) {
    case 0:
	LYLowerCase(string->data);
	break;
    case 1:
	break;
    default:
	LYUpperCase(string->data);
	break;
    }
}
1564 #endif /* USE_PRETTYSRC */
1565 
ignore_when_empty(HTTag * tag)1566 static BOOL ignore_when_empty(HTTag * tag)
1567 {
1568     BOOL result = FALSE;
1569 
1570     if (!LYPreparsedSource
1571 	&& LYxhtml_parsing
1572 	&& tag->name != 0
1573 	&& !(tag->flags & Tgf_mafse)
1574 	&& tag->contents != SGML_EMPTY
1575 	&& tag->tagclass != Tgc_Plike
1576 	&& (tag->tagclass == Tgc_SELECTlike
1577 	    || (tag->contains && tag->icontains))) {
1578 	result = TRUE;
1579     }
1580     CTRACE((tfp, "SGML Do%s ignore_when_empty:%s\n",
1581 	    result ? "" : " not",
1582 	    NonNull(tag->name)));
1583     return result;
1584 }
1585 
/*
 * Throw away the tag which is currently being collected.
 *
 * context->current_tag is pointed at a zeroed placeholder tag and the
 * collected string is emptied.  end_element() is not called here, since
 * start_element() was not called for the discarded tag.
 */
static void discard_empty(HTStream *context)
{
    static HTTag blank_tag;

    CTRACE((tfp, "SGML discarding empty %s\n",
	    NonNull(context->current_tag->name)));
    CTRACE_FLUSH(tfp);

    context->string->size = 0;

    memset(&blank_tag, 0, sizeof(blank_tag));
    context->current_tag = &blank_tag;
}
1600 
1601 #ifdef USE_PRETTYSRC
end_if_prettysrc(HTStream * context,HTChunk * string,int end_ch)1602 static BOOL end_if_prettysrc(HTStream *context, HTChunk *string, int end_ch)
1603 {
1604     BOOL result = psrc_view;
1605 
1606     if (psrc_view) {
1607 	if (attr_is_name) {
1608 	    HTStartAnchor(context->target, string->data, NULL);
1609 	    (*context->actions->end_element) (context->target,
1610 					      HTML_A,
1611 					      &context->include);
1612 	} else if (attr_is_href) {
1613 	    PSRCSTART(href);
1614 	    HTStartAnchor(context->target, NULL, string->data);
1615 	}
1616 	PUTS_TR(string->data);
1617 	if (attr_is_href) {
1618 	    (*context->actions->end_element) (context->target,
1619 					      HTML_A,
1620 					      &context->include);
1621 	    PSRCSTOP(href);
1622 	}
1623 	if (end_ch)
1624 	    PUTC(end_ch);
1625 	PSRCSTOP(attrval);
1626     }
1627     return result;
1628 }
1629 #endif
1630 
SGML_character(HTStream * context,int c_in)1631 static void SGML_character(HTStream *context, int c_in)
1632 {
1633     const SGML_dtd *dtd = context->dtd;
1634     HTChunk *string = context->string;
1635     const char *EntityName;
1636     HTTag *testtag = NULL;
1637     BOOLEAN chk;		/* Helps (?) walk through all the else ifs... */
1638     UCode_t clong, uck = 0;	/* Enough bits for UCS4 ... */
1639     int testlast;
1640 
1641     unsigned char c;
1642     unsigned char saved_char_in = '\0';
1643 
1644     ++sgml_offset;
1645 
1646     /*
1647      * Now some fun with the preprocessor.  Use copies for c and unsign_c ==
1648      * clong, so that we can revert back to the unchanged c_in.  - KW
1649      */
1650 #define unsign_c clong
1651 
1652     c = UCH(c_in);
1653     clong = UCH(c);		/* a.k.a. unsign_c */
1654 
1655     if (context->T.decode_utf8) {
1656 	/*
1657 	 * Combine UTF-8 into Unicode.  Incomplete characters silently ignored.
1658 	 * From Linux kernel's console.c.  - KW
1659 	 */
1660 	if (TOASCII(UCH(c)) > 127) {	/* S/390 -- gil -- 0710 */
1661 	    /*
1662 	     * We have an octet from a multibyte character.  - FM
1663 	     */
1664 	    if (context->utf_count > 0 && (TOASCII(c) & 0xc0) == 0x80) {
1665 		context->utf_char = (context->utf_char << 6) | (TOASCII(c) & 0x3f);
1666 		context->utf_count--;
1667 		*(context->utf_buf_p) = (char) c;
1668 		(context->utf_buf_p)++;
1669 		if (context->utf_count == 0) {
1670 		    /*
1671 		     * We have all of the bytes, so terminate the buffer and
1672 		     * set 'clong' to the UCode_t value.  - FM
1673 		     */
1674 		    *(context->utf_buf_p) = '\0';
1675 		    clong = context->utf_char;
1676 		    if (clong < 256) {
1677 			c = UCH(clong & 0xff);
1678 		    }
1679 		    /* lynx does not use left-to-right */
1680 		    if (clong == 0x200e)
1681 			return;
1682 		    goto top1;
1683 		} else {
1684 		    /*
1685 		     * Wait for more.  - KW
1686 		     */
1687 		    return;
1688 		}
1689 	    } else {
1690 		/*
1691 		 * Start handling a new multibyte character.  - FM
1692 		 */
1693 		context->utf_buf_p = context->utf_buf;
1694 		*(context->utf_buf_p) = (char) c;
1695 		(context->utf_buf_p)++;
1696 		if ((c & 0xe0) == 0xc0) {
1697 		    context->utf_count = 1;
1698 		    context->utf_char = (c & 0x1f);
1699 		} else if ((c & 0xf0) == 0xe0) {
1700 		    context->utf_count = 2;
1701 		    context->utf_char = (c & 0x0f);
1702 		} else if ((c & 0xf8) == 0xf0) {
1703 		    context->utf_count = 3;
1704 		    context->utf_char = (c & 0x07);
1705 		} else if ((c & 0xfc) == 0xf8) {
1706 		    context->utf_count = 4;
1707 		    context->utf_char = (c & 0x03);
1708 		} else if ((c & 0xfe) == 0xfc) {
1709 		    context->utf_count = 5;
1710 		    context->utf_char = (c & 0x01);
1711 		} else {
1712 		    /*
1713 		     * Garbage.  - KW
1714 		     */
1715 		    context->utf_count = 0;
1716 		    context->utf_buf_p = context->utf_buf;
1717 		    *(context->utf_buf_p) = '\0';
1718 		}
1719 		/*
1720 		 * Wait for more.  - KW
1721 		 */
1722 		return;
1723 	    }
1724 	} else {
1725 	    /*
1726 	     * Got an ASCII char.  - KW
1727 	     */
1728 	    context->utf_count = 0;
1729 	    context->utf_buf_p = context->utf_buf;
1730 	    *(context->utf_buf_p) = '\0';
1731 	    /*  goto top;  */
1732 	}
1733     }
1734     /* end of context->T.decode_utf8      S/390 -- gil -- 0726 */
1735 #ifdef NOTDEFINED
1736     /*
1737      * If we have a koi8-r input and do not have koi8-r as the output, save the
1738      * raw input in saved_char_in before we potentially convert it to Unicode.
1739      * - FM
1740      */
1741     if (context->T.strip_raw_char_in)
1742 	saved_char_in = c;
1743 #endif /* NOTDEFINED */
1744 
1745     /*
1746      * If we want the raw input converted to Unicode, try that now.  - FM
1747      */
1748     if (context->T.trans_to_uni &&
1749 #ifdef EXP_JAPANESEUTF8_SUPPORT
1750 	((strcmp(LYCharSet_UC[context->inUCLYhndl].MIMEname, "euc-jp") == 0) ||
1751 	 (strcmp(LYCharSet_UC[context->inUCLYhndl].MIMEname, "shift_jis") == 0))) {
1752 	if (strcmp(LYCharSet_UC[context->inUCLYhndl].MIMEname, "shift_jis") == 0) {
1753 	    if (context->utf_count == 0) {
1754 		if (IS_SJIS_HI1((unsigned char) c) ||
1755 		    IS_SJIS_HI2((unsigned char) c)) {
1756 		    context->utf_buf[0] = (char) c;
1757 		    context->utf_count = 1;
1758 		    clong = -11;
1759 		}
1760 	    } else {
1761 		if (IS_SJIS_LO((unsigned char) c)) {
1762 		    context->utf_buf[1] = (char) c;
1763 		    clong = UCTransJPToUni(context->utf_buf, 2, context->inUCLYhndl);
1764 		}
1765 		context->utf_count = 0;
1766 	    }
1767 	} else {
1768 	    if (context->utf_count == 0) {
1769 		if (IS_EUC_HI((unsigned char) c)) {
1770 		    context->utf_buf[0] = (char) c;
1771 		    context->utf_count = 1;
1772 		    clong = -11;
1773 		}
1774 	    } else {
1775 		if (IS_EUC_LOX((unsigned char) c)) {
1776 		    context->utf_buf[1] = (char) c;
1777 		    clong = UCTransJPToUni(context->utf_buf, 2, context->inUCLYhndl);
1778 		}
1779 		context->utf_count = 0;
1780 	    }
1781 	}
1782 	goto top1;
1783     } else if (context->T.trans_to_uni &&
1784 #endif
1785 	       ((TOASCII(unsign_c) >= LYlowest_eightbit[context->inUCLYhndl]) ||	/* S/390 -- gil -- 0744 */
1786 		(unsign_c < ' ' && unsign_c != 0 &&
1787 		 context->T.trans_C0_to_uni))) {
1788 	/*
1789 	 * Convert the octet to Unicode.  - FM
1790 	 */
1791 	clong = UCTransToUni((char) c, context->inUCLYhndl);
1792 	if (clong > 0) {
1793 	    saved_char_in = c;
1794 	    if (clong < 256) {
1795 		c = FROMASCII(UCH(clong));
1796 	    }
1797 	}
1798 	goto top1;
1799     } else if (unsign_c < ' ' && unsign_c != 0 &&	/* S/390 -- gil -- 0768 */
1800 	       context->T.trans_C0_to_uni) {
1801 	/*
1802 	 * This else if may be too ugly to keep.  - KW
1803 	 */
1804 	if (context->T.trans_from_uni &&
1805 	    (((clong = UCTransToUni((char) c, context->inUCLYhndl)) >= ' ') ||
1806 	     (context->T.transp &&
1807 	      (clong = UCTransToUni((char) c, context->inUCLYhndl)) > 0))) {
1808 	    saved_char_in = c;
1809 	    if (clong < 256) {
1810 		c = FROMASCII(UCH(clong));
1811 	    }
1812 	    goto top1;
1813 	} else {
1814 	    uck = -1;
1815 	    if (context->T.transp) {
1816 		uck = UCTransCharStr(replace_buf, 60, (char) c,
1817 				     context->inUCLYhndl,
1818 				     context->inUCLYhndl, NO);
1819 	    }
1820 	    if (!context->T.transp || uck < 0) {
1821 		uck = UCTransCharStr(replace_buf, 60, (char) c,
1822 				     context->inUCLYhndl,
1823 				     context->outUCLYhndl, YES);
1824 	    }
1825 	    if (uck == 0) {
1826 		return;
1827 	    } else if (uck < 0) {
1828 		goto top0a;
1829 	    }
1830 	    c = UCH(replace_buf[0]);
1831 	    if (c && replace_buf[1]) {
1832 		if (context->state == S_text) {
1833 		    PUTS(replace_buf);
1834 		    return;
1835 		}
1836 		StrAllocCat(context->recover, replace_buf + 1);
1837 	    }
1838 	    goto top0a;
1839 	}			/*  Next line end of ugly stuff for C0. - KW */
1840     } else {			/* end of context->T.trans_to_uni  S/390 -- gil -- 0791 */
1841 	goto top0a;
1842     }
1843 
1844     /*
1845      * At this point we have either unsign_c a.k.a.  clong in Unicode (and c in
1846      * latin1 if clong is in the latin1 range), or unsign_c and c will have to
1847      * be passed raw.  - KW
1848      */
1849 /*
1850  *  We jump up to here from below if we have
1851  *  stuff in the recover, insert, or csi buffers
1852  *  to process.	 We zero saved_char_in, in effect
1853  *  as a flag that the octet is not that of the
1854  *  actual call to this function.  This may be OK
1855  *  for now, for the stuff this function adds to
1856  *  its recover buffer, but it might not be for
1857  *  stuff other functions added to the insert or
1858  *  csi buffer, so bear that in mind. - FM
1859  *  Stuff from the recover buffer is now handled
1860  *  as UTF-8 if we can expect that's what it is,
1861  *  and in that case we don't come back up here. - kw
1862  */
1863   top:
1864     saved_char_in = '\0';
1865 /*
1866  *  We jump to here from above when we don't have
1867  *  UTF-8 input, haven't converted to Unicode, and
1868  *  want clong set to the input octet (unsigned)
1869  *  without zeroing its saved_char_in copy (which
1870  *  is signed). - FM
1871  */
1872   top0a:
1873     *(context->utf_buf) = '\0';
1874     clong = UCH(c);
1875 /*
1876  *  We jump to here from above if we have converted
1877  *  the input, or a multibyte sequence across calls,
1878  *  to a Unicode value and loaded it into clong (to
1879  *  which unsign_c has been defined), and from below
1880  *  when we are recycling a character (e.g., because
1881  *  it terminated an entity but is not the standard
1882  *  semi-colon).  The character will already have
1883  *  been put through the Unicode conversions. - FM
1884  */
1885   top1:
1886     /*
1887      * Ignore low ISO 646 7-bit control characters if HTCJK is not set.  - FM
1888      */
1889     /*
1890      * Works for both ASCII and EBCDIC. -- gil
1891      * S/390 -- gil -- 0811
1892      */
1893     if (TOASCII(unsign_c) < 32 &&
1894 	c != '\t' && c != '\n' && c != '\r' &&
1895 	!IS_CJK_TTY)
1896 	goto after_switch;
1897 
1898     /*
1899      * Ignore 127 if we don't have HTPassHighCtrlRaw or HTCJK set.  - FM
1900      */
1901 #define PASSHICTRL (context->T.transp || \
1902 		    unsign_c >= LYlowest_eightbit[context->inUCLYhndl])
1903     if (TOASCII(c) == 127 &&	/* S/390 -- gil -- 0830 */
1904 	!(PASSHICTRL || IS_CJK_TTY))
1905 	goto after_switch;
1906 
1907     /*
1908      * Ignore 8-bit control characters 128 - 159 if neither HTPassHighCtrlRaw
1909      * nor HTCJK is set.  - FM
1910      */
1911     if (TOASCII(unsign_c) > 127 && TOASCII(unsign_c) < 160 &&	/* S/390 -- gil -- 0847 */
1912 	!(PASSHICTRL || IS_CJK_TTY)) {
1913 	/*
1914 	 * If we happen to be reading from an "ISO-8859-1" or "US-ASCII"
1915 	 * document, allow the cp-1252 codes, to accommodate the HTML5 draft
1916 	 * recommendation for replacement encoding:
1917 	 *
1918 	 * http://www.whatwg.org/specs/web-apps/current-work/multipage/infrastructure.html#character-encodings-0
1919 	 */
1920 	if (AssumeCP1252(context)) {
1921 	    clong = LYcp1252ToUnicode((UCode_t) c);
1922 	    goto top1;
1923 	}
1924 	goto after_switch;
1925     }
1926 
1927     /* Almost all CJK characters are double byte but only Japanese
1928      * JIS X0201 Kana is single byte. To prevent to fail SGML parsing
1929      * we have to take care of them here. -- TH
1930      */
1931     if ((HTCJK == JAPANESE) && (context->state == S_in_kanji) &&
1932 	!IS_JAPANESE_2BYTE(context->kanji_buf, UCH(c))
1933 #ifdef EXP_JAPANESEUTF8_SUPPORT
1934 	&& !context->T.decode_utf8
1935 #endif
1936 	) {
1937 #ifdef CONV_JISX0201KANA_JISX0208KANA
1938 	if (IS_SJIS_X0201KANA(context->kanji_buf)) {
1939 	    unsigned char sjis_hi, sjis_lo;
1940 
1941 	    JISx0201TO0208_SJIS(context->kanji_buf, &sjis_hi, &sjis_lo);
1942 	    PUTC(sjis_hi);
1943 	    PUTC(sjis_lo);
1944 	} else
1945 #endif
1946 	    PUTC(context->kanji_buf);
1947 	context->state = S_text;
1948     }
1949 
1950     /*
1951      * Handle character based on context->state.
1952      */
1953     CTRACE2(TRACE_SGML, (tfp, "SGML before %s|%.*s|%c|\n",
1954 			 state_name(context->state),
1955 			 string->size,
1956 			 NonNull(string->data),
1957 			 UCH(c)));
1958     switch (context->state) {
1959 
1960     case S_in_kanji:
1961 	/*
1962 	 * Note that if we don't have a CJK input, then this is not the second
1963 	 * byte of a CJK di-byte, and we're trashing the input.  That's why
1964 	 * 8-bit characters followed by, for example, '<' can cause the tag to
1965 	 * be treated as text, not markup.  We could try to deal with it by
1966 	 * holding each first byte and then checking byte pairs, but that
1967 	 * doesn't seem worth the overhead (see below).  - FM
1968 	 */
1969 	context->state = S_text;
1970 	PUTC(context->kanji_buf);
1971 	PUTC(c);
1972 	break;
1973 
1974     case S_tagname_slash:
1975 	/*
	 * We had something like "<name/" so far, set state to S_text but keep
1977 	 * context->slashedtag as a flag; except if we get '>' directly
1978 	 * after the "<name/", and really have a tag for that name in
1979 	 * context->slashedtag, in which case keep state as is and let code
1980 	 * below deal with it.  - kw
1981 	 */
1982 	if (!(c == '>' && context->slashedtag && TOASCII(unsign_c) < 127)) {
1983 	    context->state = S_text;
1984 	}
1985 	/* fall through in any case! */
1986     case S_text:
1987 	if (IS_CJK_TTY && ((TOASCII(c) & 0200) != 0)
1988 #ifdef EXP_JAPANESEUTF8_SUPPORT
1989 	    && !context->T.decode_utf8
1990 #endif
1991 	    ) {			/* S/390 -- gil -- 0864 */
1992 	    /*
1993 	     * Setting up for Kanji multibyte handling (based on Takuya ASADA's
1994 	     * (asada@three-a.co.jp) CJK Lynx).  Note that if the input is not
1995 	     * in fact CJK, the next byte also will be mishandled, as explained
1996 	     * above.  Toggle raw mode off in such cases, or select the "7 bit
1997 	     * approximations" display character set, which is largely
1998 	     * equivalent to having raw mode off with CJK.  - FM
1999 	     */
2000 	    context->state = S_in_kanji;
2001 	    context->kanji_buf = c;
2002 	    break;
2003 	} else if (IS_CJK_TTY && TOASCII(c) == '\033') {	/* S/390 -- gil -- 0881 */
2004 	    /*
2005 	     * Setting up for CJK escape sequence handling (based on Takuya
2006 	     * ASADA's (asada@three-a.co.jp) CJK Lynx).  - FM
2007 	     */
2008 	    context->state = S_esc;
2009 	    PUTC(c);
2010 	    break;
2011 	}
2012 
2013 	if (c == '&' || c == '<') {
2014 #ifdef USE_PRETTYSRC
2015 	    if (psrc_view) {	/*there is nothing useful in the element_stack */
2016 		testtag = context->current_tag;
2017 	    } else
2018 #endif
2019 	    {
2020 		testtag = context->element_stack ?
2021 		    context->element_stack->tag : NULL;
2022 	    }
2023 	}
2024 
2025 	if (c == '&' && TOASCII(unsign_c) < 127 &&	/* S/390 -- gil -- 0898 */
2026 	    (!testtag ||
2027 	     (testtag->contents == SGML_MIXED ||
2028 	      testtag->contents == SGML_ELEMENT ||
2029 	      testtag->contents == SGML_PCDATA ||
2030 #ifdef USE_PRETTYSRC
2031 	      testtag->contents == SGML_EMPTY ||
2032 #endif
2033 	      testtag->contents == SGML_RCDATA))) {
2034 	    /*
2035 	     * Setting up for possible entity, without the leading '&'.  - FM
2036 	     */
2037 	    string->size = 0;
2038 	    context->state = S_ero;
2039 	} else if (c == '<' && TOASCII(unsign_c) < 127) {	/* S/390 -- gil -- 0915 */
2040 	    /*
2041 	     * Setting up for possible tag.  - FM
2042 	     */
2043 	    string->size = 0;
2044 	    if (testtag && testtag->contents == SGML_PCDATA) {
2045 		context->state = S_pcdata;
2046 	    } else if (testtag && (testtag->contents == SGML_LITTERAL
2047 				   || testtag->contents == SGML_CDATA)) {
2048 		context->state = S_litteral;
2049 	    } else if (testtag && (testtag->contents == SGML_SCRIPT)) {
2050 		context->state = S_script;
2051 	    } else {
2052 		context->state = S_tag;
2053 	    }
2054 	    context->slashedtag = NULL;
2055 	} else if (context->slashedtag &&
2056 		   context->slashedtag->name &&
2057 		   (c == '/' ||
2058 		    (c == '>' && context->state == S_tagname_slash)) &&
2059 		   TOASCII(unsign_c) < 127) {
2060 	    /*
2061 	     * We got either the second slash of a pending "<NAME/blah blah/"
2062 	     * shortref construct, or the '>' of a mere "<NAME/>".  In both
2063 	     * cases generate a "</NAME>" end tag in the recover buffer for
2064 	     * reparsing unless NAME is really an empty element.  - kw
2065 	     */
2066 #ifdef USE_PRETTYSRC
2067 	    if (psrc_view) {
2068 		PSRCSTART(abracket);
2069 		PUTC(c);
2070 		PSRCSTOP(abracket);
2071 	    } else
2072 #endif
2073 		if (context->slashedtag != context->unknown_tag &&
2074 		    !ReallyEmptyTag(context->slashedtag)) {
2075 		if (context->recover == NULL) {
2076 		    StrAllocCopy(context->recover, "</");
2077 		    context->recover_index = 0;
2078 		} else {
2079 		    StrAllocCat(context->recover, "</");
2080 		}
2081 		StrAllocCat(context->recover, context->slashedtag->name);
2082 		StrAllocCat(context->recover, ">");
2083 	    }
2084 	    context->slashedtag = NULL;
2085 
2086 	} else if (context->element_stack &&
2087 		   (context->element_stack->tag->flags & Tgf_frecyc)) {
2088 	    /*
2089 	     * The element stack says we are within the contents of an element
2090 	     * that the next stage (HTML.c) may want to feed us back again (via
2091 	     * the *include string).  So try to output text in UTF-8 if
2092 	     * possible, using the same logic as for attribute values (which
2093 	     * should be in line with what context->current_tag_charset
2094 	     * indicates).  - kw
2095 	     */
2096 	    if (context->T.decode_utf8 &&
2097 		*context->utf_buf) {
2098 		PUTS(context->utf_buf);
2099 		context->utf_buf_p = context->utf_buf;
2100 		*(context->utf_buf_p) = '\0';
2101 	    } else if (!IS_CJK_TTY &&
2102 		       (context->T.output_utf8 ||
2103 			context->T.trans_from_uni)) {
2104 		if (LYIsASCII(clong)) {
2105 		    PUTC(c);
2106 		} else if (clong == 0xfffd && saved_char_in &&
2107 			   HTPassEightBitRaw &&
2108 			   saved_char_in >=
2109 			   LYlowest_eightbit[context->outUCLYhndl]) {
2110 		    PUTUTF8((UCode_t) (0xf000 | saved_char_in));
2111 		} else {
2112 		    PUTUTF8(clong);
2113 		}
2114 	    } else if (saved_char_in && context->T.use_raw_char_in) {
2115 		PUTC(saved_char_in);
2116 	    } else {
2117 		PUTC(c);
2118 	    }
2119 
2120 #define PASS8859SPECL context->T.pass_160_173_raw
2121 	    /*
2122 	     * Convert 160 (nbsp) to Lynx special character if neither
2123 	     * HTPassHighCtrlRaw nor HTCJK is set.  - FM
2124 	     */
2125 	} else if (unsign_c == CH_NBSP &&	/* S/390 -- gil -- 0932 */
2126 		   !context->no_lynx_specialcodes &&
2127 		   !(PASS8859SPECL || IS_CJK_TTY)) {
2128 	    PUTC(HT_NON_BREAK_SPACE);
2129 	    /*
2130 	     * Convert 173 (shy) to Lynx special character if neither
2131 	     * HTPassHighCtrlRaw nor HTCJK is set.  - FM
2132 	     */
2133 	} else if (unsign_c == CH_SHY &&	/* S/390 -- gil -- 0949 */
2134 		   !context->no_lynx_specialcodes &&
2135 		   !(PASS8859SPECL || IS_CJK_TTY)) {
2136 	    PUTC(LY_SOFT_HYPHEN);
2137 	    /*
2138 	     * Handle the case in which we think we have a character which
2139 	     * doesn't need further processing (e.g., a koi8-r input for a
2140 	     * koi8-r output).  - FM
2141 	     */
2142 	} else if (context->T.use_raw_char_in && saved_char_in) {
2143 	    /*
2144 	     * Only if the original character is still in saved_char_in,
2145 	     * otherwise we may be iterating from a goto top.  - KW
2146 	     */
2147 	    PUTC(saved_char_in);
2148 /******************************************************************
2149  * I.  LATIN-1 OR UCS2 TO DISPLAY CHARSET
2150  ******************************************************************/
2151 	} else if ((chk = (BOOL) (context->T.trans_from_uni &&
2152 				  TOASCII(unsign_c) >= 160)) &&		/* S/390 -- gil -- 0968 */
2153 		   (uck = UCTransUniChar(unsign_c,
2154 					 context->outUCLYhndl)) >= ' ' &&
2155 		   uck < 256) {
2156 	    CTRACE((tfp, "UCTransUniChar returned 0x%.2" PRI_UCode_t
2157 		    ":'%c'.\n",
2158 		    uck, FROMASCII((char)uck)));
2159 	    /*
2160 	     * We got one octet from the conversions, so use it.  - FM
2161 	     */
2162 	    PUTC(FROMASCII((char) uck));
2163 	} else if ((chk &&
2164 		    (uck == -4 ||
2165 		     (context->T.repl_translated_C0 &&
2166 		      uck > 0 && uck < 32))) &&
2167 	    /*
2168 	     * Not found; look for replacement string.  - KW
2169 	     */
2170 		   (uck = UCTransUniCharStr(replace_buf, 60, clong,
2171 					    context->outUCLYhndl,
2172 					    0) >= 0)) {
2173 	    /*
2174 	     * Got a replacement string.  No further tests for validity -
2175 	     * assume that whoever defined replacement strings knew what she
2176 	     * was doing.  - KW
2177 	     */
2178 	    PUTS(replace_buf);
2179 	    /*
2180 	     * If we're displaying UTF-8, try that now.  - FM
2181 	     */
2182 	} else if (context->T.output_utf8 && PUTUTF8(clong)) {
2183 	    ;			/* do nothing more */
2184 	    /*
2185 	     * If it's any other (> 160) 8-bit character, and we have not set
2186 	     * HTPassEightBitRaw nor HTCJK, nor have the "ISO Latin 1"
2187 	     * character set selected, back translate for our character set.  -
2188 	     * FM
2189 	     */
2190 #define IncludesLatin1Enc \
2191 		(context->outUCLYhndl == LATIN1 || \
2192 		 (context->outUCI && \
2193 		  (context->outUCI->enc & (UCT_CP_SUPERSETOF_LAT1))))
2194 
2195 #define PASSHI8BIT (HTPassEightBitRaw || \
2196 		    (context->T.do_8bitraw && !context->T.trans_from_uni))
2197 
2198 	} else if (unsign_c > 160 && unsign_c < 256 &&
2199 		   !(PASSHI8BIT || IS_CJK_TTY) &&
2200 		   !IncludesLatin1Enc) {
2201 #ifdef USE_PRETTYSRC
2202 	    int psrc_view_backup = 0;
2203 #endif
2204 
2205 	    string->size = 0;
2206 	    EntityName = HTMLGetEntityName((UCode_t) (unsign_c - 160));
2207 	    HTChunkPuts(string, EntityName);
2208 	    HTChunkTerminate(string);
2209 #ifdef USE_PRETTYSRC
2210 	    /* we need to disable it temporarily */
2211 	    if (psrc_view) {
2212 		psrc_view_backup = 1;
2213 		psrc_view = 0;
2214 	    }
2215 #endif
2216 	    handle_entity(context, '\0');
2217 #ifdef USE_PRETTYSRC
	    /* restore the setting that was temporarily disabled above */
2219 	    if (psrc_view_backup)
2220 		psrc_view = TRUE;
2221 #endif
2222 
2223 	    string->size = 0;
2224 	    if (!FoundEntity)
2225 		PUTC(';');
2226 	    /*
2227 	     * If we get to here and have an ASCII char, pass the character.  -
2228 	     * KW
2229 	     */
2230 	} else if (TOASCII(unsign_c) < 127 && unsign_c > 0) {	/* S/390 -- gil -- 0987 */
2231 	    PUTC(c);
2232 	    /*
2233 	     * If we get to here, and should have translated, translation has
2234 	     * failed so far.  - KW
2235 	     *
2236 	     * We should have sent UTF-8 output to the parser already, but what
2237 	     * the heck, try again.  - FM
2238 	     */
2239 	} else if (context->T.output_utf8 && *context->utf_buf) {
2240 	    PUTS(context->utf_buf);
2241 	    context->utf_buf_p = context->utf_buf;
2242 	    *(context->utf_buf_p) = '\0';
2243 #ifdef NOTDEFINED
2244 	    /*
2245 	     * Check for a strippable koi8-r 8-bit character.  - FM
2246 	     */
2247 	} else if (context->T.strip_raw_char_in && saved_char_in &&
2248 		   (saved_char_in >= 0xc0) &&
2249 		   (saved_char_in < 255)) {
2250 	    /*
2251 	     * KOI8 special:  strip high bit, gives (somewhat) readable ASCII
2252 	     * or KOI7 - it was constructed that way!  - KW
2253 	     */
2254 	    PUTC((saved_char_in & 0x7f));
2255 	    saved_char_in = '\0';
2256 #endif /* NOTDEFINED */
2257 	    /*
2258 	     * If we don't actually want the character, make it safe and output
2259 	     * that now.  - FM
2260 	     */
2261 	} else if (TOASCII(UCH(c)) <	/* S/390 -- gil -- 0997 */
2262 		   LYlowest_eightbit[context->outUCLYhndl] ||
2263 		   (context->T.trans_from_uni && !HTPassEightBitRaw)) {
2264 	    /*
2265 	     * If we get to here, pass the character.  - FM
2266 	     */
2267 	} else {
2268 	    PUTC(c);
2269 	}
2270 	break;
2271 
2272 	/*
2273 	 * Found '<' in SGML_PCDATA content; treat this mode nearly like
2274 	 * S_litteral, but recognize '<!' and '<?' to filter out comments and
2275 	 * processing instructions.  - kw
2276 	 */
2277     case S_pcdata:
2278 	if (!string->size && TOASCII(unsign_c) < 127) {		/* first after '<' */
2279 	    if (c == '!') {	/* <! */
2280 		/*
2281 		 * Terminate and set up for possible comment, identifier,
2282 		 * declaration, or marked section as under S_tag.  - kw
2283 		 */
2284 		context->state = S_exclamation;
2285 		context->lead_exclamation = TRUE;
2286 		context->doctype_bracket = FALSE;
2287 		context->first_bracket = FALSE;
2288 		HTChunkPutc(string, c);
2289 		break;
2290 	    } else if (c == '?') {	/* <? - ignore as a PI until '>' - kw */
2291 		CTRACE((tfp,
2292 			"SGML: Found PI in PCDATA, junking it until '>'\n"));
2293 #ifdef USE_PRETTYSRC
2294 		if (psrc_view) {
2295 		    PSRCSTART(abracket);
2296 		    PUTS("<?");
2297 		    PSRCSTOP(abracket);
2298 		}
2299 #endif
2300 		context->state = S_pi;
2301 		break;
2302 	    }
2303 	}
2304 	goto case_S_litteral;
2305 
2306 	/*
2307 	 * Found '<' in SGML_SCRIPT content; treat this mode nearly like
2308 	 * S_litteral, but recognize '<!' to allow the content to be treated as
2309 	 * a comment by lynx.
2310 	 */
2311     case S_script:
2312 	if (!string->size && TOASCII(unsign_c) < 127) {		/* first after '<' */
2313 	    if (c == '!') {	/* <! */
2314 		/*
2315 		 * Terminate and set up for possible comment, identifier,
2316 		 * declaration, or marked section as under S_tag.  - kw
2317 		 */
2318 		context->state = S_exclamation;
2319 		context->lead_exclamation = TRUE;
2320 		context->doctype_bracket = FALSE;
2321 		context->first_bracket = FALSE;
2322 		HTChunkPutc(string, c);
2323 		break;
2324 	    }
2325 	}
2326 	goto case_S_litteral;
2327 
2328 	/*
2329 	 * In litteral mode, waits only for specific end tag (for compatibility
2330 	 * with old servers, and for Lynx).  - FM
2331 	 */
2332       case_S_litteral:
2333     case S_litteral:
2334 	/*PSRC:this case not understood completely by HV, not done */
2335 	HTChunkPutc(string, c);
2336 #ifdef USE_PRETTYSRC
2337 	if (psrc_view) {
2338 	    /* there is nothing useful in the element_stack */
2339 	    testtag = context->current_tag;
2340 	} else
2341 #endif
2342 	    testtag = (context->element_stack
2343 		       ? context->element_stack->tag
2344 		       : NULL);
2345 
2346 	if (testtag == NULL || testtag->name == NULL) {
2347 	    string->size--;
2348 	    context->state = S_text;
2349 	    goto top1;
2350 	}
2351 
2352 	/*
2353 	 * Normally when we get the closing ">",
2354 	 *      testtag contains something like "TITLE"
2355 	 *      string contains something like "/title>"
2356 	 * so we decrement by 2 to compare the final character of each.
2357 	 */
2358 	testlast = string->size - 2 - context->trailing_spaces - context->leading_spaces;
2359 
2360 	if (TOUPPER(c) != ((testlast < 0)
2361 			   ? '/'
2362 			   : testtag->name[testlast])) {
2363 	    int i;
2364 
2365 	    /*
2366 	     * If complete match, end litteral.
2367 	     */
2368 	    if ((c == '>') &&
2369 		testlast >= 0 && !testtag->name[testlast]) {
2370 #ifdef USE_PRETTYSRC
2371 		if (psrc_view) {
2372 		    char *trailing = NULL;
2373 
2374 		    if (context->trailing_spaces) {
2375 			StrAllocCopy(trailing,
2376 				     string->data
2377 				     + string->size
2378 				     - 1
2379 				     - context->trailing_spaces);
2380 			trailing[context->trailing_spaces] = '\0';
2381 		    }
2382 
2383 		    PSRCSTART(abracket);
2384 		    PUTS("</");
2385 		    PSRCSTOP(abracket);
2386 		    PSRCSTART(tag);
2387 
2388 		    strcpy(string->data, context->current_tag->name);
2389 		    transform_tag(context, string);
2390 		    PUTS(string->data);
2391 
2392 		    if (trailing) {
2393 			PUTS(trailing);
2394 			FREE(trailing);
2395 		    }
2396 
2397 		    PSRCSTOP(tag);
2398 		    PSRCSTART(abracket);
2399 		    PUTC('>');
2400 		    PSRCSTOP(abracket);
2401 
2402 		    context->current_tag = NULL;
2403 		} else
2404 #endif
2405 		    end_element(context, context->element_stack->tag);
2406 
2407 		string->size = 0;
2408 		context->current_attribute_number = INVALID;
2409 		context->state = S_text;
2410 		context->leading_spaces = 0;
2411 		context->trailing_spaces = 0;
2412 		break;
2413 	    }
2414 
2415 	    /*
2416 	     * Allow whitespace between the "<" or ">" and the keyword, for
2417 	     * error-recovery.
2418 	     */
2419 	    if (isspace(UCH(c))) {
2420 		if (testlast == -1) {
2421 		    context->leading_spaces += 1;
2422 		    CTRACE2(TRACE_SGML, (tfp, "leading spaces: %d\n", context->leading_spaces));
2423 		    break;
2424 		} else if (testlast > 0) {
2425 		    context->trailing_spaces += 1;
2426 		    CTRACE2(TRACE_SGML, (tfp, "trailing spaces: %d\n", context->trailing_spaces));
2427 		    break;
2428 		}
2429 	    }
2430 
2431 	    /*
2432 	     * Mismatch - recover.
2433 	     */
2434 	    context->leading_spaces = 0;
2435 	    context->trailing_spaces = 0;
2436 	    if (((testtag->contents != SGML_LITTERAL &&
2437 		  (testtag->flags & Tgf_strict)) ||
2438 		 (context->state == S_pcdata &&
2439 		  (testtag->flags & (Tgf_strict | Tgf_endO)))) &&
2440 		(testlast > -1 &&
2441 		 (c == '>' || testlast > 0 || IsNmStart(c)))) {
2442 		context->state = S_end;
2443 		string->size--;
2444 		for (i = 0; i < string->size; i++)	/* remove '/' */
2445 		    string->data[i] = string->data[i + 1];
2446 		if ((string->size == 1) ? IsNmStart(c) : IsNmChar(c))
2447 		    break;
2448 		string->size--;
2449 		goto top1;
2450 	    }
2451 	    if (context->state == S_pcdata &&
2452 		(testtag->flags & (Tgf_strict | Tgf_endO)) &&
2453 		(testlast < 0 && IsNmStart(c))) {
2454 		context->state = S_tag;
2455 		break;
2456 	    }
2457 	    /*
2458 	     * If Mismatch:  recover string literally.
2459 	     */
2460 	    PUTC('<');
2461 	    for (i = 0; i < string->size - 1; i++)	/* recover, except last c */
2462 		PUTC(string->data[i]);
2463 	    string->size = 0;
2464 	    context->state = S_text;
2465 	    goto top1;		/* to recover last c */
2466 	}
2467 	break;
2468 
2469 	/*
2470 	 * Character reference (numeric entity) or named entity.
2471 	 */
2472     case S_ero:
2473 	if (c == '#') {
2474 	    /*
2475 	     * Setting up for possible numeric entity.
2476 	     */
2477 	    context->state = S_cro;	/* &# is Char Ref Open */
2478 	    break;
2479 	}
2480 	context->state = S_entity;	/* Fall through! */
2481 
2482 	/*
2483 	 * Handle possible named entity.
2484 	 */
2485     case S_entity:
2486 	if (TOASCII(unsign_c) < 127 && (string->size ?	/* S/390 -- gil -- 1029 */
2487 					isalnum(UCH(c)) : isalpha(UCH(c)))) {
2488 	    /* Should probably use IsNmStart/IsNmChar above (is that right?),
2489 	       but the world is not ready for that - there's &nbsp: (note
2490 	       colon!) and stuff around. */
2491 	    /*
2492 	     * Accept valid ASCII character.  - FM
2493 	     */
2494 	    HTChunkPutc(string, c);
2495 	} else if (string->size == 0) {
2496 	    /*
2497 	     * It was an ampersand that's just text, so output the ampersand
2498 	     * and recycle this character.  - FM
2499 	     */
2500 #ifdef USE_PRETTYSRC
2501 	    if (psrc_view)
2502 		PSRCSTART(badseq);
2503 #endif
2504 	    PUTC('&');
2505 #ifdef USE_PRETTYSRC
2506 	    if (psrc_view)
2507 		PSRCSTOP(badseq);
2508 #endif
2509 	    context->state = S_text;
2510 	    goto top1;
2511 	} else {
2512 	    /*
2513 	     * Terminate entity name and try to handle it.  - FM
2514 	     */
2515 	    HTChunkTerminate(string);
2516 #ifdef USE_PRETTYSRC
2517 	    entity_string = string->data;
2518 #endif
2519 	    /* S/390 -- gil -- 1039 */
2520 	    /* CTRACE((tfp, "%s: %d: %s\n", __FILE__, __LINE__, string->data)); */
2521 	    if (!strcmp(string->data, "zwnj") &&
2522 		(!context->element_stack ||
2523 		 (context->element_stack->tag &&
2524 		  context->element_stack->tag->contents == SGML_MIXED))) {
2525 		/*
2526 		 * Handle zwnj (8204) as <WBR>.  - FM
2527 		 */
2528 		char temp[8];
2529 
2530 		CTRACE((tfp,
2531 			"SGML_character: Handling 'zwnj' entity as 'WBR' element.\n"));
2532 
2533 		if (c != ';') {
2534 		    sprintf(temp, "<WBR>%c", c);
2535 		} else {
2536 		    sprintf(temp, "<WBR>");
2537 		}
2538 		if (context->recover == NULL) {
2539 		    StrAllocCopy(context->recover, temp);
2540 		    context->recover_index = 0;
2541 		} else {
2542 		    StrAllocCat(context->recover, temp);
2543 		}
2544 		string->size = 0;
2545 		context->state = S_text;
2546 		break;
2547 	    } else {
2548 		handle_entity(context, '\0');
2549 	    }
2550 	    string->size = 0;
2551 	    context->state = S_text;
2552 	    /*
2553 	     * Don't eat the terminator if we didn't find the entity name and
2554 	     * therefore sent the raw string via handle_entity(), or if the
2555 	     * terminator is not the "standard" semi-colon for HTML.  - FM
2556 	     */
2557 #ifdef USE_PRETTYSRC
2558 	    if (psrc_view && FoundEntity && c == ';') {
2559 		PSRCSTART(entity);
2560 		PUTC(c);
2561 		PSRCSTOP(entity);
2562 	    }
2563 #endif
2564 	    if (!FoundEntity || c != ';')
2565 		goto top1;
2566 	}
2567 	break;
2568 
2569 	/*
2570 	 * Check for a numeric entity.
2571 	 */
2572     case S_cro:
2573 	if (TOASCII(unsign_c) < 127 && TOLOWER(UCH(c)) == 'x') {	/* S/390 -- gil -- 1060 */
2574 	    context->isHex = TRUE;
2575 	    context->state = S_incro;
2576 	} else if (TOASCII(unsign_c) < 127 && isdigit(UCH(c))) {
2577 	    /*
2578 	     * Accept only valid ASCII digits.  - FM
2579 	     */
2580 	    HTChunkPutc(string, c);	/* accumulate a character NUMBER */
2581 	    context->isHex = FALSE;
2582 	    context->state = S_incro;
2583 	} else if (string->size == 0) {
2584 	    /*
2585 	     * No 'x' or digit following the "&#" so recover them and recycle
2586 	     * the character.  - FM
2587 	     */
2588 #ifdef USE_PRETTYSRC
2589 	    if (psrc_view)
2590 		PSRCSTART(badseq);
2591 #endif
2592 	    PUTC('&');
2593 	    PUTC('#');
2594 #ifdef USE_PRETTYSRC
2595 	    if (psrc_view)
2596 		PSRCSTOP(badseq);
2597 #endif
2598 	    context->state = S_text;
2599 	    goto top1;
2600 	}
2601 	break;
2602 
2603 	/*
2604 	 * Handle a numeric entity.
2605 	 */
2606     case S_incro:
2607 	/* S/390 -- gil -- 1075 */
2608 	if ((TOASCII(unsign_c) < 127) &&
2609 	    (context->isHex
2610 	     ? isxdigit(UCH(c))
2611 	     : isdigit(UCH(c)))) {
2612 	    /*
2613 	     * Accept only valid hex or ASCII digits.  - FM
2614 	     */
2615 	    HTChunkPutc(string, c);	/* accumulate a character NUMBER */
2616 	} else if (string->size == 0) {
2617 	    /*
2618 	     * No hex digit following the "&#x" so recover them and recycle the
2619 	     * character.  - FM
2620 	     */
2621 #ifdef USE_PRETTYSRC
2622 	    if (psrc_view)
2623 		PSRCSTART(badseq);
2624 #endif
2625 	    PUTS("&#x");
2626 #ifdef USE_PRETTYSRC
2627 	    if (psrc_view)
2628 		PSRCSTOP(badseq);
2629 #endif
2630 	    context->isHex = FALSE;
2631 	    context->state = S_text;
2632 	    goto top1;
2633 	} else {
2634 	    /*
2635 	     * Terminate the numeric entity and try to handle it.  - FM
2636 	     */
2637 	    UCode_t code;
2638 	    int i;
2639 
2640 	    HTChunkTerminate(string);
2641 #ifdef USE_PRETTYSRC
2642 	    entity_string = string->data;
2643 #endif
2644 	    if (UCScanCode(&code, string->data, context->isHex)) {
2645 
2646 /* =============== work in ASCII below here ===============  S/390 -- gil -- 1092 */
2647 		if (AssumeCP1252(context)) {
2648 		    code = LYcp1252ToUnicode(code);
2649 		}
2650 		/*
2651 		 * Check for special values.  - FM
2652 		 */
2653 		if ((code == 8204) &&
2654 		    (!context->element_stack ||
2655 		     (context->element_stack->tag &&
2656 		      context->element_stack->tag->contents == SGML_MIXED))) {
2657 		    /*
2658 		     * Handle zwnj (8204) as <WBR>.  - FM
2659 		     */
2660 		    char temp[8];
2661 
2662 		    CTRACE((tfp,
2663 			    "SGML_character: Handling '8204' (zwnj) reference as 'WBR' element.\n"));
2664 
2665 		    /*
2666 		     * Include the terminator if it is not the standard
2667 		     * semi-colon.  - FM
2668 		     */
2669 		    if (c != ';') {
2670 			sprintf(temp, "<WBR>%c", c);
2671 		    } else {
2672 			sprintf(temp, "<WBR>");
2673 		    }
2674 		    /*
2675 		     * Add the replacement string to the recover buffer for
2676 		     * processing.  - FM
2677 		     */
2678 		    if (context->recover == NULL) {
2679 			StrAllocCopy(context->recover, temp);
2680 			context->recover_index = 0;
2681 		    } else {
2682 			StrAllocCat(context->recover, temp);
2683 		    }
2684 		    string->size = 0;
2685 		    context->isHex = FALSE;
2686 		    context->state = S_text;
2687 		    break;
2688 		} else if (put_special_unicodes(context, code)) {
2689 		    /*
2690 		     * We handled the value as a special character, so recycle
2691 		     * the terminator or break.  - FM
2692 		     */
2693 #ifdef USE_PRETTYSRC
2694 		    if (psrc_view) {
2695 			PSRCSTART(entity);
2696 			PUTS((context->isHex ? "&#x" : "&#"));
2697 			PUTS(entity_string);
2698 			if (c == ';')
2699 			    PUTC(';');
2700 			PSRCSTOP(entity);
2701 		    }
2702 #endif
2703 		    string->size = 0;
2704 		    context->isHex = FALSE;
2705 		    context->state = S_text;
2706 		    if (c != ';')
2707 			goto top1;
2708 		    break;
2709 		}
2710 		/*
2711 		 * Seek a translation from the chartrans tables.
2712 		 */
2713 		if ((uck = UCTransUniChar(code,
2714 					  context->outUCLYhndl)) >= 32 &&
2715 		    uck < 256 &&
2716 		    (uck < 127 ||
2717 		     uck >= LYlowest_eightbit[context->outUCLYhndl])) {
2718 #ifdef USE_PRETTYSRC
2719 		    if (!psrc_view) {
2720 #endif
2721 			PUTC(FROMASCII((char) uck));
2722 #ifdef USE_PRETTYSRC
2723 		    } else {
2724 			put_pretty_number(context);
2725 		    }
2726 #endif
2727 		} else if ((uck == -4 ||
2728 			    (context->T.repl_translated_C0 &&
2729 			     uck > 0 && uck < 32)) &&
2730 		    /*
2731 		     * Not found; look for replacement string.
2732 		     */
2733 			   (uck = UCTransUniCharStr(replace_buf, 60, code,
2734 						    context->outUCLYhndl,
2735 						    0) >= 0)) {
2736 #ifdef USE_PRETTYSRC
2737 		    if (psrc_view) {
2738 			put_pretty_number(context);
2739 		    } else
2740 #endif
2741 			PUTS(replace_buf);
2742 		    /*
2743 		     * If we're displaying UTF-8, try that now.  - FM
2744 		     */
2745 		} else if (context->T.output_utf8 && PUTUTF8(code)) {
2746 		    ;		/* do nothing more */
2747 		    /*
		     * Ignore 8205 (zwj), 8206 (lrm), and 8207 (rlm), if we get
2749 		     * to here.  - FM
2750 		     */
2751 		} else if (code == 8205 ||
2752 			   code == 8206 ||
2753 			   code == 8207) {
2754 		    if (TRACE) {
2755 			string->size--;
2756 			LYStrNCpy(replace_buf,
2757 				  string->data,
2758 				  (string->size < 64 ? string->size : 63));
2759 			fprintf(tfp,
2760 				"SGML_character: Ignoring '%s%s'.\n",
2761 				(context->isHex ? "&#x" : "&#"),
2762 				replace_buf);
2763 		    }
2764 #ifdef USE_PRETTYSRC
2765 		    if (psrc_view) {
2766 			PSRCSTART(badseq);
2767 			PUTS((context->isHex ? "&#x" : "&#"));
2768 			PUTS(entity_string);
2769 			if (c == ';')
2770 			    PUTC(';');
2771 			PSRCSTOP(badseq);
2772 		    }
2773 #endif
2774 		    string->size = 0;
2775 		    context->isHex = FALSE;
2776 		    context->state = S_text;
2777 		    if (c != ';')
2778 			goto top1;
2779 		    break;
2780 		    /*
2781 		     * Show the numeric entity if we get to here and the value:
2782 		     * (1) Is greater than 255 (but use ASCII characters for
2783 		     * spaces or dashes).
2784 		     * (2) Is less than 32, and not valid or we don't have
2785 		     * HTCJK set.
2786 		     * (3) Is 127 and we don't have HTPassHighCtrlRaw or HTCJK
2787 		     * set.
2788 		     * (4) Is 128 - 159 and we don't have HTPassHighCtrlNum
2789 		     * set.
2790 		     * - FM
2791 		     */
2792 		} else if ((code > 255) ||
2793 			   (code < ' ' &&	/* S/390 -- gil -- 1140 */
2794 			    code != '\t' && code != '\n' && code != '\r' &&
2795 			    !IS_CJK_TTY) ||
2796 			   (TOASCII(code) == 127 &&
2797 			    !(HTPassHighCtrlRaw || IS_CJK_TTY)) ||
2798 			   (TOASCII(code) > 127 && code < 160 &&
2799 			    !HTPassHighCtrlNum)) {
2800 		    /*
2801 		     * Unhandled or illegal value.  Recover the "&#" or "&#x"
2802 		     * and digit(s), and recycle the terminator.  - FM
2803 		     */
2804 #ifdef USE_PRETTYSRC
2805 		    if (psrc_view) {
2806 			PSRCSTART(badseq);
2807 		    }
2808 #endif
2809 		    if (context->isHex) {
2810 			PUTS("&#x");
2811 			context->isHex = FALSE;
2812 		    } else {
2813 			PUTS("&#");
2814 		    }
2815 		    string->size--;
2816 		    for (i = 0; i < string->size; i++)	/* recover */
2817 			PUTC(string->data[i]);
2818 #ifdef USE_PRETTYSRC
2819 		    if (psrc_view) {
2820 			PSRCSTOP(badseq);
2821 		    }
2822 #endif
2823 		    string->size = 0;
2824 		    context->isHex = FALSE;
2825 		    context->state = S_text;
2826 		    goto top1;
2827 		} else if (TOASCII(code) < 161 ||	/* S/390 -- gil -- 1162 */
2828 			   HTPassEightBitNum ||
2829 			   IncludesLatin1Enc) {
2830 		    /*
2831 		     * No conversion needed.  - FM
2832 		     */
2833 #ifdef USE_PRETTYSRC
2834 		    if (psrc_view) {
2835 			put_pretty_number(context);
2836 		    } else
2837 #endif
2838 			PUTC(FROMASCII((char) code));
2839 		} else {
2840 		    /*
2841 		     * Handle as named entity.  - FM
2842 		     */
2843 		    code -= 160;
2844 		    EntityName = HTMLGetEntityName(code);
2845 		    if (EntityName && EntityName[0] != '\0') {
2846 			string->size = 0;
2847 			HTChunkPuts(string, EntityName);
2848 			HTChunkTerminate(string);
2849 			handle_entity(context, '\0');
2850 			/*
2851 			 * Add a semi-colon if something went wrong and
2852 			 * handle_entity() sent the string.  - FM
2853 			 */
2854 			if (!FoundEntity) {
2855 			    PUTC(';');
2856 			}
2857 		    } else {
2858 			/*
2859 			 * Our conversion failed, so recover the "&#" and
2860 			 * digit(s), and recycle the terminator.  - FM
2861 			 */
2862 #ifdef USE_PRETTYSRC
2863 			if (psrc_view)
2864 			    PSRCSTART(badseq);
2865 #endif
2866 			if (context->isHex) {
2867 			    PUTS("&#x");
2868 			    context->isHex = FALSE;
2869 			} else {
2870 			    PUTS("&#");
2871 			}
2872 			string->size--;
2873 			for (i = 0; i < string->size; i++)	/* recover */
2874 			    PUTC(string->data[i]);
2875 #ifdef USE_PRETTYSRC
2876 			if (psrc_view)
2877 			    PSRCSTOP(badseq);
2878 #endif
2879 			string->size = 0;
2880 			context->isHex = FALSE;
2881 			context->state = S_text;
2882 			goto top1;
2883 		    }
2884 		}
2885 		/*
2886 		 * If we get to here, we succeeded.  Hoorah!!!  - FM
2887 		 */
2888 		string->size = 0;
2889 		context->isHex = FALSE;
2890 		context->state = S_text;
2891 		/*
2892 		 * Don't eat the terminator if it's not the "standard"
2893 		 * semi-colon for HTML.  - FM
2894 		 */
2895 		if (c != ';') {
2896 		    goto top1;
2897 		}
2898 	    } else {
2899 		/*
2900 		 * Not an entity, and don't know why not, so add the terminator
2901 		 * to the string, output the "&#" or "&#x", and process the
2902 		 * string via the recover element.  - FM
2903 		 */
2904 		string->size--;
2905 		HTChunkPutc(string, c);
2906 		HTChunkTerminate(string);
2907 #ifdef USE_PRETTYSRC
2908 		if (psrc_view)
2909 		    PSRCSTART(badseq);
2910 #endif
2911 		if (context->isHex) {
2912 		    PUTS("&#x");
2913 		    context->isHex = FALSE;
2914 		} else {
2915 		    PUTS("&#");
2916 		}
2917 #ifdef USE_PRETTYSRC
2918 		if (psrc_view)
2919 		    PSRCSTOP(badseq);
2920 #endif
2921 		if (context->recover == NULL) {
2922 		    StrAllocCopy(context->recover, string->data);
2923 		    context->recover_index = 0;
2924 		} else {
2925 		    StrAllocCat(context->recover, string->data);
2926 		}
2927 		string->size = 0;
2928 		context->isHex = FALSE;
2929 		context->state = S_text;
2930 		break;
2931 	    }
2932 	}
2933 	break;
2934 
2935 	/*
2936 	 * Tag
2937 	 */
2938     case S_tag:		/* new tag */
2939 	if (TOASCII(unsign_c) < 127 && (string->size ?	/* S/390 -- gil -- 1179 */
2940 					IsNmChar(c) : IsNmStart(c))) {
2941 	    /*
2942 	     * Add valid ASCII character.  - FM
2943 	     */
2944 	    HTChunkPutc(string, c);
2945 	} else if (c == '!' && !string->size) {		/* <! */
2946 	    /*
2947 	     * Terminate and set up for possible comment, identifier,
2948 	     * declaration, or marked section.  - FM
2949 	     */
2950 	    context->state = S_exclamation;
2951 	    context->lead_exclamation = TRUE;
2952 	    context->doctype_bracket = FALSE;
2953 	    context->first_bracket = FALSE;
2954 	    HTChunkPutc(string, c);
2955 	    break;
2956 	} else if (!string->size &&
2957 		   (TOASCII(unsign_c) <= 160 &&		/* S/390 -- gil -- 1196 */
2958 		    (c != '/' && c != '?' && c != '_' && c != ':'))) {
2959 	    /*
2960 	     * '<' must be followed by an ASCII letter to be a valid start tag.
2961 	     * Here it isn't, nor do we have a '/' for an end tag, nor one of
2962 	     * some other characters with a special meaning for SGML or which
2963 	     * are likely to be legal Name Start characters in XML or some
2964 	     * other extension.  So recover the '<' and following character as
2965 	     * data.  - FM & KW
2966 	     */
2967 	    context->state = S_text;
2968 #ifdef USE_PRETTYSRC
2969 	    if (psrc_view)
2970 		PSRCSTART(badseq);
2971 #endif
2972 	    PUTC('<');
2973 #ifdef USE_PRETTYSRC
2974 	    if (psrc_view)
2975 		PSRCSTOP(badseq);
2976 #endif
2977 	    goto top1;
2978 	} else {		/* End of tag name */
2979 	    /*
2980 	     * Try to handle tag.  - FM
2981 	     */
2982 	    HTTag *t;
2983 
2984 	    if (c == '/') {
2985 		if (string->size == 0) {
2986 		    context->state = S_end;
2987 		    break;
2988 		}
2989 		CTRACE((tfp, "SGML: `<%.*s/' found!\n", string->size, string->data));
2990 	    }
2991 	    HTChunkTerminate(string);
2992 
2993 	    t = SGMLFindTag(dtd, string->data);
2994 	    if (t == context->unknown_tag &&
2995 		((c == ':' &&
2996 		  string->size == 4 && 0 == strcasecomp(string->data, "URL")) ||
2997 		 (string->size > 4 && 0 == strncasecomp(string->data, "URL:", 4)))) {
2998 		/*
2999 		 * Treat <URL:  as text rather than a junk tag, so we display
3000 		 * it and the URL (Lynxism 8-).  - FM
3001 		 */
3002 #ifdef USE_PRETTYSRC
3003 		if (psrc_view)
3004 		    PSRCSTART(badseq);
3005 #endif
3006 		PUTC('<');
3007 		PUTS(string->data);	/* recover */
3008 		PUTC(c);
3009 #ifdef USE_PRETTYSRC
3010 		if (psrc_view)
3011 		    PSRCSTOP(badseq);
3012 #endif
3013 		CTRACE((tfp, "SGML: Treating <%s%c as text\n",
3014 			string->data, c));
3015 		string->size = 0;
3016 		context->state = S_text;
3017 		break;
3018 	    }
3019 	    if (c == '/' && t) {
3020 		/*
3021 		 * Element name was ended by '/'.  Remember the tag that ended
3022 		 * thusly, we'll interpret this as either an indication of an
3023 		 * empty element (if '>' follows directly) or do some
3024 		 * SGMLshortref-ish treatment.  - kw
3025 		 */
3026 		context->slashedtag = t;
3027 	    }
3028 	    if (!t) {
3029 		if (c == '?' && string->size <= 1) {
3030 		    CTRACE((tfp, "SGML: Found PI, looking for '>'\n"));
3031 #ifdef USE_PRETTYSRC
3032 		    if (psrc_view) {
3033 			PSRCSTART(abracket);
3034 			PUTS("<?");
3035 			PSRCSTOP(abracket);
3036 		    }
3037 #endif
3038 		    string->size = 0;
3039 		    context->state = S_pi;
3040 		    HTChunkPutc(string, c);
3041 		    break;
3042 		}
3043 		CTRACE((tfp, "SGML: *** Invalid element %s\n",
3044 			string->data));
3045 
3046 #ifdef USE_PRETTYSRC
3047 		if (psrc_view) {
3048 		    PSRCSTART(abracket);
3049 		    PUTC('<');
3050 		    PSRCSTOP(abracket);
3051 		    PSRCSTART(badtag);
3052 		    transform_tag(context, string);
3053 		    PUTS(string->data);
3054 		    if (c == '>') {
3055 			PSRCSTOP(badtag);
3056 			PSRCSTART(abracket);
3057 			PUTC('>');
3058 			PSRCSTOP(abracket);
3059 		    } else {
3060 			PUTC(c);
3061 		    }
3062 		}
3063 #endif
3064 		context->state = (c == '>') ? S_text : S_junk_tag;
3065 		break;
3066 	    } else if (t == context->unknown_tag) {
3067 		CTRACE((tfp, "SGML: *** Unknown element \"%s\"\n",
3068 			string->data));
3069 		/*
3070 		 * Fall through and treat like valid tag for attribute parsing.
3071 		 * - KW
3072 		 */
3073 
3074 	    }
3075 	    context->current_tag = t;
3076 
3077 #ifdef USE_PRETTYSRC
3078 	    if (psrc_view) {
3079 		PSRCSTART(abracket);
3080 		PUTC('<');
3081 		PSRCSTOP(abracket);
3082 		if (t != context->unknown_tag)
3083 		    PSRCSTART(tag);
3084 		else
3085 		    PSRCSTART(badtag);
3086 		transform_tag(context, string);
3087 		PUTS(string->data);
3088 		if (t != context->unknown_tag)
3089 		    PSRCSTOP(tag);
3090 		else
3091 		    PSRCSTOP(badtag);
3092 	    }
3093 	    if (!psrc_view)	/*don't waste time */
3094 #endif
3095 	    {
3096 		/*
3097 		 * Clear out attributes.
3098 		 */
3099 		memset((void *) context->present, 0, sizeof(BOOL) *
3100 		         (unsigned) (context->current_tag->number_of_attributes));
3101 	    }
3102 
3103 	    string->size = 0;
3104 	    context->current_attribute_number = INVALID;
3105 #ifdef USE_PRETTYSRC
3106 	    if (psrc_view) {
3107 		if (c == '>' || c == '<' || (c == '/' && context->slashedtag)) {
3108 		    if (c != '<') {
3109 			PSRCSTART(abracket);
3110 			PUTC(c);
3111 			PSRCSTOP(abracket);
3112 			context->state = (c == '>') ? S_text : S_tagname_slash;
3113 		    } else {
3114 			context->state = S_tag;
3115 		    }
3116 		} else {
3117 		    if (!WHITE(c))
3118 			PUTC(c);
3119 		    context->state = S_tag_gap;
3120 		}
3121 	    } else
3122 #endif
3123 	    if (c == '>' || c == '<' || (c == '/' && context->slashedtag)) {
3124 		if (context->current_tag->name)
3125 		    start_element(context);
3126 		context->state = (c == '>') ? S_text :
3127 		    (c == '<') ? S_tag : S_tagname_slash;
3128 	    } else {
3129 		context->state = S_tag_gap;
3130 	    }
3131 	}
3132 	break;
3133 
3134     case S_exclamation:
3135 	if (context->lead_exclamation && c == '-') {
3136 	    /*
3137 	     * Set up for possible comment.  - FM
3138 	     */
3139 	    context->lead_exclamation = FALSE;
3140 	    context->first_dash = TRUE;
3141 	    HTChunkPutc(string, c);
3142 	    break;
3143 	}
3144 	if (context->lead_exclamation && c == '[') {
3145 	    /*
3146 	     * Set up for possible marked section.  - FM
3147 	     */
3148 	    context->lead_exclamation = FALSE;
3149 	    context->first_bracket = TRUE;
3150 	    context->second_bracket = FALSE;
3151 	    HTChunkPutc(string, c);
3152 	    context->state = S_marked;
3153 	    break;
3154 	}
3155 	if (context->first_dash && c == '-') {
3156 	    /*
3157 	     * Set up to handle comment.  - FM
3158 	     */
3159 	    context->lead_exclamation = FALSE;
3160 	    context->first_dash = FALSE;
3161 	    context->end_comment = FALSE;
3162 	    HTChunkPutc(string, c);
3163 	    context->state = S_comment;
3164 	    break;
3165 	}
3166 	context->lead_exclamation = FALSE;
3167 	context->first_dash = FALSE;
3168 	if (c == '>') {
3169 	    /*
3170 	     * Try to handle identifier.  - FM
3171 	     */
3172 	    HTChunkTerminate(string);
3173 #ifdef USE_PRETTYSRC
3174 	    if (psrc_view) {
3175 		PSRCSTART(sgmlspecial);
3176 		PUTC('<');
3177 		PUTS(string->data);
3178 		PUTC('>');
3179 		PSRCSTOP(sgmlspecial);
3180 	    } else
3181 #endif
3182 		handle_identifier(context);
3183 	    string->size = 0;
3184 	    context->state = S_text;
3185 	    break;
3186 	}
3187 	if (WHITE(c)) {
3188 	    if (string->size == 8 &&
3189 		!strncasecomp(string->data, "!DOCTYPE", 8)) {
3190 		/*
3191 		 * Set up for DOCTYPE declaration.  - FM
3192 		 */
3193 		HTChunkPutc(string, c);
3194 		context->doctype_bracket = FALSE;
3195 		context->state = S_doctype;
3196 		break;
3197 	    }
3198 	    if (string->size == 7 &&
3199 		!strncasecomp(string->data, "!ENTITY", 7)) {
3200 		/*
3201 		 * Set up for ENTITY declaration.  - FM
3202 		 */
3203 		HTChunkPutc(string, c);
3204 		context->first_dash = FALSE;
3205 		context->end_comment = TRUE;
3206 		context->state = S_sgmlent;
3207 		break;
3208 	    }
3209 	    if (string->size == 8 &&
3210 		!strncasecomp(string->data, "!ELEMENT", 8)) {
3211 		/*
3212 		 * Set up for ELEMENT declaration.  - FM
3213 		 */
3214 		HTChunkPutc(string, c);
3215 		context->first_dash = FALSE;
3216 		context->end_comment = TRUE;
3217 		context->state = S_sgmlele;
3218 		break;
3219 	    }
3220 	    if (string->size == 8 &&
3221 		!strncasecomp(string->data, "!ATTLIST", 8)) {
3222 		/*
3223 		 * Set up for ATTLIST declaration.  - FM
3224 		 */
3225 		HTChunkPutc(string, c);
3226 		context->first_dash = FALSE;
3227 		context->end_comment = TRUE;
3228 		context->state = S_sgmlatt;
3229 		break;
3230 	    }
3231 	}
3232 	HTChunkPutc(string, c);
3233 	break;
3234 
3235     case S_comment:		/* Expecting comment. - FM */
3236 	if (historical_comments) {
3237 	    /*
3238 	     * Any '>' terminates.  - FM
3239 	     */
3240 	    if (c == '>') {
3241 		HTChunkTerminate(string);
3242 #ifdef USE_PRETTYSRC
3243 		if (psrc_view) {
3244 		    PSRCSTART(comm);
3245 		    PUTC('<');
3246 		    PUTS_TR(string->data);
3247 		    PUTC('>');
3248 		    PSRCSTOP(comm);
3249 		} else
3250 #endif
3251 		    handle_comment(context);
3252 		string->size = 0;
3253 		context->end_comment = FALSE;
3254 		context->first_dash = FALSE;
3255 		context->state = S_text;
3256 		break;
3257 	    }
3258 	    goto S_comment_put_c;
3259 	}
3260 	if (!context->first_dash && c == '-') {
3261 	    HTChunkPutc(string, c);
3262 	    context->first_dash = TRUE;
3263 	    break;
3264 	}
3265 	if (context->first_dash && c == '-') {
3266 	    HTChunkPutc(string, c);
3267 	    context->first_dash = FALSE;
3268 	    if (!context->end_comment)
3269 		context->end_comment = TRUE;
3270 	    else if (!minimal_comments)
3271 		/*
3272 		 * Validly treat '--' pairs as successive comments (for
3273 		 * minimal, any "--WHITE>" terminates).  - FM
3274 		 */
3275 		context->end_comment = FALSE;
3276 	    break;
3277 	}
3278 	if (context->end_comment && c == '>') {
3279 	    /*
3280 	     * Terminate and handle the comment.  - FM
3281 	     */
3282 	    HTChunkTerminate(string);
3283 #ifdef USE_PRETTYSRC
3284 	    if (psrc_view) {
3285 		PSRCSTART(comm);
3286 		PUTC('<');
3287 		PUTS_TR(string->data);
3288 		PUTC('>');
3289 		PSRCSTOP(comm);
3290 	    } else
3291 #endif
3292 		handle_comment(context);
3293 	    string->size = 0;
3294 	    context->end_comment = FALSE;
3295 	    context->first_dash = FALSE;
3296 	    context->state = S_text;
3297 	    break;
3298 	}
3299 	context->first_dash = FALSE;
3300 	if (context->end_comment && !isspace(UCH(c)))
3301 	    context->end_comment = FALSE;
3302 
3303       S_comment_put_c:
3304 	if (context->T.decode_utf8 &&
3305 	    *context->utf_buf) {
3306 	    HTChunkPuts(string, context->utf_buf);
3307 	    context->utf_buf_p = context->utf_buf;
3308 	    *(context->utf_buf_p) = '\0';
3309 	} else if (!IS_CJK_TTY &&
3310 		   (context->T.output_utf8 ||
3311 		    context->T.trans_from_uni)) {
3312 	    if (clong == 0xfffd && saved_char_in &&
3313 		HTPassEightBitRaw &&
3314 		saved_char_in >=
3315 		LYlowest_eightbit[context->outUCLYhndl]) {
3316 		HTChunkPutUtf8Char(string,
3317 				   (UCode_t) (0xf000 | saved_char_in));
3318 	    } else {
3319 		HTChunkPutUtf8Char(string, clong);
3320 	    }
3321 	} else if (saved_char_in && context->T.use_raw_char_in) {
3322 	    HTChunkPutc(string, saved_char_in);
3323 	} else {
3324 	    HTChunkPutc(string, c);
3325 	}
3326 	break;
3327 
3328     case S_doctype:		/* Expecting DOCTYPE. - FM */
3329 	if (context->doctype_bracket) {
3330 	    HTChunkPutc(string, c);
3331 	    if (c == ']')
3332 		context->doctype_bracket = FALSE;
3333 	    break;
3334 	}
3335 	if (c == '[' && WHITE(string->data[string->size - 1])) {
3336 	    HTChunkPutc(string, c);
3337 	    context->doctype_bracket = TRUE;
3338 	    break;
3339 	}
3340 	if (c == '>') {
3341 	    HTChunkTerminate(string);
3342 #ifdef USE_PRETTYSRC
3343 	    if (psrc_view) {
3344 		PSRCSTART(sgmlspecial);
3345 		PUTC('<');
3346 		PUTS(string->data);
3347 		PUTC('>');
3348 		PSRCSTOP(sgmlspecial);
3349 	    } else
3350 #endif
3351 		handle_doctype(context);
3352 	    string->size = 0;
3353 	    context->state = S_text;
3354 	    break;
3355 	}
3356 	HTChunkPutc(string, c);
3357 	break;
3358 
3359     case S_marked:		/* Expecting marked section. - FM */
3360 	if (context->first_bracket && c == '[') {
3361 	    HTChunkPutc(string, c);
3362 	    context->first_bracket = FALSE;
3363 	    context->second_bracket = TRUE;
3364 	    break;
3365 	}
3366 	if (context->second_bracket && c == ']' &&
3367 	    string->data[string->size - 1] == ']') {
3368 	    HTChunkPutc(string, c);
3369 	    context->second_bracket = FALSE;
3370 	    break;
3371 	}
3372 	if (!context->second_bracket && c == '>') {
3373 	    HTChunkTerminate(string);
3374 #ifdef USE_PRETTYSRC
3375 	    if (psrc_view) {
3376 		PSRCSTART(sgmlspecial);
3377 		PUTC('<');
3378 		PUTS(string->data);
3379 		PUTC('>');
3380 		PSRCSTOP(sgmlspecial);
3381 	    } else
3382 #endif
3383 		handle_marked(context);
3384 	    string->size = 0;
3385 	    context->state = S_text;
3386 	    break;
3387 	}
3388 	HTChunkPutc(string, c);
3389 	break;
3390 
3391     case S_sgmlent:		/* Expecting ENTITY. - FM */
3392 	if (!context->first_dash && c == '-') {
3393 	    HTChunkPutc(string, c);
3394 	    context->first_dash = TRUE;
3395 	    break;
3396 	}
3397 	if (context->first_dash && c == '-') {
3398 	    HTChunkPutc(string, c);
3399 	    context->first_dash = FALSE;
3400 	    if (!context->end_comment)
3401 		context->end_comment = TRUE;
3402 	    else
3403 		context->end_comment = FALSE;
3404 	    break;
3405 	}
3406 	if (context->end_comment && c == '>') {
3407 	    HTChunkTerminate(string);
3408 #ifdef USE_PRETTYSRC
3409 	    if (psrc_view) {
3410 		PSRCSTART(sgmlspecial);
3411 		PUTC('<');
3412 		PUTS(string->data);
3413 		PUTC('>');
3414 		PSRCSTOP(sgmlspecial);
3415 	    } else
3416 #endif
3417 		handle_sgmlent(context);
3418 	    string->size = 0;
3419 	    context->end_comment = FALSE;
3420 	    context->first_dash = FALSE;
3421 	    context->state = S_text;
3422 	    break;
3423 	}
3424 	context->first_dash = FALSE;
3425 	HTChunkPutc(string, c);
3426 	break;
3427 
3428     case S_sgmlele:		/* Expecting ELEMENT. - FM */
3429 	if (!context->first_dash && c == '-') {
3430 	    HTChunkPutc(string, c);
3431 	    context->first_dash = TRUE;
3432 	    break;
3433 	}
3434 	if (context->first_dash && c == '-') {
3435 	    HTChunkPutc(string, c);
3436 	    context->first_dash = FALSE;
3437 	    if (!context->end_comment)
3438 		context->end_comment = TRUE;
3439 	    else
3440 		context->end_comment = FALSE;
3441 	    break;
3442 	}
3443 	if (context->end_comment && c == '>') {
3444 	    HTChunkTerminate(string);
3445 #ifdef USE_PRETTYSRC
3446 	    if (psrc_view) {
3447 		PSRCSTART(sgmlspecial);
3448 		PUTC('<');
3449 		PUTS(string->data);
3450 		PUTC('>');
3451 		PSRCSTOP(sgmlspecial);
3452 	    } else
3453 #endif
3454 		handle_sgmlele(context);
3455 	    string->size = 0;
3456 	    context->end_comment = FALSE;
3457 	    context->first_dash = FALSE;
3458 	    context->state = S_text;
3459 	    break;
3460 	}
3461 	context->first_dash = FALSE;
3462 	HTChunkPutc(string, c);
3463 	break;
3464 
3465     case S_sgmlatt:		/* Expecting ATTLIST. - FM */
3466 	if (!context->first_dash && c == '-') {
3467 	    HTChunkPutc(string, c);
3468 	    context->first_dash = TRUE;
3469 	    break;
3470 	}
3471 	if (context->first_dash && c == '-') {
3472 	    HTChunkPutc(string, c);
3473 	    context->first_dash = FALSE;
3474 	    if (!context->end_comment)
3475 		context->end_comment = TRUE;
3476 	    else
3477 		context->end_comment = FALSE;
3478 	    break;
3479 	}
3480 	if (context->end_comment && c == '>') {
3481 	    HTChunkTerminate(string);
3482 #ifdef USE_PRETTYSRC
3483 	    if (psrc_view) {
3484 		PSRCSTART(sgmlspecial);
3485 		PUTC('<');
3486 		PUTS(string->data);
3487 		PUTC('>');
3488 		PSRCSTOP(sgmlspecial);
3489 	    } else
3490 #endif
3491 		handle_sgmlatt(context);
3492 	    string->size = 0;
3493 	    context->end_comment = FALSE;
3494 	    context->first_dash = FALSE;
3495 	    context->state = S_text;
3496 	    break;
3497 	}
3498 	context->first_dash = FALSE;
3499 	HTChunkPutc(string, c);
3500 	break;
3501 
3502     case S_tag_gap:		/* Expecting attribute or '>' */
3503 	if (WHITE(c)) {
3504 	    /* PUTC(c); - no, done as special case */
3505 	    break;		/* Gap between attributes */
3506 	}
3507 	if (c == '>') {		/* End of tag */
3508 #ifdef USE_PRETTYSRC
3509 	    if (!psrc_view)
3510 #endif
3511 		if (context->current_tag->name)
3512 		    start_element(context);
3513 #ifdef USE_PRETTYSRC
3514 	    if (psrc_view) {
3515 		PSRCSTART(abracket);
3516 		PUTC('>');
3517 		PSRCSTOP(abracket);
3518 	    }
3519 #endif
3520 	    context->state = S_text;
3521 	    break;
3522 	}
3523 	HTChunkPutc(string, c);
3524 	context->state = S_attr;	/* Get attribute */
3525 	break;
3526 
3527 	/* accumulating value */
3528     case S_attr:
3529 	if (WHITE(c) || (c == '>') || (c == '=')) {	/* End of word */
3530 	    if ((c == '>')
3531 		&& (string->size == 1)
3532 		&& (string->data[0] == '/')) {
3533 		if (context->extended_html
3534 		    && ignore_when_empty(context->current_tag)) {
3535 		    discard_empty(context);
3536 		}
3537 	    } else {
3538 		HTChunkTerminate(string);
3539 		handle_attribute_name(context, string->data);
3540 	    }
3541 #ifdef USE_PRETTYSRC
3542 	    if (!psrc_view) {
3543 #endif
3544 		string->size = 0;
3545 		if (c == '>') {	/* End of tag */
3546 		    if (context->current_tag->name)
3547 			start_element(context);
3548 		    context->state = S_text;
3549 		    break;
3550 		}
3551 #ifdef USE_PRETTYSRC
3552 	    } else {
3553 		PUTC(' ');
3554 		if (context->current_attribute_number == INVALID)
3555 		    PSRCSTART(badattr);
3556 		else
3557 		    PSRCSTART(attrib);
3558 		if (attrname_transform != 1) {
3559 		    if (attrname_transform == 0)
3560 			LYLowerCase(string->data);
3561 		    else
3562 			LYUpperCase(string->data);
3563 		}
3564 		PUTS(string->data);
3565 		if (c == '=' || WHITE(c))
3566 		    PUTC(c);
3567 		if (c == '=' || c == '>') {
3568 		    if (context->current_attribute_number == INVALID) {
3569 			PSRCSTOP(badattr);
3570 		    } else {
3571 			PSRCSTOP(attrib);
3572 		    }
3573 		}
3574 		if (c == '>') {
3575 		    PSRCSTART(abracket);
3576 		    PUTC('>');
3577 		    PSRCSTOP(abracket);
3578 		    context->state = S_text;
3579 		    break;
3580 		}
3581 		string->size = 0;
3582 	    }
3583 #endif
3584 	    context->state = (c == '=' ? S_equals : S_attr_gap);
3585 	} else {
3586 	    HTChunkPutc(string, c);
3587 	}
3588 	break;
3589 
3590     case S_attr_gap:		/* Expecting attribute or '=' or '>' */
3591 	if (WHITE(c)) {
3592 	    PRETTYSRC_PUTC(c);
3593 	    break;		/* Gap after attribute */
3594 	}
3595 	if (c == '>') {		/* End of tag */
3596 #ifdef USE_PRETTYSRC
3597 	    if (psrc_view) {
3598 		if (context->current_attribute_number == INVALID) {
3599 		    PSRCSTOP(badattr);
3600 		} else {
3601 		    PSRCSTOP(attrib);
3602 		}
3603 		PSRCSTART(abracket);
3604 		PUTC('>');
3605 		PSRCSTOP(abracket);
3606 	    } else
3607 #endif
3608 	    if (context->current_tag->name)
3609 		start_element(context);
3610 	    context->state = S_text;
3611 	    break;
3612 	} else if (c == '=') {
3613 #ifdef USE_PRETTYSRC
3614 	    if (psrc_view) {
3615 		PUTC('=');
3616 		if (context->current_attribute_number == INVALID) {
3617 		    PSRCSTOP(badattr);
3618 		} else {
3619 		    PSRCSTOP(attrib);
3620 		}
3621 	    }
3622 #endif
3623 	    context->state = S_equals;
3624 	    break;
3625 	}
3626 	HTChunkPutc(string, c);
3627 	context->state = S_attr;	/* Get next attribute */
3628 	break;
3629 
3630     case S_equals:		/* After attr = */
3631 	if (WHITE(c)) {
3632 	    PRETTYSRC_PUTC(c);
3633 	    break;		/* Before attribute value */
3634 	}
3635 	if (c == '>') {		/* End of tag */
3636 	    CTRACE((tfp, "SGML: found = but no value\n"));
3637 #ifdef USE_PRETTYSRC
3638 	    if (psrc_view) {
3639 		PSRCSTART(abracket);
3640 		PUTC('>');
3641 		PSRCSTOP(abracket);
3642 	    } else
3643 #endif
3644 	    if (context->current_tag->name)
3645 		start_element(context);
3646 	    context->state = S_text;
3647 	    break;
3648 
3649 	} else if (c == '\'') {
3650 #ifdef USE_PRETTYSRC
3651 	    if (psrc_view) {
3652 		PSRCSTART(attrval);
3653 		PUTC(c);
3654 	    }
3655 #endif
3656 	    context->state = S_squoted;
3657 	    break;
3658 
3659 	} else if (c == '"') {
3660 #ifdef USE_PRETTYSRC
3661 	    if (psrc_view) {
3662 		PSRCSTART(attrval);
3663 		PUTC(c);
3664 	    }
3665 #endif
3666 	    context->state = S_dquoted;
3667 	    break;
3668 	}
3669 #ifdef USE_PRETTYSRC
3670 	if (psrc_view)
3671 	    PSRCSTART(attrval);
3672 #endif
3673 	context->state = S_value;
3674 	/*  no break!  fall through to S_value and process current `c`   */
3675 
3676     case S_value:
3677 	if (WHITE(c) || (c == '>')) {	/* End of word */
3678 	    HTChunkTerminate(string);
3679 #ifdef USE_PRETTYSRC
3680 	    if (!end_if_prettysrc(context, string, 0))
3681 #endif
3682 	    {
3683 #ifdef CJK_EX			/* Quick hack. - JH7AYN */
3684 		if (IS_CJK_TTY) {
3685 		    if (string->data[0] == '$') {
3686 			if (string->data[1] == 'B' || string->data[1] == '@') {
3687 			    char *jis_buf = 0;
3688 
3689 			    HTSprintf0(&jis_buf, "\033%s", string->data);
3690 			    TO_EUC((const unsigned char *) jis_buf,
3691 				   (unsigned char *) string->data);
3692 			    FREE(jis_buf);
3693 			}
3694 		    }
3695 		}
3696 #endif
3697 		handle_attribute_value(context, string->data);
3698 	    }
3699 	    string->size = 0;
3700 	    if (c == '>') {	/* End of tag */
3701 #ifdef USE_PRETTYSRC
3702 		if (psrc_view) {
3703 		    PSRCSTART(abracket);
3704 		    PUTC('>');
3705 		    PSRCSTOP(abracket);
3706 		} else
3707 #endif
3708 		if (context->current_tag->name)
3709 		    start_element(context);
3710 		context->state = S_text;
3711 		break;
3712 	    } else
3713 		context->state = S_tag_gap;
3714 	} else if (context->T.decode_utf8 &&
3715 		   *context->utf_buf) {
3716 	    HTChunkPuts(string, context->utf_buf);
3717 	    context->utf_buf_p = context->utf_buf;
3718 	    *(context->utf_buf_p) = '\0';
3719 	} else if (!IS_CJK_TTY &&
3720 		   (context->T.output_utf8 ||
3721 		    context->T.trans_from_uni)) {
3722 	    if (clong == 0xfffd && saved_char_in &&
3723 		HTPassEightBitRaw &&
3724 		saved_char_in >=
3725 		LYlowest_eightbit[context->outUCLYhndl]) {
3726 		HTChunkPutUtf8Char(string,
3727 				   (UCode_t) (0xf000 | saved_char_in));
3728 	    } else {
3729 		HTChunkPutUtf8Char(string, clong);
3730 	    }
3731 	} else if (saved_char_in && context->T.use_raw_char_in) {
3732 	    HTChunkPutc(string, saved_char_in);
3733 	} else {
3734 	    HTChunkPutc(string, c);
3735 	}
3736 	break;
3737 
3738     case S_squoted:		/* Quoted attribute value */
3739 	if (c == '\'') {	/* End of attribute value */
3740 	    HTChunkTerminate(string);
3741 #ifdef USE_PRETTYSRC
3742 	    if (!end_if_prettysrc(context, string, '\''))
3743 #endif
3744 		handle_attribute_value(context, string->data);
3745 	    string->size = 0;
3746 	    context->state = S_tag_gap;
3747 	} else if (TOASCII(c) == '\033') {	/* S/390 -- gil -- 1213 */
3748 	    /*
3749 	     * Setting up for possible single quotes in CJK escape sequences.
3750 	     * - Takuya ASADA (asada@three-a.co.jp)
3751 	     */
3752 	    context->state = S_esc_sq;
3753 	    HTChunkPutc(string, c);
3754 	} else if (context->T.decode_utf8 &&
3755 		   *context->utf_buf) {
3756 	    HTChunkPuts(string, context->utf_buf);
3757 	    context->utf_buf_p = context->utf_buf;
3758 	    *(context->utf_buf_p) = '\0';
3759 	} else if (!IS_CJK_TTY &&
3760 		   (context->T.output_utf8 ||
3761 		    context->T.trans_from_uni)) {
3762 	    if (clong == 0xfffd && saved_char_in &&
3763 		HTPassEightBitRaw &&
3764 		saved_char_in >=
3765 		LYlowest_eightbit[context->outUCLYhndl]) {
3766 		HTChunkPutUtf8Char(string,
3767 				   (UCode_t) (0xf000 | saved_char_in));
3768 	    } else {
3769 		HTChunkPutUtf8Char(string, clong);
3770 	    }
3771 	} else if (saved_char_in && context->T.use_raw_char_in) {
3772 	    HTChunkPutc(string, saved_char_in);
3773 	} else {
3774 	    HTChunkPutc(string, c);
3775 	}
3776 	break;
3777 
3778     case S_dquoted:		/* Quoted attribute value */
3779 	if (c == '"' ||		/* Valid end of attribute value */
3780 	    (soft_dquotes &&	/*  If emulating old Netscape bug, treat '>' */
3781 	     c == '>')) {	/*  as a co-terminator of dquoted and tag    */
3782 	    HTChunkTerminate(string);
3783 #ifdef USE_PRETTYSRC
3784 	    if (!end_if_prettysrc(context, string, (char) c))
3785 #endif
3786 		handle_attribute_value(context, string->data);
3787 	    string->size = 0;
3788 	    context->state = S_tag_gap;
3789 	    if (c == '>')	/* We emulated the Netscape bug, so we go  */
3790 		goto top1;	/* back and treat it as the tag terminator */
3791 	} else if (TOASCII(c) == '\033') {	/* S/390 -- gil -- 1230 */
3792 	    /*
3793 	     * Setting up for possible double quotes in CJK escape sequences.
3794 	     * - Takuya ASADA (asada@three-a.co.jp)
3795 	     */
3796 	    context->state = S_esc_dq;
3797 	    HTChunkPutc(string, c);
3798 	} else if (context->T.decode_utf8 &&
3799 		   *context->utf_buf) {
3800 	    HTChunkPuts(string, context->utf_buf);
3801 	    context->utf_buf_p = context->utf_buf;
3802 	    *(context->utf_buf_p) = '\0';
3803 	} else if (!IS_CJK_TTY &&
3804 		   (context->T.output_utf8 ||
3805 		    context->T.trans_from_uni)) {
3806 	    if (clong == 0xfffd && saved_char_in &&
3807 		HTPassEightBitRaw &&
3808 		saved_char_in >=
3809 		LYlowest_eightbit[context->outUCLYhndl]) {
3810 		HTChunkPutUtf8Char(string,
3811 				   (UCode_t) (0xf000 | saved_char_in));
3812 	    } else {
3813 		HTChunkPutUtf8Char(string, clong);
3814 	    }
3815 	} else if (saved_char_in && context->T.use_raw_char_in) {
3816 	    HTChunkPutc(string, saved_char_in);
3817 	} else {
3818 	    HTChunkPutc(string, c);
3819 	}
3820 	break;
3821 
3822     case S_end:		/* </ */
3823 	if (TOASCII(unsign_c) < 127 && (string->size ?	/* S/390 -- gil -- 1247 */
3824 					IsNmChar(c) : IsNmStart(c))) {
3825 	    HTChunkPutc(string, c);
3826 	} else {		/* End of end tag name */
3827 	    HTTag *t = 0;
3828 
3829 #ifdef USE_PRETTYSRC
3830 	    BOOL psrc_tagname_processed = FALSE;
3831 #endif
3832 
3833 	    HTChunkTerminate(string);
3834 	    if (!*string->data) {	/* Empty end tag */
3835 		if (context->element_stack)
3836 		    t = context->element_stack->tag;
3837 	    } else {
3838 		t = SGMLFindTag(dtd, string->data);
3839 	    }
3840 	    if (!t || t == context->unknown_tag) {
3841 		CTRACE((tfp, "Unknown end tag </%s>\n", string->data));
3842 #ifdef USE_PRETTYSRC
3843 		if (psrc_view) {
3844 		    PSRCSTART(abracket);
3845 		    PUTS("</");
3846 		    PSRCSTOP(abracket);
3847 		    PSRCSTART(badtag);
3848 		    transform_tag(context, string);
3849 		    PUTS(string->data);
3850 		    if (c != '>') {
3851 			PUTC(c);
3852 		    } else {
3853 			PSRCSTOP(badtag);
3854 			PSRCSTART(abracket);
3855 			PUTC('>');
3856 			PSRCSTOP(abracket);
3857 		    }
3858 		    psrc_tagname_processed = TRUE;
3859 		}
3860 	    } else if (psrc_view) {
3861 #endif
3862 	    } else {
3863 		BOOL tag_OK = (BOOL) (c == '>' || WHITE(c));
3864 		HTMLElement e = TAGNUM_OF_TAGP(t);
3865 		int branch = 2;	/* it can be 0,1,2 */
3866 
3867 		context->current_tag = t;
3868 		if (HAS_ALT_TAGNUM(TAGNUM_OF_TAGP(t)) &&
3869 		    context->element_stack &&
3870 		    ALT_TAGP(t) == context->element_stack->tag)
3871 		    context->element_stack->tag = NORMAL_TAGP(context->element_stack->tag);
3872 
3873 		if (tag_OK && Old_DTD) {
3874 		    switch (e) {
3875 		    case HTML_DD:
3876 		    case HTML_DT:
3877 		    case HTML_LI:
3878 		    case HTML_LH:
3879 		    case HTML_TD:
3880 		    case HTML_TH:
3881 		    case HTML_TR:
3882 		    case HTML_THEAD:
3883 		    case HTML_TFOOT:
3884 		    case HTML_TBODY:
3885 		    case HTML_COLGROUP:
3886 			branch = 0;
3887 			break;
3888 
3889 		    case HTML_A:
3890 		    case HTML_B:
3891 		    case HTML_BLINK:
3892 		    case HTML_CITE:
3893 		    case HTML_EM:
3894 		    case HTML_FONT:
3895 		    case HTML_FORM:
3896 		    case HTML_I:
3897 		    case HTML_P:
3898 		    case HTML_STRONG:
3899 		    case HTML_TT:
3900 		    case HTML_U:
3901 			branch = 1;
3902 			break;
3903 		    default:
3904 			break;
3905 		    }
3906 		}
3907 
3908 		/*
3909 		 * Just handle ALL end tags normally :-) - kw
3910 		 */
3911 		if (!Old_DTD) {
3912 		    end_element(context, context->current_tag);
3913 		} else if (tag_OK && (branch == 0)) {
3914 		    /*
3915 		     * Don't treat these end tags as invalid, nor act on them.
3916 		     * - FM
3917 		     */
3918 		    CTRACE((tfp, "SGML: `</%s%c' found!  Ignoring it.\n",
3919 			    string->data, c));
3920 		    string->size = 0;
3921 		    context->current_attribute_number = INVALID;
3922 		    if (c != '>') {
3923 			context->state = S_junk_tag;
3924 		    } else {
3925 			context->current_tag = NULL;
3926 			context->state = S_text;
3927 		    }
3928 		    break;
3929 		} else if (tag_OK && (branch == 1)) {
3930 		    /*
3931 		     * Handle end tags for container elements declared as
3932 		     * SGML_EMPTY to prevent "expected tag substitution" but
3933 		     * still processed via HTML_end_element() in HTML.c with
3934 		     * checks there to avoid throwing the HTML.c stack out of
3935 		     * whack (Ugh, what a hack!  8-).  - FM
3936 		     */
3937 		    if (context->inSELECT) {
3938 			/*
3939 			 * We are in a SELECT block.  - FM
3940 			 */
3941 			if (strcasecomp(string->data, "FORM")) {
3942 			    /*
3943 			     * It is not a FORM end tag, so ignore it.  - FM
3944 			     */
3945 			    CTRACE((tfp,
3946 				    "SGML: ***Ignoring end tag </%s> in SELECT block.\n",
3947 				    string->data));
3948 			} else {
3949 			    /*
3950 			     * End the SELECT block and then handle the FORM
3951 			     * end tag.  - FM
3952 			     */
3953 			    CTRACE((tfp,
3954 				    "SGML: ***Faking SELECT end tag before </%s> end tag.\n",
3955 				    string->data));
3956 			    end_element(context,
3957 					SGMLFindTag(context->dtd, "SELECT"));
3958 			    CTRACE((tfp, "SGML: End </%s>\n", string->data));
3959 
3960 #ifdef USE_PRETTYSRC
3961 			    if (!psrc_view)	/* Don't actually call if viewing psrc - kw */
3962 #endif
3963 				(*context->actions->end_element)
3964 				    (context->target,
3965 				     (int) TAGNUM_OF_TAGP(context->current_tag),
3966 				     &context->include);
3967 			}
3968 		    } else if (!strcasecomp(string->data, "P")) {
3969 			/*
3970 			 * Treat a P end tag like a P start tag (Ugh, what a
3971 			 * hack!  8-).  - FM
3972 			 */
3973 			CTRACE((tfp,
3974 				"SGML: `</%s%c' found!  Treating as '<%s%c'.\n",
3975 				string->data, c, string->data, c));
3976 			{
3977 			    int i;
3978 
3979 			    for (i = 0;
3980 				 i < context->current_tag->number_of_attributes;
3981 				 i++) {
3982 				context->present[i] = NO;
3983 			    }
3984 			}
3985 			if (context->current_tag->name)
3986 			    start_element(context);
3987 		    } else {
3988 			CTRACE((tfp, "SGML: End </%s>\n", string->data));
3989 
3990 #ifdef USE_PRETTYSRC
3991 			if (!psrc_view)		/* Don't actually call if viewing psrc - kw */
3992 #endif
3993 			    (*context->actions->end_element)
3994 				(context->target,
3995 				 (int) TAGNUM_OF_TAGP(context->current_tag),
3996 				 &context->include);
3997 		    }
3998 		    string->size = 0;
3999 		    context->current_attribute_number = INVALID;
4000 		    if (c != '>') {
4001 			context->state = S_junk_tag;
4002 		    } else {
4003 			context->current_tag = NULL;
4004 			context->state = S_text;
4005 		    }
4006 		    break;
4007 		} else {
4008 		    /*
4009 		     * Handle all other end tags normally.  - FM
4010 		     */
4011 		    end_element(context, context->current_tag);
4012 		}
4013 	    }
4014 
4015 #ifdef USE_PRETTYSRC
4016 	    if (psrc_view && !psrc_tagname_processed) {
4017 		PSRCSTART(abracket);
4018 		PUTS("</");
4019 		PSRCSTOP(abracket);
4020 		PSRCSTART(tag);
4021 		if (tagname_transform != 1) {
4022 		    if (tagname_transform == 0)
4023 			LYLowerCase(string->data);
4024 		    else
4025 			LYUpperCase(string->data);
4026 		}
4027 		PUTS(string->data);
4028 		PSRCSTOP(tag);
4029 		if (c != '>') {
4030 		    PSRCSTART(badtag);
4031 		    PUTC(c);
4032 		} else {
4033 		    PSRCSTART(abracket);
4034 		    PUTC('>');
4035 		    PSRCSTOP(abracket);
4036 		}
4037 	    }
4038 #endif
4039 
4040 	    string->size = 0;
4041 	    context->current_attribute_number = INVALID;
4042 	    if (c != '>') {
4043 		if (!WHITE(c))
4044 		    CTRACE((tfp, "SGML: `</%s%c' found!\n", string->data, c));
4045 		context->state = S_junk_tag;
4046 	    } else {
4047 		context->current_tag = NULL;
4048 		context->state = S_text;
4049 	    }
4050 	}
4051 	break;
4052 
4053     case S_esc:		/* Expecting '$'or '(' following CJK ESC. */
4054 	if (c == '$') {
4055 	    context->state = S_dollar;
4056 	} else if (c == '(') {
4057 	    context->state = S_paren;
4058 	} else {
4059 	    context->state = S_text;
4060 	}
4061 	PUTC(c);
4062 	break;
4063 
4064     case S_dollar:		/* Expecting '@', 'B', 'A' or '(' after CJK "ESC$". */
4065 	if (c == '@' || c == 'B' || c == 'A') {
4066 	    context->state = S_nonascii_text;
4067 	} else if (c == '(') {
4068 	    context->state = S_dollar_paren;
4069 	}
4070 	PUTC(c);
4071 	break;
4072 
4073     case S_dollar_paren:	/* Expecting 'C' after CJK "ESC$(". */
4074 	if (c == 'C') {
4075 	    context->state = S_nonascii_text;
4076 	} else {
4077 	    context->state = S_text;
4078 	}
4079 	PUTC(c);
4080 	break;
4081 
4082     case S_paren:		/* Expecting 'B', 'J', 'T' or 'I' after CJK "ESC(". */
4083 	if (c == 'B' || c == 'J' || c == 'T') {
4084 	    context->state = S_text;
4085 	} else if (c == 'I') {
4086 	    context->state = S_nonascii_text;
4087 	} else {
4088 	    context->state = S_text;
4089 	}
4090 	PUTC(c);
4091 	break;
4092 
4093     case S_nonascii_text:	/* Expecting CJK ESC after non-ASCII text. */
4094 	if (TOASCII(c) == '\033') {	/* S/390 -- gil -- 1264 */
4095 	    context->state = S_esc;
4096 	}
4097 	PUTC(c);
4098 	if (c < 32)
4099 	    context->state = S_text;
4100 	break;
4101 
4102     case S_esc_sq:		/* Expecting '$'or '(' following CJK ESC. */
4103 	if (c == '$') {
4104 	    context->state = S_dollar_sq;
4105 	} else if (c == '(') {
4106 	    context->state = S_paren_sq;
4107 	} else {
4108 	    context->state = S_squoted;
4109 	}
4110 	HTChunkPutc(string, c);
4111 	break;
4112 
4113     case S_dollar_sq:		/* Expecting '@', 'B', 'A' or '(' after CJK "ESC$". */
4114 	if (c == '@' || c == 'B' || c == 'A') {
4115 	    context->state = S_nonascii_text_sq;
4116 	} else if (c == '(') {
4117 	    context->state = S_dollar_paren_sq;
4118 	}
4119 	HTChunkPutc(string, c);
4120 	break;
4121 
4122     case S_dollar_paren_sq:	/* Expecting 'C' after CJK "ESC$(". */
4123 	if (c == 'C') {
4124 	    context->state = S_nonascii_text_sq;
4125 	} else {
4126 	    context->state = S_squoted;
4127 	}
4128 	HTChunkPutc(string, c);
4129 	break;
4130 
4131     case S_paren_sq:		/* Expecting 'B', 'J', 'T' or 'I' after CJK "ESC(". */
4132 	if (c == 'B' || c == 'J' || c == 'T') {
4133 	    context->state = S_squoted;
4134 	} else if (c == 'I') {
4135 	    context->state = S_nonascii_text_sq;
4136 	} else {
4137 	    context->state = S_squoted;
4138 	}
4139 	HTChunkPutc(string, c);
4140 	break;
4141 
4142     case S_nonascii_text_sq:	/* Expecting CJK ESC after non-ASCII text. */
4143 	if (TOASCII(c) == '\033') {	/* S/390 -- gil -- 1281 */
4144 	    context->state = S_esc_sq;
4145 	}
4146 	HTChunkPutc(string, c);
4147 	break;
4148 
4149     case S_esc_dq:		/* Expecting '$'or '(' following CJK ESC. */
4150 	if (c == '$') {
4151 	    context->state = S_dollar_dq;
4152 	} else if (c == '(') {
4153 	    context->state = S_paren_dq;
4154 	} else {
4155 	    context->state = S_dquoted;
4156 	}
4157 	HTChunkPutc(string, c);
4158 	break;
4159 
4160     case S_dollar_dq:		/* Expecting '@', 'B', 'A' or '(' after CJK "ESC$". */
4161 	if (c == '@' || c == 'B' || c == 'A') {
4162 	    context->state = S_nonascii_text_dq;
4163 	} else if (c == '(') {
4164 	    context->state = S_dollar_paren_dq;
4165 	}
4166 	HTChunkPutc(string, c);
4167 	break;
4168 
4169     case S_dollar_paren_dq:	/* Expecting 'C' after CJK "ESC$(". */
4170 	if (c == 'C') {
4171 	    context->state = S_nonascii_text_dq;
4172 	} else {
4173 	    context->state = S_dquoted;
4174 	}
4175 	HTChunkPutc(string, c);
4176 	break;
4177 
4178     case S_paren_dq:		/* Expecting 'B', 'J', 'T' or 'I' after CJK "ESC(". */
4179 	if (c == 'B' || c == 'J' || c == 'T') {
4180 	    context->state = S_dquoted;
4181 	} else if (c == 'I') {
4182 	    context->state = S_nonascii_text_dq;
4183 	} else {
4184 	    context->state = S_dquoted;
4185 	}
4186 	HTChunkPutc(string, c);
4187 	break;
4188 
4189     case S_nonascii_text_dq:	/* Expecting CJK ESC after non-ASCII text. */
4190 	if (TOASCII(c) == '\033') {	/* S/390 -- gil -- 1298 */
4191 	    context->state = S_esc_dq;
4192 	}
4193 	HTChunkPutc(string, c);
4194 	break;
4195 
4196     case S_junk_tag:
4197     case S_pi:
4198 	if (c == '>') {
4199 	    HTChunkTerminate(string);
4200 #ifdef USE_PRETTYSRC
4201 	    if (psrc_view) {
4202 		if (context->state == S_junk_tag) {
4203 		    PSRCSTOP(badtag);
4204 		}
4205 		PSRCSTART(abracket);
4206 		PUTC('>');
4207 		PSRCSTOP(abracket);
4208 	    }
4209 #endif
4210 	    if (context->state == S_pi)
4211 		handle_processing_instruction(context);
4212 	    string->size = 0;
4213 	    context->current_tag = NULL;
4214 	    context->state = S_text;
4215 	} else {
4216 	    HTChunkPutc(string, c);
4217 #ifdef USE_PRETTYSRC
4218 	    if (psrc_view) {
4219 		PUTC(c);
4220 	    }
4221 #endif
4222 	}
4223 
4224     }				/* switch on context->state */
4225     CTRACE2(TRACE_SGML, (tfp, "SGML after  %s|%.*s|%c|\n",
4226 			 state_name(context->state),
4227 			 string->size,
4228 			 NonNull(string->data),
4229 			 UCH(c)));
4230 
4231   after_switch:
4232     /*
4233      * Check whether an external function has added anything to the include
4234      * buffer.  If so, move the new stuff to the beginning of active_include.
4235      * - kw
4236      */
4237     if (context->include != NULL) {
4238 	if (context->include[0] == '\0') {
4239 	    FREE(context->include);
4240 	} else {
4241 	    if (context->active_include &&
4242 		context->active_include[context->include_index] != '\0')
4243 		StrAllocCat(context->include,
4244 			    context->active_include + context->include_index);
4245 	    FREE(context->active_include);
4246 	    context->active_include = context->include;
4247 	    context->include_index = 0;
4248 	    context->include = NULL;
4249 	}
4250     }
4251 
4252     /*
4253      * Check whether we've added anything to the recover buffer.  - FM
4254      */
4255     if (context->recover != NULL) {
4256 	if (context->recover[context->recover_index] == '\0') {
4257 	    FREE(context->recover);
4258 	    context->recover_index = 0;
4259 	} else {
4260 	    c = UCH(context->recover[context->recover_index]);
4261 	    context->recover_index++;
4262 	    goto top;
4263 	}
4264     }
4265 
4266     /*
4267      * Check whether an external function had added anything to the include
4268      * buffer; it should now be in active_include.  - FM / kw
4269      */
4270     if (context->active_include != NULL) {
4271 	if (context->active_include[context->include_index] == '\0') {
4272 	    FREE(context->active_include);
4273 	    context->include_index = 0;
4274 	} else {
4275 	    if (context->current_tag_charset == UTF8_handle ||
4276 		context->T.trans_from_uni) {
4277 		/*
4278 		 * If it looks like we would have fed UTF-8 to the next
4279 		 * processing stage, assume that whatever we were fed back is
4280 		 * in UTF-8 form, too.  This won't be always true for all uses
4281 		 * of the include buffer, but it's a start.  - kw
4282 		 */
4283 		char *puni = context->active_include + context->include_index;
4284 
4285 		c = UCH(*puni);
4286 		clong = UCGetUniFromUtf8String(&puni);
4287 		if (clong < 256 && clong >= 0) {
4288 		    c = UCH((clong & 0xff));
4289 		}
4290 		saved_char_in = '\0';
4291 		context->include_index = (int) (puni
4292 						- context->active_include
4293 						+ 1);
4294 		goto top1;
4295 	    } else {
4296 		/*
4297 		 * Otherwise assume no UTF-8 - do charset-naive processing and
4298 		 * hope for the best.  - kw
4299 		 */
4300 		c = UCH(context->active_include[context->include_index]);
4301 		context->include_index++;
4302 		goto top;
4303 	    }
4304 	}
4305     }
4306 
4307     /*
4308      * Check whether an external function has added anything to the csi buffer.
4309      * - FM
4310      */
4311     if (context->csi != NULL) {
4312 	if (context->csi[context->csi_index] == '\0') {
4313 	    FREE(context->csi);
4314 	    context->csi_index = 0;
4315 	} else {
4316 	    c = UCH(context->csi[context->csi_index]);
4317 	    context->csi_index++;
4318 	    goto top;
4319 	}
4320     }
4321 }				/* SGML_character */
4322 
/*
 * Apply the character set implied by a byte-order mark.
 *
 * The handle `chndl' is stored for the parser stage of the stream's anchor,
 * flagged as set by the parser, and change_chartrans_handling() is then
 * called so that the stream picks up the new setting.
 */
static void InferUtfFromBom(HTStream *context, int chndl)
{
    HTAnchor_setUCInfoStage(context->node_anchor,
			    chndl,
			    UCT_STAGE_PARSER,
			    UCT_SETBY_PARSER);

    change_chartrans_handling(context);
}
4330 
4331 /*
4332  * Avoid rewrite of SGML_character() to handle hypothetical case of UTF-16
4333  * webpages, by pretending that the data is UTF-8.
4334  */
SGML_widechar(HTStream * context,int ch)4335 static void SGML_widechar(HTStream *context, int ch)
4336 {
4337     if (!UCPutUtf8_charstring(context, SGML_character, (UCode_t) ch)) {
4338 	SGML_character(context, ch);
4339     }
4340 }
4341 
/*
 * Feed a counted block of input to the parser.
 *
 * On entry,
 *	context		is the parser stream,
 *	str		points to the first byte of the block,
 *	l		is the number of bytes in the block.
 *
 * While sgml_offset is still zero the block is checked for a leading
 * byte-order mark.  A UTF-8 BOM is skipped; a UCS-2 BOM is skipped and
 * context->T.ucs_mode is set to -1 (little-endian) or 1 (big-endian).  In
 * every case the charset is switched to UTF-8 via InferUtfFromBom().
 *
 * In either UCS-2 mode the bytes are combined pairwise and passed to
 * SGML_widechar(); otherwise each byte goes to SGML_character().
 *
 * A block that consists of nothing but the BOM is recognized as well.  In
 * the UCS-2 modes a dangling final byte (odd number of bytes left) is
 * dropped rather than paired with whatever follows the caller's buffer.
 */
static void SGML_write(HTStream *context, const char *str, int l)
{
    const char *p;
    const char *e = str + l;

    if (sgml_offset == 0) {
	if (l >= 3
	    && !MemCmp(str, "\357\273\277", 3)) {
	    CTRACE((tfp, "SGML_write found UTF-8 BOM\n"));
	    InferUtfFromBom(context, UTF8_handle);
	    str += 3;
	} else if (l >= 2) {
	    if (!MemCmp(str, "\377\376", 2)) {
		CTRACE((tfp, "SGML_write found UCS-2 LE BOM\n"));
		InferUtfFromBom(context, UTF8_handle);
		str += 2;
		context->T.ucs_mode = -1;
	    } else if (!MemCmp(str, "\376\377", 2)) {
		CTRACE((tfp, "SGML_write found UCS-2 BE BOM\n"));
		InferUtfFromBom(context, UTF8_handle);
		str += 2;
		context->T.ucs_mode = 1;
	    }
	}
    }
    switch (context->T.ucs_mode) {
    case -1:
	/* little-endian: second byte of each pair is the high byte */
	for (p = str; e - p >= 2; p += 2)
	    SGML_widechar(context, (UCH(p[1]) << 8) | UCH(p[0]));
	break;
    case 1:
	/* big-endian: first byte of each pair is the high byte */
	for (p = str; e - p >= 2; p += 2)
	    SGML_widechar(context, (UCH(p[0]) << 8) | UCH(p[1]));
	break;
    default:
	for (p = str; p < e; p++)
	    SGML_character(context, *p);
	break;
    }
}
4382 
/*
 * Feed a NUL-terminated string to the parser: the length is measured with
 * strlen() and the bytes are handed to SGML_write().
 */
static void SGML_string(HTStream *context, const char *str)
{
    int length = (int) strlen(str);

    SGML_write(context, str, length);
}
4387 
4388 /*_______________________________________________________________________
4389 */
4390 
/*	Structured Object Class
 *	-----------------------
 *
 *	Stream-class record for this parser; SGML_new() stores its address in
 *	context->isa.  The initializers are positional, so their order has to
 *	agree with the member order of HTStreamClass (declared elsewhere).
 */
const HTStreamClass SGMLParser =
{
    "SGMLParser",		/* presumably the class name */
    SGML_free,			/* defined elsewhere in this file */
    SGML_abort,			/* defined elsewhere in this file */
    SGML_character,		/* takes one character */
    SGML_string,		/* takes a NUL-terminated string */
    SGML_write,			/* takes a counted block of bytes */
};
4403 
4404 /*	Create SGML Engine
4405  *	------------------
4406  *
4407  * On entry,
4408  *	dtd		represents the DTD, along with
4409  *	actions		is the sink for the data as a set of routines.
4410  *
4411  */
4412 
SGML_new(const SGML_dtd * dtd,HTParentAnchor * anchor,HTStructured * target)4413 HTStream *SGML_new(const SGML_dtd * dtd,
4414 		   HTParentAnchor *anchor,
4415 		   HTStructured * target)
4416 {
4417     HTStream *context = typecalloc(struct _HTStream);
4418 
4419     if (!context)
4420 	outofmem(__FILE__, "SGML_begin");
4421 
4422     assert(context != NULL);
4423 
4424     context->isa = &SGMLParser;
4425     context->string = HTChunkCreate(128);	/* Grow by this much */
4426     context->dtd = dtd;
4427     context->target = target;
4428     context->actions = (const HTStructuredClass *) (((HTStream *) target)->isa);
4429     /* Ugh: no OO */
4430     context->unknown_tag = &HTTag_unrecognized;
4431     context->current_tag = context->slashedtag = NULL;
4432     context->state = S_text;
4433 #ifdef CALLERDATA
4434     context->callerData = (void *) callerData;
4435 #endif /* CALLERDATA */
4436 
4437     context->node_anchor = anchor;	/* Could be NULL? */
4438     context->utf_buf_p = context->utf_buf;
4439     UCTransParams_clear(&context->T);
4440     context->inUCLYhndl = HTAnchor_getUCLYhndl(anchor,
4441 					       UCT_STAGE_PARSER);
4442     if (context->inUCLYhndl < 0) {
4443 	HTAnchor_copyUCInfoStage(anchor,
4444 				 UCT_STAGE_PARSER,
4445 				 UCT_STAGE_MIME,
4446 				 -1);
4447 	context->inUCLYhndl = HTAnchor_getUCLYhndl(anchor,
4448 						   UCT_STAGE_PARSER);
4449     }
4450 #ifdef CAN_SWITCH_DISPLAY_CHARSET	/* Allow a switch to a more suitable display charset */
4451     else if (anchor->UCStages
4452 	     && anchor->UCStages->s[UCT_STAGE_PARSER].LYhndl >= 0
4453 	     && anchor->UCStages->s[UCT_STAGE_PARSER].LYhndl != current_char_set) {
4454 	int o = anchor->UCStages->s[UCT_STAGE_PARSER].LYhndl;
4455 
4456 	anchor->UCStages->s[UCT_STAGE_PARSER].LYhndl = -1;	/* Force reset */
4457 	HTAnchor_resetUCInfoStage(anchor, o, UCT_STAGE_PARSER,
4458 	/* Preserve change this: */
4459 				  anchor->UCStages->s[UCT_STAGE_PARSER].lock);
4460     }
4461 #endif
4462 
4463     context->inUCI = HTAnchor_getUCInfoStage(anchor,
4464 					     UCT_STAGE_PARSER);
4465     set_chartrans_handling(context, anchor, -1);
4466 
4467     context->recover = NULL;
4468     context->recover_index = 0;
4469     context->include = NULL;
4470     context->active_include = NULL;
4471     context->include_index = 0;
4472     context->url = NULL;
4473     context->csi = NULL;
4474     context->csi_index = 0;
4475 
4476 #ifdef USE_PRETTYSRC
4477     if (psrc_view) {
4478 	psrc_view = FALSE;
4479 	mark_htext_as_source = TRUE;
4480 	SGML_string(context,
4481 		    "<HTML><HEAD><TITLE>source</TITLE></HEAD><BODY><PRE>");
4482 	psrc_view = TRUE;
4483 	psrc_convert_string = FALSE;
4484 	sgml_in_psrc_was_initialized = TRUE;
4485     }
4486 #endif
4487 
4488     sgml_offset = 0;
4489     return context;
4490 }
4491 
4492 /*
4493  * Return the offset within the document where we're parsing.  This is used
4494  * to help identify anchors which shift around while reparsing.
4495  */
SGML_offset(void)4496 int SGML_offset(void)
4497 {
4498     int result = sgml_offset;
4499 
4500 #ifdef USE_PRETTYSRC
4501     result += psrc_view;
4502 #endif
4503     return result;
4504 }
4505 
4506 /*		Asian character conversion functions
4507  *		====================================
4508  *
4509  *	Added 24-Mar-96 by FM, based on:
4510  *
4511  ////////////////////////////////////////////////////////////////////////
4512 Copyright (c) 1993 Electrotechnical Laboratory (ETL)
4513 
4514 Permission to use, copy, modify, and distribute this material
4515 for any purpose and without fee is hereby granted, provided
4516 that the above copyright notice and this permission notice
4517 appear in all copies, and that the name of ETL not be
4518 used in advertising or publicity pertaining to this
4519 material without the specific, prior written permission
4520 of an authorized representative of ETL.
4521 ETL MAKES NO REPRESENTATIONS ABOUT THE ACCURACY OR SUITABILITY
4522 OF THIS MATERIAL FOR ANY PURPOSE.  IT IS PROVIDED "AS IS",
4523 WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES.
4524 /////////////////////////////////////////////////////////////////////////
4525 Content-Type:	program/C; charset=US-ASCII
4526 Program:	SJIS.c
4527 Author:		Yutaka Sato <ysato@etl.go.jp>
4528 Description:
4529 History:
4530 	930923	extracted from codeconv.c of cosmos
4531 ///////////////////////////////////////////////////////////////////////
4532 */
4533 
/*
 * Gate for the conversion branch in SJIS_TO_EUC(): while this is nonzero,
 * byte pairs accepted by IS_SJIS() are converted; with zero every byte
 * would be copied through unchanged.
 */
static int TREAT_SJIS = 1;
4535 
/*
 * Translate one two-byte input code through a fixed lookup table.
 *
 * On entry,
 *	IHI, ILO	are the two input bytes,
 *	OHI, OLO	point to where the two output bytes are stored.
 *
 * When IHI is 0x8E and ILO lies in 0xA1..0xDF, the output is the two bytes
 * of the table entry at index (ILO - 0xA1); the table has one entry for
 * each value in that range.  Any other input is stored unchanged (reduced
 * through UCH).  Judging by the function name this maps JIS X 0201 kana,
 * in EUC form, onto JIS X 0208 codes.
 */
void JISx0201TO0208_EUC(unsigned IHI,
			unsigned ILO,
			unsigned char *OHI,
			unsigned char *OLO)
{
    static const char *mapped[] =
    {
	"\241\243",		/* A1,A3 */
	"\241\326",		/* A1,D6 */
	"\241\327",		/* A1,D7 */
	"\241\242",		/* A1,A2 */
	"\241\246",		/* A1,A6 */
	"\245\362",		/* A5,F2 */
	"\245\241",		/* A5,A1 */
	"\245\243",		/* A5,A3 */
	"\245\245",		/* A5,A5 */
	"\245\247",		/* A5,A7 */
	"\245\251",		/* A5,A9 */
	"\245\343",		/* A5,E3 */
	"\245\345",		/* A5,E5 */
	"\245\347",		/* A5,E7 */
	"\245\303",		/* A5,C3 */
	"\241\274",		/* A1,BC */
	"\245\242",		/* A5,A2 */
	"\245\244",		/* A5,A4 */
	"\245\246",		/* A5,A6 */
	"\245\250",		/* A5,A8 */
	"\245\252",		/* A5,AA */
	"\245\253",		/* A5,AB */
	"\245\255",		/* A5,AD */
	"\245\257",		/* A5,AF */
	"\245\261",		/* A5,B1 */
	"\245\263",		/* A5,B3 */
	"\245\265",		/* A5,B5 */
	"\245\267",		/* A5,B7 */
	"\245\271",		/* A5,B9 */
	"\245\273",		/* A5,BB */
	"\245\275",		/* A5,BD */
	"\245\277",		/* A5,BF */
	"\245\301",		/* A5,C1 */
	"\245\304",		/* A5,C4 */
	"\245\306",		/* A5,C6 */
	"\245\310",		/* A5,C8 */
	"\245\312",		/* A5,CA */
	"\245\313",		/* A5,CB */
	"\245\314",		/* A5,CC */
	"\245\315",		/* A5,CD */
	"\245\316",		/* A5,CE */
	"\245\317",		/* A5,CF */
	"\245\322",		/* A5,D2 */
	"\245\325",		/* A5,D5 */
	"\245\330",		/* A5,D8 */
	"\245\333",		/* A5,DB */
	"\245\336",		/* A5,DE */
	"\245\337",		/* A5,DF */
	"\245\340",		/* A5,E0 */
	"\245\341",		/* A5,E1 */
	"\245\342",		/* A5,E2 */
	"\245\344",		/* A5,E4 */
	"\245\346",		/* A5,E6 */
	"\245\350",		/* A5,E8 */
	"\245\351",		/* A5,E9 */
	"\245\352",		/* A5,EA */
	"\245\353",		/* A5,EB */
	"\245\354",		/* A5,EC */
	"\245\355",		/* A5,ED */
	"\245\357",		/* A5,EF */
	"\245\363",		/* A5,F3 */
	"\241\253",		/* A1,AB */
	"\241\254"		/* A1,AC */
    };
    const char *pair;

    /* Anything outside the table's domain passes straight through. */
    if (IHI != 0x8E || ILO < 0xA1 || ILO > 0xDF) {
	*OHI = UCH(IHI);
	*OLO = UCH(ILO);
	return;
    }

    pair = mapped[ILO - 0xA1];
    *OHI = UCH(pair[0]);
    *OLO = UCH(pair[1]);
}
4616 
/*
 * Scan a NUL-terminated byte string and return 1 as soon as IS_SJIS()
 * accepts a byte with the high bit set together with the byte following
 * it; return 0 when the end of the string is reached without a match.
 *
 * `sjis_seen' starts at 0 and is handed to IS_SJIS() by name; IS_SJIS is
 * defined elsewhere and presumably may update it.
 */
static int IS_SJIS_STR(const unsigned char *str)
{
    const unsigned char *next = str;
    unsigned char lead;
    int sjis_seen = 0;

    for (;;) {
	lead = *next++;
	if (lead == '\0')
	    break;
	if ((lead & 0x80) == 0)
	    continue;		/* 7-bit bytes are never tested */
	if (IS_SJIS(lead, *next, sjis_seen))
	    return 1;
    }
    return 0;
}
4631 
/*
 * Convert one two-byte code (HI, LO) arithmetically and store the two
 * resulting bytes in JCODE[0] and JCODE[1]; by its name, from Shift_JIS to
 * JIS.  Returns JCODE.
 *
 * Every intermediate value is reduced through UCH, as are the stored
 * results.
 */
unsigned char *SJIS_TO_JIS1(unsigned HI,
			    unsigned LO,
			    unsigned char *JCODE)
{
    unsigned row;
    unsigned col = LO;

    /* The bias removed from the lead byte depends on which range it is in. */
    if (HI <= 0x9F)
	row = UCH(HI - (unsigned) UCH(0x71));
    else
	row = UCH(HI - (unsigned) UCH(0xB1));
    row = UCH((row << 1) + 1);

    if (col > 0x7F)
	col--;

    if (col < 0x9E) {
	col = UCH(col - UCH(0x1F));
    } else {
	/* upper half of the trail range belongs to the following row */
	col = UCH(col - UCH(0x7D));
	row++;
    }

    JCODE[0] = UCH(row);
    JCODE[1] = UCH(col);
    return JCODE;
}
4650 
/*
 * Convert one two-byte code (HI, LO) arithmetically and store the two
 * resulting bytes in SJCODE[0] and SJCODE[1]; by its name, from JIS to
 * Shift_JIS.  Returns SJCODE.
 *
 * The offset added to LO depends on whether HI is odd or even; results at
 * or above 0x7F are bumped by one.  The lead byte is derived from
 * (HI - 0x21) / 2 and moved up by 0x40 when it exceeds 0x9F.
 */
unsigned char *JIS_TO_SJIS1(unsigned HI,
			    unsigned LO,
			    unsigned char *SJCODE)
{
    unsigned lead;
    unsigned trail;

    trail = UCH(LO + UCH((HI & 1) ? 0x1F : 0x7D));
    if (trail >= 0x7F)
	trail++;

    lead = UCH(((HI - 0x21) >> 1) + 0x81);
    if (lead > 0x9F)
	lead = UCH(lead + UCH(0x40));

    SJCODE[0] = UCH(lead);
    SJCODE[1] = UCH(trail);
    return SJCODE;
}
4669 
/*
 * Convert one two-byte EUC code (HI, LO) and store the two resulting bytes
 * in SJCODE[0] and SJCODE[1].  Returns SJCODE.
 *
 * A code whose first byte is 0x8E is first passed through
 * JISx0201TO0208_EUC(), and the bytes that function produces are what get
 * converted, the same sequence JISx0201TO0208_SJIS() performs.  For any
 * other code, and for 0x8E codes that JISx0201TO0208_EUC() leaves
 * unchanged, the input bytes are converted as they are.
 *
 * In all cases the high bit of each byte is cleared before the pair is
 * handed to JIS_TO_SJIS1().
 */
unsigned char *EUC_TO_SJIS1(unsigned HI,
			    unsigned LO,
			    unsigned char *SJCODE)
{
    if (HI == 0x8E) {
	unsigned char mapped_hi = UCH(HI);
	unsigned char mapped_lo = UCH(LO);

	JISx0201TO0208_EUC(HI, LO, &mapped_hi, &mapped_lo);
	/* continue with the translated pair, not the original bytes */
	HI = mapped_hi;
	LO = mapped_lo;
    }
    JIS_TO_SJIS1(UCH(HI & 0x7F), UCH(LO & 0x7F), SJCODE);
    return SJCODE;
}
4685 
/*
 * Translate the single byte I, treated as the second byte of a 0x8E pair,
 * with JISx0201TO0208_EUC(), then convert the translated pair with
 * JIS_TO_SJIS1() after clearing the high bit of each byte.  The final two
 * bytes are stored through OHI and OLO.
 */
void JISx0201TO0208_SJIS(unsigned I,
			 unsigned char *OHI,
			 unsigned char *OLO)
{
    unsigned char sjis_pair[2];
    unsigned jis_hi;
    unsigned jis_lo;

    /* OHI/OLO hold the intermediate result before being overwritten. */
    JISx0201TO0208_EUC(0x8E, I, OHI, OLO);

    jis_hi = UCH(*OHI & 0x7F);
    jis_lo = UCH(*OLO & 0x7F);
    JIS_TO_SJIS1(jis_hi, jis_lo, sjis_pair);

    *OHI = sjis_pair[0];
    *OLO = sjis_pair[1];
}
4697 
/*
 * Convert one two-byte code with SJIS_TO_JIS1() into data[0..1] and then
 * set the high bit of both result bytes.  Returns data.
 */
unsigned char *SJIS_TO_EUC1(unsigned HI,
			    unsigned LO,
			    unsigned char *data)
{
    (void) SJIS_TO_JIS1(HI, LO, data);

    data[0] = (unsigned char) (data[0] | 0x80);
    data[1] = (unsigned char) (data[1] | 0x80);

    return data;
}
4707 
SJIS_TO_EUC(unsigned char * src,unsigned char * dst)4708 unsigned char *SJIS_TO_EUC(unsigned char *src,
4709 			   unsigned char *dst)
4710 {
4711     unsigned char hi, lo, *sp, *dp;
4712     int in_sjis = 0;
4713 
4714     in_sjis = IS_SJIS_STR(src);
4715     for (sp = src, dp = dst; (hi = sp[0]) != '\0';) {
4716 	lo = sp[1];
4717 	if (TREAT_SJIS && IS_SJIS(hi, lo, in_sjis)) {
4718 	    SJIS_TO_JIS1(hi, lo, dp);
4719 	    dp[0] |= 0x80;
4720 	    dp[1] |= 0x80;
4721 	    dp += 2;
4722 	    sp += 2;
4723 	} else
4724 	    *dp++ = *sp++;
4725     }
4726     *dp = 0;
4727     return dst;
4728 }
4729 
/*
 * Convert the NUL-terminated string src into dst and return dst.
 *
 * Bytes without the high bit are copied.  Two consecutive bytes that both
 * have the high bit set are converted as one pair with JIS_TO_SJIS1(),
 * after the high bits are cleared.  A high-bit byte that is not followed
 * by another high-bit byte is dropped.  The output is NUL-terminated.
 */
unsigned char *EUC_TO_SJIS(unsigned char *src,
			   unsigned char *dst)
{
    unsigned char *in = src;
    unsigned char *out = dst;

    while (*in != '\0') {
	if ((*in & 0x80) == 0) {
	    *out++ = *in++;
	} else if ((in[1] & 0x80) != 0) {
	    JIS_TO_SJIS1(UCH(in[0] & 0x7F), UCH(in[1] & 0x7F), out);
	    out += 2;
	    in += 2;
	} else {
	    in++;		/* unpaired 8-bit byte: skip it */
	}
    }
    *out = 0;
    return dst;
}
4751 
/*
 * Copy the NUL-terminated string b into the buffer a and yield a pointer
 * to the terminating NUL just written, so that copies can be chained.
 * The arguments may be char or unsigned char pointers, and may be
 * expressions; note that `a' is evaluated three times.
 */
#define Strcpy(a,b)	(strcpy((char*)(a),(const char*)(b)),&(a)[strlen((const char*)(a))])
4753 
/*
 * Convert the NUL-terminated string src into dst and return dst.
 *
 * On entry,
 *	toK	is the string emitted before a run of two-byte codes,
 *	toA	is the string emitted when such a run ends.
 *
 * 7-bit bytes are copied, preceded by toA if a two-byte run was open.
 * For a byte with the high bit set that passes the IS_EUC() test together
 * with its successor, toK is emitted if no run is open; if the successor
 * also has the high bit set, both bytes are written with that bit cleared.
 * A high-bit byte failing the IS_EUC() test is copied as is, except that
 * 0xA0 is dropped once at least one such byte has been copied.
 *
 * An open run is closed with toA at the end of input, and the output is
 * NUL-terminated (unless the output pointer is NULL).
 */
unsigned char *EUC_TO_JIS(unsigned char *src,
			  unsigned char *dst,
			  const char *toK,
			  const char *toA)
{
    unsigned char *in = src;
    unsigned char *out = dst;
    unsigned char cur;
    int in_kanji = 0;
    int stray_8bit = 0;

    for (;;) {
	cur = *in++;
	if (cur == '\0')
	    break;

	if ((cur & 0x80) == 0) {
	    if (in_kanji) {
		in_kanji = 0;
		out = Strcpy(out, toA);
	    }
	    *out++ = cur;
	    continue;
	}

	if (!IS_EUC(cur, *in)) {
	    if (!(cur == 0xA0 && stray_8bit)) {		/* ignore NBSP */
		stray_8bit++;
		*out++ = cur;
	    }
	    continue;
	}

	if (!in_kanji) {
	    in_kanji = 1;
	    out = Strcpy(out, toK);
	}
	if (*in & 0x80) {
	    *out++ = UCH(cur & ~0x80);
	    *out++ = UCH(*in++ & ~0x80);
	}
    }

    if (in_kanji)
	out = Strcpy(out, toA);

    if (out)
	*out = 0;
    return dst;
}
4797 
/* True when both c1 and c2 lie strictly between 0x20 and 0x7F. */
#define	IS_JIS7(c1,c2)	(0x20<(c1)&&(c1)<0x7F && 0x20<(c2)&&(c2)<0x7F)
/*
 * 'N' - 0x40 and 'O' - 0x40, i.e. 0x0E and 0x0F in ASCII (presumably the
 * Shift-Out and Shift-In control characters).
 */
#define SO		('N'-0x40)
#define SI		('O'-0x40)

/*
 * When nonzero, TO_EUC() attempts repairJIStoEUC() on input that looks like
 * a two-byte designation without its leading ESC.  Initialized to 0.
 */
static int repair_JIS = 0;
4803 
/*
 * Try to convert a run of byte pairs starting at src, writing to *dstp.
 *
 * Pairs are read until one of the following happens:
 *  - the pair is '(' followed by 'B' or 'J': *dstp is advanced past the
 *    bytes written so far and the address just after that pair is
 *    returned;
 *  - the pair fails IS_JIS7(): NULL is returned;
 *  - either byte of the pair is NUL: NULL is returned.
 * Every other pair is written with the high bit set on both bytes.
 *
 * On a NULL return *dstp is left unchanged, although bytes may already
 * have been stored beyond it.
 */
static const unsigned char *repairJIStoEUC(const unsigned char *src,
					   unsigned char **dstp)
{
    const unsigned char *in = src;
    unsigned char *out = *dstp;
    unsigned char first, second;

    for (;;) {
	first = in[0];
	if (first == '\0')
	    break;
	second = in[1];
	if (second == '\0')
	    break;
	in += 2;

	if (first == '(' && (second == 'B' || second == 'J')) {
	    *dstp = out;
	    return in;
	}
	if (!IS_JIS7(first, second))
	    return NULL;

	*out++ = UCH(0x80 | first);
	*out++ = UCH(0x80 | second);
    }
    return NULL;
}
4827 
/*
 * Convert the NUL-terminated string jis into euc and return euc.
 *
 * The input is scanned byte by byte.  Escape sequences recognized below
 * switch a two-byte mode on and off (jis_stat is 0x80 while it is on);
 * byte pairs accepted by IS_SJIS() are converted with SJIS_TO_EUC1();
 * while the two-byte mode is on, 7-bit pairs accepted by IS_JIS7() are
 * written with the high bit set.  Everything else is copied, apart from
 * the few bytes that are explicitly dropped.  The output is
 * NUL-terminated.
 */
unsigned char *TO_EUC(const unsigned char *jis,
		      unsigned char *euc)
{
    const unsigned char *s;
    unsigned char c, jis_stat;
    unsigned char *d;
    int to1B, to2B;
    int in_sjis = 0;
    static int nje;		/* call counter; only incremented here */
    int n8bits;			/* number of high-bit bytes seen so far */
    int is_JIS;			/* counts escape sequences and SJIS pairs seen */

    nje++;
    n8bits = 0;
    s = jis;
    d = euc;
    jis_stat = 0;
    to2B = TO_2BCODE;
    to1B = TO_1BCODE;
    in_sjis = IS_SJIS_STR(jis);
    is_JIS = 0;

    while ((c = *s++) != '\0') {
	if (c == 0x80)
	    continue;		/* ignore it */
	if (c == 0xA0 && is_JIS)
	    continue;		/* ignore Non-breaking space */

	/*
	 * A bare to2B character followed by 'B' or '@' outside two-byte mode
	 * (apparently a designation that lost its ESC): if enabled, let
	 * repairJIStoEUC() convert what follows and resume after it.
	 */
	if (c == to2B && jis_stat == 0 && repair_JIS) {
	    if (*s == 'B' || *s == '@') {
		const unsigned char *ts;

		if ((ts = repairJIStoEUC(s + 1, &d)) != NULL) {
		    s = ts;
		    continue;
		}
	    }
	}
	if (c == CH_ESC) {
	    if (*s == to2B) {
		/* ESC to2B 'B' / ESC to2B '@': enter two-byte mode */
		if ((s[1] == 'B') || (s[1] == '@')) {
		    jis_stat = 0x80;
		    s += 2;
		    is_JIS++;
		    continue;
		}
		jis_stat = 0;
	    } else if (*s == to1B) {
		/* ESC to1B 'B' / 'J' / 'H': leave two-byte mode */
		jis_stat = 0;
		if ((s[1] == 'B') || (s[1] == 'J') || (s[1] == 'H')) {
		    s += 2;
		    continue;
		}
	    } else if (*s == ',') {	/* MULE */
		jis_stat = 0;
	    }
	    /* an unrecognized sequence falls through; the ESC is copied below */
	}
	if (c & 0x80)
	    n8bits++;

	if (IS_SJIS(c, *s, in_sjis)) {
	    SJIS_TO_EUC1(c, *s, d);
	    d += 2;
	    s++;
	    is_JIS++;
	} else if (jis_stat) {
	    if (c <= 0x20 || 0x7F <= c) {
		*d++ = c;
		/* a newline ends two-byte mode */
		if (c == '\n')
		    jis_stat = 0;
	    } else {
		if (IS_JIS7(c, *s)) {
		    *d++ = jis_stat | c;
		    *d++ = jis_stat | *s++;
		} else
		    *d++ = c;
	    }
	} else {
	    /* SI and SO are dropped as long as no high-bit byte has been seen */
	    if (n8bits == 0 && (c == SI || c == SO)) {
	    } else {
		*d++ = c;
	    }
	}
    }
    *d = 0;
    return euc;
}
4915 
/* True for values at or below 0x20 and for 0x7F. */
#define non94(ch) ((ch) <= 0x20 || (ch) == 0x7F)

/*
 * Return 1 when every byte with the high bit set in the NUL-terminated
 * string euc is the first byte of a pair whose second byte also has the
 * high bit set, and neither byte, with that bit cleared, satisfies
 * non94().  Return 0 at the first pair that breaks this rule.  7-bit
 * bytes are skipped.
 */
static int is_EUC_JP(unsigned char *euc)
{
    const unsigned char *scan = euc;

    while (*scan != '\0') {
	int lead = *scan;
	int trail;

	if ((lead & 0x80) == 0) {
	    scan++;
	    continue;
	}

	trail = scan[1] & 0xFF;
	if ((trail & 0x80) == 0)
	    return 0;
	if (non94(lead & 0x7F) || non94(trail & 0x7F))
	    return 0;

	scan += 2;
    }
    return 1;
}
4939 
TO_SJIS(const unsigned char * arg,unsigned char * sjis)4940 void TO_SJIS(const unsigned char *arg,
4941 	     unsigned char *sjis)
4942 {
4943     unsigned char *euc;
4944 
4945     euc = typeMallocn(unsigned char, strlen((const char *) arg) + 1);
4946 
4947 #ifdef CJK_EX
4948     if (!euc)
4949 	outofmem(__FILE__, "TO_SJIS");
4950 #endif
4951     TO_EUC(arg, euc);
4952     if (is_EUC_JP(euc))
4953 	EUC_TO_SJIS(euc, sjis);
4954     else
4955 	strcpy((char *) sjis, (const char *) arg);
4956     free(euc);
4957 }
4958 
TO_JIS(const unsigned char * arg,unsigned char * jis)4959 void TO_JIS(const unsigned char *arg,
4960 	    unsigned char *jis)
4961 {
4962     unsigned char *euc;
4963 
4964     if (arg[0] == 0) {
4965 	jis[0] = 0;
4966 	return;
4967     }
4968     euc = typeMallocn(unsigned char, strlen((const char *)arg) + 1);
4969 #ifdef CJK_EX
4970     if (!euc)
4971 	outofmem(__FILE__, "TO_JIS");
4972 #endif
4973     TO_EUC(arg, euc);
4974     is_EUC_JP(euc);
4975     EUC_TO_JIS(euc, jis, TO_KANJI, TO_ASCII);
4976 
4977     free(euc);
4978 }
4979