1 /*
2 * $LynxId: SGML.c,v 1.148 2012/02/10 18:32:26 tom Exp $
3 *
4 * General SGML Parser code SGML.c
5 * ========================
6 *
7 * This module implements an HTStream object. To parse an
8 * SGML file, create this object which is a parser. The object
9 * is (currently) created by being passed a DTD structure,
10 * and a target HTStructured object at which to throw the parsed stuff.
11 *
12 * 6 Feb 93 Binary searches used. Interface modified.
13 */
14
15 #define HTSTREAM_INTERNAL 1
16
17 #include <HTUtils.h>
18
19 #include <SGML.h>
20 #include <HTMLDTD.h>
21 #include <HTAccess.h>
22 #include <HTCJK.h> /* FIXME: this doesn't belong in SGML.c */
23 #include <UCMap.h>
24 #include <UCDefs.h>
25 #include <UCAux.h>
26
27 #include <HTChunk.h>
28 #include <HTUtils.h>
29
30 #include <LYCharSets.h>
31 #include <LYCharVals.h> /* S/390 -- gil -- 0635 */
32 #include <LYGlobalDefs.h>
33 #include <LYStrings.h>
34 #include <LYLeaks.h>
35 #include <LYUtils.h>
36
37 #ifdef USE_COLOR_STYLE
38 # include <LYStyle.h>
39 #endif
40 #ifdef USE_PRETTYSRC
41 # include <LYPrettySrc.h>
42 #endif
43
44 #define AssumeCP1252(context) \
45 (((context)->inUCLYhndl == LATIN1 \
46 || (context)->inUCLYhndl == US_ASCII) \
47 && html5_charsets)
48
49 #define INVALID (-1)
50
51 static int sgml_offset;
52
53 #ifdef USE_PRETTYSRC
54
static char *entity_string;	/* this is used for printing entity name
				   (emitted by put_pretty_entity() and
				   put_pretty_number()).

				   Unconditionally added since redundant assignments don't hurt much */
58
/*
 * Character sink that discards its input.  handle_entity() passes it to
 * UCPutUtf8_charstring() in pretty-source view so that nothing is written
 * to the target by that call.
 */
static void fake_put_character(void *p GCC_UNUSED,
			       char c GCC_UNUSED)
{
    /* intentionally does nothing */
    return;
}
63
64 #define START TRUE
65 #define STOP FALSE
66
/*
 * Set psrc_convert_string to TRUE, then emit x with PUTS().  Written as one
 * comma expression so that the macro remains a single statement when used as
 * the body of an unbraced if/else or loop.
 */
#define PUTS_TR(x) (psrc_convert_string = TRUE, PUTS(x))
68
69 #endif
70
/* my_casecomp() - optimized by the first character, NOT_ASCII ok.
 *
 * Compares the first characters case-insensitively and only falls back to
 * AS_casecomp() when they match.  Arguments are parenthesized so that pointer
 * expressions such as (p + 1) are dereferenced correctly.  Each argument is
 * evaluated more than once, so do not pass expressions with side effects.
 */
#define my_casecomp(a,b)  ((TOUPPER(*(a)) == TOUPPER(*(b))) ? \
			AS_casecomp((a), (b)) : \
			(TOASCII(TOUPPER(*(a))) - TOASCII(TOUPPER(*(b)))))
75
76 /* will use partially inlined version */
77 #define orig_HTChunkPutUtf8Char HTChunkPutUtf8Char
78 #undef HTChunkPutUtf8Char
79
/* ...used for comments and attributes value like href...
 *
 * Fast path: a value below 128 is stored directly when the chunk still has
 * room; everything else goes through orig_HTChunkPutUtf8Char().
 * Both arguments are parenthesized so expression arguments expand correctly;
 * they are evaluated more than once, so avoid side effects in them.
 * The brace-block form is kept for compatibility with existing call sites.
 */
#define HTChunkPutUtf8Char(ch,x) \
    { \
    if ((TOASCII(x) < 128) && ((ch)->size < (ch)->allocated)) \
	(ch)->data[(ch)->size++] = (char)(x); \
    else \
	orig_HTChunkPutUtf8Char((ch),(x)); \
    }
88
/*
 * Output helpers.  All of them expect a variable named 'context' (the parser
 * state) to be in scope and forward to the target's action table.
 * Macro arguments are parenthesized so that expression arguments (for example
 * a conditional expression passed to PUTC) are cast and passed as a whole.
 */
#define PUTS(str) ((*context->actions->put_string)(context->target, (str)))
#define PUTC(ch)  ((*context->actions->put_character)(context->target, (char) (ch)))
#define PUTUTF8(code) (UCPutUtf8_charstring((HTStream *)context->target, \
		      (putc_func_t*)(context->actions->put_character), (code)))
93
/*
 * Emit c only when pretty-source view is active.  The do/while(0) wrapper
 * makes the macro a single statement, so an 'else' following a use of it
 * cannot bind to the macro's internal 'if'.
 */
#ifdef USE_PRETTYSRC
#define PRETTYSRC_PUTC(c) do { if (psrc_view) PUTC(c); } while (0)
#else
#define PRETTYSRC_PUTC(c)	/* nothing */
#endif
99
100 /*the following macros are used for pretty source view. */
101 #define IS_C(attr) (attr.type == HTMLA_CLASS)
102
HTCJKlang HTCJK = NOCJK;	/* CJK enum value.              */
BOOL HTPassEightBitRaw = FALSE;	/* Pass 161-172,174-255 raw.    */
BOOL HTPassEightBitNum = FALSE;	/* Pass ^ numeric entities raw. */
BOOL HTPassHighCtrlRaw = FALSE;	/* Pass 127-160,173 raw.        */
BOOL HTPassHighCtrlNum = FALSE;	/* Pass &#128;-&#159; raw.      */
108
109 /* The State (context) of the parser
110 *
111 * This is passed with each call to make the parser reentrant
112 *
113 */
114
115 #define MAX_ATTRIBUTES 36 /* Max number of attributes per element */
116
/*	Element Stack
 *	-------------
 *	This allows us to return down the stack reselecting styles.
 *	As we return, attribute values will be garbage in general.
 *
 *	Nodes form a singly linked list headed by the element_stack member
 *	of the parser context; they are obtained from pool_alloc() and
 *	released with pool_free().
 */
typedef struct _HTElement HTElement;
struct _HTElement {
    HTElement *next;		/* Previously nested element or 0 */
    HTTag *tag;			/* The tag at this level  */
};
127
/*
 * States of the SGML parser; the current one is kept in the 'state' member
 * of the parser context.  S_text is the initial value (0).  state_name()
 * maps each value to its identifier for trace output, so a state added here
 * should be added there as well.
 */
typedef enum {
    S_text = 0
    ,S_attr
    ,S_attr_gap
    ,S_comment
    ,S_cro
    ,S_doctype
    ,S_dollar
    ,S_dollar_dq
    ,S_dollar_paren
    ,S_dollar_paren_dq
    ,S_dollar_paren_sq
    ,S_dollar_sq
    ,S_dquoted
    ,S_end
    ,S_entity
    ,S_equals
    ,S_ero
    ,S_esc
    ,S_esc_dq
    ,S_esc_sq
    ,S_exclamation
    ,S_in_kanji
    ,S_incro
    ,S_junk_tag
    ,S_litteral
    ,S_marked
    ,S_nonascii_text
    ,S_nonascii_text_dq
    ,S_nonascii_text_sq
    ,S_paren
    ,S_paren_dq
    ,S_paren_sq
    ,S_pcdata
    ,S_pi
    ,S_script
    ,S_sgmlatt
    ,S_sgmlele
    ,S_sgmlent
    ,S_squoted
    ,S_tag
    ,S_tag_gap
    ,S_tagname_slash
    ,S_value
} sgml_state;
173
/*	Internal Context Data Structure
 *	-------------------------------
 *	One instance holds the complete state of one parser; it is the
 *	'context' argument passed to the static helpers in this file.
 */
struct _HTStream {

    const HTStreamClass *isa;	/* inherited from HTStream */

    const SGML_dtd *dtd;
    const HTStructuredClass *actions;	/* target class  */
    HTStructured *target;	/* target object */

    HTTag *current_tag;		/* tag whose attributes are being parsed */
    HTTag *slashedtag;
    const HTTag *unknown_tag;	/* attributes of this tag are ignored */
    BOOL extended_html;		/* xhtml; set when the DOCTYPE names "DTD XHTML " */
    BOOL strict_xml;		/* xml; set when a "?xml " processing instruction is seen */
    BOOL inSELECT;		/* cleared when a stacked SELECT is closed */
    BOOL no_lynx_specialcodes;	/* if set, put_special_unicodes() emits nothing */
    int current_attribute_number;	/* index into present[]/value[], or INVALID */
    HTChunk *string;		/* text of the construct handed to the handle_*() functions */
    int leading_spaces;
    int trailing_spaces;
    HTElement *element_stack;	/* innermost open element, or NULL */
    sgml_state state;
    unsigned char kanji_buf;
#ifdef CALLERDATA
    void *callerData;
#endif				/* CALLERDATA */
    BOOL present[MAX_ATTRIBUTES];	/* Flags: attribute is present? */
    char *value[MAX_ATTRIBUTES];	/* NULL, or strings alloc'd with StrAllocCopy_extra() */

    BOOL lead_exclamation;
    BOOL first_dash;
    BOOL end_comment;
    BOOL doctype_bracket;
    BOOL first_bracket;
    BOOL second_bracket;
    BOOL isHex;			/* put_pretty_number() prints "&#x" instead of "&#" when set */

    HTParentAnchor *node_anchor;
    LYUCcharset *inUCI;		/* pointer to anchor UCInfo */
    int inUCLYhndl;		/* charset we are fed       */
    LYUCcharset *outUCI;	/* anchor UCInfo for target */
    int outUCLYhndl;		/* charset for target       */
    char utf_count;
    UCode_t utf_char;
    char utf_buf[8];
    char *utf_buf_p;
    UCTransParams T;		/* in->out parameters, filled by UCSetTransParams() */
    int current_tag_charset;	/* charset to pass attributes */

    char *recover;
    int recover_index;
    char *include;		/* handle_marked() appends ![INCLUDE[ content here */
    char *active_include;
    int include_index;
    char *url;			/* passed to LYCheckForCSI()/LYDoCSI() by handle_comment() */
    char *csi;			/* filled by LYDoCSI() in handle_comment() */
    int csi_index;
#ifdef USE_PRETTYSRC
    BOOL cur_attr_is_href;
    BOOL cur_attr_is_name;
#endif
};
238
239 #ifdef NO_LYNX_TRACE
240 #define state_name(n) "state"
241 #else
state_name(sgml_state n)242 static const char *state_name(sgml_state n)
243 {
244 const char *result = "?";
245 /* *INDENT-OFF* */
246 switch (n) {
247 case S_attr: result = "S_attr"; break;
248 case S_attr_gap: result = "S_attr_gap"; break;
249 case S_comment: result = "S_comment"; break;
250 case S_cro: result = "S_cro"; break;
251 case S_doctype: result = "S_doctype"; break;
252 case S_dollar: result = "S_dollar"; break;
253 case S_dollar_dq: result = "S_dollar_dq"; break;
254 case S_dollar_paren: result = "S_dollar_paren"; break;
255 case S_dollar_paren_dq: result = "S_dollar_paren_dq"; break;
256 case S_dollar_paren_sq: result = "S_dollar_paren_sq"; break;
257 case S_dollar_sq: result = "S_dollar_sq"; break;
258 case S_dquoted: result = "S_dquoted"; break;
259 case S_end: result = "S_end"; break;
260 case S_entity: result = "S_entity"; break;
261 case S_equals: result = "S_equals"; break;
262 case S_ero: result = "S_ero"; break;
263 case S_esc: result = "S_esc"; break;
264 case S_esc_dq: result = "S_esc_dq"; break;
265 case S_esc_sq: result = "S_esc_sq"; break;
266 case S_exclamation: result = "S_exclamation"; break;
267 case S_in_kanji: result = "S_in_kanji"; break;
268 case S_incro: result = "S_incro"; break;
269 case S_pi: result = "S_pi"; break;
270 case S_junk_tag: result = "S_junk_tag"; break;
271 case S_litteral: result = "S_litteral"; break;
272 case S_marked: result = "S_marked"; break;
273 case S_nonascii_text: result = "S_nonascii_text"; break;
274 case S_nonascii_text_dq: result = "S_nonascii_text_dq"; break;
275 case S_nonascii_text_sq: result = "S_nonascii_text_sq"; break;
276 case S_paren: result = "S_paren"; break;
277 case S_paren_dq: result = "S_paren_dq"; break;
278 case S_paren_sq: result = "S_paren_sq"; break;
279 case S_pcdata: result = "S_pcdata"; break;
280 case S_script: result = "S_script"; break;
281 case S_sgmlatt: result = "S_sgmlatt"; break;
282 case S_sgmlele: result = "S_sgmlele"; break;
283 case S_sgmlent: result = "S_sgmlent"; break;
284 case S_squoted: result = "S_squoted"; break;
285 case S_tag: result = "S_tag"; break;
286 case S_tag_gap: result = "S_tag_gap"; break;
287 case S_tagname_slash: result = "S_tagname_slash"; break;
288 case S_text: result = "S_text"; break;
289 case S_value: result = "S_value"; break;
290 }
291 /* *INDENT-ON* */
292
293 return result;
294 }
295 #endif
296
/*
 * storage for Element Stack
 *
 * pool_alloc() hands out the entries of this static array for the first
 * DEPTH allocations and uses malloc() beyond that; 'depth' counts the
 * elements currently allocated and is shared by all parser instances.
 */
#define DEPTH 10
static HTElement pool[DEPTH];
static int depth = 0;
301
pool_alloc(void)302 static HTElement *pool_alloc(void)
303 {
304 depth++;
305 if (depth > DEPTH)
306 return (HTElement *) malloc(sizeof(HTElement));
307 return (pool + depth - 1);
308 }
309
/*
 * Release the most recently allocated element-stack node.
 *
 * Whether e is passed to FREE() is decided from the allocation count alone
 * (count above DEPTH), not from the pointer, so nodes must be released in
 * reverse order of allocation.  The count is decremented in every case.
 */
static void pool_free(HTElement * e)
{
    int from_heap = (depth > DEPTH);

    if (from_heap) {
	FREE(e);
    }
    --depth;
}
317
318 #ifdef USE_PRETTYSRC
319
/*
 * Apply the chain of tag specifications registered for a lexeme.
 *
 * context	parser state supplying the target and its action table.
 * lexeme	index into lexeme_start[] or lexeme_end[].
 * start	nonzero selects lexeme_start[], zero selects lexeme_end[].
 *
 * For every tagspec in the selected chain the target's start_element or
 * end_element action is called, depending on the tagspec's own 'start' flag.
 */
static void HTMLSRC_apply_markup(HTStream *context,
				 HTlexeme lexeme,
				 int start)
{
    HT_tagspec *spec;

    for (spec = (start ? lexeme_start : lexeme_end)[lexeme];
	 spec != NULL;
	 spec = spec->next) {
	if (spec->start) {
#ifdef USE_COLOR_STYLE
	    /* force style and class name for the element about to start */
	    current_tag_style = spec->style;
	    force_current_tag_style = TRUE;
	    forced_classname = spec->class_name;
	    force_classname = TRUE;
#endif
	    CTRACE((tfp, "SRCSTART %d\n", (int) lexeme));
	    (*context->actions->start_element) (context->target,
						(int) spec->element,
						spec->present,
						(STRING2PTR) spec->value,
						context->current_tag_charset,
						&context->include);
	} else {
	    CTRACE((tfp, "SRCSTOP %d\n", (int) lexeme));
	    (*context->actions->end_element) (context->target,
					      (int) spec->element,
					      &context->include);
	}
    }
}
350
351 #define PSRCSTART(x) HTMLSRC_apply_markup(context,HTL_##x,START)
352 #define PSRCSTOP(x) HTMLSRC_apply_markup(context,HTL_##x,STOP)
353
354 #define attr_is_href context->cur_attr_is_href
355 #define attr_is_name context->cur_attr_is_name
356 #endif
357
/*
 * Set up the character translation state of the parser.
 *
 * context	parser state; inUCLYhndl/inUCI must already be set.
 * anchor	anchor whose per-stage charset information is consulted.
 * chndl	charset handle for the target stage, or negative if none is
 *		known yet.
 *
 * With a negative chndl a default is looked up (STRUCTURED stage, then HTEXT
 * stage, then the current display character set), recorded on the anchor at
 * the default lock level, and the output charset fields of the context are
 * refreshed.  In all cases the translation parameters context->T and
 * context->current_tag_charset are then recomputed.
 */
static void set_chartrans_handling(HTStream *context,
				   HTParentAnchor *anchor,
				   int chndl)
{
    if (chndl < 0) {
	/*
	 * Nothing was set for the parser in earlier stages: fall back to the
	 * HTML parser's default, then the HText default, then the display
	 * character set.
	 */
	chndl = HTAnchor_getUCLYhndl(anchor, UCT_STAGE_STRUCTURED);
	if (chndl < 0) {
	    chndl = HTAnchor_getUCLYhndl(anchor, UCT_STAGE_HTEXT);
	    if (chndl < 0) {
		chndl = current_char_set;
	    }
	}
	/*
	 * Record it for the HText and HTML stages with the default lock level
	 * (a previous setting with a higher lock level is kept).
	 */
	HTAnchor_setUCInfoStage(anchor, chndl,
				UCT_STAGE_HTEXT,
				UCT_SETBY_DEFAULT);
	HTAnchor_setUCInfoStage(anchor, chndl,
				UCT_STAGE_STRUCTURED,
				UCT_SETBY_DEFAULT);
	/*
	 * Fetch the resulting chartrans info for output to the HTML parser.
	 */
	context->outUCI = HTAnchor_getUCInfoStage(anchor,
						  UCT_STAGE_STRUCTURED);
	context->outUCLYhndl = HTAnchor_getUCLYhndl(context->node_anchor,
						    UCT_STAGE_STRUCTURED);
    }

    /*
     * Set the in->out transformation parameters.
     */
    UCSetTransParams(&context->T,
		     context->inUCLYhndl, context->inUCI,
		     context->outUCLYhndl, context->outUCI);

    /*
     * Choose the charset that is passed along with each start tag's
     * attributes.
     */
#ifndef EXP_JAPANESEUTF8_SUPPORT
    if (IS_CJK_TTY) {
	context->current_tag_charset = -1;
	return;
    }
#endif
    if (context->T.transp ||
	context->T.decode_utf8 ||
	context->T.do_8bitraw ||
	context->T.use_raw_char_in) {
	/* attributes are handed over in the input charset */
	context->current_tag_charset = context->inUCLYhndl;
    } else if (context->T.output_utf8 ||
	       context->T.trans_from_uni) {
	context->current_tag_charset = UCGetLYhndl_byMIME("utf-8");
    } else {
	context->current_tag_charset = LATIN1;
    }
}
429
/*
 * Re-read the parser-stage charset from the anchor and, if it changed to a
 * known charset, switch the context's input and output charset information
 * over and recompute the translation setup.
 */
static void change_chartrans_handling(HTStream *context)
{
    int new_LYhndl = HTAnchor_getUCLYhndl(context->node_anchor,
					  UCT_STAGE_PARSER);
    LYUCcharset *new_UCI;
    LYUCcharset *next_UCI;
    int next_LYhndl;

    /* nothing to do when the handle is unset or unchanged */
    if (new_LYhndl < 0 || new_LYhndl == context->inUCLYhndl)
	return;

    /*
     * Something changed, but ignore it if a META wants an unknown charset.
     */
    new_UCI = HTAnchor_getUCInfoStage(context->node_anchor,
				      UCT_STAGE_PARSER);
    if (!new_UCI)
	return;

    next_UCI = HTAnchor_getUCInfoStage(context->node_anchor,
				       UCT_STAGE_STRUCTURED);
    next_LYhndl = HTAnchor_getUCLYhndl(context->node_anchor,
				       UCT_STAGE_STRUCTURED);

    context->inUCI = new_UCI;
    context->inUCLYhndl = new_LYhndl;
    context->outUCI = next_UCI;
    context->outUCLYhndl = next_LYhndl;
    set_chartrans_handling(context,
			   context->node_anchor, next_LYhndl);
}
457
458 #ifdef USE_COLOR_STYLE
459 #include <AttrList.h>
460 static int current_is_class = 0;
461 #endif
462
463 /* Handle Attribute
464 * ----------------
465 */
466 /* PUBLIC const char * SGML_default = ""; ?? */
467
/*
 * Look up attribute name s for the tag currently being parsed.
 *
 * The tag's attribute table is binary-searched with a case-insensitive
 * comparison.  On success context->current_attribute_number is set to the
 * attribute's index; outside pretty-source view the attribute is also marked
 * present and any previous value is cleared.  An unknown name sets
 * current_attribute_number to INVALID.  Attributes of the unknown tag are
 * ignored (pretty-source view only records a non-INVALID index for them).
 */
static void handle_attribute_name(HTStream *context, const char *s)
{
    HTTag *tag = context->current_tag;
    const attr *attributes = tag->attributes;
    int lo, hi;

#ifdef USE_PRETTYSRC
    if (psrc_view) {
	attr_is_href = FALSE;
	attr_is_name = FALSE;
    }
#endif
    /*
     * Ignore unknown tag.  - KW
     */
    if (tag == context->unknown_tag) {
#ifdef USE_PRETTYSRC
	if (psrc_view)
	    context->current_attribute_number = 1;	/* anything !=INVALID */
#endif
	return;
    }

    /*
     * Binary search over [lo, hi).
     */
    lo = 0;
    hi = tag->number_of_attributes;
    while (lo < hi) {
	int mid = lo + (hi - lo) / 2;
	int cmp = my_casecomp(attributes[mid].name, s);

	if (cmp < 0) {
	    lo = mid + 1;
	    continue;
	}
	if (cmp > 0) {
	    hi = mid;
	    continue;
	}

	/* exact match */
	context->current_attribute_number = mid;
#ifdef USE_PRETTYSRC
	if (psrc_view) {
	    attr_is_name = (BOOL) (attributes[mid].type == HTMLA_ANAME);
	    attr_is_href = (BOOL) (attributes[mid].type == HTMLA_HREF);
	    return;
	}
#endif
	context->present[mid] = YES;
	Clear_extra(context->value[mid]);
#ifdef USE_COLOR_STYLE
#  ifdef USE_PRETTYSRC
	current_is_class = IS_C(attributes[mid]);
#  else
	current_is_class = (!strcasecomp("class", s));
#  endif
	CTRACE((tfp, "SGML: found attribute %s, %d\n", s, current_is_class));
#endif
	return;
    }

    CTRACE((tfp, "SGML: Unknown attribute %s for tag %s\n",
	    s, NonNull(context->current_tag->name)));
    context->current_attribute_number = INVALID;	/* Invalid */
}
528
529 /* Handle attribute value
530 * ----------------------
531 */
handle_attribute_value(HTStream * context,const char * s)532 static void handle_attribute_value(HTStream *context, const char *s)
533 {
534 if (context->current_attribute_number != INVALID) {
535 StrAllocCopy_extra(context->value[context->current_attribute_number], s);
536 #ifdef USE_COLOR_STYLE
537 if (current_is_class) {
538 StrNCpy(class_string, s, TEMPSTRINGSIZE);
539 CTRACE((tfp, "SGML: class is '%s'\n", s));
540 } else {
541 CTRACE((tfp, "SGML: attribute value is '%s'\n", s));
542 }
543 #endif
544 } else {
545 CTRACE((tfp, "SGML: Attribute value %s ***ignored\n", s));
546 }
547 context->current_attribute_number = INVALID; /* can't have two assignments! */
548 }
549
550 /*
551 * Translate some Unicodes to Lynx special codes and output them.
552 * Special codes - ones those output depend on parsing.
553 *
554 * Additional issue, like handling bidirectional text if necessary
555 * may be called from here: zwnj (8204), zwj (8205), lrm (8206), rlm (8207)
556 * - currently they are ignored in SGML.c and LYCharUtils.c
557 * but also in UCdomap.c because they are non printable...
558 *
559 */
put_special_unicodes(HTStream * context,UCode_t code)560 static BOOL put_special_unicodes(HTStream *context, UCode_t code)
561 {
562 /* (Tgf_nolyspcl) */
563 if (context->no_lynx_specialcodes) {
564 /*
565 * We were asked by a "DTD" flag to not generate lynx specials. - kw
566 */
567 return NO;
568 }
569
570 if (code == CH_NBSP) { /* S/390 -- gil -- 0657 */
571 /*
572 * Use Lynx special character for nbsp.
573 */
574 #ifdef USE_PRETTYSRC
575 if (!psrc_view)
576 #endif
577 PUTC(HT_NON_BREAK_SPACE);
578 } else if (code == CH_SHY) {
579 /*
580 * Use Lynx special character for shy.
581 */
582 #ifdef USE_PRETTYSRC
583 if (!psrc_view)
584 #endif
585 PUTC(LY_SOFT_HYPHEN);
586 } else if (code == 8194 || code == 8201) {
587 /*
588 * Use Lynx special character for ensp or thinsp.
589 *
590 * Originally, Lynx use space '32' as word delimiter and omits this
591 * space at end of line if word is wrapped to the next line. There are
592 * several other spaces in the Unicode repertoire and we should teach
593 * Lynx to understand them, not only as regular characters but in the
594 * context of line wrapping. Unfortunately, if we use HT_EN_SPACE we
595 * override the chartrans tables for those spaces with a single '32'
596 * for all (but do line wrapping more fancy).
597 *
598 * We may treat emsp as one or two ensp (below).
599 */
600 #ifdef USE_PRETTYSRC
601 if (!psrc_view)
602 #endif
603 PUTC(HT_EN_SPACE);
604 } else if (code == 8195) {
605 /*
606 * Use Lynx special character for emsp.
607 */
608 #ifdef USE_PRETTYSRC
609 if (!psrc_view) {
610 #endif
611 /* PUTC(HT_EN_SPACE); let's stay with a single space :) */
612 PUTC(HT_EN_SPACE);
613 #ifdef USE_PRETTYSRC
614 }
615 #endif
616 } else {
617 /*
618 * Return NO if nothing done.
619 */
620 return NO;
621 }
622 /*
623 * We have handled it.
624 */
625 return YES;
626 }
627
628 #ifdef USE_PRETTYSRC
/*
 * Pretty-source output of a named entity: '&', the name held in
 * entity_string, and the terminator character if term is nonzero, all
 * wrapped in the markup registered for the "entity" lexeme.
 */
static void put_pretty_entity(HTStream *context, int term)
{
    PSRCSTART(entity);
    PUTC('&');
    PUTS(entity_string);
    if (term != 0) {
	PUTC((char) term);
    }
    PSRCSTOP(entity);
}
638
/*
 * Pretty-source output of a numeric character reference: "&#" (or "&#x" when
 * context->isHex is set), the digits held in entity_string, and ';', wrapped
 * in the markup registered for the "entity" lexeme.
 */
static void put_pretty_number(HTStream *context)
{
    PSRCSTART(entity);
    if (context->isHex) {
	PUTS("&#x");
    } else {
	PUTS("&#");
    }
    PUTS(entity_string);
    PUTC(';');
    PSRCSTOP(entity);
}
647 #endif /* USE_PRETTYSRC */
648
649 /* Handle entity
650 * -------------
651 *
652 * On entry,
653 * s contains the entity name zero terminated
654 * Bugs:
655 * If the entity name is unknown, the terminator is treated as
656 * a printable non-special character in all cases, even if it is '<'
657 * Bug-fix:
658 * Modified SGML_character() so we only come here with terminator
659 * as '\0' and check a FoundEntity flag. -- Foteos Macrides
660 *
661 * Modified more (for use with Lynx character translation code):
662 */
663 static char replace_buf[64]; /* buffer for replacement strings */
664 static BOOL FoundEntity = FALSE;
665
/*
 * Output the replacement for the named entity held in context->string.
 *
 * context	parser state; context->string->data is the zero-terminated
 *		entity name.
 * term		terminator character echoed after the name when the entity is
 *		written out literally ('\0' for none).
 *
 * The file-scope flag FoundEntity is set to TRUE when the entity was
 * recognized and dealt with, and is left FALSE when the name is unknown and
 * "&name" is written out as text.  In pretty-source view a recognized entity
 * is shown in its source form instead of being translated.
 *
 * Attempts, in order: Lynx special codes, a single-byte translation for the
 * output charset, a replacement string, UTF-8 output, plain ASCII; finally
 * zwnj/zwj/lrm/rlm are silently dropped.
 */
static void handle_entity(HTStream *context, int term)
{
    UCode_t code;
    long uck = -1;
    const char *s = context->string->data;

    /*
     * Handle all entities normally.  - FM
     */
    FoundEntity = FALSE;
    if ((code = HTMLGetEntityUCValue(s)) != 0) {
	/*
	 * We got a Unicode value for the entity name.  Check for special
	 * Unicodes.  - FM
	 */
	if (put_special_unicodes(context, code)) {
#ifdef USE_PRETTYSRC
	    if (psrc_view) {
		put_pretty_entity(context, term);
	    }
#endif
	    FoundEntity = TRUE;
	    return;
	}
	/*
	 * Seek a translation from the chartrans tables.
	 */
	if ((uck = UCTransUniChar(code, context->outUCLYhndl)) >= 32 &&
/* =============== work in ASCII below here ===============  S/390 -- gil -- 0672 */
	    uck < 256 &&
	    (uck < 127 ||
	     uck >= LYlowest_eightbit[context->outUCLYhndl])) {
#ifdef USE_PRETTYSRC
	    if (psrc_view) {
		put_pretty_entity(context, term);
	    } else
#endif
		PUTC(FROMASCII((char) uck));
	    FoundEntity = TRUE;
	    return;
	} else if ((uck == -4 ||
		    (context->T.repl_translated_C0 &&
		     uck > 0 && uck < 32)) &&
	    /*
	     * Not found; look for replacement string.  The assignment is
	     * parenthesized so that uck receives the function result rather
	     * than the result of the comparison.
	     */
		   (uck = UCTransUniCharStr(replace_buf, 60, code,
					    context->outUCLYhndl, 0)) >= 0) {
#ifdef USE_PRETTYSRC
	    if (psrc_view) {
		put_pretty_entity(context, term);
	    } else
#endif
		PUTS(replace_buf);
	    FoundEntity = TRUE;
	    return;
	}
	/*
	 * If we're displaying UTF-8, try that now.  - FM
	 */
#ifndef USE_PRETTYSRC
	if (context->T.output_utf8 && PUTUTF8(code)) {
	    FoundEntity = TRUE;
	    return;
	}
#else
	/*
	 * In pretty-source view the conversion is run against a sink that
	 * discards its output, and the entity is then shown in source form.
	 */
	if (context->T.output_utf8 && (psrc_view
				       ? (UCPutUtf8_charstring((HTStream *) context->target,
							       (putc_func_t *) (fake_put_character),
							       code))
				       : PUTUTF8(code))) {

	    if (psrc_view) {
		put_pretty_entity(context, term);
	    }

	    FoundEntity = TRUE;
	    return;
	}
#endif
	/*
	 * If it's safe ASCII, use it.  - FM
	 */
	if (code >= 32 && code < 127) {
#ifdef USE_PRETTYSRC
	    if (psrc_view) {
		put_pretty_entity(context, term);
	    } else
#endif
		PUTC(FROMASCII((char) code));
	    FoundEntity = TRUE;
	    return;
	}
/* =============== work in ASCII above here ===============  S/390 -- gil -- 0682 */
	/*
	 * Ignore zwnj (8204) and zwj (8205), if we get to here.  Note that
	 * zwnj may have been handled as <WBR> by the calling function.  - FM
	 *
	 * Likewise ignore lrm (8206) and rlm (8207).  - FM
	 */
	if (!strcmp(s, "zwnj") ||
	    !strcmp(s, "zwj") ||
	    !strcmp(s, "lrm") ||
	    !strcmp(s, "rlm")) {
	    CTRACE((tfp, "handle_entity: Ignoring '%s'.\n", s));
#ifdef USE_PRETTYSRC
	    if (psrc_view) {
		put_pretty_entity(context, term);
	    }
#endif
	    FoundEntity = TRUE;
	    return;
	}
    }

    /*
     * If entity string not found, display as text.
     */
#ifdef USE_PRETTYSRC
    if (psrc_view)
	PSRCSTART(badseq);
#endif
    CTRACE((tfp, "SGML: Unknown entity '%s' %" PRI_UCode_t " %ld\n", s, code, uck));	/* S/390 -- gil -- 0695 */
    PUTC('&');
    PUTS(s);
    if (term != '\0')
	PUTC(term);
#ifdef USE_PRETTYSRC
    if (psrc_view)
	PSRCSTOP(badseq);
#endif
}
809
810 /* Handle comment
811 * --------------
812 */
handle_comment(HTStream * context)813 static void handle_comment(HTStream *context)
814 {
815 const char *s = context->string->data;
816
817 CTRACE((tfp, "SGML Comment:\n<%s>\n", s));
818
819 if (context->csi == NULL &&
820 StrNCmp(s, "!--#", 4) == 0 &&
821 LYCheckForCSI(context->node_anchor, &context->url) == TRUE) {
822 LYDoCSI(context->url, s, &context->csi);
823 } else {
824 LYCommentHacks(context->node_anchor, context->string->data);
825 }
826
827 return;
828 }
829
830 /* Handle identifier
831 * -----------------
832 */
handle_identifier(HTStream * context)833 static void handle_identifier(HTStream *context)
834 {
835 const char *s = context->string->data;
836
837 CTRACE((tfp, "SGML Identifier:\n<%s>\n", s));
838
839 return;
840 }
841
842 /* Handle doctype
843 * --------------
844 */
handle_doctype(HTStream * context)845 static void handle_doctype(HTStream *context)
846 {
847 const char *s = context->string->data;
848
849 CTRACE((tfp, "SGML Doctype:\n<%s>\n", s));
850 if (strstr(s, "DTD XHTML ") != 0) {
851 CTRACE((tfp, "...processing extended HTML\n"));
852 context->extended_html = TRUE;
853 }
854
855 return;
856 }
857
858 /* Handle marked
859 * -------------
860 */
handle_marked(HTStream * context)861 static void handle_marked(HTStream *context)
862 {
863 const char *s = context->string->data;
864
865 CTRACE((tfp, "SGML Marked Section:\n<%s>\n", s));
866
867 if (!StrNCmp(context->string->data, "![INCLUDE[", 10)) {
868 context->string->data[context->string->size - 3] = '\0';
869 StrAllocCat(context->include, context->string->data + 10);
870 /* @@@ This needs to take charset into account! @@@
871 the wrong assumptions will be made about the data's
872 charset once it is in include - kw */
873
874 } else if (!StrNCmp(context->string->data, "![CDATA[", 8)) {
875 (*context->actions->put_block) (context->target,
876 context->string->data + 8,
877 context->string->size - 11);
878
879 }
880 return;
881 }
882
883 /* Handle processing instruction
884 * -----------------------------
885 */
handle_processing_instruction(HTStream * context)886 static void handle_processing_instruction(HTStream *context)
887 {
888 const char *s = context->string->data;
889
890 CTRACE((tfp, "SGML Processing instruction:\n<%s>\n", s));
891
892 if (!StrNCmp(s, "?xml ", 5)) {
893 int flag = context->T.decode_utf8;
894
895 context->strict_xml = TRUE;
896 /*
897 * Switch to UTF-8 if the encoding is explicitly "utf-8".
898 */
899 if (!flag) {
900 char *t = strstr(s, "encoding=");
901
902 if (t != 0) {
903 t += 9;
904 if (*t == '"')
905 ++t;
906 flag = !StrNCmp(t, "utf-8", 5);
907 }
908 if (flag) {
909 CTRACE((tfp, "...Use UTF-8 for XML\n"));
910 context->T.decode_utf8 = TRUE;
911 }
912 }
913 }
914
915 return;
916 }
917
918 /* Handle sgmlent
919 * --------------
920 */
handle_sgmlent(HTStream * context)921 static void handle_sgmlent(HTStream *context)
922 {
923 const char *s = context->string->data;
924
925 CTRACE((tfp, "SGML Entity Declaration:\n<%s>\n", s));
926
927 return;
928 }
929
930 /* Handle sgmlent
931 * --------------
932 */
handle_sgmlele(HTStream * context)933 static void handle_sgmlele(HTStream *context)
934 {
935 const char *s = context->string->data;
936
937 CTRACE((tfp, "SGML Element Declaration:\n<%s>\n", s));
938
939 return;
940 }
941
942 /* Handle sgmlatt
943 * --------------
944 */
handle_sgmlatt(HTStream * context)945 static void handle_sgmlatt(HTStream *context)
946 {
947 const char *s = context->string->data;
948
949 CTRACE((tfp, "SGML Attribute Declaration:\n<%s>\n", s));
950
951 return;
952 }
953
/*
 * Convenience macros - tags (elements) are identified sometimes by an int or
 * enum value ('TAGNUM'), sometimes by a pointer to HTTag ('TAGP').  - kw
 *
 * These macros expect a variable named 'context' to be in scope.  Arguments
 * and results are fully parenthesized so that the macros can be nested and
 * used with expression arguments.
 */
#define TAGNUM_OF_TAGP(t) ((HTMLElement) ((t) - context->dtd->tags))
#define TAGP_OF_TAGNUM(e) (context->dtd->tags + (e))

/*
 * The following implement special knowledge about OBJECT.  As long as
 * HTML_OBJECT is the only tag for which an alternative variant exist, they can
 * be simple macros.  - kw
 */
/* does 'TAGNUM' e have an alternative (variant) parsing mode? */
#define HAS_ALT_TAGNUM(e) ((e) == HTML_OBJECT)

/* return 'TAGNUM' of the alternative mode for 'TAGNUM' e, if any. */
#define ALT_TAGNUM(e) (((e) == HTML_OBJECT) ? HTML_ALT_OBJECT : (e))

/* return 'TAGNUM' of the normal mode for 'TAGNUM' e which may be alt. */
#define NORMAL_TAGNUM(e) (((int)(e) >= HTML_ELEMENTS) ? HTML_OBJECT : (HTMLElement)(e))

/* More convenience stuff. - kw */
#define ALT_TAGP_OF_TAGNUM(e) TAGP_OF_TAGNUM(ALT_TAGNUM(e))
#define NORMAL_TAGP_OF_TAGNUM(e) TAGP_OF_TAGNUM(NORMAL_TAGNUM(e))

#define ALT_TAGP(t) ALT_TAGP_OF_TAGNUM(TAGNUM_OF_TAGP(t))
#define NORMAL_TAGP(t) NORMAL_TAGP_OF_TAGNUM(TAGNUM_OF_TAGP(t))
981
element_valid_within(HTTag * new_tag,HTTag * stacked_tag,int direct)982 static BOOL element_valid_within(HTTag * new_tag, HTTag * stacked_tag, int direct)
983 {
984 BOOL result = YES;
985 TagClass usecontains, usecontained;
986
987 if (stacked_tag && new_tag) {
988 usecontains = (direct ? stacked_tag->contains : stacked_tag->icontains);
989 usecontained = (direct ? new_tag->contained : new_tag->icontained);
990 if (new_tag == stacked_tag) {
991 result = (BOOL) ((Tgc_same & usecontains) &&
992 (Tgc_same & usecontained));
993 } else {
994 result = (BOOL) ((new_tag->tagclass & usecontains) &&
995 (stacked_tag->tagclass & usecontained));
996 }
997 }
998 return result;
999 }
1000
/*
 * Result of can_close(): whether a new tag may close a stacked element.
 */
typedef enum {
    close_NO = 0,		/* the stacked element is not closed */
    close_error = 1,		/* closed because the new tag's canclose mask allows it */
    close_valid = 2		/* closed; the stacked element's end tag is omissible (Tgf_endO) */
} canclose_t;
1006
can_close(HTTag * new_tag,HTTag * stacked_tag)1007 static canclose_t can_close(HTTag * new_tag, HTTag * stacked_tag)
1008 {
1009 canclose_t result;
1010
1011 if (!stacked_tag) {
1012 result = close_NO;
1013 } else if (stacked_tag->flags & Tgf_endO) {
1014 result = close_valid;
1015 } else if (new_tag == stacked_tag) {
1016 result = ((Tgc_same & new_tag->canclose)
1017 ? close_error
1018 : close_NO);
1019 } else {
1020 result = ((stacked_tag->tagclass & new_tag->canclose)
1021 ? close_error
1022 : close_NO);
1023 }
1024 return result;
1025 }
1026
/*
 * Close the innermost open element: notify the target (skipped in
 * pretty-source view), pop the element from the stack, and refresh
 * context->no_lynx_specialcodes from the element that is now innermost.
 * Does nothing if the stack is empty.
 */
static void do_close_stacked(HTStream *context)
{
    HTElement *top = context->element_stack;
    HTMLElement e;

    if (top == NULL)
	return;			/* stack was empty */

    if (context->inSELECT && strcasecomp(top->tag->name, "SELECT") == 0) {
	context->inSELECT = FALSE;
    }

    /* an alternative-mode tag is reported under its normal tag number */
    e = NORMAL_TAGNUM(TAGNUM_OF_TAGP(top->tag));
#ifdef USE_PRETTYSRC
    if (!psrc_view)		/* Don't actually pass call on if viewing psrc - kw */
#endif
	(*context->actions->end_element) (context->target,
					  (int) e,
					  &context->include);

    context->element_stack = top->next;
    pool_free(top);

    if (context->element_stack) {
	context->no_lynx_specialcodes =
	    (BOOL) (context->element_stack->tag->flags & Tgf_nolyspcl);
    } else {
	context->no_lynx_specialcodes = (BOOL) (NO);
    }
}
1051
/*
 * Find old_tag (or its alternative-mode variant) on the element stack.
 *
 * Returns the 1-based position counted from the innermost element, or 0 if
 * the tag is not on the stack.
 */
static int is_on_stack(HTStream *context, HTTag * old_tag)
{
    HTElement *node;
    int position = 0;

    for (node = context->element_stack; node != NULL; node = node->next) {
	position++;
	if (node->tag == old_tag)
	    return position;
	if (node->tag == ALT_TAGP(old_tag))
	    return position;
    }
    return 0;
}
1064
/*	End element
 *	-----------
 *
 * Handle an end tag </old_tag> seen in the input.
 *
 * When Old_DTD is not set, a recovery pass first tries to close elements
 * that are still open above the matching start tag (as far as can_close()
 * allows), and drops end tags that have no matching open element.
 *
 * After that the common code runs: end tags other than </SELECT> are
 * ignored inside a SELECT block, end tags for SGML_EMPTY elements are
 * rejected, and otherwise the top stack entry is closed by calling the
 * target's end_element action.  Depending on the status returned by the
 * target, the element is reopened, switched to its alternate content
 * model, or popped from the stack.
 */
static void end_element(HTStream *context, HTTag * old_tag)
{
    BOOL extra_action_taken = NO;
    canclose_t canclose_check = close_valid;

    /* 1-based depth of old_tag on the stack (1 == top), 0 if not open. */
    int stackpos = is_on_stack(context, old_tag);

    if (!Old_DTD) {
	/*
	 * Keep closing the top element while can_close() permits it and
	 * either old_tag is open but not on top (stackpos > 1), or old_tag
	 * is not open at all and nothing has been closed yet.  In the
	 * latter case at most one element gets closed, since
	 * extra_action_taken is set after the first close.
	 */
	while (canclose_check != close_NO &&
	       context->element_stack &&
	       (stackpos > 1 || (!extra_action_taken && stackpos == 0))) {
	    /*
	     * No matching open element, the tag carries Tgf_startO, and
	     * element_valid_within() accepts it inside the current top
	     * element: silently drop the end tag.
	     */
	    if (stackpos == 0 && (old_tag->flags & Tgf_startO) &&
		element_valid_within(old_tag, context->element_stack->tag, YES)) {
		CTRACE((tfp, "SGML: </%s> ignored\n", old_tag->name));
		return;
	    }
	    canclose_check = can_close(old_tag, context->element_stack->tag);
	    if (canclose_check != close_NO) {
		/* close_valid is traced as "supplied", anything else as "forced". */
		CTRACE((tfp, "SGML: End </%s> \t<- %s end </%s>\n",
			context->element_stack->tag->name,
			((canclose_check == close_valid)
			 ? "supplied,"
			 : "***forced by"),
			old_tag->name));
		do_close_stacked(context);
		extra_action_taken = YES;
		/* The stack changed, so recompute where old_tag now sits. */
		stackpos = is_on_stack(context, old_tag);
	    }
	}

	/*
	 * Still no matching open element: ignore the end tag.  SGML_EMPTY
	 * tags are let through here; they are reported and rejected by the
	 * "Illegal end tag" check further down.
	 */
	if (stackpos == 0 && old_tag->contents != SGML_EMPTY) {
	    CTRACE((tfp, "SGML: Still open %s, ***no open %s for </%s>\n",
		    context->element_stack ?
		    context->element_stack->tag->name : "none",
		    old_tag->name,
		    old_tag->name));
	    return;
	}
	/*
	 * The matching element is open but could not be brought to the top
	 * of the stack: treat the end tag as invalid nesting and ignore it.
	 */
	if (stackpos > 1) {
	    CTRACE((tfp,
		    "SGML: Nesting <%s>...<%s> \t<- ***invalid end </%s>\n",
		    old_tag->name,
		    context->element_stack ?
		    context->element_stack->tag->name : "none",
		    old_tag->name));
	    return;
	}
    }
    /* Now let the non-extended code deal with the rest. - kw */

    /*
     * If we are in a SELECT block, ignore anything but a SELECT end tag. - FM
     */
    if (context->inSELECT) {
	if (!strcasecomp(old_tag->name, "SELECT")) {
	    /*
	     * Turn off the inSELECT flag and fall through. - FM
	     */
	    context->inSELECT = FALSE;
	} else {
	    /*
	     * Ignore the end tag. - FM
	     */
	    CTRACE((tfp, "SGML: ***Ignoring end tag </%s> in SELECT block.\n",
		    old_tag->name));
	    return;
	}
    }
    /*
     * Handle the end tag. - FM
     */
    CTRACE((tfp, "SGML: End </%s>\n", old_tag->name));
    if (old_tag->contents == SGML_EMPTY) {
	CTRACE((tfp, "SGML: ***Illegal end tag </%s> found.\n",
		old_tag->name));
	return;
    }
    /*
     * With WIND_DOWN_STACK the block below is a loop that keeps closing
     * stacked elements until the one matching old_tag has been closed;
     * without it the block runs at most once and always returns.
     */
#ifdef WIND_DOWN_STACK
    while (context->element_stack)	/* Loop is error path only */
#else
    if (context->element_stack)	/* Substitute and remove one stack element */
#endif /* WIND_DOWN_STACK */
    {
	/*
	 * status stays HT_OK when the target call is skipped (psrc view),
	 * which takes the plain "remove from stack" branch below.
	 */
	int status = HT_OK;
	HTMLElement e;
	HTElement *N = context->element_stack;

	/*
	 * t is the top-of-stack tag; if it is not literally old_tag it is
	 * mapped through NORMAL_TAGP() before being compared, so that an
	 * alternate-content-model entry can still match.
	 */
	HTTag *t = (N->tag != old_tag) ? NORMAL_TAGP(N->tag) : N->tag;

	if (old_tag != t) {	/* Mismatch: syntax error */
	    if (context->element_stack->next) {		/* This is not the last level */
		CTRACE((tfp,
			"SGML: Found </%s> when expecting </%s>. </%s> ***assumed.\n",
			old_tag->name, t->name, t->name));
	    } else {		/* last level */
		CTRACE((tfp,
			"SGML: Found </%s> when expecting </%s>. </%s> ***Ignored.\n",
			old_tag->name, t->name, old_tag->name));
		return;		/* Ignore */
	    }
	}

	e = NORMAL_TAGNUM(TAGNUM_OF_TAGP(t));
	CTRACE2(TRACE_SGML, (tfp, "tagnum(%p) = %d\n", (void *) t, (int) e));
#ifdef USE_PRETTYSRC
	if (!psrc_view)		/* Don't actually pass call on if viewing psrc - kw */
#endif
	    status = (*context->actions->end_element) (context->target,
						       (int) e,
						       &context->include);
	if (status == HT_PARSER_REOPEN_ELT) {
	    /*
	     * The target asked for the element to be restarted: issue a
	     * start_element with no attributes and keep the stack entry.
	     */
	    CTRACE((tfp, "SGML: Restart <%s>\n", t->name));
	    (*context->actions->start_element) (context->target,
						(int) e,
						NULL,
						NULL,
						context->current_tag_charset,
						&context->include);
	} else if (status == HT_PARSER_OTHER_CONTENT) {
	    /*
	     * Keep the stack entry but switch it to the element's alternate
	     * tag definition.
	     */
	    CTRACE((tfp, "SGML: Continue with other content model for <%s>\n", t->name));
	    context->element_stack->tag = ALT_TAGP_OF_TAGNUM(e);
	} else {
	    context->element_stack = N->next;	/* Remove from stack */
	    pool_free(N);
	}
	/* Re-derive the flag from whatever element is now on top. */
	context->no_lynx_specialcodes =
	    (BOOL) (context->element_stack
		    ? (context->element_stack->tag->flags & Tgf_nolyspcl)
		    : NO);
#ifdef WIND_DOWN_STACK
	if (old_tag == t)
	    return;		/* Correct sequence */
#else
	return;
#endif /* WIND_DOWN_STACK */

	/* Syntax error path only */

    }
    /* Reached when the stack is empty (or was wound down without a match). */
    CTRACE((tfp, "SGML: Extra end tag </%s> found and ignored.\n",
	    old_tag->name));
}
1208
/*	Start an element
 *	----------------
 *
 * Handle the start tag currently described by context->current_tag,
 * context->present[] and context->value[].
 *
 * When Old_DTD is not set, a recovery pass first closes open elements that
 * may not contain the new tag (as far as can_close() allows), may ignore
 * the start tag altogether, or may convert it into an end tag.
 *
 * The common code then applies the SELECT-block rules, calls the target's
 * start_element action, and pushes the tag on the element stack unless its
 * content model is SGML_EMPTY.  An empty META element triggers
 * change_chartrans_handling().
 */
static void start_element(HTStream *context)
{
    int status;
    HTTag *new_tag = context->current_tag;
    HTMLElement e = TAGNUM_OF_TAGP(new_tag);
    BOOL ok = FALSE;		/* set for form-related tags seen inside SELECT */

    BOOL valid = YES;		/* result of the latest element_valid_within() */
    BOOL direct_container = YES;	/* passed to element_valid_within(); cleared after a forced close */
    BOOL extra_action_taken = NO;	/* set once an open element was closed here */
    canclose_t canclose_check = close_valid;

    if (!Old_DTD) {
	/*
	 * While the new tag is not valid inside the current top element,
	 * ask can_close() whether that element may be closed and close it
	 * if so.  After a close_error result the loop only continues when
	 * the top element is the very same tag as the new one.  Note that
	 * `valid' is assigned as a side effect of the loop condition.
	 */
	while (context->element_stack &&
	       (canclose_check == close_valid ||
		(canclose_check == close_error &&
		 new_tag == context->element_stack->tag)) &&
	       !(valid = element_valid_within(new_tag,
					      context->element_stack->tag,
					      direct_container))) {
	    canclose_check = can_close(new_tag, context->element_stack->tag);
	    if (canclose_check != close_NO) {
		CTRACE((tfp, "SGML: End </%s> \t<- %s start <%s>\n",
			context->element_stack->tag->name,
			((canclose_check == close_valid)
			 ? "supplied,"
			 : "***forced by"),
			new_tag->name));
		do_close_stacked(context);
		extra_action_taken = YES;
		if (canclose_check == close_error)
		    direct_container = NO;
	    } else {
		CTRACE((tfp,
			"SGML: Still open %s \t<- ***invalid start <%s>\n",
			context->element_stack->tag->name,
			new_tag->name));
	    }
	}
	/*
	 * The top element is flagged Tgf_strict and the new tag is still
	 * not valid within it: drop the start tag completely.
	 */
	if (context->element_stack && !valid &&
	    (context->element_stack->tag->flags & Tgf_strict) &&
	    !(valid = element_valid_within(new_tag,
					   context->element_stack->tag,
					   direct_container))) {
	    CTRACE((tfp, "SGML: Still open %s \t<- ***ignoring start <%s>\n",
		    context->element_stack->tag->name,
		    new_tag->name));
	    return;
	}

	/*
	 * Nothing could be closed, the tag is invalid here, and it carries
	 * Tgf_mafse: if no attribute is present at all, treat the start tag
	 * as if it had been the corresponding end tag.
	 */
	if (context->element_stack &&
	    !extra_action_taken &&
	    (canclose_check == close_NO) &&
	    !valid && (new_tag->flags & Tgf_mafse)) {
	    BOOL has_attributes = NO;
	    int i = 0;

	    /* Stop scanning at the first attribute that is present. */
	    for (; i < new_tag->number_of_attributes && !has_attributes; i++)
		has_attributes = context->present[i];
	    if (!has_attributes) {
		CTRACE((tfp,
			"SGML: Still open %s, ***converting invalid <%s> to </%s>\n",
			context->element_stack->tag->name,
			new_tag->name,
			new_tag->name));
		end_element(context, new_tag);
		return;
	    }
	}

	/*
	 * After a forced close, only report (trace) that the start tag is
	 * still invalid; processing continues regardless.
	 */
	if (context->element_stack &&
	    (canclose_check == close_error) &&
	    !element_valid_within(new_tag,
				  context->element_stack->tag,
				  direct_container)) {
	    CTRACE((tfp, "SGML: Still open %s \t<- ***invalid start <%s>\n",
		    context->element_stack->tag->name,
		    new_tag->name));
	}
    }
    /* Fall through to the non-extended code - kw */

    /*
     * If we are not in a SELECT block, check if this is a SELECT start tag.
     * Otherwise (i.e., we are in a SELECT block) accept only OPTION as valid,
     * terminate the SELECT block if it is any other form-related element, and
     * otherwise ignore it. - FM
     */
    if (!context->inSELECT) {
	/*
	 * We are not in a SELECT block, so check if this starts one. - FM
	 * (frequent case!)
	 */
	/* my_casecomp() - optimized by the first character */
	if (!my_casecomp(new_tag->name, "SELECT")) {
	    /*
	     * Set the inSELECT flag and fall through. - FM
	     */
	    context->inSELECT = TRUE;
	}
    } else {
	/*
	 * We are in a SELECT block. - FM
	 */
	if (strcasecomp(new_tag->name, "OPTION")) {
	    /*
	     * Ugh, it is not an OPTION. - FM
	     */
	    switch (e) {
	    case HTML_INPUT:
	    case HTML_TEXTAREA:
	    case HTML_SELECT:
	    case HTML_BUTTON:
	    case HTML_FIELDSET:
	    case HTML_LABEL:
	    case HTML_LEGEND:
	    case HTML_FORM:
		ok = TRUE;
		break;
	    default:
		break;
	    }
	    if (ok) {
		/*
		 * It is another form-related start tag, so terminate the
		 * current SELECT block and fall through. - FM
		 */
		CTRACE((tfp,
			"SGML: ***Faking SELECT end tag before <%s> start tag.\n",
			new_tag->name));
		end_element(context, SGMLFindTag(context->dtd, "SELECT"));
	    } else {
		/*
		 * Ignore the start tag. - FM
		 */
		CTRACE((tfp,
			"SGML: ***Ignoring start tag <%s> in SELECT block.\n",
			new_tag->name));
		return;
	    }
	}
    }
    /*
     * Handle the start tag. - FM
     */
    CTRACE((tfp, "SGML: Start <%s>\n", new_tag->name));
    status = (*context->actions->start_element) (context->target,
						 (int) TAGNUM_OF_TAGP(new_tag),
						 context->present,
						 (STRING2PTR) context->value,	/* coerce type for think c */
						 context->current_tag_charset,
						 &context->include);
    /*
     * The target may ask for the alternate tag definition; from here on
     * that is the tag whose content model decides about stacking.
     */
    if (status == HT_PARSER_OTHER_CONTENT)
	new_tag = ALT_TAGP(new_tag);	/* this is only returned for OBJECT */
    if (new_tag->contents != SGML_EMPTY) {	/* i.e., tag not empty */
	/* Push the element so that a later end tag can be matched to it. */
	HTElement *N = pool_alloc();

	if (N == NULL)
	    outofmem(__FILE__, "start_element");

	assert(N != NULL);

	N->next = context->element_stack;
	N->tag = new_tag;
	context->element_stack = N;
	context->no_lynx_specialcodes = (BOOLEAN) (new_tag->flags & Tgf_nolyspcl);

    } else if (e == HTML_META) {
	/*
	 * Check for result of META tag. - KW & FM
	 */
	change_chartrans_handling(context);
    }
}
1385
1386 /* Find Tag in DTD tag list
1387 * ------------------------
1388 *
1389 * On entry,
1390 * dtd points to dtd structure including valid tag list
1391 * string points to name of tag in question
1392 *
1393 * On exit,
1394 * returns:
1395 * NULL tag not found
1396 * else address of tag structure in dtd
1397 */
SGMLFindTag(const SGML_dtd * dtd,const char * s)1398 HTTag *SGMLFindTag(const SGML_dtd * dtd,
1399 const char *s)
1400 {
1401 int high, low, i, diff;
1402 static HTTag *last[64] =
1403 {NULL}; /*optimize using the previous results */
1404 HTTag **res = last + (UCH(*s) % 64); /*pointer arithmetic */
1405
1406 if (*res) {
1407 if ((*res)->name == NULL)
1408 return NULL;
1409 if (!strcasecomp((*res)->name, s))
1410 return *res;
1411 }
1412
1413 for (low = 0, high = dtd->number_of_tags;
1414 high > low;
1415 diff < 0 ? (low = i + 1) : (high = i)) { /* Binary search */
1416 i = (low + (high - low) / 2);
1417 /* my_casecomp() - optimized by the first character, NOT_ASCII ok */
1418 diff = my_casecomp(dtd->tags[i].name, s); /* Case insensitive */
1419 if (diff == 0) { /* success: found it */
1420 *res = &dtd->tags[i];
1421 return *res;
1422 }
1423 }
1424 if (IsNmStart(*s)) {
1425 /*
1426 * Unrecognized, but may be valid. - KW
1427 */
1428 return &HTTag_unrecognized;
1429 }
1430 return NULL;
1431 }
1432
1433 /*________________________________________________________________________
1434 * Public Methods
1435 */
1436
1437 /* Could check that we are back to bottom of stack! @@ */
1438 /* Do check! - FM */
1439 /* */
/*
 * Normal end of the SGML stream.
 *
 * Releases the parser's scratch buffers, closes every element that is still
 * open (notifying the target through its end_element action, except in
 * pretty-source view), tells the target to finish via its _free action and
 * finally releases the attribute values, the string chunk and the context
 * itself.
 */
static void SGML_free(HTStream *context)
{
    int attr;

    /* Scratch buffers first. */
    FREE(context->recover);
    FREE(context->url);
    FREE(context->csi);
    FREE(context->include);
    FREE(context->active_include);

    /* Pop and close whatever is still open. */
    for (;;) {
	HTElement *top = context->element_stack;
	HTTag *tag;

	if (top == NULL)
	    break;

	tag = top->tag;
	context->element_stack = top->next;
	pool_free(top);
#ifdef USE_PRETTYSRC
	if (!psrc_view)		/* the target is not called in psrc view */
#endif
	    (*context->actions->end_element)
		(context->target,
		 (int) NORMAL_TAGNUM(TAGNUM_OF_TAGP(tag)),
		 &context->include);
	FREE(context->include);
    }

    /* Let the target finish. */
    (*context->actions->_free) (context->target);

    /* Finally the strings and the context structure. */
    HTChunkFree(context->string);
    for (attr = 0; attr < MAX_ATTRIBUTES; ++attr) {
	FREE_extra(context->value[attr]);
    }
    FREE(context);

#ifdef USE_PRETTYSRC
    sgml_in_psrc_was_initialized = FALSE;
#endif
}
1490
/*
 * Abnormal end of the SGML stream.
 *
 * The target is aborted first (with error e); after that the parser's own
 * buffers, any stack entries left open, the attribute values, the string
 * chunk and the context itself are released.  Unlike SGML_free(), no
 * end_element actions are issued for elements that are still open.
 */
static void SGML_abort(HTStream *context, HTError e)
{
    int attr;
    HTElement *top;

    /* Abort the target before touching our own state. */
    (*context->actions->_abort) (context->target, e);

    /* Scratch buffers. */
    FREE(context->recover);
    FREE(context->include);
    FREE(context->active_include);
    FREE(context->url);
    FREE(context->csi);

    /* Give back stack entries for elements that were left open. */
    while ((top = context->element_stack) != NULL) {
	context->element_stack = top->next;
	pool_free(top);
    }

    /* Strings and the context structure. */
    HTChunkFree(context->string);
    for (attr = 0; attr < MAX_ATTRIBUTES; ++attr) {
	FREE_extra(context->value[attr]);
    }
    FREE(context);

#ifdef USE_PRETTYSRC
    sgml_in_psrc_was_initialized = FALSE;
#endif
}
1531
1532 /* Read and write user callback handle
1533 * -----------------------------------
1534 *
1535 * The callbacks from the SGML parser have an SGML context parameter.
1536 * These calls allow the caller to associate his own context with a
1537 * particular SGML context.
1538 */
1539
1540 #ifdef CALLERDATA
/*
 * Return the caller-supplied handle stored in this SGML context.
 */
void *SGML_callerData(HTStream *context)
{
    void *handle = context->callerData;

    return handle;
}
1545
/*
 * Store a caller-supplied handle in this SGML context; it can be read back
 * with SGML_callerData().
 */
void SGML_setCallerData(HTStream *context, void *data)
{
    HTStream *me = context;

    me->callerData = data;
}
1550 #endif /* CALLERDATA */
1551
1552 #ifdef USE_PRETTYSRC
/*
 * Apply the configured case folding to the tag name held in `string'.
 *
 * Nothing is changed for strict-XML input or when tagname_transform is 1;
 * a value of 0 lower-cases the name, any other value upper-cases it.
 */
static void transform_tag(HTStream *context, HTChunk *string)
{
    if (context->strict_xml || tagname_transform == 1)
	return;

    if (tagname_transform == 0) {
	LYLowerCase(string->data);
    } else {
	LYUpperCase(string->data);
    }
}
1564 #endif /* USE_PRETTYSRC */
1565
ignore_when_empty(HTTag * tag)1566 static BOOL ignore_when_empty(HTTag * tag)
1567 {
1568 BOOL result = FALSE;
1569
1570 if (!LYPreparsedSource
1571 && LYxhtml_parsing
1572 && tag->name != 0
1573 && !(tag->flags & Tgf_mafse)
1574 && tag->contents != SGML_EMPTY
1575 && tag->tagclass != Tgc_Plike
1576 && (tag->tagclass == Tgc_SELECTlike
1577 || (tag->contains && tag->icontains))) {
1578 result = TRUE;
1579 }
1580 CTRACE((tfp, "SGML Do%s ignore_when_empty:%s\n",
1581 result ? "" : " not",
1582 NonNull(tag->name)));
1583 return result;
1584 }
1585
/*
 * Throw away the tag currently being collected.
 *
 * The context's current tag is redirected to a zeroed placeholder tag and
 * the collected string is emptied.  end_element() is deliberately not
 * called, because start_element() was never called for this tag.
 */
static void discard_empty(HTStream *context)
{
    static HTTag empty_tag;	/* placeholder, re-zeroed on every call */

    CTRACE((tfp, "SGML discarding empty %s\n",
	    NonNull(context->current_tag->name)));
    CTRACE_FLUSH(tfp);

    /* Drop the text gathered so far. */
    context->string->size = 0;

    /* Point the parser at a blank tag. */
    memset(&empty_tag, 0, sizeof(empty_tag));
    context->current_tag = &empty_tag;
}
1600
1601 #ifdef USE_PRETTYSRC
end_if_prettysrc(HTStream * context,HTChunk * string,int end_ch)1602 static BOOL end_if_prettysrc(HTStream *context, HTChunk *string, int end_ch)
1603 {
1604 BOOL result = psrc_view;
1605
1606 if (psrc_view) {
1607 if (attr_is_name) {
1608 HTStartAnchor(context->target, string->data, NULL);
1609 (*context->actions->end_element) (context->target,
1610 HTML_A,
1611 &context->include);
1612 } else if (attr_is_href) {
1613 PSRCSTART(href);
1614 HTStartAnchor(context->target, NULL, string->data);
1615 }
1616 PUTS_TR(string->data);
1617 if (attr_is_href) {
1618 (*context->actions->end_element) (context->target,
1619 HTML_A,
1620 &context->include);
1621 PSRCSTOP(href);
1622 }
1623 if (end_ch)
1624 PUTC(end_ch);
1625 PSRCSTOP(attrval);
1626 }
1627 return result;
1628 }
1629 #endif
1630
SGML_character(HTStream * context,int c_in)1631 static void SGML_character(HTStream *context, int c_in)
1632 {
1633 const SGML_dtd *dtd = context->dtd;
1634 HTChunk *string = context->string;
1635 const char *EntityName;
1636 HTTag *testtag = NULL;
1637 BOOLEAN chk; /* Helps (?) walk through all the else ifs... */
1638 UCode_t clong, uck = 0; /* Enough bits for UCS4 ... */
1639 int testlast;
1640
1641 unsigned char c;
1642 unsigned char saved_char_in = '\0';
1643
1644 ++sgml_offset;
1645
1646 /*
1647 * Now some fun with the preprocessor. Use copies for c and unsign_c ==
1648 * clong, so that we can revert back to the unchanged c_in. - KW
1649 */
1650 #define unsign_c clong
1651
1652 c = UCH(c_in);
1653 clong = UCH(c); /* a.k.a. unsign_c */
1654
1655 if (context->T.decode_utf8) {
1656 /*
1657 * Combine UTF-8 into Unicode. Incomplete characters silently ignored.
1658 * From Linux kernel's console.c. - KW
1659 */
1660 if (TOASCII(UCH(c)) > 127) { /* S/390 -- gil -- 0710 */
1661 /*
1662 * We have an octet from a multibyte character. - FM
1663 */
1664 if (context->utf_count > 0 && (TOASCII(c) & 0xc0) == 0x80) {
1665 context->utf_char = (context->utf_char << 6) | (TOASCII(c) & 0x3f);
1666 context->utf_count--;
1667 *(context->utf_buf_p) = (char) c;
1668 (context->utf_buf_p)++;
1669 if (context->utf_count == 0) {
1670 /*
1671 * We have all of the bytes, so terminate the buffer and
1672 * set 'clong' to the UCode_t value. - FM
1673 */
1674 *(context->utf_buf_p) = '\0';
1675 clong = context->utf_char;
1676 if (clong < 256) {
1677 c = UCH(clong & 0xff);
1678 }
1679 /* lynx does not use left-to-right */
1680 if (clong == 0x200e)
1681 return;
1682 goto top1;
1683 } else {
1684 /*
1685 * Wait for more. - KW
1686 */
1687 return;
1688 }
1689 } else {
1690 /*
1691 * Start handling a new multibyte character. - FM
1692 */
1693 context->utf_buf_p = context->utf_buf;
1694 *(context->utf_buf_p) = (char) c;
1695 (context->utf_buf_p)++;
1696 if ((c & 0xe0) == 0xc0) {
1697 context->utf_count = 1;
1698 context->utf_char = (c & 0x1f);
1699 } else if ((c & 0xf0) == 0xe0) {
1700 context->utf_count = 2;
1701 context->utf_char = (c & 0x0f);
1702 } else if ((c & 0xf8) == 0xf0) {
1703 context->utf_count = 3;
1704 context->utf_char = (c & 0x07);
1705 } else if ((c & 0xfc) == 0xf8) {
1706 context->utf_count = 4;
1707 context->utf_char = (c & 0x03);
1708 } else if ((c & 0xfe) == 0xfc) {
1709 context->utf_count = 5;
1710 context->utf_char = (c & 0x01);
1711 } else {
1712 /*
1713 * Garbage. - KW
1714 */
1715 context->utf_count = 0;
1716 context->utf_buf_p = context->utf_buf;
1717 *(context->utf_buf_p) = '\0';
1718 }
1719 /*
1720 * Wait for more. - KW
1721 */
1722 return;
1723 }
1724 } else {
1725 /*
1726 * Got an ASCII char. - KW
1727 */
1728 context->utf_count = 0;
1729 context->utf_buf_p = context->utf_buf;
1730 *(context->utf_buf_p) = '\0';
1731 /* goto top; */
1732 }
1733 }
1734 /* end of context->T.decode_utf8 S/390 -- gil -- 0726 */
1735 #ifdef NOTDEFINED
1736 /*
1737 * If we have a koi8-r input and do not have koi8-r as the output, save the
1738 * raw input in saved_char_in before we potentially convert it to Unicode.
1739 * - FM
1740 */
1741 if (context->T.strip_raw_char_in)
1742 saved_char_in = c;
1743 #endif /* NOTDEFINED */
1744
1745 /*
1746 * If we want the raw input converted to Unicode, try that now. - FM
1747 */
1748 if (context->T.trans_to_uni &&
1749 #ifdef EXP_JAPANESEUTF8_SUPPORT
1750 ((strcmp(LYCharSet_UC[context->inUCLYhndl].MIMEname, "euc-jp") == 0) ||
1751 (strcmp(LYCharSet_UC[context->inUCLYhndl].MIMEname, "shift_jis") == 0))) {
1752 if (strcmp(LYCharSet_UC[context->inUCLYhndl].MIMEname, "shift_jis") == 0) {
1753 if (context->utf_count == 0) {
1754 if (IS_SJIS_HI1((unsigned char) c) ||
1755 IS_SJIS_HI2((unsigned char) c)) {
1756 context->utf_buf[0] = (char) c;
1757 context->utf_count = 1;
1758 clong = -11;
1759 }
1760 } else {
1761 if (IS_SJIS_LO((unsigned char) c)) {
1762 context->utf_buf[1] = (char) c;
1763 clong = UCTransJPToUni(context->utf_buf, 2, context->inUCLYhndl);
1764 }
1765 context->utf_count = 0;
1766 }
1767 } else {
1768 if (context->utf_count == 0) {
1769 if (IS_EUC_HI((unsigned char) c)) {
1770 context->utf_buf[0] = (char) c;
1771 context->utf_count = 1;
1772 clong = -11;
1773 }
1774 } else {
1775 if (IS_EUC_LOX((unsigned char) c)) {
1776 context->utf_buf[1] = (char) c;
1777 clong = UCTransJPToUni(context->utf_buf, 2, context->inUCLYhndl);
1778 }
1779 context->utf_count = 0;
1780 }
1781 }
1782 goto top1;
1783 } else if (context->T.trans_to_uni &&
1784 #endif
1785 ((TOASCII(unsign_c) >= LYlowest_eightbit[context->inUCLYhndl]) || /* S/390 -- gil -- 0744 */
1786 (unsign_c < ' ' && unsign_c != 0 &&
1787 context->T.trans_C0_to_uni))) {
1788 /*
1789 * Convert the octet to Unicode. - FM
1790 */
1791 clong = UCTransToUni((char) c, context->inUCLYhndl);
1792 if (clong > 0) {
1793 saved_char_in = c;
1794 if (clong < 256) {
1795 c = FROMASCII(UCH(clong));
1796 }
1797 }
1798 goto top1;
1799 } else if (unsign_c < ' ' && unsign_c != 0 && /* S/390 -- gil -- 0768 */
1800 context->T.trans_C0_to_uni) {
1801 /*
1802 * This else if may be too ugly to keep. - KW
1803 */
1804 if (context->T.trans_from_uni &&
1805 (((clong = UCTransToUni((char) c, context->inUCLYhndl)) >= ' ') ||
1806 (context->T.transp &&
1807 (clong = UCTransToUni((char) c, context->inUCLYhndl)) > 0))) {
1808 saved_char_in = c;
1809 if (clong < 256) {
1810 c = FROMASCII(UCH(clong));
1811 }
1812 goto top1;
1813 } else {
1814 uck = -1;
1815 if (context->T.transp) {
1816 uck = UCTransCharStr(replace_buf, 60, (char) c,
1817 context->inUCLYhndl,
1818 context->inUCLYhndl, NO);
1819 }
1820 if (!context->T.transp || uck < 0) {
1821 uck = UCTransCharStr(replace_buf, 60, (char) c,
1822 context->inUCLYhndl,
1823 context->outUCLYhndl, YES);
1824 }
1825 if (uck == 0) {
1826 return;
1827 } else if (uck < 0) {
1828 goto top0a;
1829 }
1830 c = UCH(replace_buf[0]);
1831 if (c && replace_buf[1]) {
1832 if (context->state == S_text) {
1833 PUTS(replace_buf);
1834 return;
1835 }
1836 StrAllocCat(context->recover, replace_buf + 1);
1837 }
1838 goto top0a;
1839 } /* Next line end of ugly stuff for C0. - KW */
1840 } else { /* end of context->T.trans_to_uni S/390 -- gil -- 0791 */
1841 goto top0a;
1842 }
1843
1844 /*
1845 * At this point we have either unsign_c a.k.a. clong in Unicode (and c in
1846 * latin1 if clong is in the latin1 range), or unsign_c and c will have to
1847 * be passed raw. - KW
1848 */
1849 /*
1850 * We jump up to here from below if we have
1851 * stuff in the recover, insert, or csi buffers
1852 * to process. We zero saved_char_in, in effect
1853 * as a flag that the octet is not that of the
1854 * actual call to this function. This may be OK
1855 * for now, for the stuff this function adds to
1856 * its recover buffer, but it might not be for
1857 * stuff other functions added to the insert or
1858 * csi buffer, so bear that in mind. - FM
1859 * Stuff from the recover buffer is now handled
1860 * as UTF-8 if we can expect that's what it is,
1861 * and in that case we don't come back up here. - kw
1862 */
1863 top:
1864 saved_char_in = '\0';
1865 /*
1866 * We jump to here from above when we don't have
1867 * UTF-8 input, haven't converted to Unicode, and
1868 * want clong set to the input octet (unsigned)
1869 * without zeroing its saved_char_in copy (which
1870 * is signed). - FM
1871 */
1872 top0a:
1873 *(context->utf_buf) = '\0';
1874 clong = UCH(c);
1875 /*
1876 * We jump to here from above if we have converted
1877 * the input, or a multibyte sequence across calls,
1878 * to a Unicode value and loaded it into clong (to
1879 * which unsign_c has been defined), and from below
1880 * when we are recycling a character (e.g., because
1881 * it terminated an entity but is not the standard
1882 * semi-colon). The character will already have
1883 * been put through the Unicode conversions. - FM
1884 */
1885 top1:
1886 /*
1887 * Ignore low ISO 646 7-bit control characters if HTCJK is not set. - FM
1888 */
1889 /*
1890 * Works for both ASCII and EBCDIC. -- gil
1891 * S/390 -- gil -- 0811
1892 */
1893 if (TOASCII(unsign_c) < 32 &&
1894 c != '\t' && c != '\n' && c != '\r' &&
1895 !IS_CJK_TTY)
1896 goto after_switch;
1897
1898 /*
1899 * Ignore 127 if we don't have HTPassHighCtrlRaw or HTCJK set. - FM
1900 */
1901 #define PASSHICTRL (context->T.transp || \
1902 unsign_c >= LYlowest_eightbit[context->inUCLYhndl])
1903 if (TOASCII(c) == 127 && /* S/390 -- gil -- 0830 */
1904 !(PASSHICTRL || IS_CJK_TTY))
1905 goto after_switch;
1906
1907 /*
1908 * Ignore 8-bit control characters 128 - 159 if neither HTPassHighCtrlRaw
1909 * nor HTCJK is set. - FM
1910 */
1911 if (TOASCII(unsign_c) > 127 && TOASCII(unsign_c) < 160 && /* S/390 -- gil -- 0847 */
1912 !(PASSHICTRL || IS_CJK_TTY)) {
1913 /*
1914 * If we happen to be reading from an "ISO-8859-1" or "US-ASCII"
1915 * document, allow the cp-1252 codes, to accommodate the HTML5 draft
1916 * recommendation for replacement encoding:
1917 *
1918 * http://www.whatwg.org/specs/web-apps/current-work/multipage/infrastructure.html#character-encodings-0
1919 */
1920 if (AssumeCP1252(context)) {
1921 clong = LYcp1252ToUnicode((UCode_t) c);
1922 goto top1;
1923 }
1924 goto after_switch;
1925 }
1926
1927 /* Almost all CJK characters are double byte but only Japanese
1928 * JIS X0201 Kana is single byte. To prevent to fail SGML parsing
1929 * we have to take care of them here. -- TH
1930 */
1931 if ((HTCJK == JAPANESE) && (context->state == S_in_kanji) &&
1932 !IS_JAPANESE_2BYTE(context->kanji_buf, UCH(c))
1933 #ifdef EXP_JAPANESEUTF8_SUPPORT
1934 && !context->T.decode_utf8
1935 #endif
1936 ) {
1937 #ifdef CONV_JISX0201KANA_JISX0208KANA
1938 if (IS_SJIS_X0201KANA(context->kanji_buf)) {
1939 unsigned char sjis_hi, sjis_lo;
1940
1941 JISx0201TO0208_SJIS(context->kanji_buf, &sjis_hi, &sjis_lo);
1942 PUTC(sjis_hi);
1943 PUTC(sjis_lo);
1944 } else
1945 #endif
1946 PUTC(context->kanji_buf);
1947 context->state = S_text;
1948 }
1949
1950 /*
1951 * Handle character based on context->state.
1952 */
1953 CTRACE2(TRACE_SGML, (tfp, "SGML before %s|%.*s|%c|\n",
1954 state_name(context->state),
1955 string->size,
1956 NonNull(string->data),
1957 UCH(c)));
1958 switch (context->state) {
1959
1960 case S_in_kanji:
1961 /*
1962 * Note that if we don't have a CJK input, then this is not the second
1963 * byte of a CJK di-byte, and we're trashing the input. That's why
1964 * 8-bit characters followed by, for example, '<' can cause the tag to
1965 * be treated as text, not markup. We could try to deal with it by
1966 * holding each first byte and then checking byte pairs, but that
1967 * doesn't seem worth the overhead (see below). - FM
1968 */
1969 context->state = S_text;
1970 PUTC(context->kanji_buf);
1971 PUTC(c);
1972 break;
1973
1974 case S_tagname_slash:
1975 /*
1976 * We had something link "<name/" so far, set state to S_text but keep
1977 * context->slashedtag as a flag; except if we get '>' directly
1978 * after the "<name/", and really have a tag for that name in
1979 * context->slashedtag, in which case keep state as is and let code
1980 * below deal with it. - kw
1981 */
1982 if (!(c == '>' && context->slashedtag && TOASCII(unsign_c) < 127)) {
1983 context->state = S_text;
1984 }
1985 /* fall through in any case! */
1986 case S_text:
1987 if (IS_CJK_TTY && ((TOASCII(c) & 0200) != 0)
1988 #ifdef EXP_JAPANESEUTF8_SUPPORT
1989 && !context->T.decode_utf8
1990 #endif
1991 ) { /* S/390 -- gil -- 0864 */
1992 /*
1993 * Setting up for Kanji multibyte handling (based on Takuya ASADA's
1994 * (asada@three-a.co.jp) CJK Lynx). Note that if the input is not
1995 * in fact CJK, the next byte also will be mishandled, as explained
1996 * above. Toggle raw mode off in such cases, or select the "7 bit
1997 * approximations" display character set, which is largely
1998 * equivalent to having raw mode off with CJK. - FM
1999 */
2000 context->state = S_in_kanji;
2001 context->kanji_buf = c;
2002 break;
2003 } else if (IS_CJK_TTY && TOASCII(c) == '\033') { /* S/390 -- gil -- 0881 */
2004 /*
2005 * Setting up for CJK escape sequence handling (based on Takuya
2006 * ASADA's (asada@three-a.co.jp) CJK Lynx). - FM
2007 */
2008 context->state = S_esc;
2009 PUTC(c);
2010 break;
2011 }
2012
2013 if (c == '&' || c == '<') {
2014 #ifdef USE_PRETTYSRC
2015 if (psrc_view) { /*there is nothing useful in the element_stack */
2016 testtag = context->current_tag;
2017 } else
2018 #endif
2019 {
2020 testtag = context->element_stack ?
2021 context->element_stack->tag : NULL;
2022 }
2023 }
2024
2025 if (c == '&' && TOASCII(unsign_c) < 127 && /* S/390 -- gil -- 0898 */
2026 (!testtag ||
2027 (testtag->contents == SGML_MIXED ||
2028 testtag->contents == SGML_ELEMENT ||
2029 testtag->contents == SGML_PCDATA ||
2030 #ifdef USE_PRETTYSRC
2031 testtag->contents == SGML_EMPTY ||
2032 #endif
2033 testtag->contents == SGML_RCDATA))) {
2034 /*
2035 * Setting up for possible entity, without the leading '&'. - FM
2036 */
2037 string->size = 0;
2038 context->state = S_ero;
2039 } else if (c == '<' && TOASCII(unsign_c) < 127) { /* S/390 -- gil -- 0915 */
2040 /*
2041 * Setting up for possible tag. - FM
2042 */
2043 string->size = 0;
2044 if (testtag && testtag->contents == SGML_PCDATA) {
2045 context->state = S_pcdata;
2046 } else if (testtag && (testtag->contents == SGML_LITTERAL
2047 || testtag->contents == SGML_CDATA)) {
2048 context->state = S_litteral;
2049 } else if (testtag && (testtag->contents == SGML_SCRIPT)) {
2050 context->state = S_script;
2051 } else {
2052 context->state = S_tag;
2053 }
2054 context->slashedtag = NULL;
2055 } else if (context->slashedtag &&
2056 context->slashedtag->name &&
2057 (c == '/' ||
2058 (c == '>' && context->state == S_tagname_slash)) &&
2059 TOASCII(unsign_c) < 127) {
2060 /*
2061 * We got either the second slash of a pending "<NAME/blah blah/"
2062 * shortref construct, or the '>' of a mere "<NAME/>". In both
2063 * cases generate a "</NAME>" end tag in the recover buffer for
2064 * reparsing unless NAME is really an empty element. - kw
2065 */
2066 #ifdef USE_PRETTYSRC
2067 if (psrc_view) {
2068 PSRCSTART(abracket);
2069 PUTC(c);
2070 PSRCSTOP(abracket);
2071 } else
2072 #endif
2073 if (context->slashedtag != context->unknown_tag &&
2074 !ReallyEmptyTag(context->slashedtag)) {
2075 if (context->recover == NULL) {
2076 StrAllocCopy(context->recover, "</");
2077 context->recover_index = 0;
2078 } else {
2079 StrAllocCat(context->recover, "</");
2080 }
2081 StrAllocCat(context->recover, context->slashedtag->name);
2082 StrAllocCat(context->recover, ">");
2083 }
2084 context->slashedtag = NULL;
2085
2086 } else if (context->element_stack &&
2087 (context->element_stack->tag->flags & Tgf_frecyc)) {
2088 /*
2089 * The element stack says we are within the contents of an element
2090 * that the next stage (HTML.c) may want to feed us back again (via
2091 * the *include string). So try to output text in UTF-8 if
2092 * possible, using the same logic as for attribute values (which
2093 * should be in line with what context->current_tag_charset
2094 * indicates). - kw
2095 */
2096 if (context->T.decode_utf8 &&
2097 *context->utf_buf) {
2098 PUTS(context->utf_buf);
2099 context->utf_buf_p = context->utf_buf;
2100 *(context->utf_buf_p) = '\0';
2101 } else if (!IS_CJK_TTY &&
2102 (context->T.output_utf8 ||
2103 context->T.trans_from_uni)) {
2104 if (LYIsASCII(clong)) {
2105 PUTC(c);
2106 } else if (clong == 0xfffd && saved_char_in &&
2107 HTPassEightBitRaw &&
2108 saved_char_in >=
2109 LYlowest_eightbit[context->outUCLYhndl]) {
2110 PUTUTF8((UCode_t) (0xf000 | saved_char_in));
2111 } else {
2112 PUTUTF8(clong);
2113 }
2114 } else if (saved_char_in && context->T.use_raw_char_in) {
2115 PUTC(saved_char_in);
2116 } else {
2117 PUTC(c);
2118 }
2119
2120 #define PASS8859SPECL context->T.pass_160_173_raw
2121 /*
2122 * Convert 160 (nbsp) to Lynx special character if neither
2123 * HTPassHighCtrlRaw nor HTCJK is set. - FM
2124 */
2125 } else if (unsign_c == CH_NBSP && /* S/390 -- gil -- 0932 */
2126 !context->no_lynx_specialcodes &&
2127 !(PASS8859SPECL || IS_CJK_TTY)) {
2128 PUTC(HT_NON_BREAK_SPACE);
2129 /*
2130 * Convert 173 (shy) to Lynx special character if neither
2131 * HTPassHighCtrlRaw nor HTCJK is set. - FM
2132 */
2133 } else if (unsign_c == CH_SHY && /* S/390 -- gil -- 0949 */
2134 !context->no_lynx_specialcodes &&
2135 !(PASS8859SPECL || IS_CJK_TTY)) {
2136 PUTC(LY_SOFT_HYPHEN);
2137 /*
2138 * Handle the case in which we think we have a character which
2139 * doesn't need further processing (e.g., a koi8-r input for a
2140 * koi8-r output). - FM
2141 */
2142 } else if (context->T.use_raw_char_in && saved_char_in) {
2143 /*
2144 * Only if the original character is still in saved_char_in,
2145 * otherwise we may be iterating from a goto top. - KW
2146 */
2147 PUTC(saved_char_in);
2148 /******************************************************************
2149 * I. LATIN-1 OR UCS2 TO DISPLAY CHARSET
2150 ******************************************************************/
2151 } else if ((chk = (BOOL) (context->T.trans_from_uni &&
2152 TOASCII(unsign_c) >= 160)) && /* S/390 -- gil -- 0968 */
2153 (uck = UCTransUniChar(unsign_c,
2154 context->outUCLYhndl)) >= ' ' &&
2155 uck < 256) {
2156 CTRACE((tfp, "UCTransUniChar returned 0x%.2" PRI_UCode_t
2157 ":'%c'.\n",
2158 uck, FROMASCII((char)uck)));
2159 /*
2160 * We got one octet from the conversions, so use it. - FM
2161 */
2162 PUTC(FROMASCII((char) uck));
2163 } else if ((chk &&
2164 (uck == -4 ||
2165 (context->T.repl_translated_C0 &&
2166 uck > 0 && uck < 32))) &&
2167 /*
2168 * Not found; look for replacement string. - KW
2169 */
2170 (uck = UCTransUniCharStr(replace_buf, 60, clong,
2171 context->outUCLYhndl,
2172 0) >= 0)) {
2173 /*
2174 * Got a replacement string. No further tests for validity -
2175 * assume that whoever defined replacement strings knew what she
2176 * was doing. - KW
2177 */
2178 PUTS(replace_buf);
2179 /*
2180 * If we're displaying UTF-8, try that now. - FM
2181 */
2182 } else if (context->T.output_utf8 && PUTUTF8(clong)) {
2183 ; /* do nothing more */
2184 /*
2185 * If it's any other (> 160) 8-bit character, and we have not set
2186 * HTPassEightBitRaw nor HTCJK, nor have the "ISO Latin 1"
2187 * character set selected, back translate for our character set. -
2188 * FM
2189 */
2190 #define IncludesLatin1Enc \
2191 (context->outUCLYhndl == LATIN1 || \
2192 (context->outUCI && \
2193 (context->outUCI->enc & (UCT_CP_SUPERSETOF_LAT1))))
2194
2195 #define PASSHI8BIT (HTPassEightBitRaw || \
2196 (context->T.do_8bitraw && !context->T.trans_from_uni))
2197
2198 } else if (unsign_c > 160 && unsign_c < 256 &&
2199 !(PASSHI8BIT || IS_CJK_TTY) &&
2200 !IncludesLatin1Enc) {
2201 #ifdef USE_PRETTYSRC
2202 int psrc_view_backup = 0;
2203 #endif
2204
2205 string->size = 0;
2206 EntityName = HTMLGetEntityName((UCode_t) (unsign_c - 160));
2207 HTChunkPuts(string, EntityName);
2208 HTChunkTerminate(string);
2209 #ifdef USE_PRETTYSRC
2210 /* we need to disable it temporarily */
2211 if (psrc_view) {
2212 psrc_view_backup = 1;
2213 psrc_view = 0;
2214 }
2215 #endif
2216 handle_entity(context, '\0');
2217 #ifdef USE_PRETTYSRC
2218 /* restore psrc_view if we disabled it above */
2219 if (psrc_view_backup)
2220 psrc_view = TRUE;
2221 #endif
2222
2223 string->size = 0;
2224 if (!FoundEntity)
2225 PUTC(';');
2226 /*
2227 * If we get to here and have an ASCII char, pass the character. -
2228 * KW
2229 */
2230 } else if (TOASCII(unsign_c) < 127 && unsign_c > 0) { /* S/390 -- gil -- 0987 */
2231 PUTC(c);
2232 /*
2233 * If we get to here, and should have translated, translation has
2234 * failed so far. - KW
2235 *
2236 * We should have sent UTF-8 output to the parser already, but what
2237 * the heck, try again. - FM
2238 */
2239 } else if (context->T.output_utf8 && *context->utf_buf) {
2240 PUTS(context->utf_buf);
2241 context->utf_buf_p = context->utf_buf;
2242 *(context->utf_buf_p) = '\0';
2243 #ifdef NOTDEFINED
2244 /*
2245 * Check for a strippable koi8-r 8-bit character. - FM
2246 */
2247 } else if (context->T.strip_raw_char_in && saved_char_in &&
2248 (saved_char_in >= 0xc0) &&
2249 (saved_char_in < 255)) {
2250 /*
2251 * KOI8 special: strip high bit, gives (somewhat) readable ASCII
2252 * or KOI7 - it was constructed that way! - KW
2253 */
2254 PUTC((saved_char_in & 0x7f));
2255 saved_char_in = '\0';
2256 #endif /* NOTDEFINED */
2257 /*
2258 * If we don't actually want the character, drop it: the branch body
2259 * below is empty, so nothing is output for it. - FM
2260 */
2261 } else if (TOASCII(UCH(c)) < /* S/390 -- gil -- 0997 */
2262 LYlowest_eightbit[context->outUCLYhndl] ||
2263 (context->T.trans_from_uni && !HTPassEightBitRaw)) {
2264 /*
2265 * Nothing is output here; the else below passes any remaining character. - FM
2266 */
2267 } else {
2268 PUTC(c);
2269 }
2270 break;
2271
2272 /*
2273 * Found '<' in SGML_PCDATA content; treat this mode nearly like
2274 * S_litteral, but recognize '<!' and '<?' to filter out comments and
2275 * processing instructions. - kw
2276 */
2277 case S_pcdata:
2278 if (!string->size && TOASCII(unsign_c) < 127) { /* first after '<' */
2279 if (c == '!') { /* <! */
2280 /*
2281 * Terminate and set up for possible comment, identifier,
2282 * declaration, or marked section as under S_tag. - kw
2283 */
2284 context->state = S_exclamation;
2285 context->lead_exclamation = TRUE;
2286 context->doctype_bracket = FALSE;
2287 context->first_bracket = FALSE;
2288 HTChunkPutc(string, c);
2289 break;
2290 } else if (c == '?') { /* <? - ignore as a PI until '>' - kw */
2291 CTRACE((tfp,
2292 "SGML: Found PI in PCDATA, junking it until '>'\n"));
2293 #ifdef USE_PRETTYSRC
2294 if (psrc_view) {
2295 PSRCSTART(abracket);
2296 PUTS("<?");
2297 PSRCSTOP(abracket);
2298 }
2299 #endif
2300 context->state = S_pi;
2301 break;
2302 }
2303 }
2304 goto case_S_litteral;
2305
2306 /*
2307 * Found '<' in SGML_SCRIPT content; treat this mode nearly like
2308 * S_litteral, but recognize '<!' to allow the content to be treated as
2309 * a comment by lynx.
2310 */
2311 case S_script:
2312 if (!string->size && TOASCII(unsign_c) < 127) { /* first after '<' */
2313 if (c == '!') { /* <! */
2314 /*
2315 * Terminate and set up for possible comment, identifier,
2316 * declaration, or marked section as under S_tag. - kw
2317 */
2318 context->state = S_exclamation;
2319 context->lead_exclamation = TRUE;
2320 context->doctype_bracket = FALSE;
2321 context->first_bracket = FALSE;
2322 HTChunkPutc(string, c);
2323 break;
2324 }
2325 }
2326 goto case_S_litteral;
2327
2328 /*
2329 * In litteral mode, waits only for specific end tag (for compatibility
2330 * with old servers, and for Lynx). - FM
2331 */
2332 case_S_litteral:
2333 case S_litteral:
2334 /*PSRC:this case not understood completely by HV, not done */
2335 HTChunkPutc(string, c);
2336 #ifdef USE_PRETTYSRC
2337 if (psrc_view) {
2338 /* there is nothing useful in the element_stack */
2339 testtag = context->current_tag;
2340 } else
2341 #endif
2342 testtag = (context->element_stack
2343 ? context->element_stack->tag
2344 : NULL);
2345
2346 if (testtag == NULL || testtag->name == NULL) {
2347 string->size--;
2348 context->state = S_text;
2349 goto top1;
2350 }
2351
2352 /*
2353 * Normally when we get the closing ">",
2354 * testtag contains something like "TITLE"
2355 * string contains something like "/title>"
2356 * so we decrement by 2 to compare the final character of each.
2357 */
2358 testlast = string->size - 2 - context->trailing_spaces - context->leading_spaces;
2359
2360 if (TOUPPER(c) != ((testlast < 0)
2361 ? '/'
2362 : testtag->name[testlast])) {
2363 int i;
2364
2365 /*
2366 * If complete match, end litteral.
2367 */
2368 if ((c == '>') &&
2369 testlast >= 0 && !testtag->name[testlast]) {
2370 #ifdef USE_PRETTYSRC
2371 if (psrc_view) {
2372 char *trailing = NULL;
2373
2374 if (context->trailing_spaces) {
2375 StrAllocCopy(trailing,
2376 string->data
2377 + string->size
2378 - 1
2379 - context->trailing_spaces);
2380 trailing[context->trailing_spaces] = '\0';
2381 }
2382
2383 PSRCSTART(abracket);
2384 PUTS("</");
2385 PSRCSTOP(abracket);
2386 PSRCSTART(tag);
2387
2388 strcpy(string->data, context->current_tag->name);
2389 transform_tag(context, string);
2390 PUTS(string->data);
2391
2392 if (trailing) {
2393 PUTS(trailing);
2394 FREE(trailing);
2395 }
2396
2397 PSRCSTOP(tag);
2398 PSRCSTART(abracket);
2399 PUTC('>');
2400 PSRCSTOP(abracket);
2401
2402 context->current_tag = NULL;
2403 } else
2404 #endif
2405 end_element(context, context->element_stack->tag);
2406
2407 string->size = 0;
2408 context->current_attribute_number = INVALID;
2409 context->state = S_text;
2410 context->leading_spaces = 0;
2411 context->trailing_spaces = 0;
2412 break;
2413 }
2414
2415 /*
2416 * Allow whitespace between the "<" or ">" and the keyword, for
2417 * error-recovery.
2418 */
2419 if (isspace(UCH(c))) {
2420 if (testlast == -1) {
2421 context->leading_spaces += 1;
2422 CTRACE2(TRACE_SGML, (tfp, "leading spaces: %d\n", context->leading_spaces));
2423 break;
2424 } else if (testlast > 0) {
2425 context->trailing_spaces += 1;
2426 CTRACE2(TRACE_SGML, (tfp, "trailing spaces: %d\n", context->trailing_spaces));
2427 break;
2428 }
2429 }
2430
2431 /*
2432 * Mismatch - recover.
2433 */
2434 context->leading_spaces = 0;
2435 context->trailing_spaces = 0;
2436 if (((testtag->contents != SGML_LITTERAL &&
2437 (testtag->flags & Tgf_strict)) ||
2438 (context->state == S_pcdata &&
2439 (testtag->flags & (Tgf_strict | Tgf_endO)))) &&
2440 (testlast > -1 &&
2441 (c == '>' || testlast > 0 || IsNmStart(c)))) {
2442 context->state = S_end;
2443 string->size--;
2444 for (i = 0; i < string->size; i++) /* remove '/' */
2445 string->data[i] = string->data[i + 1];
2446 if ((string->size == 1) ? IsNmStart(c) : IsNmChar(c))
2447 break;
2448 string->size--;
2449 goto top1;
2450 }
2451 if (context->state == S_pcdata &&
2452 (testtag->flags & (Tgf_strict | Tgf_endO)) &&
2453 (testlast < 0 && IsNmStart(c))) {
2454 context->state = S_tag;
2455 break;
2456 }
2457 /*
2458 * If Mismatch: recover string literally.
2459 */
2460 PUTC('<');
2461 for (i = 0; i < string->size - 1; i++) /* recover, except last c */
2462 PUTC(string->data[i]);
2463 string->size = 0;
2464 context->state = S_text;
2465 goto top1; /* to recover last c */
2466 }
2467 break;
2468
2469 /*
2470 * Character reference (numeric entity) or named entity.
2471 */
2472 case S_ero:
2473 if (c == '#') {
2474 /*
2475 * Setting up for possible numeric entity.
2476 */
2477 context->state = S_cro; /* &# is Char Ref Open */
2478 break;
2479 }
2480 context->state = S_entity; /* Fall through! */
2481
2482 /*
2483 * Handle possible named entity.
2484 */
2485 case S_entity:
2486 if (TOASCII(unsign_c) < 127 && (string->size ? /* S/390 -- gil -- 1029 */
2487 isalnum(UCH(c)) : isalpha(UCH(c)))) {
2488 /* Should probably use IsNmStart/IsNmChar above (is that right?),
2489 but the world is not ready for that - there's &nbsp: (note
2490 colon!) and stuff around. */
2491 /*
2492 * Accept valid ASCII character. - FM
2493 */
2494 HTChunkPutc(string, c);
2495 } else if (string->size == 0) {
2496 /*
2497 * It was an ampersand that's just text, so output the ampersand
2498 * and recycle this character. - FM
2499 */
2500 #ifdef USE_PRETTYSRC
2501 if (psrc_view)
2502 PSRCSTART(badseq);
2503 #endif
2504 PUTC('&');
2505 #ifdef USE_PRETTYSRC
2506 if (psrc_view)
2507 PSRCSTOP(badseq);
2508 #endif
2509 context->state = S_text;
2510 goto top1;
2511 } else {
2512 /*
2513 * Terminate entity name and try to handle it. - FM
2514 */
2515 HTChunkTerminate(string);
2516 #ifdef USE_PRETTYSRC
2517 entity_string = string->data;
2518 #endif
2519 /* S/390 -- gil -- 1039 */
2520 /* CTRACE((tfp, "%s: %d: %s\n", __FILE__, __LINE__, string->data)); */
2521 if (!strcmp(string->data, "zwnj") &&
2522 (!context->element_stack ||
2523 (context->element_stack->tag &&
2524 context->element_stack->tag->contents == SGML_MIXED))) {
2525 /*
2526 * Handle zwnj (8204) as <WBR>. - FM
2527 */
2528 char temp[8];
2529
2530 CTRACE((tfp,
2531 "SGML_character: Handling 'zwnj' entity as 'WBR' element.\n"));
2532
2533 if (c != ';') {
2534 sprintf(temp, "<WBR>%c", c);
2535 } else {
2536 sprintf(temp, "<WBR>");
2537 }
2538 if (context->recover == NULL) {
2539 StrAllocCopy(context->recover, temp);
2540 context->recover_index = 0;
2541 } else {
2542 StrAllocCat(context->recover, temp);
2543 }
2544 string->size = 0;
2545 context->state = S_text;
2546 break;
2547 } else {
2548 handle_entity(context, '\0');
2549 }
2550 string->size = 0;
2551 context->state = S_text;
2552 /*
2553 * Don't eat the terminator if we didn't find the entity name and
2554 * therefore sent the raw string via handle_entity(), or if the
2555 * terminator is not the "standard" semi-colon for HTML. - FM
2556 */
2557 #ifdef USE_PRETTYSRC
2558 if (psrc_view && FoundEntity && c == ';') {
2559 PSRCSTART(entity);
2560 PUTC(c);
2561 PSRCSTOP(entity);
2562 }
2563 #endif
2564 if (!FoundEntity || c != ';')
2565 goto top1;
2566 }
2567 break;
2568
2569 /*
2570 * Check for a numeric entity.
2571 */
2572 case S_cro:
2573 if (TOASCII(unsign_c) < 127 && TOLOWER(UCH(c)) == 'x') { /* S/390 -- gil -- 1060 */
2574 context->isHex = TRUE;
2575 context->state = S_incro;
2576 } else if (TOASCII(unsign_c) < 127 && isdigit(UCH(c))) {
2577 /*
2578 * Accept only valid ASCII digits. - FM
2579 */
2580 HTChunkPutc(string, c); /* accumulate a character NUMBER */
2581 context->isHex = FALSE;
2582 context->state = S_incro;
2583 } else if (string->size == 0) {
2584 /*
2585 * No 'x' or digit following the "&#" so recover them and recycle
2586 * the character. - FM
2587 */
2588 #ifdef USE_PRETTYSRC
2589 if (psrc_view)
2590 PSRCSTART(badseq);
2591 #endif
2592 PUTC('&');
2593 PUTC('#');
2594 #ifdef USE_PRETTYSRC
2595 if (psrc_view)
2596 PSRCSTOP(badseq);
2597 #endif
2598 context->state = S_text;
2599 goto top1;
2600 }
2601 break;
2602
2603 /*
2604 * Handle a numeric entity.
2605 */
2606 case S_incro:
2607 /* S/390 -- gil -- 1075 */
2608 if ((TOASCII(unsign_c) < 127) &&
2609 (context->isHex
2610 ? isxdigit(UCH(c))
2611 : isdigit(UCH(c)))) {
2612 /*
2613 * Accept only valid hex or ASCII digits. - FM
2614 */
2615 HTChunkPutc(string, c); /* accumulate a character NUMBER */
2616 } else if (string->size == 0) {
2617 /*
2618 * No hex digit following the "&#x" so recover them and recycle the
2619 * character. - FM
2620 */
2621 #ifdef USE_PRETTYSRC
2622 if (psrc_view)
2623 PSRCSTART(badseq);
2624 #endif
2625 PUTS("&#x");
2626 #ifdef USE_PRETTYSRC
2627 if (psrc_view)
2628 PSRCSTOP(badseq);
2629 #endif
2630 context->isHex = FALSE;
2631 context->state = S_text;
2632 goto top1;
2633 } else {
2634 /*
2635 * Terminate the numeric entity and try to handle it. - FM
2636 */
2637 UCode_t code;
2638 int i;
2639
2640 HTChunkTerminate(string);
2641 #ifdef USE_PRETTYSRC
2642 entity_string = string->data;
2643 #endif
2644 if (UCScanCode(&code, string->data, context->isHex)) {
2645
2646 /* =============== work in ASCII below here =============== S/390 -- gil -- 1092 */
2647 if (AssumeCP1252(context)) {
2648 code = LYcp1252ToUnicode(code);
2649 }
2650 /*
2651 * Check for special values. - FM
2652 */
2653 if ((code == 8204) &&
2654 (!context->element_stack ||
2655 (context->element_stack->tag &&
2656 context->element_stack->tag->contents == SGML_MIXED))) {
2657 /*
2658 * Handle zwnj (8204) as <WBR>. - FM
2659 */
2660 char temp[8];
2661
2662 CTRACE((tfp,
2663 "SGML_character: Handling '8204' (zwnj) reference as 'WBR' element.\n"));
2664
2665 /*
2666 * Include the terminator if it is not the standard
2667 * semi-colon. - FM
2668 */
2669 if (c != ';') {
2670 sprintf(temp, "<WBR>%c", c);
2671 } else {
2672 sprintf(temp, "<WBR>");
2673 }
2674 /*
2675 * Add the replacement string to the recover buffer for
2676 * processing. - FM
2677 */
2678 if (context->recover == NULL) {
2679 StrAllocCopy(context->recover, temp);
2680 context->recover_index = 0;
2681 } else {
2682 StrAllocCat(context->recover, temp);
2683 }
2684 string->size = 0;
2685 context->isHex = FALSE;
2686 context->state = S_text;
2687 break;
2688 } else if (put_special_unicodes(context, code)) {
2689 /*
2690 * We handled the value as a special character, so recycle
2691 * the terminator or break. - FM
2692 */
2693 #ifdef USE_PRETTYSRC
2694 if (psrc_view) {
2695 PSRCSTART(entity);
2696 PUTS((context->isHex ? "&#x" : "&#"));
2697 PUTS(entity_string);
2698 if (c == ';')
2699 PUTC(';');
2700 PSRCSTOP(entity);
2701 }
2702 #endif
2703 string->size = 0;
2704 context->isHex = FALSE;
2705 context->state = S_text;
2706 if (c != ';')
2707 goto top1;
2708 break;
2709 }
2710 /*
2711 * Seek a translation from the chartrans tables.
2712 */
2713 if ((uck = UCTransUniChar(code,
2714 context->outUCLYhndl)) >= 32 &&
2715 uck < 256 &&
2716 (uck < 127 ||
2717 uck >= LYlowest_eightbit[context->outUCLYhndl])) {
2718 #ifdef USE_PRETTYSRC
2719 if (!psrc_view) {
2720 #endif
2721 PUTC(FROMASCII((char) uck));
2722 #ifdef USE_PRETTYSRC
2723 } else {
2724 put_pretty_number(context);
2725 }
2726 #endif
2727 } else if ((uck == -4 ||
2728 (context->T.repl_translated_C0 &&
2729 uck > 0 && uck < 32)) &&
2730 /*
2731 * Not found; look for replacement string.
2732 */
2733 (uck = UCTransUniCharStr(replace_buf, 60, code,
2734 context->outUCLYhndl,
2735 0) >= 0)) {
2736 #ifdef USE_PRETTYSRC
2737 if (psrc_view) {
2738 put_pretty_number(context);
2739 } else
2740 #endif
2741 PUTS(replace_buf);
2742 /*
2743 * If we're displaying UTF-8, try that now. - FM
2744 */
2745 } else if (context->T.output_utf8 && PUTUTF8(code)) {
2746 ; /* do nothing more */
2747 /*
2748 * Ignore 8205 (zwj), 8206 (lrm), and 8207 (rlm), if we get
2749 * to here. - FM
2750 */
2751 } else if (code == 8205 ||
2752 code == 8206 ||
2753 code == 8207) {
2754 if (TRACE) {
2755 string->size--;
2756 LYStrNCpy(replace_buf,
2757 string->data,
2758 (string->size < 64 ? string->size : 63));
2759 fprintf(tfp,
2760 "SGML_character: Ignoring '%s%s'.\n",
2761 (context->isHex ? "&#x" : "&#"),
2762 replace_buf);
2763 }
2764 #ifdef USE_PRETTYSRC
2765 if (psrc_view) {
2766 PSRCSTART(badseq);
2767 PUTS((context->isHex ? "&#x" : "&#"));
2768 PUTS(entity_string);
2769 if (c == ';')
2770 PUTC(';');
2771 PSRCSTOP(badseq);
2772 }
2773 #endif
2774 string->size = 0;
2775 context->isHex = FALSE;
2776 context->state = S_text;
2777 if (c != ';')
2778 goto top1;
2779 break;
2780 /*
2781 * Show the numeric entity if we get to here and the value:
2782 * (1) Is greater than 255 (but use ASCII characters for
2783 * spaces or dashes).
2784 * (2) Is less than 32, and not valid or we don't have
2785 * HTCJK set.
2786 * (3) Is 127 and we don't have HTPassHighCtrlRaw or HTCJK
2787 * set.
2788 * (4) Is 128 - 159 and we don't have HTPassHighCtrlNum
2789 * set.
2790 * - FM
2791 */
2792 } else if ((code > 255) ||
2793 (code < ' ' && /* S/390 -- gil -- 1140 */
2794 code != '\t' && code != '\n' && code != '\r' &&
2795 !IS_CJK_TTY) ||
2796 (TOASCII(code) == 127 &&
2797 !(HTPassHighCtrlRaw || IS_CJK_TTY)) ||
2798 (TOASCII(code) > 127 && code < 160 &&
2799 !HTPassHighCtrlNum)) {
2800 /*
2801 * Unhandled or illegal value. Recover the "&#" or "&#x"
2802 * and digit(s), and recycle the terminator. - FM
2803 */
2804 #ifdef USE_PRETTYSRC
2805 if (psrc_view) {
2806 PSRCSTART(badseq);
2807 }
2808 #endif
2809 if (context->isHex) {
2810 PUTS("&#x");
2811 context->isHex = FALSE;
2812 } else {
2813 PUTS("&#");
2814 }
2815 string->size--;
2816 for (i = 0; i < string->size; i++) /* recover */
2817 PUTC(string->data[i]);
2818 #ifdef USE_PRETTYSRC
2819 if (psrc_view) {
2820 PSRCSTOP(badseq);
2821 }
2822 #endif
2823 string->size = 0;
2824 context->isHex = FALSE;
2825 context->state = S_text;
2826 goto top1;
2827 } else if (TOASCII(code) < 161 || /* S/390 -- gil -- 1162 */
2828 HTPassEightBitNum ||
2829 IncludesLatin1Enc) {
2830 /*
2831 * No conversion needed. - FM
2832 */
2833 #ifdef USE_PRETTYSRC
2834 if (psrc_view) {
2835 put_pretty_number(context);
2836 } else
2837 #endif
2838 PUTC(FROMASCII((char) code));
2839 } else {
2840 /*
2841 * Handle as named entity. - FM
2842 */
2843 code -= 160;
2844 EntityName = HTMLGetEntityName(code);
2845 if (EntityName && EntityName[0] != '\0') {
2846 string->size = 0;
2847 HTChunkPuts(string, EntityName);
2848 HTChunkTerminate(string);
2849 handle_entity(context, '\0');
2850 /*
2851 * Add a semi-colon if something went wrong and
2852 * handle_entity() sent the string. - FM
2853 */
2854 if (!FoundEntity) {
2855 PUTC(';');
2856 }
2857 } else {
2858 /*
2859 * Our conversion failed, so recover the "&#" and
2860 * digit(s), and recycle the terminator. - FM
2861 */
2862 #ifdef USE_PRETTYSRC
2863 if (psrc_view)
2864 PSRCSTART(badseq);
2865 #endif
2866 if (context->isHex) {
2867 PUTS("&#x");
2868 context->isHex = FALSE;
2869 } else {
2870 PUTS("&#");
2871 }
2872 string->size--;
2873 for (i = 0; i < string->size; i++) /* recover */
2874 PUTC(string->data[i]);
2875 #ifdef USE_PRETTYSRC
2876 if (psrc_view)
2877 PSRCSTOP(badseq);
2878 #endif
2879 string->size = 0;
2880 context->isHex = FALSE;
2881 context->state = S_text;
2882 goto top1;
2883 }
2884 }
2885 /*
2886 * If we get to here, we succeeded. Hoorah!!! - FM
2887 */
2888 string->size = 0;
2889 context->isHex = FALSE;
2890 context->state = S_text;
2891 /*
2892 * Don't eat the terminator if it's not the "standard"
2893 * semi-colon for HTML. - FM
2894 */
2895 if (c != ';') {
2896 goto top1;
2897 }
2898 } else {
2899 /*
2900 * Not an entity, and don't know why not, so add the terminator
2901 * to the string, output the "&#" or "&#x", and process the
2902 * string via the recover element. - FM
2903 */
2904 string->size--;
2905 HTChunkPutc(string, c);
2906 HTChunkTerminate(string);
2907 #ifdef USE_PRETTYSRC
2908 if (psrc_view)
2909 PSRCSTART(badseq);
2910 #endif
2911 if (context->isHex) {
2912 PUTS("&#x");
2913 context->isHex = FALSE;
2914 } else {
2915 PUTS("&#");
2916 }
2917 #ifdef USE_PRETTYSRC
2918 if (psrc_view)
2919 PSRCSTOP(badseq);
2920 #endif
2921 if (context->recover == NULL) {
2922 StrAllocCopy(context->recover, string->data);
2923 context->recover_index = 0;
2924 } else {
2925 StrAllocCat(context->recover, string->data);
2926 }
2927 string->size = 0;
2928 context->isHex = FALSE;
2929 context->state = S_text;
2930 break;
2931 }
2932 }
2933 break;
2934
2935 /*
2936 * Tag
2937 */
2938 case S_tag: /* new tag */
2939 if (TOASCII(unsign_c) < 127 && (string->size ? /* S/390 -- gil -- 1179 */
2940 IsNmChar(c) : IsNmStart(c))) {
2941 /*
2942 * Add valid ASCII character. - FM
2943 */
2944 HTChunkPutc(string, c);
2945 } else if (c == '!' && !string->size) { /* <! */
2946 /*
2947 * Terminate and set up for possible comment, identifier,
2948 * declaration, or marked section. - FM
2949 */
2950 context->state = S_exclamation;
2951 context->lead_exclamation = TRUE;
2952 context->doctype_bracket = FALSE;
2953 context->first_bracket = FALSE;
2954 HTChunkPutc(string, c);
2955 break;
2956 } else if (!string->size &&
2957 (TOASCII(unsign_c) <= 160 && /* S/390 -- gil -- 1196 */
2958 (c != '/' && c != '?' && c != '_' && c != ':'))) {
2959 /*
2960 * '<' must be followed by an ASCII letter to be a valid start tag.
2961 * Here it isn't, nor do we have a '/' for an end tag, nor one of
2962 * some other characters with a special meaning for SGML or which
2963 * are likely to be legal Name Start characters in XML or some
2964 * other extension. So recover the '<' and following character as
2965 * data. - FM & KW
2966 */
2967 context->state = S_text;
2968 #ifdef USE_PRETTYSRC
2969 if (psrc_view)
2970 PSRCSTART(badseq);
2971 #endif
2972 PUTC('<');
2973 #ifdef USE_PRETTYSRC
2974 if (psrc_view)
2975 PSRCSTOP(badseq);
2976 #endif
2977 goto top1;
2978 } else { /* End of tag name */
2979 /*
2980 * Try to handle tag. - FM
2981 */
2982 HTTag *t;
2983
2984 if (c == '/') {
2985 if (string->size == 0) {
2986 context->state = S_end;
2987 break;
2988 }
2989 CTRACE((tfp, "SGML: `<%.*s/' found!\n", string->size, string->data));
2990 }
2991 HTChunkTerminate(string);
2992
2993 t = SGMLFindTag(dtd, string->data);
2994 if (t == context->unknown_tag &&
2995 ((c == ':' &&
2996 string->size == 4 && 0 == strcasecomp(string->data, "URL")) ||
2997 (string->size > 4 && 0 == strncasecomp(string->data, "URL:", 4)))) {
2998 /*
2999 * Treat <URL: as text rather than a junk tag, so we display
3000 * it and the URL (Lynxism 8-). - FM
3001 */
3002 #ifdef USE_PRETTYSRC
3003 if (psrc_view)
3004 PSRCSTART(badseq);
3005 #endif
3006 PUTC('<');
3007 PUTS(string->data); /* recover */
3008 PUTC(c);
3009 #ifdef USE_PRETTYSRC
3010 if (psrc_view)
3011 PSRCSTOP(badseq);
3012 #endif
3013 CTRACE((tfp, "SGML: Treating <%s%c as text\n",
3014 string->data, c));
3015 string->size = 0;
3016 context->state = S_text;
3017 break;
3018 }
3019 if (c == '/' && t) {
3020 /*
3021 * Element name was ended by '/'. Remember the tag that ended
3022 * thusly, we'll interpret this as either an indication of an
3023 * empty element (if '>' follows directly) or do some
3024 * SGMLshortref-ish treatment. - kw
3025 */
3026 context->slashedtag = t;
3027 }
3028 if (!t) {
3029 if (c == '?' && string->size <= 1) {
3030 CTRACE((tfp, "SGML: Found PI, looking for '>'\n"));
3031 #ifdef USE_PRETTYSRC
3032 if (psrc_view) {
3033 PSRCSTART(abracket);
3034 PUTS("<?");
3035 PSRCSTOP(abracket);
3036 }
3037 #endif
3038 string->size = 0;
3039 context->state = S_pi;
3040 HTChunkPutc(string, c);
3041 break;
3042 }
3043 CTRACE((tfp, "SGML: *** Invalid element %s\n",
3044 string->data));
3045
3046 #ifdef USE_PRETTYSRC
3047 if (psrc_view) {
3048 PSRCSTART(abracket);
3049 PUTC('<');
3050 PSRCSTOP(abracket);
3051 PSRCSTART(badtag);
3052 transform_tag(context, string);
3053 PUTS(string->data);
3054 if (c == '>') {
3055 PSRCSTOP(badtag);
3056 PSRCSTART(abracket);
3057 PUTC('>');
3058 PSRCSTOP(abracket);
3059 } else {
3060 PUTC(c);
3061 }
3062 }
3063 #endif
3064 context->state = (c == '>') ? S_text : S_junk_tag;
3065 break;
3066 } else if (t == context->unknown_tag) {
3067 CTRACE((tfp, "SGML: *** Unknown element \"%s\"\n",
3068 string->data));
3069 /*
3070 * Fall through and treat like valid tag for attribute parsing.
3071 * - KW
3072 */
3073
3074 }
3075 context->current_tag = t;
3076
3077 #ifdef USE_PRETTYSRC
3078 if (psrc_view) {
3079 PSRCSTART(abracket);
3080 PUTC('<');
3081 PSRCSTOP(abracket);
3082 if (t != context->unknown_tag)
3083 PSRCSTART(tag);
3084 else
3085 PSRCSTART(badtag);
3086 transform_tag(context, string);
3087 PUTS(string->data);
3088 if (t != context->unknown_tag)
3089 PSRCSTOP(tag);
3090 else
3091 PSRCSTOP(badtag);
3092 }
3093 if (!psrc_view) /*don't waste time */
3094 #endif
3095 {
3096 /*
3097 * Clear out attributes.
3098 */
3099 memset((void *) context->present, 0, sizeof(BOOL) *
3100 (unsigned) (context->current_tag->number_of_attributes));
3101 }
3102
3103 string->size = 0;
3104 context->current_attribute_number = INVALID;
3105 #ifdef USE_PRETTYSRC
3106 if (psrc_view) {
3107 if (c == '>' || c == '<' || (c == '/' && context->slashedtag)) {
3108 if (c != '<') {
3109 PSRCSTART(abracket);
3110 PUTC(c);
3111 PSRCSTOP(abracket);
3112 context->state = (c == '>') ? S_text : S_tagname_slash;
3113 } else {
3114 context->state = S_tag;
3115 }
3116 } else {
3117 if (!WHITE(c))
3118 PUTC(c);
3119 context->state = S_tag_gap;
3120 }
3121 } else
3122 #endif
3123 if (c == '>' || c == '<' || (c == '/' && context->slashedtag)) {
3124 if (context->current_tag->name)
3125 start_element(context);
3126 context->state = (c == '>') ? S_text :
3127 (c == '<') ? S_tag : S_tagname_slash;
3128 } else {
3129 context->state = S_tag_gap;
3130 }
3131 }
3132 break;
3133
3134 case S_exclamation:
3135 if (context->lead_exclamation && c == '-') {
3136 /*
3137 * Set up for possible comment. - FM
3138 */
3139 context->lead_exclamation = FALSE;
3140 context->first_dash = TRUE;
3141 HTChunkPutc(string, c);
3142 break;
3143 }
3144 if (context->lead_exclamation && c == '[') {
3145 /*
3146 * Set up for possible marked section. - FM
3147 */
3148 context->lead_exclamation = FALSE;
3149 context->first_bracket = TRUE;
3150 context->second_bracket = FALSE;
3151 HTChunkPutc(string, c);
3152 context->state = S_marked;
3153 break;
3154 }
3155 if (context->first_dash && c == '-') {
3156 /*
3157 * Set up to handle comment. - FM
3158 */
3159 context->lead_exclamation = FALSE;
3160 context->first_dash = FALSE;
3161 context->end_comment = FALSE;
3162 HTChunkPutc(string, c);
3163 context->state = S_comment;
3164 break;
3165 }
3166 context->lead_exclamation = FALSE;
3167 context->first_dash = FALSE;
3168 if (c == '>') {
3169 /*
3170 * Try to handle identifier. - FM
3171 */
3172 HTChunkTerminate(string);
3173 #ifdef USE_PRETTYSRC
3174 if (psrc_view) {
3175 PSRCSTART(sgmlspecial);
3176 PUTC('<');
3177 PUTS(string->data);
3178 PUTC('>');
3179 PSRCSTOP(sgmlspecial);
3180 } else
3181 #endif
3182 handle_identifier(context);
3183 string->size = 0;
3184 context->state = S_text;
3185 break;
3186 }
3187 if (WHITE(c)) {
3188 if (string->size == 8 &&
3189 !strncasecomp(string->data, "!DOCTYPE", 8)) {
3190 /*
3191 * Set up for DOCTYPE declaration. - FM
3192 */
3193 HTChunkPutc(string, c);
3194 context->doctype_bracket = FALSE;
3195 context->state = S_doctype;
3196 break;
3197 }
3198 if (string->size == 7 &&
3199 !strncasecomp(string->data, "!ENTITY", 7)) {
3200 /*
3201 * Set up for ENTITY declaration. - FM
3202 */
3203 HTChunkPutc(string, c);
3204 context->first_dash = FALSE;
3205 context->end_comment = TRUE;
3206 context->state = S_sgmlent;
3207 break;
3208 }
3209 if (string->size == 8 &&
3210 !strncasecomp(string->data, "!ELEMENT", 8)) {
3211 /*
3212 * Set up for ELEMENT declaration. - FM
3213 */
3214 HTChunkPutc(string, c);
3215 context->first_dash = FALSE;
3216 context->end_comment = TRUE;
3217 context->state = S_sgmlele;
3218 break;
3219 }
3220 if (string->size == 8 &&
3221 !strncasecomp(string->data, "!ATTLIST", 8)) {
3222 /*
3223 * Set up for ATTLIST declaration. - FM
3224 */
3225 HTChunkPutc(string, c);
3226 context->first_dash = FALSE;
3227 context->end_comment = TRUE;
3228 context->state = S_sgmlatt;
3229 break;
3230 }
3231 }
3232 HTChunkPutc(string, c);
3233 break;
3234
3235 case S_comment: /* Expecting comment. - FM */
3236 if (historical_comments) {
3237 /*
3238 * Any '>' terminates. - FM
3239 */
3240 if (c == '>') {
3241 HTChunkTerminate(string);
3242 #ifdef USE_PRETTYSRC
3243 if (psrc_view) {
3244 PSRCSTART(comm);
3245 PUTC('<');
3246 PUTS_TR(string->data);
3247 PUTC('>');
3248 PSRCSTOP(comm);
3249 } else
3250 #endif
3251 handle_comment(context);
3252 string->size = 0;
3253 context->end_comment = FALSE;
3254 context->first_dash = FALSE;
3255 context->state = S_text;
3256 break;
3257 }
3258 goto S_comment_put_c;
3259 }
3260 if (!context->first_dash && c == '-') {
3261 HTChunkPutc(string, c);
3262 context->first_dash = TRUE;
3263 break;
3264 }
3265 if (context->first_dash && c == '-') {
3266 HTChunkPutc(string, c);
3267 context->first_dash = FALSE;
3268 if (!context->end_comment)
3269 context->end_comment = TRUE;
3270 else if (!minimal_comments)
3271 /*
3272 * Validly treat '--' pairs as successive comments (for
3273 * minimal, any "--WHITE>" terminates). - FM
3274 */
3275 context->end_comment = FALSE;
3276 break;
3277 }
3278 if (context->end_comment && c == '>') {
3279 /*
3280 * Terminate and handle the comment. - FM
3281 */
3282 HTChunkTerminate(string);
3283 #ifdef USE_PRETTYSRC
3284 if (psrc_view) {
3285 PSRCSTART(comm);
3286 PUTC('<');
3287 PUTS_TR(string->data);
3288 PUTC('>');
3289 PSRCSTOP(comm);
3290 } else
3291 #endif
3292 handle_comment(context);
3293 string->size = 0;
3294 context->end_comment = FALSE;
3295 context->first_dash = FALSE;
3296 context->state = S_text;
3297 break;
3298 }
3299 context->first_dash = FALSE;
3300 if (context->end_comment && !isspace(UCH(c)))
3301 context->end_comment = FALSE;
3302
3303 S_comment_put_c:
3304 if (context->T.decode_utf8 &&
3305 *context->utf_buf) {
3306 HTChunkPuts(string, context->utf_buf);
3307 context->utf_buf_p = context->utf_buf;
3308 *(context->utf_buf_p) = '\0';
3309 } else if (!IS_CJK_TTY &&
3310 (context->T.output_utf8 ||
3311 context->T.trans_from_uni)) {
3312 if (clong == 0xfffd && saved_char_in &&
3313 HTPassEightBitRaw &&
3314 saved_char_in >=
3315 LYlowest_eightbit[context->outUCLYhndl]) {
3316 HTChunkPutUtf8Char(string,
3317 (UCode_t) (0xf000 | saved_char_in));
3318 } else {
3319 HTChunkPutUtf8Char(string, clong);
3320 }
3321 } else if (saved_char_in && context->T.use_raw_char_in) {
3322 HTChunkPutc(string, saved_char_in);
3323 } else {
3324 HTChunkPutc(string, c);
3325 }
3326 break;
3327
3328 case S_doctype: /* Expecting DOCTYPE. - FM */
3329 if (context->doctype_bracket) {
3330 HTChunkPutc(string, c);
3331 if (c == ']')
3332 context->doctype_bracket = FALSE;
3333 break;
3334 }
3335 if (c == '[' && WHITE(string->data[string->size - 1])) {
3336 HTChunkPutc(string, c);
3337 context->doctype_bracket = TRUE;
3338 break;
3339 }
3340 if (c == '>') {
3341 HTChunkTerminate(string);
3342 #ifdef USE_PRETTYSRC
3343 if (psrc_view) {
3344 PSRCSTART(sgmlspecial);
3345 PUTC('<');
3346 PUTS(string->data);
3347 PUTC('>');
3348 PSRCSTOP(sgmlspecial);
3349 } else
3350 #endif
3351 handle_doctype(context);
3352 string->size = 0;
3353 context->state = S_text;
3354 break;
3355 }
3356 HTChunkPutc(string, c);
3357 break;
3358
3359 case S_marked: /* Expecting marked section. - FM */
3360 if (context->first_bracket && c == '[') {
3361 HTChunkPutc(string, c);
3362 context->first_bracket = FALSE;
3363 context->second_bracket = TRUE;
3364 break;
3365 }
3366 if (context->second_bracket && c == ']' &&
3367 string->data[string->size - 1] == ']') {
3368 HTChunkPutc(string, c);
3369 context->second_bracket = FALSE;
3370 break;
3371 }
3372 if (!context->second_bracket && c == '>') {
3373 HTChunkTerminate(string);
3374 #ifdef USE_PRETTYSRC
3375 if (psrc_view) {
3376 PSRCSTART(sgmlspecial);
3377 PUTC('<');
3378 PUTS(string->data);
3379 PUTC('>');
3380 PSRCSTOP(sgmlspecial);
3381 } else
3382 #endif
3383 handle_marked(context);
3384 string->size = 0;
3385 context->state = S_text;
3386 break;
3387 }
3388 HTChunkPutc(string, c);
3389 break;
3390
3391 case S_sgmlent: /* Expecting ENTITY. - FM */
3392 if (!context->first_dash && c == '-') {
3393 HTChunkPutc(string, c);
3394 context->first_dash = TRUE;
3395 break;
3396 }
3397 if (context->first_dash && c == '-') {
3398 HTChunkPutc(string, c);
3399 context->first_dash = FALSE;
3400 if (!context->end_comment)
3401 context->end_comment = TRUE;
3402 else
3403 context->end_comment = FALSE;
3404 break;
3405 }
3406 if (context->end_comment && c == '>') {
3407 HTChunkTerminate(string);
3408 #ifdef USE_PRETTYSRC
3409 if (psrc_view) {
3410 PSRCSTART(sgmlspecial);
3411 PUTC('<');
3412 PUTS(string->data);
3413 PUTC('>');
3414 PSRCSTOP(sgmlspecial);
3415 } else
3416 #endif
3417 handle_sgmlent(context);
3418 string->size = 0;
3419 context->end_comment = FALSE;
3420 context->first_dash = FALSE;
3421 context->state = S_text;
3422 break;
3423 }
3424 context->first_dash = FALSE;
3425 HTChunkPutc(string, c);
3426 break;
3427
3428 case S_sgmlele: /* Expecting ELEMENT. - FM */
3429 if (!context->first_dash && c == '-') {
3430 HTChunkPutc(string, c);
3431 context->first_dash = TRUE;
3432 break;
3433 }
3434 if (context->first_dash && c == '-') {
3435 HTChunkPutc(string, c);
3436 context->first_dash = FALSE;
3437 if (!context->end_comment)
3438 context->end_comment = TRUE;
3439 else
3440 context->end_comment = FALSE;
3441 break;
3442 }
3443 if (context->end_comment && c == '>') {
3444 HTChunkTerminate(string);
3445 #ifdef USE_PRETTYSRC
3446 if (psrc_view) {
3447 PSRCSTART(sgmlspecial);
3448 PUTC('<');
3449 PUTS(string->data);
3450 PUTC('>');
3451 PSRCSTOP(sgmlspecial);
3452 } else
3453 #endif
3454 handle_sgmlele(context);
3455 string->size = 0;
3456 context->end_comment = FALSE;
3457 context->first_dash = FALSE;
3458 context->state = S_text;
3459 break;
3460 }
3461 context->first_dash = FALSE;
3462 HTChunkPutc(string, c);
3463 break;
3464
3465 case S_sgmlatt: /* Expecting ATTLIST. - FM */
3466 if (!context->first_dash && c == '-') {
3467 HTChunkPutc(string, c);
3468 context->first_dash = TRUE;
3469 break;
3470 }
3471 if (context->first_dash && c == '-') {
3472 HTChunkPutc(string, c);
3473 context->first_dash = FALSE;
3474 if (!context->end_comment)
3475 context->end_comment = TRUE;
3476 else
3477 context->end_comment = FALSE;
3478 break;
3479 }
3480 if (context->end_comment && c == '>') {
3481 HTChunkTerminate(string);
3482 #ifdef USE_PRETTYSRC
3483 if (psrc_view) {
3484 PSRCSTART(sgmlspecial);
3485 PUTC('<');
3486 PUTS(string->data);
3487 PUTC('>');
3488 PSRCSTOP(sgmlspecial);
3489 } else
3490 #endif
3491 handle_sgmlatt(context);
3492 string->size = 0;
3493 context->end_comment = FALSE;
3494 context->first_dash = FALSE;
3495 context->state = S_text;
3496 break;
3497 }
3498 context->first_dash = FALSE;
3499 HTChunkPutc(string, c);
3500 break;
3501
3502 case S_tag_gap: /* Expecting attribute or '>' */
3503 if (WHITE(c)) {
3504 /* PUTC(c); - no, done as special case */
3505 break; /* Gap between attributes */
3506 }
3507 if (c == '>') { /* End of tag */
3508 #ifdef USE_PRETTYSRC
3509 if (!psrc_view)
3510 #endif
3511 if (context->current_tag->name)
3512 start_element(context);
3513 #ifdef USE_PRETTYSRC
3514 if (psrc_view) {
3515 PSRCSTART(abracket);
3516 PUTC('>');
3517 PSRCSTOP(abracket);
3518 }
3519 #endif
3520 context->state = S_text;
3521 break;
3522 }
3523 HTChunkPutc(string, c);
3524 context->state = S_attr; /* Get attribute */
3525 break;
3526
3527 /* accumulating value */
3528 case S_attr:
3529 if (WHITE(c) || (c == '>') || (c == '=')) { /* End of word */
3530 if ((c == '>')
3531 && (string->size == 1)
3532 && (string->data[0] == '/')) {
3533 if (context->extended_html
3534 && ignore_when_empty(context->current_tag)) {
3535 discard_empty(context);
3536 }
3537 } else {
3538 HTChunkTerminate(string);
3539 handle_attribute_name(context, string->data);
3540 }
3541 #ifdef USE_PRETTYSRC
3542 if (!psrc_view) {
3543 #endif
3544 string->size = 0;
3545 if (c == '>') { /* End of tag */
3546 if (context->current_tag->name)
3547 start_element(context);
3548 context->state = S_text;
3549 break;
3550 }
3551 #ifdef USE_PRETTYSRC
3552 } else {
3553 PUTC(' ');
3554 if (context->current_attribute_number == INVALID)
3555 PSRCSTART(badattr);
3556 else
3557 PSRCSTART(attrib);
3558 if (attrname_transform != 1) {
3559 if (attrname_transform == 0)
3560 LYLowerCase(string->data);
3561 else
3562 LYUpperCase(string->data);
3563 }
3564 PUTS(string->data);
3565 if (c == '=' || WHITE(c))
3566 PUTC(c);
3567 if (c == '=' || c == '>') {
3568 if (context->current_attribute_number == INVALID) {
3569 PSRCSTOP(badattr);
3570 } else {
3571 PSRCSTOP(attrib);
3572 }
3573 }
3574 if (c == '>') {
3575 PSRCSTART(abracket);
3576 PUTC('>');
3577 PSRCSTOP(abracket);
3578 context->state = S_text;
3579 break;
3580 }
3581 string->size = 0;
3582 }
3583 #endif
3584 context->state = (c == '=' ? S_equals : S_attr_gap);
3585 } else {
3586 HTChunkPutc(string, c);
3587 }
3588 break;
3589
3590 case S_attr_gap: /* Expecting attribute or '=' or '>' */
3591 if (WHITE(c)) {
3592 PRETTYSRC_PUTC(c);
3593 break; /* Gap after attribute */
3594 }
3595 if (c == '>') { /* End of tag */
3596 #ifdef USE_PRETTYSRC
3597 if (psrc_view) {
3598 if (context->current_attribute_number == INVALID) {
3599 PSRCSTOP(badattr);
3600 } else {
3601 PSRCSTOP(attrib);
3602 }
3603 PSRCSTART(abracket);
3604 PUTC('>');
3605 PSRCSTOP(abracket);
3606 } else
3607 #endif
3608 if (context->current_tag->name)
3609 start_element(context);
3610 context->state = S_text;
3611 break;
3612 } else if (c == '=') {
3613 #ifdef USE_PRETTYSRC
3614 if (psrc_view) {
3615 PUTC('=');
3616 if (context->current_attribute_number == INVALID) {
3617 PSRCSTOP(badattr);
3618 } else {
3619 PSRCSTOP(attrib);
3620 }
3621 }
3622 #endif
3623 context->state = S_equals;
3624 break;
3625 }
3626 HTChunkPutc(string, c);
3627 context->state = S_attr; /* Get next attribute */
3628 break;
3629
3630 case S_equals: /* After attr = */
3631 if (WHITE(c)) {
3632 PRETTYSRC_PUTC(c);
3633 break; /* Before attribute value */
3634 }
3635 if (c == '>') { /* End of tag */
3636 CTRACE((tfp, "SGML: found = but no value\n"));
3637 #ifdef USE_PRETTYSRC
3638 if (psrc_view) {
3639 PSRCSTART(abracket);
3640 PUTC('>');
3641 PSRCSTOP(abracket);
3642 } else
3643 #endif
3644 if (context->current_tag->name)
3645 start_element(context);
3646 context->state = S_text;
3647 break;
3648
3649 } else if (c == '\'') {
3650 #ifdef USE_PRETTYSRC
3651 if (psrc_view) {
3652 PSRCSTART(attrval);
3653 PUTC(c);
3654 }
3655 #endif
3656 context->state = S_squoted;
3657 break;
3658
3659 } else if (c == '"') {
3660 #ifdef USE_PRETTYSRC
3661 if (psrc_view) {
3662 PSRCSTART(attrval);
3663 PUTC(c);
3664 }
3665 #endif
3666 context->state = S_dquoted;
3667 break;
3668 }
3669 #ifdef USE_PRETTYSRC
3670 if (psrc_view)
3671 PSRCSTART(attrval);
3672 #endif
3673 context->state = S_value;
3674 /* no break! fall through to S_value and process current `c` */
3675
3676 case S_value:
3677 if (WHITE(c) || (c == '>')) { /* End of word */
3678 HTChunkTerminate(string);
3679 #ifdef USE_PRETTYSRC
3680 if (!end_if_prettysrc(context, string, 0))
3681 #endif
3682 {
3683 #ifdef CJK_EX /* Quick hack. - JH7AYN */
3684 if (IS_CJK_TTY) {
3685 if (string->data[0] == '$') {
3686 if (string->data[1] == 'B' || string->data[1] == '@') {
3687 char *jis_buf = 0;
3688
3689 HTSprintf0(&jis_buf, "\033%s", string->data);
3690 TO_EUC((const unsigned char *) jis_buf,
3691 (unsigned char *) string->data);
3692 FREE(jis_buf);
3693 }
3694 }
3695 }
3696 #endif
3697 handle_attribute_value(context, string->data);
3698 }
3699 string->size = 0;
3700 if (c == '>') { /* End of tag */
3701 #ifdef USE_PRETTYSRC
3702 if (psrc_view) {
3703 PSRCSTART(abracket);
3704 PUTC('>');
3705 PSRCSTOP(abracket);
3706 } else
3707 #endif
3708 if (context->current_tag->name)
3709 start_element(context);
3710 context->state = S_text;
3711 break;
3712 } else
3713 context->state = S_tag_gap;
3714 } else if (context->T.decode_utf8 &&
3715 *context->utf_buf) {
3716 HTChunkPuts(string, context->utf_buf);
3717 context->utf_buf_p = context->utf_buf;
3718 *(context->utf_buf_p) = '\0';
3719 } else if (!IS_CJK_TTY &&
3720 (context->T.output_utf8 ||
3721 context->T.trans_from_uni)) {
3722 if (clong == 0xfffd && saved_char_in &&
3723 HTPassEightBitRaw &&
3724 saved_char_in >=
3725 LYlowest_eightbit[context->outUCLYhndl]) {
3726 HTChunkPutUtf8Char(string,
3727 (UCode_t) (0xf000 | saved_char_in));
3728 } else {
3729 HTChunkPutUtf8Char(string, clong);
3730 }
3731 } else if (saved_char_in && context->T.use_raw_char_in) {
3732 HTChunkPutc(string, saved_char_in);
3733 } else {
3734 HTChunkPutc(string, c);
3735 }
3736 break;
3737
3738 case S_squoted: /* Quoted attribute value */
3739 if (c == '\'') { /* End of attribute value */
3740 HTChunkTerminate(string);
3741 #ifdef USE_PRETTYSRC
3742 if (!end_if_prettysrc(context, string, '\''))
3743 #endif
3744 handle_attribute_value(context, string->data);
3745 string->size = 0;
3746 context->state = S_tag_gap;
3747 } else if (TOASCII(c) == '\033') { /* S/390 -- gil -- 1213 */
3748 /*
3749 * Setting up for possible single quotes in CJK escape sequences.
3750 * - Takuya ASADA (asada@three-a.co.jp)
3751 */
3752 context->state = S_esc_sq;
3753 HTChunkPutc(string, c);
3754 } else if (context->T.decode_utf8 &&
3755 *context->utf_buf) {
3756 HTChunkPuts(string, context->utf_buf);
3757 context->utf_buf_p = context->utf_buf;
3758 *(context->utf_buf_p) = '\0';
3759 } else if (!IS_CJK_TTY &&
3760 (context->T.output_utf8 ||
3761 context->T.trans_from_uni)) {
3762 if (clong == 0xfffd && saved_char_in &&
3763 HTPassEightBitRaw &&
3764 saved_char_in >=
3765 LYlowest_eightbit[context->outUCLYhndl]) {
3766 HTChunkPutUtf8Char(string,
3767 (UCode_t) (0xf000 | saved_char_in));
3768 } else {
3769 HTChunkPutUtf8Char(string, clong);
3770 }
3771 } else if (saved_char_in && context->T.use_raw_char_in) {
3772 HTChunkPutc(string, saved_char_in);
3773 } else {
3774 HTChunkPutc(string, c);
3775 }
3776 break;
3777
3778 case S_dquoted: /* Quoted attribute value */
3779 if (c == '"' || /* Valid end of attribute value */
3780 (soft_dquotes && /* If emulating old Netscape bug, treat '>' */
3781 c == '>')) { /* as a co-terminator of dquoted and tag */
3782 HTChunkTerminate(string);
3783 #ifdef USE_PRETTYSRC
3784 if (!end_if_prettysrc(context, string, (char) c))
3785 #endif
3786 handle_attribute_value(context, string->data);
3787 string->size = 0;
3788 context->state = S_tag_gap;
3789 if (c == '>') /* We emulated the Netscape bug, so we go */
3790 goto top1; /* back and treat it as the tag terminator */
3791 } else if (TOASCII(c) == '\033') { /* S/390 -- gil -- 1230 */
3792 /*
3793 * Setting up for possible double quotes in CJK escape sequences.
3794 * - Takuya ASADA (asada@three-a.co.jp)
3795 */
3796 context->state = S_esc_dq;
3797 HTChunkPutc(string, c);
3798 } else if (context->T.decode_utf8 &&
3799 *context->utf_buf) {
3800 HTChunkPuts(string, context->utf_buf);
3801 context->utf_buf_p = context->utf_buf;
3802 *(context->utf_buf_p) = '\0';
3803 } else if (!IS_CJK_TTY &&
3804 (context->T.output_utf8 ||
3805 context->T.trans_from_uni)) {
3806 if (clong == 0xfffd && saved_char_in &&
3807 HTPassEightBitRaw &&
3808 saved_char_in >=
3809 LYlowest_eightbit[context->outUCLYhndl]) {
3810 HTChunkPutUtf8Char(string,
3811 (UCode_t) (0xf000 | saved_char_in));
3812 } else {
3813 HTChunkPutUtf8Char(string, clong);
3814 }
3815 } else if (saved_char_in && context->T.use_raw_char_in) {
3816 HTChunkPutc(string, saved_char_in);
3817 } else {
3818 HTChunkPutc(string, c);
3819 }
3820 break;
3821
3822 case S_end: /* </ */
3823 if (TOASCII(unsign_c) < 127 && (string->size ? /* S/390 -- gil -- 1247 */
3824 IsNmChar(c) : IsNmStart(c))) {
3825 HTChunkPutc(string, c);
3826 } else { /* End of end tag name */
3827 HTTag *t = 0;
3828
3829 #ifdef USE_PRETTYSRC
3830 BOOL psrc_tagname_processed = FALSE;
3831 #endif
3832
3833 HTChunkTerminate(string);
3834 if (!*string->data) { /* Empty end tag */
3835 if (context->element_stack)
3836 t = context->element_stack->tag;
3837 } else {
3838 t = SGMLFindTag(dtd, string->data);
3839 }
3840 if (!t || t == context->unknown_tag) {
3841 CTRACE((tfp, "Unknown end tag </%s>\n", string->data));
3842 #ifdef USE_PRETTYSRC
3843 if (psrc_view) {
3844 PSRCSTART(abracket);
3845 PUTS("</");
3846 PSRCSTOP(abracket);
3847 PSRCSTART(badtag);
3848 transform_tag(context, string);
3849 PUTS(string->data);
3850 if (c != '>') {
3851 PUTC(c);
3852 } else {
3853 PSRCSTOP(badtag);
3854 PSRCSTART(abracket);
3855 PUTC('>');
3856 PSRCSTOP(abracket);
3857 }
3858 psrc_tagname_processed = TRUE;
3859 }
3860 } else if (psrc_view) {
3861 #endif
3862 } else {
3863 BOOL tag_OK = (BOOL) (c == '>' || WHITE(c));
3864 HTMLElement e = TAGNUM_OF_TAGP(t);
3865 int branch = 2; /* it can be 0,1,2 */
3866
3867 context->current_tag = t;
3868 if (HAS_ALT_TAGNUM(TAGNUM_OF_TAGP(t)) &&
3869 context->element_stack &&
3870 ALT_TAGP(t) == context->element_stack->tag)
3871 context->element_stack->tag = NORMAL_TAGP(context->element_stack->tag);
3872
3873 if (tag_OK && Old_DTD) {
3874 switch (e) {
3875 case HTML_DD:
3876 case HTML_DT:
3877 case HTML_LI:
3878 case HTML_LH:
3879 case HTML_TD:
3880 case HTML_TH:
3881 case HTML_TR:
3882 case HTML_THEAD:
3883 case HTML_TFOOT:
3884 case HTML_TBODY:
3885 case HTML_COLGROUP:
3886 branch = 0;
3887 break;
3888
3889 case HTML_A:
3890 case HTML_B:
3891 case HTML_BLINK:
3892 case HTML_CITE:
3893 case HTML_EM:
3894 case HTML_FONT:
3895 case HTML_FORM:
3896 case HTML_I:
3897 case HTML_P:
3898 case HTML_STRONG:
3899 case HTML_TT:
3900 case HTML_U:
3901 branch = 1;
3902 break;
3903 default:
3904 break;
3905 }
3906 }
3907
3908 /*
3909 * Just handle ALL end tags normally :-) - kw
3910 */
3911 if (!Old_DTD) {
3912 end_element(context, context->current_tag);
3913 } else if (tag_OK && (branch == 0)) {
3914 /*
3915 * Don't treat these end tags as invalid, nor act on them.
3916 * - FM
3917 */
3918 CTRACE((tfp, "SGML: `</%s%c' found! Ignoring it.\n",
3919 string->data, c));
3920 string->size = 0;
3921 context->current_attribute_number = INVALID;
3922 if (c != '>') {
3923 context->state = S_junk_tag;
3924 } else {
3925 context->current_tag = NULL;
3926 context->state = S_text;
3927 }
3928 break;
3929 } else if (tag_OK && (branch == 1)) {
3930 /*
3931 * Handle end tags for container elements declared as
3932 * SGML_EMPTY to prevent "expected tag substitution" but
3933 * still processed via HTML_end_element() in HTML.c with
3934 * checks there to avoid throwing the HTML.c stack out of
3935 * whack (Ugh, what a hack! 8-). - FM
3936 */
3937 if (context->inSELECT) {
3938 /*
3939 * We are in a SELECT block. - FM
3940 */
3941 if (strcasecomp(string->data, "FORM")) {
3942 /*
3943 * It is not at FORM end tag, so ignore it. - FM
3944 */
3945 CTRACE((tfp,
3946 "SGML: ***Ignoring end tag </%s> in SELECT block.\n",
3947 string->data));
3948 } else {
3949 /*
3950 * End the SELECT block and then handle the FORM
3951 * end tag. - FM
3952 */
3953 CTRACE((tfp,
3954 "SGML: ***Faking SELECT end tag before </%s> end tag.\n",
3955 string->data));
3956 end_element(context,
3957 SGMLFindTag(context->dtd, "SELECT"));
3958 CTRACE((tfp, "SGML: End </%s>\n", string->data));
3959
3960 #ifdef USE_PRETTYSRC
3961 if (!psrc_view) /* Don't actually call if viewing psrc - kw */
3962 #endif
3963 (*context->actions->end_element)
3964 (context->target,
3965 (int) TAGNUM_OF_TAGP(context->current_tag),
3966 &context->include);
3967 }
3968 } else if (!strcasecomp(string->data, "P")) {
3969 /*
3970 * Treat a P end tag like a P start tag (Ugh, what a
3971 * hack! 8-). - FM
3972 */
3973 CTRACE((tfp,
3974 "SGML: `</%s%c' found! Treating as '<%s%c'.\n",
3975 string->data, c, string->data, c));
3976 {
3977 int i;
3978
3979 for (i = 0;
3980 i < context->current_tag->number_of_attributes;
3981 i++) {
3982 context->present[i] = NO;
3983 }
3984 }
3985 if (context->current_tag->name)
3986 start_element(context);
3987 } else {
3988 CTRACE((tfp, "SGML: End </%s>\n", string->data));
3989
3990 #ifdef USE_PRETTYSRC
3991 if (!psrc_view) /* Don't actually call if viewing psrc - kw */
3992 #endif
3993 (*context->actions->end_element)
3994 (context->target,
3995 (int) TAGNUM_OF_TAGP(context->current_tag),
3996 &context->include);
3997 }
3998 string->size = 0;
3999 context->current_attribute_number = INVALID;
4000 if (c != '>') {
4001 context->state = S_junk_tag;
4002 } else {
4003 context->current_tag = NULL;
4004 context->state = S_text;
4005 }
4006 break;
4007 } else {
4008 /*
4009 * Handle all other end tags normally. - FM
4010 */
4011 end_element(context, context->current_tag);
4012 }
4013 }
4014
4015 #ifdef USE_PRETTYSRC
4016 if (psrc_view && !psrc_tagname_processed) {
4017 PSRCSTART(abracket);
4018 PUTS("</");
4019 PSRCSTOP(abracket);
4020 PSRCSTART(tag);
4021 if (tagname_transform != 1) {
4022 if (tagname_transform == 0)
4023 LYLowerCase(string->data);
4024 else
4025 LYUpperCase(string->data);
4026 }
4027 PUTS(string->data);
4028 PSRCSTOP(tag);
4029 if (c != '>') {
4030 PSRCSTART(badtag);
4031 PUTC(c);
4032 } else {
4033 PSRCSTART(abracket);
4034 PUTC('>');
4035 PSRCSTOP(abracket);
4036 }
4037 }
4038 #endif
4039
4040 string->size = 0;
4041 context->current_attribute_number = INVALID;
4042 if (c != '>') {
4043 if (!WHITE(c))
4044 CTRACE((tfp, "SGML: `</%s%c' found!\n", string->data, c));
4045 context->state = S_junk_tag;
4046 } else {
4047 context->current_tag = NULL;
4048 context->state = S_text;
4049 }
4050 }
4051 break;
4052
4053 case S_esc: /* Expecting '$'or '(' following CJK ESC. */
4054 if (c == '$') {
4055 context->state = S_dollar;
4056 } else if (c == '(') {
4057 context->state = S_paren;
4058 } else {
4059 context->state = S_text;
4060 }
4061 PUTC(c);
4062 break;
4063
4064 case S_dollar: /* Expecting '@', 'B', 'A' or '(' after CJK "ESC$". */
4065 if (c == '@' || c == 'B' || c == 'A') {
4066 context->state = S_nonascii_text;
4067 } else if (c == '(') {
4068 context->state = S_dollar_paren;
4069 }
4070 PUTC(c);
4071 break;
4072
4073 case S_dollar_paren: /* Expecting 'C' after CJK "ESC$(". */
4074 if (c == 'C') {
4075 context->state = S_nonascii_text;
4076 } else {
4077 context->state = S_text;
4078 }
4079 PUTC(c);
4080 break;
4081
4082 case S_paren: /* Expecting 'B', 'J', 'T' or 'I' after CJK "ESC(". */
4083 if (c == 'B' || c == 'J' || c == 'T') {
4084 context->state = S_text;
4085 } else if (c == 'I') {
4086 context->state = S_nonascii_text;
4087 } else {
4088 context->state = S_text;
4089 }
4090 PUTC(c);
4091 break;
4092
4093 case S_nonascii_text: /* Expecting CJK ESC after non-ASCII text. */
4094 if (TOASCII(c) == '\033') { /* S/390 -- gil -- 1264 */
4095 context->state = S_esc;
4096 }
4097 PUTC(c);
4098 if (c < 32)
4099 context->state = S_text;
4100 break;
4101
4102 case S_esc_sq: /* Expecting '$'or '(' following CJK ESC. */
4103 if (c == '$') {
4104 context->state = S_dollar_sq;
4105 } else if (c == '(') {
4106 context->state = S_paren_sq;
4107 } else {
4108 context->state = S_squoted;
4109 }
4110 HTChunkPutc(string, c);
4111 break;
4112
4113 case S_dollar_sq: /* Expecting '@', 'B', 'A' or '(' after CJK "ESC$". */
4114 if (c == '@' || c == 'B' || c == 'A') {
4115 context->state = S_nonascii_text_sq;
4116 } else if (c == '(') {
4117 context->state = S_dollar_paren_sq;
4118 }
4119 HTChunkPutc(string, c);
4120 break;
4121
4122 case S_dollar_paren_sq: /* Expecting 'C' after CJK "ESC$(". */
4123 if (c == 'C') {
4124 context->state = S_nonascii_text_sq;
4125 } else {
4126 context->state = S_squoted;
4127 }
4128 HTChunkPutc(string, c);
4129 break;
4130
4131 case S_paren_sq: /* Expecting 'B', 'J', 'T' or 'I' after CJK "ESC(". */
4132 if (c == 'B' || c == 'J' || c == 'T') {
4133 context->state = S_squoted;
4134 } else if (c == 'I') {
4135 context->state = S_nonascii_text_sq;
4136 } else {
4137 context->state = S_squoted;
4138 }
4139 HTChunkPutc(string, c);
4140 break;
4141
4142 case S_nonascii_text_sq: /* Expecting CJK ESC after non-ASCII text. */
4143 if (TOASCII(c) == '\033') { /* S/390 -- gil -- 1281 */
4144 context->state = S_esc_sq;
4145 }
4146 HTChunkPutc(string, c);
4147 break;
4148
4149 case S_esc_dq: /* Expecting '$'or '(' following CJK ESC. */
4150 if (c == '$') {
4151 context->state = S_dollar_dq;
4152 } else if (c == '(') {
4153 context->state = S_paren_dq;
4154 } else {
4155 context->state = S_dquoted;
4156 }
4157 HTChunkPutc(string, c);
4158 break;
4159
4160 case S_dollar_dq: /* Expecting '@', 'B', 'A' or '(' after CJK "ESC$". */
4161 if (c == '@' || c == 'B' || c == 'A') {
4162 context->state = S_nonascii_text_dq;
4163 } else if (c == '(') {
4164 context->state = S_dollar_paren_dq;
4165 }
4166 HTChunkPutc(string, c);
4167 break;
4168
4169 case S_dollar_paren_dq: /* Expecting 'C' after CJK "ESC$(". */
4170 if (c == 'C') {
4171 context->state = S_nonascii_text_dq;
4172 } else {
4173 context->state = S_dquoted;
4174 }
4175 HTChunkPutc(string, c);
4176 break;
4177
4178 case S_paren_dq: /* Expecting 'B', 'J', 'T' or 'I' after CJK "ESC(". */
4179 if (c == 'B' || c == 'J' || c == 'T') {
4180 context->state = S_dquoted;
4181 } else if (c == 'I') {
4182 context->state = S_nonascii_text_dq;
4183 } else {
4184 context->state = S_dquoted;
4185 }
4186 HTChunkPutc(string, c);
4187 break;
4188
4189 case S_nonascii_text_dq: /* Expecting CJK ESC after non-ASCII text. */
4190 if (TOASCII(c) == '\033') { /* S/390 -- gil -- 1298 */
4191 context->state = S_esc_dq;
4192 }
4193 HTChunkPutc(string, c);
4194 break;
4195
4196 case S_junk_tag:
4197 case S_pi:
4198 if (c == '>') {
4199 HTChunkTerminate(string);
4200 #ifdef USE_PRETTYSRC
4201 if (psrc_view) {
4202 if (context->state == S_junk_tag) {
4203 PSRCSTOP(badtag);
4204 }
4205 PSRCSTART(abracket);
4206 PUTC('>');
4207 PSRCSTOP(abracket);
4208 }
4209 #endif
4210 if (context->state == S_pi)
4211 handle_processing_instruction(context);
4212 string->size = 0;
4213 context->current_tag = NULL;
4214 context->state = S_text;
4215 } else {
4216 HTChunkPutc(string, c);
4217 #ifdef USE_PRETTYSRC
4218 if (psrc_view) {
4219 PUTC(c);
4220 }
4221 #endif
4222 }
4223
4224 } /* switch on context->state */
4225 CTRACE2(TRACE_SGML, (tfp, "SGML after %s|%.*s|%c|\n",
4226 state_name(context->state),
4227 string->size,
4228 NonNull(string->data),
4229 UCH(c)));
4230
4231 after_switch:
4232 /*
4233 * Check whether an external function has added anything to the include
4234 * buffer. If so, move the new stuff to the beginning of active_include.
4235 * - kw
4236 */
4237 if (context->include != NULL) {
4238 if (context->include[0] == '\0') {
4239 FREE(context->include);
4240 } else {
4241 if (context->active_include &&
4242 context->active_include[context->include_index] != '\0')
4243 StrAllocCat(context->include,
4244 context->active_include + context->include_index);
4245 FREE(context->active_include);
4246 context->active_include = context->include;
4247 context->include_index = 0;
4248 context->include = NULL;
4249 }
4250 }
4251
4252 /*
4253 * Check whether we've added anything to the recover buffer. - FM
4254 */
4255 if (context->recover != NULL) {
4256 if (context->recover[context->recover_index] == '\0') {
4257 FREE(context->recover);
4258 context->recover_index = 0;
4259 } else {
4260 c = UCH(context->recover[context->recover_index]);
4261 context->recover_index++;
4262 goto top;
4263 }
4264 }
4265
4266 /*
4267 * Check whether an external function had added anything to the include
4268 * buffer; it should now be in active_include. - FM / kw
4269 */
4270 if (context->active_include != NULL) {
4271 if (context->active_include[context->include_index] == '\0') {
4272 FREE(context->active_include);
4273 context->include_index = 0;
4274 } else {
4275 if (context->current_tag_charset == UTF8_handle ||
4276 context->T.trans_from_uni) {
4277 /*
4278 * If it looks like we would have fed UTF-8 to the next
4279 * processing stage, assume that whatever we were fed back is
4280 * in UTF-8 form, too. This won't be always true for all uses
4281 * of the include buffer, but it's a start. - kw
4282 */
4283 char *puni = context->active_include + context->include_index;
4284
4285 c = UCH(*puni);
4286 clong = UCGetUniFromUtf8String(&puni);
4287 if (clong < 256 && clong >= 0) {
4288 c = UCH((clong & 0xff));
4289 }
4290 saved_char_in = '\0';
4291 context->include_index = (int) (puni
4292 - context->active_include
4293 + 1);
4294 goto top1;
4295 } else {
4296 /*
4297 * Otherwise assume no UTF-8 - do charset-naive processing and
4298 * hope for the best. - kw
4299 */
4300 c = UCH(context->active_include[context->include_index]);
4301 context->include_index++;
4302 goto top;
4303 }
4304 }
4305 }
4306
4307 /*
4308 * Check whether an external function has added anything to the csi buffer.
4309 * - FM
4310 */
4311 if (context->csi != NULL) {
4312 if (context->csi[context->csi_index] == '\0') {
4313 FREE(context->csi);
4314 context->csi_index = 0;
4315 } else {
4316 c = UCH(context->csi[context->csi_index]);
4317 context->csi_index++;
4318 goto top;
4319 }
4320 }
4321 } /* SGML_character */
4322
/*
 * React to a byte-order mark found at the start of the input.
 *
 * chndl is handed to HTAnchor_setUCInfoStage() for the stream's node_anchor,
 * using stage UCT_STAGE_PARSER and setter UCT_SETBY_PARSER; afterwards
 * change_chartrans_handling() is called on the stream.
 */
static void InferUtfFromBom(HTStream *context, int chndl)
{
    HTAnchor_setUCInfoStage(context->node_anchor,
                            chndl,
                            UCT_STAGE_PARSER,
                            UCT_SETBY_PARSER);

    change_chartrans_handling(context);
}
4330
4331 /*
4332 * Avoid rewrite of SGML_character() to handle hypothetical case of UTF-16
4333 * webpages, by pretending that the data is UTF-8.
4334 */
SGML_widechar(HTStream * context,int ch)4335 static void SGML_widechar(HTStream *context, int ch)
4336 {
4337 if (!UCPutUtf8_charstring(context, SGML_character, (UCode_t) ch)) {
4338 SGML_character(context, ch);
4339 }
4340 }
4341
/*
 * Feed a block of bytes to the parser.
 *
 *	context	the parser stream
 *	str	start of the data
 *	l	number of bytes at str
 *
 * While sgml_offset is still 0 the block is checked for a leading byte-order
 * mark.  A UTF-8 BOM (EF BB BF) or a UCS-2 BOM (FF FE little-endian, FE FF
 * big-endian) is skipped and reported through InferUtfFromBom(); the UCS-2
 * variants additionally set context->T.ucs_mode to -1 or 1.
 *
 * The data is then delivered according to context->T.ucs_mode: in the two
 * UCS-2 modes byte pairs are combined into one code unit for SGML_widechar(),
 * otherwise each byte goes to SGML_character().
 *
 * In the UCS-2 modes only complete byte pairs are consumed.  A dangling last
 * byte is traced and ignored; reading its missing partner would access memory
 * beyond the caller's buffer.
 */
static void SGML_write(HTStream *context, const char *str, int l)
{
    const char *p;
    const char *e = str + l;

    if (sgml_offset == 0) {
        /*
         * NOTE(review): the length tests below are strict, so a block that
         * holds nothing but the BOM itself is not recognized -- confirm
         * whether that is intended.
         */
        if (l > 3
            && !MemCmp(str, "\357\273\277", 3)) {
            CTRACE((tfp, "SGML_write found UTF-8 BOM\n"));
            InferUtfFromBom(context, UTF8_handle);
            str += 3;
        } else if (l > 2) {
            if (!MemCmp(str, "\377\376", 2)) {
                CTRACE((tfp, "SGML_write found UCS-2 LE BOM\n"));
                InferUtfFromBom(context, UTF8_handle);
                str += 2;
                context->T.ucs_mode = -1;
            } else if (!MemCmp(str, "\376\377", 2)) {
                CTRACE((tfp, "SGML_write found UCS-2 BE BOM\n"));
                InferUtfFromBom(context, UTF8_handle);
                str += 2;
                context->T.ucs_mode = 1;
            }
        }
    }

    switch (context->T.ucs_mode) {
    case -1:                    /* low-order byte comes first */
        for (p = str; (e - p) >= 2; p += 2)
            SGML_widechar(context, (UCH(p[1]) << 8) | UCH(p[0]));
        break;
    case 1:                     /* high-order byte comes first */
        for (p = str; (e - p) >= 2; p += 2)
            SGML_widechar(context, (UCH(p[0]) << 8) | UCH(p[1]));
        break;
    default:
        for (p = str; p < e; p++)
            SGML_character(context, *p);
        break;
    }

    /*
     * Only reachable in the UCS-2 modes, when the byte count was odd.
     */
    if (p < e) {
        CTRACE((tfp, "SGML_write ignoring odd trailing byte of UCS-2 data\n"));
    }
}
4382
/*
 * Feed a NUL-terminated string to the parser: its strlen() is taken and the
 * bytes are forwarded to SGML_write().
 */
static void SGML_string(HTStream *context, const char *str)
{
    int length = (int) strlen(str);

    SGML_write(context, str, length);
}
4387
4388 /*_______________________________________________________________________
4389 */
4390
/*	Structured Object Class
 *	-----------------------
 *
 * Method table of the SGML parser stream.  SGML_new() installs its address
 * as the isa member of every parser it creates.
 *
 * The initializer is positional: the entries must stay in the member order
 * of HTStreamClass, which is declared outside this file.
 * NOTE(review): presumably name, free, abort, put-character, put-string and
 * put-block, in that order -- confirm against the HTStreamClass declaration.
 */
const HTStreamClass SGMLParser =
{
    "SGMLParser",
    SGML_free,
    SGML_abort,
    SGML_character,
    SGML_string,
    SGML_write,
};
4403
4404 /* Create SGML Engine
4405 * ------------------
4406 *
4407 * On entry,
4408 * dtd represents the DTD, along with
4409 * actions is the sink for the data as a set of routines.
4410 *
4411 */
4412
SGML_new(const SGML_dtd * dtd,HTParentAnchor * anchor,HTStructured * target)4413 HTStream *SGML_new(const SGML_dtd * dtd,
4414 HTParentAnchor *anchor,
4415 HTStructured * target)
4416 {
4417 HTStream *context = typecalloc(struct _HTStream);
4418
4419 if (!context)
4420 outofmem(__FILE__, "SGML_begin");
4421
4422 assert(context != NULL);
4423
4424 context->isa = &SGMLParser;
4425 context->string = HTChunkCreate(128); /* Grow by this much */
4426 context->dtd = dtd;
4427 context->target = target;
4428 context->actions = (const HTStructuredClass *) (((HTStream *) target)->isa);
4429 /* Ugh: no OO */
4430 context->unknown_tag = &HTTag_unrecognized;
4431 context->current_tag = context->slashedtag = NULL;
4432 context->state = S_text;
4433 #ifdef CALLERDATA
4434 context->callerData = (void *) callerData;
4435 #endif /* CALLERDATA */
4436
4437 context->node_anchor = anchor; /* Could be NULL? */
4438 context->utf_buf_p = context->utf_buf;
4439 UCTransParams_clear(&context->T);
4440 context->inUCLYhndl = HTAnchor_getUCLYhndl(anchor,
4441 UCT_STAGE_PARSER);
4442 if (context->inUCLYhndl < 0) {
4443 HTAnchor_copyUCInfoStage(anchor,
4444 UCT_STAGE_PARSER,
4445 UCT_STAGE_MIME,
4446 -1);
4447 context->inUCLYhndl = HTAnchor_getUCLYhndl(anchor,
4448 UCT_STAGE_PARSER);
4449 }
4450 #ifdef CAN_SWITCH_DISPLAY_CHARSET /* Allow a switch to a more suitable display charset */
4451 else if (anchor->UCStages
4452 && anchor->UCStages->s[UCT_STAGE_PARSER].LYhndl >= 0
4453 && anchor->UCStages->s[UCT_STAGE_PARSER].LYhndl != current_char_set) {
4454 int o = anchor->UCStages->s[UCT_STAGE_PARSER].LYhndl;
4455
4456 anchor->UCStages->s[UCT_STAGE_PARSER].LYhndl = -1; /* Force reset */
4457 HTAnchor_resetUCInfoStage(anchor, o, UCT_STAGE_PARSER,
4458 /* Preserve change this: */
4459 anchor->UCStages->s[UCT_STAGE_PARSER].lock);
4460 }
4461 #endif
4462
4463 context->inUCI = HTAnchor_getUCInfoStage(anchor,
4464 UCT_STAGE_PARSER);
4465 set_chartrans_handling(context, anchor, -1);
4466
4467 context->recover = NULL;
4468 context->recover_index = 0;
4469 context->include = NULL;
4470 context->active_include = NULL;
4471 context->include_index = 0;
4472 context->url = NULL;
4473 context->csi = NULL;
4474 context->csi_index = 0;
4475
4476 #ifdef USE_PRETTYSRC
4477 if (psrc_view) {
4478 psrc_view = FALSE;
4479 mark_htext_as_source = TRUE;
4480 SGML_string(context,
4481 "<HTML><HEAD><TITLE>source</TITLE></HEAD><BODY><PRE>");
4482 psrc_view = TRUE;
4483 psrc_convert_string = FALSE;
4484 sgml_in_psrc_was_initialized = TRUE;
4485 }
4486 #endif
4487
4488 sgml_offset = 0;
4489 return context;
4490 }
4491
4492 /*
4493 * Return the offset within the document where we're parsing. This is used
4494 * to help identify anchors which shift around while reparsing.
4495 */
SGML_offset(void)4496 int SGML_offset(void)
4497 {
4498 int result = sgml_offset;
4499
4500 #ifdef USE_PRETTYSRC
4501 result += psrc_view;
4502 #endif
4503 return result;
4504 }
4505
4506 /* Asian character conversion functions
4507 * ====================================
4508 *
4509 * Added 24-Mar-96 by FM, based on:
4510 *
4511 ////////////////////////////////////////////////////////////////////////
4512 Copyright (c) 1993 Electrotechnical Laboratory (ETL)
4513
4514 Permission to use, copy, modify, and distribute this material
4515 for any purpose and without fee is hereby granted, provided
4516 that the above copyright notice and this permission notice
4517 appear in all copies, and that the name of ETL not be
4518 used in advertising or publicity pertaining to this
4519 material without the specific, prior written permission
4520 of an authorized representative of ETL.
4521 ETL MAKES NO REPRESENTATIONS ABOUT THE ACCURACY OR SUITABILITY
4522 OF THIS MATERIAL FOR ANY PURPOSE. IT IS PROVIDED "AS IS",
4523 WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES.
4524 /////////////////////////////////////////////////////////////////////////
4525 Content-Type: program/C; charset=US-ASCII
4526 Program: SJIS.c
4527 Author: Yutaka Sato <ysato@etl.go.jp>
4528 Description:
4529 History:
4530 930923 extracted from codeconv.c of cosmos
4531 ///////////////////////////////////////////////////////////////////////
4532 */
4533
/*
 * Switch consulted by SJIS_TO_EUC(): when nonzero, byte pairs accepted by
 * IS_SJIS() are converted; when zero every byte is copied through unchanged.
 * Nothing in the code visible here changes it from its initial value.
 */
static int TREAT_SJIS = 1;
4535
/*
 * Produce a two-byte code in *OHI / *OLO from the byte pair IHI, ILO.
 *
 * When IHI is 0x8E and ILO lies in 0xA1..0xDF, the result is read from a
 * 63-entry table indexed by (ILO - 0xA1); the comment beside each entry
 * gives its two bytes in hexadecimal.  Every other input pair is stored
 * unchanged (after UCH()).
 *
 * NOTE(review): going by the function name, the table maps JIS X 0201 kana
 * carried in EUC (0x8E prefix) onto JIS X 0208 EUC codes -- confirm against
 * the code charts.
 */
void JISx0201TO0208_EUC(unsigned IHI,
                        unsigned ILO,
                        unsigned char *OHI,
                        unsigned char *OLO)
{
    static const char *table[] =
    {
        "\241\243",             /* A1,A3 */
        "\241\326",             /* A1,D6 */
        "\241\327",             /* A1,D7 */
        "\241\242",             /* A1,A2 */
        "\241\246",             /* A1,A6 */
        "\245\362",             /* A5,F2 */
        "\245\241",             /* A5,A1 */
        "\245\243",             /* A5,A3 */
        "\245\245",             /* A5,A5 */
        "\245\247",             /* A5,A7 */
        "\245\251",             /* A5,A9 */
        "\245\343",             /* A5,E3 */
        "\245\345",             /* A5,E5 */
        "\245\347",             /* A5,E7 */
        "\245\303",             /* A5,C3 */
        "\241\274",             /* A1,BC */
        "\245\242",             /* A5,A2 */
        "\245\244",             /* A5,A4 */
        "\245\246",             /* A5,A6 */
        "\245\250",             /* A5,A8 */
        "\245\252",             /* A5,AA */
        "\245\253",             /* A5,AB */
        "\245\255",             /* A5,AD */
        "\245\257",             /* A5,AF */
        "\245\261",             /* A5,B1 */
        "\245\263",             /* A5,B3 */
        "\245\265",             /* A5,B5 */
        "\245\267",             /* A5,B7 */
        "\245\271",             /* A5,B9 */
        "\245\273",             /* A5,BB */
        "\245\275",             /* A5,BD */
        "\245\277",             /* A5,BF */
        "\245\301",             /* A5,C1 */
        "\245\304",             /* A5,C4 */
        "\245\306",             /* A5,C6 */
        "\245\310",             /* A5,C8 */
        "\245\312",             /* A5,CA */
        "\245\313",             /* A5,CB */
        "\245\314",             /* A5,CC */
        "\245\315",             /* A5,CD */
        "\245\316",             /* A5,CE */
        "\245\317",             /* A5,CF */
        "\245\322",             /* A5,D2 */
        "\245\325",             /* A5,D5 */
        "\245\330",             /* A5,D8 */
        "\245\333",             /* A5,DB */
        "\245\336",             /* A5,DE */
        "\245\337",             /* A5,DF */
        "\245\340",             /* A5,E0 */
        "\245\341",             /* A5,E1 */
        "\245\342",             /* A5,E2 */
        "\245\344",             /* A5,E4 */
        "\245\346",             /* A5,E6 */
        "\245\350",             /* A5,E8 */
        "\245\351",             /* A5,E9 */
        "\245\352",             /* A5,EA */
        "\245\353",             /* A5,EB */
        "\245\354",             /* A5,EC */
        "\245\355",             /* A5,ED */
        "\245\357",             /* A5,EF */
        "\245\363",             /* A5,F3 */
        "\241\253",             /* A1,AB */
        "\241\254"              /* A1,AC */
    };
    const char *pair;

    /* Anything outside the table's domain passes straight through. */
    if (IHI != 0x8E || ILO < 0xA1 || ILO > 0xDF) {
        *OHI = UCH(IHI);
        *OLO = UCH(ILO);
        return;
    }

    pair = table[ILO - 0xA1];
    *OHI = UCH(pair[0]);
    *OLO = UCH(pair[1]);
}
4616
/*
 * Return 1 if some byte of str with its high bit set, taken together with
 * the byte that follows it, is accepted by IS_SJIS(); return 0 otherwise.
 *
 * NOTE(review): IS_SJIS() is defined elsewhere and is handed is_sjis by
 * name, so it may update that flag between iterations -- confirm against
 * the macro definition.
 */
static int IS_SJIS_STR(const unsigned char *str)
{
    const unsigned char *cursor;
    int is_sjis = 0;

    for (cursor = str; *cursor != '\0'; cursor++) {
        unsigned char lead = cursor[0];
        unsigned char next;

        if ((lead & 0x80) == 0)
            continue;

        /* lead is nonzero, so cursor[1] is at worst the terminating NUL */
        next = cursor[1];
        if (IS_SJIS(lead, next, is_sjis))
            return 1;
    }
    return 0;
}
4631
/*
 * Convert one two-byte character, given as lead byte HI and trail byte LO,
 * and store the resulting pair in JCODE[0] and JCODE[1].  As the name says,
 * the input is taken to be Shift-JIS and the output JIS.
 *
 * The lead byte is rebased (by 0x71 for HI <= 0x9F, otherwise 0xB1), doubled
 * and incremented; the trail byte decides whether the result lands in that
 * row or the next one.  No validation of HI/LO is performed.
 *
 * Returns JCODE.
 */
unsigned char *SJIS_TO_JIS1(unsigned HI,
                            unsigned LO,
                            unsigned char *JCODE)
{
    unsigned lead_bias = (HI <= 0x9F) ? 0x71 : 0xB1;

    HI = UCH(HI - (unsigned) UCH(lead_bias));
    HI = UCH((HI << 1) + 1);

    /* close the gap left by the unused trail byte 0x7F */
    if (LO > 0x7F)
        LO--;

    if (LO < 0x9E) {
        LO = UCH(LO - UCH(0x1F));
    } else {
        LO = UCH(LO - UCH(0x7D));
        HI++;
    }

    JCODE[0] = UCH(HI);
    JCODE[1] = UCH(LO);
    return JCODE;
}
4650
/*
 * Convert one two-byte character, given as bytes HI and LO, and store the
 * resulting pair in SJCODE[0] and SJCODE[1].  As the name says, the input is
 * taken to be JIS and the output Shift-JIS.
 *
 * The trail byte is shifted by 0x1F when HI is odd and by 0x7D when it is
 * even, then bumped past 0x7F; the lead byte is halved relative to 0x21,
 * rebased at 0x81 and moved up by 0x40 if it ends above 0x9F.  No validation
 * of HI/LO is performed.
 *
 * Returns SJCODE.
 */
unsigned char *JIS_TO_SJIS1(unsigned HI,
                            unsigned LO,
                            unsigned char *SJCODE)
{
    unsigned trail_shift = (HI & 1) ? 0x1F : 0x7D;

    LO = UCH(LO + UCH(trail_shift));
    if (LO >= 0x7F)
        LO++;

    HI = UCH(((HI - 0x21) >> 1) + 0x81);
    if (HI > 0x9F)
        HI = UCH(HI + UCH(0x40));

    SJCODE[0] = UCH(HI);
    SJCODE[1] = UCH(LO);
    return SJCODE;
}
4669
/*
 * Convert one EUC two-byte character (HI, LO) and store the Shift-JIS pair
 * in SJCODE[0] and SJCODE[1].
 *
 * A pair whose first byte is 0x8E is first passed through
 * JISx0201TO0208_EUC(), and the pair it returns is what gets converted.
 * Previously that result was written to scratch variables and then ignored,
 * so such input was converted from the raw 0x8E-prefixed bytes instead.
 * JISx0201TO0208_SJIS() performs the same two steps and does use the
 * intermediate result.
 *
 * The high bit of each byte is cleared before JIS_TO_SJIS1() is applied.
 *
 * Returns SJCODE.
 */
unsigned char *EUC_TO_SJIS1(unsigned HI,
                            unsigned LO,
                            unsigned char *SJCODE)
{
    if (HI == 0x8E) {
        unsigned char mapped_hi = UCH(HI);
        unsigned char mapped_lo = UCH(LO);

        JISx0201TO0208_EUC(HI, LO, &mapped_hi, &mapped_lo);

        /* carry the remapped pair forward instead of discarding it */
        HI = mapped_hi;
        LO = mapped_lo;
    }
    JIS_TO_SJIS1(UCH(HI & 0x7F), UCH(LO & 0x7F), SJCODE);
    return SJCODE;
}
4685
/*
 * Map the single byte I to a two-byte code and store it in *OHI / *OLO.
 *
 * I is first looked up by JISx0201TO0208_EUC() with a 0x8E prefix; the pair
 * that comes back has its high bits cleared and is then run through
 * JIS_TO_SJIS1().  *OHI and *OLO serve as scratch space for the intermediate
 * pair before receiving the final result.
 */
void JISx0201TO0208_SJIS(unsigned I,
                         unsigned char *OHI,
                         unsigned char *OLO)
{
    unsigned char converted[2];
    unsigned jis_hi;
    unsigned jis_lo;

    JISx0201TO0208_EUC(0x8E, I, OHI, OLO);

    jis_hi = UCH(*OHI & 0x7F);
    jis_lo = UCH(*OLO & 0x7F);
    JIS_TO_SJIS1(jis_hi, jis_lo, converted);

    *OHI = converted[0];
    *OLO = converted[1];
}
4697
/*
 * Convert one two-byte character (HI, LO) with SJIS_TO_JIS1() and then set
 * the high bit of both resulting bytes, leaving the pair in data[0] and
 * data[1].
 *
 * Returns data.
 */
unsigned char *SJIS_TO_EUC1(unsigned HI,
                            unsigned LO,
                            unsigned char *data)
{
    int i;

    SJIS_TO_JIS1(HI, LO, data);
    for (i = 0; i < 2; i++)
        data[i] |= 0x80;
    return data;
}
4707
SJIS_TO_EUC(unsigned char * src,unsigned char * dst)4708 unsigned char *SJIS_TO_EUC(unsigned char *src,
4709 unsigned char *dst)
4710 {
4711 unsigned char hi, lo, *sp, *dp;
4712 int in_sjis = 0;
4713
4714 in_sjis = IS_SJIS_STR(src);
4715 for (sp = src, dp = dst; (hi = sp[0]) != '\0';) {
4716 lo = sp[1];
4717 if (TREAT_SJIS && IS_SJIS(hi, lo, in_sjis)) {
4718 SJIS_TO_JIS1(hi, lo, dp);
4719 dp[0] |= 0x80;
4720 dp[1] |= 0x80;
4721 dp += 2;
4722 sp += 2;
4723 } else
4724 *dp++ = *sp++;
4725 }
4726 *dp = 0;
4727 return dst;
4728 }
4729
/*
 * Copy the NUL-terminated string src to dst, converting each pair of
 * consecutive bytes that both have the high bit set with JIS_TO_SJIS1()
 * (after clearing those high bits).
 *
 * Bytes without the high bit are copied unchanged.  A high-bit byte that is
 * not followed by another high-bit byte is dropped from the output.
 * dst is NUL-terminated; its size is not checked.
 *
 * Returns dst.
 */
unsigned char *EUC_TO_SJIS(unsigned char *src,
                           unsigned char *dst)
{
    unsigned char *in = src;
    unsigned char *out = dst;

    while (*in != '\0') {
        if ((*in & 0x80) == 0) {
            *out++ = *in++;
        } else if ((in[1] & 0x80) != 0) {
            JIS_TO_SJIS1(UCH(in[0] & 0x7F), UCH(in[1] & 0x7F), out);
            out += 2;
            in += 2;
        } else {
            /* unpaired high-bit byte: skip it */
            in++;
        }
    }
    *out = 0;
    return dst;
}
4751
/*
 * Copy string b to a and yield a pointer to a's terminating NUL.  The
 * argument a is expanded more than once and is not parenthesised, so pass a
 * plain pointer variable.
 */
#define Strcpy(a,b)	(strcpy((char*)a,(const char*)b),&a[strlen((const char*)a)])

/*
 * Rewrite the NUL-terminated text in src into dst and return dst.
 *
 * Byte pairs accepted by IS_EUC() are written with their high bits cleared;
 * the string toK is emitted before the first pair of a run and the string
 * toA when the run is ended by a 7-bit byte or by the end of the input.
 * 7-bit bytes are copied unchanged.  A high-bit byte that IS_EUC() rejects
 * is copied unchanged, except that 0xA0 is dropped once an earlier such
 * byte has been seen.
 *
 * dst is not bounds-checked; the output can be longer than the input
 * because of the inserted toK/toA strings.
 */
unsigned char *EUC_TO_JIS(unsigned char *src,
                          unsigned char *dst,
                          const char *toK,
                          const char *toA)
{
    unsigned char in_run = 0;   /* nonzero between emitting toK and toA */
    unsigned char lead;
    unsigned char *in = src;
    unsigned char *out = dst;
    int is_JIS = 0;             /* count of rejected high-bit bytes */

    for (;;) {
        lead = *in++;
        if (lead == '\0')
            break;

        if ((lead & 0x80) == 0) {
            /* plain 7-bit byte: close any open run first */
            if (in_run) {
                in_run = UCH(~in_run);
                out = Strcpy(out, toA);
            }
            *out++ = lead;
            continue;
        }

        if (!IS_EUC(lead, *in)) {
            if (lead == 0xA0 && is_JIS)         /* ignore NBSP */
                continue;
            is_JIS++;
            *out++ = lead;
            continue;
        }

        if (!in_run) {
            in_run = UCH(~in_run);
            out = Strcpy(out, toK);
        }
        if (*in & 0x80) {
            *out++ = UCH(lead & ~0x80);
            *out++ = UCH(*in++ & ~0x80);
        }
    }

    if (in_run)
        out = Strcpy(out, toA);

    if (out)
        *out = 0;
    return dst;
}
4797
/* True when both c1 and c2 lie strictly between 0x20 and 0x7F. */
#define IS_JIS7(c1,c2)	(0x20<(c1)&&(c1)<0x7F && 0x20<(c2)&&(c2)<0x7F)
/*
 * 'N' - 0x40 and 'O' - 0x40; with an ASCII execution character set these
 * are 0x0E and 0x0F, the Shift Out and Shift In control codes.
 */
#define SO		('N'-0x40)
#define SI		('O'-0x40)

/*
 * Gate for the repairJIStoEUC() path in TO_EUC(): that path is taken only
 * when this is nonzero.  Nothing in the code visible here sets it.
 */
static int repair_JIS = 0;
4803
/*
 * Consume byte pairs from src, writing each pair that satisfies IS_JIS7()
 * to the buffer at *dstp with the high bit of both bytes set.
 *
 * Scanning stops successfully at the pair "(B" or "(J": *dstp is advanced
 * past the bytes written and the position just after that pair is returned.
 * If a pair fails IS_JIS7(), or the input ends first, NULL is returned and
 * *dstp is left as it was (bytes may already have been written beyond it).
 */
static const unsigned char *repairJIStoEUC(const unsigned char *src,
                                           unsigned char **dstp)
{
    const unsigned char *in = src;
    unsigned char *out = *dstp;

    for (;;) {
        unsigned char first = in[0];
        unsigned char second;

        if (first == '\0')
            break;
        second = in[1];
        if (second == '\0')
            break;
        in += 2;

        if (first == '(' && (second == 'B' || second == 'J')) {
            *dstp = out;
            return in;
        }
        if (!IS_JIS7(first, second))
            return NULL;

        *out++ = UCH(0x80 | first);
        *out++ = UCH(0x80 | second);
    }
    return NULL;
}
4827
/*
 * Convert the NUL-terminated text in jis to EUC form in euc and return euc.
 *
 * The input is scanned once; for each byte the following are tried in order:
 *  - 0x80 is dropped, and 0xA0 is dropped once is_JIS is nonzero.
 *  - With repair_JIS set and two-byte mode off, a bare TO_2BCODE followed by
 *    'B' or '@' (i.e. without a preceding ESC) is handed to
 *    repairJIStoEUC(); on success scanning resumes where it stopped.
 *  - ESC TO_2BCODE ('B' | '@') turns two-byte mode on (jis_stat = 0x80) and
 *    ESC TO_1BCODE ('B' | 'J' | 'H') turns it off; both sequences are
 *    consumed.  ESC TO_2BCODE or ESC TO_1BCODE followed by anything else,
 *    and ESC ',', only clear jis_stat; the ESC byte is then processed like
 *    any other byte.
 *  - A pair accepted by IS_SJIS() is converted with SJIS_TO_EUC1().
 *  - In two-byte mode, a pair satisfying IS_JIS7() is written with jis_stat
 *    OR-ed into both bytes; a newline leaves two-byte mode.
 *  - Otherwise the byte is copied, except SI/SO seen before any 8-bit byte.
 *
 * euc is not bounds-checked; TO_SJIS() and TO_JIS() pass a buffer of
 * strlen(jis) + 1 bytes.
 */
unsigned char *TO_EUC(const unsigned char *jis,
                      unsigned char *euc)
{
    const unsigned char *s;
    unsigned char c, jis_stat;
    unsigned char *d;
    int to1B, to2B;
    int in_sjis = 0;
    static int nje;             /* call counter; not read in this function */
    int n8bits;                 /* number of high-bit bytes processed so far */
    int is_JIS;                 /* bumped on each 2-byte escape and SJIS pair */

    nje++;
    n8bits = 0;
    s = jis;
    d = euc;
    jis_stat = 0;               /* 0x80 while in two-byte mode, else 0 */
    to2B = TO_2BCODE;
    to1B = TO_1BCODE;
    in_sjis = IS_SJIS_STR(jis);
    is_JIS = 0;

    while ((c = *s++) != '\0') {
        if (c == 0x80)
            continue;           /* ignore it */
        if (c == 0xA0 && is_JIS)
            continue;           /* ignore Non-breaking space */

        /* escape sequence that has lost its ESC byte: try to repair it */
        if (c == to2B && jis_stat == 0 && repair_JIS) {
            if (*s == 'B' || *s == '@') {
                const unsigned char *ts;

                if ((ts = repairJIStoEUC(s + 1, &d)) != NULL) {
                    s = ts;
                    continue;
                }
            }
        }
        if (c == CH_ESC) {
            if (*s == to2B) {
                if ((s[1] == 'B') || (s[1] == '@')) {
                    /* recognised switch to two-byte mode: consume it */
                    jis_stat = 0x80;
                    s += 2;
                    is_JIS++;
                    continue;
                }
                jis_stat = 0;
            } else if (*s == to1B) {
                jis_stat = 0;
                if ((s[1] == 'B') || (s[1] == 'J') || (s[1] == 'H')) {
                    /* recognised switch to one-byte mode: consume it */
                    s += 2;
                    continue;
                }
            } else if (*s == ',') {     /* MULE */
                jis_stat = 0;
            }
            /* unrecognised sequences fall through with c still CH_ESC */
        }
        if (c & 0x80)
            n8bits++;

        if (IS_SJIS(c, *s, in_sjis)) {
            SJIS_TO_EUC1(c, *s, d);
            d += 2;
            s++;
            is_JIS++;
        } else if (jis_stat) {
            if (c <= 0x20 || 0x7F <= c) {
                /* control, space or 8-bit byte: copy as is */
                *d++ = c;
                if (c == '\n')
                    jis_stat = 0;
            } else {
                if (IS_JIS7(c, *s)) {
                    *d++ = jis_stat | c;
                    *d++ = jis_stat | *s++;
                } else
                    *d++ = c;
            }
        } else {
            if (n8bits == 0 && (c == SI || c == SO)) {
                /* drop SI/SO while no 8-bit byte has been seen */
            } else {
                *d++ = c;
            }
        }
    }
    *d = 0;
    return euc;
}
4915
/* True for values that are at most 0x20 or equal to 0x7F. */
#define non94(ch) ((ch) <= 0x20 || (ch) == 0x7F)

/*
 * Check that every byte of the NUL-terminated string euc that has its high
 * bit set is the first of a pair whose second byte also has the high bit
 * set, and that neither byte of the pair, with the high bit removed, is
 * rejected by non94().
 *
 * Returns 1 if the whole string passes (including the empty string),
 * 0 at the first offending pair.
 */
static int is_EUC_JP(unsigned char *euc)
{
    unsigned char *scan = euc;

    while (*scan != '\0') {
        int lead = *scan;
        int trail;

        if ((lead & 0x80) == 0) {
            scan++;
            continue;
        }

        trail = scan[1] & 0xFF;
        if (!(trail & 0x80))
            return 0;           /* second byte lacks the high bit */
        if (non94(lead & 0x7F))
            return 0;           /* first byte outside the accepted range */
        if (non94(trail & 0x7F))
            return 0;           /* second byte outside the accepted range */

        scan += 2;
    }
    return 1;
}
4939
TO_SJIS(const unsigned char * arg,unsigned char * sjis)4940 void TO_SJIS(const unsigned char *arg,
4941 unsigned char *sjis)
4942 {
4943 unsigned char *euc;
4944
4945 euc = typeMallocn(unsigned char, strlen((const char *) arg) + 1);
4946
4947 #ifdef CJK_EX
4948 if (!euc)
4949 outofmem(__FILE__, "TO_SJIS");
4950 #endif
4951 TO_EUC(arg, euc);
4952 if (is_EUC_JP(euc))
4953 EUC_TO_SJIS(euc, sjis);
4954 else
4955 strcpy((char *) sjis, (const char *) arg);
4956 free(euc);
4957 }
4958
TO_JIS(const unsigned char * arg,unsigned char * jis)4959 void TO_JIS(const unsigned char *arg,
4960 unsigned char *jis)
4961 {
4962 unsigned char *euc;
4963
4964 if (arg[0] == 0) {
4965 jis[0] = 0;
4966 return;
4967 }
4968 euc = typeMallocn(unsigned char, strlen((const char *)arg) + 1);
4969 #ifdef CJK_EX
4970 if (!euc)
4971 outofmem(__FILE__, "TO_JIS");
4972 #endif
4973 TO_EUC(arg, euc);
4974 is_EUC_JP(euc);
4975 EUC_TO_JIS(euc, jis, TO_KANJI, TO_ASCII);
4976
4977 free(euc);
4978 }
4979