1 /*
2  * $LynxId: LYCharUtils.c,v 1.123 2013/06/04 20:42:47 tom Exp $
3  *
4  *  Functions associated with LYCharSets.c and the Lynx version of HTML.c - FM
5  *  ==========================================================================
6  */
7 #include <HTUtils.h>
8 #include <SGML.h>
9 
10 #define Lynx_HTML_Handler
11 #include <HTChunk.h>
12 #include <HText.h>
13 #include <HTStyle.h>
14 #include <HTMIME.h>
15 #include <HTML.h>
16 
17 #include <HTCJK.h>
18 #include <HTAtom.h>
19 #include <HTMLGen.h>
20 #include <HTParse.h>
21 #include <UCMap.h>
22 #include <UCDefs.h>
23 #include <UCAux.h>
24 
25 #include <LYGlobalDefs.h>
26 #include <LYCharUtils.h>
27 #include <LYCharSets.h>
28 
29 #include <HTAlert.h>
30 #include <HTForms.h>
31 #include <HTNestedList.h>
32 #include <GridText.h>
33 #include <LYStrings.h>
34 #include <LYUtils.h>
35 #include <LYMap.h>
36 #include <LYBookmark.h>
37 #include <LYCurses.h>
38 #include <LYCookie.h>
39 
40 #include <LYexit.h>
41 #include <LYLeaks.h>
42 
/*
 * Used for nested lists.  - FM
 *
 * Both are large negative sentinel values.  LYZero_OL_Counter() stores
 * OL_VOID in every OL_Counter slot when it resets the ordered-list state.
 */
int OL_CONTINUE = -29999;	/* flag for whether CONTINUE is set */
int OL_VOID = -29998;		/* flag for whether a count is set */
48 
49 /*
50  *  This function converts any ampersands in allocated
51  *  strings to "&amp;".  If isTITLE is TRUE, it also
52  *  converts any angle-brackets to "&lt;" or "&gt;". - FM
53  */
LYEntify(char ** str,int isTITLE)54 void LYEntify(char **str,
55 	      int isTITLE)
56 {
57     char *p = *str;
58     char *q = NULL, *cp = NULL;
59     int amps = 0, lts = 0, gts = 0;
60 
61 #ifdef CJK_EX
62     enum _state {
63 	S_text,
64 	S_esc,
65 	S_dollar,
66 	S_paren,
67 	S_nonascii_text,
68 	S_dollar_paren
69     } state = S_text;
70     int in_sjis = 0;
71 #endif
72 
73     if (isEmpty(p))
74 	return;
75 
76     /*
77      * Count the ampersands.  - FM
78      */
79     while ((*p != '\0') && (q = strchr(p, '&')) != NULL) {
80 	amps++;
81 	p = (q + 1);
82     }
83 
84     /*
85      * Count the left-angle-brackets, if needed.  - FM
86      */
87     if (isTITLE == TRUE) {
88 	p = *str;
89 	while ((*p != '\0') && (q = strchr(p, '<')) != NULL) {
90 	    lts++;
91 	    p = (q + 1);
92 	}
93     }
94 
95     /*
96      * Count the right-angle-brackets, if needed.  - FM
97      */
98     if (isTITLE == TRUE) {
99 	p = *str;
100 	while ((*p != '\0') && (q = strchr(p, '>')) != NULL) {
101 	    gts++;
102 	    p = (q + 1);
103 	}
104     }
105 
106     /*
107      * Check whether we need to convert anything.  - FM
108      */
109     if (amps == 0 && lts == 0 && gts == 0)
110 	return;
111 
112     /*
113      * Allocate space and convert.  - FM
114      */
115     q = typecallocn(char,
116 		    (strlen(*str)
117 		     + (unsigned)(4 * amps)
118 		     + (unsigned)(3 * lts)
119 		     + (unsigned)(3 * gts) + 1));
120     if ((cp = q) == NULL)
121 	outofmem(__FILE__, "LYEntify");
122 
123     assert(cp != NULL);
124     assert(q != NULL);
125 
126     for (p = *str; *p; p++) {
127 #ifdef CJK_EX
128 	if (IS_CJK_TTY) {
129 	    switch (state) {
130 	    case S_text:
131 		if (*p == '\033') {
132 		    state = S_esc;
133 		    *q++ = *p;
134 		    continue;
135 		}
136 		break;
137 
138 	    case S_esc:
139 		if (*p == '$') {
140 		    state = S_dollar;
141 		    *q++ = *p;
142 		    continue;
143 		} else if (*p == '(') {
144 		    state = S_paren;
145 		    *q++ = *p;
146 		    continue;
147 		} else {
148 		    state = S_text;
149 		    *q++ = *p;
150 		    continue;
151 		}
152 
153 	    case S_dollar:
154 		if (*p == '@' || *p == 'B' || *p == 'A') {
155 		    state = S_nonascii_text;
156 		    *q++ = *p;
157 		    continue;
158 		} else if (*p == '(') {
159 		    state = S_dollar_paren;
160 		    *q++ = *p;
161 		    continue;
162 		} else {
163 		    state = S_text;
164 		    *q++ = *p;
165 		    continue;
166 		}
167 
168 	    case S_dollar_paren:
169 		if (*p == 'C') {
170 		    state = S_nonascii_text;
171 		    *q++ = *p;
172 		    continue;
173 		} else {
174 		    state = S_text;
175 		    *q++ = *p;
176 		    continue;
177 		}
178 
179 	    case S_paren:
180 		if (*p == 'B' || *p == 'J' || *p == 'T') {
181 		    state = S_text;
182 		    *q++ = *p;
183 		    continue;
184 		} else if (*p == 'I') {
185 		    state = S_nonascii_text;
186 		    *q++ = *p;
187 		    continue;
188 		}
189 		/* FALLTHRU */
190 
191 	    case S_nonascii_text:
192 		if (*p == '\033')
193 		    state = S_esc;
194 		*q++ = *p;
195 		continue;
196 
197 	    default:
198 		break;
199 	    }
200 	    if (*(p + 1) != '\0' &&
201 		(IS_EUC(UCH(*p), UCH(*(p + 1))) ||
202 		 IS_SJIS(UCH(*p), UCH(*(p + 1)), in_sjis) ||
203 		 IS_BIG5(UCH(*p), UCH(*(p + 1))))) {
204 		*q++ = *p++;
205 		*q++ = *p;
206 		continue;
207 	    }
208 	}
209 #endif
210 	if (*p == '&') {
211 	    *q++ = '&';
212 	    *q++ = 'a';
213 	    *q++ = 'm';
214 	    *q++ = 'p';
215 	    *q++ = ';';
216 	} else if (isTITLE && *p == '<') {
217 	    *q++ = '&';
218 	    *q++ = 'l';
219 	    *q++ = 't';
220 	    *q++ = ';';
221 	} else if (isTITLE && *p == '>') {
222 	    *q++ = '&';
223 	    *q++ = 'g';
224 	    *q++ = 't';
225 	    *q++ = ';';
226 	} else {
227 	    *q++ = *p;
228 	}
229     }
230     *q = '\0';
231     FREE(*str);
232     *str = cp;
233 }
234 
235 /*
236  * Callers to LYEntifyTitle/LYEntifyValue do not look at the 'target' param.
237  * Optimize things a little by avoiding the memory allocation if not needed,
238  * as is usually the case.
239  */
MustEntify(const char * source)240 static BOOL MustEntify(const char *source)
241 {
242     BOOL result;
243 
244 #ifdef CJK_EX
245     if (IS_CJK_TTY && strchr(source, '\033') != 0) {
246 	result = TRUE;
247     } else
248 #endif
249     {
250 	size_t length = strlen(source);
251 	size_t reject = strcspn(source, "<&>");
252 
253 	result = (BOOL) (length != reject);
254     }
255 
256     return result;
257 }
258 
259 /*
260  * Wrappers for LYEntify() which do not assume that the source was allocated,
261  * e.g., output from gettext().
262  */
LYEntifyTitle(char ** target,const char * source)263 const char *LYEntifyTitle(char **target, const char *source)
264 {
265     const char *result = 0;
266 
267     if (MustEntify(source)) {
268 	StrAllocCopy(*target, source);
269 	LYEntify(target, TRUE);
270 	result = *target;
271     } else {
272 	result = source;
273     }
274     return result;
275 }
276 
LYEntifyValue(char ** target,const char * source)277 const char *LYEntifyValue(char **target, const char *source)
278 {
279     const char *result = 0;
280 
281     if (MustEntify(source)) {
282 	StrAllocCopy(*target, source);
283 	LYEntify(target, FALSE);
284 	result = *target;
285     } else {
286 	result = source;
287     }
288     return result;
289 }
290 
291 /*
292  *  This function trims characters <= that of a space (32),
293  *  including HT_NON_BREAK_SPACE (1) and HT_EN_SPACE (2),
294  *  but not ESC, from the heads of strings. - FM
295  */
LYTrimHead(char * str)296 void LYTrimHead(char *str)
297 {
298     const char *s = str;
299 
300     if (isEmpty(s))
301 	return;
302 
303     while (*s && WHITE(*s) && UCH(*s) != UCH(CH_ESC))	/* S/390 -- gil -- 1669 */
304 	s++;
305     if (s > str) {
306 	char *ns = str;
307 
308 	while (*s) {
309 	    *ns++ = *s++;
310 	}
311 	*ns = '\0';
312     }
313 }
314 
/*
 *  Strip trailing characters for which WHITE() is true (described by the
 *  original author as those <= a space, including HT_NON_BREAK_SPACE,
 *  HT_EN_SPACE and ESC) by overwriting them with NUL.  NULL and empty
 *  strings are left alone. - FM
 *
 *  The length is kept as a size_t rather than being narrowed to int.
 */
void LYTrimTail(char *str)
{
    size_t len;

    if (isEmpty(str))
	return;

    len = strlen(str);
    while (len > 0 && WHITE(str[len - 1]))
	str[--len] = '\0';
}
336 
337 /*
338  * This function should receive a pointer to the start
339  * of a comment.  It returns a pointer to the end ('>')
340  * character of comment, or its best guess if the comment
341  * is invalid. - FM
342  */
LYFindEndOfComment(char * str)343 char *LYFindEndOfComment(char *str)
344 {
345     char *cp, *cp1;
346     enum comment_state {
347 	start1,
348 	start2,
349 	end1,
350 	end2
351     } state;
352 
353     if (str == NULL)
354 	/*
355 	 * We got NULL, so return NULL.  - FM
356 	 */
357 	return NULL;
358 
359     if (StrNCmp(str, "<!--", 4))
360 	/*
361 	 * We don't have the start of a comment, so return the beginning of the
362 	 * string.  - FM
363 	 */
364 	return str;
365 
366     cp = (str + 4);
367     if (*cp == '>')
368 	/*
369 	 * It's an invalid comment, so
370 	 * return this end character. - FM
371 	 */
372 	return cp;
373 
374     if ((cp1 = strchr(cp, '>')) == NULL)
375 	/*
376 	 * We don't have an end character, so return the beginning of the
377 	 * string.  - FM
378 	 */
379 	return str;
380 
381     if (*cp == '-')
382 	/*
383 	 * Ugh, it's a "decorative" series of dashes, so return the next end
384 	 * character.  - FM
385 	 */
386 	return cp1;
387 
388     /*
389      * OK, we're ready to start parsing.  - FM
390      */
391     state = start2;
392     while (*cp != '\0') {
393 	switch (state) {
394 	case start1:
395 	    if (*cp == '-')
396 		state = start2;
397 	    else
398 		/*
399 		 * Invalid comment, so return the first '>' from the start of
400 		 * the string.  - FM
401 		 */
402 		return cp1;
403 	    break;
404 
405 	case start2:
406 	    if (*cp == '-')
407 		state = end1;
408 	    break;
409 
410 	case end1:
411 	    if (*cp == '-')
412 		state = end2;
413 	    else
414 		/*
415 		 * Invalid comment, so return the first '>' from the start of
416 		 * the string.  - FM
417 		 */
418 		return cp1;
419 	    break;
420 
421 	case end2:
422 	    if (*cp == '>')
423 		/*
424 		 * Valid comment, so return the end character.  - FM
425 		 */
426 		return cp;
427 	    if (*cp == '-') {
428 		state = start1;
429 	    } else if (!(WHITE(*cp) && UCH(*cp) != UCH(CH_ESC))) {	/* S/390 -- gil -- 1686 */
430 		/*
431 		 * Invalid comment, so return the first '>' from the start of
432 		 * the string.  - FM
433 		 */
434 		return cp1;
435 	    }
436 	    break;
437 
438 	default:
439 	    break;
440 	}
441 	cp++;
442     }
443 
444     /*
445      * Invalid comment, so return the first '>' from the start of the string.
446      * - FM
447      */
448     return cp1;
449 }
450 
/*
 *  If an HREF, itself or if resolved against a base,
 *  represents a file URL, and the host is defaulted,
 *  force in "//localhost".  We need this until
 *  all the other Lynx code which performs security
 *  checks based on the "localhost" string is changed
 *  to assume "//localhost" when a host field is not
 *  present in file URLs - FM
 *
 *  href  points to an allocated string which is rewritten in place (it
 *        is only ever modified through StrAllocCopy/StrAllocCat and the
 *        LY* helpers called below); nothing is done if it is NULL or empty.
 *  base  the base URL, may be NULL; it is only consulted when *href is
 *        "//" or starts with "///".
 */
void LYFillLocalFileURL(char **href,
			const char *base)
{
    char *temp = NULL;		/* scratch copy, released before returning */

    if (isEmpty(*href))
	return;

    /*
     * A scheme-less "//" or "///..." reference against a file URL base:
     * prepend STR_FILE_URL (presumably "file:" - confirm in its header) so
     * that the checks below apply to it.
     */
    if (!strcmp(*href, "//") || !StrNCmp(*href, "///", 3)) {
	if (base != NULL && isFILE_URL(base)) {
	    StrAllocCopy(temp, STR_FILE_URL);
	    StrAllocCat(temp, *href);
	    StrAllocCopy(*href, temp);
	}
    }
    if (isFILE_URL(*href)) {
	if (*(*href + 5) == '\0') {
	    /* nothing after the first five characters ("file:") */
	    StrAllocCat(*href, "//localhost");
	} else if (!strcmp(*href, "file://")) {
	    /* empty host and no path */
	    StrAllocCat(*href, "localhost");
	} else if (!StrNCmp(*href, "file:///", 8)) {
	    /* empty host: pass the path, from its leading '/', to the helper */
	    StrAllocCopy(temp, (*href + 7));
	    LYLocalFileToURL(href, temp);
	} else if (!StrNCmp(*href, "file:/", 6) && !LYIsHtmlSep(*(*href + 6))) {
	    /* "file:/path" with no "//" at all: the path starts at offset 5 */
	    StrAllocCopy(temp, (*href + 5));
	    LYLocalFileToURL(href, temp);
	}
    }
#if defined(USE_DOS_DRIVES)
    if (LYIsDosDrive(*href)) {
	/*
	 * If it's a local DOS path beginning with drive letter,
	 * add file://localhost/ prefix and go ahead.
	 */
	StrAllocCopy(temp, *href);
	LYLocalFileToURL(href, temp);
    }

    /* use below: strlen("file://localhost/") = 17 */
    if (!StrNCmp(*href, "file://localhost/", 17)
	&& (strlen(*href) == 19)
	&& LYIsDosDrive(*href + 17)) {
	/*
	 * Terminate DOS drive letter with a slash to surf root successfully.
	 * Here seems a proper place to do so.
	 */
	LYAddPathSep(href);
    }
#endif /* USE_DOS_DRIVES */

    /*
     * No path in a file://localhost URL means a
     * directory listing for the current default. - FM
     */
    if (!strcmp(*href, "file://localhost")) {
	const char *temp2;	/* current directory in URL path form */

#ifdef VMS
	temp2 = HTVMS_wwwName(LYGetEnv("PATH"));
#else
	char curdir[LY_MAXPATH];

	temp2 = wwwName(Current_Dir(curdir));
#endif /* VMS */
	/* make sure a separator stands between the host and the path */
	if (!LYIsHtmlSep(*temp2))
	    LYAddHtmlSep(href);
	/*
	 * Check for pathological cases - current dir has chars which MUST BE
	 * URL-escaped - kw
	 */
	if (strchr(temp2, '%') != NULL || strchr(temp2, '#') != NULL) {
	    /* drop any earlier scratch copy before reusing temp */
	    FREE(temp);
	    temp = HTEscape(temp2, URL_PATH);
	    StrAllocCat(*href, temp);
	} else {
	    StrAllocCat(*href, temp2);
	}
    }
#ifdef VMS
    /*
     * On VMS, a file://localhost/ URL means
     * a listing for the login directory. - FM
     */
    if (!strcmp(*href, "file://localhost/"))
	/* the + 1 skips the first character of the converted name */
	StrAllocCat(*href, (HTVMS_wwwName(Home_Dir()) + 1));
#endif /* VMS */

    FREE(temp);
    return;
}
550 
/*
 *  Write a META line declaring the charset of character set handle
 *  disp_chndl to the stream 'target', using the stream's put_string method.
 *  A disp_chndl of -1 means use current_char_set.  Nothing is written if
 *  target is NULL or the resulting handle is negative.
 */
void LYAddMETAcharsetToStream(HTStream *target, int disp_chndl)
{
    char *line = 0;
    const int handle = (disp_chndl == -1) ? current_char_set : disp_chndl;

    if (target == 0 || handle < 0)
	return;

    HTSprintf0(&line, "<META %s content=\"text/html;charset=%s\">\n",
	       "http-equiv=\"content-type\"",
	       LYCharSet_UC[handle].MIMEname);
    (*target->isa->put_string) (target, line);
    FREE(line);
}
569 
570 /*
571  *  This function writes a line with a META tag to an open file,
572  *  which will specify a charset parameter to use when the file is
573  *  read back in.  It is meant for temporary HTML files used by the
574  *  various special pages which may show titles of documents.  When those
575  *  files are created, the title strings normally have been translated and
576  *  expanded to the display character set, so we have to make sure they
577  *  don't get translated again.
578  *  If the user has changed the display character set during the lifetime
579  *  of the Lynx session (or, more exactly, during the time the title
580  *  strings to be written were generated), they may now have different
581  *  character encodings and there is currently no way to get it all right.
582  *  To change this, we would have to add a variable for each string which
583  *  keeps track of its character encoding.
584  *  But at least we can try to ensure that reading the file after future
585  *  display character set changes will give reasonable output.
586  *
587  *  The META tag is not written if the display character set (passed as
588  *  disp_chndl) already corresponds to the charset assumption that
589  *  would be made when the file is read. - KW
590  *
591  *  Currently this function is used for temporary files like "Lynx Info Page"
592  *  and for one permanent - bookmarks (so it may be a problem if you change
593  *  the display charset later: new bookmark entries may be mistranslated).
594  *								 - LP
595  */
LYAddMETAcharsetToFD(FILE * fd,int disp_chndl)596 void LYAddMETAcharsetToFD(FILE *fd, int disp_chndl)
597 {
598     if (disp_chndl == -1)
599 	/*
600 	 * -1 means use current_char_set.
601 	 */
602 	disp_chndl = current_char_set;
603 
604     if (fd == NULL || disp_chndl < 0)
605 	/*
606 	 * Should not happen.
607 	 */
608 	return;
609 
610     if (UCLYhndl_HTFile_for_unspec == disp_chndl)
611 	/*
612 	 * Not need to do, so we don't.
613 	 */
614 	return;
615 
616     if (LYCharSet_UC[disp_chndl].enc == UCT_ENC_7BIT)
617 	/*
618 	 * There shouldn't be any 8-bit characters in this case.
619 	 */
620 	return;
621 
622     /*
623      * In other cases we don't know because UCLYhndl_for_unspec may change
624      * during the lifetime of the file (by toggling raw mode or changing the
625      * display character set), so proceed.
626      */
627     fprintf(fd, "<META %s content=\"text/html;charset=%s\">\n",
628 	    "http-equiv=\"content-type\"",
629 	    LYCharSet_UC[disp_chndl].MIMEname);
630 }
631 
/*
 * This function returns OL TYPE="A" strings in
 * the range of " A." (1) to "ZZZ." (18278). - FM
 *
 * Values below 1 give " A." and values above 18278 give "ZZZ.".  One-letter
 * labels are padded with a leading space.  The letters are taken from an
 * alphabet table rather than computed from ASCII code values, so the result
 * does not depend on the execution character set.
 *
 * The result is in a static buffer which the next call overwrites.
 */
char *LYUppercaseA_OL_String(int seqnum)
{
    static const char letters[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
    static char OLstring[8];
    char *out = OLstring;

    if (seqnum < 1)
	seqnum = 1;
    else if (seqnum > 18278)
	seqnum = 18278;

    if (seqnum < 27) {
	/* 1..26: " A." .. " Z." */
	*out++ = ' ';
	*out++ = letters[seqnum - 1];
    } else if (seqnum < 703) {
	/* 27..702: "AA." .. "ZZ." */
	const int n = seqnum - 27;

	*out++ = letters[n / 26];
	*out++ = letters[n % 26];
    } else {
	/* 703..18278: "AAA." .. "ZZZ." */
	const int n = seqnum - 703;

	*out++ = letters[n / 676];
	*out++ = letters[(n / 26) % 26];
	*out++ = letters[n % 26];
    }
    *out++ = '.';
    *out = '\0';

    return OLstring;
}
662 
/*
 * This function returns OL TYPE="a" strings in
 * the range of " a." (1) to "zzz." (18278). - FM
 *
 * Values below 1 give " a." and values above 18278 give "zzz.".  One-letter
 * labels are padded with a leading space.  The letters are taken from an
 * alphabet table rather than computed from ASCII code values, so the result
 * does not depend on the execution character set.
 *
 * The result is in a static buffer which the next call overwrites.
 */
char *LYLowercaseA_OL_String(int seqnum)
{
    static const char letters[] = "abcdefghijklmnopqrstuvwxyz";
    static char OLstring[8];
    char *out = OLstring;

    if (seqnum < 1)
	seqnum = 1;
    else if (seqnum > 18278)
	seqnum = 18278;

    if (seqnum < 27) {
	/* 1..26: " a." .. " z." */
	*out++ = ' ';
	*out++ = letters[seqnum - 1];
    } else if (seqnum < 703) {
	/* 27..702: "aa." .. "zz." */
	const int n = seqnum - 27;

	*out++ = letters[n / 26];
	*out++ = letters[n % 26];
    } else {
	/* 703..18278: "aaa." .. "zzz." */
	const int n = seqnum - 703;

	*out++ = letters[n / 676];
	*out++ = letters[(n / 26) % 26];
	*out++ = letters[n % 26];
    }
    *out++ = '.';
    *out = '\0';

    return OLstring;
}
693 
/*
 * This function returns OL TYPE="I" strings, i.e., upper-case Roman
 * numerals followed by a period, in the range of " I." (1) to "MMM."
 * (3000). - FM
 *
 * Anything from 3000 up gives "MMM."; zero and negative values give ".".
 * One-letter numerals (I, V, X, L, C, D, M) are padded with a leading
 * space.
 * Maximum length: 16, including the terminator ("MMDCCCLXXXVIII.") -TD
 *
 * The result is in a static buffer which the next call overwrites.
 */
char *LYUppercaseI_OL_String(int seqnum)
{
    static const struct {
	int value;
	const char *numeral;
    } steps[] = {
	{ 1000, "M" },
	{ 900, "CM" },
	{ 500, "D" },
	{ 400, "CD" },
	{ 100, "C" },
	{ 90, "XC" },
	{ 50, "L" },
	{ 40, "XL" },
	{ 10, "X" },
	{ 9, "IX" },
	{ 5, "V" },
	{ 4, "IV" },
	{ 1, "I" }
    };
    static char OLstring[20];
    char *out = OLstring;
    int remaining = seqnum;
    size_t n;

    if (seqnum >= 3000) {
	strcpy(OLstring, "MMM.");
	return OLstring;
    }

    /*
     * Use up the largest value which still fits, as often as it fits.
     */
    for (n = 0; n < sizeof(steps) / sizeof(steps[0]); n++) {
	while (remaining >= steps[n].value) {
	    const char *src = steps[n].numeral;

	    while (*src != '\0')
		*out++ = *src++;
	    remaining -= steps[n].value;
	}
    }

    /*
     * A numeral made of a single letter gets a leading space.
     */
    if (out - OLstring == 1) {
	OLstring[1] = OLstring[0];
	OLstring[0] = ' ';
	out++;
    }
    *out++ = '.';
    *out = '\0';

    return OLstring;
}
819 
/*
 * This function returns OL TYPE="i" strings, i.e., lower-case Roman
 * numerals followed by a period, in the range of " i." (1) to "mmm."
 * (3000). - FM
 *
 * Anything from 3000 up gives "mmm."; zero and negative values give ".".
 * One-letter numerals (i, v, x, l, c, d, m) are padded with a leading
 * space.
 * Maximum length: 16, including the terminator ("mmdccclxxxviii.") -TD
 *
 * The result is in a static buffer which the next call overwrites.
 */
char *LYLowercaseI_OL_String(int seqnum)
{
    static const struct {
	int value;
	const char *numeral;
    } steps[] = {
	{ 1000, "m" },
	{ 900, "cm" },
	{ 500, "d" },
	{ 400, "cd" },
	{ 100, "c" },
	{ 90, "xc" },
	{ 50, "l" },
	{ 40, "xl" },
	{ 10, "x" },
	{ 9, "ix" },
	{ 5, "v" },
	{ 4, "iv" },
	{ 1, "i" }
    };
    static char OLstring[20];
    char *out = OLstring;
    int remaining = seqnum;
    size_t n;

    if (seqnum >= 3000) {
	strcpy(OLstring, "mmm.");
	return OLstring;
    }

    /*
     * Use up the largest value which still fits, as often as it fits.
     */
    for (n = 0; n < sizeof(steps) / sizeof(steps[0]); n++) {
	while (remaining >= steps[n].value) {
	    const char *src = steps[n].numeral;

	    while (*src != '\0')
		*out++ = *src++;
	    remaining -= steps[n].value;
	}
    }

    /*
     * A numeral made of a single letter gets a leading space.
     */
    if (out - OLstring == 1) {
	OLstring[1] = OLstring[0];
	OLstring[0] = ' ';
	out++;
    }
    *out++ = '.';
    *out = '\0';

    return OLstring;
}
945 
946 /*
947  *  This function initializes the Ordered List counter. - FM
948  */
LYZero_OL_Counter(HTStructured * me)949 void LYZero_OL_Counter(HTStructured * me)
950 {
951     int i;
952 
953     if (!me)
954 	return;
955 
956     for (i = 0; i < 12; i++) {
957 	me->OL_Counter[i] = OL_VOID;
958 	me->OL_Type[i] = '1';
959     }
960 
961     me->Last_OL_Count = 0;
962     me->Last_OL_Type = '1';
963 
964     return;
965 }
966 
967 /*
968  *  This function is used by the HTML Structured object. - KW
969  */
LYGetChartransInfo(HTStructured * me)970 void LYGetChartransInfo(HTStructured * me)
971 {
972     me->UCLYhndl = HTAnchor_getUCLYhndl(me->node_anchor,
973 					UCT_STAGE_STRUCTURED);
974     if (me->UCLYhndl < 0) {
975 	int chndl = HTAnchor_getUCLYhndl(me->node_anchor, UCT_STAGE_HTEXT);
976 
977 	if (chndl < 0) {
978 	    chndl = current_char_set;
979 	    HTAnchor_setUCInfoStage(me->node_anchor, chndl,
980 				    UCT_STAGE_HTEXT,
981 				    UCT_SETBY_STRUCTURED);
982 	}
983 	HTAnchor_setUCInfoStage(me->node_anchor, chndl,
984 				UCT_STAGE_STRUCTURED,
985 				UCT_SETBY_STRUCTURED);
986 	me->UCLYhndl = HTAnchor_getUCLYhndl(me->node_anchor,
987 					    UCT_STAGE_STRUCTURED);
988     }
989     me->UCI = HTAnchor_getUCInfoStage(me->node_anchor,
990 				      UCT_STAGE_STRUCTURED);
991 }
992 
	/* as in HTParse.c, saves some calls - kw */
	/* The sixteen upper-case hexadecimal digits, in order of value. */
static const char *hex = "0123456789ABCDEF";
995 
996 /*
997  *	  Any raw 8-bit or multibyte characters already have been
998  *	  handled in relation to the display character set
999  *	  in SGML_character(), including named and numeric entities.
1000  *
 *  This function is used to translate HTML special fields inside tags
 *  (ALT=, VALUE=, etc.) from charset `cs_from' to charset `cs_to'.
 *  It also unescapes non-ASCII characters from URLs (#fragments !)
 *  if st_URL is active.
1005  *
1006  *  If `do_ent' is YES, it converts named entities
1007  *  and numeric character references (NCRs) to their `cs_to' replacements.
1008  *
1009  *  Named entities converted to unicodes.  NCRs (unicodes) converted
1010  *  by UCdomap.c chartrans functions.
1011  *  ???NCRs with values in the ISO-8859-1 range 160-255 may be converted
1012  *  to their HTML entity names (via old-style entities) and then translated
1013  *  according to the LYCharSets.c array for `cs_out'???.
1014  *
 *  Some characters (see descriptions in `put_special_unicodes' from SGML.c)
 *  are translated depending on the state of the boolean variables
 *  `use_lynx_specials', `plain_space' and `hidden'.  It is not clear yet:
1018  *
1019  *  If plain_space is TRUE, nbsp (160) will be treated as an ASCII
1020  *  space (32).  If hidden is TRUE, entities will be translated
1021  *  (if `do_ent' is YES) but escape sequences will be passed unaltered.
1022  *  If `hidden' is FALSE, some characters are converted to Lynx special
1023  *  codes (see `put_special_unicodes') or ASCII space if `plain_space'
1024  *  applies).  @@ is `use_lynx_specials' needed, does it have any effect? @@
1025  *  If `use_lynx_specials' is YES, translate byte values 160 and 173
1026  *  meaning U+00A0 and U+00AD given as or converted from raw char input
1027  *  are converted to HT_NON_BREAK_SPACE and LY_SOFT_HYPHEN, respectively
1028  *  (unless input and output charset are both iso-8859-1, for compatibility
1029  *  with previous usage in HTML.c) even if `hidden' or `plain_space' is set.
1030  *
1031  *  If `Back' is YES, the reverse is done instead i.e., Lynx special codes
1032  *  in the input are translated back to character values.
1033  *
1034  *  If `Back' is YES, an attempt is made to use UCReverseTransChar() for
1035  *  back translation which may be more efficient. (?)
1036  *
1037  *  If `stype' is st_URL, non-ASCII characters are URL-encoded instead.
1038  *  The sequence of bytes being URL-encoded is the raw input character if
1039  *  we couldn't translate it from `cs_in' (CJK etc.); otherwise it is the
1040  *  UTF-8 representation if either `cs_to' requires this or if the
1041  *  character's Unicode value is > 255, otherwise it should be the iso-8859-1
1042  *  representation.
1043  *  No general URL-encoding occurs for displayable ASCII characters and
1044  *  spaces and some C0 controls valid in HTML (LF, TAB), it is expected
1045  *  that other functions will take care of that as appropriate.
1046  *
1047  *  Escape characters (0x1B, '\033') are
1048  *  - URL-encoded	if `stype'  is st_URL,	 otherwise
1049  *  - dropped		if `stype'  is st_other, otherwise (i.e., st_HTML)
1050  *  - passed		if `hidden' is TRUE or HTCJK is set, otherwise
1051  *  - dropped.
1052  *
1053  *  (If `stype' is st_URL or st_other most of the parameters really predefined:
1054  *  cs_from=cs_to, use_lynx_specials=plain_space=NO, and hidden=YES)
1055  *
1056  *
1057  *  Returns pointer to the char** passed in
1058  *		 if string translated or translation unnecessary,
1059  *	    NULL otherwise
1060  *		 (in which case something probably went wrong.)
1061  *
1062  *
 *  In general, this somewhat ugly function (KW)
 *  covers three functions from v.2.7.2 (FM):
1065  *		    extern void LYExpandString (
1066  *		       HTStructured *	       me,
1067  *		       char **		       str);
1068  *		    extern void LYUnEscapeEntities (
1069  *		       HTStructured *	       me,
1070  *		       char **		       str);
1071  *		    extern void LYUnEscapeToLatinOne (
1072  *		       HTStructured *	       me,
1073  *		       char **		       str,
1074  *		       BOOLEAN		       isURL);
1075  */
1076 
LYUCFullyTranslateString(char ** str,int cs_from,int cs_to,int do_ent,int use_lynx_specials,int plain_space,int hidden,int Back,CharUtil_st stype)1077 char **LYUCFullyTranslateString(char **str,
1078 				int cs_from,
1079 				int cs_to,
1080 				int do_ent,
1081 				int use_lynx_specials,
1082 				int plain_space,
1083 				int hidden,
1084 				int Back,
1085 				CharUtil_st stype)
1086 {
1087     char *p;
1088     char *q, *qs;
1089     HTChunk *chunk = NULL;
1090     char *cp = 0;
1091     char cpe = 0;
1092     char *esc = NULL;
1093     char replace_buf[64];
1094     int uck;
1095     int lowest_8;
1096     UCode_t code = 0;
1097     BOOL output_utf8 = 0, repl_translated_C0 = 0;
1098     size_t len;
1099     const char *name = NULL;
1100     BOOLEAN no_bytetrans;
1101     UCTransParams T;
1102     BOOL from_is_utf8 = FALSE;
1103     char *puni;
1104     enum _state {
1105 	S_text,
1106 	S_esc,
1107 	S_dollar,
1108 	S_paren,
1109 	S_nonascii_text,
1110 	S_dollar_paren,
1111 	S_trans_byte,
1112 	S_check_ent,
1113 	S_ncr,
1114 	S_check_uni,
1115 	S_named,
1116 	S_check_name,
1117 	S_recover,
1118 	S_got_oututf8,
1119 	S_got_outstring,
1120 	S_put_urlstring,
1121 	S_got_outchar,
1122 	S_put_urlchar,
1123 	S_next_char,
1124 	S_done
1125     } state = S_text;
1126     enum _parsing_what {
1127 	P_text,
1128 	P_utf8,
1129 	P_hex,
1130 	P_decimal,
1131 	P_named
1132     } what = P_text;
1133 
1134 #ifdef KANJI_CODE_OVERRIDE
1135     static unsigned char sjis_1st = '\0';
1136 
1137     unsigned char sjis_str[3];
1138 #endif
1139 
1140     /*
1141      * Make sure we have a non-empty string.  - FM
1142      */
1143     if (isEmpty(*str))
1144 	return str;
1145 
1146     /*
1147      * FIXME: something's wrong with the limit checks here (clearing the
1148      * buffer helps).
1149      */
1150     memset(replace_buf, 0, sizeof(replace_buf));
1151 
1152     /*
1153      * Don't do byte translation if original AND target character sets are both
1154      * iso-8859-1 (and we are not called to back-translate), or if we are in
1155      * CJK mode.
1156      */
1157     if (IS_CJK_TTY
1158 #ifdef EXP_JAPANESEUTF8_SUPPORT
1159 	&& (strcmp(LYCharSet_UC[cs_from].MIMEname, "utf-8") != 0)
1160 	&& (strcmp(LYCharSet_UC[cs_to].MIMEname, "utf-8") != 0)
1161 #endif
1162 	) {
1163 	no_bytetrans = TRUE;
1164     } else if (cs_to <= 0 && cs_from == cs_to && (!Back || cs_to < 0)) {
1165 	no_bytetrans = TRUE;
1166     } else {
1167 	/* No need to translate or examine the string any further */
1168 	no_bytetrans = (BOOL) (!use_lynx_specials && !Back &&
1169 			       UCNeedNotTranslate(cs_from, cs_to));
1170     }
1171     /*
1172      * Save malloc/calloc overhead in simple case - kw
1173      */
1174     if (do_ent && hidden && (stype != st_URL) && (strchr(*str, '&') == NULL))
1175 	do_ent = FALSE;
1176 
1177     /* Can't do, caller should figure out what to do... */
1178     if (!UCCanTranslateFromTo(cs_from, cs_to)) {
1179 	if (cs_to < 0)
1180 	    return NULL;
1181 	if (!do_ent && no_bytetrans)
1182 	    return NULL;
1183 	no_bytetrans = TRUE;
1184     } else if (cs_to < 0) {
1185 	do_ent = FALSE;
1186     }
1187 
1188     if (!do_ent && no_bytetrans)
1189 	return str;
1190     p = *str;
1191 
1192     if (!no_bytetrans) {
1193 	UCTransParams_clear(&T);
1194 	UCSetTransParams(&T, cs_from, &LYCharSet_UC[cs_from],
1195 			 cs_to, &LYCharSet_UC[cs_to]);
1196 	from_is_utf8 = (BOOL) (LYCharSet_UC[cs_from].enc == UCT_ENC_UTF8);
1197 	output_utf8 = T.output_utf8;
1198 	repl_translated_C0 = T.repl_translated_C0;
1199 	puni = p;
1200     } else if (do_ent) {
1201 	output_utf8 = (BOOL) (LYCharSet_UC[cs_to].enc == UCT_ENC_UTF8 ||
1202 			      HText_hasUTF8OutputSet(HTMainText));
1203 	repl_translated_C0 = (BOOL) (LYCharSet_UC[cs_to].enc == UCT_ENC_8BIT_C0);
1204     }
1205 
1206     lowest_8 = LYlowest_eightbit[cs_to];
1207 
    /*
     * Note the length of the original plus a little slack.  No buffer is
     * allocated here; the CHUNK macro below uses len when it first has to
     * create the output chunk.
     */
1212     len = strlen(p) + 16;
1213     q = p;
1214 
1215     qs = q;
1216 
1217 /*  Create the HTChunk only if we need it */
1218 #define CHUNK (chunk ? chunk : (chunk = HTChunkCreate2(128, len+1)))
1219 
1220 #define REPLACE_STRING(s) \
1221 		if (q != qs) HTChunkPutb(CHUNK, qs, (int) (q - qs)); \
1222 		HTChunkPuts(CHUNK, s); \
1223 		qs = q = *str
1224 
1225 #define REPLACE_CHAR(c) if (q > p) { \
1226 		HTChunkPutb(CHUNK, qs, (int) (q - qs)); \
1227 		qs = q = *str; \
1228 		*q++ = c; \
1229 	    } else \
1230 		*q++ = c
1231 
1232     /*
1233      * Loop through string, making conversions as needed.
1234      *
1235      * The while() checks for a non-'\0' char only for the normal text states
1236      * since other states may temporarily modify p or *p (which should be
1237      * restored before S_done!) - kw
1238      */
1239     while (*p || (state != S_text && state != S_nonascii_text)) {
1240 	switch (state) {
1241 	case S_text:
1242 	    code = UCH(*p);
1243 #ifdef KANJI_CODE_OVERRIDE
1244 	    if (HTCJK == JAPANESE && last_kcode == SJIS) {
1245 		if (sjis_1st == '\0' && (IS_SJIS_HI1(code) || IS_SJIS_HI2(code))) {
1246 		    sjis_1st = UCH(code);
1247 		} else if (sjis_1st && IS_SJIS_LO(code)) {
1248 		    sjis_1st = '\0';
1249 		} else {
1250 		    if (conv_jisx0201kana && 0xA1 <= code && code <= 0xDF) {
1251 			sjis_str[2] = '\0';
1252 			JISx0201TO0208_SJIS(UCH(code),
1253 					    sjis_str, sjis_str + 1);
1254 			REPLACE_STRING(sjis_str);
1255 			p++;
1256 			continue;
1257 		    }
1258 		}
1259 	    }
1260 #endif
1261 	    if (*p == '\033') {
1262 		if ((IS_CJK_TTY && !hidden) || stype != st_HTML) {
1263 		    state = S_esc;
1264 		    if (stype == st_URL) {
1265 			REPLACE_STRING("%1B");
1266 			p++;
1267 			continue;
1268 		    } else if (stype != st_HTML) {
1269 			p++;
1270 			continue;
1271 		    } else {
1272 			*q++ = *p++;
1273 			continue;
1274 		    }
1275 		} else if (!hidden) {
1276 		    /*
1277 		     * CJK handling not on, and not a hidden INPUT, so block
1278 		     * escape.  - FM
1279 		     */
1280 		    state = S_next_char;
1281 		} else {
1282 		    state = S_trans_byte;
1283 		}
1284 	    } else {
1285 		state = (do_ent ? S_check_ent : S_trans_byte);
1286 	    }
1287 	    break;
1288 
1289 	case S_esc:
1290 	    if (*p == '$') {
1291 		state = S_dollar;
1292 		*q++ = *p++;
1293 		continue;
1294 	    } else if (*p == '(') {
1295 		state = S_paren;
1296 		*q++ = *p++;
1297 		continue;
1298 	    } else {
1299 		state = S_text;
1300 	    }
1301 	    break;
1302 
1303 	case S_dollar:
1304 	    if (*p == '@' || *p == 'B' || *p == 'A') {
1305 		state = S_nonascii_text;
1306 		*q++ = *p++;
1307 		continue;
1308 	    } else if (*p == '(') {
1309 		state = S_dollar_paren;
1310 		*q++ = *p++;
1311 		continue;
1312 	    } else {
1313 		state = S_text;
1314 	    }
1315 	    break;
1316 
1317 	case S_dollar_paren:
1318 	    if (*p == 'C') {
1319 		state = S_nonascii_text;
1320 		*q++ = *p++;
1321 		continue;
1322 	    } else {
1323 		state = S_text;
1324 	    }
1325 	    break;
1326 
1327 	case S_paren:
1328 	    if (*p == 'B' || *p == 'J' || *p == 'T') {
1329 		state = S_text;
1330 		*q++ = *p++;
1331 		continue;
1332 	    } else if (*p == 'I') {
1333 		state = S_nonascii_text;
1334 		*q++ = *p++;
1335 		continue;
1336 	    } else {
1337 		state = S_text;
1338 	    }
1339 	    break;
1340 
1341 	case S_nonascii_text:
1342 	    if (*p == '\033') {
1343 		if ((IS_CJK_TTY && !hidden) || stype != st_HTML) {
1344 		    state = S_esc;
1345 		    if (stype == st_URL) {
1346 			REPLACE_STRING("%1B");
1347 			p++;
1348 			continue;
1349 		    } else if (stype != st_HTML) {
1350 			p++;
1351 			continue;
1352 		    }
1353 		}
1354 	    }
1355 	    *q++ = *p++;
1356 	    continue;
1357 
1358 	case S_trans_byte:
1359 	    /* character translation goes here */
1360 	    /*
1361 	     * Don't do anything if we have no string, or if original AND
1362 	     * target character sets are both iso-8859-1, or if we are in CJK
1363 	     * mode.
1364 	     */
1365 	    if (*p == '\0' || no_bytetrans) {
1366 		state = S_got_outchar;
1367 		break;
1368 	    }
1369 
1370 	    if (Back) {
1371 		int rev_c;
1372 
1373 		if ((*p) == HT_NON_BREAK_SPACE ||
1374 		    (*p) == HT_EN_SPACE) {
1375 		    if (plain_space) {
1376 			code = *p = ' ';
1377 			state = S_got_outchar;
1378 			break;
1379 		    } else {
1380 			code = 160;
1381 			if (LYCharSet_UC[cs_to].enc == UCT_ENC_8859 ||
1382 			    (LYCharSet_UC[cs_to].like8859 & UCT_R_8859SPECL)) {
1383 			    state = S_got_outchar;
1384 			    break;
1385 			} else if (!(LYCharSet_UC[cs_from].enc == UCT_ENC_8859
1386 				     || (LYCharSet_UC[cs_from].like8859 & UCT_R_8859SPECL))) {
1387 			    state = S_check_uni;
1388 			    break;
1389 			} else {
1390 			    *(unsigned char *) p = UCH(160);
1391 			}
1392 		    }
1393 		} else if ((*p) == LY_SOFT_HYPHEN) {
1394 		    code = 173;
1395 		    if (LYCharSet_UC[cs_to].enc == UCT_ENC_8859 ||
1396 			(LYCharSet_UC[cs_to].like8859 & UCT_R_8859SPECL)) {
1397 			state = S_got_outchar;
1398 			break;
1399 		    } else if (!(LYCharSet_UC[cs_from].enc == UCT_ENC_8859
1400 				 || (LYCharSet_UC[cs_from].like8859 & UCT_R_8859SPECL))) {
1401 			state = S_check_uni;
1402 			break;
1403 		    } else {
1404 			*(unsigned char *) p = UCH(173);
1405 		    }
1406 #ifdef EXP_JAPANESEUTF8_SUPPORT
1407 		} else if (output_utf8) {
1408 		    if ((!strcmp(LYCharSet_UC[cs_from].MIMEname, "euc-jp") &&
1409 			 (IS_EUC((unsigned char) (*p),
1410 				 (unsigned char) (*(p + 1))))) ||
1411 			(!strcmp(LYCharSet_UC[cs_from].MIMEname, "shift_jis") &&
1412 			 (IS_SJIS_2BYTE((unsigned char) (*p),
1413 					(unsigned char) (*(p + 1)))))) {
1414 			code = UCTransJPToUni(p, 2, cs_from);
1415 			p++;
1416 			state = S_check_uni;
1417 			break;
1418 		    }
1419 #endif
1420 		} else if (code < 127 || T.transp) {
1421 		    state = S_got_outchar;
1422 		    break;
1423 		}
1424 		rev_c = UCReverseTransChar(*p, cs_to, cs_from);
1425 		if (rev_c > 127) {
1426 		    *p = (char) rev_c;
1427 		    code = rev_c;
1428 		    state = S_got_outchar;
1429 		    break;
1430 		}
1431 	    } else if (code < 127) {
1432 		state = S_got_outchar;
1433 		break;
1434 	    }
1435 
1436 	    if (from_is_utf8) {
1437 		if (((*p) & 0xc0) == 0xc0) {
1438 		    puni = p;
1439 		    code = UCGetUniFromUtf8String(&puni);
1440 		    if (code <= 0) {
1441 			code = UCH(*p);
1442 		    } else {
1443 			what = P_utf8;
1444 		    }
1445 		}
1446 	    } else if (use_lynx_specials && !Back &&
1447 		       (code == 160 || code == 173) &&
1448 		       (LYCharSet_UC[cs_from].enc == UCT_ENC_8859 ||
1449 			(LYCharSet_UC[cs_from].like8859 & UCT_R_8859SPECL))) {
1450 		if (code == 160)
1451 		    code = *p = HT_NON_BREAK_SPACE;
1452 		else if (code == 173)
1453 		    code = *p = LY_SOFT_HYPHEN;
1454 		state = S_got_outchar;
1455 		break;
1456 	    } else if (T.trans_to_uni) {
1457 		code = UCTransToUni(*p, cs_from);
1458 		if (code <= 0) {
1459 		    /* What else can we do? */
1460 		    code = UCH(*p);
1461 		}
1462 	    } else if (!T.trans_from_uni) {
1463 		state = S_got_outchar;
1464 		break;
1465 	    }
	    /*
	     * Substitute the Lynx special characters for 160 (nbsp) and 173
	     * (shy) if use_lynx_specials is set and we are not
	     * back-translating.
	     */
1470 	    if (use_lynx_specials && !Back &&
1471 		(code == 160 || code == 173)) {
1472 		code = ((code == 160 ? HT_NON_BREAK_SPACE : LY_SOFT_HYPHEN));
1473 		state = S_got_outchar;
1474 		break;
1475 	    }
1476 
1477 	    state = S_check_uni;
1478 	    break;
1479 
1480 	case S_check_ent:
1481 	    if (*p == '&') {
1482 		char *pp = p + 1;
1483 
1484 		len = strlen(pp);
1485 		/*
1486 		 * Check for a numeric entity.  - FM
1487 		 */
1488 		if (*pp == '#' && len > 2 &&
1489 		    (*(pp + 1) == 'x' || *(pp + 1) == 'X') &&
1490 		    UCH(*(pp + 2)) < 127 &&
1491 		    isxdigit(UCH(*(pp + 2)))) {
1492 		    what = P_hex;
1493 		    state = S_ncr;
1494 		} else if (*pp == '#' && len > 2 &&
1495 			   UCH(*(pp + 1)) < 127 &&
1496 			   isdigit(UCH(*(pp + 1)))) {
1497 		    what = P_decimal;
1498 		    state = S_ncr;
1499 		} else if (UCH(*pp) < 127 &&
1500 			   isalpha(UCH(*pp))) {
1501 		    what = P_named;
1502 		    state = S_named;
1503 		} else {
1504 		    state = S_trans_byte;
1505 		}
1506 	    } else {
1507 		state = S_trans_byte;
1508 	    }
1509 	    break;
1510 
1511 	case S_ncr:
1512 	    if (what == P_hex) {
1513 		p += 3;
1514 	    } else {		/* P_decimal */
1515 		p += 2;
1516 	    }
1517 	    cp = p;
1518 	    while (*p && UCH(*p) < 127 &&
1519 		   (what == P_hex ? isxdigit(UCH(*p)) :
1520 		    isdigit(UCH(*p)))) {
1521 		p++;
1522 	    }
1523 	    /*
1524 	     * Save the terminator and isolate the digit(s).  - FM
1525 	     */
1526 	    cpe = *p;
1527 	    if (*p)
1528 		*p++ = '\0';
1529 	    /*
1530 	     * Show the numeric entity if the value:
1531 	     * (1) Is greater than 255 and unhandled Unicode.
1532 	     * (2) Is less than 32, and not valid and we don't have HTCJK set.
1533 	     * (3) Is 127 and we don't have HTPassHighCtrlRaw or HTCJK set.
1534 	     * (4) Is 128 - 159 and we don't have HTPassHighCtrlNum set.
1535 	     */
1536 	    if (UCScanCode(&code, cp, (BOOL) (what == P_hex))) {
1537 		code = LYcp1252ToUnicode(code);
1538 		state = S_check_uni;
1539 	    } else {
1540 		state = S_recover;
1541 		break;
1542 	    }
1543 	    break;
1544 
1545 	case S_check_uni:
1546 	    /*
1547 	     * Show the numeric entity if the value:
1548 	     * (2) Is less than 32, and not valid and we don't have HTCJK set.
1549 	     * (3) Is 127 and we don't have HTPassHighCtrlRaw or HTCJK set.
1550 	     * (4) Is 128 - 159 and we don't have HTPassHighCtrlNum set.
1551 	     */
1552 	    if ((code < 32 &&
1553 		 code != 9 && code != 10 && code != 13 &&
1554 		 !IS_CJK_TTY) ||
1555 		(code == 127 &&
1556 		 !(HTPassHighCtrlRaw || IS_CJK_TTY)) ||
1557 		(code > 127 && code < 160 &&
1558 		 !HTPassHighCtrlNum)) {
1559 		state = S_recover;
1560 		break;
1561 	    }
1562 	    /*
1563 	     * Convert the value as an unsigned char, hex escaped if isURL is
1564 	     * set and it's 8-bit, and then recycle the terminator if it is not
1565 	     * a semicolon.  - FM
1566 	     */
1567 	    if (code > 159 && stype == st_URL) {
1568 		state = S_got_oututf8;
1569 		break;
1570 	    }
1571 	    /*
1572 	     * For 160 (nbsp), use that value if it's a hidden INPUT, otherwise
1573 	     * use an ASCII space (32) if plain_space is TRUE, otherwise use
1574 	     * the Lynx special character.  - FM
1575 	     */
1576 	    if (code == 160) {
1577 		if (plain_space) {
1578 		    code = ' ';
1579 		    state = S_got_outchar;
1580 		    break;
1581 		} else if (use_lynx_specials) {
1582 		    code = HT_NON_BREAK_SPACE;
1583 		    state = S_got_outchar;
1584 		    break;
1585 		} else if ((hidden && !Back)
1586 			   || (LYCharSet_UC[cs_to].codepoints & UCT_CP_SUPERSETOF_LAT1)
1587 			   || LYCharSet_UC[cs_to].enc == UCT_ENC_8859
1588 			   || (LYCharSet_UC[cs_to].like8859 &
1589 			       UCT_R_8859SPECL)) {
1590 		    state = S_got_outchar;
1591 		    break;
1592 		} else if (
1593 			      (LYCharSet_UC[cs_to].repertoire & UCT_REP_SUPERSETOF_LAT1)) {
1594 		    ;		/* nothing, may be translated later */
1595 		} else {
1596 		    code = ' ';
1597 		    state = S_got_outchar;
1598 		    break;
1599 		}
1600 	    }
1601 	    /*
1602 	     * For 173 (shy), use that value if it's a hidden INPUT, otherwise
1603 	     * ignore it if plain_space is TRUE, otherwise use the Lynx special
1604 	     * character.  - FM
1605 	     */
1606 	    if (code == 173) {
1607 		if (plain_space) {
1608 		    replace_buf[0] = '\0';
1609 		    state = S_got_outstring;
1610 		    break;
1611 		} else if (Back &&
1612 			   !(LYCharSet_UC[cs_to].enc == UCT_ENC_8859 ||
1613 			     (LYCharSet_UC[cs_to].like8859 &
1614 			      UCT_R_8859SPECL))) {
1615 		    ;		/* nothing, may be translated later */
1616 		} else if (hidden || Back) {
1617 		    state = S_got_outchar;
1618 		    break;
1619 		} else if (use_lynx_specials) {
1620 		    code = LY_SOFT_HYPHEN;
1621 		    state = S_got_outchar;
1622 		    break;
1623 		}
1624 	    }
1625 	    /*
1626 	     * Seek a translation from the chartrans tables.
1627 	     */
1628 	    if ((uck = UCTransUniChar(code,
1629 				      cs_to)) >= 32 &&
1630 		uck < 256 &&
1631 		(uck < 127 || uck >= lowest_8)) {
1632 		code = uck;
1633 		state = S_got_outchar;
1634 		break;
1635 	    } else if ((uck == -4 ||
1636 			(repl_translated_C0 &&
1637 			 uck > 0 && uck < 32)) &&
1638 		/*
1639 		 * Not found; look for replacement string.
1640 		 */
1641 		       UCTransUniCharStr(replace_buf,
1642 					 60, code,
1643 					 cs_to,
1644 					 0) >= 0) {
1645 		state = S_got_outstring;
1646 		break;
1647 	    }
1648 	    if (output_utf8 &&
1649 		code > 127 && code < 0x7fffffffL) {
1650 		state = S_got_oututf8;
1651 		break;
1652 	    }
1653 	    /*
1654 	     * For 8194 (ensp), 8195 (emsp), or 8201 (thinsp), use the
1655 	     * character reference if it's a hidden INPUT, otherwise use an
1656 	     * ASCII space (32) if plain_space is TRUE, otherwise use the Lynx
1657 	     * special character.  - FM
1658 	     */
1659 	    if (code == 8194 || code == 8195 || code == 8201) {
1660 		if (hidden) {
1661 		    state = S_recover;
1662 		} else if (plain_space) {
1663 		    code = ' ';
1664 		    state = S_got_outchar;
1665 		} else {
1666 		    code = HT_EN_SPACE;
1667 		    state = S_got_outchar;
1668 		}
1669 		break;
1670 		/*
1671 		 * Ignore 8204 (zwnj), 8205 (zwj) 8206 (lrm), and 8207 (rlm),
1672 		 * for now, if we got this far without finding a representation
1673 		 * for them.
1674 		 */
1675 	    } else if (code == 8204 || code == 8205 ||
1676 		       code == 8206 || code == 8207) {
1677 		CTRACE((tfp, "LYUCFullyTranslateString: Ignoring '%"
1678 			PRI_UCode_t "'.\n", code));
1679 		replace_buf[0] = '\0';
1680 		state = S_got_outstring;
1681 		break;
1682 		/*
1683 		 * Show the numeric entity if the value:  (1) Is greater than
1684 		 * 255 and unhandled Unicode.
1685 		 */
1686 	    } else if (code > 255) {
1687 		/*
1688 		 * Illegal or not yet handled value.  Return "&#" verbatim and
1689 		 * continue from there.  - FM
1690 		 */
1691 		state = S_recover;
1692 		break;
1693 		/*
1694 		 * If it's ASCII, or is 8-bit but HTPassEightBitNum is set or
1695 		 * the character set is "ISO Latin 1", use its value.  - FM
1696 		 */
1697 	    } else if (code < 161 ||
1698 		       (code < 256 &&
1699 			(HTPassEightBitNum || cs_to == LATIN1))) {
1700 		/*
1701 		 * No conversion needed.
1702 		 */
1703 		state = S_got_outchar;
1704 		break;
1705 
		/* The following disabled section doesn't make sense any more.
		 * It used to make sense in the past, when S_check_name would
		 * look in "old style" tables in addition to what it does now.
		 * Disabling of going to S_check_name here prevents endless
		 * looping between the S_check_uni and S_check_name states,
		 * which could occur here for Latin 1 codes for some cs_to if
		 * they had no translation in that cs_to.  Normally all cs_to
		 * *should* now have valid translations via UCTransUniChar or
		 * UCTransUniCharStr for all Latin 1 codes, so that we would
		 * not get here anyway, and no loop could occur.  Still, if we
		 * *do* get here, FALL THROUGH to case S_recover now.  - kw
		 */
1718 #if 0
1719 		/*
1720 		 * If we get to here, convert and handle the character as a
1721 		 * named entity.  - FM
1722 		 */
1723 	    } else {
1724 		name = HTMLGetEntityName(code - 160);
1725 		state = S_check_name;
1726 		break;
1727 #endif
1728 	    }
1729 
1730 	case S_recover:
1731 	    if (what == P_decimal || what == P_hex) {
1732 		/*
1733 		 * Illegal or not yet handled value.  Return "&#" verbatim and
1734 		 * continue from there.  - FM
1735 		 */
1736 		*q++ = '&';
1737 		*q++ = '#';
1738 		if (what == P_hex)
1739 		    *q++ = 'x';
1740 		if (cpe != '\0')
1741 		    *(p - 1) = cpe;
1742 		p = cp;
1743 		state = S_done;
1744 	    } else if (what == P_named) {
1745 		*cp = cpe;
1746 		*q++ = '&';
1747 		state = S_done;
1748 	    } else if (!T.output_utf8 && stype == st_HTML && !hidden &&
1749 		       !(HTPassEightBitRaw &&
1750 			 UCH(*p) >= lowest_8)) {
1751 		sprintf(replace_buf, "U%.2" PRI_UCode_t "", code);
1752 
1753 		state = S_got_outstring;
1754 	    } else {
1755 		puni = p;
1756 		code = UCH(*p);
1757 		state = S_got_outchar;
1758 	    }
1759 	    break;
1760 
1761 	case S_named:
1762 	    cp = ++p;
1763 	    while (*cp && UCH(*cp) < 127 &&
1764 		   isalnum(UCH(*cp)))
1765 		cp++;
1766 	    cpe = *cp;
1767 	    *cp = '\0';
1768 	    name = p;
1769 	    state = S_check_name;
1770 	    break;
1771 
1772 	case S_check_name:
1773 	    /*
1774 	     * Seek the Unicode value for the named entity.
1775 	     *
1776 	     * !!!!  We manually recover the case of '=' terminator which is
1777 	     * commonly found on query to CGI-scripts enclosed as href= URLs
1778 	     * like "somepath/?x=1&yz=2" Without this dirty fix, submission of
1779 	     * such URLs was broken if &yz string happened to be a recognized
1780 	     * entity name.  - LP
1781 	     */
1782 	    if (((code = HTMLGetEntityUCValue(name)) > 0) &&
1783 		!((cpe == '=') && (stype == st_URL))) {
1784 		state = S_check_uni;
1785 		break;
1786 	    }
1787 	    /*
1788 	     * Didn't find the entity.  Return verbatim.
1789 	     */
1790 	    state = S_recover;
1791 	    break;
1792 
1793 	    /* * * O U T P U T   S T A T E S * * */
1794 
1795 	case S_got_oututf8:
1796 	    if (code > 255 ||
1797 		(code >= 128 && LYCharSet_UC[cs_to].enc == UCT_ENC_UTF8)) {
1798 		UCConvertUniToUtf8(code, replace_buf);
1799 		state = S_got_outstring;
1800 	    } else {
1801 		state = S_got_outchar;
1802 	    }
1803 	    break;
1804 	case S_got_outstring:
1805 	    if (what == P_decimal || what == P_hex) {
1806 		if (cpe != ';' && cpe != '\0')
1807 		    *(--p) = cpe;
1808 		p--;
1809 	    } else if (what == P_named) {
1810 		*cp = cpe;
1811 		p = (*cp != ';') ? (cp - 1) : cp;
1812 	    } else if (what == P_utf8) {
1813 		p = puni;
1814 	    }
1815 	    if (replace_buf[0] == '\0') {
1816 		state = S_next_char;
1817 		break;
1818 	    }
1819 	    if (stype == st_URL) {
1820 		code = replace_buf[0];	/* assume string OK if first char is */
1821 		if (code >= 127 ||
1822 		    (code < 32 && (code != 9 && code != 10 && code != 0))) {
1823 		    state = S_put_urlstring;
1824 		    break;
1825 		}
1826 	    }
1827 	    REPLACE_STRING(replace_buf);
1828 	    state = S_next_char;
1829 	    break;
1830 	case S_put_urlstring:
1831 	    esc = HTEscape(replace_buf, URL_XALPHAS);
1832 	    REPLACE_STRING(esc);
1833 	    FREE(esc);
1834 	    state = S_next_char;
1835 	    break;
1836 	case S_got_outchar:
1837 	    if (what == P_decimal || what == P_hex) {
1838 		if (cpe != ';' && cpe != '\0')
1839 		    *(--p) = cpe;
1840 		p--;
1841 	    } else if (what == P_named) {
1842 		*cp = cpe;
1843 		p = (*cp != ';') ? (cp - 1) : cp;
1844 	    } else if (what == P_utf8) {
1845 		p = puni;
1846 	    }
1847 	    if (stype == st_URL &&
1848 	    /*  Not a full HTEscape, only for 8bit and ctrl chars */
1849 		(TOASCII(code) >= 127 ||	/* S/390 -- gil -- 1925 */
1850 		 (code < ' ' && (code != '\t' && code != '\n')))) {
1851 		state = S_put_urlchar;
1852 		break;
1853 	    } else if (!hidden && code == 10 && *p == 10
1854 		       && q != qs && *(q - 1) == 13) {
1855 		/*
1856 		 * If this is not a hidden string, and the current char is the
1857 		 * LF ('\n') of a CRLF pair, drop the CR ('\r').  - KW
1858 		 */
1859 		*(q - 1) = *p++;
1860 		state = S_done;
1861 		break;
1862 	    }
1863 	    *q++ = (char) code;
1864 	    state = S_next_char;
1865 	    break;
1866 	case S_put_urlchar:
1867 	    *q++ = '%';
1868 	    REPLACE_CHAR(hex[(TOASCII(code) >> 4) & 15]);	/* S/390 -- gil -- 1944 */
1869 	    REPLACE_CHAR(hex[(TOASCII(code) & 15)]);
1870 	    /* fall through */
1871 	case S_next_char:
1872 	    p++;		/* fall through */
1873 	case S_done:
1874 	    state = S_text;
1875 	    what = P_text;
1876 	    /* for next round */
1877 	}
1878     }
1879 
1880     *q = '\0';
1881     if (chunk) {
1882 	HTChunkPutb(CHUNK, qs, (int) (q - qs + 1));	/* also terminates */
1883 	if (stype == st_URL || stype == st_other) {
1884 	    LYTrimHead(chunk->data);
1885 	    LYTrimTail(chunk->data);
1886 	}
1887 	StrAllocCopy(*str, chunk->data);
1888 	HTChunkFree(chunk);
1889     } else {
1890 	if (stype == st_URL || stype == st_other) {
1891 	    LYTrimHead(qs);
1892 	    LYTrimTail(qs);
1893 	}
1894     }
1895     return str;
1896 }
1897 
1898 #undef REPLACE_CHAR
1899 #undef REPLACE_STRING
1900 
LYUCTranslateHTMLString(char ** str,int cs_from,int cs_to,int use_lynx_specials,int plain_space,int hidden,CharUtil_st stype)1901 BOOL LYUCTranslateHTMLString(char **str,
1902 			     int cs_from,
1903 			     int cs_to,
1904 			     int use_lynx_specials,
1905 			     int plain_space,
1906 			     int hidden,
1907 			     CharUtil_st stype)
1908 {
1909     BOOL ret = YES;
1910 
1911     /* May reallocate *str even if cs_to == 0 */
1912     if (!LYUCFullyTranslateString(str, cs_from, cs_to, TRUE,
1913 				  use_lynx_specials, plain_space, hidden,
1914 				  NO, stype)) {
1915 	ret = NO;
1916     }
1917     return ret;
1918 }
1919 
LYUCTranslateBackFormData(char ** str,int cs_from,int cs_to,int plain_space)1920 BOOL LYUCTranslateBackFormData(char **str,
1921 			       int cs_from,
1922 			       int cs_to,
1923 			       int plain_space)
1924 {
1925     char **ret;
1926 
1927     /* May reallocate *str */
1928     ret = (LYUCFullyTranslateString(str, cs_from, cs_to, FALSE,
1929 				    NO, plain_space, YES,
1930 				    YES, st_HTML));
1931     return (BOOL) (ret != NULL);
1932 }
1933 
/*
 * Extract the value of the parameter called 'name' from a ';'-separated
 * parameter string such as the CONTENT of an HTML META tag.
 *
 * Only text following a ';' is considered: after each ';' (and any further
 * ';' or whitespace) the text is compared against 'name' with
 * strncasecomp().  Once a match is found, any run of whitespace and '='
 * characters after the name is skipped, and the value is taken to be the
 * following run of printable, non-whitespace characters.  If that value is
 * longer than two characters and is enclosed in single quotes, the quotes
 * are removed.
 *
 * Returns a copy of the value made with StrAllocCopy() (presumably to be
 * freed by the caller), or NULL if no matching parameter is present.
 */
char *LYParseTagParam(char *from,
		      const char *name)
{
    const size_t name_len = strlen(name);
    char *scan = from;
    char *value = NULL;
    size_t used;

    /*
     * Step from one ';' to the next until the text behind it starts with
     * the wanted name.
     */
    for (;;) {
	scan = strchr(scan, ';');
	if (scan == NULL)
	    return NULL;

	/* skip the separator(s) and any blanks after them */
	while (*scan == ';' || (*scan != '\0' && isspace(UCH(*scan))))
	    scan++;

	/* too little text left to hold the name at all */
	if (strlen(scan) < name_len)
	    return NULL;

	if (strncasecomp(scan, name, (int) name_len) == 0)
	    break;
    }

    /*
     * Move past the name and whatever mix of blanks and '=' follows it.
     */
    scan += name_len;
    while (*scan == '=' || (*scan != '\0' && isspace(UCH(*scan))))
	scan++;

    /*
     * Copy the remainder, then cut the copy off at the first character that
     * is whitespace or not printable.
     */
    StrAllocCopy(value, scan);
    for (used = 0;
	 isprint(UCH(scan[used])) && !isspace(UCH(scan[used]));
	 used++) {
	;
    }
    value[used] = '\0';

    /*
     * Strip single quotes, just in case: drop the closing one, then slide
     * everything (terminator included) one place to the left over the
     * opening one.
     */
    if (used > 2 && value[0] == '\'' && value[used - 1] == value[0]) {
	char *dst = value;

	value[used - 1] = '\0';
	do {
	    dst[0] = dst[1];
	} while (*dst++ != '\0');
    }
    return value;
}
1974 
1975 /*
1976  * Given a refresh-URL content string, parses the delay time and the URL
1977  * string.  Ignore the remainder of the content.
1978  */
LYParseRefreshURL(char * content,char ** p_seconds,char ** p_address)1979 void LYParseRefreshURL(char *content,
1980 		       char **p_seconds,
1981 		       char **p_address)
1982 {
1983     char *cp;
1984     char *cp1 = NULL;
1985     char *Seconds = NULL;
1986 
1987     /*
1988      * Look for the Seconds field.  - FM
1989      */
1990     cp = LYSkipBlanks(content);
1991     if (*cp && isdigit(UCH(*cp))) {
1992 	cp1 = cp;
1993 	while (*cp1 && isdigit(UCH(*cp1)))
1994 	    cp1++;
1995 	StrnAllocCopy(Seconds, cp, (int) (cp1 - cp));
1996     }
1997     *p_seconds = Seconds;
1998     *p_address = LYParseTagParam(content, "URL");
1999 
2000     CTRACE((tfp,
2001 	    "LYParseRefreshURL\n\tcontent: %s\n\tseconds: %s\n\taddress: %s\n",
2002 	    content, NonNull(*p_seconds), NonNull(*p_address)));
2003 }
2004 
2005 /*
2006  *  This function processes META tags in HTML streams. - FM
2007  */
LYHandleMETA(HTStructured * me,const BOOL * present,STRING2PTR value,char ** include GCC_UNUSED)2008 void LYHandleMETA(HTStructured * me, const BOOL *present,
2009 		  STRING2PTR value,
2010 		  char **include GCC_UNUSED)
2011 {
2012     char *http_equiv = NULL, *name = NULL, *content = NULL, *charset = NULL;
2013     char *href = NULL, *id_string = NULL, *temp = NULL;
2014     char *cp, *cp0, *cp1 = NULL;
2015     int url_type = 0;
2016 
2017     if (!me || !present)
2018 	return;
2019 
2020     /*
2021      * Load the attributes for possible use by Lynx.  - FM
2022      */
2023     if (present[HTML_META_HTTP_EQUIV] &&
2024 	non_empty(value[HTML_META_HTTP_EQUIV])) {
2025 	StrAllocCopy(http_equiv, value[HTML_META_HTTP_EQUIV]);
2026 	convert_to_spaces(http_equiv, TRUE);
2027 	LYUCTranslateHTMLString(&http_equiv, me->tag_charset, me->tag_charset,
2028 				NO, NO, YES, st_other);
2029 	if (*http_equiv == '\0') {
2030 	    FREE(http_equiv);
2031 	}
2032     }
2033     if (present[HTML_META_NAME] &&
2034 	non_empty(value[HTML_META_NAME])) {
2035 	StrAllocCopy(name, value[HTML_META_NAME]);
2036 	convert_to_spaces(name, TRUE);
2037 	LYUCTranslateHTMLString(&name, me->tag_charset, me->tag_charset,
2038 				NO, NO, YES, st_other);
2039 	if (*name == '\0') {
2040 	    FREE(name);
2041 	}
2042     }
2043     if (present[HTML_META_CONTENT] &&
2044 	non_empty(value[HTML_META_CONTENT])) {
2045 	/*
2046 	 * Technically, we should be creating a comma-separated list, but META
2047 	 * tags come one at a time, and we'll handle (or ignore) them as each
2048 	 * is received.  Also, at this point, we only trim leading and trailing
2049 	 * blanks from the CONTENT value, without translating any named
2050 	 * entities or numeric character references, because how we should do
2051 	 * that depends on what type of information it contains, and whether or
2052 	 * not any of it might be sent to the screen.  - FM
2053 	 */
2054 	StrAllocCopy(content, value[HTML_META_CONTENT]);
2055 	convert_to_spaces(content, FALSE);
2056 	LYTrimHead(content);
2057 	LYTrimTail(content);
2058 	if (*content == '\0') {
2059 	    FREE(content);
2060 	}
2061     }
2062     if (present[HTML_META_CHARSET] &&
2063 	non_empty(value[HTML_META_CHARSET])) {
2064 	StrAllocCopy(charset, value[HTML_META_CHARSET]);
2065 	convert_to_spaces(charset, TRUE);
2066 	LYUCTranslateHTMLString(&charset, me->tag_charset, me->tag_charset,
2067 				NO, NO, YES, st_other);
2068 	if (*charset == '\0') {
2069 	    FREE(charset);
2070 	}
2071     }
2072     CTRACE((tfp,
2073 	    "LYHandleMETA: HTTP-EQUIV=\"%s\" NAME=\"%s\" CONTENT=\"%s\" CHARSET=\"%s\"\n",
2074 	    NONNULL(http_equiv),
2075 	    NONNULL(name),
2076 	    NONNULL(content),
2077 	    NONNULL(charset)));
2078 
2079     /*
2080      * Check for a text/html Content-Type with a charset directive, if we
2081      * didn't already set the charset via a server's header.  - AAC & FM
2082      */
2083     if (isEmpty(me->node_anchor->charset) &&
2084 	(charset ||
2085 	 (!strcasecomp(NonNull(http_equiv), "Content-Type") && content))) {
2086 	LYUCcharset *p_in = NULL;
2087 	LYUCcharset *p_out = NULL;
2088 
2089 	if (charset) {
2090 	    LYLowerCase(charset);
2091 	} else {
2092 	    LYUCTranslateHTMLString(&content, me->tag_charset, me->tag_charset,
2093 				    NO, NO, YES, st_other);
2094 	    LYLowerCase(content);
2095 	}
2096 
2097 	if ((cp1 = charset) != NULL ||
2098 	    (cp1 = strstr(content, "charset")) != NULL) {
2099 	    BOOL chartrans_ok = NO;
2100 	    char *cp3 = NULL, *cp4;
2101 	    int chndl;
2102 
2103 	    if (!charset)
2104 		cp1 += 7;
2105 	    while (*cp1 == ' ' || *cp1 == '=' || *cp1 == '"')
2106 		cp1++;
2107 
2108 	    StrAllocCopy(cp3, cp1);	/* copy to mutilate more */
2109 	    for (cp4 = cp3; (*cp4 != '\0' && *cp4 != '"' &&
2110 			     *cp4 != ';' && *cp4 != ':' &&
2111 			     !WHITE(*cp4)); cp4++) {
2112 		;		/* do nothing */
2113 	    }
2114 	    *cp4 = '\0';
2115 	    cp4 = cp3;
2116 	    chndl = UCGetLYhndl_byMIME(cp3);
2117 
2118 #ifdef CAN_SWITCH_DISPLAY_CHARSET
2119 	    /* Allow a switch to a more suitable display charset */
2120 	    if (Switch_Display_Charset(chndl, SWITCH_DISPLAY_CHARSET_MAYBE)) {
2121 		/* UCT_STAGE_STRUCTURED and UCT_STAGE_HTEXT
2122 		   should have the same setting for UCInfoStage. */
2123 		HTAnchor_getUCInfoStage(me->node_anchor, UCT_STAGE_STRUCTURED);
2124 
2125 		me->outUCLYhndl = current_char_set;
2126 		HTAnchor_setUCInfoStage(me->node_anchor,
2127 					current_char_set,
2128 					UCT_STAGE_HTEXT,
2129 					UCT_SETBY_MIME);	/* highest priorty! */
2130 		HTAnchor_setUCInfoStage(me->node_anchor,
2131 					current_char_set,
2132 					UCT_STAGE_STRUCTURED,
2133 					UCT_SETBY_MIME);	/* highest priorty! */
2134 		me->outUCI = HTAnchor_getUCInfoStage(me->node_anchor,
2135 						     UCT_STAGE_HTEXT);
2136 		/* The SGML stage will be reset in change_chartrans_handling */
2137 	    }
2138 #endif
2139 
2140 	    if (UCCanTranslateFromTo(chndl, current_char_set)) {
2141 		chartrans_ok = YES;
2142 		StrAllocCopy(me->node_anchor->charset, cp4);
2143 		HTAnchor_setUCInfoStage(me->node_anchor, chndl,
2144 					UCT_STAGE_PARSER,
2145 					UCT_SETBY_STRUCTURED);
2146 	    } else if (chndl < 0) {
2147 		/*
2148 		 * Got something but we don't recognize it.
2149 		 */
2150 		chndl = UCLYhndl_for_unrec;
2151 		if (chndl < 0)	/* UCLYhndl_for_unrec not defined :-( */
2152 		    chndl = UCLYhndl_for_unspec;	/* always >= 0 */
2153 		if (UCCanTranslateFromTo(chndl, current_char_set)) {
2154 		    chartrans_ok = YES;
2155 		    HTAnchor_setUCInfoStage(me->node_anchor, chndl,
2156 					    UCT_STAGE_PARSER,
2157 					    UCT_SETBY_STRUCTURED);
2158 		}
2159 	    }
2160 	    if (chartrans_ok) {
2161 		p_in = HTAnchor_getUCInfoStage(me->node_anchor,
2162 					       UCT_STAGE_PARSER);
2163 		p_out = HTAnchor_setUCInfoStage(me->node_anchor,
2164 						current_char_set,
2165 						UCT_STAGE_HTEXT,
2166 						UCT_SETBY_DEFAULT);
2167 		if (!p_out) {
2168 		    /*
2169 		     * Try again.
2170 		     */
2171 		    p_out = HTAnchor_getUCInfoStage(me->node_anchor,
2172 						    UCT_STAGE_HTEXT);
2173 		}
2174 		if (!strcmp(p_in->MIMEname, "x-transparent")) {
2175 		    HTPassEightBitRaw = TRUE;
2176 		    HTAnchor_setUCInfoStage(me->node_anchor,
2177 					    HTAnchor_getUCLYhndl(me->node_anchor,
2178 								 UCT_STAGE_HTEXT),
2179 					    UCT_STAGE_PARSER,
2180 					    UCT_SETBY_DEFAULT);
2181 		}
2182 		if (!strcmp(p_out->MIMEname, "x-transparent")) {
2183 		    HTPassEightBitRaw = TRUE;
2184 		    HTAnchor_setUCInfoStage(me->node_anchor,
2185 					    HTAnchor_getUCLYhndl(me->node_anchor,
2186 								 UCT_STAGE_PARSER),
2187 					    UCT_STAGE_HTEXT,
2188 					    UCT_SETBY_DEFAULT);
2189 		}
2190 		if ((p_in->enc != UCT_ENC_CJK)
2191 #ifdef EXP_JAPANESEUTF8_SUPPORT
2192 		    && (p_in->enc != UCT_ENC_UTF8)
2193 #endif
2194 		    ) {
2195 		    HTCJK = NOCJK;
2196 		    if (!(p_in->codepoints &
2197 			  UCT_CP_SUBSETOF_LAT1) &&
2198 			chndl == current_char_set) {
2199 			HTPassEightBitRaw = TRUE;
2200 		    }
2201 		} else if (p_out->enc == UCT_ENC_CJK) {
2202 		    Set_HTCJK(p_in->MIMEname, p_out->MIMEname);
2203 		}
2204 		LYGetChartransInfo(me);
2205 		/*
2206 		 * Update the chartrans info homologously to a Content-Type
2207 		 * MIME header with a charset parameter.  - FM
2208 		 */
2209 		if (me->UCLYhndl != chndl) {
2210 		    HTAnchor_setUCInfoStage(me->node_anchor, chndl,
2211 					    UCT_STAGE_MIME,
2212 					    UCT_SETBY_STRUCTURED);
2213 		    HTAnchor_setUCInfoStage(me->node_anchor, chndl,
2214 					    UCT_STAGE_PARSER,
2215 					    UCT_SETBY_STRUCTURED);
2216 		    me->inUCLYhndl = HTAnchor_getUCLYhndl(me->node_anchor,
2217 							  UCT_STAGE_PARSER);
2218 		    me->inUCI = HTAnchor_getUCInfoStage(me->node_anchor,
2219 							UCT_STAGE_PARSER);
2220 		}
2221 		UCSetTransParams(&me->T,
2222 				 me->inUCLYhndl, me->inUCI,
2223 				 me->outUCLYhndl, me->outUCI);
2224 	    } else {
2225 		/*
2226 		 * Cannot translate.  If according to some heuristic the given
2227 		 * charset and the current display character both are likely to
2228 		 * be like ISO-8859 in structure, pretend we have some kind of
2229 		 * match.
2230 		 */
2231 		BOOL given_is_8859 = (BOOL) (!StrNCmp(cp4, "iso-8859-", 9) &&
2232 					     isdigit(UCH(cp4[9])));
2233 		BOOL given_is_8859like = (BOOL) (given_is_8859
2234 						 || !StrNCmp(cp4, "windows-", 8)
2235 						 || !StrNCmp(cp4, "cp12", 4)
2236 						 || !StrNCmp(cp4, "cp-12", 5));
2237 		BOOL given_and_display_8859like = (BOOL) (given_is_8859like &&
2238 							  (strstr(LYchar_set_names[current_char_set],
2239 								  "ISO-8859") ||
2240 							   strstr(LYchar_set_names[current_char_set],
2241 								  "windows-")));
2242 
2243 		if (given_is_8859) {
2244 		    cp1 = &cp4[10];
2245 		    while (*cp1 &&
2246 			   isdigit(UCH((*cp1))))
2247 			cp1++;
2248 		    *cp1 = '\0';
2249 		}
2250 		if (given_and_display_8859like) {
2251 		    StrAllocCopy(me->node_anchor->charset, cp4);
2252 		    HTPassEightBitRaw = TRUE;
2253 		}
2254 		HTAlert(*cp4 ? cp4 : me->node_anchor->charset);
2255 
2256 	    }
2257 	    FREE(cp3);
2258 
2259 	    if (me->node_anchor->charset) {
2260 		CTRACE((tfp,
2261 			"LYHandleMETA: New charset: %s\n",
2262 			me->node_anchor->charset));
2263 	    }
2264 	}
2265 	/*
2266 	 * Set the kcode element based on the charset.  - FM
2267 	 */
2268 	HText_setKcode(me->text, me->node_anchor->charset, p_in);
2269     }
2270 
2271     /*
2272      * Make sure we have META name/value pairs to handle.  - FM
2273      */
2274     if (!(http_equiv || name) || !content)
2275 	goto free_META_copies;
2276 
2277     /*
2278      * Check for a no-cache Pragma
2279      * or Cache-Control directive. - FM
2280      */
2281     if (!strcasecomp(NonNull(http_equiv), "Pragma") ||
2282 	!strcasecomp(NonNull(http_equiv), "Cache-Control")) {
2283 	LYUCTranslateHTMLString(&content, me->tag_charset, me->tag_charset,
2284 				NO, NO, YES, st_other);
2285 	if (!strcasecomp(content, "no-cache")) {
2286 	    me->node_anchor->no_cache = TRUE;
2287 	    HText_setNoCache(me->text);
2288 	}
2289 
2290 	/*
2291 	 * If we didn't get a Cache-Control MIME header, and the META has one,
2292 	 * convert to lowercase, store it in the anchor element, and if we
2293 	 * haven't yet set no_cache, check whether we should.  - FM
2294 	 */
2295 	if ((!me->node_anchor->cache_control) &&
2296 	    !strcasecomp(NonNull(http_equiv), "Cache-Control")) {
2297 	    LYLowerCase(content);
2298 	    StrAllocCopy(me->node_anchor->cache_control, content);
2299 	    if (me->node_anchor->no_cache == FALSE) {
2300 		cp0 = content;
2301 		while ((cp = strstr(cp0, "no-cache")) != NULL) {
2302 		    cp += 8;
2303 		    while (*cp != '\0' && WHITE(*cp))
2304 			cp++;
2305 		    if (*cp == '\0' || *cp == ';') {
2306 			me->node_anchor->no_cache = TRUE;
2307 			HText_setNoCache(me->text);
2308 			break;
2309 		    }
2310 		    cp0 = cp;
2311 		}
2312 		if (me->node_anchor->no_cache == TRUE)
2313 		    goto free_META_copies;
2314 		cp0 = content;
2315 		while ((cp = strstr(cp0, "max-age")) != NULL) {
2316 		    cp += 7;
2317 		    while (*cp != '\0' && WHITE(*cp))
2318 			cp++;
2319 		    if (*cp == '=') {
2320 			cp++;
2321 			while (*cp != '\0' && WHITE(*cp))
2322 			    cp++;
2323 			if (isdigit(UCH(*cp))) {
2324 			    cp0 = cp;
2325 			    while (isdigit(UCH(*cp)))
2326 				cp++;
2327 			    if (*cp0 == '0' && cp == (cp0 + 1)) {
2328 				me->node_anchor->no_cache = TRUE;
2329 				HText_setNoCache(me->text);
2330 				break;
2331 			    }
2332 			}
2333 		    }
2334 		    cp0 = cp;
2335 		}
2336 	    }
2337 	}
2338 
2339 	/*
2340 	 * Check for an Expires directive. - FM
2341 	 */
2342     } else if (!strcasecomp(NonNull(http_equiv), "Expires")) {
2343 	/*
2344 	 * If we didn't get an Expires MIME header, store it in the anchor
2345 	 * element, and if we haven't yet set no_cache, check whether we
2346 	 * should.  Note that we don't accept a Date header via META tags,
2347 	 * because it's likely to be untrustworthy, but do check for a Date
2348 	 * header from a server when making the comparison.  - FM
2349 	 */
2350 	LYUCTranslateHTMLString(&content, me->tag_charset, me->tag_charset,
2351 				NO, NO, YES, st_other);
2352 	StrAllocCopy(me->node_anchor->expires, content);
2353 	if (me->node_anchor->no_cache == FALSE) {
2354 	    if (!strcmp(content, "0")) {
2355 		/*
2356 		 * The value is zero, which we treat as an absolute no-cache
2357 		 * directive.  - FM
2358 		 */
2359 		me->node_anchor->no_cache = TRUE;
2360 		HText_setNoCache(me->text);
2361 	    } else if (me->node_anchor->date != NULL) {
2362 		/*
2363 		 * We have a Date header, so check if the value is less than or
2364 		 * equal to that.  - FM
2365 		 */
2366 		if (LYmktime(content, TRUE) <=
2367 		    LYmktime(me->node_anchor->date, TRUE)) {
2368 		    me->node_anchor->no_cache = TRUE;
2369 		    HText_setNoCache(me->text);
2370 		}
2371 	    } else if (LYmktime(content, FALSE) == 0) {
2372 		/*
2373 		 * We don't have a Date header, and the value is in past for
2374 		 * us.  - FM
2375 		 */
2376 		me->node_anchor->no_cache = TRUE;
2377 		HText_setNoCache(me->text);
2378 	    }
2379 	}
2380 
2381 	/*
2382 	 * Check for a Refresh directive.  - FM
2383 	 */
2384     } else if (!strcasecomp(NonNull(http_equiv), "Refresh")) {
2385 	char *Seconds = NULL;
2386 
2387 	LYUCTranslateHTMLString(&content, me->tag_charset, me->tag_charset,
2388 				NO, NO, YES, st_other);
2389 	LYParseRefreshURL(content, &Seconds, &href);
2390 
2391 	if (Seconds) {
2392 	    if (href) {
2393 		/*
2394 		 * We found a URL field, so check it out.  - FM
2395 		 */
2396 		if (!LYLegitimizeHREF(me, &href, TRUE, FALSE)) {
2397 		    /*
2398 		     * The specs require a complete URL, but this is a
2399 		     * Netscapism, so don't expect the author to know that.  -
2400 		     * FM
2401 		     */
2402 		    HTUserMsg(REFRESH_URL_NOT_ABSOLUTE);
2403 		    /*
2404 		     * Use the document's address as the base.  - FM
2405 		     */
2406 		    if (*href != '\0') {
2407 			temp = HTParse(href,
2408 				       me->node_anchor->address, PARSE_ALL);
2409 			StrAllocCopy(href, temp);
2410 			FREE(temp);
2411 		    } else {
2412 			StrAllocCopy(href, me->node_anchor->address);
2413 			HText_setNoCache(me->text);
2414 		    }
2415 
2416 		} else {
2417 		    /*
2418 		     * Check whether to fill in localhost.  - FM
2419 		     */
2420 		    LYFillLocalFileURL(&href,
2421 				       (me->inBASE ?
2422 					me->base_href : me->node_anchor->address));
2423 		}
2424 
2425 		/*
2426 		 * Set the no_cache flag if the Refresh URL is the same as the
2427 		 * document's address.  - FM
2428 		 */
2429 		if (!strcmp(href, me->node_anchor->address)) {
2430 		    HText_setNoCache(me->text);
2431 		}
2432 	    } else {
2433 		/*
2434 		 * We didn't find a URL field, so use the document's own
2435 		 * address and set the no_cache flag.  - FM
2436 		 */
2437 		StrAllocCopy(href, me->node_anchor->address);
2438 		HText_setNoCache(me->text);
2439 	    }
2440 	    /*
2441 	     * Check for an anchor in http or https URLs.  - FM
2442 	     */
2443 	    cp = NULL;
2444 	    /* id_string seems to be used wrong below if given.
2445 	       not that it matters much.  avoid setting it here. - kw */
2446 	    if (track_internal_links &&
2447 		(StrNCmp(href, "http", 4) == 0) &&
2448 		(cp = strchr(href, '#')) != NULL) {
2449 		StrAllocCopy(id_string, cp);
2450 		*cp = '\0';
2451 	    }
2452 	    if (me->inA) {
2453 		/*
2454 		 * Ugh!  The META tag, which is a HEAD element, is in an
2455 		 * Anchor, which is BODY element.  All we can do is close the
2456 		 * Anchor and cross our fingers.  - FM
2457 		 */
2458 		if (me->inBoldA == TRUE && me->inBoldH == FALSE)
2459 		    HText_appendCharacter(me->text, LY_BOLD_END_CHAR);
2460 		me->inBoldA = FALSE;
2461 		HText_endAnchor(me->text, me->CurrentANum);
2462 		me->inA = FALSE;
2463 		me->CurrentANum = 0;
2464 	    }
2465 	    me->CurrentA = HTAnchor_findChildAndLink
2466 		(
2467 		    me->node_anchor,	/* Parent */
2468 		    id_string,	/* Tag */
2469 		    href,	/* Addresss */
2470 		    (HTLinkType *) 0);	/* Type */
2471 	    if (id_string)
2472 		*cp = '#';
2473 	    FREE(id_string);
2474 	    LYEnsureSingleSpace(me);
2475 	    if (me->inUnderline == FALSE)
2476 		HText_appendCharacter(me->text, LY_UNDERLINE_START_CHAR);
2477 	    HTML_put_string(me, "REFRESH(");
2478 	    HTML_put_string(me, Seconds);
2479 	    HTML_put_string(me, " sec):");
2480 	    FREE(Seconds);
2481 	    if (me->inUnderline == FALSE)
2482 		HText_appendCharacter(me->text, LY_UNDERLINE_END_CHAR);
2483 	    HTML_put_character(me, ' ');
2484 	    me->in_word = NO;
2485 	    HText_beginAnchor(me->text, me->inUnderline, me->CurrentA);
2486 	    if (me->inBoldH == FALSE)
2487 		HText_appendCharacter(me->text, LY_BOLD_START_CHAR);
2488 	    HTML_put_string(me, href);
2489 	    FREE(href);
2490 	    if (me->inBoldH == FALSE)
2491 		HText_appendCharacter(me->text, LY_BOLD_END_CHAR);
2492 	    HText_endAnchor(me->text, 0);
2493 	    LYEnsureSingleSpace(me);
2494 	}
2495 
2496 	/*
2497 	 * Check for a suggested filename via a Content-Disposition with a
2498 	 * filename=name.suffix in it, if we don't already have it via a server
2499 	 * header.  - FM
2500 	 */
2501     } else if (isEmpty(me->node_anchor->SugFname) &&
2502 	       !strcasecomp((http_equiv ?
2503 			     http_equiv : ""), "Content-Disposition")) {
2504 	cp = content;
2505 	while (*cp != '\0' && strncasecomp(cp, "filename", 8))
2506 	    cp++;
2507 	if (*cp != '\0') {
2508 	    cp = LYSkipBlanks(cp + 8);
2509 	    if (*cp == '=')
2510 		cp++;
2511 	    cp = LYSkipBlanks(cp);
2512 	    if (*cp != '\0') {
2513 		StrAllocCopy(me->node_anchor->SugFname, cp);
2514 		if (*me->node_anchor->SugFname == '"') {
2515 		    if ((cp = strchr((me->node_anchor->SugFname + 1),
2516 				     '"')) != NULL) {
2517 			*(cp + 1) = '\0';
2518 			HTMIME_TrimDoubleQuotes(me->node_anchor->SugFname);
2519 			if (isEmpty(me->node_anchor->SugFname)) {
2520 			    FREE(me->node_anchor->SugFname);
2521 			}
2522 		    } else {
2523 			FREE(me->node_anchor->SugFname);
2524 		    }
2525 		}
2526 #if defined(UNIX) && !defined(DOSPATH)
2527 		/*
2528 		 * If blanks are not legal for local filenames, replace them
2529 		 * with underscores.
2530 		 */
2531 		if ((cp = me->node_anchor->SugFname) != NULL) {
2532 		    while (*cp != '\0') {
2533 			if (isspace(UCH(*cp)))
2534 			    *cp = '_';
2535 			++cp;
2536 		    }
2537 		}
2538 #endif
2539 	    }
2540 	}
2541 	/*
2542 	 * Check for a Set-Cookie directive.  - AK
2543 	 */
2544     } else if (!strcasecomp(NonNull(http_equiv), "Set-Cookie")) {
2545 	/*
2546 	 * This will need to be updated when Set-Cookie/Set-Cookie2 handling is
2547 	 * finalized.  For now, we'll still assume "historical" cookies in META
2548 	 * directives.  - FM
2549 	 */
2550 	url_type = is_url(me->inBASE ?
2551 			  me->base_href : me->node_anchor->address);
2552 	if (url_type == HTTP_URL_TYPE || url_type == HTTPS_URL_TYPE) {
2553 	    LYSetCookie(content,
2554 			NULL,
2555 			(me->inBASE ?
2556 			 me->base_href : me->node_anchor->address));
2557 	}
2558     }
2559 
2560     /*
2561      * Free the copies.  - FM
2562      */
2563   free_META_copies:
2564     FREE(http_equiv);
2565     FREE(name);
2566     FREE(content);
2567     FREE(charset);
2568 }
2569 
2570 /*
2571  *  This function handles P elements in HTML streams.
2572  *  If start is TRUE it handles a start tag, and if
2573  *  FALSE, an end tag.	We presently handle start
2574  *  and end tags identically, but this can lead to
2575  *  a different number of blank lines between the
2576  *  current paragraph and subsequent text when a P
2577  *  end tag is present or not in the markup. - FM
2578  */
void LYHandlePlike(HTStructured * me, const BOOL *present,
		   STRING2PTR value,
		   char **include GCC_UNUSED,
		   int align_idx,
		   int start)
{
    /*
     * me        - the structured stream whose state and HText are updated.
     * present   - per-attribute "was given" flags (may be NULL).
     * value     - per-attribute values, parallel to present.
     * include   - unused.
     * align_idx - index of the ALIGN attribute in present/value; a negative
     *             index disables ALIGN handling.
     * start     - non-zero for a start tag, zero for an end tag.
     */

    /*
     * A P start tag met inside FIG or APPLET content marks that content as
     * containing a paragraph, so it gets treated as a block.
     */
    if (start && me->inFIG)
	me->inFIGwithP = TRUE;
    if (start && me->inAPPLET)
	me->inAPPLETwithP = TRUE;

    UPDATE_STYLE;

    if (me->List_Nesting_Level >= 0) {
	/*
	 * Inside a list: only separate from preceding paragraph text, using
	 * a blank line when the context or the style's spacing asks for it.
	 */
	if (me->inP) {
	    int want_blank_line = (me->inFIG || me->inAPPLET ||
				   me->inCAPTION || me->inCREDIT ||
				   me->sp->style->spaceAfter > 0 ||
				   (start && me->sp->style->spaceBefore > 0));

	    if (want_blank_line)
		LYEnsureDoubleSpace(me);
	    else
		LYEnsureSingleSpace(me);
	}
    } else if (me->sp[0].tag_number == HTML_ADDRESS) {
	/*
	 * Inside ADDRESS: just begin a new line unless we are on an empty
	 * one already.
	 */
	if (!HText_LastLineEmpty(me->text, FALSE)) {
	    HText_setLastChar(me->text, ' ');	/* absorb white space */
	    HText_appendCharacter(me->text, '\r');
	}
    } else {
	/*
	 * Ordinary context.  An end tag only adjusts spacing; a start tag
	 * appends a paragraph unless we are in a LABEL that has no
	 * paragraph text yet.
	 */
	if (!start) {
	    if (me->sp->style->spaceAfter > 0)
		LYEnsureDoubleSpace(me);
	    else
		LYEnsureSingleSpace(me);
	} else if (!me->inLABEL || me->inP) {
	    HText_appendParagraph(me->text);
	}
	me->inLABEL = FALSE;
    }
    me->in_word = NO;

    /*
     * Choose the alignment the current style starts out with.
     */
    if (LYoverride_default_alignment(me)) {
	me->sp->style->alignment = LYstyles(me->sp[0].tag_number)->alignment;
    } else {
	int div_style_in_list = (me->List_Nesting_Level >= 0 &&
				 (me->sp->style->id == ST_DivCenter ||
				  me->sp->style->id == ST_DivLeft ||
				  me->sp->style->id == ST_DivRight));
	int plain_style_no_div = ((me->Division_Level < 0) &&
				  (me->sp->style->id == ST_Normal ||
				   me->sp->style->id == ST_Preformatted));

	if (div_style_in_list || plain_style_no_div)
	    me->sp->style->alignment = HT_LEFT;
	else
	    me->sp->style->alignment = (short) me->current_default_alignment;
    }

    /*
     * An explicit ALIGN on a start tag takes precedence.  "center" and
     * "right" are not honored in a list while no paragraph text has been
     * seen; "left" and "justify" both map to left alignment.
     */
    if (start && align_idx >= 0 &&
	present && present[align_idx] && value[align_idx]) {
	const char *wanted = value[align_idx];
	int list_without_P = (me->List_Nesting_Level >= 0 && !me->inP);

	if (!strcasecomp(wanted, "center") && !list_without_P) {
	    me->sp->style->alignment = HT_CENTER;
	} else if (!strcasecomp(wanted, "right") && !list_without_P) {
	    me->sp->style->alignment = HT_RIGHT;
	} else if (!strcasecomp(wanted, "left") ||
		   !strcasecomp(wanted, "justify")) {
	    me->sp->style->alignment = HT_LEFT;
	}
    }

    /*
     * A new paragraph begins here and none of its text has arrived yet.
     */
    me->inP = FALSE;
}
2676 
2677 /*
2678  *  This function handles SELECT elements in HTML streams.
2679  *  If start is TRUE it handles a start tag, and if FALSE,
2680  *  an end tag. - FM
2681  */
void LYHandleSELECT(HTStructured * me, const BOOL *present,
		    STRING2PTR value,
		    char **include GCC_UNUSED,
		    int start)
{
    /*
     * me      - the structured stream whose state and HText are updated.
     * present - per-attribute "was given" flags (may be NULL).
     * value   - per-attribute values, indexed by the HTML_SELECT_* constants.
     * include - unused.
     * start   - TRUE for the SELECT start tag, anything else for the end tag.
     */
    int i;

    if (start == TRUE) {
	char *name = NULL;
	BOOLEAN multiple = NO;
	char *size = NULL;	/* never filled in: SIZE is ignored, see below */

	/*
	 * Initialize the disable attribute.
	 */
	me->select_disabled = FALSE;

	/*
	 * Check for unclosed TEXTAREA.
	 */
	if (me->inTEXTAREA) {
	    if (LYBadHTML(me)) {
		LYShowBadHTML("Bad HTML: Missing TEXTAREA end tag\n");
	    }
	}

	/*
	 * Set to know we are in a select tag.
	 */
	me->inSELECT = TRUE;

	/*
	 * Take a private copy of NAME (an empty string if it is missing),
	 * unescaping it only when it contains an ampersand.
	 */
	if (!(present && present[HTML_SELECT_NAME] &&
	      non_empty(value[HTML_SELECT_NAME]))) {
	    StrAllocCopy(name, "");
	} else if (strchr(value[HTML_SELECT_NAME], '&') == NULL) {
	    StrAllocCopy(name, value[HTML_SELECT_NAME]);
	} else {
	    StrAllocCopy(name, value[HTML_SELECT_NAME]);
	    UNESCAPE_FIELDNAME_TO_STD(&name);
	}
	if (present && present[HTML_SELECT_MULTIPLE])
	    multiple = YES;
	if (present && present[HTML_SELECT_DISABLED])
	    me->select_disabled = TRUE;
	if (present && present[HTML_SELECT_SIZE] &&
	    non_empty(value[HTML_SELECT_SIZE])) {
	    /*
	     * Let the size be determined by the number of OPTIONs.  - FM
	     */
	    CTRACE((tfp, "LYHandleSELECT: Ignoring SIZE=\"%s\" for SELECT.\n",
		    value[HTML_SELECT_SIZE]));
	}

	/*
	 * Switch off bold and underline for the duration of the SELECT,
	 * except for a multiple SELECT while popups are enabled.  needBoldH
	 * records that bold has to be restored by the end tag.
	 */
	if (me->inBoldH == TRUE &&
	    (multiple == NO || LYSelectPopups == FALSE)) {
	    HText_appendCharacter(me->text, LY_BOLD_END_CHAR);
	    me->inBoldH = FALSE;
	    me->needBoldH = TRUE;
	}
	if (me->inUnderline == TRUE &&
	    (multiple == NO || LYSelectPopups == FALSE)) {
	    HText_appendCharacter(me->text, LY_UNDERLINE_END_CHAR);
	    me->inUnderline = FALSE;
	}

	if ((multiple == NO && LYSelectPopups == TRUE) &&
	    (me->sp[0].tag_number == HTML_PRE || me->inPRE == TRUE ||
	     !me->sp->style->freeFormat) &&
	    HText_LastLineSize(me->text, FALSE) > (LYcolLimit - 7)) {
	    /*
	     * Force a newline when we're using a popup in a PRE block and are
	     * within 7 columns from the right margin.  This will allow for the
	     * '[' popup designator and help avoid a wrap in the underscore
	     * placeholder for the retracted popup entry in the HText
	     * structure.  - FM
	     */
	    HTML_put_character(me, '\n');
	    me->in_word = NO;
	}

	LYCheckForID(me, present, value, (int) HTML_SELECT_ID);

	HText_beginSelect(name, ATTR_CS_IN, multiple, size);
	FREE(name);
	FREE(size);

	me->first_option = TRUE;
    } else {
	/*
	 * Handle end tag.
	 */
	char *ptr;

	/*
	 * Make sure we had a select start tag.
	 */
	if (!me->inSELECT) {
	    if (LYBadHTML(me)) {
		LYShowBadHTML("Bad HTML: Unmatched SELECT end tag\n");
	    }
	    return;
	}

	/*
	 * Set to know that we are no longer in a select tag.
	 */
	me->inSELECT = FALSE;

	/*
	 * Clear the disable attribute.
	 */
	me->select_disabled = FALSE;

	/*
	 * Finish the data off.
	 */
	HTChunkTerminate(&me->option);
	/*
	 * Finish the previous option.
	 */
	ptr = HText_setLastOptionValue(me->text,
				       me->option.data,
				       me->LastOptionValue,
				       LAST_ORDER,
				       me->LastOptionChecked,
				       me->UCLYhndl,
				       ATTR_CS_IN);
	FREE(me->LastOptionValue);

	me->LastOptionChecked = FALSE;

	if (HTCurSelectGroupType == F_CHECKBOX_TYPE ||
	    LYSelectPopups == FALSE) {
	    /*
	     * Start a newline after the last checkbox/button option.
	     */
	    LYEnsureSingleSpace(me);
	} else {
	    /*
	     * Output popup box with the default option to screen, but use
	     * non-breaking spaces for output.
	     */
	    if (ptr &&
		me->sp[0].tag_number == HTML_PRE && strlen(ptr) > 6) {
		/*
		 * The code inadequately handles OPTION fields in PRE tags.
		 * We'll put up a minimum of 6 characters, and if any more
		 * would exceed the wrap column, we'll ignore them.
		 */
		for (i = 0; i < 6; i++) {
		    if (*ptr == ' ')
			HText_appendCharacter(me->text, HT_NON_BREAK_SPACE);
		    else
			HText_appendCharacter(me->text, *ptr);
		    ptr++;
		}
		/*
		 * The guaranteed minimum is out; have the HText layer ignore
		 * excess characters for the rest of the entry, as promised
		 * above.  Switched off again once the entry is complete.
		 */
		HText_setIgnoreExcess(me->text, TRUE);
	    }
	    for (; non_empty(ptr); ptr++) {
		if (*ptr == ' ')
		    HText_appendCharacter(me->text, HT_NON_BREAK_SPACE);
		else
		    HText_appendCharacter(me->text, *ptr);
	    }
	    /*
	     * Add end option character.
	     */
	    if (!me->first_option) {
		HText_appendCharacter(me->text, ']');
		HText_setLastChar(me->text, ']');
		me->in_word = YES;
	    }
	    /*
	     * Always return to normal handling of excess characters, whether
	     * or not it was changed above.
	     */
	    HText_setIgnoreExcess(me->text, FALSE);
	}
	HTChunkClear(&me->option);

	/*
	 * Restore the underline and bold that the start tag switched off.
	 */
	if (me->Underline_Level > 0 && me->inUnderline == FALSE) {
	    HText_appendCharacter(me->text, LY_UNDERLINE_START_CHAR);
	    me->inUnderline = TRUE;
	}
	if (me->needBoldH == TRUE && me->inBoldH == FALSE) {
	    HText_appendCharacter(me->text, LY_BOLD_START_CHAR);
	    me->inBoldH = TRUE;
	    me->needBoldH = FALSE;
	}
    }
}
2867 
2868 /*
2869  *  This function strips white characters and
2870  *  generally fixes up attribute values that
2871  *  were received from the SGML parser and
2872  *  are to be treated as partial or absolute
2873  *  URLs. - FM
2874  */
int LYLegitimizeHREF(HTStructured * me, char **href,
		     int force_slash,
		     int strip_dots)
{
    /*
     * me          - the structured stream; supplies the base URL (the BASE
     *               href when inBASE is set, else the document address).
     * href        - address of an allocated string, cleaned up in place and
     *               possibly reallocated.
     * force_slash - if set, a bare "." or ".." gets a trailing slash unless
     *               the base is a file URL.
     * strip_dots  - if set (and LYStripDotDotURLs is on), lead "/.."
     *               elements are stripped when resolving against an http
     *               or https base.
     *
     * Returns the value of is_url() for the cleaned-up href, or 0 if there
     * was nothing to examine.
     */
    int url_type = 0;
    char *p = NULL;
    char *pound = NULL;
    const char *Base = NULL;

    if (!me || !href || isEmpty(*href))
	return (url_type);

    if (!LYTrimStartfile(*href)) {
	/*
	 * Collapse spaces in the actual URL, but just protect against tabs or
	 * newlines in the fragment, if present.  This seeks to cope with
	 * atrocities inflicted on the Web by authoring tools such as
	 * Frontpage.  - FM
	 */

	/*  Before working on spaces check if we have any, usually none. */
	p = LYSkipNonBlanks(*href);

	if (*p) {		/* p == first space character */
	    /* no reallocs below, all converted in place */

	    pound = findPoundSelector(*href);

	    if (pound != NULL && pound < p) {
		convert_to_spaces(p, FALSE);	/* done */

	    } else {
		if (pound != NULL)
		    *pound = '\0';	/* mark */

		/*
		 * No blanks really belong in the HREF,
		 * but if it refers to an actual file,
		 * it may actually have blanks in the name.
		 * Try to accommodate. See also HTParse().
		 */
		if (LYRemoveNewlines(p) || strchr(p, '\t') != 0) {
		    LYRemoveBlanks(p);	/* a compromise... */
		}

		if (pound != NULL) {
		    p = strchr(p, '\0');
		    *pound = '#';	/* restore */
		    convert_to_spaces(pound, FALSE);
		    /*
		     * Slide the fragment down over the gap left by removed
		     * blanks.  Source and destination lie in the same buffer
		     * and may overlap, so memmove() is required; strcpy() is
		     * undefined for overlapping objects.
		     */
		    if (p < pound)
			memmove(p, pound, strlen(pound) + 1);
		}
	    }
	}
    }
    if (**href == '\0')
	return (url_type);

    TRANSLATE_AND_UNESCAPE_TO_STD(href);

    Base = me->inBASE ?
	me->base_href : me->node_anchor->address;

    url_type = is_url(*href);
    if (!url_type && force_slash && **href == '.' &&
	(!strcmp(*href, ".") || !strcmp(*href, "..")) &&
	!isFILE_URL(Base)) {
	/*
	 * The Fielding RFC/ID for resolving partial HREFs says that a slash
	 * should be on the end of the preceding symbolic element for "." and
	 * "..", but all tested browsers only do that for an explicit "./" or
	 * "../", so we'll respect the RFC/ID only if force_slash was TRUE and
	 * it's not a file URL.  - FM
	 */
	StrAllocCat(*href, "/");
    }
    if ((!url_type && LYStripDotDotURLs && strip_dots && **href == '.') &&
	!strncasecomp(Base, "http", 4)) {
	/*
	 * We will be resolving a partial reference versus an http or https
	 * URL, and it has lead dots, which may be retained when resolving via
	 * HTParse(), but the request would fail if the first element of the
	 * resultant path is two dots, because no http or https server accepts
	 * such paths, and the current URL draft, likely to become an RFC, says
	 * that it's optional for the UA to strip them as a form of error
	 * recovery.  So we will, recursively, for http/https URLs, like the
	 * "major market browsers" which made this problem so common on the
	 * Web, but we'll also issue a message about it, such that the bad
	 * partial reference might get corrected by the document provider.  -
	 * FM
	 */
	char *temp = NULL, *path = NULL, *cp;
	const char *str = "";

	temp = HTParse(*href, Base, PARSE_ALL);
	path = HTParse(temp, "", PARSE_PATH + PARSE_PUNCTUATION);
	if (!StrNCmp(path, "/..", 3)) {
	    cp = (path + 3);
	    if (LYIsHtmlSep(*cp) || *cp == '\0') {
		if (Base[4] == 's') {
		    str = "s";
		}
		CTRACE((tfp,
			"LYLegitimizeHREF: Bad value '%s' for http%s URL.\n",
			*href, str));
		CTRACE((tfp, "                  Stripping lead dots.\n"));
		/* Tell the user only once; inBadHREF remembers we did. */
		if (!me->inBadHREF) {
		    HTUserMsg(BAD_PARTIAL_REFERENCE);
		    me->inBadHREF = TRUE;
		}
	    }
	    if (*cp == '\0') {
		/* The path was exactly "/..": fall back to the root. */
		StrAllocCopy(*href, "/");
	    } else if (LYIsHtmlSep(*cp)) {
		/*
		 * Step over each further "/.." that is followed by a slash;
		 * a final "/.." at the very end is cut down to "/".
		 */
		while (!StrNCmp(cp, "/..", 3)) {
		    if (*(cp + 3) == '/') {
			cp += 3;
			continue;
		    } else if (*(cp + 3) == '\0') {
			*(cp + 1) = '\0';
			*(cp + 2) = '\0';
		    }
		    break;
		}
		StrAllocCopy(*href, cp);
	    }
	}
	FREE(temp);
	FREE(path);
    }
    return (url_type);
}
3007 
3008 /*
3009  *  This function checks for a Content-Base header,
3010  *  and if not present, a Content-Location header
3011  *  which is an absolute URL, and sets the BASE
3012  *  accordingly.  If set, it will be replaced by
3013  *  any BASE tag in the HTML stream, itself. - FM
3014  */
void LYCheckForContentBase(HTStructured * me)
{
    /*
     * me - the structured stream; its node_anchor supplies content_base
     *      and content_location.  Nothing is done if me or its anchor is
     *      NULL, or if no usable value is found.
     */
    char *cp = NULL;
    BOOL present[HTML_BASE_ATTRIBUTES];
    const char *value[HTML_BASE_ATTRIBUTES];
    int i;

    if (!(me && me->node_anchor))
	return;

    if (me->node_anchor->content_base != NULL) {
	/*
	 * We have a Content-Base value.  Use it if it's non-zero length.  - FM
	 */
	if (*me->node_anchor->content_base == '\0')
	    return;
	StrAllocCopy(cp, me->node_anchor->content_base);
	LYRemoveBlanks(cp);
    } else if (me->node_anchor->content_location != NULL) {
	/*
	 * We didn't have a Content-Base value, but do have a Content-Location
	 * value.  Use it if it's an absolute URL.  - FM
	 */
	if (*me->node_anchor->content_location == '\0')
	    return;
	StrAllocCopy(cp, me->node_anchor->content_location);
	LYRemoveBlanks(cp);
	if (!is_url(cp)) {
	    FREE(cp);
	    return;
	}
    } else {
	/*
	 * We had neither a Content-Base nor Content-Location value.  - FM
	 */
	return;
    }

    /*
     * If we collapsed to a zero-length value, ignore it.  - FM
     */
    if (*cp == '\0') {
	FREE(cp);
	return;
    }

    /*
     * Pass the value to HTML_start_element as the HREF of a BASE tag.  - FM
     *
     * Every slot of both arrays is given a defined value, so the handler
     * never sees an indeterminate pointer for an attribute that is absent.
     */
    for (i = 0; i < HTML_BASE_ATTRIBUTES; i++) {
	present[i] = NO;
	value[i] = NULL;
    }
    present[HTML_BASE_HREF] = YES;
    value[HTML_BASE_HREF] = (const char *) cp;
    (*me->isa->start_element) (me, HTML_BASE, present, value,
			       0, 0);
    FREE(cp);
}
3072 
3073 /*
3074  *  This function creates NAMEd Anchors if a non-zero-length NAME
3075  *  or ID attribute was present in the tag. - FM
3076  */
void LYCheckForID(HTStructured * me, const BOOL *present,
		  STRING2PTR value,
		  int attribute)
{
    /*
     * me        - the structured stream; needs a non-NULL text.
     * present   - per-attribute "was given" flags (may be NULL).
     * value     - per-attribute values, parallel to present.
     * attribute - index of the NAME or ID attribute to look at.
     */
    char *tag = NULL;
    HTChildAnchor *named = NULL;

    if (me == NULL || me->text == NULL)
	return;

    /*
     * Nothing to do unless the attribute was given with a non-empty value.
     */
    if (!present || !present[attribute] || !non_empty(value[attribute]))
	return;

    /*
     * Work on a private copy, translating any named or numeric character
     * references in it.
     */
    StrAllocCopy(tag, value[attribute]);
    LYUCTranslateHTMLString(&tag, me->tag_charset, me->tag_charset,
			    NO, NO, YES, st_URL);

    /*
     * The translation may have left nothing; only a non-empty tag gets an
     * anchor, which is begun and ended at once.
     */
    if (tag[0] != '\0') {
	named = HTAnchor_findChildAndLink(me->node_anchor,	/* Parent */
					  tag,	/* Tag */
					  NULL,	/* Address */
					  (HTLinkType *) 0);	/* Type */
	if (named) {
	    HText_beginAnchor(me->text, me->inUnderline, named);
	    HText_endAnchor(me->text, 0);
	}
    }
    FREE(tag);
}
3112 
3113 /*
3114  *  This function creates a NAMEd Anchor for the ID string
3115  *  passed to it directly as an argument.  It assumes the
 *  string does not need checking for character references. - FM
3117  */
void LYHandleID(HTStructured * me, const char *id)
{
    /*
     * me - the structured stream; needs a non-NULL text.
     * id - the anchor name, used exactly as given; NULL or "" is ignored.
     */
    HTChildAnchor *named;

    if (!me || !me->text)
	return;
    if (isEmpty(id))
	return;

    named = HTAnchor_findChildAndLink(me->node_anchor,	/* Parent */
				      id,	/* Tag */
				      NULL,	/* Address */
				      (HTLinkType *) 0);	/* Type */
    if (named == NULL)
	return;

    /*
     * The anchor is begun and ended at once, with nothing in between.
     */
    HText_beginAnchor(me->text, me->inUnderline, named);
    HText_endAnchor(me->text, 0);
}
3139 
3140 /*
3141  *  This function checks whether we want to override
3142  *  the current default alignment for paragraphs and
3143  *  instead use that specified in the element's style
3144  *  sheet. - FM
3145  */
LYoverride_default_alignment(HTStructured * me)3146 BOOLEAN LYoverride_default_alignment(HTStructured * me)
3147 {
3148     if (!me)
3149 	return NO;
3150 
3151     switch (me->sp[0].tag_number) {
3152     case HTML_BLOCKQUOTE:
3153     case HTML_BQ:
3154     case HTML_NOTE:
3155     case HTML_FN:
3156     case HTML_ADDRESS:
3157 	me->sp->style->alignment = HT_LEFT;
3158 	return YES;
3159 
3160     default:
3161 	break;
3162     }
3163     return NO;
3164 }
3165 
3166 /*
3167  *  This function inserts newlines if needed to create double spacing,
3168  *  and sets the left margin for subsequent text to the second line
3169  *  indentation of the current style. - FM
3170  */
LYEnsureDoubleSpace(HTStructured * me)3171 void LYEnsureDoubleSpace(HTStructured * me)
3172 {
3173     if (!me || !me->text)
3174 	return;
3175 
3176     if (!HText_LastLineEmpty(me->text, FALSE)) {
3177 	HText_setLastChar(me->text, ' ');	/* absorb white space */
3178 	HText_appendCharacter(me->text, '\r');
3179 	HText_appendCharacter(me->text, '\r');
3180     } else if (!HText_PreviousLineEmpty(me->text, FALSE)) {
3181 	HText_setLastChar(me->text, ' ');	/* absorb white space */
3182 	HText_appendCharacter(me->text, '\r');
3183     } else if (me->List_Nesting_Level >= 0) {
3184 	HText_NegateLineOne(me->text);
3185     }
3186     me->in_word = NO;
3187     return;
3188 }
3189 
3190 /*
3191  *  This function inserts a newline if needed to create single spacing,
3192  *  and sets the left margin for subsequent text to the second line
3193  *  indentation of the current style. - FM
3194  */
LYEnsureSingleSpace(HTStructured * me)3195 void LYEnsureSingleSpace(HTStructured * me)
3196 {
3197     if (!me || !me->text)
3198 	return;
3199 
3200     if (!HText_LastLineEmpty(me->text, FALSE)) {
3201 	HText_setLastChar(me->text, ' ');	/* absorb white space */
3202 	HText_appendCharacter(me->text, '\r');
3203     } else if (me->List_Nesting_Level >= 0) {
3204 	HText_NegateLineOne(me->text);
3205     }
3206     me->in_word = NO;
3207     return;
3208 }
3209 
3210 /*
3211  *  This function resets paragraph alignments for block
3212  *  elements which do not have a defined style sheet. - FM
3213  */
LYResetParagraphAlignment(HTStructured * me)3214 void LYResetParagraphAlignment(HTStructured * me)
3215 {
3216     if (!me)
3217 	return;
3218 
3219     if (me->List_Nesting_Level >= 0 ||
3220 	((me->Division_Level < 0) &&
3221 	 (me->sp->style->id == ST_Normal ||
3222 	  me->sp->style->id == ST_Preformatted))) {
3223 	me->sp->style->alignment = HT_LEFT;
3224     } else {
3225 	me->sp->style->alignment = (short) me->current_default_alignment;
3226     }
3227     return;
3228 }
3229 
3230 /*
3231  *  This example function checks whether the given anchor has
3232  *  an address with a file scheme, and if so, loads it into the
3233  *  the SGML parser's context->url element, which was passed as
3234  *  the second argument.  The handle_comment() calling function in
3235  *  SGML.c then calls LYDoCSI() in LYUtils.c to insert HTML markup
3236  *  into the corresponding stream, homologously to an SSI by an
3237  *  HTTP server. - FM
3238  *
3239  *  For functions similar to this but which depend on details of
3240  *  the HTML handler's internal data, the calling interface should
3241  *  be changed, and functions in SGML.c would have to make sure not
3242  *  to call such functions inappropriately (e.g., calling a function
3243  *  specific to the Lynx_HTML_Handler when SGML.c output goes to
3244  *  some other HTStructured object like in HTMLGen.c), or the new
3245  *  functions could be added to the SGML.h interface.
3246  */
LYCheckForCSI(HTParentAnchor * anchor,char ** url)3247 BOOLEAN LYCheckForCSI(HTParentAnchor *anchor,
3248 		      char **url)
3249 {
3250     if (!(anchor && anchor->address))
3251 	return FALSE;
3252 
3253     if (!isFILE_URL(anchor->address))
3254 	return FALSE;
3255 
3256     if (!LYisLocalHost(anchor->address))
3257 	return FALSE;
3258 
3259     StrAllocCopy(*url, anchor->address);
3260     return TRUE;
3261 }
3262 
3263 /*
3264  *  This function is called from the SGML parser to look at comments
3265  *  and see whether we should collect some info from them.  Currently
3266  *  it only looks for comments with Message-Id and Subject info, in the
3267  *  exact form generated by MHonArc for archived mailing list.  If found,
3268  *  the info is stored in the document's HTParentAnchor.  It can later be
3269  *  used for generating a mail response.
3270  *
3271  *  We are extra picky here because there isn't any official definition
3272  *  for these kinds of comments - we might (and still can) misinterpret
3273  *  arbitrary comments as something they aren't.
3274  *
3275  *  If something doesn't look right, for example invalid characters, the
3276  *  strings are not stored.  Mail responses will use something else as
3277  *  the subject, probably the document URL, and will not have an
3278  *  In-Reply-To header.
3279  *
3280  *  All this is a hack - to do this the right way, mailing list archivers
3281  *  would have to agree on some better mechanism to make this kind of info
3282  *  from original mail headers available, for example using LINK.  - kw
3283  */
LYCommentHacks(HTParentAnchor * anchor,const char * comment)3284 BOOLEAN LYCommentHacks(HTParentAnchor *anchor,
3285 		       const char *comment)
3286 {
3287     const char *cp;
3288     size_t len;
3289 
3290     if (comment == NULL)
3291 	return FALSE;
3292 
3293     if (!(anchor && anchor->address))
3294 	return FALSE;
3295 
3296     if (StrNCmp(comment, "!--X-Message-Id: ", 17) == 0) {
3297 	char *messageid = NULL;
3298 	char *p;
3299 
3300 	for (cp = comment + 17; *cp; cp++) {
3301 	    if (UCH(*cp) >= 127 || !isgraph(UCH(*cp))) {
3302 		break;
3303 	    }
3304 	}
3305 	if (strcmp(cp, " --")) {
3306 	    return FALSE;
3307 	}
3308 	cp = comment + 17;
3309 	StrAllocCopy(messageid, cp);
3310 	/* This should be ok - message-id should only contain 7-bit ASCII */
3311 	if (!LYUCTranslateHTMLString(&messageid, 0, 0, NO, NO, YES, st_URL))
3312 	    return FALSE;
3313 	for (p = messageid; *p; p++) {
3314 	    if (UCH(*p) >= 127 || !isgraph(UCH(*p))) {
3315 		break;
3316 	    }
3317 	}
3318 	if (strcmp(p, " --")) {
3319 	    FREE(messageid);
3320 	    return FALSE;
3321 	}
3322 	if ((p = strchr(messageid, '@')) == NULL || p[1] == '\0') {
3323 	    FREE(messageid);
3324 	    return FALSE;
3325 	}
3326 	p = messageid;
3327 	if ((len = strlen(p)) >= 8 && !strcmp(&p[len - 3], " --")) {
3328 	    p[len - 3] = '\0';
3329 	} else {
3330 	    FREE(messageid);
3331 	    return FALSE;
3332 	}
3333 	if (HTAnchor_setMessageID(anchor, messageid)) {
3334 	    FREE(messageid);
3335 	    return TRUE;
3336 	} else {
3337 	    FREE(messageid);
3338 	    return FALSE;
3339 	}
3340     }
3341     if (StrNCmp(comment, "!--X-Subject: ", 14) == 0) {
3342 	char *subject = NULL;
3343 	char *p;
3344 
3345 	for (cp = comment + 14; *cp; cp++) {
3346 	    if (UCH(*cp) >= 127 || !isprint(UCH(*cp))) {
3347 		return FALSE;
3348 	    }
3349 	}
3350 	cp = comment + 14;
3351 	StrAllocCopy(subject, cp);
3352 	/* @@@
3353 	 * This may not be the right thing for the subject - but mail
3354 	 * subjects shouldn't contain 8-bit characters in raw form anyway.
3355 	 * We have to unescape character entities, since that's what MHonArc
3356 	 * seems to generate.  But if after that there are 8-bit characters
3357 	 * the string is rejected.  We would probably not know correctly
3358 	 * what charset to assume anyway - the mail sender's can differ from
3359 	 * the archive's.  And the code for sending mail cannot deal well
3360 	 * with 8-bit characters - we should not put them in the Subject
3361 	 * header in raw form, but don't have MIME encoding implemented.
3362 	 * Someone may want to do more about this...  - kw
3363 	 */
3364 	if (!LYUCTranslateHTMLString(&subject, 0, 0, NO, YES, NO, st_HTML))
3365 	    return FALSE;
3366 	for (p = subject; *p; p++) {
3367 	    if (UCH(*p) >= 127 || !isprint(UCH(*p))) {
3368 		FREE(subject);
3369 		return FALSE;
3370 	    }
3371 	}
3372 	p = subject;
3373 	if ((len = strlen(p)) >= 4 && !strcmp(&p[len - 3], " --")) {
3374 	    p[len - 3] = '\0';
3375 	} else {
3376 	    FREE(subject);
3377 	    return FALSE;
3378 	}
3379 	if (HTAnchor_setSubject(anchor, subject)) {
3380 	    FREE(subject);
3381 	    return TRUE;
3382 	} else {
3383 	    FREE(subject);
3384 	    return FALSE;
3385 	}
3386     }
3387 
3388     return FALSE;
3389 }
3390 
3391     /*
3392      * Create the Title with any left-angle-brackets converted to &lt; entities
3393      * and any ampersands converted to &amp; entities.  - FM
3394      *
3395      * Convert 8-bit letters to &#xUUUU to avoid dependencies from display
3396      * character set which may need changing.  Do NOT convert any 8-bit chars
3397      * if we have CJK display.  - LP
3398      */
LYformTitle(char ** dst,const char * src)3399 void LYformTitle(char **dst,
3400 		 const char *src)
3401 {
3402     if (HTCJK == JAPANESE) {
3403 	char *tmp_buffer = NULL;
3404 
3405 	if ((tmp_buffer = (char *) malloc(strlen(src) + 1)) == 0)
3406 	    outofmem(__FILE__, "LYformTitle");
3407 
3408 	assert(tmp_buffer != NULL);
3409 
3410 	switch (kanji_code) {	/* 1997/11/22 (Sat) 09:28:00 */
3411 	case EUC:
3412 	    TO_EUC((const unsigned char *) src, (unsigned char *) tmp_buffer);
3413 	    break;
3414 	case SJIS:
3415 	    TO_SJIS((const unsigned char *) src, (unsigned char *) tmp_buffer);
3416 	    break;
3417 	default:
3418 	    CTRACE((tfp, "\nLYformTitle: kanji_code is an unexpected value."));
3419 	    strcpy(tmp_buffer, src);
3420 	    break;
3421 	}
3422 	StrAllocCopy(*dst, tmp_buffer);
3423 	FREE(tmp_buffer);
3424     } else {
3425 	StrAllocCopy(*dst, src);
3426     }
3427 }
3428