1 /*
2 * $LynxId: LYCharUtils.c,v 1.123 2013/06/04 20:42:47 tom Exp $
3 *
4 * Functions associated with LYCharSets.c and the Lynx version of HTML.c - FM
5 * ==========================================================================
6 */
7 #include <HTUtils.h>
8 #include <SGML.h>
9
10 #define Lynx_HTML_Handler
11 #include <HTChunk.h>
12 #include <HText.h>
13 #include <HTStyle.h>
14 #include <HTMIME.h>
15 #include <HTML.h>
16
17 #include <HTCJK.h>
18 #include <HTAtom.h>
19 #include <HTMLGen.h>
20 #include <HTParse.h>
21 #include <UCMap.h>
22 #include <UCDefs.h>
23 #include <UCAux.h>
24
25 #include <LYGlobalDefs.h>
26 #include <LYCharUtils.h>
27 #include <LYCharSets.h>
28
29 #include <HTAlert.h>
30 #include <HTForms.h>
31 #include <HTNestedList.h>
32 #include <GridText.h>
33 #include <LYStrings.h>
34 #include <LYUtils.h>
35 #include <LYMap.h>
36 #include <LYBookmark.h>
37 #include <LYCurses.h>
38 #include <LYCookie.h>
39
40 #include <LYexit.h>
41 #include <LYLeaks.h>
42
43 /*
44 * Used for nested lists. - FM
45 */
46 int OL_CONTINUE = -29999; /* sentinel value; presumably marks an OL whose CONTINUE attribute is set -- TODO confirm at the call sites */
47 int OL_VOID = -29998; /* sentinel meaning "no count is set"; LYZero_OL_Counter() stores it in every OL_Counter[] slot */
48
49 /*
50 * This function converts any ampersands in allocated
51 * strings to "&". If isTITLE is TRUE, it also
52 * converts any angle-brackets to "<" or ">". - FM
53 */
LYEntify(char ** str,int isTITLE)54 void LYEntify(char **str,
55 int isTITLE)
56 {
57 char *p = *str;
58 char *q = NULL, *cp = NULL;
59 int amps = 0, lts = 0, gts = 0;
60
61 #ifdef CJK_EX
62 enum _state {
63 S_text,
64 S_esc,
65 S_dollar,
66 S_paren,
67 S_nonascii_text,
68 S_dollar_paren
69 } state = S_text;
70 int in_sjis = 0;
71 #endif
72
73 if (isEmpty(p))
74 return;
75
76 /*
77 * Count the ampersands. - FM
78 */
79 while ((*p != '\0') && (q = strchr(p, '&')) != NULL) {
80 amps++;
81 p = (q + 1);
82 }
83
84 /*
85 * Count the left-angle-brackets, if needed. - FM
86 */
87 if (isTITLE == TRUE) {
88 p = *str;
89 while ((*p != '\0') && (q = strchr(p, '<')) != NULL) {
90 lts++;
91 p = (q + 1);
92 }
93 }
94
95 /*
96 * Count the right-angle-brackets, if needed. - FM
97 */
98 if (isTITLE == TRUE) {
99 p = *str;
100 while ((*p != '\0') && (q = strchr(p, '>')) != NULL) {
101 gts++;
102 p = (q + 1);
103 }
104 }
105
106 /*
107 * Check whether we need to convert anything. - FM
108 */
109 if (amps == 0 && lts == 0 && gts == 0)
110 return;
111
112 /*
113 * Allocate space and convert. - FM
114 */
115 q = typecallocn(char,
116 (strlen(*str)
117 + (unsigned)(4 * amps)
118 + (unsigned)(3 * lts)
119 + (unsigned)(3 * gts) + 1));
120 if ((cp = q) == NULL)
121 outofmem(__FILE__, "LYEntify");
122
123 assert(cp != NULL);
124 assert(q != NULL);
125
126 for (p = *str; *p; p++) {
127 #ifdef CJK_EX
128 if (IS_CJK_TTY) {
129 switch (state) {
130 case S_text:
131 if (*p == '\033') {
132 state = S_esc;
133 *q++ = *p;
134 continue;
135 }
136 break;
137
138 case S_esc:
139 if (*p == '$') {
140 state = S_dollar;
141 *q++ = *p;
142 continue;
143 } else if (*p == '(') {
144 state = S_paren;
145 *q++ = *p;
146 continue;
147 } else {
148 state = S_text;
149 *q++ = *p;
150 continue;
151 }
152
153 case S_dollar:
154 if (*p == '@' || *p == 'B' || *p == 'A') {
155 state = S_nonascii_text;
156 *q++ = *p;
157 continue;
158 } else if (*p == '(') {
159 state = S_dollar_paren;
160 *q++ = *p;
161 continue;
162 } else {
163 state = S_text;
164 *q++ = *p;
165 continue;
166 }
167
168 case S_dollar_paren:
169 if (*p == 'C') {
170 state = S_nonascii_text;
171 *q++ = *p;
172 continue;
173 } else {
174 state = S_text;
175 *q++ = *p;
176 continue;
177 }
178
179 case S_paren:
180 if (*p == 'B' || *p == 'J' || *p == 'T') {
181 state = S_text;
182 *q++ = *p;
183 continue;
184 } else if (*p == 'I') {
185 state = S_nonascii_text;
186 *q++ = *p;
187 continue;
188 }
189 /* FALLTHRU */
190
191 case S_nonascii_text:
192 if (*p == '\033')
193 state = S_esc;
194 *q++ = *p;
195 continue;
196
197 default:
198 break;
199 }
200 if (*(p + 1) != '\0' &&
201 (IS_EUC(UCH(*p), UCH(*(p + 1))) ||
202 IS_SJIS(UCH(*p), UCH(*(p + 1)), in_sjis) ||
203 IS_BIG5(UCH(*p), UCH(*(p + 1))))) {
204 *q++ = *p++;
205 *q++ = *p;
206 continue;
207 }
208 }
209 #endif
210 if (*p == '&') {
211 *q++ = '&';
212 *q++ = 'a';
213 *q++ = 'm';
214 *q++ = 'p';
215 *q++ = ';';
216 } else if (isTITLE && *p == '<') {
217 *q++ = '&';
218 *q++ = 'l';
219 *q++ = 't';
220 *q++ = ';';
221 } else if (isTITLE && *p == '>') {
222 *q++ = '&';
223 *q++ = 'g';
224 *q++ = 't';
225 *q++ = ';';
226 } else {
227 *q++ = *p;
228 }
229 }
230 *q = '\0';
231 FREE(*str);
232 *str = cp;
233 }
234
235 /*
236 * Callers to LYEntifyTitle/LYEntifyValue do not look at the 'target' param.
237 * Optimize things a little by avoiding the memory allocation if not needed,
238 * as is usually the case.
239 */
MustEntify(const char * source)240 static BOOL MustEntify(const char *source)
241 {
242 BOOL result;
243
244 #ifdef CJK_EX
245 if (IS_CJK_TTY && strchr(source, '\033') != 0) {
246 result = TRUE;
247 } else
248 #endif
249 {
250 size_t length = strlen(source);
251 size_t reject = strcspn(source, "<&>");
252
253 result = (BOOL) (length != reject);
254 }
255
256 return result;
257 }
258
259 /*
260 * Wrappers for LYEntify() which do not assume that the source was allocated,
261 * e.g., output from gettext().
262 */
LYEntifyTitle(char ** target,const char * source)263 const char *LYEntifyTitle(char **target, const char *source)
264 {
265 const char *result = 0;
266
267 if (MustEntify(source)) {
268 StrAllocCopy(*target, source);
269 LYEntify(target, TRUE);
270 result = *target;
271 } else {
272 result = source;
273 }
274 return result;
275 }
276
LYEntifyValue(char ** target,const char * source)277 const char *LYEntifyValue(char **target, const char *source)
278 {
279 const char *result = 0;
280
281 if (MustEntify(source)) {
282 StrAllocCopy(*target, source);
283 LYEntify(target, FALSE);
284 result = *target;
285 } else {
286 result = source;
287 }
288 return result;
289 }
290
291 /*
292 * This function trims characters <= that of a space (32),
293 * including HT_NON_BREAK_SPACE (1) and HT_EN_SPACE (2),
294 * but not ESC, from the heads of strings. - FM
295 */
LYTrimHead(char * str)296 void LYTrimHead(char *str)
297 {
298 const char *s = str;
299
300 if (isEmpty(s))
301 return;
302
303 while (*s && WHITE(*s) && UCH(*s) != UCH(CH_ESC)) /* S/390 -- gil -- 1669 */
304 s++;
305 if (s > str) {
306 char *ns = str;
307
308 while (*s) {
309 *ns++ = *s++;
310 }
311 *ns = '\0';
312 }
313 }
314
/*
 * Strip trailing WHITE() characters from a string in place by overwriting
 * them with NULs.  Per the original note, WHITE() covers characters <= that
 * of a space (32), including HT_NON_BREAK_SPACE (1), HT_EN_SPACE (2), and
 * ESC.  - FM
 *
 * str  the string to trim; nothing is done when isEmpty(str) holds.
 *
 * The length is kept as a size_t rather than being narrowed to int, so the
 * index cannot go wrong for strings longer than INT_MAX.
 */
void LYTrimTail(char *str)
{
    size_t len;

    if (isEmpty(str))
        return;

    len = strlen(str);
    while (len > 0 && WHITE(str[len - 1]))
        str[--len] = '\0';
}
336
337 /*
338 * This function should receive a pointer to the start
339 * of a comment. It returns a pointer to the end ('>')
340 * character of comment, or its best guess if the comment
341 * is invalid. - FM
342 */
LYFindEndOfComment(char * str)343 char *LYFindEndOfComment(char *str)
344 {
345 char *cp, *cp1;
346 enum comment_state {
347 start1,
348 start2,
349 end1,
350 end2
351 } state;
352
353 if (str == NULL)
354 /*
355 * We got NULL, so return NULL. - FM
356 */
357 return NULL;
358
359 if (StrNCmp(str, "<!--", 4))
360 /*
361 * We don't have the start of a comment, so return the beginning of the
362 * string. - FM
363 */
364 return str;
365
366 cp = (str + 4);
367 if (*cp == '>')
368 /*
369 * It's an invalid comment, so
370 * return this end character. - FM
371 */
372 return cp;
373
374 if ((cp1 = strchr(cp, '>')) == NULL)
375 /*
376 * We don't have an end character, so return the beginning of the
377 * string. - FM
378 */
379 return str;
380
381 if (*cp == '-')
382 /*
383 * Ugh, it's a "decorative" series of dashes, so return the next end
384 * character. - FM
385 */
386 return cp1;
387
388 /*
389 * OK, we're ready to start parsing. - FM
390 */
391 state = start2;
392 while (*cp != '\0') {
393 switch (state) {
394 case start1:
395 if (*cp == '-')
396 state = start2;
397 else
398 /*
399 * Invalid comment, so return the first '>' from the start of
400 * the string. - FM
401 */
402 return cp1;
403 break;
404
405 case start2:
406 if (*cp == '-')
407 state = end1;
408 break;
409
410 case end1:
411 if (*cp == '-')
412 state = end2;
413 else
414 /*
415 * Invalid comment, so return the first '>' from the start of
416 * the string. - FM
417 */
418 return cp1;
419 break;
420
421 case end2:
422 if (*cp == '>')
423 /*
424 * Valid comment, so return the end character. - FM
425 */
426 return cp;
427 if (*cp == '-') {
428 state = start1;
429 } else if (!(WHITE(*cp) && UCH(*cp) != UCH(CH_ESC))) { /* S/390 -- gil -- 1686 */
430 /*
431 * Invalid comment, so return the first '>' from the start of
432 * the string. - FM
433 */
434 return cp1;
435 }
436 break;
437
438 default:
439 break;
440 }
441 cp++;
442 }
443
444 /*
445 * Invalid comment, so return the first '>' from the start of the string.
446 * - FM
447 */
448 return cp1;
449 }
450
451 /*
452 * If an HREF, itself or if resolved against a base,
453 * represents a file URL, and the host is defaulted,
454 * force in "//localhost". We need this until
455 * all the other Lynx code which performs security
456 * checks based on the "localhost" string is changed
457 * to assume "//localhost" when a host field is not
458 * present in file URLs - FM
459 */
LYFillLocalFileURL(char ** href,const char * base)460 void LYFillLocalFileURL(char **href,
461 const char *base)
462 {
463 char *temp = NULL;
464
465 if (isEmpty(*href))
466 return;
467
468 if (!strcmp(*href, "//") || !StrNCmp(*href, "///", 3)) {
469 if (base != NULL && isFILE_URL(base)) {
470 StrAllocCopy(temp, STR_FILE_URL);
471 StrAllocCat(temp, *href);
472 StrAllocCopy(*href, temp);
473 }
474 }
475 if (isFILE_URL(*href)) {
476 if (*(*href + 5) == '\0') {
477 StrAllocCat(*href, "//localhost");
478 } else if (!strcmp(*href, "file://")) {
479 StrAllocCat(*href, "localhost");
480 } else if (!StrNCmp(*href, "file:///", 8)) {
481 StrAllocCopy(temp, (*href + 7));
482 LYLocalFileToURL(href, temp);
483 } else if (!StrNCmp(*href, "file:/", 6) && !LYIsHtmlSep(*(*href + 6))) {
484 StrAllocCopy(temp, (*href + 5));
485 LYLocalFileToURL(href, temp);
486 }
487 }
488 #if defined(USE_DOS_DRIVES)
489 if (LYIsDosDrive(*href)) {
490 /*
491 * If it's a local DOS path beginning with drive letter,
492 * add file://localhost/ prefix and go ahead.
493 */
494 StrAllocCopy(temp, *href);
495 LYLocalFileToURL(href, temp);
496 }
497
498 /* use below: strlen("file://localhost/") = 17 */
499 if (!StrNCmp(*href, "file://localhost/", 17)
500 && (strlen(*href) == 19)
501 && LYIsDosDrive(*href + 17)) {
502 /*
503 * Terminate DOS drive letter with a slash to surf root successfully.
504 * Here seems a proper place to do so.
505 */
506 LYAddPathSep(href);
507 }
508 #endif /* USE_DOS_DRIVES */
509
510 /*
511 * No path in a file://localhost URL means a
512 * directory listing for the current default. - FM
513 */
514 if (!strcmp(*href, "file://localhost")) {
515 const char *temp2;
516
517 #ifdef VMS
518 temp2 = HTVMS_wwwName(LYGetEnv("PATH"));
519 #else
520 char curdir[LY_MAXPATH];
521
522 temp2 = wwwName(Current_Dir(curdir));
523 #endif /* VMS */
524 if (!LYIsHtmlSep(*temp2))
525 LYAddHtmlSep(href);
526 /*
527 * Check for pathological cases - current dir has chars which MUST BE
528 * URL-escaped - kw
529 */
530 if (strchr(temp2, '%') != NULL || strchr(temp2, '#') != NULL) {
531 FREE(temp);
532 temp = HTEscape(temp2, URL_PATH);
533 StrAllocCat(*href, temp);
534 } else {
535 StrAllocCat(*href, temp2);
536 }
537 }
538 #ifdef VMS
539 /*
540 * On VMS, a file://localhost/ URL means
541 * a listing for the login directory. - FM
542 */
543 if (!strcmp(*href, "file://localhost/"))
544 StrAllocCat(*href, (HTVMS_wwwName(Home_Dir()) + 1));
545 #endif /* VMS */
546
547 FREE(temp);
548 return;
549 }
550
/*
 * Write a META line that names a charset into an HTStream.
 *
 * target      stream to write to, through its put_string method.
 * disp_chndl  index into LYCharSet_UC[]; -1 means use current_char_set.
 *
 * Nothing is written when target is null or the resulting handle is
 * negative.
 */
void LYAddMETAcharsetToStream(HTStream *target, int disp_chndl)
{
    char *line = 0;
    const int chndl = (disp_chndl == -1) ? current_char_set : disp_chndl;

    if (target == 0 || chndl < 0)
        return;

    HTSprintf0(&line, "<META %s content=\"text/html;charset=%s\">\n",
               "http-equiv=\"content-type\"",
               LYCharSet_UC[chndl].MIMEname);
    (*target->isa->put_string) (target, line);
    FREE(line);
}
569
570 /*
571 * This function writes a line with a META tag to an open file,
572 * which will specify a charset parameter to use when the file is
573 * read back in. It is meant for temporary HTML files used by the
574 * various special pages which may show titles of documents. When those
575 * files are created, the title strings normally have been translated and
576 * expanded to the display character set, so we have to make sure they
577 * don't get translated again.
578 * If the user has changed the display character set during the lifetime
579 * of the Lynx session (or, more exactly, during the time the title
580 * strings to be written were generated), they may now have different
581 * character encodings and there is currently no way to get it all right.
582 * To change this, we would have to add a variable for each string which
583 * keeps track of its character encoding.
584 * But at least we can try to ensure that reading the file after future
585 * display character set changes will give reasonable output.
586 *
587 * The META tag is not written if the display character set (passed as
588 * disp_chndl) already corresponds to the charset assumption that
589 * would be made when the file is read. - KW
590 *
591 * Currently this function is used for temporary files like "Lynx Info Page"
592 * and for one permanent - bookmarks (so it may be a problem if you change
593 * the display charset later: new bookmark entries may be mistranslated).
594 * - LP
595 */
LYAddMETAcharsetToFD(FILE * fd,int disp_chndl)596 void LYAddMETAcharsetToFD(FILE *fd, int disp_chndl)
597 {
598 if (disp_chndl == -1)
599 /*
600 * -1 means use current_char_set.
601 */
602 disp_chndl = current_char_set;
603
604 if (fd == NULL || disp_chndl < 0)
605 /*
606 * Should not happen.
607 */
608 return;
609
610 if (UCLYhndl_HTFile_for_unspec == disp_chndl)
611 /*
612 * Not need to do, so we don't.
613 */
614 return;
615
616 if (LYCharSet_UC[disp_chndl].enc == UCT_ENC_7BIT)
617 /*
618 * There shouldn't be any 8-bit characters in this case.
619 */
620 return;
621
622 /*
623 * In other cases we don't know because UCLYhndl_for_unspec may change
624 * during the lifetime of the file (by toggling raw mode or changing the
625 * display character set), so proceed.
626 */
627 fprintf(fd, "<META %s content=\"text/html;charset=%s\">\n",
628 "http-equiv=\"content-type\"",
629 LYCharSet_UC[disp_chndl].MIMEname);
630 }
631
/*
 * This function returns OL TYPE="A" strings in
 * the range of " A." (1) to "ZZZ." (18278). - FM
 *
 * seqnum  list item number.  Values below 1 are treated as 1 and values
 *         above 18278 as 18278.
 *
 * Returns a pointer to a static buffer that the next call overwrites.  The
 * label is the number written in letters (A..Z, AA..ZZ, AAA..ZZZ) followed
 * by a period; a one-letter label is preceded by a blank.
 *
 * The letters are taken from a literal alphabet instead of being computed
 * as 64 + n, so the result does not depend on the execution character set
 * being ASCII.
 */
char *LYUppercaseA_OL_String(int seqnum)
{
    static const char alphabet[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
    static char OLstring[8];
    char digits[3];             /* letters, least significant first */
    int count = 0;
    int rest = seqnum;
    char *out = OLstring;

    if (rest < 1) {
        rest = 1;
    } else if (rest > 18278) {
        rest = 18278;           /* "ZZZ", the largest three-letter label */
    }

    /*
     * The numbering has no zero digit (1 = A ... 26 = Z, 27 = AA), hence
     * the (rest - 1) before each division.
     */
    while (rest > 0) {
        digits[count++] = alphabet[(rest - 1) % 26];
        rest = (rest - 1) / 26;
    }

    if (count == 1)
        *out++ = ' ';
    while (count > 0)
        *out++ = digits[--count];
    *out++ = '.';
    *out = '\0';

    return OLstring;
}
662
/*
 * This function returns OL TYPE="a" strings in
 * the range of " a." (1) to "zzz." (18278). - FM
 *
 * seqnum  list item number.  Values below 1 are treated as 1 and values
 *         above 18278 as 18278.
 *
 * Returns a pointer to a static buffer that the next call overwrites.  The
 * label is the number written in letters (a..z, aa..zz, aaa..zzz) followed
 * by a period; a one-letter label is preceded by a blank.
 *
 * The letters are taken from a literal alphabet instead of being computed
 * as 96 + n, so the result does not depend on the execution character set
 * being ASCII.
 */
char *LYLowercaseA_OL_String(int seqnum)
{
    static const char alphabet[] = "abcdefghijklmnopqrstuvwxyz";
    static char OLstring[8];
    char digits[3];             /* letters, least significant first */
    int count = 0;
    int rest = seqnum;
    char *out = OLstring;

    if (rest < 1) {
        rest = 1;
    } else if (rest > 18278) {
        rest = 18278;           /* "zzz", the largest three-letter label */
    }

    /*
     * The numbering has no zero digit (1 = a ... 26 = z, 27 = aa), hence
     * the (rest - 1) before each division.
     */
    while (rest > 0) {
        digits[count++] = alphabet[(rest - 1) % 26];
        rest = (rest - 1) / 26;
    }

    if (count == 1)
        *out++ = ' ';
    while (count > 0)
        *out++ = digits[--count];
    *out++ = '.';
    *out = '\0';

    return OLstring;
}
693
/*
 * This function returns OL TYPE="I" strings in the
 * range of " I." (1) to "MMM." (3000). - FM
 * Maximum length: 16 -TD
 *
 * seqnum  list item number.
 *
 * Returns a pointer to a static buffer that the next call overwrites.  It
 * holds the uppercase Roman numeral for seqnum followed by a period:
 *   - 3000 and above all give "MMM.";
 *   - a numeral of a single letter (1, 5, 10, 50, 100, 500, 1000) is
 *     preceded by a blank;
 *   - zero and negative numbers give just ".".
 */
char *LYUppercaseI_OL_String(int seqnum)
{
    /* Values in descending order, each with the letters that spell it. */
    static const struct {
        int value;
        const char *glyphs;
    } steps[] = {
        { 1000, "M" }, { 900, "CM" }, { 500, "D" }, { 400, "CD" },
        { 100, "C" }, { 90, "XC" }, { 50, "L" }, { 40, "XL" },
        { 10, "X" }, { 9, "IX" }, { 5, "V" }, { 4, "IV" }, { 1, "I" }
    };
    static char OLstring[20];
    char numeral[20];           /* longest below 3000 is 14 letters */
    int remaining = seqnum;
    size_t k;

    if (remaining >= 3000) {
        strcpy(OLstring, "MMM.");
        return OLstring;
    }

    /* Take the largest value that still fits, as often as it fits. */
    numeral[0] = '\0';
    for (k = 0; k < sizeof(steps) / sizeof(steps[0]); k++) {
        while (remaining >= steps[k].value) {
            strcat(numeral, steps[k].glyphs);
            remaining -= steps[k].value;
        }
    }

    /* Only a one-letter numeral is padded with a leading blank. */
    if (numeral[0] != '\0' && numeral[1] == '\0') {
        strcpy(OLstring, " ");
    } else {
        OLstring[0] = '\0';
    }
    strcat(OLstring, numeral);
    strcat(OLstring, ".");

    return OLstring;
}
819
/*
 * This function returns OL TYPE="i" strings in the
 * range of " i." (1) to "mmm." (3000). - FM
 * Maximum length: 16 -TD
 *
 * seqnum  list item number.
 *
 * Returns a pointer to a static buffer that the next call overwrites.  It
 * holds the lowercase Roman numeral for seqnum followed by a period:
 *   - 3000 and above all give "mmm.";
 *   - a numeral of a single letter (1, 5, 10, 50, 100, 500, 1000) is
 *     preceded by a blank;
 *   - zero and negative numbers give just ".".
 */
char *LYLowercaseI_OL_String(int seqnum)
{
    /* Values in descending order, each with the letters that spell it. */
    static const struct {
        int value;
        const char *glyphs;
    } steps[] = {
        { 1000, "m" }, { 900, "cm" }, { 500, "d" }, { 400, "cd" },
        { 100, "c" }, { 90, "xc" }, { 50, "l" }, { 40, "xl" },
        { 10, "x" }, { 9, "ix" }, { 5, "v" }, { 4, "iv" }, { 1, "i" }
    };
    static char OLstring[20];
    char numeral[20];           /* longest below 3000 is 14 letters */
    int remaining = seqnum;
    size_t k;

    if (remaining >= 3000) {
        strcpy(OLstring, "mmm.");
        return OLstring;
    }

    /* Take the largest value that still fits, as often as it fits. */
    numeral[0] = '\0';
    for (k = 0; k < sizeof(steps) / sizeof(steps[0]); k++) {
        while (remaining >= steps[k].value) {
            strcat(numeral, steps[k].glyphs);
            remaining -= steps[k].value;
        }
    }

    /* Only a one-letter numeral is padded with a leading blank. */
    if (numeral[0] != '\0' && numeral[1] == '\0') {
        strcpy(OLstring, " ");
    } else {
        OLstring[0] = '\0';
    }
    strcat(OLstring, numeral);
    strcat(OLstring, ".");

    return OLstring;
}
945
946 /*
947 * This function initializes the Ordered List counter. - FM
948 */
LYZero_OL_Counter(HTStructured * me)949 void LYZero_OL_Counter(HTStructured * me)
950 {
951 int i;
952
953 if (!me)
954 return;
955
956 for (i = 0; i < 12; i++) {
957 me->OL_Counter[i] = OL_VOID;
958 me->OL_Type[i] = '1';
959 }
960
961 me->Last_OL_Count = 0;
962 me->Last_OL_Type = '1';
963
964 return;
965 }
966
967 /*
968 * This function is used by the HTML Structured object. - KW
969 */
LYGetChartransInfo(HTStructured * me)970 void LYGetChartransInfo(HTStructured * me)
971 {
972 me->UCLYhndl = HTAnchor_getUCLYhndl(me->node_anchor,
973 UCT_STAGE_STRUCTURED);
974 if (me->UCLYhndl < 0) {
975 int chndl = HTAnchor_getUCLYhndl(me->node_anchor, UCT_STAGE_HTEXT);
976
977 if (chndl < 0) {
978 chndl = current_char_set;
979 HTAnchor_setUCInfoStage(me->node_anchor, chndl,
980 UCT_STAGE_HTEXT,
981 UCT_SETBY_STRUCTURED);
982 }
983 HTAnchor_setUCInfoStage(me->node_anchor, chndl,
984 UCT_STAGE_STRUCTURED,
985 UCT_SETBY_STRUCTURED);
986 me->UCLYhndl = HTAnchor_getUCLYhndl(me->node_anchor,
987 UCT_STAGE_STRUCTURED);
988 }
989 me->UCI = HTAnchor_getUCInfoStage(me->node_anchor,
990 UCT_STAGE_STRUCTURED);
991 }
992
993 /* The sixteen uppercase hexadecimal digits in value order, '0' through 'F'; as in HTParse.c, saves some calls - kw */
994 static const char *hex = "0123456789ABCDEF";
995
996 /*
997 * Any raw 8-bit or multibyte characters already have been
998 * handled in relation to the display character set
999 * in SGML_character(), including named and numeric entities.
1000 *
 * This function is used to translate HTML special fields inside tags
 * (ALT=, VALUE=, etc.) from charset `cs_from' to charset `cs_to'.
 * It also unescapes non-ASCII characters from URLs (#fragments!)
 * if st_URL is active.
1005 *
1006 * If `do_ent' is YES, it converts named entities
1007 * and numeric character references (NCRs) to their `cs_to' replacements.
1008 *
 * Named entities are converted to Unicode values.  NCRs (Unicode values)
 * are converted by the UCdomap.c chartrans functions.
1011 * ???NCRs with values in the ISO-8859-1 range 160-255 may be converted
1012 * to their HTML entity names (via old-style entities) and then translated
1013 * according to the LYCharSets.c array for `cs_out'???.
1014 *
1015 * Some characters (see descriptions in `put_special_unicodes' from SGML.c)
1016 * translated in relation with the state of boolean variables
1017 * `use_lynx_specials', `plain_space' and `hidden'. It is not clear yet:
1018 *
1019 * If plain_space is TRUE, nbsp (160) will be treated as an ASCII
1020 * space (32). If hidden is TRUE, entities will be translated
1021 * (if `do_ent' is YES) but escape sequences will be passed unaltered.
1022 * If `hidden' is FALSE, some characters are converted to Lynx special
1023 * codes (see `put_special_unicodes') or ASCII space if `plain_space'
1024 * applies). @@ is `use_lynx_specials' needed, does it have any effect? @@
1025 * If `use_lynx_specials' is YES, translate byte values 160 and 173
1026 * meaning U+00A0 and U+00AD given as or converted from raw char input
1027 * are converted to HT_NON_BREAK_SPACE and LY_SOFT_HYPHEN, respectively
1028 * (unless input and output charset are both iso-8859-1, for compatibility
1029 * with previous usage in HTML.c) even if `hidden' or `plain_space' is set.
1030 *
1031 * If `Back' is YES, the reverse is done instead i.e., Lynx special codes
1032 * in the input are translated back to character values.
1033 *
1034 * If `Back' is YES, an attempt is made to use UCReverseTransChar() for
1035 * back translation which may be more efficient. (?)
1036 *
1037 * If `stype' is st_URL, non-ASCII characters are URL-encoded instead.
1038 * The sequence of bytes being URL-encoded is the raw input character if
1039 * we couldn't translate it from `cs_in' (CJK etc.); otherwise it is the
1040 * UTF-8 representation if either `cs_to' requires this or if the
1041 * character's Unicode value is > 255, otherwise it should be the iso-8859-1
1042 * representation.
1043 * No general URL-encoding occurs for displayable ASCII characters and
1044 * spaces and some C0 controls valid in HTML (LF, TAB), it is expected
1045 * that other functions will take care of that as appropriate.
1046 *
1047 * Escape characters (0x1B, '\033') are
1048 * - URL-encoded if `stype' is st_URL, otherwise
1049 * - dropped if `stype' is st_other, otherwise (i.e., st_HTML)
1050 * - passed if `hidden' is TRUE or HTCJK is set, otherwise
1051 * - dropped.
1052 *
 * (If `stype' is st_URL or st_other, most of the parameters are effectively
 * predefined: cs_from=cs_to, use_lynx_specials=plain_space=NO, and hidden=YES)
1055 *
1056 *
1057 * Returns pointer to the char** passed in
1058 * if string translated or translation unnecessary,
1059 * NULL otherwise
1060 * (in which case something probably went wrong.)
1061 *
1062 *
 * In general, this somewhat ugly function (KW)
 * covers three functions from v.2.7.2 (FM):
1065 * extern void LYExpandString (
1066 * HTStructured * me,
1067 * char ** str);
1068 * extern void LYUnEscapeEntities (
1069 * HTStructured * me,
1070 * char ** str);
1071 * extern void LYUnEscapeToLatinOne (
1072 * HTStructured * me,
1073 * char ** str,
1074 * BOOLEAN isURL);
1075 */
1076
LYUCFullyTranslateString(char ** str,int cs_from,int cs_to,int do_ent,int use_lynx_specials,int plain_space,int hidden,int Back,CharUtil_st stype)1077 char **LYUCFullyTranslateString(char **str,
1078 int cs_from,
1079 int cs_to,
1080 int do_ent,
1081 int use_lynx_specials,
1082 int plain_space,
1083 int hidden,
1084 int Back,
1085 CharUtil_st stype)
1086 {
1087 char *p;
1088 char *q, *qs;
1089 HTChunk *chunk = NULL;
1090 char *cp = 0;
1091 char cpe = 0;
1092 char *esc = NULL;
1093 char replace_buf[64];
1094 int uck;
1095 int lowest_8;
1096 UCode_t code = 0;
1097 BOOL output_utf8 = 0, repl_translated_C0 = 0;
1098 size_t len;
1099 const char *name = NULL;
1100 BOOLEAN no_bytetrans;
1101 UCTransParams T;
1102 BOOL from_is_utf8 = FALSE;
1103 char *puni;
1104 enum _state {
1105 S_text,
1106 S_esc,
1107 S_dollar,
1108 S_paren,
1109 S_nonascii_text,
1110 S_dollar_paren,
1111 S_trans_byte,
1112 S_check_ent,
1113 S_ncr,
1114 S_check_uni,
1115 S_named,
1116 S_check_name,
1117 S_recover,
1118 S_got_oututf8,
1119 S_got_outstring,
1120 S_put_urlstring,
1121 S_got_outchar,
1122 S_put_urlchar,
1123 S_next_char,
1124 S_done
1125 } state = S_text;
1126 enum _parsing_what {
1127 P_text,
1128 P_utf8,
1129 P_hex,
1130 P_decimal,
1131 P_named
1132 } what = P_text;
1133
1134 #ifdef KANJI_CODE_OVERRIDE
1135 static unsigned char sjis_1st = '\0';
1136
1137 unsigned char sjis_str[3];
1138 #endif
1139
1140 /*
1141 * Make sure we have a non-empty string. - FM
1142 */
1143 if (isEmpty(*str))
1144 return str;
1145
1146 /*
1147 * FIXME: something's wrong with the limit checks here (clearing the
1148 * buffer helps).
1149 */
1150 memset(replace_buf, 0, sizeof(replace_buf));
1151
1152 /*
1153 * Don't do byte translation if original AND target character sets are both
1154 * iso-8859-1 (and we are not called to back-translate), or if we are in
1155 * CJK mode.
1156 */
1157 if (IS_CJK_TTY
1158 #ifdef EXP_JAPANESEUTF8_SUPPORT
1159 && (strcmp(LYCharSet_UC[cs_from].MIMEname, "utf-8") != 0)
1160 && (strcmp(LYCharSet_UC[cs_to].MIMEname, "utf-8") != 0)
1161 #endif
1162 ) {
1163 no_bytetrans = TRUE;
1164 } else if (cs_to <= 0 && cs_from == cs_to && (!Back || cs_to < 0)) {
1165 no_bytetrans = TRUE;
1166 } else {
1167 /* No need to translate or examine the string any further */
1168 no_bytetrans = (BOOL) (!use_lynx_specials && !Back &&
1169 UCNeedNotTranslate(cs_from, cs_to));
1170 }
1171 /*
1172 * Save malloc/calloc overhead in simple case - kw
1173 */
1174 if (do_ent && hidden && (stype != st_URL) && (strchr(*str, '&') == NULL))
1175 do_ent = FALSE;
1176
1177 /* Can't do, caller should figure out what to do... */
1178 if (!UCCanTranslateFromTo(cs_from, cs_to)) {
1179 if (cs_to < 0)
1180 return NULL;
1181 if (!do_ent && no_bytetrans)
1182 return NULL;
1183 no_bytetrans = TRUE;
1184 } else if (cs_to < 0) {
1185 do_ent = FALSE;
1186 }
1187
1188 if (!do_ent && no_bytetrans)
1189 return str;
1190 p = *str;
1191
1192 if (!no_bytetrans) {
1193 UCTransParams_clear(&T);
1194 UCSetTransParams(&T, cs_from, &LYCharSet_UC[cs_from],
1195 cs_to, &LYCharSet_UC[cs_to]);
1196 from_is_utf8 = (BOOL) (LYCharSet_UC[cs_from].enc == UCT_ENC_UTF8);
1197 output_utf8 = T.output_utf8;
1198 repl_translated_C0 = T.repl_translated_C0;
1199 puni = p;
1200 } else if (do_ent) {
1201 output_utf8 = (BOOL) (LYCharSet_UC[cs_to].enc == UCT_ENC_UTF8 ||
1202 HText_hasUTF8OutputSet(HTMainText));
1203 repl_translated_C0 = (BOOL) (LYCharSet_UC[cs_to].enc == UCT_ENC_8BIT_C0);
1204 }
1205
1206 lowest_8 = LYlowest_eightbit[cs_to];
1207
1208 /*
1209 * Create a buffer string seven times the length of the original, so we
1210 * have plenty of room for expansions. - FM
1211 */
1212 len = strlen(p) + 16;
1213 q = p;
1214
1215 qs = q;
1216
1217 /* Create the HTChunk only if we need it */
1218 #define CHUNK (chunk ? chunk : (chunk = HTChunkCreate2(128, len+1)))
1219
1220 #define REPLACE_STRING(s) \
1221 if (q != qs) HTChunkPutb(CHUNK, qs, (int) (q - qs)); \
1222 HTChunkPuts(CHUNK, s); \
1223 qs = q = *str
1224
1225 #define REPLACE_CHAR(c) if (q > p) { \
1226 HTChunkPutb(CHUNK, qs, (int) (q - qs)); \
1227 qs = q = *str; \
1228 *q++ = c; \
1229 } else \
1230 *q++ = c
1231
1232 /*
1233 * Loop through string, making conversions as needed.
1234 *
1235 * The while() checks for a non-'\0' char only for the normal text states
1236 * since other states may temporarily modify p or *p (which should be
1237 * restored before S_done!) - kw
1238 */
1239 while (*p || (state != S_text && state != S_nonascii_text)) {
1240 switch (state) {
1241 case S_text:
1242 code = UCH(*p);
1243 #ifdef KANJI_CODE_OVERRIDE
1244 if (HTCJK == JAPANESE && last_kcode == SJIS) {
1245 if (sjis_1st == '\0' && (IS_SJIS_HI1(code) || IS_SJIS_HI2(code))) {
1246 sjis_1st = UCH(code);
1247 } else if (sjis_1st && IS_SJIS_LO(code)) {
1248 sjis_1st = '\0';
1249 } else {
1250 if (conv_jisx0201kana && 0xA1 <= code && code <= 0xDF) {
1251 sjis_str[2] = '\0';
1252 JISx0201TO0208_SJIS(UCH(code),
1253 sjis_str, sjis_str + 1);
1254 REPLACE_STRING(sjis_str);
1255 p++;
1256 continue;
1257 }
1258 }
1259 }
1260 #endif
1261 if (*p == '\033') {
1262 if ((IS_CJK_TTY && !hidden) || stype != st_HTML) {
1263 state = S_esc;
1264 if (stype == st_URL) {
1265 REPLACE_STRING("%1B");
1266 p++;
1267 continue;
1268 } else if (stype != st_HTML) {
1269 p++;
1270 continue;
1271 } else {
1272 *q++ = *p++;
1273 continue;
1274 }
1275 } else if (!hidden) {
1276 /*
1277 * CJK handling not on, and not a hidden INPUT, so block
1278 * escape. - FM
1279 */
1280 state = S_next_char;
1281 } else {
1282 state = S_trans_byte;
1283 }
1284 } else {
1285 state = (do_ent ? S_check_ent : S_trans_byte);
1286 }
1287 break;
1288
1289 case S_esc:
1290 if (*p == '$') {
1291 state = S_dollar;
1292 *q++ = *p++;
1293 continue;
1294 } else if (*p == '(') {
1295 state = S_paren;
1296 *q++ = *p++;
1297 continue;
1298 } else {
1299 state = S_text;
1300 }
1301 break;
1302
1303 case S_dollar:
1304 if (*p == '@' || *p == 'B' || *p == 'A') {
1305 state = S_nonascii_text;
1306 *q++ = *p++;
1307 continue;
1308 } else if (*p == '(') {
1309 state = S_dollar_paren;
1310 *q++ = *p++;
1311 continue;
1312 } else {
1313 state = S_text;
1314 }
1315 break;
1316
1317 case S_dollar_paren:
1318 if (*p == 'C') {
1319 state = S_nonascii_text;
1320 *q++ = *p++;
1321 continue;
1322 } else {
1323 state = S_text;
1324 }
1325 break;
1326
1327 case S_paren:
1328 if (*p == 'B' || *p == 'J' || *p == 'T') {
1329 state = S_text;
1330 *q++ = *p++;
1331 continue;
1332 } else if (*p == 'I') {
1333 state = S_nonascii_text;
1334 *q++ = *p++;
1335 continue;
1336 } else {
1337 state = S_text;
1338 }
1339 break;
1340
1341 case S_nonascii_text:
1342 if (*p == '\033') {
1343 if ((IS_CJK_TTY && !hidden) || stype != st_HTML) {
1344 state = S_esc;
1345 if (stype == st_URL) {
1346 REPLACE_STRING("%1B");
1347 p++;
1348 continue;
1349 } else if (stype != st_HTML) {
1350 p++;
1351 continue;
1352 }
1353 }
1354 }
1355 *q++ = *p++;
1356 continue;
1357
1358 case S_trans_byte:
1359 /* character translation goes here */
1360 /*
1361 * Don't do anything if we have no string, or if original AND
1362 * target character sets are both iso-8859-1, or if we are in CJK
1363 * mode.
1364 */
1365 if (*p == '\0' || no_bytetrans) {
1366 state = S_got_outchar;
1367 break;
1368 }
1369
1370 if (Back) {
1371 int rev_c;
1372
1373 if ((*p) == HT_NON_BREAK_SPACE ||
1374 (*p) == HT_EN_SPACE) {
1375 if (plain_space) {
1376 code = *p = ' ';
1377 state = S_got_outchar;
1378 break;
1379 } else {
1380 code = 160;
1381 if (LYCharSet_UC[cs_to].enc == UCT_ENC_8859 ||
1382 (LYCharSet_UC[cs_to].like8859 & UCT_R_8859SPECL)) {
1383 state = S_got_outchar;
1384 break;
1385 } else if (!(LYCharSet_UC[cs_from].enc == UCT_ENC_8859
1386 || (LYCharSet_UC[cs_from].like8859 & UCT_R_8859SPECL))) {
1387 state = S_check_uni;
1388 break;
1389 } else {
1390 *(unsigned char *) p = UCH(160);
1391 }
1392 }
1393 } else if ((*p) == LY_SOFT_HYPHEN) {
1394 code = 173;
1395 if (LYCharSet_UC[cs_to].enc == UCT_ENC_8859 ||
1396 (LYCharSet_UC[cs_to].like8859 & UCT_R_8859SPECL)) {
1397 state = S_got_outchar;
1398 break;
1399 } else if (!(LYCharSet_UC[cs_from].enc == UCT_ENC_8859
1400 || (LYCharSet_UC[cs_from].like8859 & UCT_R_8859SPECL))) {
1401 state = S_check_uni;
1402 break;
1403 } else {
1404 *(unsigned char *) p = UCH(173);
1405 }
1406 #ifdef EXP_JAPANESEUTF8_SUPPORT
1407 } else if (output_utf8) {
1408 if ((!strcmp(LYCharSet_UC[cs_from].MIMEname, "euc-jp") &&
1409 (IS_EUC((unsigned char) (*p),
1410 (unsigned char) (*(p + 1))))) ||
1411 (!strcmp(LYCharSet_UC[cs_from].MIMEname, "shift_jis") &&
1412 (IS_SJIS_2BYTE((unsigned char) (*p),
1413 (unsigned char) (*(p + 1)))))) {
1414 code = UCTransJPToUni(p, 2, cs_from);
1415 p++;
1416 state = S_check_uni;
1417 break;
1418 }
1419 #endif
1420 } else if (code < 127 || T.transp) {
1421 state = S_got_outchar;
1422 break;
1423 }
1424 rev_c = UCReverseTransChar(*p, cs_to, cs_from);
1425 if (rev_c > 127) {
1426 *p = (char) rev_c;
1427 code = rev_c;
1428 state = S_got_outchar;
1429 break;
1430 }
1431 } else if (code < 127) {
1432 state = S_got_outchar;
1433 break;
1434 }
1435
1436 if (from_is_utf8) {
1437 if (((*p) & 0xc0) == 0xc0) {
1438 puni = p;
1439 code = UCGetUniFromUtf8String(&puni);
1440 if (code <= 0) {
1441 code = UCH(*p);
1442 } else {
1443 what = P_utf8;
1444 }
1445 }
1446 } else if (use_lynx_specials && !Back &&
1447 (code == 160 || code == 173) &&
1448 (LYCharSet_UC[cs_from].enc == UCT_ENC_8859 ||
1449 (LYCharSet_UC[cs_from].like8859 & UCT_R_8859SPECL))) {
1450 if (code == 160)
1451 code = *p = HT_NON_BREAK_SPACE;
1452 else if (code == 173)
1453 code = *p = LY_SOFT_HYPHEN;
1454 state = S_got_outchar;
1455 break;
1456 } else if (T.trans_to_uni) {
1457 code = UCTransToUni(*p, cs_from);
1458 if (code <= 0) {
1459 /* What else can we do? */
1460 code = UCH(*p);
1461 }
1462 } else if (!T.trans_from_uni) {
1463 state = S_got_outchar;
1464 break;
1465 }
1466 /*
1467 * Substitute Lynx special character for 160 (nbsp) if
1468 * use_lynx_specials is set.
1469 */
1470 if (use_lynx_specials && !Back &&
1471 (code == 160 || code == 173)) {
1472 code = ((code == 160 ? HT_NON_BREAK_SPACE : LY_SOFT_HYPHEN));
1473 state = S_got_outchar;
1474 break;
1475 }
1476
1477 state = S_check_uni;
1478 break;
1479
1480 case S_check_ent:
1481 if (*p == '&') {
1482 char *pp = p + 1;
1483
1484 len = strlen(pp);
1485 /*
1486 * Check for a numeric entity. - FM
1487 */
1488 if (*pp == '#' && len > 2 &&
1489 (*(pp + 1) == 'x' || *(pp + 1) == 'X') &&
1490 UCH(*(pp + 2)) < 127 &&
1491 isxdigit(UCH(*(pp + 2)))) {
1492 what = P_hex;
1493 state = S_ncr;
1494 } else if (*pp == '#' && len > 2 &&
1495 UCH(*(pp + 1)) < 127 &&
1496 isdigit(UCH(*(pp + 1)))) {
1497 what = P_decimal;
1498 state = S_ncr;
1499 } else if (UCH(*pp) < 127 &&
1500 isalpha(UCH(*pp))) {
1501 what = P_named;
1502 state = S_named;
1503 } else {
1504 state = S_trans_byte;
1505 }
1506 } else {
1507 state = S_trans_byte;
1508 }
1509 break;
1510
1511 case S_ncr:
1512 if (what == P_hex) {
1513 p += 3;
1514 } else { /* P_decimal */
1515 p += 2;
1516 }
1517 cp = p;
1518 while (*p && UCH(*p) < 127 &&
1519 (what == P_hex ? isxdigit(UCH(*p)) :
1520 isdigit(UCH(*p)))) {
1521 p++;
1522 }
1523 /*
1524 * Save the terminator and isolate the digit(s). - FM
1525 */
1526 cpe = *p;
1527 if (*p)
1528 *p++ = '\0';
1529 /*
1530 * Show the numeric entity if the value:
1531 * (1) Is greater than 255 and unhandled Unicode.
1532 * (2) Is less than 32, and not valid and we don't have HTCJK set.
1533 * (3) Is 127 and we don't have HTPassHighCtrlRaw or HTCJK set.
1534 * (4) Is 128 - 159 and we don't have HTPassHighCtrlNum set.
1535 */
1536 if (UCScanCode(&code, cp, (BOOL) (what == P_hex))) {
1537 code = LYcp1252ToUnicode(code);
1538 state = S_check_uni;
1539 } else {
1540 state = S_recover;
1541 break;
1542 }
1543 break;
1544
1545 case S_check_uni:
1546 /*
1547 * Show the numeric entity if the value:
1548 * (2) Is less than 32, and not valid and we don't have HTCJK set.
1549 * (3) Is 127 and we don't have HTPassHighCtrlRaw or HTCJK set.
1550 * (4) Is 128 - 159 and we don't have HTPassHighCtrlNum set.
1551 */
1552 if ((code < 32 &&
1553 code != 9 && code != 10 && code != 13 &&
1554 !IS_CJK_TTY) ||
1555 (code == 127 &&
1556 !(HTPassHighCtrlRaw || IS_CJK_TTY)) ||
1557 (code > 127 && code < 160 &&
1558 !HTPassHighCtrlNum)) {
1559 state = S_recover;
1560 break;
1561 }
1562 /*
1563 * Convert the value as an unsigned char, hex escaped if isURL is
1564 * set and it's 8-bit, and then recycle the terminator if it is not
1565 * a semicolon. - FM
1566 */
1567 if (code > 159 && stype == st_URL) {
1568 state = S_got_oututf8;
1569 break;
1570 }
1571 /*
1572 * For 160 (nbsp), use that value if it's a hidden INPUT, otherwise
1573 * use an ASCII space (32) if plain_space is TRUE, otherwise use
1574 * the Lynx special character. - FM
1575 */
1576 if (code == 160) {
1577 if (plain_space) {
1578 code = ' ';
1579 state = S_got_outchar;
1580 break;
1581 } else if (use_lynx_specials) {
1582 code = HT_NON_BREAK_SPACE;
1583 state = S_got_outchar;
1584 break;
1585 } else if ((hidden && !Back)
1586 || (LYCharSet_UC[cs_to].codepoints & UCT_CP_SUPERSETOF_LAT1)
1587 || LYCharSet_UC[cs_to].enc == UCT_ENC_8859
1588 || (LYCharSet_UC[cs_to].like8859 &
1589 UCT_R_8859SPECL)) {
1590 state = S_got_outchar;
1591 break;
1592 } else if (
1593 (LYCharSet_UC[cs_to].repertoire & UCT_REP_SUPERSETOF_LAT1)) {
1594 ; /* nothing, may be translated later */
1595 } else {
1596 code = ' ';
1597 state = S_got_outchar;
1598 break;
1599 }
1600 }
1601 /*
1602 * For 173 (shy), use that value if it's a hidden INPUT, otherwise
1603 * ignore it if plain_space is TRUE, otherwise use the Lynx special
1604 * character. - FM
1605 */
1606 if (code == 173) {
1607 if (plain_space) {
1608 replace_buf[0] = '\0';
1609 state = S_got_outstring;
1610 break;
1611 } else if (Back &&
1612 !(LYCharSet_UC[cs_to].enc == UCT_ENC_8859 ||
1613 (LYCharSet_UC[cs_to].like8859 &
1614 UCT_R_8859SPECL))) {
1615 ; /* nothing, may be translated later */
1616 } else if (hidden || Back) {
1617 state = S_got_outchar;
1618 break;
1619 } else if (use_lynx_specials) {
1620 code = LY_SOFT_HYPHEN;
1621 state = S_got_outchar;
1622 break;
1623 }
1624 }
1625 /*
1626 * Seek a translation from the chartrans tables.
1627 */
1628 if ((uck = UCTransUniChar(code,
1629 cs_to)) >= 32 &&
1630 uck < 256 &&
1631 (uck < 127 || uck >= lowest_8)) {
1632 code = uck;
1633 state = S_got_outchar;
1634 break;
1635 } else if ((uck == -4 ||
1636 (repl_translated_C0 &&
1637 uck > 0 && uck < 32)) &&
1638 /*
1639 * Not found; look for replacement string.
1640 */
1641 UCTransUniCharStr(replace_buf,
1642 60, code,
1643 cs_to,
1644 0) >= 0) {
1645 state = S_got_outstring;
1646 break;
1647 }
1648 if (output_utf8 &&
1649 code > 127 && code < 0x7fffffffL) {
1650 state = S_got_oututf8;
1651 break;
1652 }
1653 /*
1654 * For 8194 (ensp), 8195 (emsp), or 8201 (thinsp), use the
1655 * character reference if it's a hidden INPUT, otherwise use an
1656 * ASCII space (32) if plain_space is TRUE, otherwise use the Lynx
1657 * special character. - FM
1658 */
1659 if (code == 8194 || code == 8195 || code == 8201) {
1660 if (hidden) {
1661 state = S_recover;
1662 } else if (plain_space) {
1663 code = ' ';
1664 state = S_got_outchar;
1665 } else {
1666 code = HT_EN_SPACE;
1667 state = S_got_outchar;
1668 }
1669 break;
1670 /*
1671 * Ignore 8204 (zwnj), 8205 (zwj) 8206 (lrm), and 8207 (rlm),
1672 * for now, if we got this far without finding a representation
1673 * for them.
1674 */
1675 } else if (code == 8204 || code == 8205 ||
1676 code == 8206 || code == 8207) {
1677 CTRACE((tfp, "LYUCFullyTranslateString: Ignoring '%"
1678 PRI_UCode_t "'.\n", code));
1679 replace_buf[0] = '\0';
1680 state = S_got_outstring;
1681 break;
1682 /*
1683 * Show the numeric entity if the value: (1) Is greater than
1684 * 255 and unhandled Unicode.
1685 */
1686 } else if (code > 255) {
1687 /*
1688 * Illegal or not yet handled value. Return "&#" verbatim and
1689 * continue from there. - FM
1690 */
1691 state = S_recover;
1692 break;
1693 /*
1694 * If it's ASCII, or is 8-bit but HTPassEightBitNum is set or
1695 * the character set is "ISO Latin 1", use its value. - FM
1696 */
1697 } else if (code < 161 ||
1698 (code < 256 &&
1699 (HTPassEightBitNum || cs_to == LATIN1))) {
1700 /*
1701 * No conversion needed.
1702 */
1703 state = S_got_outchar;
1704 break;
1705
1706 /* The following disabled section doesn't make sense any more.
1707 * It used to make sense in the past, when S_check_named would
1708 * look in "old style" tables in addition to what it does now.
1709 * Disabling of going to S_check_name here prevents endless
1710 * looping between S_check_uni and S_check_names states, which
1711 * could occur here for Latin 1 codes for some cs_to if they
1712 * had no translation in that cs_to. Normally all cs_to
1713 * *should* now have valid translations via UCTransUniChar or
1714 * UCTransUniCharStr for all Latin 1 codes, so that we would
1715 * not get here anyway, and no loop could occur. Still, if we
1716 * *do* get here, FALL THROUGH to case S_recover now. - kw
1717 */
1718 #if 0
1719 /*
1720 * If we get to here, convert and handle the character as a
1721 * named entity. - FM
1722 */
1723 } else {
1724 name = HTMLGetEntityName(code - 160);
1725 state = S_check_name;
1726 break;
1727 #endif
1728 }
1729
1730 case S_recover:
1731 if (what == P_decimal || what == P_hex) {
1732 /*
1733 * Illegal or not yet handled value. Return "&#" verbatim and
1734 * continue from there. - FM
1735 */
1736 *q++ = '&';
1737 *q++ = '#';
1738 if (what == P_hex)
1739 *q++ = 'x';
1740 if (cpe != '\0')
1741 *(p - 1) = cpe;
1742 p = cp;
1743 state = S_done;
1744 } else if (what == P_named) {
1745 *cp = cpe;
1746 *q++ = '&';
1747 state = S_done;
1748 } else if (!T.output_utf8 && stype == st_HTML && !hidden &&
1749 !(HTPassEightBitRaw &&
1750 UCH(*p) >= lowest_8)) {
1751 sprintf(replace_buf, "U%.2" PRI_UCode_t "", code);
1752
1753 state = S_got_outstring;
1754 } else {
1755 puni = p;
1756 code = UCH(*p);
1757 state = S_got_outchar;
1758 }
1759 break;
1760
1761 case S_named:
1762 cp = ++p;
1763 while (*cp && UCH(*cp) < 127 &&
1764 isalnum(UCH(*cp)))
1765 cp++;
1766 cpe = *cp;
1767 *cp = '\0';
1768 name = p;
1769 state = S_check_name;
1770 break;
1771
1772 case S_check_name:
1773 /*
1774 * Seek the Unicode value for the named entity.
1775 *
1776 * !!!! We manually recover the case of '=' terminator which is
1777 * commonly found on query to CGI-scripts enclosed as href= URLs
1778 * like "somepath/?x=1&yz=2" Without this dirty fix, submission of
1779 * such URLs was broken if &yz string happened to be a recognized
1780 * entity name. - LP
1781 */
1782 if (((code = HTMLGetEntityUCValue(name)) > 0) &&
1783 !((cpe == '=') && (stype == st_URL))) {
1784 state = S_check_uni;
1785 break;
1786 }
1787 /*
1788 * Didn't find the entity. Return verbatim.
1789 */
1790 state = S_recover;
1791 break;
1792
1793 /* * * O U T P U T S T A T E S * * */
1794
1795 case S_got_oututf8:
1796 if (code > 255 ||
1797 (code >= 128 && LYCharSet_UC[cs_to].enc == UCT_ENC_UTF8)) {
1798 UCConvertUniToUtf8(code, replace_buf);
1799 state = S_got_outstring;
1800 } else {
1801 state = S_got_outchar;
1802 }
1803 break;
1804 case S_got_outstring:
1805 if (what == P_decimal || what == P_hex) {
1806 if (cpe != ';' && cpe != '\0')
1807 *(--p) = cpe;
1808 p--;
1809 } else if (what == P_named) {
1810 *cp = cpe;
1811 p = (*cp != ';') ? (cp - 1) : cp;
1812 } else if (what == P_utf8) {
1813 p = puni;
1814 }
1815 if (replace_buf[0] == '\0') {
1816 state = S_next_char;
1817 break;
1818 }
1819 if (stype == st_URL) {
1820 code = replace_buf[0]; /* assume string OK if first char is */
1821 if (code >= 127 ||
1822 (code < 32 && (code != 9 && code != 10 && code != 0))) {
1823 state = S_put_urlstring;
1824 break;
1825 }
1826 }
1827 REPLACE_STRING(replace_buf);
1828 state = S_next_char;
1829 break;
1830 case S_put_urlstring:
1831 esc = HTEscape(replace_buf, URL_XALPHAS);
1832 REPLACE_STRING(esc);
1833 FREE(esc);
1834 state = S_next_char;
1835 break;
1836 case S_got_outchar:
1837 if (what == P_decimal || what == P_hex) {
1838 if (cpe != ';' && cpe != '\0')
1839 *(--p) = cpe;
1840 p--;
1841 } else if (what == P_named) {
1842 *cp = cpe;
1843 p = (*cp != ';') ? (cp - 1) : cp;
1844 } else if (what == P_utf8) {
1845 p = puni;
1846 }
1847 if (stype == st_URL &&
1848 /* Not a full HTEscape, only for 8bit and ctrl chars */
1849 (TOASCII(code) >= 127 || /* S/390 -- gil -- 1925 */
1850 (code < ' ' && (code != '\t' && code != '\n')))) {
1851 state = S_put_urlchar;
1852 break;
1853 } else if (!hidden && code == 10 && *p == 10
1854 && q != qs && *(q - 1) == 13) {
1855 /*
1856 * If this is not a hidden string, and the current char is the
1857 * LF ('\n') of a CRLF pair, drop the CR ('\r'). - KW
1858 */
1859 *(q - 1) = *p++;
1860 state = S_done;
1861 break;
1862 }
1863 *q++ = (char) code;
1864 state = S_next_char;
1865 break;
1866 case S_put_urlchar:
1867 *q++ = '%';
1868 REPLACE_CHAR(hex[(TOASCII(code) >> 4) & 15]); /* S/390 -- gil -- 1944 */
1869 REPLACE_CHAR(hex[(TOASCII(code) & 15)]);
1870 /* fall through */
1871 case S_next_char:
1872 p++; /* fall through */
1873 case S_done:
1874 state = S_text;
1875 what = P_text;
1876 /* for next round */
1877 }
1878 }
1879
1880 *q = '\0';
1881 if (chunk) {
1882 HTChunkPutb(CHUNK, qs, (int) (q - qs + 1)); /* also terminates */
1883 if (stype == st_URL || stype == st_other) {
1884 LYTrimHead(chunk->data);
1885 LYTrimTail(chunk->data);
1886 }
1887 StrAllocCopy(*str, chunk->data);
1888 HTChunkFree(chunk);
1889 } else {
1890 if (stype == st_URL || stype == st_other) {
1891 LYTrimHead(qs);
1892 LYTrimTail(qs);
1893 }
1894 }
1895 return str;
1896 }
1897
1898 #undef REPLACE_CHAR
1899 #undef REPLACE_STRING
1900
LYUCTranslateHTMLString(char ** str,int cs_from,int cs_to,int use_lynx_specials,int plain_space,int hidden,CharUtil_st stype)1901 BOOL LYUCTranslateHTMLString(char **str,
1902 int cs_from,
1903 int cs_to,
1904 int use_lynx_specials,
1905 int plain_space,
1906 int hidden,
1907 CharUtil_st stype)
1908 {
1909 BOOL ret = YES;
1910
1911 /* May reallocate *str even if cs_to == 0 */
1912 if (!LYUCFullyTranslateString(str, cs_from, cs_to, TRUE,
1913 use_lynx_specials, plain_space, hidden,
1914 NO, stype)) {
1915 ret = NO;
1916 }
1917 return ret;
1918 }
1919
LYUCTranslateBackFormData(char ** str,int cs_from,int cs_to,int plain_space)1920 BOOL LYUCTranslateBackFormData(char **str,
1921 int cs_from,
1922 int cs_to,
1923 int plain_space)
1924 {
1925 char **ret;
1926
1927 /* May reallocate *str */
1928 ret = (LYUCFullyTranslateString(str, cs_from, cs_to, FALSE,
1929 NO, plain_space, YES,
1930 YES, st_HTML));
1931 return (BOOL) (ret != NULL);
1932 }
1933
/*
 * Parse a parameter from an HTML META tag, i.e., the CONTENT.
 *
 * Scans "from" for a ';' that is followed (after any further ';' or blanks)
 * by text matching "name" according to strncasecomp().  The text after the
 * name, with any blanks and '=' skipped, is copied with StrAllocCopy() and
 * cut off at the first character that is not printable or is whitespace.
 * A value longer than two characters that is wrapped in single quotes has
 * those quotes removed.
 *
 * Returns the copied value, or NULL when no ';' introduces a matching name.
 */
char *LYParseTagParam(char *from,
                      const char *name)
{
    const size_t name_len = strlen(name);
    char *scan = from;
    char *value = NULL;
    size_t value_len = 0;

    /*
     * Hop from one ';' to the next until the text behind it starts with name.
     */
    for (;;) {
        scan = strchr(scan, ';');
        if (scan == NULL)
            return NULL;

        /* step over the separator(s) and any blanks that follow */
        while (*scan != '\0' && (*scan == ';' || isspace(UCH(*scan))))
            scan++;

        /* not enough text left to hold the name at all */
        if (strlen(scan) < name_len)
            return NULL;

        if (strncasecomp(scan, name, (int) name_len) == 0)
            break;
    }

    /*
     * Move past the name and whatever blanks or '=' separate it from the
     * value.
     */
    scan += name_len;
    while (*scan != '\0' && (*scan == '=' || isspace(UCH(*scan))))
        scan++;

    /*
     * Copy the remainder, then terminate the copy where the value ends.
     */
    StrAllocCopy(value, scan);
    while (isprint(UCH(scan[value_len])) && !isspace(UCH(scan[value_len])))
        value_len++;
    value[value_len] = '\0';

    /*
     * Strip single quotes, just in case.
     */
    if (value_len > 2 &&
        value[0] == '\'' &&
        value[value_len - 1] == '\'') {
        value[value_len - 1] = '\0';
        /* shift the body and its terminator down over the opening quote */
        memmove(value, value + 1, value_len - 1);
    }
    return value;
}
1974
1975 /*
1976 * Given a refresh-URL content string, parses the delay time and the URL
1977 * string. Ignore the remainder of the content.
1978 */
LYParseRefreshURL(char * content,char ** p_seconds,char ** p_address)1979 void LYParseRefreshURL(char *content,
1980 char **p_seconds,
1981 char **p_address)
1982 {
1983 char *cp;
1984 char *cp1 = NULL;
1985 char *Seconds = NULL;
1986
1987 /*
1988 * Look for the Seconds field. - FM
1989 */
1990 cp = LYSkipBlanks(content);
1991 if (*cp && isdigit(UCH(*cp))) {
1992 cp1 = cp;
1993 while (*cp1 && isdigit(UCH(*cp1)))
1994 cp1++;
1995 StrnAllocCopy(Seconds, cp, (int) (cp1 - cp));
1996 }
1997 *p_seconds = Seconds;
1998 *p_address = LYParseTagParam(content, "URL");
1999
2000 CTRACE((tfp,
2001 "LYParseRefreshURL\n\tcontent: %s\n\tseconds: %s\n\taddress: %s\n",
2002 content, NonNull(*p_seconds), NonNull(*p_address)));
2003 }
2004
2005 /*
2006 * This function processes META tags in HTML streams. - FM
2007 */
LYHandleMETA(HTStructured * me,const BOOL * present,STRING2PTR value,char ** include GCC_UNUSED)2008 void LYHandleMETA(HTStructured * me, const BOOL *present,
2009 STRING2PTR value,
2010 char **include GCC_UNUSED)
2011 {
2012 char *http_equiv = NULL, *name = NULL, *content = NULL, *charset = NULL;
2013 char *href = NULL, *id_string = NULL, *temp = NULL;
2014 char *cp, *cp0, *cp1 = NULL;
2015 int url_type = 0;
2016
2017 if (!me || !present)
2018 return;
2019
2020 /*
2021 * Load the attributes for possible use by Lynx. - FM
2022 */
2023 if (present[HTML_META_HTTP_EQUIV] &&
2024 non_empty(value[HTML_META_HTTP_EQUIV])) {
2025 StrAllocCopy(http_equiv, value[HTML_META_HTTP_EQUIV]);
2026 convert_to_spaces(http_equiv, TRUE);
2027 LYUCTranslateHTMLString(&http_equiv, me->tag_charset, me->tag_charset,
2028 NO, NO, YES, st_other);
2029 if (*http_equiv == '\0') {
2030 FREE(http_equiv);
2031 }
2032 }
2033 if (present[HTML_META_NAME] &&
2034 non_empty(value[HTML_META_NAME])) {
2035 StrAllocCopy(name, value[HTML_META_NAME]);
2036 convert_to_spaces(name, TRUE);
2037 LYUCTranslateHTMLString(&name, me->tag_charset, me->tag_charset,
2038 NO, NO, YES, st_other);
2039 if (*name == '\0') {
2040 FREE(name);
2041 }
2042 }
2043 if (present[HTML_META_CONTENT] &&
2044 non_empty(value[HTML_META_CONTENT])) {
2045 /*
2046 * Technically, we should be creating a comma-separated list, but META
2047 * tags come one at a time, and we'll handle (or ignore) them as each
2048 * is received. Also, at this point, we only trim leading and trailing
2049 * blanks from the CONTENT value, without translating any named
2050 * entities or numeric character references, because how we should do
2051 * that depends on what type of information it contains, and whether or
2052 * not any of it might be sent to the screen. - FM
2053 */
2054 StrAllocCopy(content, value[HTML_META_CONTENT]);
2055 convert_to_spaces(content, FALSE);
2056 LYTrimHead(content);
2057 LYTrimTail(content);
2058 if (*content == '\0') {
2059 FREE(content);
2060 }
2061 }
2062 if (present[HTML_META_CHARSET] &&
2063 non_empty(value[HTML_META_CHARSET])) {
2064 StrAllocCopy(charset, value[HTML_META_CHARSET]);
2065 convert_to_spaces(charset, TRUE);
2066 LYUCTranslateHTMLString(&charset, me->tag_charset, me->tag_charset,
2067 NO, NO, YES, st_other);
2068 if (*charset == '\0') {
2069 FREE(charset);
2070 }
2071 }
2072 CTRACE((tfp,
2073 "LYHandleMETA: HTTP-EQUIV=\"%s\" NAME=\"%s\" CONTENT=\"%s\" CHARSET=\"%s\"\n",
2074 NONNULL(http_equiv),
2075 NONNULL(name),
2076 NONNULL(content),
2077 NONNULL(charset)));
2078
2079 /*
2080 * Check for a text/html Content-Type with a charset directive, if we
2081 * didn't already set the charset via a server's header. - AAC & FM
2082 */
2083 if (isEmpty(me->node_anchor->charset) &&
2084 (charset ||
2085 (!strcasecomp(NonNull(http_equiv), "Content-Type") && content))) {
2086 LYUCcharset *p_in = NULL;
2087 LYUCcharset *p_out = NULL;
2088
2089 if (charset) {
2090 LYLowerCase(charset);
2091 } else {
2092 LYUCTranslateHTMLString(&content, me->tag_charset, me->tag_charset,
2093 NO, NO, YES, st_other);
2094 LYLowerCase(content);
2095 }
2096
2097 if ((cp1 = charset) != NULL ||
2098 (cp1 = strstr(content, "charset")) != NULL) {
2099 BOOL chartrans_ok = NO;
2100 char *cp3 = NULL, *cp4;
2101 int chndl;
2102
2103 if (!charset)
2104 cp1 += 7;
2105 while (*cp1 == ' ' || *cp1 == '=' || *cp1 == '"')
2106 cp1++;
2107
2108 StrAllocCopy(cp3, cp1); /* copy to mutilate more */
2109 for (cp4 = cp3; (*cp4 != '\0' && *cp4 != '"' &&
2110 *cp4 != ';' && *cp4 != ':' &&
2111 !WHITE(*cp4)); cp4++) {
2112 ; /* do nothing */
2113 }
2114 *cp4 = '\0';
2115 cp4 = cp3;
2116 chndl = UCGetLYhndl_byMIME(cp3);
2117
2118 #ifdef CAN_SWITCH_DISPLAY_CHARSET
2119 /* Allow a switch to a more suitable display charset */
2120 if (Switch_Display_Charset(chndl, SWITCH_DISPLAY_CHARSET_MAYBE)) {
2121 /* UCT_STAGE_STRUCTURED and UCT_STAGE_HTEXT
2122 should have the same setting for UCInfoStage. */
2123 HTAnchor_getUCInfoStage(me->node_anchor, UCT_STAGE_STRUCTURED);
2124
2125 me->outUCLYhndl = current_char_set;
2126 HTAnchor_setUCInfoStage(me->node_anchor,
2127 current_char_set,
2128 UCT_STAGE_HTEXT,
2129 UCT_SETBY_MIME); /* highest priorty! */
2130 HTAnchor_setUCInfoStage(me->node_anchor,
2131 current_char_set,
2132 UCT_STAGE_STRUCTURED,
2133 UCT_SETBY_MIME); /* highest priorty! */
2134 me->outUCI = HTAnchor_getUCInfoStage(me->node_anchor,
2135 UCT_STAGE_HTEXT);
2136 /* The SGML stage will be reset in change_chartrans_handling */
2137 }
2138 #endif
2139
2140 if (UCCanTranslateFromTo(chndl, current_char_set)) {
2141 chartrans_ok = YES;
2142 StrAllocCopy(me->node_anchor->charset, cp4);
2143 HTAnchor_setUCInfoStage(me->node_anchor, chndl,
2144 UCT_STAGE_PARSER,
2145 UCT_SETBY_STRUCTURED);
2146 } else if (chndl < 0) {
2147 /*
2148 * Got something but we don't recognize it.
2149 */
2150 chndl = UCLYhndl_for_unrec;
2151 if (chndl < 0) /* UCLYhndl_for_unrec not defined :-( */
2152 chndl = UCLYhndl_for_unspec; /* always >= 0 */
2153 if (UCCanTranslateFromTo(chndl, current_char_set)) {
2154 chartrans_ok = YES;
2155 HTAnchor_setUCInfoStage(me->node_anchor, chndl,
2156 UCT_STAGE_PARSER,
2157 UCT_SETBY_STRUCTURED);
2158 }
2159 }
2160 if (chartrans_ok) {
2161 p_in = HTAnchor_getUCInfoStage(me->node_anchor,
2162 UCT_STAGE_PARSER);
2163 p_out = HTAnchor_setUCInfoStage(me->node_anchor,
2164 current_char_set,
2165 UCT_STAGE_HTEXT,
2166 UCT_SETBY_DEFAULT);
2167 if (!p_out) {
2168 /*
2169 * Try again.
2170 */
2171 p_out = HTAnchor_getUCInfoStage(me->node_anchor,
2172 UCT_STAGE_HTEXT);
2173 }
2174 if (!strcmp(p_in->MIMEname, "x-transparent")) {
2175 HTPassEightBitRaw = TRUE;
2176 HTAnchor_setUCInfoStage(me->node_anchor,
2177 HTAnchor_getUCLYhndl(me->node_anchor,
2178 UCT_STAGE_HTEXT),
2179 UCT_STAGE_PARSER,
2180 UCT_SETBY_DEFAULT);
2181 }
2182 if (!strcmp(p_out->MIMEname, "x-transparent")) {
2183 HTPassEightBitRaw = TRUE;
2184 HTAnchor_setUCInfoStage(me->node_anchor,
2185 HTAnchor_getUCLYhndl(me->node_anchor,
2186 UCT_STAGE_PARSER),
2187 UCT_STAGE_HTEXT,
2188 UCT_SETBY_DEFAULT);
2189 }
2190 if ((p_in->enc != UCT_ENC_CJK)
2191 #ifdef EXP_JAPANESEUTF8_SUPPORT
2192 && (p_in->enc != UCT_ENC_UTF8)
2193 #endif
2194 ) {
2195 HTCJK = NOCJK;
2196 if (!(p_in->codepoints &
2197 UCT_CP_SUBSETOF_LAT1) &&
2198 chndl == current_char_set) {
2199 HTPassEightBitRaw = TRUE;
2200 }
2201 } else if (p_out->enc == UCT_ENC_CJK) {
2202 Set_HTCJK(p_in->MIMEname, p_out->MIMEname);
2203 }
2204 LYGetChartransInfo(me);
2205 /*
2206 * Update the chartrans info homologously to a Content-Type
2207 * MIME header with a charset parameter. - FM
2208 */
2209 if (me->UCLYhndl != chndl) {
2210 HTAnchor_setUCInfoStage(me->node_anchor, chndl,
2211 UCT_STAGE_MIME,
2212 UCT_SETBY_STRUCTURED);
2213 HTAnchor_setUCInfoStage(me->node_anchor, chndl,
2214 UCT_STAGE_PARSER,
2215 UCT_SETBY_STRUCTURED);
2216 me->inUCLYhndl = HTAnchor_getUCLYhndl(me->node_anchor,
2217 UCT_STAGE_PARSER);
2218 me->inUCI = HTAnchor_getUCInfoStage(me->node_anchor,
2219 UCT_STAGE_PARSER);
2220 }
2221 UCSetTransParams(&me->T,
2222 me->inUCLYhndl, me->inUCI,
2223 me->outUCLYhndl, me->outUCI);
2224 } else {
2225 /*
2226 * Cannot translate. If according to some heuristic the given
2227 * charset and the current display character both are likely to
2228 * be like ISO-8859 in structure, pretend we have some kind of
2229 * match.
2230 */
2231 BOOL given_is_8859 = (BOOL) (!StrNCmp(cp4, "iso-8859-", 9) &&
2232 isdigit(UCH(cp4[9])));
2233 BOOL given_is_8859like = (BOOL) (given_is_8859
2234 || !StrNCmp(cp4, "windows-", 8)
2235 || !StrNCmp(cp4, "cp12", 4)
2236 || !StrNCmp(cp4, "cp-12", 5));
2237 BOOL given_and_display_8859like = (BOOL) (given_is_8859like &&
2238 (strstr(LYchar_set_names[current_char_set],
2239 "ISO-8859") ||
2240 strstr(LYchar_set_names[current_char_set],
2241 "windows-")));
2242
2243 if (given_is_8859) {
2244 cp1 = &cp4[10];
2245 while (*cp1 &&
2246 isdigit(UCH((*cp1))))
2247 cp1++;
2248 *cp1 = '\0';
2249 }
2250 if (given_and_display_8859like) {
2251 StrAllocCopy(me->node_anchor->charset, cp4);
2252 HTPassEightBitRaw = TRUE;
2253 }
2254 HTAlert(*cp4 ? cp4 : me->node_anchor->charset);
2255
2256 }
2257 FREE(cp3);
2258
2259 if (me->node_anchor->charset) {
2260 CTRACE((tfp,
2261 "LYHandleMETA: New charset: %s\n",
2262 me->node_anchor->charset));
2263 }
2264 }
2265 /*
2266 * Set the kcode element based on the charset. - FM
2267 */
2268 HText_setKcode(me->text, me->node_anchor->charset, p_in);
2269 }
2270
2271 /*
2272 * Make sure we have META name/value pairs to handle. - FM
2273 */
2274 if (!(http_equiv || name) || !content)
2275 goto free_META_copies;
2276
2277 /*
2278 * Check for a no-cache Pragma
2279 * or Cache-Control directive. - FM
2280 */
2281 if (!strcasecomp(NonNull(http_equiv), "Pragma") ||
2282 !strcasecomp(NonNull(http_equiv), "Cache-Control")) {
2283 LYUCTranslateHTMLString(&content, me->tag_charset, me->tag_charset,
2284 NO, NO, YES, st_other);
2285 if (!strcasecomp(content, "no-cache")) {
2286 me->node_anchor->no_cache = TRUE;
2287 HText_setNoCache(me->text);
2288 }
2289
2290 /*
2291 * If we didn't get a Cache-Control MIME header, and the META has one,
2292 * convert to lowercase, store it in the anchor element, and if we
2293 * haven't yet set no_cache, check whether we should. - FM
2294 */
2295 if ((!me->node_anchor->cache_control) &&
2296 !strcasecomp(NonNull(http_equiv), "Cache-Control")) {
2297 LYLowerCase(content);
2298 StrAllocCopy(me->node_anchor->cache_control, content);
2299 if (me->node_anchor->no_cache == FALSE) {
2300 cp0 = content;
2301 while ((cp = strstr(cp0, "no-cache")) != NULL) {
2302 cp += 8;
2303 while (*cp != '\0' && WHITE(*cp))
2304 cp++;
2305 if (*cp == '\0' || *cp == ';') {
2306 me->node_anchor->no_cache = TRUE;
2307 HText_setNoCache(me->text);
2308 break;
2309 }
2310 cp0 = cp;
2311 }
2312 if (me->node_anchor->no_cache == TRUE)
2313 goto free_META_copies;
2314 cp0 = content;
2315 while ((cp = strstr(cp0, "max-age")) != NULL) {
2316 cp += 7;
2317 while (*cp != '\0' && WHITE(*cp))
2318 cp++;
2319 if (*cp == '=') {
2320 cp++;
2321 while (*cp != '\0' && WHITE(*cp))
2322 cp++;
2323 if (isdigit(UCH(*cp))) {
2324 cp0 = cp;
2325 while (isdigit(UCH(*cp)))
2326 cp++;
2327 if (*cp0 == '0' && cp == (cp0 + 1)) {
2328 me->node_anchor->no_cache = TRUE;
2329 HText_setNoCache(me->text);
2330 break;
2331 }
2332 }
2333 }
2334 cp0 = cp;
2335 }
2336 }
2337 }
2338
2339 /*
2340 * Check for an Expires directive. - FM
2341 */
2342 } else if (!strcasecomp(NonNull(http_equiv), "Expires")) {
2343 /*
2344 * If we didn't get an Expires MIME header, store it in the anchor
2345 * element, and if we haven't yet set no_cache, check whether we
2346 * should. Note that we don't accept a Date header via META tags,
2347 * because it's likely to be untrustworthy, but do check for a Date
2348 * header from a server when making the comparison. - FM
2349 */
2350 LYUCTranslateHTMLString(&content, me->tag_charset, me->tag_charset,
2351 NO, NO, YES, st_other);
2352 StrAllocCopy(me->node_anchor->expires, content);
2353 if (me->node_anchor->no_cache == FALSE) {
2354 if (!strcmp(content, "0")) {
2355 /*
2356 * The value is zero, which we treat as an absolute no-cache
2357 * directive. - FM
2358 */
2359 me->node_anchor->no_cache = TRUE;
2360 HText_setNoCache(me->text);
2361 } else if (me->node_anchor->date != NULL) {
2362 /*
2363 * We have a Date header, so check if the value is less than or
2364 * equal to that. - FM
2365 */
2366 if (LYmktime(content, TRUE) <=
2367 LYmktime(me->node_anchor->date, TRUE)) {
2368 me->node_anchor->no_cache = TRUE;
2369 HText_setNoCache(me->text);
2370 }
2371 } else if (LYmktime(content, FALSE) == 0) {
2372 /*
2373 * We don't have a Date header, and the value is in past for
2374 * us. - FM
2375 */
2376 me->node_anchor->no_cache = TRUE;
2377 HText_setNoCache(me->text);
2378 }
2379 }
2380
2381 /*
2382 * Check for a Refresh directive. - FM
2383 */
2384 } else if (!strcasecomp(NonNull(http_equiv), "Refresh")) {
2385 char *Seconds = NULL;
2386
2387 LYUCTranslateHTMLString(&content, me->tag_charset, me->tag_charset,
2388 NO, NO, YES, st_other);
2389 LYParseRefreshURL(content, &Seconds, &href);
2390
2391 if (Seconds) {
2392 if (href) {
2393 /*
2394 * We found a URL field, so check it out. - FM
2395 */
2396 if (!LYLegitimizeHREF(me, &href, TRUE, FALSE)) {
2397 /*
2398 * The specs require a complete URL, but this is a
2399 * Netscapism, so don't expect the author to know that. -
2400 * FM
2401 */
2402 HTUserMsg(REFRESH_URL_NOT_ABSOLUTE);
2403 /*
2404 * Use the document's address as the base. - FM
2405 */
2406 if (*href != '\0') {
2407 temp = HTParse(href,
2408 me->node_anchor->address, PARSE_ALL);
2409 StrAllocCopy(href, temp);
2410 FREE(temp);
2411 } else {
2412 StrAllocCopy(href, me->node_anchor->address);
2413 HText_setNoCache(me->text);
2414 }
2415
2416 } else {
2417 /*
2418 * Check whether to fill in localhost. - FM
2419 */
2420 LYFillLocalFileURL(&href,
2421 (me->inBASE ?
2422 me->base_href : me->node_anchor->address));
2423 }
2424
2425 /*
2426 * Set the no_cache flag if the Refresh URL is the same as the
2427 * document's address. - FM
2428 */
2429 if (!strcmp(href, me->node_anchor->address)) {
2430 HText_setNoCache(me->text);
2431 }
2432 } else {
2433 /*
2434 * We didn't find a URL field, so use the document's own
2435 * address and set the no_cache flag. - FM
2436 */
2437 StrAllocCopy(href, me->node_anchor->address);
2438 HText_setNoCache(me->text);
2439 }
2440 /*
2441 * Check for an anchor in http or https URLs. - FM
2442 */
2443 cp = NULL;
2444 /* id_string seems to be used wrong below if given.
2445 not that it matters much. avoid setting it here. - kw */
2446 if (track_internal_links &&
2447 (StrNCmp(href, "http", 4) == 0) &&
2448 (cp = strchr(href, '#')) != NULL) {
2449 StrAllocCopy(id_string, cp);
2450 *cp = '\0';
2451 }
2452 if (me->inA) {
2453 /*
2454 * Ugh! The META tag, which is a HEAD element, is in an
2455 * Anchor, which is BODY element. All we can do is close the
2456 * Anchor and cross our fingers. - FM
2457 */
2458 if (me->inBoldA == TRUE && me->inBoldH == FALSE)
2459 HText_appendCharacter(me->text, LY_BOLD_END_CHAR);
2460 me->inBoldA = FALSE;
2461 HText_endAnchor(me->text, me->CurrentANum);
2462 me->inA = FALSE;
2463 me->CurrentANum = 0;
2464 }
2465 me->CurrentA = HTAnchor_findChildAndLink
2466 (
2467 me->node_anchor, /* Parent */
2468 id_string, /* Tag */
2469 href, /* Addresss */
2470 (HTLinkType *) 0); /* Type */
2471 if (id_string)
2472 *cp = '#';
2473 FREE(id_string);
2474 LYEnsureSingleSpace(me);
2475 if (me->inUnderline == FALSE)
2476 HText_appendCharacter(me->text, LY_UNDERLINE_START_CHAR);
2477 HTML_put_string(me, "REFRESH(");
2478 HTML_put_string(me, Seconds);
2479 HTML_put_string(me, " sec):");
2480 FREE(Seconds);
2481 if (me->inUnderline == FALSE)
2482 HText_appendCharacter(me->text, LY_UNDERLINE_END_CHAR);
2483 HTML_put_character(me, ' ');
2484 me->in_word = NO;
2485 HText_beginAnchor(me->text, me->inUnderline, me->CurrentA);
2486 if (me->inBoldH == FALSE)
2487 HText_appendCharacter(me->text, LY_BOLD_START_CHAR);
2488 HTML_put_string(me, href);
2489 FREE(href);
2490 if (me->inBoldH == FALSE)
2491 HText_appendCharacter(me->text, LY_BOLD_END_CHAR);
2492 HText_endAnchor(me->text, 0);
2493 LYEnsureSingleSpace(me);
2494 }
2495
2496 /*
2497 * Check for a suggested filename via a Content-Disposition with a
2498 * filename=name.suffix in it, if we don't already have it via a server
2499 * header. - FM
2500 */
2501 } else if (isEmpty(me->node_anchor->SugFname) &&
2502 !strcasecomp((http_equiv ?
2503 http_equiv : ""), "Content-Disposition")) {
2504 cp = content;
2505 while (*cp != '\0' && strncasecomp(cp, "filename", 8))
2506 cp++;
2507 if (*cp != '\0') {
2508 cp = LYSkipBlanks(cp + 8);
2509 if (*cp == '=')
2510 cp++;
2511 cp = LYSkipBlanks(cp);
2512 if (*cp != '\0') {
2513 StrAllocCopy(me->node_anchor->SugFname, cp);
2514 if (*me->node_anchor->SugFname == '"') {
2515 if ((cp = strchr((me->node_anchor->SugFname + 1),
2516 '"')) != NULL) {
2517 *(cp + 1) = '\0';
2518 HTMIME_TrimDoubleQuotes(me->node_anchor->SugFname);
2519 if (isEmpty(me->node_anchor->SugFname)) {
2520 FREE(me->node_anchor->SugFname);
2521 }
2522 } else {
2523 FREE(me->node_anchor->SugFname);
2524 }
2525 }
2526 #if defined(UNIX) && !defined(DOSPATH)
2527 /*
2528 * If blanks are not legal for local filenames, replace them
2529 * with underscores.
2530 */
2531 if ((cp = me->node_anchor->SugFname) != NULL) {
2532 while (*cp != '\0') {
2533 if (isspace(UCH(*cp)))
2534 *cp = '_';
2535 ++cp;
2536 }
2537 }
2538 #endif
2539 }
2540 }
2541 /*
2542 * Check for a Set-Cookie directive. - AK
2543 */
2544 } else if (!strcasecomp(NonNull(http_equiv), "Set-Cookie")) {
2545 /*
2546 * This will need to be updated when Set-Cookie/Set-Cookie2 handling is
2547 * finalized. For now, we'll still assume "historical" cookies in META
2548 * directives. - FM
2549 */
2550 url_type = is_url(me->inBASE ?
2551 me->base_href : me->node_anchor->address);
2552 if (url_type == HTTP_URL_TYPE || url_type == HTTPS_URL_TYPE) {
2553 LYSetCookie(content,
2554 NULL,
2555 (me->inBASE ?
2556 me->base_href : me->node_anchor->address));
2557 }
2558 }
2559
2560 /*
2561 * Free the copies. - FM
2562 */
2563 free_META_copies:
2564 FREE(http_equiv);
2565 FREE(name);
2566 FREE(content);
2567 FREE(charset);
2568 }
2569
2570 /*
2571 * This function handles P elements in HTML streams.
2572 * If start is TRUE it handles a start tag, and if
2573 * FALSE, an end tag. We presently handle start
2574 * and end tags identically, but this can lead to
2575 * a different number of blank lines between the
2576 * current paragraph and subsequent text when a P
2577 * end tag is present or not in the markup. - FM
2578 */
LYHandlePlike(HTStructured * me,const BOOL * present,STRING2PTR value,char ** include GCC_UNUSED,int align_idx,int start)2579 void LYHandlePlike(HTStructured * me, const BOOL *present,
2580 STRING2PTR value,
2581 char **include GCC_UNUSED,
2582 int align_idx,
2583 int start)
2584 {
2585 /*
2586 * FIG content should be a true block, which like P inherits the current
2587 * style. APPLET is like character elements or an ALT attribute, unless
2588 * its content contains a block element. If we encounter a P in either's
2589 * content, we set flags to treat the content as a block - FM
2590 */
2591 if (start) {
2592 if (me->inFIG)
2593 me->inFIGwithP = TRUE;
2594
2595 if (me->inAPPLET)
2596 me->inAPPLETwithP = TRUE;
2597 }
2598
2599 UPDATE_STYLE;
2600 if (me->List_Nesting_Level >= 0) {
2601 /*
2602 * We're in a list. Treat P as an instruction to create one blank
2603 * line, if not already present, then fall through to handle
2604 * attributes, with the "second line" margins - FM
2605 */
2606 if (me->inP) {
2607 if (me->inFIG || me->inAPPLET ||
2608 me->inCAPTION || me->inCREDIT ||
2609 me->sp->style->spaceAfter > 0 ||
2610 (start && me->sp->style->spaceBefore > 0)) {
2611 LYEnsureDoubleSpace(me);
2612 } else {
2613 LYEnsureSingleSpace(me);
2614 }
2615 }
2616 } else if (me->sp[0].tag_number == HTML_ADDRESS) {
2617 /*
2618 * We're in an ADDRESS. Treat P as an instruction to start a newline,
2619 * if needed, then fall through to handle attributes - FM
2620 */
2621 if (!HText_LastLineEmpty(me->text, FALSE)) {
2622 HText_setLastChar(me->text, ' '); /* absorb white space */
2623 HText_appendCharacter(me->text, '\r');
2624 }
2625 } else {
2626 if (start) {
2627 if (!(me->inLABEL && !me->inP)) {
2628 HText_appendParagraph(me->text);
2629 }
2630 } else if (me->sp->style->spaceAfter > 0) {
2631 LYEnsureDoubleSpace(me);
2632 } else {
2633 LYEnsureSingleSpace(me);
2634 }
2635 me->inLABEL = FALSE;
2636 }
2637 me->in_word = NO;
2638
2639 if (LYoverride_default_alignment(me)) {
2640 me->sp->style->alignment = LYstyles(me->sp[0].tag_number)->alignment;
2641 } else if ((me->List_Nesting_Level >= 0 &&
2642 (me->sp->style->id == ST_DivCenter ||
2643 me->sp->style->id == ST_DivLeft ||
2644 me->sp->style->id == ST_DivRight)) ||
2645 ((me->Division_Level < 0) &&
2646 (me->sp->style->id == ST_Normal ||
2647 me->sp->style->id == ST_Preformatted))) {
2648 me->sp->style->alignment = HT_LEFT;
2649 } else {
2650 me->sp->style->alignment = (short) me->current_default_alignment;
2651 }
2652
2653 if (start && align_idx >= 0) {
2654 if (present && present[align_idx] && value[align_idx]) {
2655 if (!strcasecomp(value[align_idx], "center") &&
2656 !(me->List_Nesting_Level >= 0 && !me->inP))
2657 me->sp->style->alignment = HT_CENTER;
2658 else if (!strcasecomp(value[align_idx], "right") &&
2659 !(me->List_Nesting_Level >= 0 && !me->inP))
2660 me->sp->style->alignment = HT_RIGHT;
2661 else if (!strcasecomp(value[align_idx], "left") ||
2662 !strcasecomp(value[align_idx], "justify"))
2663 me->sp->style->alignment = HT_LEFT;
2664 }
2665
2666 }
2667
2668 /*
2669 * Mark that we are starting a new paragraph and don't have any of its
2670 * text yet - FM
2671 */
2672 me->inP = FALSE;
2673
2674 return;
2675 }
2676
2677 /*
2678 * This function handles SELECT elements in HTML streams.
2679 * If start is TRUE it handles a start tag, and if FALSE,
2680 * an end tag. - FM
2681 */
LYHandleSELECT(HTStructured * me,const BOOL * present,STRING2PTR value,char ** include GCC_UNUSED,int start)2682 void LYHandleSELECT(HTStructured * me, const BOOL *present,
2683 STRING2PTR value,
2684 char **include GCC_UNUSED,
2685 int start)
2686 {
2687 int i;
2688
2689 if (start == TRUE) {
2690 char *name = NULL;
2691 BOOLEAN multiple = NO;
2692 char *size = NULL;
2693
2694 /*
2695 * Initialize the disable attribute.
2696 */
2697 me->select_disabled = FALSE;
2698
2699 /*
2700 * Check for unclosed TEXTAREA.
2701 */
2702 if (me->inTEXTAREA) {
2703 if (LYBadHTML(me)) {
2704 LYShowBadHTML("Bad HTML: Missing TEXTAREA end tag\n");
2705 }
2706 }
2707
2708 /*
2709 * Set to know we are in a select tag.
2710 */
2711 me->inSELECT = TRUE;
2712
2713 if (!(present && present[HTML_SELECT_NAME] &&
2714 non_empty(value[HTML_SELECT_NAME]))) {
2715 StrAllocCopy(name, "");
2716 } else if (strchr(value[HTML_SELECT_NAME], '&') == NULL) {
2717 StrAllocCopy(name, value[HTML_SELECT_NAME]);
2718 } else {
2719 StrAllocCopy(name, value[HTML_SELECT_NAME]);
2720 UNESCAPE_FIELDNAME_TO_STD(&name);
2721 }
2722 if (present && present[HTML_SELECT_MULTIPLE])
2723 multiple = YES;
2724 if (present && present[HTML_SELECT_DISABLED])
2725 me->select_disabled = TRUE;
2726 if (present && present[HTML_SELECT_SIZE] &&
2727 non_empty(value[HTML_SELECT_SIZE])) {
2728 /*
2729 * Let the size be determined by the number of OPTIONs. - FM
2730 */
2731 CTRACE((tfp, "LYHandleSELECT: Ignoring SIZE=\"%s\" for SELECT.\n",
2732 value[HTML_SELECT_SIZE]));
2733 }
2734
2735 if (me->inBoldH == TRUE &&
2736 (multiple == NO || LYSelectPopups == FALSE)) {
2737 HText_appendCharacter(me->text, LY_BOLD_END_CHAR);
2738 me->inBoldH = FALSE;
2739 me->needBoldH = TRUE;
2740 }
2741 if (me->inUnderline == TRUE &&
2742 (multiple == NO || LYSelectPopups == FALSE)) {
2743 HText_appendCharacter(me->text, LY_UNDERLINE_END_CHAR);
2744 me->inUnderline = FALSE;
2745 }
2746
2747 if ((multiple == NO && LYSelectPopups == TRUE) &&
2748 (me->sp[0].tag_number == HTML_PRE || me->inPRE == TRUE ||
2749 !me->sp->style->freeFormat) &&
2750 HText_LastLineSize(me->text, FALSE) > (LYcolLimit - 7)) {
2751 /*
2752 * Force a newline when we're using a popup in a PRE block and are
2753 * within 7 columns from the right margin. This will allow for the
2754 * '[' popup designator and help avoid a wrap in the underscore
2755 * placeholder for the retracted popup entry in the HText
2756 * structure. - FM
2757 */
2758 HTML_put_character(me, '\n');
2759 me->in_word = NO;
2760 }
2761
2762 LYCheckForID(me, present, value, (int) HTML_SELECT_ID);
2763
2764 HText_beginSelect(name, ATTR_CS_IN, multiple, size);
2765 FREE(name);
2766 FREE(size);
2767
2768 me->first_option = TRUE;
2769 } else {
2770 /*
2771 * Handle end tag.
2772 */
2773 char *ptr;
2774
2775 /*
2776 * Make sure we had a select start tag.
2777 */
2778 if (!me->inSELECT) {
2779 if (LYBadHTML(me)) {
2780 LYShowBadHTML("Bad HTML: Unmatched SELECT end tag\n");
2781 }
2782 return;
2783 }
2784
2785 /*
2786 * Set to know that we are no longer in a select tag.
2787 */
2788 me->inSELECT = FALSE;
2789
2790 /*
2791 * Clear the disable attribute.
2792 */
2793 me->select_disabled = FALSE;
2794
2795 /*
2796 * Finish the data off.
2797 */
2798 HTChunkTerminate(&me->option);
2799 /*
2800 * Finish the previous option.
2801 */
2802 ptr = HText_setLastOptionValue(me->text,
2803 me->option.data,
2804 me->LastOptionValue,
2805 LAST_ORDER,
2806 me->LastOptionChecked,
2807 me->UCLYhndl,
2808 ATTR_CS_IN);
2809 FREE(me->LastOptionValue);
2810
2811 me->LastOptionChecked = FALSE;
2812
2813 if (HTCurSelectGroupType == F_CHECKBOX_TYPE ||
2814 LYSelectPopups == FALSE) {
2815 /*
2816 * Start a newline after the last checkbox/button option.
2817 */
2818 LYEnsureSingleSpace(me);
2819 } else {
2820 /*
2821 * Output popup box with the default option to screen, but use
2822 * non-breaking spaces for output.
2823 */
2824 if (ptr &&
2825 me->sp[0].tag_number == HTML_PRE && strlen(ptr) > 6) {
2826 /*
2827 * The code inadequately handles OPTION fields in PRE tags.
2828 * We'll put up a minimum of 6 characters, and if any more
2829 * would exceed the wrap column, we'll ignore them.
2830 */
2831 for (i = 0; i < 6; i++) {
2832 if (*ptr == ' ')
2833 HText_appendCharacter(me->text, HT_NON_BREAK_SPACE);
2834 else
2835 HText_appendCharacter(me->text, *ptr);
2836 ptr++;
2837 }
2838 }
2839 for (; non_empty(ptr); ptr++) {
2840 if (*ptr == ' ')
2841 HText_appendCharacter(me->text, HT_NON_BREAK_SPACE);
2842 else
2843 HText_appendCharacter(me->text, *ptr);
2844 }
2845 /*
2846 * Add end option character.
2847 */
2848 if (!me->first_option) {
2849 HText_appendCharacter(me->text, ']');
2850 HText_setLastChar(me->text, ']');
2851 me->in_word = YES;
2852 }
2853 }
2854 HTChunkClear(&me->option);
2855
2856 if (me->Underline_Level > 0 && me->inUnderline == FALSE) {
2857 HText_appendCharacter(me->text, LY_UNDERLINE_START_CHAR);
2858 me->inUnderline = TRUE;
2859 }
2860 if (me->needBoldH == TRUE && me->inBoldH == FALSE) {
2861 HText_appendCharacter(me->text, LY_BOLD_START_CHAR);
2862 me->inBoldH = TRUE;
2863 me->needBoldH = FALSE;
2864 }
2865 }
2866 }
2867
2868 /*
2869 * This function strips white characters and
2870 * generally fixes up attribute values that
2871 * were received from the SGML parser and
2872 * are to be treated as partial or absolute
2873 * URLs. - FM
2874 */
LYLegitimizeHREF(HTStructured * me,char ** href,int force_slash,int strip_dots)2875 int LYLegitimizeHREF(HTStructured * me, char **href,
2876 int force_slash,
2877 int strip_dots)
2878 {
2879 int url_type = 0;
2880 char *p = NULL;
2881 char *pound = NULL;
2882 const char *Base = NULL;
2883
2884 if (!me || !href || isEmpty(*href))
2885 return (url_type);
2886
2887 if (!LYTrimStartfile(*href)) {
2888 /*
2889 * Collapse spaces in the actual URL, but just protect against tabs or
2890 * newlines in the fragment, if present. This seeks to cope with
2891 * atrocities inflicted on the Web by authoring tools such as
2892 * Frontpage. - FM
2893 */
2894
2895 /* Before working on spaces check if we have any, usually none. */
2896 p = LYSkipNonBlanks(*href);
2897
2898 if (*p) { /* p == first space character */
2899 /* no reallocs below, all converted in place */
2900
2901 pound = findPoundSelector(*href);
2902
2903 if (pound != NULL && pound < p) {
2904 convert_to_spaces(p, FALSE); /* done */
2905
2906 } else {
2907 if (pound != NULL)
2908 *pound = '\0'; /* mark */
2909
2910 /*
2911 * No blanks really belong in the HREF,
2912 * but if it refers to an actual file,
2913 * it may actually have blanks in the name.
2914 * Try to accommodate. See also HTParse().
2915 */
2916 if (LYRemoveNewlines(p) || strchr(p, '\t') != 0) {
2917 LYRemoveBlanks(p); /* a compromise... */
2918 }
2919
2920 if (pound != NULL) {
2921 p = strchr(p, '\0');
2922 *pound = '#'; /* restore */
2923 convert_to_spaces(pound, FALSE);
2924 if (p < pound)
2925 strcpy(p, pound);
2926 }
2927 }
2928 }
2929 }
2930 if (**href == '\0')
2931 return (url_type);
2932
2933 TRANSLATE_AND_UNESCAPE_TO_STD(href);
2934
2935 Base = me->inBASE ?
2936 me->base_href : me->node_anchor->address;
2937
2938 url_type = is_url(*href);
2939 if (!url_type && force_slash && **href == '.' &&
2940 (!strcmp(*href, ".") || !strcmp(*href, "..")) &&
2941 !isFILE_URL(Base)) {
2942 /*
2943 * The Fielding RFC/ID for resolving partial HREFs says that a slash
2944 * should be on the end of the preceding symbolic element for "." and
2945 * "..", but all tested browsers only do that for an explicit "./" or
2946 * "../", so we'll respect the RFC/ID only if force_slash was TRUE and
2947 * it's not a file URL. - FM
2948 */
2949 StrAllocCat(*href, "/");
2950 }
2951 if ((!url_type && LYStripDotDotURLs && strip_dots && **href == '.') &&
2952 !strncasecomp(Base, "http", 4)) {
2953 /*
2954 * We will be resolving a partial reference versus an http or https
2955 * URL, and it has lead dots, which may be retained when resolving via
2956 * HTParse(), but the request would fail if the first element of the
2957 * resultant path is two dots, because no http or https server accepts
2958 * such paths, and the current URL draft, likely to become an RFC, says
2959 * that it's optional for the UA to strip them as a form of error
2960 * recovery. So we will, recursively, for http/https URLs, like the
2961 * "major market browsers" which made this problem so common on the
2962 * Web, but we'll also issue a message about it, such that the bad
2963 * partial reference might get corrected by the document provider. -
2964 * FM
2965 */
2966 char *temp = NULL, *path = NULL, *cp;
2967 const char *str = "";
2968
2969 temp = HTParse(*href, Base, PARSE_ALL);
2970 path = HTParse(temp, "", PARSE_PATH + PARSE_PUNCTUATION);
2971 if (!StrNCmp(path, "/..", 3)) {
2972 cp = (path + 3);
2973 if (LYIsHtmlSep(*cp) || *cp == '\0') {
2974 if (Base[4] == 's') {
2975 str = "s";
2976 }
2977 CTRACE((tfp,
2978 "LYLegitimizeHREF: Bad value '%s' for http%s URL.\n",
2979 *href, str));
2980 CTRACE((tfp, " Stripping lead dots.\n"));
2981 if (!me->inBadHREF) {
2982 HTUserMsg(BAD_PARTIAL_REFERENCE);
2983 me->inBadHREF = TRUE;
2984 }
2985 }
2986 if (*cp == '\0') {
2987 StrAllocCopy(*href, "/");
2988 } else if (LYIsHtmlSep(*cp)) {
2989 while (!StrNCmp(cp, "/..", 3)) {
2990 if (*(cp + 3) == '/') {
2991 cp += 3;
2992 continue;
2993 } else if (*(cp + 3) == '\0') {
2994 *(cp + 1) = '\0';
2995 *(cp + 2) = '\0';
2996 }
2997 break;
2998 }
2999 StrAllocCopy(*href, cp);
3000 }
3001 }
3002 FREE(temp);
3003 FREE(path);
3004 }
3005 return (url_type);
3006 }
3007
3008 /*
3009 * This function checks for a Content-Base header,
3010 * and if not present, a Content-Location header
3011 * which is an absolute URL, and sets the BASE
3012 * accordingly. If set, it will be replaced by
3013 * any BASE tag in the HTML stream, itself. - FM
3014 */
LYCheckForContentBase(HTStructured * me)3015 void LYCheckForContentBase(HTStructured * me)
3016 {
3017 char *cp = NULL;
3018 BOOL present[HTML_BASE_ATTRIBUTES];
3019 const char *value[HTML_BASE_ATTRIBUTES];
3020 int i;
3021
3022 if (!(me && me->node_anchor))
3023 return;
3024
3025 if (me->node_anchor->content_base != NULL) {
3026 /*
3027 * We have a Content-Base value. Use it if it's non-zero length. - FM
3028 */
3029 if (*me->node_anchor->content_base == '\0')
3030 return;
3031 StrAllocCopy(cp, me->node_anchor->content_base);
3032 LYRemoveBlanks(cp);
3033 } else if (me->node_anchor->content_location != NULL) {
3034 /*
3035 * We didn't have a Content-Base value, but do have a Content-Location
3036 * value. Use it if it's an absolute URL. - FM
3037 */
3038 if (*me->node_anchor->content_location == '\0')
3039 return;
3040 StrAllocCopy(cp, me->node_anchor->content_location);
3041 LYRemoveBlanks(cp);
3042 if (!is_url(cp)) {
3043 FREE(cp);
3044 return;
3045 }
3046 } else {
3047 /*
3048 * We had neither a Content-Base nor Content-Location value. - FM
3049 */
3050 return;
3051 }
3052
3053 /*
3054 * If we collapsed to a zero-length value, ignore it. - FM
3055 */
3056 if (*cp == '\0') {
3057 FREE(cp);
3058 return;
3059 }
3060
3061 /*
3062 * Pass the value to HTML_start_element as the HREF of a BASE tag. - FM
3063 */
3064 for (i = 0; i < HTML_BASE_ATTRIBUTES; i++)
3065 present[i] = NO;
3066 present[HTML_BASE_HREF] = YES;
3067 value[HTML_BASE_HREF] = (const char *) cp;
3068 (*me->isa->start_element) (me, HTML_BASE, present, value,
3069 0, 0);
3070 FREE(cp);
3071 }
3072
3073 /*
3074 * This function creates NAMEd Anchors if a non-zero-length NAME
3075 * or ID attribute was present in the tag. - FM
3076 */
LYCheckForID(HTStructured * me,const BOOL * present,STRING2PTR value,int attribute)3077 void LYCheckForID(HTStructured * me, const BOOL *present,
3078 STRING2PTR value,
3079 int attribute)
3080 {
3081 HTChildAnchor *ID_A = NULL;
3082 char *temp = NULL;
3083
3084 if (!(me && me->text))
3085 return;
3086
3087 if (present && present[attribute]
3088 && non_empty(value[attribute])) {
3089 /*
3090 * Translate any named or numeric character references. - FM
3091 */
3092 StrAllocCopy(temp, value[attribute]);
3093 LYUCTranslateHTMLString(&temp, me->tag_charset, me->tag_charset,
3094 NO, NO, YES, st_URL);
3095
3096 /*
3097 * Create the link if we still have a non-zero-length string. - FM
3098 */
3099 if ((temp[0] != '\0') &&
3100 (ID_A = HTAnchor_findChildAndLink
3101 (
3102 me->node_anchor, /* Parent */
3103 temp, /* Tag */
3104 NULL, /* Addresss */
3105 (HTLinkType *) 0))) { /* Type */
3106 HText_beginAnchor(me->text, me->inUnderline, ID_A);
3107 HText_endAnchor(me->text, 0);
3108 }
3109 FREE(temp);
3110 }
3111 }
3112
3113 /*
3114 * This function creates a NAMEd Anchor for the ID string
3115 * passed to it directly as an argument. It assumes the
3116 * does not need checking for character references. - FM
3117 */
LYHandleID(HTStructured * me,const char * id)3118 void LYHandleID(HTStructured * me, const char *id)
3119 {
3120 HTChildAnchor *ID_A = NULL;
3121
3122 if (!(me && me->text) ||
3123 isEmpty(id))
3124 return;
3125
3126 /*
3127 * Create the link if we still have a non-zero-length string. - FM
3128 */
3129 if ((ID_A = HTAnchor_findChildAndLink
3130 (
3131 me->node_anchor, /* Parent */
3132 id, /* Tag */
3133 NULL, /* Addresss */
3134 (HTLinkType *) 0)) != NULL) { /* Type */
3135 HText_beginAnchor(me->text, me->inUnderline, ID_A);
3136 HText_endAnchor(me->text, 0);
3137 }
3138 }
3139
3140 /*
3141 * This function checks whether we want to override
3142 * the current default alignment for paragraphs and
3143 * instead use that specified in the element's style
3144 * sheet. - FM
3145 */
LYoverride_default_alignment(HTStructured * me)3146 BOOLEAN LYoverride_default_alignment(HTStructured * me)
3147 {
3148 if (!me)
3149 return NO;
3150
3151 switch (me->sp[0].tag_number) {
3152 case HTML_BLOCKQUOTE:
3153 case HTML_BQ:
3154 case HTML_NOTE:
3155 case HTML_FN:
3156 case HTML_ADDRESS:
3157 me->sp->style->alignment = HT_LEFT;
3158 return YES;
3159
3160 default:
3161 break;
3162 }
3163 return NO;
3164 }
3165
3166 /*
3167 * This function inserts newlines if needed to create double spacing,
3168 * and sets the left margin for subsequent text to the second line
3169 * indentation of the current style. - FM
3170 */
LYEnsureDoubleSpace(HTStructured * me)3171 void LYEnsureDoubleSpace(HTStructured * me)
3172 {
3173 if (!me || !me->text)
3174 return;
3175
3176 if (!HText_LastLineEmpty(me->text, FALSE)) {
3177 HText_setLastChar(me->text, ' '); /* absorb white space */
3178 HText_appendCharacter(me->text, '\r');
3179 HText_appendCharacter(me->text, '\r');
3180 } else if (!HText_PreviousLineEmpty(me->text, FALSE)) {
3181 HText_setLastChar(me->text, ' '); /* absorb white space */
3182 HText_appendCharacter(me->text, '\r');
3183 } else if (me->List_Nesting_Level >= 0) {
3184 HText_NegateLineOne(me->text);
3185 }
3186 me->in_word = NO;
3187 return;
3188 }
3189
3190 /*
3191 * This function inserts a newline if needed to create single spacing,
3192 * and sets the left margin for subsequent text to the second line
3193 * indentation of the current style. - FM
3194 */
LYEnsureSingleSpace(HTStructured * me)3195 void LYEnsureSingleSpace(HTStructured * me)
3196 {
3197 if (!me || !me->text)
3198 return;
3199
3200 if (!HText_LastLineEmpty(me->text, FALSE)) {
3201 HText_setLastChar(me->text, ' '); /* absorb white space */
3202 HText_appendCharacter(me->text, '\r');
3203 } else if (me->List_Nesting_Level >= 0) {
3204 HText_NegateLineOne(me->text);
3205 }
3206 me->in_word = NO;
3207 return;
3208 }
3209
3210 /*
3211 * This function resets paragraph alignments for block
3212 * elements which do not have a defined style sheet. - FM
3213 */
LYResetParagraphAlignment(HTStructured * me)3214 void LYResetParagraphAlignment(HTStructured * me)
3215 {
3216 if (!me)
3217 return;
3218
3219 if (me->List_Nesting_Level >= 0 ||
3220 ((me->Division_Level < 0) &&
3221 (me->sp->style->id == ST_Normal ||
3222 me->sp->style->id == ST_Preformatted))) {
3223 me->sp->style->alignment = HT_LEFT;
3224 } else {
3225 me->sp->style->alignment = (short) me->current_default_alignment;
3226 }
3227 return;
3228 }
3229
3230 /*
3231 * This example function checks whether the given anchor has
3232 * an address with a file scheme, and if so, loads it into the
3233 * the SGML parser's context->url element, which was passed as
3234 * the second argument. The handle_comment() calling function in
3235 * SGML.c then calls LYDoCSI() in LYUtils.c to insert HTML markup
3236 * into the corresponding stream, homologously to an SSI by an
3237 * HTTP server. - FM
3238 *
3239 * For functions similar to this but which depend on details of
3240 * the HTML handler's internal data, the calling interface should
3241 * be changed, and functions in SGML.c would have to make sure not
3242 * to call such functions inappropriately (e.g., calling a function
3243 * specific to the Lynx_HTML_Handler when SGML.c output goes to
3244 * some other HTStructured object like in HTMLGen.c), or the new
3245 * functions could be added to the SGML.h interface.
3246 */
LYCheckForCSI(HTParentAnchor * anchor,char ** url)3247 BOOLEAN LYCheckForCSI(HTParentAnchor *anchor,
3248 char **url)
3249 {
3250 if (!(anchor && anchor->address))
3251 return FALSE;
3252
3253 if (!isFILE_URL(anchor->address))
3254 return FALSE;
3255
3256 if (!LYisLocalHost(anchor->address))
3257 return FALSE;
3258
3259 StrAllocCopy(*url, anchor->address);
3260 return TRUE;
3261 }
3262
3263 /*
3264 * This function is called from the SGML parser to look at comments
3265 * and see whether we should collect some info from them. Currently
3266 * it only looks for comments with Message-Id and Subject info, in the
3267 * exact form generated by MHonArc for archived mailing list. If found,
3268 * the info is stored in the document's HTParentAnchor. It can later be
3269 * used for generating a mail response.
3270 *
3271 * We are extra picky here because there isn't any official definition
3272 * for these kinds of comments - we might (and still can) misinterpret
3273 * arbitrary comments as something they aren't.
3274 *
3275 * If something doesn't look right, for example invalid characters, the
3276 * strings are not stored. Mail responses will use something else as
3277 * the subject, probably the document URL, and will not have an
3278 * In-Reply-To header.
3279 *
3280 * All this is a hack - to do this the right way, mailing list archivers
3281 * would have to agree on some better mechanism to make this kind of info
3282 * from original mail headers available, for example using LINK. - kw
3283 */
LYCommentHacks(HTParentAnchor * anchor,const char * comment)3284 BOOLEAN LYCommentHacks(HTParentAnchor *anchor,
3285 const char *comment)
3286 {
3287 const char *cp;
3288 size_t len;
3289
3290 if (comment == NULL)
3291 return FALSE;
3292
3293 if (!(anchor && anchor->address))
3294 return FALSE;
3295
3296 if (StrNCmp(comment, "!--X-Message-Id: ", 17) == 0) {
3297 char *messageid = NULL;
3298 char *p;
3299
3300 for (cp = comment + 17; *cp; cp++) {
3301 if (UCH(*cp) >= 127 || !isgraph(UCH(*cp))) {
3302 break;
3303 }
3304 }
3305 if (strcmp(cp, " --")) {
3306 return FALSE;
3307 }
3308 cp = comment + 17;
3309 StrAllocCopy(messageid, cp);
3310 /* This should be ok - message-id should only contain 7-bit ASCII */
3311 if (!LYUCTranslateHTMLString(&messageid, 0, 0, NO, NO, YES, st_URL))
3312 return FALSE;
3313 for (p = messageid; *p; p++) {
3314 if (UCH(*p) >= 127 || !isgraph(UCH(*p))) {
3315 break;
3316 }
3317 }
3318 if (strcmp(p, " --")) {
3319 FREE(messageid);
3320 return FALSE;
3321 }
3322 if ((p = strchr(messageid, '@')) == NULL || p[1] == '\0') {
3323 FREE(messageid);
3324 return FALSE;
3325 }
3326 p = messageid;
3327 if ((len = strlen(p)) >= 8 && !strcmp(&p[len - 3], " --")) {
3328 p[len - 3] = '\0';
3329 } else {
3330 FREE(messageid);
3331 return FALSE;
3332 }
3333 if (HTAnchor_setMessageID(anchor, messageid)) {
3334 FREE(messageid);
3335 return TRUE;
3336 } else {
3337 FREE(messageid);
3338 return FALSE;
3339 }
3340 }
3341 if (StrNCmp(comment, "!--X-Subject: ", 14) == 0) {
3342 char *subject = NULL;
3343 char *p;
3344
3345 for (cp = comment + 14; *cp; cp++) {
3346 if (UCH(*cp) >= 127 || !isprint(UCH(*cp))) {
3347 return FALSE;
3348 }
3349 }
3350 cp = comment + 14;
3351 StrAllocCopy(subject, cp);
3352 /* @@@
3353 * This may not be the right thing for the subject - but mail
3354 * subjects shouldn't contain 8-bit characters in raw form anyway.
3355 * We have to unescape character entities, since that's what MHonArc
3356 * seems to generate. But if after that there are 8-bit characters
3357 * the string is rejected. We would probably not know correctly
3358 * what charset to assume anyway - the mail sender's can differ from
3359 * the archive's. And the code for sending mail cannot deal well
3360 * with 8-bit characters - we should not put them in the Subject
3361 * header in raw form, but don't have MIME encoding implemented.
3362 * Someone may want to do more about this... - kw
3363 */
3364 if (!LYUCTranslateHTMLString(&subject, 0, 0, NO, YES, NO, st_HTML))
3365 return FALSE;
3366 for (p = subject; *p; p++) {
3367 if (UCH(*p) >= 127 || !isprint(UCH(*p))) {
3368 FREE(subject);
3369 return FALSE;
3370 }
3371 }
3372 p = subject;
3373 if ((len = strlen(p)) >= 4 && !strcmp(&p[len - 3], " --")) {
3374 p[len - 3] = '\0';
3375 } else {
3376 FREE(subject);
3377 return FALSE;
3378 }
3379 if (HTAnchor_setSubject(anchor, subject)) {
3380 FREE(subject);
3381 return TRUE;
3382 } else {
3383 FREE(subject);
3384 return FALSE;
3385 }
3386 }
3387
3388 return FALSE;
3389 }
3390
3391 /*
3392 * Create the Title with any left-angle-brackets converted to < entities
3393 * and any ampersands converted to & entities. - FM
3394 *
3395 * Convert 8-bit letters to &#xUUUU to avoid dependencies from display
3396 * character set which may need changing. Do NOT convert any 8-bit chars
3397 * if we have CJK display. - LP
3398 */
LYformTitle(char ** dst,const char * src)3399 void LYformTitle(char **dst,
3400 const char *src)
3401 {
3402 if (HTCJK == JAPANESE) {
3403 char *tmp_buffer = NULL;
3404
3405 if ((tmp_buffer = (char *) malloc(strlen(src) + 1)) == 0)
3406 outofmem(__FILE__, "LYformTitle");
3407
3408 assert(tmp_buffer != NULL);
3409
3410 switch (kanji_code) { /* 1997/11/22 (Sat) 09:28:00 */
3411 case EUC:
3412 TO_EUC((const unsigned char *) src, (unsigned char *) tmp_buffer);
3413 break;
3414 case SJIS:
3415 TO_SJIS((const unsigned char *) src, (unsigned char *) tmp_buffer);
3416 break;
3417 default:
3418 CTRACE((tfp, "\nLYformTitle: kanji_code is an unexpected value."));
3419 strcpy(tmp_buffer, src);
3420 break;
3421 }
3422 StrAllocCopy(*dst, tmp_buffer);
3423 FREE(tmp_buffer);
3424 } else {
3425 StrAllocCopy(*dst, src);
3426 }
3427 }
3428