1 /*
2  * $LynxId: HTMLDTD.c,v 1.57 2010/09/25 00:30:56 tom Exp $
3  *
4  *		Our Static DTD for HTML
5  *		-----------------------
6  */
7 
8 /* Implements:
9 */
10 
11 #include <HTUtils.h>
12 #include <HTMLDTD.h>
13 #include <LYLeaks.h>
14 #include <LYJustify.h>
15 
/*
 * Character entities like &nbsp; are now excluded from our DTD tables.  They
 * are mapped to Unicode and handled directly by the chartrans code, in the
 * same way as numeric entities like &#123;.  See src/chrtrans/entities.h for
 * the real mapping.
 */
22 
/*	Entity Names
 *	------------
 *
 *	Names of the character entities known to the old translation
 *	mechanism, one string per entity, listed in ascending strcmp() order.
 *
 *	This table must be matched exactly with ALL the translation tables
 *	(this is an obsolete translation mechanism, probably unused,
 *	currently replaced with Unicode chartrans in most cases...)
 */
static const char *entities[] =
{
    /* ----- names starting with an upper-case letter ----- */
    "AElig",			/* capital AE ligature (diphthong) */
    "Aacute",			/* capital A with acute accent */
    "Acirc",			/* capital A with circumflex accent */
    "Agrave",			/* capital A with grave accent */
    "Aring",			/* capital A with ring */
    "Atilde",			/* capital A with tilde */
    "Auml",			/* capital A with dieresis (umlaut) */
    "Ccedil",			/* capital C with cedilla */
    "Dstrok",			/* capital Eth, Icelandic */
    "ETH",			/* capital Eth, Icelandic */
    "Eacute",			/* capital E with acute accent */
    "Ecirc",			/* capital E with circumflex accent */
    "Egrave",			/* capital E with grave accent */
    "Euml",			/* capital E with dieresis (umlaut) */
    "Iacute",			/* capital I with acute accent */
    "Icirc",			/* capital I with circumflex accent */
    "Igrave",			/* capital I with grave accent */
    "Iuml",			/* capital I with dieresis (umlaut) */
    "Ntilde",			/* capital N with tilde */
    "Oacute",			/* capital O with acute accent */
    "Ocirc",			/* capital O with circumflex accent */
    "Ograve",			/* capital O with grave accent */
    "Oslash",			/* capital O with slash */
    "Otilde",			/* capital O with tilde */
    "Ouml",			/* capital O with dieresis (umlaut) */
    "THORN",			/* capital Thorn, Icelandic */
    "Uacute",			/* capital U with acute accent */
    "Ucirc",			/* capital U with circumflex accent */
    "Ugrave",			/* capital U with grave accent */
    "Uuml",			/* capital U with dieresis (umlaut) */
    "Yacute",			/* capital Y with acute accent */

    /* ----- names starting with a lower-case letter ----- */
    "aacute",			/* small a with acute accent */
    "acirc",			/* small a with circumflex accent */
    "acute",			/* spacing acute */
    "aelig",			/* small ae ligature (diphthong) */
    "agrave",			/* small a with grave accent */
    "amp",			/* ampersand */
    "aring",			/* small a with ring */
    "atilde",			/* small a with tilde */
    "auml",			/* small a with dieresis (umlaut) */
    "brkbar",			/* broken vertical bar */
    "brvbar",			/* broken vertical bar */
    "ccedil",			/* small c with cedilla */
    "cedil",			/* spacing cedilla */
    "cent",			/* cent sign */
    "copy",			/* copyright sign */
    "curren",			/* currency sign */
    "deg",			/* degree sign */
    "die",			/* spacing dieresis */
    "divide",			/* division sign */
    "eacute",			/* small e with acute accent */
    "ecirc",			/* small e with circumflex accent */
    "egrave",			/* small e with grave accent */
    "emdash",			/* dash the width of emsp */
    "emsp",			/* em space - not collapsed */
    "endash",			/* dash the width of ensp */
    "ensp",			/* en space - not collapsed */
    "eth",			/* small eth, Icelandic */
    "euml",			/* small e with dieresis (umlaut) */
    "frac12",			/* fraction 1/2 */
    "frac14",			/* fraction 1/4 */
    "frac34",			/* fraction 3/4 */
    "gt",			/* greater than */
    "hibar",			/* spacing macron */
    "iacute",			/* small i with acute accent */
    "icirc",			/* small i with circumflex accent */
    "iexcl",			/* inverted exclamation mark */
    "igrave",			/* small i with grave accent */
    "iquest",			/* inverted question mark */
    "iuml",			/* small i with dieresis (umlaut) */
    "laquo",			/* left angle quotation mark */
    "lt",			/* less than */
    "macr",			/* spacing macron */
    "mdash",			/* dash the width of emsp */
    "micro",			/* micro sign */
    "middot",			/* middle dot */
    "nbsp",			/* non breaking space */
    "ndash",			/* dash the width of ensp */
    "not",			/* negation sign */
    "ntilde",			/* small n with tilde */
    "oacute",			/* small o with acute accent */
    "ocirc",			/* small o with circumflex accent */
    "ograve",			/* small o with grave accent */
    "ordf",			/* feminine ordinal indicator */
    "ordm",			/* masculine ordinal indicator */
    "oslash",			/* small o with slash */
    "otilde",			/* small o with tilde */
    "ouml",			/* small o with dieresis (umlaut) */
    "para",			/* paragraph sign */
    "plusmn",			/* plus-or-minus sign */
    "pound",			/* pound sign */
    "quot",			/* quote '"' */
    "raquo",			/* right angle quotation mark */
    "reg",			/* circled R registered sign */
    "sect",			/* section sign */
    "shy",			/* soft hyphen */
    "sup1",			/* superscript 1 */
    "sup2",			/* superscript 2 */
    "sup3",			/* superscript 3 */
    "szlig",			/* small sharp s, German (sz ligature) */
    "thinsp",			/* thin space (not collapsed) */
    "thorn",			/* small thorn, Icelandic */
    "times",			/* multiplication sign */
    "trade",			/* trade mark sign (U+2122) */
    "uacute",			/* small u with acute accent */
    "ucirc",			/* small u with circumflex accent */
    "ugrave",			/* small u with grave accent */
    "uml",			/* spacing dieresis */
    "uuml",			/* small u with dieresis (umlaut) */
    "yacute",			/* small y with acute accent */
    "yen",			/* yen sign */
    "yuml",			/* small y with dieresis (umlaut) */
};
145 
146 /*		Attribute Lists
147  *		---------------
148  *
149  *	Lists must be in alphabetical order by attribute name
150  *	The tag elements contain the number of attributes
151  */
152 
153 /* From Peter Flynn's intro to the HTML Pro DTD:
154 
155    %structure;
156 
157    DIV, CENTER, H1 to H6, P, UL, OL, DL, DIR, MENU, PRE, XMP, LISTING, BLOCKQUOTE, BQ,
158    2	1	2     2   1  8	 8   8	 8    8     8	 8    8        4	   4
159    MULTICOL,?NOBR, FORM, TABLE, ADDRESS, FIG, BDO, NOTE, and FN; plus?WBR, LI, and LH
160    8 n	    ?1 n   8	 8	2	 2    2    2	     2	    ?1 nE  4	   4
161 
162    %insertions;
163 
164    Elements which usually contain special-purpose material, or no text material at all.
165 
166    BASEFONT, APPLET, OBJECT, EMBED, SCRIPT, MAP, MARQUEE, HR, ISINDEX, BGSOUND, TAB,?IMG,
167    1 e?      2	     2 l     1 e    2 l     8	 4	  4 E 1? E     1 E	! E ?1 E
168    IMAGE, BR, plus NOEMBED, SERVER, SPACER, AUDIOSCOPE, and SIDEBAR; ?area
169    1 n	  1 E	     n	      n	      n	      n		      n	      8 E
170 
171    %text;
172 
173    Elements within the %structure; which directly contain running text.
174 
175    Descriptive or analytic markup: EM, STRONG, DFN, CODE, SAMP, KBD, VAR, CITE, Q, LANG, AU,
176 				   2   2       2    2	  2	2    2	  2	2  2 n	 2
177    AUTHOR, PERSON, ACRONYM, ABBR, INS, DEL, and SPAN
178    2	   2 n	   2	    2	    2	 2	  2
179    Visual markup:S, STRIKE, I, B, TT, U,?NOBR,?WBR, BR, BIG, SMALL, FONT, STYLE, BLINK, TAB,
180 		 1  1	    1  1  1   1  ?1 n ?1nE? 1 E  1   1	    1	  1 l	 1	1 E?
181    BLACKFACE, LIMITTEXT, NOSMARTQUOTES, and SHADOW
182    1 n	      1 n	 1 n		    1 n
183    Hypertext and graphics: A and?IMG
184 			   8	?8 E
185    Mathematical: SUB, SUP, and MATH
186 		 4    4        4 l
187    Documentary: COMMENT, ENTITY, ELEMENT, and ATTRIB
188 		4	 4 n	 4 n	      4 n
189    %formula;
190  */
191 
192 /*	Elements
193  *	--------
194  *
195  *	Must match definitions in HTMLDTD.html!
196  *	Must be in alphabetical order.
197  *
198  *  The T_* extra info is listed here, even though most fields are not used
199  *  in SGML.c if Old_DTD is set (with the exception of some Tgf_* flags).
200  *  This simplifies comparison of the tags_table0[] table (otherwise unchanged
201  *  from original Lynx treatment) with the tags_table1[] table below. - kw
202  *
203  *    Name*,	Attributes,	No. of attributes,     content,   extra info...
204  */
205 
206 #include <src0_HTMLDTD.h>
207 #include <src1_HTMLDTD.h>
208 
209 /* Dummy space, will be filled with the contents of either tags_table1
210    or tags_table0 on calling HTSwitchDTD - kw */
211 
212 static HTTag tags[HTML_ALL_ELEMENTS];
213 
214 const SGML_dtd HTML_dtd =
215 {
216     tags,
217     HTML_ELEMENTS,
218     entities,			/* probably unused */
219     TABLESIZE(entities),
220 };
221 
222 /* This function fills the "tags" part of the HTML_dtd structure with
223    what we want to use, either tags_table0 or tags_table1.  Note that it
224    has to be called at least once before HTML_dtd is used, otherwise
225    the HTML_dtd contents will be invalid!  This could be coded in a way
226    that would make an initialisation call unnecessary, but my C knowledge
227    is limited and I didn't want to list the whole tags_table1 table
228    twice... - kw */
HTSwitchDTD(int new_flag)229 void HTSwitchDTD(int new_flag)
230 {
231     if (TRACE)
232 	CTRACE((tfp,
233 		"HTMLDTD: Copying %s DTD element info of size %d, %d * %d\n",
234 		new_flag ? "strict" : "tagsoup",
235 		(int) (new_flag ? sizeof(tags_table1) : sizeof(tags_table0)),
236 		HTML_ALL_ELEMENTS,
237 		(int) sizeof(HTTag)));
238     if (new_flag)
239 	MemCpy(tags, tags_table1, HTML_ALL_ELEMENTS * sizeof(HTTag));
240     else
241 	MemCpy(tags, tags_table0, HTML_ALL_ELEMENTS * sizeof(HTTag));
242 }
243 
244 HTTag HTTag_unrecognized =
245 
246 {NULL_HTTag, NULL, 0, 0, SGML_EMPTY, T__UNREC_};
247 
248 /*
249  *	Utility Routine:  Useful for people building HTML objects.
250  */
251 
252 /*	Start anchor element
253  *	--------------------
254  *
255  *	It is kinda convenient to have a particulr routine for
256  *	starting an anchor element, as everything else for HTML is
257  *	simple anyway.
258  */
259 struct _HTStructured {
260     HTStructuredClass *isa;
261     /* ... */
262 };
263 
HTStartAnchor(HTStructured * obj,const char * name,const char * href)264 void HTStartAnchor(HTStructured * obj, const char *name,
265 		   const char *href)
266 {
267     BOOL present[HTML_A_ATTRIBUTES];
268     const char *value[HTML_A_ATTRIBUTES];
269     int i;
270 
271     for (i = 0; i < HTML_A_ATTRIBUTES; i++)
272 	present[i] = NO;
273 
274     if (name && *name) {
275 	present[HTML_A_NAME] = YES;
276 	value[HTML_A_NAME] = (const char *) name;
277     }
278     if (href) {
279 	present[HTML_A_HREF] = YES;
280 	value[HTML_A_HREF] = (const char *) href;
281     }
282 
283     (*obj->isa->start_element) (obj, HTML_A, present, value, -1, 0);
284 }
285 
/*
 * Start an anchor element with NAME, HREF and TYPE attributes and an
 * explicit tag charset.
 *
 * obj		structured object whose start_element method is invoked
 * name		value for the NAME attribute; ignored when NULL or empty
 * href		value for the HREF attribute; ignored when NULL or empty
 * linktype	value for the TYPE attribute; ignored when NULL or empty
 * tag_charset	handed to start_element unchanged
 */
void HTStartAnchor5(HTStructured * obj, const char *name,
		    const char *href,
		    const char *linktype,
		    int tag_charset)
{
    BOOL present[HTML_A_ATTRIBUTES];
    const char *value[HTML_A_ATTRIBUTES];
    int i;

    /*
     * Clear both arrays.  The value slots of absent attributes used to be
     * left indeterminate; make them NULL so that the callee never sees a
     * garbage pointer, even if it looks at a slot without checking present[].
     */
    for (i = 0; i < HTML_A_ATTRIBUTES; i++) {
	present[i] = NO;
	value[i] = NULL;
    }

    if (name && *name) {
	present[HTML_A_NAME] = YES;
	value[HTML_A_NAME] = name;
    }
    if (href && *href) {
	present[HTML_A_HREF] = YES;
	value[HTML_A_HREF] = href;
    }
    if (linktype && *linktype) {
	present[HTML_A_TYPE] = YES;
	value[HTML_A_TYPE] = linktype;
    }

    (*obj->isa->start_element) (obj, HTML_A, present, value, tag_charset, 0);
}
313 
/*
 * Start an ISINDEX element.
 *
 * obj		structured object whose start_element method is invoked
 * prompt	value for the PROMPT attribute; ignored when NULL or empty
 * href		value for the HREF attribute; ignored when NULL (an empty
 *		string is still passed on)
 *
 * The element is started with -1 and 0 as the last two start_element
 * arguments, as in HTStartAnchor().
 */
void HTStartIsIndex(HTStructured * obj, const char *prompt,
		    const char *href)
{
    BOOL present[HTML_ISINDEX_ATTRIBUTES];
    const char *value[HTML_ISINDEX_ATTRIBUTES];
    int i;

    /*
     * Clear both arrays.  The value slots of absent attributes used to be
     * left indeterminate; make them NULL so that the callee never sees a
     * garbage pointer, even if it looks at a slot without checking present[].
     */
    for (i = 0; i < HTML_ISINDEX_ATTRIBUTES; i++) {
	present[i] = NO;
	value[i] = NULL;
    }

    if (prompt && *prompt) {
	present[HTML_ISINDEX_PROMPT] = YES;
	value[HTML_ISINDEX_PROMPT] = prompt;
    }
    if (href) {
	present[HTML_ISINDEX_HREF] = YES;
	value[HTML_ISINDEX_HREF] = href;
    }

    (*obj->isa->start_element) (obj, HTML_ISINDEX, present, value, -1, 0);
}
335