1 /*
2  * $LynxId: HTML.h,v 1.33 2011/05/19 09:57:53 tom Exp $
3  *
4  *					HTML to rich text converter for libwww
5  *
6  *			THE HTML TO RTF OBJECT CONVERTER
7  *
8  *  This interprets the HTML semantics.
9  */
10 #ifndef HTML_H
11 #define HTML_H
12 
13 #ifndef HTUTILS_H
14 #include <HTUtils.h>
15 #endif /* HTUTILS_H */
16 
17 #include <UCDefs.h>
18 #include <UCAux.h>
19 #include <HTAnchor.h>
20 #include <HTMLDTD.h>
21 
22 #ifdef __cplusplus
23 extern "C" {
24 #endif
25 /* #define ATTR_CS_IN (me->T.output_utf8 ? me->UCLYhndl : 0) */
/*
 * Charset assumed for attribute values.  Like TRANSLATE_HTML below, it
 * expands to a member access through a variable named `me', so these macros
 * can only be used where such a pointer is in scope.
 */
#define ATTR_CS_IN me->tag_charset

/*
 * Wrappers around LYUCTranslateHTMLString(), always with st_HTML as the last
 * argument.  The ...6 form passes everything through; the shorter forms fill
 * in fixed values:
 *	...5	passes YES in the `spcls' position
 *	plain	additionally converts from ATTR_CS_IN to current_char_set
 */
#define TRANSLATE_AND_UNESCAPE_ENTITIES6(s, cs_from, cs_to, spcls, p, h) \
	LYUCTranslateHTMLString(s, cs_from, cs_to, spcls, p, h, st_HTML)
#define TRANSLATE_AND_UNESCAPE_ENTITIES5(s, cs_from, cs_to, p, h) \
	TRANSLATE_AND_UNESCAPE_ENTITIES6(s, cs_from, cs_to, YES, p, h)
#define TRANSLATE_AND_UNESCAPE_ENTITIES(s, p, h) \
	TRANSLATE_AND_UNESCAPE_ENTITIES5(s, ATTR_CS_IN, current_char_set, p, h)

/*
 * Wrappers around LYUCFullyTranslateString(), always with NO as the fourth
 * argument and st_HTML as the last one.  The ...7 form passes everything
 * else through; the shorter forms fill in fixed values:
 *	...5	passes YES for `spcls' and NO for `Back'
 *	plain	additionally converts from me->UCLYhndl to current_char_set
 */
#define TRANSLATE_HTML7(s, cs_from, cs_to, spcls, p, h, Back) \
	LYUCFullyTranslateString(s, cs_from, cs_to, NO, spcls, p, h, Back, st_HTML)
#define TRANSLATE_HTML5(s, cs_from, cs_to, p, h) \
	TRANSLATE_HTML7(s, cs_from, cs_to, YES, p, h, NO)
#define TRANSLATE_HTML(s, p, h) \
	TRANSLATE_HTML5(s, me->UCLYhndl, current_char_set, p, h)

/*
 * For attribute strings that should end up in some kind of "standard"
 * representation (character encoding; this used to be Latin-1), especially
 * URLs (including #fragments) and HTML NAME and ID values.  Both macros use
 * ATTR_CS_IN as source and as target charset and pass NO, NO, YES for the
 * remaining options; they differ only in the final argument (st_URL versus
 * st_HTML).
 */
#define TRANSLATE_AND_UNESCAPE_TO_STD(s) \
	LYUCTranslateHTMLString(s, ATTR_CS_IN, ATTR_CS_IN, NO, NO, YES, st_URL)
#define UNESCAPE_FIELDNAME_TO_STD(s) \
	TRANSLATE_AND_UNESCAPE_ENTITIES6(s, ATTR_CS_IN, ATTR_CS_IN, NO, NO, YES)
    /*
     * Read-only HTStructuredClass record exported by the HTML handler; only
     * declared here, defined elsewhere.
     * NOTE(review): presumably the record that the `isa' member of
     * struct _HTStructured points at -- confirm in HTML.c.
     */
    extern const HTStructuredClass HTMLPresentation;
49 
50 #ifdef Lynx_HTML_Handler
51 /*
52  *	This section is semi-private to HTML.c and its helper modules. - FM
53  *	-------------------------------------------------------------------
54  */
55 
    /*
     * Element type of the style stack in struct _HTStructured (its `stack'
     * array and `sp' pointer): a style paired with a tag number.
     * NOTE(review): tag_number is presumably the element number of the tag
     * that pushed the entry -- confirm in HTML.c.
     */
    typedef struct _stack_element {
	HTStyle *style;
	int tag_number;
    } stack_element;
60 
/*		HTML Object
 *		-----------
 *
 *	struct _HTStructured collects the state of one HTML handler object.
 *	The helper macros in this header (ATTR_CS_IN, TRANSLATE_HTML,
 *	UPDATE_STYLE) read its members through a pointer named `me'.
 *
 *	MAX_NESTING sizes both the DivisionAlignments[] array and the style
 *	stack[] array below.
 */
#define MAX_NESTING 800		/* Should be checked by parser */

    struct _HTStructured {
	const HTStructuredClass *isa;
	HTParentAnchor *node_anchor;
	HText *text;

	HTStream *target;	/* Output stream */
	HTStreamClass targetClass;	/* Output routines */

	HTChildAnchor *CurrentA;	/* current HTML_A anchor */
	int CurrentANum;	/* current HTML_A number */
	char *base_href;	/* current HTML_BASE href */
	char *map_address;	/* current HTML_MAP address */

	HTChunk title;		/* Grow by 128 */
	HTChunk object;		/* Grow by 128 */
	/* NOTE(review): the object_* members presumably mirror the state and
	 * attributes of the OBJECT element being collected -- confirm. */
	BOOL object_started;
	BOOL object_declare;
	BOOL object_shapes;
	BOOL object_ismap;
	char *object_usemap;
	char *object_id;
	char *object_title;
	char *object_data;
	char *object_type;
	char *object_classid;
	char *object_codebase;
	char *object_codetype;
	char *object_name;
	int objects_mixed_open, objects_figged_open;
	HTChunk option;		/* Grow by 128 */
	BOOL first_option;	/* First OPTION in SELECT? */
	char *LastOptionValue;
	BOOL LastOptionChecked;
	BOOL select_disabled;
	HTChunk textarea;	/* Grow by 128 */
	char *textarea_name;
	int textarea_name_cs;
	char *textarea_accept_cs;
	int textarea_cols;
	int textarea_rows;
	/* NOTE(review): these two flags are declared int, unlike the BOOL
	 * flags used elsewhere in this struct. */
	int textarea_disabled;
	int textarea_readonly;
	char *textarea_id;
	HTChunk math;		/* Grow by 128 */
	HTChunk style_block;	/* Grow by 128 */
	HTChunk script;		/* Grow by 128 */

	/*
	 *  Used for nested lists. - FM
	 *
	 *  OL_Counter and OL_Type share the hard-coded size 12.
	 *  NOTE(review): presumably both are indexed by List_Nesting_Level;
	 *  confirm that the code using them bounds the level accordingly.
	 */
	int List_Nesting_Level;	/* counter for list nesting level */
	int OL_Counter[12];	/* counter for ordered lists */
	char OL_Type[12];	/* types for ordered lists */
	int Last_OL_Count;	/* last count in ordered lists */
	char Last_OL_Type;	/* last type in ordered lists */

	int Division_Level;
	short DivisionAlignments[MAX_NESTING];
	int Underline_Level;
	int Quote_Level;

	BOOL UsePlainSpace;
	BOOL HiddenValue;
	int lastraw;

	const char *comment_start;	/* for literate programming */
	const char *comment_end;

	HTTag *current_tag;
	BOOL style_change;	/* tested by UPDATE_STYLE */
	HTStyle *new_style;
	HTStyle *old_style;
	int current_default_alignment;
	BOOL in_word;		/* Have just had a non-white char */
	stack_element stack[MAX_NESTING];
	stack_element *sp;	/* Style stack pointer */
	BOOL stack_overrun;	/* Was MAX_NESTING exceeded? */
	int skip_stack;		/* flag to skip next style stack operation */

	/*
	 *  Track if we are in an anchor, paragraph, address, base, etc.
	 */
	BOOL inA;
	BOOL inAPPLET;
	BOOL inAPPLETwithP;
	BOOL inBadBASE;
	BOOL inBadHREF;
	BOOL inBadHTML;
	BOOL inBASE;
	BOOL inBoldA;
	BOOL inBoldH;
	BOOL inCAPTION;
	BOOL inCREDIT;
	BOOL inFIG;
	BOOL inFIGwithP;
	BOOL inFONT;
	BOOL inFORM;
	BOOL inLABEL;
	BOOL inP;
	BOOL inPRE;
	BOOL inSELECT;
	BOOL inTABLE;
	BOOL inTEXTAREA;
	BOOL inUnderline;

	BOOL needBoldH;

	char *xinclude;		/* if no include string address passed */
	/*
	 * UCI and UCLYhndl give the UCInfo and charset registered for the HTML
	 * parser in the node_anchor's UCStages structure.  It indicates what is
	 * fed to the HTML parser as the stream of character data (not necessarily
	 * tags and attributes).  It should currently always be set to be the same
	 * as UCI and UCLYhndl for the HTEXT stage in the node_anchor's UCStages
	 * structure, since the HTML parser sends its input character data to the
	 * output without further charset translation.
	 */
	LYUCcharset *UCI;
	int UCLYhndl;
	/*
	 * inUCI and inUCLYhndl indicate the UCInfo and charset which the HTML
	 * parser treats as the input charset.  It is normally set to the UCI and
	 * UCLYhndl for the SGML parser in the node_anchor's UCStages structure
	 * (which may be a dummy, based on the MIME parser's UCI and UCLYhndl in
	 * that structure, when we are handling a local file or non-http(s)
	 * gateway).  It could be changed temporarily by the HTML parser, for
	 * conversions of attribute strings, but should be reset once done.  - FM
	 */
	LYUCcharset *inUCI;
	int inUCLYhndl;
	/*
	 * outUCI and outUCLYhndl indicate the UCInfo and charset which the HTML
	 * parser treats as the output charset.  It is normally set to its own UCI
	 * and UCLYhndl.  It could be changed for conversions of attribute strings,
	 * but should be reset once done.  - FM
	 */
	LYUCcharset *outUCI;
	int outUCLYhndl;
	/*
	 * T holds the transformation rules for conversions of strings between the
	 * input and output charsets by the HTML parser.  - FM
	 */
	UCTransParams T;

	int tag_charset;	/* charset for attribute values etc.; read by ATTR_CS_IN */
    };
212 
    /*
     * Helpers available to code that defines Lynx_HTML_Handler.
     * NOTE(review): their behavior is not visible in this header.  Judging by
     * the signatures alone, LYstyles() maps a style number to an HTStyle,
     * LYBadHTML() yields a yes/no answer for the given handler object and
     * LYShowBadHTML() takes a message string -- confirm in the defining
     * source before relying on this.
     */
    extern HTStyle *LYstyles(int style_number);
    extern BOOL LYBadHTML(HTStructured * me);
    extern void LYShowBadHTML(const char *s);
216 
/*
 *	Semi-Private functions. - FM
 *
 *	All of them take the handler object as `me'.  The HTML_put_ and
 *	HTML_write functions additionally take one character (c), a
 *	NUL-terminated string (s), a string with an explicit length (s, l) or an
 *	entity number, respectively.  HTML_put_entity is the only one that
 *	returns a value.
 *	NOTE(review): the meaning of that return value is not visible here --
 *	confirm in HTML.c.
 *
 *	actually_set_style is what the UPDATE_STYLE macro calls when
 *	me->style_change is set.
 */
    extern void HTML_put_character(HTStructured * me, int c);
    extern void HTML_put_string(HTStructured * me, const char *s);
    extern void HTML_write(HTStructured * me, const char *s, int l);
    extern int HTML_put_entity(HTStructured * me, int entity_number);
    extern void actually_set_style(HTStructured * me);
225 
/*	Style buffering avoids dummy paragraph begin/ends.
 *
 *	UPDATE_STYLE applies a pending style change: when me->style_change is
 *	set it calls actually_set_style(me).  It needs a variable named `me' in
 *	scope.
 *
 *	The body is wrapped in do { } while (0) so that the macro acts as one
 *	statement: a bare `if (...) { }' expansion followed by the caller's
 *	semicolon breaks `if (x) UPDATE_STYLE; else ...'.  Always write it with a
 *	trailing semicolon.
 */
#define UPDATE_STYLE \
	do { if (me->style_change) { actually_set_style(me); } } while (0)
229 #endif				/* Lynx_HTML_Handler */
230 
    /*
     * Declared outside the Lynx_HTML_Handler section, so it is visible to
     * every includer of this header.
     * NOTE(review): the name suggests that the string `i' is lower-cased in
     * place (the parameter is a non-const char *) -- confirm in the defining
     * source.  Also note that identifiers starting with "str" followed by a
     * lowercase letter are reserved by the C standard for future library
     * use, so this name could clash with a later <string.h>.
     */
    extern void strtolower(char *i);
232 
233 /*				P U B L I C
234 */
235 
/*
 *  HTConverter to present HTML
 *
 *  The four converters below share one signature: a presentation record
 *  (pres), a parent anchor (anchor) and a sink stream (sink); each returns an
 *  HTStream pointer.
 *  NOTE(review): how the variants differ is only hinted at by their names
 *  (ToPlain, ParsedPresent, ToC, Present) -- confirm in the defining source.
 *
 *  HTML_new takes a parent anchor, an output format and a target stream and
 *  returns an HTStructured pointer; presumably a newly created handler
 *  object -- confirm in the defining source.
 */
    extern HTStream *HTMLToPlain(HTPresentation *pres,
				 HTParentAnchor *anchor,
				 HTStream *sink);

    extern HTStream *HTMLParsedPresent(HTPresentation *pres,
				       HTParentAnchor *anchor,
				       HTStream *sink);

    extern HTStream *HTMLToC(HTPresentation *pres,
			     HTParentAnchor *anchor,
			     HTStream *sink);

    extern HTStream *HTMLPresent(HTPresentation *pres,
				 HTParentAnchor *anchor,
				 HTStream *sink);

    extern HTStructured *HTML_new(HTParentAnchor *anchor,
				  HTFormat format_out,
				  HTStream *target);
258 
/*
 * Record error message as a hypertext object.
 *
 * The error message should be marked as an error so that it can be reloaded
 * later.  This implementation just throws up an error message and leaves the
 * document unloaded.
 *
 * On entry,
 *      sink    is a stream to the output device if any
 *      number  is the HTTP error number
 *      message is the human readable message.
 * On exit,
 *      a return code like HT_LOADED if object exists else < 0
 */
    extern int HTLoadError(HTStream *sink,
			   int number,
			   const char *message);
276 
277 #ifdef __cplusplus
278 }
279 #endif
280 #endif				/* HTML_H */
281