1 /*
2  * $LynxId: LYCharSets.c,v 1.68 2013/01/04 21:47:16 tom Exp $
3  */
4 #include <HTUtils.h>
5 #include <HTCJK.h>
6 #include <HTMLDTD.h>
7 
8 #include <LYGlobalDefs.h>
9 #include <UCMap.h>
10 #include <UCdomap.h>
11 #include <UCDefs.h>
12 #include <LYCharSets.h>
13 #include <GridText.h>
14 #include <LYCurses.h>
15 #include <LYStrings.h>
16 
17 #include <LYLeaks.h>
18 
/*
 * Prototype for _nc_set_locale(), which HTMLSetDisplayCharsetMatchLocale()
 * calls under this same condition.
 * NOTE(review): presumably the included headers do not declare it - confirm.
 */
#if defined(__MirBSD__) && defined(_nc_set_locale)
extern void _nc_set_locale(char *);
#endif
22 
HTkcode kanji_code = NOKANJI;	/* set by HTMLSetCharacterHandling() */
BOOLEAN LYHaveCJKCharacterSet = FALSE;	/* set by HTMLSetHaveCJKCharacterSet() */
BOOLEAN DisplayCharsetMatchLocale = TRUE;	/* set by HTMLSetDisplayCharsetMatchLocale() */
BOOL force_old_UCLYhndl_on_reload = FALSE;
int forced_UCLYhdnl;		/* no initializer: starts out as 0 */
int LYNumCharsets = 0;		/* Will be initialized later by UC_Register. */
int current_char_set = -1;	/* will be initialized later in LYMain.c */
int linedrawing_char_set = -1;

/*
 * Pointer, for HTML_put_entity().
 * Obsolete and probably not used(???).
 * Will be initialized in HTMLUseCharacterSet.
 */
STRING2PTR p_entity_values = NULL;
/*
 * Charset selection state, compiled only when USE_CHARSET_CHOICE is defined.
 */
#ifdef USE_CHARSET_CHOICE
charset_subset_t charset_subsets[MAXCHARSETS];
BOOL custom_display_charset = FALSE;
BOOL custom_assumed_doc_charset = FALSE;

/*
 * The maps and choice lists below are compiled out when
 * ALL_CHARSETS_IN_O_MENU_SCREEN is defined.
 */
#ifndef ALL_CHARSETS_IN_O_MENU_SCREEN
int display_charset_map[MAXCHARSETS];
int assumed_doc_charset_map[MAXCHARSETS];

/*
 * One slot more than the maps above - presumably room for a NULL terminator,
 * as in LYchar_set_names[]; confirm against the code that fills them.
 */
const char *display_charset_choices[MAXCHARSETS + 1];
const char *assumed_charset_choices[MAXCHARSETS + 1];
int displayed_display_charset_idx;
#endif
#endif /* USE_CHARSET_CHOICE */
49 
50 /*
51  * New character sets now declared with UCInit() in UCdomap.c
52  *
53  * INSTRUCTIONS for adding new character sets which do not have
54  *		Unicode tables now in UCdomap.h
55  *
56  *
 * [We hope you no longer need to correct or extend the old-style mappings
 * below, such as ISO_Latin1[] or SevenBitApproximations[] - this now works
 * via the new chartrans mechanism.  They are kept for compatibility only:  we
 * should clean this up, but that is not so easy...]
61  *
62  * Currently we only declare some charset's properties here (such as MIME
63  * names, etc.), it does not include real mapping.
64  *
65  * There is a place marked "Add your new character sets HERE" in this file.
66  * Make up a character set and add it in the same style as the ISO_LATIN1 set
67  * below, giving it a unique name.
68  *
69  * Add the name of the set to LYCharSets.  Similarly add the appropriate
70  * information to the tables below:  LYchar_set_names, LYCharSet_UC,
71  * LYlowest_eightbit.  These 4 tables all MUST have the same order.  (And this
72  * is the order you will see in Lynx Options Menu, which is why few
73  * unicode-based charsets are listed here).
74  *
75  */
76 
/*	Entity values -- for ISO Latin 1 local representation
 *
 *	This MUST match exactly the table referred to in the DTD!
 *
 *	One string per named entity, in the same entity order as
 *	SevenBitApproximations[] below.  Most entries are the single ISO
 *	Latin 1 byte, written as an octal escape.  The entries marked
 *	"NEVER CHANGE THIS" hold the control codes \001 (nbsp), \002 (emsp,
 *	ensp, thinsp) and \007 (shy) instead of a Latin 1 byte.  The dashes
 *	and the trade mark sign are given as plain ASCII text.
 */
static const char *ISO_Latin1[] =
{
    "\306",			/* capital AE diphthong (ligature) (&#198;) - AElig */
    "\301",			/* capital A, acute accent (&#193;) - Aacute */
    "\302",			/* capital A, circumflex accent (&#194;) - Acirc */
    "\300",			/* capital A, grave accent (&#192;) - Agrave */
    "\305",			/* capital A, ring (&#197;) - Aring */
    "\303",			/* capital A, tilde (&#195;) - Atilde */
    "\304",			/* capital A, dieresis or umlaut mark (&#196;) - Auml */
    "\307",			/* capital C, cedilla (&#199;) - Ccedil */
    "\320",			/* capital Eth or D with stroke (&#208;) - Dstrok */
    "\320",			/* capital Eth, Icelandic (&#208;) - ETH */
    "\311",			/* capital E, acute accent (&#201;) - Eacute */
    "\312",			/* capital E, circumflex accent (&#202;) - Ecirc */
    "\310",			/* capital E, grave accent (&#200;) - Egrave */
    "\313",			/* capital E, dieresis or umlaut mark (&#203;) - Euml */
    "\315",			/* capital I, acute accent (&#205;) - Iacute */
    "\316",			/* capital I, circumflex accent (&#206;) - Icirc */
    "\314",			/* capital I, grave accent (&#204;) - Igrave */
    "\317",			/* capital I, dieresis or umlaut mark (&#207;) - Iuml */
    "\321",			/* capital N, tilde (&#209;) - Ntilde */
    "\323",			/* capital O, acute accent (&#211;) - Oacute */
    "\324",			/* capital O, circumflex accent (&#212;) - Ocirc */
    "\322",			/* capital O, grave accent (&#210;) - Ograve */
    "\330",			/* capital O, slash (&#216;) - Oslash */
    "\325",			/* capital O, tilde (&#213;) - Otilde */
    "\326",			/* capital O, dieresis or umlaut mark (&#214;) - Ouml */
    "\336",			/* capital THORN, Icelandic (&#222;) - THORN */
    "\332",			/* capital U, acute accent (&#218;) - Uacute */
    "\333",			/* capital U, circumflex accent (&#219;) - Ucirc */
    "\331",			/* capital U, grave accent (&#217;) - Ugrave */
    "\334",			/* capital U, dieresis or umlaut mark (&#220;) - Uuml */
    "\335",			/* capital Y, acute accent (&#221;) - Yacute */
    "\341",			/* small a, acute accent (&#225;) - aacute */
    "\342",			/* small a, circumflex accent (&#226;) - acirc */
    "\264",			/* spacing acute (&#180;) - acute */
    "\346",			/* small ae diphthong (ligature) (&#230;) - aelig */
    "\340",			/* small a, grave accent (&#224;) - agrave */
    "\046",			/* ampersand (&#38;) - amp */
    "\345",			/* small a, ring (&#229;) - aring */
    "\343",			/* small a, tilde (&#227;) - atilde */
    "\344",			/* small a, dieresis or umlaut mark (&#228;) - auml */
    "\246",			/* broken vertical bar (&#166;) - brkbar */
    "\246",			/* broken vertical bar (&#166;) - brvbar */
    "\347",			/* small c, cedilla (&#231;) - ccedil */
    "\270",			/* spacing cedilla (&#184;) - cedil */
    "\242",			/* cent sign (&#162;) - cent */
    "\251",			/* copyright sign (&#169;) - copy */
    "\244",			/* currency sign (&#164;) - curren */
    "\260",			/* degree sign (&#176;) - deg */
    "\250",			/* spacing dieresis (&#168;) - die */
    "\367",			/* division sign (&#247;) - divide */
    "\351",			/* small e, acute accent (&#233;) - eacute */
    "\352",			/* small e, circumflex accent (&#234;) - ecirc */
    "\350",			/* small e, grave accent (&#232;) - egrave */
    "-",			/* dash the width of emsp - emdash */
    "\002",			/* emsp, em space - not collapsed NEVER CHANGE THIS - emsp */
    "-",			/* dash the width of ensp - endash */
    "\002",			/* ensp, en space - not collapsed NEVER CHANGE THIS - ensp */
    "\360",			/* small eth, Icelandic (&#240;) - eth */
    "\353",			/* small e, dieresis or umlaut mark (&#235;) - euml */
    "\275",			/* fraction 1/2 (&#189;) - frac12 */
    "\274",			/* fraction 1/4 (&#188;) - frac14 */
    "\276",			/* fraction 3/4 (&#190;) - frac34 */
    "\076",			/* greater than (&#62;) - gt */
    "\257",			/* spacing macron (&#175;) - hibar */
    "\355",			/* small i, acute accent (&#237;) - iacute */
    "\356",			/* small i, circumflex accent (&#238;) - icirc */
    "\241",			/* inverted exclamation mark (&#161;) - iexcl */
    "\354",			/* small i, grave accent (&#236;) - igrave */
    "\277",			/* inverted question mark (&#191;) - iquest */
    "\357",			/* small i, dieresis or umlaut mark (&#239;) - iuml */
    "\253",			/* angle quotation mark, left (&#171;) - laquo */
    "\074",			/* less than (&#60;) - lt */
    "\257",			/* spacing macron (&#175;) - macr */
    "-",			/* dash the width of emsp - mdash */
    "\265",			/* micro sign (&#181;) - micro */
    "\267",			/* middle dot (&#183;) - middot */
    "\001",			/* nbsp non-breaking space NEVER CHANGE THIS - nbsp */
    "-",			/* dash the width of ensp - ndash */
    "\254",			/* negation sign (&#172;) - not */
    "\361",			/* small n, tilde (&#241;) - ntilde */
    "\363",			/* small o, acute accent (&#243;) - oacute */
    "\364",			/* small o, circumflex accent (&#244;) - ocirc */
    "\362",			/* small o, grave accent (&#242;) - ograve */
    "\252",			/* feminine ordinal indicator (&#170;) - ordf */
    "\272",			/* masculine ordinal indicator (&#186;) - ordm */
    "\370",			/* small o, slash (&#248;) - oslash */
    "\365",			/* small o, tilde (&#245;) - otilde */
    "\366",			/* small o, dieresis or umlaut mark (&#246;) - ouml */
    "\266",			/* paragraph sign (&#182;) - para */
    "\261",			/* plus-or-minus sign (&#177;) - plusmn */
    "\243",			/* pound sign (&#163;) - pound */
    "\042",			/* quote '"' (&#34;) - quot */
    "\273",			/* angle quotation mark, right (&#187;) - raquo */
    "\256",			/* circled R registered sign (&#174;) - reg */
    "\247",			/* section sign (&#167;) - sect */
    "\007",			/* soft hyphen (&#173;) NEVER CHANGE THIS - shy */
    "\271",			/* superscript 1 (&#185;) - sup1 */
    "\262",			/* superscript 2 (&#178;) - sup2 */
    "\263",			/* superscript 3 (&#179;) - sup3 */
    "\337",			/* small sharp s, German (sz ligature) (&#223;) - szlig */
    "\002",			/* thin space - not collapsed NEVER CHANGE THIS - thinsp */
    "\376",			/* small thorn, Icelandic (&#254;) - thorn */
    "\327",			/* multiplication sign (&#215;) - times */
    "(TM)",			/* circled TM trade mark sign (&#8482;) - trade */
    "\372",			/* small u, acute accent (&#250;) - uacute */
    "\373",			/* small u, circumflex accent (&#251;) - ucirc */
    "\371",			/* small u, grave accent (&#249;) - ugrave */
    "\250",			/* spacing dieresis (&#168;) - uml */
    "\374",			/* small u, dieresis or umlaut mark (&#252;) - uuml */
    "\375",			/* small y, acute accent (&#253;) - yacute */
    "\245",			/* yen sign (&#165;) - yen */
    "\377",			/* small y, dieresis or umlaut mark (&#255;) - yuml */
};
196 
/*	Entity values -- 7 bit character approximations
 *
 *	This MUST match exactly the table referred to in the DTD!
 *
 *	One string per named entity, in the same entity order as
 *	ISO_Latin1[] above, using only 7-bit characters.  When LY_UMLAUT is
 *	defined the six umlaut vowels (Auml, Ouml, Uuml, auml, ouml, uuml) are
 *	written with a trailing "e" instead of as the bare vowel.  The entries
 *	marked "NEVER CHANGE THIS" hold the same control codes as in
 *	ISO_Latin1[]: \001 (nbsp), \002 (emsp, ensp, thinsp) and \007 (shy).
 */
const char *SevenBitApproximations[] =
{
    "AE",			/* capital AE diphthong (ligature) (&#198;) - AElig */
    "A",			/* capital A, acute accent (&#193;) - Aacute */
    "A",			/* capital A, circumflex accent (&#194;) - Acirc */
    "A",			/* capital A, grave accent (&#192;) - Agrave */
    "A",			/* capital A, ring (&#197;) - Aring */
    "A",			/* capital A, tilde (&#195;) - Atilde */
#ifdef LY_UMLAUT
    "Ae",			/* capital A, dieresis or umlaut mark (&#196;) - Auml */
#else
    "A",			/* capital A, dieresis or umlaut mark (&#196;) - Auml */
#endif				/* LY_UMLAUT */
    "C",			/* capital C, cedilla (&#199;) - Ccedil */
    "Dj",			/* capital D with stroke (&#208;) - Dstrok */
    "DH",			/* capital Eth, Icelandic (&#208;) - ETH */
    "E",			/* capital E, acute accent (&#201;) - Eacute */
    "E",			/* capital E, circumflex accent (&#202;) - Ecirc */
    "E",			/* capital E, grave accent (&#200;) - Egrave */
    "E",			/* capital E, dieresis or umlaut mark (&#203;) - Euml */
    "I",			/* capital I, acute accent (&#205;) - Iacute */
    "I",			/* capital I, circumflex accent (&#206;) - Icirc */
    "I",			/* capital I, grave accent (&#204;) - Igrave */
    "I",			/* capital I, dieresis or umlaut mark (&#207;) - Iuml */
    "N",			/* capital N, tilde (&#209;) - Ntilde */
    "O",			/* capital O, acute accent (&#211;) - Oacute */
    "O",			/* capital O, circumflex accent (&#212;) - Ocirc */
    "O",			/* capital O, grave accent (&#210;) - Ograve */
    "O",			/* capital O, slash (&#216;) - Oslash */
    "O",			/* capital O, tilde (&#213;) - Otilde */
#ifdef LY_UMLAUT
    "Oe",			/* capital O, dieresis or umlaut mark (&#214;) - Ouml */
#else
    "O",			/* capital O, dieresis or umlaut mark (&#214;) - Ouml */
#endif				/* LY_UMLAUT */
    "P",			/* capital THORN, Icelandic (&#222;) - THORN */
    "U",			/* capital U, acute accent (&#218;) - Uacute */
    "U",			/* capital U, circumflex accent (&#219;) - Ucirc */
    "U",			/* capital U, grave accent (&#217;) - Ugrave */
#ifdef LY_UMLAUT
    "Ue",			/* capital U, dieresis or umlaut mark (&#220;) - Uuml */
#else
    "U",			/* capital U, dieresis or umlaut mark (&#220;) - Uuml */
#endif				/* LY_UMLAUT */
    "Y",			/* capital Y, acute accent (&#221;) - Yacute */
    "a",			/* small a, acute accent (&#225;) - aacute */
    "a",			/* small a, circumflex accent (&#226;) - acirc */
    "'",			/* spacing acute (&#180;) - acute */
    "ae",			/* small ae diphthong (ligature) (&#230;) - aelig */
    "`a",			/* small a, grave accent (&#224;) - agrave */
    "&",			/* ampersand (&#38;) - amp */
    "a",			/* small a, ring (&#229;) - aring */
    "a",			/* small a, tilde (&#227;) - atilde */
#ifdef LY_UMLAUT
    "ae",			/* small a, dieresis or umlaut mark (&#228;) - auml */
#else
    "a",			/* small a, dieresis or umlaut mark (&#228;) - auml */
#endif				/* LY_UMLAUT */
    "|",			/* broken vertical bar (&#166;) - brkbar */
    "|",			/* broken vertical bar (&#166;) - brvbar */
    "c",			/* small c, cedilla (&#231;) - ccedil */
    ",",			/* spacing cedilla (&#184;) - cedil */
    "-c-",			/* cent sign (&#162;) - cent */
    "(c)",			/* copyright sign (&#169;) - copy */
    "CUR",			/* currency sign (&#164;) - curren */
    "DEG",			/* degree sign (&#176;) - deg */
    "\042",			/* spacing dieresis (&#168;) - die */
    "/",			/* division sign (&#247;) - divide */
    "e",			/* small e, acute accent (&#233;) - eacute */
    "e",			/* small e, circumflex accent (&#234;) - ecirc */
    "e",			/* small e, grave accent (&#232;) - egrave */
    "-",			/* dash the width of emsp - emdash */
    "\002",			/* emsp NEVER CHANGE THIS - emsp */
    "-",			/* dash the width of ensp - endash */
    "\002",			/* ensp NEVER CHANGE THIS - ensp */
    "dh",			/* small eth, Icelandic (&#240;) - eth */
    "e",			/* small e, dieresis or umlaut mark (&#235;) - euml */
    " 1/2",			/* fraction 1/2 (&#189;) - frac12 */
    " 1/4",			/* fraction 1/4 (&#188;) - frac14 */
    " 3/4",			/* fraction 3/4 (&#190;) - frac34 */
    ">",			/* greater than (&#62;) - gt */
    "-",			/* spacing macron (&#175;) - hibar */
    "i",			/* small i, acute accent (&#237;) - iacute */
    "i",			/* small i, circumflex accent (&#238;) - icirc */
    "!",			/* inverted exclamation mark (&#161;) - iexcl */
    "`i",			/* small i, grave accent (&#236;) - igrave */
    "?",			/* inverted question mark (&#191;) - iquest */
    "i",			/* small i, dieresis or umlaut mark (&#239;) - iuml */
    "<<",			/* angle quotation mark, left (&#171;) - laquo */
    "<",			/* less than (&#60;) - lt */
    "-",			/* spacing macron (&#175;) - macr */
    "-",			/* dash the width of emsp - mdash */
    "u",			/* micro sign (&#181;) - micro */
    ".",			/* middle dot (&#183;) - middot */
    "\001",			/* nbsp non-breaking space NEVER CHANGE THIS - nbsp */
    "-",			/* dash the width of ensp - ndash */
    "NOT",			/* negation sign (&#172;) - not */
    "n",			/* small n, tilde (&#241;) - ntilde */
    "o",			/* small o, acute accent (&#243;) - oacute */
    "o",			/* small o, circumflex accent (&#244;) - ocirc */
    "o",			/* small o, grave accent (&#242;) - ograve */
    "-a",			/* feminine ordinal indicator (&#170;) - ordf */
    "-o",			/* masculine ordinal indicator (&#186;) - ordm */
    "o",			/* small o, slash (&#248;) - oslash */
    "o",			/* small o, tilde (&#245;) - otilde */
#ifdef LY_UMLAUT
    "oe",			/* small o, dieresis or umlaut mark (&#246;) - ouml */
#else
    "o",			/* small o, dieresis or umlaut mark (&#246;) - ouml */
#endif				/* LY_UMLAUT */
    "P:",			/* paragraph sign (&#182;) - para */
    "+-",			/* plus-or-minus sign (&#177;) - plusmn */
    "-L-",			/* pound sign (&#163;) - pound */
    "\"",			/* quote '"' (&#34;) - quot */
    ">>",			/* angle quotation mark, right (&#187;) - raquo */
    "(R)",			/* circled R registered sign (&#174;) - reg */
    "S:",			/* section sign (&#167;) - sect */
    "\007",			/* soft hyphen (&#173;) NEVER CHANGE THIS - shy */
    "^1",			/* superscript 1 (&#185;) - sup1 */
    "^2",			/* superscript 2 (&#178;) - sup2 */
    "^3",			/* superscript 3 (&#179;) - sup3 */
    "ss",			/* small sharp s, German (sz ligature) (&#223;) - szlig */
    "\002",			/* thin space - not collapsed NEVER CHANGE THIS - thinsp */
    "p",			/* small thorn, Icelandic (&#254;) - thorn */
    "*",			/* multiplication sign (&#215;) - times */
    "(TM)",			/* circled TM trade mark sign (&#8482;) - trade */
    "u",			/* small u, acute accent (&#250;) - uacute */
    "u",			/* small u, circumflex accent (&#251;) - ucirc */
    "u",			/* small u, grave accent (&#249;) - ugrave */
    "\042",			/* spacing dieresis (&#168;) - uml */
#ifdef LY_UMLAUT
    "ue",			/* small u, dieresis or umlaut mark (&#252;) - uuml */
#else
    "u",			/* small u, dieresis or umlaut mark (&#252;) - uuml */
#endif				/* LY_UMLAUT */
    "y",			/* small y, acute accent (&#253;) - yacute */
    "YEN",			/* yen sign (&#165;) - yen */
    "y",			/* small y, dieresis or umlaut mark (&#255;) - yuml */
};
340 
341 /*
342  * Add your new character sets HERE (but only if you can't construct Unicode
343  * tables for them).  - FM
344  */
345 
/*
 * Add the array name to LYCharSets.
 *
 * Only the first two of the MAXCHARSETS slots are initialized here (index 0
 * is ISO_Latin1, index 1 is SevenBitApproximations); the remaining slots
 * start out as NULL.
 */
STRING2PTR LYCharSets[MAXCHARSETS] =
{
    ISO_Latin1,			/* ISO Latin 1          */
    SevenBitApproximations,	/* 7 Bit Approximations */
};
354 
/*
 * Add the name that the user will see below.  The order of LYCharSets and
 * LYchar_set_names MUST be the same.
 *
 * The list is NULL-terminated, which is why the array has MAXCHARSETS + 1
 * slots; UCGetLYhndl_byAnyName() stops its search at the first NULL entry.
 */
const char *LYchar_set_names[MAXCHARSETS + 1] =
{
    "Western (ISO-8859-1)",
    "7 bit approximations (US-ASCII)",
    (char *) 0
};
365 
/*
 * Associate additional pieces of info with each of the charsets listed above.
 * Will be automatically modified (and extended) by charset translations which
 * are loaded using the chartrans mechanism.  The most important piece of info
 * to put here is a MIME charset name.  Used for chartrans (see UCDefs.h).  The
 * order of LYCharSets and LYCharSet_UC MUST be the same.
 *
 * Note that most of the charsets added by the new mechanism in src/chrtrans
 * don't show up here at all.  They don't have to.
 *
 * Code in this file reads the .enc, .MIMEname, .codepoints and .like8859
 * members of these entries; see the LYUCcharset declaration for the meaning
 * of each initializer position.
 */
LYUCcharset LYCharSet_UC[MAXCHARSETS] =
{
  /*
   * Zero position placeholder and HTMLGetEntityUCValue() reference.  - FM
   */
    {-1, "iso-8859-1", UCT_ENC_8BIT, 0,
     UCT_REP_IS_LAT1,
     UCT_CP_IS_LAT1, UCT_R_LAT1, UCT_R_LAT1},

  /*
   * Placeholders for Unicode tables.  - FM
   */
    {-1, "us-ascii", UCT_ENC_7BIT, 0,
     UCT_REP_SUBSETOF_LAT1,
     UCT_CP_SUBSETOF_LAT1, UCT_R_ASCII, UCT_R_ASCII},

};
393 
/*
 * Add the code of the lowest character with the high bit set that can be
 * directly displayed.  The order of LYCharSets and LYlowest_eightbit MUST be
 * the same.
 *
 * (If a charset has a chartrans unicode table, LYlowest_eightbit will be
 * verified/modified anyway.)
 *
 * HTMLSetCharacterHandling() compares these values against 130, 155, 160 and
 * 191.  999 is larger than any 8-bit code, so every such comparison treats
 * the 7 bit set as unable to display 8-bit characters directly.
 */
int LYlowest_eightbit[MAXCHARSETS] =
{
    160,			/* ISO Latin 1          */
    999,			/* 7 bit approximations */
};
407 
408 /*
409  * Function to set the handling of selected character sets based on the current
410  * LYUseDefaultRawMode value.  - FM
411  */
HTMLSetCharacterHandling(int i)412 void HTMLSetCharacterHandling(int i)
413 {
414     int chndl = safeUCGetLYhndl_byMIME(UCAssume_MIMEcharset);
415     BOOLEAN LYRawMode_flag = LYRawMode;
416     int UCLYhndl_for_unspec_flag = UCLYhndl_for_unspec;
417 
418     if (LYCharSet_UC[i].enc != UCT_ENC_CJK) {
419 	HTCJK = NOCJK;
420 	kanji_code = NOKANJI;
421 	if (i == chndl)
422 	    LYRawMode = LYUseDefaultRawMode;
423 	else
424 	    LYRawMode = (BOOL) (!LYUseDefaultRawMode);
425 
426 	HTPassEightBitNum = (BOOL) ((LYCharSet_UC[i].codepoints & UCT_CP_SUPERSETOF_LAT1)
427 				    || (LYCharSet_UC[i].like8859 & UCT_R_HIGH8BIT));
428 
429 	if (LYRawMode) {
430 	    HTPassEightBitRaw = (BOOL) (LYlowest_eightbit[i] <= 160);
431 	} else {
432 	    HTPassEightBitRaw = FALSE;
433 	}
434 	if (LYRawMode || i == chndl) {
435 	    HTPassHighCtrlRaw = (BOOL) (LYlowest_eightbit[i] <= 130);
436 	} else {
437 	    HTPassHighCtrlRaw = FALSE;
438 	}
439 
440 	HTPassHighCtrlNum = FALSE;
441 
442     } else {			/* CJK encoding: */
443 	const char *mime = LYCharSet_UC[i].MIMEname;
444 
445 	if (!strcmp(mime, "euc-cn")) {
446 	    HTCJK = CHINESE;
447 	    kanji_code = EUC;
448 	} else if (!strcmp(mime, "euc-jp")) {
449 	    HTCJK = JAPANESE;
450 	    kanji_code = EUC;
451 	} else if (!strcmp(mime, "shift_jis")) {
452 	    HTCJK = JAPANESE;
453 	    kanji_code = SJIS;
454 	} else if (!strcmp(mime, "euc-kr")) {
455 	    HTCJK = KOREAN;
456 	    kanji_code = EUC;
457 	} else if (!strcmp(mime, "big5")) {
458 	    HTCJK = TAIPEI;
459 	    kanji_code = EUC;
460 	}
461 
462 	/* for any CJK: */
463 	if (!LYUseDefaultRawMode)
464 	    HTCJK = NOCJK;
465 	LYRawMode = (BOOL) (IS_CJK_TTY ? TRUE : FALSE);
466 	HTPassEightBitRaw = FALSE;
467 	HTPassEightBitNum = FALSE;
468 	HTPassHighCtrlRaw = (BOOL) (IS_CJK_TTY ? TRUE : FALSE);
469 	HTPassHighCtrlNum = FALSE;
470     }
471 
472     /*
473      * Comment for coding below:
474      * UCLYhndl_for_unspec is "current" state with LYRawMode, but
475      * UCAssume_MIMEcharset is independent from LYRawMode:  holds the history
476      * and may be changed from 'O'ptions menu only.  - LP
477      */
478     if (LYRawMode) {
479 	UCLYhndl_for_unspec = i;	/* UCAssume_MIMEcharset not changed! */
480     } else {
481 	if (chndl != i &&
482 	    (LYCharSet_UC[i].enc != UCT_ENC_CJK ||
483 	     LYCharSet_UC[chndl].enc != UCT_ENC_CJK)) {
484 	    UCLYhndl_for_unspec = chndl;	/* fall to UCAssume_MIMEcharset */
485 	} else {
486 	    UCLYhndl_for_unspec = LATIN1;	/* UCAssume_MIMEcharset not changed! */
487 	}
488     }
489 
490 #ifdef USE_SLANG
491     if (LYlowest_eightbit[i] > 191) {
492 	/*
493 	 * Higher than this may output cntrl chars to screen.  - KW
494 	 */
495 	SLsmg_Display_Eight_Bit = 191;
496     } else {
497 	SLsmg_Display_Eight_Bit = LYlowest_eightbit[i];
498     }
499 #endif /* USE_SLANG */
500 
501     ena_csi(LYlowest_eightbit[current_char_set] > 155);
502 
503     /* some diagnostics */
504     if (TRACE) {
505 	if (LYRawMode_flag != LYRawMode)
506 	    CTRACE((tfp,
507 		    "HTMLSetCharacterHandling: LYRawMode changed %s -> %s\n",
508 		    (LYRawMode_flag ? "ON" : "OFF"),
509 		    (LYRawMode ? "ON" : "OFF")));
510 	if (UCLYhndl_for_unspec_flag != UCLYhndl_for_unspec)
511 	    CTRACE((tfp,
512 		    "HTMLSetCharacterHandling: UCLYhndl_for_unspec changed %d -> %d\n",
513 		    UCLYhndl_for_unspec_flag,
514 		    UCLYhndl_for_unspec));
515     }
516 
517     return;
518 }
519 
520 /*
521  * Function to set HTCJK based on "in" and "out" charsets.
522  */
Set_HTCJK(const char * inMIMEname,const char * outMIMEname)523 void Set_HTCJK(const char *inMIMEname,
524 	       const char *outMIMEname)
525 {
526     /* need not check for synonyms: MIMEnames got from LYCharSet_UC */
527 
528     if (LYRawMode) {
529 	if ((!strcmp(inMIMEname, "euc-jp") ||
530 #ifdef EXP_JAPANESEUTF8_SUPPORT
531 	     !strcmp(inMIMEname, "utf-8") ||
532 #endif
533 	     !strcmp(inMIMEname, "shift_jis")) &&
534 	    (!strcmp(outMIMEname, "euc-jp") ||
535 	     !strcmp(outMIMEname, "shift_jis"))) {
536 	    HTCJK = JAPANESE;
537 	} else if (!strcmp(inMIMEname, "euc-cn") &&
538 		   !strcmp(outMIMEname, "euc-cn")) {
539 	    HTCJK = CHINESE;
540 	} else if (!strcmp(inMIMEname, "big5") &&
541 		   !strcmp(outMIMEname, "big5")) {
542 	    HTCJK = TAIPEI;
543 	} else if (!strcmp(inMIMEname, "euc-kr") &&
544 		   !strcmp(outMIMEname, "euc-kr")) {
545 	    HTCJK = KOREAN;
546 	} else {
547 	    HTCJK = NOCJK;
548 	}
549     } else {
550 	HTCJK = NOCJK;
551     }
552 }
553 
554 /*
555  * Function to set the LYDefaultRawMode value based on the selected character
556  * set.  - FM
557  *
558  * Currently unused:  the default value so obvious that LYUseDefaultRawMode
559  * utilized directly by someone's mistake.  - LP
560  */
HTMLSetRawModeDefault(int i)561 static void HTMLSetRawModeDefault(int i)
562 {
563     LYDefaultRawMode = (BOOL) (LYCharSet_UC[i].enc == UCT_ENC_CJK);
564     return;
565 }
566 
567 /*
568  * Function to set the LYUseDefaultRawMode value based on the selected
569  * character set and the current LYRawMode value.  - FM
570  */
HTMLSetUseDefaultRawMode(int i,int modeflag)571 void HTMLSetUseDefaultRawMode(int i,
572 			      int modeflag)
573 {
574     if (LYCharSet_UC[i].enc != UCT_ENC_CJK) {
575 
576 	int chndl = safeUCGetLYhndl_byMIME(UCAssume_MIMEcharset);
577 
578 	if (i == chndl)
579 	    LYUseDefaultRawMode = (BOOLEAN) modeflag;
580 	else
581 	    LYUseDefaultRawMode = (BOOL) (!modeflag);
582     } else			/* CJK encoding: */
583 	LYUseDefaultRawMode = (BOOLEAN) modeflag;
584 
585     return;
586 }
587 
588 /*
589  * Function to set the LYHaveCJKCharacterSet value based on the selected
590  * character set.  - FM
591  */
HTMLSetHaveCJKCharacterSet(int i)592 static void HTMLSetHaveCJKCharacterSet(int i)
593 {
594     LYHaveCJKCharacterSet = (BOOL) (LYCharSet_UC[i].enc == UCT_ENC_CJK);
595     return;
596 }
597 
598 /*
599  * Function to set the DisplayCharsetMatchLocale value based on the selected
600  * character set.  It is used in UPPER8 for 8bit case-insensitive search by
601  * matching def7_uni.tbl images.  - LP
602  */
HTMLSetDisplayCharsetMatchLocale(int i)603 static void HTMLSetDisplayCharsetMatchLocale(int i)
604 {
605     BOOLEAN match;
606 
607 #if defined(__MirBSD__) && defined(_nc_set_locale)
608     if (LYCharSet_UC[i].enc == UCT_ENC_UTF8) {
609 	_nc_set_locale(NULL);
610 	match = TRUE;
611     } else {
612 	static char locbuf[32];
613 
614 	snprintf(locbuf, sizeof(locbuf), "en_US.%s", LYCharSet_UC[i].MIMEname);
615 	_nc_set_locale(locbuf);
616 	match = FALSE;
617     }
618 #else
619     if (LYHaveCJKCharacterSet) {
620 	/*
621 	 * We have no intention to pass CJK via UCTransChar if that happened.
622 	 * Let someone from CJK correct this if necessary.
623 	 */
624 	DisplayCharsetMatchLocale = TRUE;	/* old-style */
625 	return;
626 
627     } else if (strncasecomp(LYCharSet_UC[i].MIMEname, "cp", 2) ||
628 	       strncasecomp(LYCharSet_UC[i].MIMEname, "windows", 7)) {
629 	/*
630 	 * Assume dos/windows displays usually on remote terminal, hence it
631 	 * rarely matches locale.  (In fact, MS Windows codepoints locale are
632 	 * never seen on UNIX).
633 	 */
634 	match = FALSE;
635     } else {
636 	match = TRUE;		/* guess, but see below */
637 
638 #if !defined(LOCALE)
639 	if (LYCharSet_UC[i].enc != UCT_ENC_UTF8)
640 	    /*
641 	     * Leave true for utf-8 display - the code doesn't deal very well
642 	     * with this case.  - kw
643 	     */
644 	    match = FALSE;
645 #else
646 	if (UCForce8bitTOUPPER) {
647 	    /*
648 	     * Force disable locale (from lynx.cfg)
649 	     */
650 	    match = FALSE;
651 	}
652 #endif
653     }
654 #endif /* MirBSD, _nc_set_locale */
655 
656     DisplayCharsetMatchLocale = match;
657     return;
658 }
659 
/*
 * Compatibility code for lynx 2.8/2.7.2 (and earlier):  the "human-readable"
 * charset names change over time, so the historical names are mapped to MIME
 * names here to keep old lynx.cfg and (especially) .lynxrc files recognized.
 * Please update this table whenever you change the "fullname" of any present
 * charset.
 */
typedef struct _names_pairs {
    const char *fullname;	/* historical name as shown to the user */
    const char *MIMEname;	/* MIME charset name it stands for */
} names_pairs;
670 /* *INDENT-OFF* */
/*
 * Historical charset display names and the MIME name each one maps to.
 * Several old names may map to the same MIME name.  The list ends with a
 * {NULL, NULL} entry; UCGetLYhndl_byAnyName() scans it until fullname is NULL.
 */
static const names_pairs OLD_charset_names[] =
{
    {"ISO Latin 1",		"iso-8859-1"},
    {"ISO Latin 2",             "iso-8859-2"},
    {"WinLatin1 (cp1252)",      "windows-1252"},
    {"DEC Multinational",       "dec-mcs"},
    {"Macintosh (8 bit)",       "macintosh"},
    {"NeXT character set",      "next"},
    {"KOI8-R Cyrillic",         "koi8-r"},
    {"Chinese",                 "euc-cn"},
    {"Japanese (EUC)",          "euc-jp"},
    {"Japanese (SJIS)",         "shift_jis"},
    {"Korean",                  "euc-kr"},
    {"Taipei (Big5)",           "big5"},
    {"Vietnamese (VISCII)",     "viscii"},
    {"7 bit approximations",    "us-ascii"},
    {"Transparent",             "x-transparent"},
    {"DosLatinUS (cp437)",      "cp437"},
    {"IBM PC character set",    "cp437"},
    {"DosLatin1 (cp850)",       "cp850"},
    {"IBM PC codepage 850",     "cp850"},
    {"DosLatin2 (cp852)",       "cp852"},
    {"PC Latin2 CP 852",        "cp852"},
    {"DosCyrillic (cp866)",     "cp866"},
    {"DosArabic (cp864)",       "cp864"},
    {"DosGreek (cp737)",        "cp737"},
    {"DosBaltRim (cp775)",      "cp775"},
    {"DosGreek2 (cp869)",       "cp869"},
    {"DosHebrew (cp862)",       "cp862"},
    {"WinLatin2 (cp1250)",      "windows-1250"},
    {"WinCyrillic (cp1251)",    "windows-1251"},
    {"WinGreek (cp1253)",       "windows-1253"},
    {"WinHebrew (cp1255)",      "windows-1255"},
    {"WinArabic (cp1256)",      "windows-1256"},
    {"WinBaltRim (cp1257)",     "windows-1257"},
    {"ISO Latin 3",             "iso-8859-3"},
    {"ISO Latin 4",             "iso-8859-4"},
    {"ISO 8859-5 Cyrillic",     "iso-8859-5"},
    {"ISO 8859-6 Arabic",       "iso-8859-6"},
    {"ISO 8859-7 Greek",        "iso-8859-7"},
    {"ISO 8859-8 Hebrew",       "iso-8859-8"},
    {"ISO-8859-8-I",            "iso-8859-8"},
    {"ISO-8859-8-E",            "iso-8859-8"},
    {"ISO 8859-9 (Latin 5)",    "iso-8859-9"},
    {"ISO 8859-10",             "iso-8859-10"},
    {"UNICODE UTF 8",           "utf-8"},
    {"RFC 1345 w/o Intro",      "mnemonic+ascii+0"},
    {"RFC 1345 Mnemonic",       "mnemonic"},
    {NULL, NULL},		/* terminated with NULL */
};
721 /* *INDENT-ON* */
722 
723 /*
724  * lynx 2.8/2.7.2 compatibility code:  read "character_set" parameter from
725  * lynx.cfg and .lynxrc in both MIME name and "human-readable" name (old and
726  * new style).  Returns -1 if not recognized.
727  */
UCGetLYhndl_byAnyName(char * value)728 int UCGetLYhndl_byAnyName(char *value)
729 {
730     int i;
731 
732     if (value == NULL)
733 	return -1;
734 
735     LYTrimTrailing(value);
736     CTRACE((tfp, "UCGetLYhndl_byAnyName(%s)\n", value));
737 
738     /* search by name */
739     for (i = 0; (i < MAXCHARSETS && LYchar_set_names[i]); i++) {
740 	if (!strcmp(value, LYchar_set_names[i])) {
741 	    return i;		/* OK */
742 	}
743     }
744 
745     /* search by old name from 2.8/2.7.2 version */
746     for (i = 0; (OLD_charset_names[i].fullname); i++) {
747 	if (!strcmp(value, OLD_charset_names[i].fullname)) {
748 	    return UCGetLYhndl_byMIME(OLD_charset_names[i].MIMEname);	/* OK */
749 	}
750     }
751 
752     return UCGetLYhndl_byMIME(value);	/* by MIME */
753 }
754 
/*
 * Entity names -- Ordered by ISO Latin 1 value.
 * ---------------------------------------------
 * For conversions of DECIMAL escaped entities.
 * Must be in order of ascending value, without gaps:  the entry for decimal
 * value N lives at index N - 160, so the table covers 160 through 255.
 *
 * Both the array and the strings it points to are read-only.
 */
static const char *const LYEntityNames[] =
{
/*	 NAME		   DECIMAL VALUE */
    "nbsp",			/* 160, non breaking space */
    "iexcl",			/* 161, inverted exclamation mark */
    "cent",			/* 162, cent sign */
    "pound",			/* 163, pound sign */
    "curren",			/* 164, currency sign */
    "yen",			/* 165, yen sign */
    "brvbar",			/* 166, broken vertical bar, (brkbar) */
    "sect",			/* 167, section sign */
    "uml",			/* 168, spacing dieresis */
    "copy",			/* 169, copyright sign */
    "ordf",			/* 170, feminine ordinal indicator */
    "laquo",			/* 171, angle quotation mark, left */
    "not",			/* 172, negation sign */
    "shy",			/* 173, soft hyphen */
    "reg",			/* 174, circled R registered sign */
    "hibar",			/* 175, spacing macron (named macr in HTML 4) */
    "deg",			/* 176, degree sign */
    "plusmn",			/* 177, plus-or-minus sign */
    "sup2",			/* 178, superscript 2 */
    "sup3",			/* 179, superscript 3 */
    "acute",			/* 180, spacing acute */
    "micro",			/* 181, micro sign */
    "para",			/* 182, paragraph sign */
    "middot",			/* 183, middle dot */
    "cedil",			/* 184, spacing cedilla */
    "sup1",			/* 185, superscript 1 */
    "ordm",			/* 186, masculine ordinal indicator */
    "raquo",			/* 187, angle quotation mark, right */
    "frac14",			/* 188, fraction 1/4 */
    "frac12",			/* 189, fraction 1/2 */
    "frac34",			/* 190, fraction 3/4 */
    "iquest",			/* 191, inverted question mark */
    "Agrave",			/* 192, capital A, grave accent */
    "Aacute",			/* 193, capital A, acute accent */
    "Acirc",			/* 194, capital A, circumflex accent */
    "Atilde",			/* 195, capital A, tilde */
    "Auml",			/* 196, capital A, dieresis or umlaut mark */
    "Aring",			/* 197, capital A, ring */
    "AElig",			/* 198, capital AE diphthong (ligature) */
    "Ccedil",			/* 199, capital C, cedilla */
    "Egrave",			/* 200, capital E, grave accent */
    "Eacute",			/* 201, capital E, acute accent */
    "Ecirc",			/* 202, capital E, circumflex accent */
    "Euml",			/* 203, capital E, dieresis or umlaut mark */
    "Igrave",			/* 204, capital I, grave accent */
    "Iacute",			/* 205, capital I, acute accent */
    "Icirc",			/* 206, capital I, circumflex accent */
    "Iuml",			/* 207, capital I, dieresis or umlaut mark */
    "ETH",			/* 208, capital Eth, Icelandic (or Latin2 Dstrok) */
    "Ntilde",			/* 209, capital N, tilde */
    "Ograve",			/* 210, capital O, grave accent */
    "Oacute",			/* 211, capital O, acute accent */
    "Ocirc",			/* 212, capital O, circumflex accent */
    "Otilde",			/* 213, capital O, tilde */
    "Ouml",			/* 214, capital O, dieresis or umlaut mark */
    "times",			/* 215, multiplication sign */
    "Oslash",			/* 216, capital O, slash */
    "Ugrave",			/* 217, capital U, grave accent */
    "Uacute",			/* 218, capital U, acute accent */
    "Ucirc",			/* 219, capital U, circumflex accent */
    "Uuml",			/* 220, capital U, dieresis or umlaut mark */
    "Yacute",			/* 221, capital Y, acute accent */
    "THORN",			/* 222, capital THORN, Icelandic */
    "szlig",			/* 223, small sharp s, German (sz ligature) */
    "agrave",			/* 224, small a, grave accent */
    "aacute",			/* 225, small a, acute accent */
    "acirc",			/* 226, small a, circumflex accent */
    "atilde",			/* 227, small a, tilde */
    "auml",			/* 228, small a, dieresis or umlaut mark */
    "aring",			/* 229, small a, ring */
    "aelig",			/* 230, small ae diphthong (ligature) */
    "ccedil",			/* 231, small c, cedilla */
    "egrave",			/* 232, small e, grave accent */
    "eacute",			/* 233, small e, acute accent */
    "ecirc",			/* 234, small e, circumflex accent */
    "euml",			/* 235, small e, dieresis or umlaut mark */
    "igrave",			/* 236, small i, grave accent */
    "iacute",			/* 237, small i, acute accent */
    "icirc",			/* 238, small i, circumflex accent */
    "iuml",			/* 239, small i, dieresis or umlaut mark */
    "eth",			/* 240, small eth, Icelandic */
    "ntilde",			/* 241, small n, tilde */
    "ograve",			/* 242, small o, grave accent */
    "oacute",			/* 243, small o, acute accent */
    "ocirc",			/* 244, small o, circumflex accent */
    "otilde",			/* 245, small o, tilde */
    "ouml",			/* 246, small o, dieresis or umlaut mark */
    "divide",			/* 247, division sign */
    "oslash",			/* 248, small o, slash */
    "ugrave",			/* 249, small u, grave accent */
    "uacute",			/* 250, small u, acute accent */
    "ucirc",			/* 251, small u, circumflex accent */
    "uuml",			/* 252, small u, dieresis or umlaut mark */
    "yacute",			/* 253, small y, acute accent */
    "thorn",			/* 254, small thorn, Icelandic */
    "yuml",			/* 255, small y, dieresis or umlaut mark */
};
861 
862 /*
863  * Function to return the entity names of ISO-8859-1 8-bit characters.  - FM
864  */
HTMLGetEntityName(UCode_t code)865 const char *HTMLGetEntityName(UCode_t code)
866 {
867 #define IntValue code
868     int MaxValue = (TABLESIZE(LYEntityNames) - 1);
869 
870     if (IntValue < 0 || IntValue > MaxValue) {
871 	return "";
872     }
873 
874     return LYEntityNames[IntValue];
875 }
876 
877 /*
878  * Function to return the UCode_t (long int) value for entity names.  It
879  * returns 0 if not found.
880  *
881  * unicode_entities[] handles all the names from old style entities[] too.
882  * Lynx now calls unicode_entities[] only through this function:
883  * HTMLGetEntityUCValue().  Note, we need not check for special characters here
884  * in function or even before it, we should check them *after* invoking this
885  * function, see put_special_unicodes() in SGML.c.
886  *
887  * In the future we will try to isolate all calls to entities[] in favor of new
888  * unicode-based chartrans scheme.  - LP
889  */
HTMLGetEntityUCValue(const char * name)890 UCode_t HTMLGetEntityUCValue(const char *name)
891 {
892 #include <entities.h>
893 
894     UCode_t value = 0;
895     size_t i, high, low;
896     int diff = 0;
897     size_t number_of_unicode_entities = TABLESIZE(unicode_entities);
898 
899     /*
900      * Make sure we have a non-zero length name.  - FM
901      */
902     if (isEmpty(name))
903 	return (value);
904 
905     /*
906      * Try UC_entity_info unicode_entities[].
907      */
908     for (low = 0, high = number_of_unicode_entities;
909 	 high > low;
910 	 diff < 0 ? (low = i + 1) : (high = i)) {
911 	/*
912 	 * Binary search.
913 	 */
914 	i = (low + (high - low) / 2);
915 	diff = AS_cmp(unicode_entities[i].name, name);	/* Case sensitive! */
916 	if (diff == 0) {
917 	    value = unicode_entities[i].code;
918 	    break;
919 	}
920     }
921     return (value);
922 }
923 
924 /*
925  * Original comment -
926  * Assume these are Microsoft code points, inflicted on us by FrontPage.  - FM
927  *
928  * MS FrontPage uses syntax like &#153; in 128-159 range and doesn't follow
929  * Unicode standards for this area.  Windows-1252 codepoints are assumed here.
930  *
931  * However see -
932  * http://www.whatwg.org/specs/web-apps/current-work/multipage/infrastructure.html#character-encodings-0
933  */
LYcp1252ToUnicode(UCode_t code)934 UCode_t LYcp1252ToUnicode(UCode_t code)
935 {
936     if ((code == 1) ||
937 	(code > 127 && code < 160)) {
938 	switch (code) {
939 	case 1:
940 	    /*
941 	     * WHITE SMILING FACE
942 	     */
943 	    code = 0x263a;
944 	    break;
945 	case 128:
946 	    /*
947 	     * EURO currency sign
948 	     */
949 	    code = 0x20ac;
950 	    break;
951 	case 130:
952 	    /*
953 	     * SINGLE LOW-9 QUOTATION MARK (sbquo)
954 	     */
955 	    code = 0x201a;
956 	    break;
957 	case 131:
958 	    /*
959 	     * LATIN SMALL LETTER F WITH HOOK
960 	     */
961 	    code = 0x192;
962 	    break;
963 	case 132:
964 	    /*
965 	     * DOUBLE LOW-9 QUOTATION MARK (bdquo)
966 	     */
967 	    code = 0x201e;
968 	    break;
969 	case 133:
970 	    /*
971 	     * HORIZONTAL ELLIPSIS (hellip)
972 	     */
973 	    code = 0x2026;
974 	    break;
975 	case 134:
976 	    /*
977 	     * DAGGER (dagger)
978 	     */
979 	    code = 0x2020;
980 	    break;
981 	case 135:
982 	    /*
983 	     * DOUBLE DAGGER (Dagger)
984 	     */
985 	    code = 0x2021;
986 	    break;
987 	case 136:
988 	    /*
989 	     * MODIFIER LETTER CIRCUMFLEX ACCENT
990 	     */
991 	    code = 0x2c6;
992 	    break;
993 	case 137:
994 	    /*
995 	     * PER MILLE SIGN (permil)
996 	     */
997 	    code = 0x2030;
998 	    break;
999 	case 138:
1000 	    /*
1001 	     * LATIN CAPITAL LETTER S WITH CARON
1002 	     */
1003 	    code = 0x160;
1004 	    break;
1005 	case 139:
1006 	    /*
1007 	     * SINGLE LEFT-POINTING ANGLE QUOTATION MARK (lsaquo)
1008 	     */
1009 	    code = 0x2039;
1010 	    break;
1011 	case 140:
1012 	    /*
1013 	     * LATIN CAPITAL LIGATURE OE
1014 	     */
1015 	    code = 0x152;
1016 	    break;
1017 	case 142:
1018 	    /*
1019 	     * LATIN CAPITAL LETTER Z WITH CARON
1020 	     */
1021 	    code = 0x17d;
1022 	    break;
1023 	case 145:
1024 	    /*
1025 	     * LEFT SINGLE QUOTATION MARK (lsquo)
1026 	     */
1027 	    code = 0x2018;
1028 	    break;
1029 	case 146:
1030 	    /*
1031 	     * RIGHT SINGLE QUOTATION MARK (rsquo)
1032 	     */
1033 	    code = 0x2019;
1034 	    break;
1035 	case 147:
1036 	    /*
1037 	     * LEFT DOUBLE QUOTATION MARK (ldquo)
1038 	     */
1039 	    code = 0x201c;
1040 	    break;
1041 	case 148:
1042 	    /*
1043 	     * RIGHT DOUBLE QUOTATION MARK (rdquo)
1044 	     */
1045 	    code = 0x201d;
1046 	    break;
1047 	case 149:
1048 	    /*
1049 	     * BULLET (bull)
1050 	     */
1051 	    code = 0x2022;
1052 	    break;
1053 	case 150:
1054 	    /*
1055 	     * EN DASH (ndash)
1056 	     */
1057 	    code = 0x2013;
1058 	    break;
1059 	case 151:
1060 	    /*
1061 	     * EM DASH (mdash)
1062 	     */
1063 	    code = 0x2014;
1064 	    break;
1065 	case 152:
1066 	    /*
1067 	     * SMALL TILDE (tilde)
1068 	     */
1069 	    code = 0x02dc;
1070 	    break;
1071 	case 153:
1072 	    /*
1073 	     * TRADE MARK SIGN (trade)
1074 	     */
1075 	    code = 0x2122;
1076 	    break;
1077 	case 154:
1078 	    /*
1079 	     * LATIN SMALL LETTER S WITH CARON
1080 	     */
1081 	    code = 0x161;
1082 	    break;
1083 	case 155:
1084 	    /*
1085 	     * SINGLE RIGHT-POINTING ANGLE QUOTATION MARK (rsaquo)
1086 	     */
1087 	    code = 0x203a;
1088 	    break;
1089 	case 156:
1090 	    /*
1091 	     * LATIN SMALL LIGATURE OE
1092 	     */
1093 	    code = 0x153;
1094 	    break;
1095 	case 158:
1096 	    /*
1097 	     * LATIN SMALL LETTER Z WITH CARON
1098 	     */
1099 	    code = 0x17e;
1100 	    break;
1101 	case 159:
1102 	    /*
1103 	     * LATIN CAPITAL LETTER Y WITH DIAERESIS
1104 	     */
1105 	    code = 0x178;
1106 	    break;
1107 	default:
1108 	    /*
1109 	     * Undefined (by convention, use the replacement character).
1110 	     */
1111 	    code = 0xfffd;
1112 	    break;
1113 	}
1114     }
1115     return code;
1116 }
1117 
/*
 * Function to select a character set and then set the character handling and
 * LYHaveCJKCharacterSet flag.  - FM
 *
 * 'i' is the index of the character set to use.  It is passed on to each of
 * the setup helpers below and is used, without any range check, to index
 * LYCharSets[].
 */
void HTMLUseCharacterSet(int i)
{
    HTMLSetRawModeDefault(i);
    /* entity value table of the selected set, for HTML_put_entity() */
    p_entity_values = LYCharSets[i];
    HTMLSetCharacterHandling(i);	/* set LYRawMode and CJK attributes */
    HTMLSetHaveCJKCharacterSet(i);
    HTMLSetDisplayCharsetMatchLocale(i);
    return;
}
1131 
/*
 * Initializer, calls initialization function for the CHARTRANS handling.  - KW
 *
 * Returns the value of UCInitialized as it stands after the UCInit() call.
 */
int LYCharSetsDeclared(void)
{
    UCInit();

    return UCInitialized;
}
1141 
#ifdef USE_CHARSET_CHOICE
/*
 * Finish the setup of charset_subsets[] and, unless
 * ALL_CHARSETS_IN_O_MENU_SCREEN is defined, build the compacted lists used by
 * the old options menu screen.
 *
 * First the charset assumed for unspecified documents (UCLYhndl_for_unspec)
 * and the current display charset (current_char_set) are un-hidden, so that
 * the values in effect are always offered.
 *
 * Then, for every charset i below LYNumCharsets that is not hidden:
 *   display list - display_charset_map[n] = i and display_charset_choices[n]
 *                  is its name from LYchar_set_names[];  the slot of the
 *                  current display charset is remembered in
 *                  displayed_display_charset_idx;
 *   assumed list - assumed_doc_charset_map[n] = i and
 *                  assumed_charset_choices[n] is its MIME name;  the slot is
 *                  recorded in charset_subsets[i].assumed_idx.
 * Both choices lists are NULL terminated.
 */
void init_charset_subsets(void)
{
#ifndef ALL_CHARSETS_IN_O_MENU_SCREEN
    /* only the old menu screen code needs these */
    int i, n;
    int cur_display = 0;
    int cur_assumed = 0;
#endif

    /*
     * add them to displayed values
     *
     * current_char_set starts out as -1 until it is set elsewhere, so make
     * sure a handle is a valid index before writing through it.
     */
    if (UCLYhndl_for_unspec >= 0 && UCLYhndl_for_unspec < MAXCHARSETS)
	charset_subsets[UCLYhndl_for_unspec].hide_assumed = FALSE;
    if (current_char_set >= 0 && current_char_set < MAXCHARSETS)
	charset_subsets[current_char_set].hide_display = FALSE;

#ifndef ALL_CHARSETS_IN_O_MENU_SCREEN
    /*all this stuff is for supporting old menu screen... */
    for (i = 0; i < LYNumCharsets; ++i) {
	if (charset_subsets[i].hide_display == FALSE) {
	    n = cur_display++;
	    if (i == current_char_set)
		displayed_display_charset_idx = n;
	    display_charset_map[n] = i;
	    display_charset_choices[n] = LYchar_set_names[i];
	}
	if (charset_subsets[i].hide_assumed == FALSE) {
	    n = cur_assumed++;
	    assumed_doc_charset_map[n] = i;
	    assumed_charset_choices[n] = LYCharSet_UC[i].MIMEname;
	    charset_subsets[i].assumed_idx = n;
	}
    }

    /*
     * Terminate both lists once, after the last entry.  Doing it here rather
     * than inside the loop also covers the case of no charsets at all.
     */
    display_charset_choices[cur_display] = NULL;
    assumed_charset_choices[cur_assumed] = NULL;
#endif
}
#endif /* USE_CHARSET_CHOICE */
1175