1 /*
2  * $LynxId: UCdomap.c,v 1.95 2012/02/23 01:05:42 tom Exp $
3  *
4  *  UCdomap.c
5  *  =========
6  *
7  * This is a Lynx chartrans engine, its external calls are in UCMap.h
8  *
9  * Derived from code in the Linux kernel console driver.
10  * The GNU Public Licence therefore applies, see
11  * the file COPYING in the top-level directory
12  * which should come with every Lynx distribution.
13  *
14  *  [ original comment: - KW ]
15  *
16  * Mapping from internal code (such as Latin-1 or Unicode or IBM PC code)
17  * to font positions.
18  *
19  * aeb, 950210
20  */
21 #include <HTUtils.h>
22 #include <HTMLDTD.h>
23 
24 #include <LYGlobalDefs.h>
25 #include <UCdomap.h>
26 #include <UCMap.h>
27 #include <UCAux.h>
28 #include <UCDefs.h>
29 #include <LYCharSets.h>
30 #include <LYStrings.h>
31 #include <LYUtils.h>
32 
33 #if defined(USE_LOCALE_CHARSET) && defined(HAVE_LANGINFO_CODESET)
34 #include <langinfo.h>
35 #endif
36 
37 #ifdef EXP_JAPANESEUTF8_SUPPORT
38 #include <iconv.h>
39 #endif
40 
41 #include <LYLeaks.h>
42 
43 /*
44  * Include chartrans tables:
45  */
46 #include <cp1250_uni.h>		/* WinLatin2 (cp1250)   */
47 #include <cp1251_uni.h>		/* WinCyrillic (cp1251) */
48 #include <cp1252_uni.h>		/* WinLatin1 (cp1252)   */
49 #include <cp1253_uni.h>		/* WinGreek (cp1253)    */
50 #include <cp1255_uni.h>		/* WinHebrew (cp1255)   */
51 #include <cp1256_uni.h>		/* WinArabic (cp1256)   */
52 #include <cp1257_uni.h>		/* WinBaltRim (cp1257)  */
53 #include <cp437_uni.h>		/* DosLatinUS (cp437)   */
54 #include <cp737_uni.h>		/* DosGreek (cp737)     */
55 #include <cp775_uni.h>		/* DosBaltRim (cp775)   */
56 #include <cp850_uni.h>		/* DosLatin1 (cp850)    */
57 #include <cp852_uni.h>		/* DosLatin2 (cp852)    */
58 #include <cp857_uni.h>		/* DosTurkish (cp857)   */
59 #include <cp862_uni.h>		/* DosHebrew (cp862)    */
60 #include <cp864_uni.h>		/* DosArabic (cp864)    */
61 #include <cp866_uni.h>		/* DosCyrillic (cp866)  */
62 #include <cp869_uni.h>		/* DosGreek2 (cp869)    */
63 #include <def7_uni.h>		/* 7 bit approximations */
64 #include <dmcs_uni.h>		/* DEC Multinational    */
65 #include <hp_uni.h>		/* HP Roman8            */
66 #include <iso01_uni.h>		/* ISO Latin 1          */
67 #include <iso02_uni.h>		/* ISO Latin 2          */
68 #include <iso03_uni.h>		/* ISO Latin 3          */
69 #include <iso04_uni.h>		/* ISO Latin 4          */
70 #include <iso05_uni.h>		/* ISO 8859-5 Cyrillic  */
71 #include <iso06_uni.h>		/* ISO 8859-6 Arabic    */
72 #include <iso07_uni.h>		/* ISO 8859-7 Greek     */
73 #include <iso08_uni.h>		/* ISO 8859-8 Hebrew    */
74 #include <iso09_uni.h>		/* ISO 8859-9 (Latin 5) */
75 #include <iso10_uni.h>		/* ISO 8859-10          */
76 #include <iso13_uni.h>		/* ISO 8859-13 (Latin 7) */
77 #include <iso14_uni.h>		/* ISO 8859-14 (Latin 8) */
78 #include <iso15_uni.h>		/* ISO 8859-15 (Latin 9) */
79 #include <koi8r_uni.h>		/* KOI8-R Cyrillic      */
80 #include <mac_uni.h>		/* Macintosh (8 bit)    */
81 #include <mnem2_suni.h>		/* RFC 1345 Mnemonic    */
82 #include <next_uni.h>		/* NeXT character set   */
83 #include <rfc_suni.h>		/* RFC 1345 w/o Intro   */
84 /* #include <utf8_uni.h> */ /* UNICODE UTF 8        */
85 #include <viscii_uni.h>		/* Vietnamese (VISCII)  */
86 #include <cp866u_uni.h>		/* Ukrainian Cyrillic (866) */
87 #include <koi8u_uni.h>		/* Ukrainian Cyrillic (koi8-u */
88 #include <pt154_uni.h>		/* Cyrillic-Asian (PT154) */
89 
90 #ifdef CAN_AUTODETECT_DISPLAY_CHARSET
91 int auto_display_charset = -1;
92 #endif
93 
94 static const char *UC_GNsetMIMEnames[4] =
95 {
96     "iso-8859-1", "x-dec-graphics", "cp437", "x-transparent"
97 };
98 
99 static int UC_GNhandles[4] =
100 {
101     -1, -1, -1, -1
102 };
103 
104 /*
105  * Some of the code below, and some of the comments, are left in for
106  * historical reasons.  Not all those tables below are currently
107  * really needed (and what with all those hardwired codepoints),
108  * but let's keep them around for now.  They may come in handy if we
109  * decide to make more extended use of the mechanisms (including e.g.
110  * for chars < 127...).  - KW
111  */
112 
113 static u16 translations[][256] =
114 {
115     /*
116      * 8-bit Latin-1 mapped to Unicode -- trivial mapping.
117      */
118     {
119 	0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
120 	0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
121 	0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017,
122 	0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f,
123 	0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027,
124 	0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f,
125 	0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
126 	0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f,
127 	0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
128 	0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f,
129 	0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
130 	0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f,
131 	0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
132 	0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f,
133 	0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
134 	0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x007f,
135 	0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,
136 	0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,
137 	0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,
138 	0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,
139 	0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7,
140 	0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af,
141 	0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7,
142 	0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf,
143 	0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7,
144 	0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,
145 	0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7,
146 	0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df,
147 	0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7,
148 	0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,
149 	0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7,
150 	0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff
151     },
152     /*
153      * VT100 graphics mapped to Unicode.
154      */
155     {
156 	0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
157 	0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
158 	0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017,
159 	0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f,
160 	0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027,
161 	0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f,
162 	0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
163 	0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f,
164 	0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
165 	0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f,
166 	0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
167 	0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x00a0,
168 	0x25c6, 0x2592, 0x2409, 0x240c, 0x240d, 0x240a, 0x00b0, 0x00b1,
169 	0x2424, 0x240b, 0x2518, 0x2510, 0x250c, 0x2514, 0x253c, 0xf800,
170 	0xf801, 0x2500, 0xf803, 0xf804, 0x251c, 0x2524, 0x2534, 0x252c,
171 	0x2502, 0x2264, 0x2265, 0x03c0, 0x2260, 0x00a3, 0x00b7, 0x007f,
172 	0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,
173 	0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,
174 	0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,
175 	0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,
176 	0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7,
177 	0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af,
178 	0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7,
179 	0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf,
180 	0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7,
181 	0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,
182 	0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7,
183 	0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df,
184 	0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7,
185 	0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,
186 	0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7,
187 	0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff
188     },
189     /*
190      * IBM Codepage 437 mapped to Unicode.
191      */
192     {
193 	0x0000, 0x263a, 0x263b, 0x2665, 0x2666, 0x2663, 0x2660, 0x2022,
194 	0x25d8, 0x25cb, 0x25d9, 0x2642, 0x2640, 0x266a, 0x266b, 0x263c,
195 	0x25ba, 0x25c4, 0x2195, 0x203c, 0x00b6, 0x00a7, 0x25ac, 0x21a8,
196 	0x2191, 0x2193, 0x2192, 0x2190, 0x221f, 0x2194, 0x25b2, 0x25bc,
197 	0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027,
198 	0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f,
199 	0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
200 	0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f,
201 	0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
202 	0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f,
203 	0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
204 	0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f,
205 	0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
206 	0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f,
207 	0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
208 	0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x2302,
209 	0x00c7, 0x00fc, 0x00e9, 0x00e2, 0x00e4, 0x00e0, 0x00e5, 0x00e7,
210 	0x00ea, 0x00eb, 0x00e8, 0x00ef, 0x00ee, 0x00ec, 0x00c4, 0x00c5,
211 	0x00c9, 0x00e6, 0x00c6, 0x00f4, 0x00f6, 0x00f2, 0x00fb, 0x00f9,
212 	0x00ff, 0x00d6, 0x00dc, 0x00a2, 0x00a3, 0x00a5, 0x20a7, 0x0192,
213 	0x00e1, 0x00ed, 0x00f3, 0x00fa, 0x00f1, 0x00d1, 0x00aa, 0x00ba,
214 	0x00bf, 0x2310, 0x00ac, 0x00bd, 0x00bc, 0x00a1, 0x00ab, 0x00bb,
215 	0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x2561, 0x2562, 0x2556,
216 	0x2555, 0x2563, 0x2551, 0x2557, 0x255d, 0x255c, 0x255b, 0x2510,
217 	0x2514, 0x2534, 0x252c, 0x251c, 0x2500, 0x253c, 0x255e, 0x255f,
218 	0x255a, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x2567,
219 	0x2568, 0x2564, 0x2565, 0x2559, 0x2558, 0x2552, 0x2553, 0x256b,
220 	0x256a, 0x2518, 0x250c, 0x2588, 0x2584, 0x258c, 0x2590, 0x2580,
221 	0x03b1, 0x00df, 0x0393, 0x03c0, 0x03a3, 0x03c3, 0x00b5, 0x03c4,
222 	0x03a6, 0x0398, 0x03a9, 0x03b4, 0x221e, 0x03c6, 0x03b5, 0x2229,
223 	0x2261, 0x00b1, 0x2265, 0x2264, 0x2320, 0x2321, 0x00f7, 0x2248,
224 	0x00b0, 0x2219, 0x00b7, 0x221a, 0x207f, 0x00b2, 0x25a0, 0x00a0
225     },
226     /*
227      * User mapping -- default to codes for direct font mapping.
228      */
229     {
230 	0xf000, 0xf001, 0xf002, 0xf003, 0xf004, 0xf005, 0xf006, 0xf007,
231 	0xf008, 0xf009, 0xf00a, 0xf00b, 0xf00c, 0xf00d, 0xf00e, 0xf00f,
232 	0xf010, 0xf011, 0xf012, 0xf013, 0xf014, 0xf015, 0xf016, 0xf017,
233 	0xf018, 0xf019, 0xf01a, 0xf01b, 0xf01c, 0xf01d, 0xf01e, 0xf01f,
234 	0xf020, 0xf021, 0xf022, 0xf023, 0xf024, 0xf025, 0xf026, 0xf027,
235 	0xf028, 0xf029, 0xf02a, 0xf02b, 0xf02c, 0xf02d, 0xf02e, 0xf02f,
236 	0xf030, 0xf031, 0xf032, 0xf033, 0xf034, 0xf035, 0xf036, 0xf037,
237 	0xf038, 0xf039, 0xf03a, 0xf03b, 0xf03c, 0xf03d, 0xf03e, 0xf03f,
238 	0xf040, 0xf041, 0xf042, 0xf043, 0xf044, 0xf045, 0xf046, 0xf047,
239 	0xf048, 0xf049, 0xf04a, 0xf04b, 0xf04c, 0xf04d, 0xf04e, 0xf04f,
240 	0xf050, 0xf051, 0xf052, 0xf053, 0xf054, 0xf055, 0xf056, 0xf057,
241 	0xf058, 0xf059, 0xf05a, 0xf05b, 0xf05c, 0xf05d, 0xf05e, 0xf05f,
242 	0xf060, 0xf061, 0xf062, 0xf063, 0xf064, 0xf065, 0xf066, 0xf067,
243 	0xf068, 0xf069, 0xf06a, 0xf06b, 0xf06c, 0xf06d, 0xf06e, 0xf06f,
244 	0xf070, 0xf071, 0xf072, 0xf073, 0xf074, 0xf075, 0xf076, 0xf077,
245 	0xf078, 0xf079, 0xf07a, 0xf07b, 0xf07c, 0xf07d, 0xf07e, 0xf07f,
246 	0xf080, 0xf081, 0xf082, 0xf083, 0xf084, 0xf085, 0xf086, 0xf087,
247 	0xf088, 0xf089, 0xf08a, 0xf08b, 0xf08c, 0xf08d, 0xf08e, 0xf08f,
248 	0xf090, 0xf091, 0xf092, 0xf093, 0xf094, 0xf095, 0xf096, 0xf097,
249 	0xf098, 0xf099, 0xf09a, 0xf09b, 0xf09c, 0xf09d, 0xf09e, 0xf09f,
250 	0xf0a0, 0xf0a1, 0xf0a2, 0xf0a3, 0xf0a4, 0xf0a5, 0xf0a6, 0xf0a7,
251 	0xf0a8, 0xf0a9, 0xf0aa, 0xf0ab, 0xf0ac, 0xf0ad, 0xf0ae, 0xf0af,
252 	0xf0b0, 0xf0b1, 0xf0b2, 0xf0b3, 0xf0b4, 0xf0b5, 0xf0b6, 0xf0b7,
253 	0xf0b8, 0xf0b9, 0xf0ba, 0xf0bb, 0xf0bc, 0xf0bd, 0xf0be, 0xf0bf,
254 	0xf0c0, 0xf0c1, 0xf0c2, 0xf0c3, 0xf0c4, 0xf0c5, 0xf0c6, 0xf0c7,
255 	0xf0c8, 0xf0c9, 0xf0ca, 0xf0cb, 0xf0cc, 0xf0cd, 0xf0ce, 0xf0cf,
256 	0xf0d0, 0xf0d1, 0xf0d2, 0xf0d3, 0xf0d4, 0xf0d5, 0xf0d6, 0xf0d7,
257 	0xf0d8, 0xf0d9, 0xf0da, 0xf0db, 0xf0dc, 0xf0dd, 0xf0de, 0xf0df,
258 	0xf0e0, 0xf0e1, 0xf0e2, 0xf0e3, 0xf0e4, 0xf0e5, 0xf0e6, 0xf0e7,
259 	0xf0e8, 0xf0e9, 0xf0ea, 0xf0eb, 0xf0ec, 0xf0ed, 0xf0ee, 0xf0ef,
260 	0xf0f0, 0xf0f1, 0xf0f2, 0xf0f3, 0xf0f4, 0xf0f5, 0xf0f6, 0xf0f7,
261 	0xf0f8, 0xf0f9, 0xf0fa, 0xf0fb, 0xf0fc, 0xf0fd, 0xf0fe, 0xf0ff
262     }
263 };
264 static u16 *UC_translate = NULL;
265 
266 static struct UC_charset UCInfo[MAXCHARSETS];
267 
268 /*
269  * The standard kernel character-to-font mappings are not invertible
270  * -- this is just a best effort.
271  */
272 #define MAX_GLYPH 512		/* Max possible glyph value */
273 
274 static unsigned char *inv_translate = NULL;
275 static unsigned char inv_norm_transl[MAX_GLYPH];
276 static unsigned char *inverse_translations[4] =
277 {NULL, NULL, NULL, NULL};
278 
279 static void set_inverse_transl(int i);
280 static u16 *set_translate(int m);
281 static int UC_valid_UC_charset(int UC_charset_hndl);
282 static void UC_con_set_trans(int UC_charset_in_hndl, int Gn, int update_flag);
283 static int con_insert_unipair(unsigned unicode, unsigned fontpos, int fordefault);
284 static int con_insert_unipair_str(unsigned unicode, const char *replace_str, int fordefault);
285 static void con_clear_unimap(int fordefault);
286 static void con_clear_unimap_str(int fordefault);
287 static void con_set_default_unimap(void);
288 static int UC_con_set_unimap(int UC_charset_out_hndl, int update_flag);
289 static int UC_con_set_unimap_str(unsigned ct, struct unipair_str *list, int fordefault);
290 static int conv_uni_to_pc(long ucs, int usedefault);
291 static int conv_uni_to_str(char *outbuf, int buflen, UCode_t ucs, int usedefault);
292 static void UCconsole_map_init(void);
293 static int UC_MapGN(int UChndl, int update_flag);
294 static int UC_FindGN_byMIME(const char *UC_MIMEcharset);
295 static void UCreset_allocated_LYCharSets(void);
296 static STRING2PTR UC_setup_LYCharSets_repl(int UC_charset_in_hndl, unsigned lowest8);
297 static int UC_Register_with_LYCharSets(int s,
298 				       const char *UC_MIMEcharset,
299 				       const char *UC_LYNXcharset,
300 				       int lowest_eightbit);
301 
302 #ifdef LY_FIND_LEAKS
303 static void UCfree_allocated_LYCharSets(void);
304 static void UCcleanup_mem(void);
305 #endif
306 
307 static int default_UChndl = -1;
308 
set_inverse_transl(int i)309 static void set_inverse_transl(int i)
310 {
311     int j, glyph;
312     u16 *p = translations[i];
313     unsigned char *q = inverse_translations[i];
314 
315     if (!q) {
316 	/*
317 	 * Slightly messy to avoid calling kmalloc too early.
318 	 */
319 	q = inverse_translations[i] = ((i == LAT1_MAP) ?
320 				       inv_norm_transl :
321 				       typeMallocn(unsigned char, MAX_GLYPH));
322 
323 	if (!q)
324 	    return;
325     }
326     for (j = 0; j < MAX_GLYPH; j++)
327 	q[j] = 0;
328 
329     for (j = 0; j < E_TABSZ; j++) {
330 	glyph = conv_uni_to_pc((long) p[j], 0);
331 	if (glyph >= 0 && glyph < MAX_GLYPH && q[glyph] < 32) {
332 	    /*
333 	     * Prefer '-' above SHY etc.
334 	     */
335 	    q[glyph] = UCH(j);
336 	}
337     }
338 }
339 
set_translate(int m)340 static u16 *set_translate(int m)
341 {
342     if (!inverse_translations[m])
343 	set_inverse_transl(m);
344     inv_translate = inverse_translations[m];
345     return translations[m];
346 }
347 
UC_valid_UC_charset(int UC_charset_hndl)348 static int UC_valid_UC_charset(int UC_charset_hndl)
349 {
350     return (UC_charset_hndl >= 0 && UC_charset_hndl < UCNumCharsets);
351 }
352 
UC_con_set_trans(int UC_charset_in_hndl,int Gn,int update_flag)353 static void UC_con_set_trans(int UC_charset_in_hndl,
354 			     int Gn,
355 			     int update_flag)
356 {
357     int i, j;
358     const u16 *p;
359     u16 *ptrans;
360 
361     if (!UC_valid_UC_charset(UC_charset_in_hndl)) {
362 	CTRACE((tfp, "UC_con_set_trans: Invalid charset handle %d.\n",
363 		UC_charset_in_hndl));
364 	return;
365     }
366     ptrans = translations[Gn];
367     p = UCInfo[UC_charset_in_hndl].unitable;
368 #if(0)
369     if (p == UC_current_unitable) {	/* test whether pointers are equal */
370 	return;			/* nothing to be done */
371     }
372     /*
373      * The font is always 256 characters - so far.
374      */
375     con_clear_unimap();
376 #endif
377     for (i = 0; i < 256; i++) {
378 	if ((j = UCInfo[UC_charset_in_hndl].unicount[i])) {
379 	    ptrans[i] = *p;
380 	    for (; j; j--) {
381 		p++;
382 	    }
383 	} else {
384 	    ptrans[i] = 0xfffd;
385 	}
386     }
387     if (update_flag) {
388 	set_inverse_transl(Gn);	/* Update inverse translation for this one */
389     }
390 }
391 
392 /*
393  * Unicode -> current font conversion
394  *
395  * A font has at most 512 chars, usually 256.
396  * But one font position may represent several Unicode chars.
397  * A hashtable is somewhat of a pain to deal with, so use a
398  * "paged table" instead.  Simulation has shown the memory cost of
399  * this 3-level paged table scheme to be comparable to a hash table.
400  */
401 static int hashtable_contents_valid = 0;	/* Use ASCII-only mode for bootup */
402 static int hashtable_str_contents_valid = 0;
403 
404 static u16 **uni_pagedir[32] =
405 {
406     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
407     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
408     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
409     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
410 };
411 
412 static char ***uni_pagedir_str[32] =
413 {
414     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
415     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
416     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
417     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
418 };
419 
420 static const u16 *UC_current_unitable = NULL;
421 static struct unimapdesc_str *UC_current_unitable_str = NULL;
422 
423 /*
424  * Keep a second set of structures for the translation designated
425  * as "default" - kw
426  */
427 static int unidefault_contents_valid = 0;	/* Use ASCII-only mode for bootup */
428 static int unidefault_str_contents_valid = 0;
429 
430 static u16 **unidefault_pagedir[32] =
431 {
432     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
433     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
434     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
435     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
436 };
437 static char ***unidefault_pagedir_str[32] =
438 {
439     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
440     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
441     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
442     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
443 };
444 
445 static const u16 *UC_default_unitable = 0;
446 static const struct unimapdesc_str *UC_default_unitable_str = 0;
447 
con_insert_unipair(unsigned unicode,unsigned fontpos,int fordefault)448 static int con_insert_unipair(unsigned unicode, unsigned fontpos, int fordefault)
449 {
450     int i;
451     unsigned n;
452     u16 **p1, *p2;
453 
454     if (fordefault)
455 	p1 = unidefault_pagedir[n = unicode >> 11];
456     else
457 	p1 = uni_pagedir[n = unicode >> 11];
458     if (!p1) {
459 	p1 = (u16 * *)malloc(32 * sizeof(u16 *));
460 	if (fordefault)
461 	    unidefault_pagedir[n] = p1;
462 	else
463 	    uni_pagedir[n] = p1;
464 	if (!p1)
465 	    return ucError;
466 
467 	for (i = 0; i < 32; i++) {
468 	    p1[i] = NULL;
469 	}
470     }
471 
472     if (!(p2 = p1[n = (unicode >> 6) & 0x1f])) {
473 	p2 = p1[n] = (u16 *) malloc(64 * sizeof(u16));
474 	if (!p2)
475 	    return ucError;
476 
477 	for (i = 0; i < 64; i++) {
478 	    p2[i] = 0xffff;	/* No glyph for this character (yet) */
479 	}
480     }
481 
482     p2[unicode & 0x3f] = (u16) fontpos;
483 
484     return 0;
485 }
486 
con_insert_unipair_str(unsigned unicode,const char * replace_str,int fordefault)487 static int con_insert_unipair_str(unsigned unicode, const char *replace_str,
488 				  int fordefault)
489 {
490     int i;
491     unsigned n;
492     char ***p1;
493     const char **p2;
494 
495     if (fordefault)
496 	p1 = unidefault_pagedir_str[n = unicode >> 11];
497     else
498 	p1 = uni_pagedir_str[n = unicode >> 11];
499     if (!p1) {
500 	p1 = (char ***) malloc(32 * sizeof(char **));
501 
502 	if (fordefault)
503 	    unidefault_pagedir_str[n] = p1;
504 	else
505 	    uni_pagedir_str[n] = p1;
506 	if (!p1)
507 	    return ucError;
508 
509 	for (i = 0; i < 32; i++) {
510 	    p1[i] = NULL;
511 	}
512     }
513 
514     n = ((unicode >> 6) & 0x1f);
515     if (!p1[n]) {
516 	p1[n] = (char **) malloc(64 * sizeof(char *));
517 
518 	if (!p1[n])
519 	    return ucError;
520 
521 	p2 = (const char **) p1[n];
522 	for (i = 0; i < 64; i++) {
523 	    p2[i] = NULL;	/* No replace string this character (yet) */
524 	}
525     }
526     p2 = (const char **) p1[n];
527 
528     p2[unicode & 0x3f] = replace_str;
529 
530     return 0;
531 }
532 
533 /*
534  * ui arg was a leftover, deleted.  - KW
535  */
con_clear_unimap(int fordefault)536 static void con_clear_unimap(int fordefault)
537 {
538     int i, j;
539     u16 **p1;
540 
541     if (fordefault) {
542 	for (i = 0; i < 32; i++) {
543 	    if ((p1 = unidefault_pagedir[i]) != NULL) {
544 		for (j = 0; j < 32; j++) {
545 		    FREE(p1[j]);
546 		}
547 		FREE(p1);
548 	    }
549 	    unidefault_pagedir[i] = NULL;
550 	}
551 
552 	unidefault_contents_valid = 1;
553     } else {
554 	for (i = 0; i < 32; i++) {
555 	    if ((p1 = uni_pagedir[i]) != NULL) {
556 		for (j = 0; j < 32; j++) {
557 		    FREE(p1[j]);
558 		}
559 		FREE(p1);
560 	    }
561 	    uni_pagedir[i] = NULL;
562 	}
563 
564 	hashtable_contents_valid = 1;
565     }
566 }
567 
con_clear_unimap_str(int fordefault)568 static void con_clear_unimap_str(int fordefault)
569 {
570     int i, j;
571     char ***p1;
572 
573     if (fordefault) {
574 	for (i = 0; i < 32; i++) {
575 	    if ((p1 = unidefault_pagedir_str[i]) != NULL) {
576 		for (j = 0; j < 32; j++) {
577 		    FREE(p1[j]);
578 		}
579 		FREE(p1);
580 	    }
581 	    unidefault_pagedir_str[i] = NULL;
582 	}
583 
584 	unidefault_str_contents_valid = 1;	/* ??? probably no use... */
585     } else {
586 	for (i = 0; i < 32; i++) {
587 	    if ((p1 = uni_pagedir_str[i]) != NULL) {
588 		for (j = 0; j < 32; j++) {
589 		    FREE(p1[j]);
590 		}
591 		FREE(p1);
592 	    }
593 	    uni_pagedir_str[i] = NULL;
594 	}
595 
596 	hashtable_str_contents_valid = 1;	/* ??? probably no use... */
597     }
598 }
599 
600 /*
601  * Loads the unimap for the hardware font, as defined in uni_hash.tbl.
602  * The representation used was the most compact I could come up
603  * with.  This routine is executed at sys_setup time, and when the
604  * PIO_FONTRESET ioctl is called.
605  */
con_set_default_unimap(void)606 static void con_set_default_unimap(void)
607 {
608     int i, j;
609     const u16 *p;
610 
611     /*
612      * The default font is always 256 characters.
613      */
614     con_clear_unimap(1);
615 
616     p = dfont_unitable;
617     for (i = 0; i < 256; i++) {
618 	for (j = dfont_unicount[i]; j; j--) {
619 	    con_insert_unipair(*(p++), (u16) i, 1);
620 	}
621     }
622 
623     UC_default_unitable = dfont_unitable;
624 
625     con_clear_unimap_str(1);
626     UC_con_set_unimap_str(dfont_replacedesc.entry_ct, repl_map, 1);
627     UC_default_unitable_str = &dfont_replacedesc;
628 }
629 
630 int UCNumCharsets = 0;
631 
632 int UCLYhndl_HTFile_for_unspec = -1;
633 int UCLYhndl_HTFile_for_unrec = -1;
634 int UCLYhndl_for_unspec = -1;
635 int UCLYhndl_for_unrec = -1;
636 
637 /* easy to type, will initialize later */
638 int LATIN1 = -1;		/* UCGetLYhndl_byMIME("iso-8859-1") */
639 int US_ASCII = -1;		/* UCGetLYhndl_byMIME("us-ascii")   */
640 int UTF8_handle = -1;		/* UCGetLYhndl_byMIME("utf-8")      */
641 int TRANSPARENT = -1;		/* UCGetLYhndl_byMIME("x-transparent")  */
642 
UC_con_set_unimap(int UC_charset_out_hndl,int update_flag)643 static int UC_con_set_unimap(int UC_charset_out_hndl,
644 			     int update_flag)
645 {
646     int i, j;
647     const u16 *p;
648 
649     if (!UC_valid_UC_charset(UC_charset_out_hndl)) {
650 	CTRACE((tfp, "UC_con_set_unimap: Invalid charset handle %d.\n",
651 		UC_charset_out_hndl));
652 	return ucError;
653     }
654 
655     p = UCInfo[UC_charset_out_hndl].unitable;
656     if (p == UC_current_unitable) {	/* test whether pointers are equal */
657 	return update_flag;	/* nothing to be done */
658     }
659     UC_current_unitable = p;
660 
661     /*
662      * The font is always 256 characters - so far.
663      */
664     con_clear_unimap(0);
665 
666     for (i = 0; i < 256; i++) {
667 	for (j = UCInfo[UC_charset_out_hndl].unicount[i]; j; j--) {
668 	    con_insert_unipair(*(p++), (u16) i, 0);
669 	}
670     }
671 
672     if (update_flag) {
673 	for (i = 0; i <= 3; i++) {
674 	    set_inverse_transl(i);	/* Update all inverse translations */
675 	}
676     }
677 
678     return 0;
679 }
680 
UC_con_set_unimap_str(unsigned ct,struct unipair_str * list,int fordefault)681 static int UC_con_set_unimap_str(unsigned ct, struct unipair_str *list,
682 				 int fordefault)
683 {
684     int err = 0, err1;
685 
686     while (ct--) {
687 	if ((err1 = con_insert_unipair_str(list->unicode,
688 					   list->replace_str,
689 					   fordefault)) != 0) {
690 	    err = err1;
691 	}
692 	list++;
693     }
694 
695     /*
696      * No inverse translations for replacement strings!
697      */
698     if (!err) {
699 	if (fordefault)
700 	    unidefault_str_contents_valid = 1;
701 	else
702 	    hashtable_str_contents_valid = 1;
703     }
704 
705     return err;
706 }
707 
conv_uni_to_pc(long ucs,int usedefault)708 static int conv_uni_to_pc(long ucs,
709 			  int usedefault)
710 {
711     int h;
712     u16 **p1, *p2;
713 
714     /*
715      * Only 16-bit codes supported at this time.
716      */
717     if (ucs > 0xffff) {
718 	/*
719 	 * U+FFFD:  REPLACEMENT CHARACTER.
720 	 */
721 	ucs = 0xfffd;
722     } else if (ucs < 0x20 || ucs >= 0xfffe) {
723 	/*
724 	 * Not a printable character.
725 	 */
726 	return ucError;
727     } else if (ucs == 0xfeff || (ucs >= 0x200b && ucs <= 0x200f)) {
728 	/*
729 	 * Zero-width space.
730 	 */
731 	return ucZeroWidth;
732     } else if ((ucs & ~UNI_DIRECT_MASK) == UNI_DIRECT_BASE) {
733 	/*
734 	 * UNI_DIRECT_BASE indicates the start of the region in the
735 	 * User Zone which always has a 1:1 mapping to the currently
736 	 * loaded font.  The UNI_DIRECT_MASK indicates the bit span
737 	 * of the region.
738 	 */
739 	return (ucs & UNI_DIRECT_MASK);
740     }
741 
742     if (usedefault) {
743 	if (!unidefault_contents_valid)
744 	    return ucInvalidHash;
745 	p1 = unidefault_pagedir[ucs >> 11];
746     } else {
747 	if (!hashtable_contents_valid)
748 	    return ucInvalidHash;
749 	p1 = uni_pagedir[ucs >> 11];
750     }
751 
752     if (p1 &&
753 	(p2 = p1[(ucs >> 6) & 0x1f]) &&
754 	(h = p2[ucs & 0x3f]) < MAX_GLYPH) {
755 	return h;
756     }
757 
758     /*
759      * Not found.
760      */
761     return ucNotFound;
762 }
763 
764 /*
765  * Note:  contents of outbuf is not changes for negative return value!
766  */
conv_uni_to_str(char * outbuf,int buflen,UCode_t ucs,int usedefault)767 static int conv_uni_to_str(char *outbuf,
768 			   int buflen,
769 			   UCode_t ucs,
770 			   int usedefault)
771 {
772     char *h;
773     char ***p1, **p2;
774 
775     /*
776      * Only 16-bit codes supported at this time.
777      */
778     if (ucs > 0xffff) {
779 	/*
780 	 * U+FFFD:  REPLACEMENT CHARACTER.
781 	 */
782 	ucs = 0xfffd;
783 	/*
784 	 * Maybe the following two cases should be allowed here??  - KW
785 	 */
786     } else if (ucs < 0x20 || ucs >= 0xfffe) {
787 	/*
788 	 * Not a printable character.
789 	 */
790 	return ucError;
791     } else if (ucs == 0xfeff || (ucs >= 0x200b && ucs <= 0x200f)) {
792 	/*
793 	 * Zero-width space.
794 	 */
795 	return ucZeroWidth;
796     }
797 
798     if (usedefault) {
799 	if (!unidefault_str_contents_valid)
800 	    return ucInvalidHash;
801 	p1 = unidefault_pagedir_str[ucs >> 11];
802     } else {
803 	if (!hashtable_str_contents_valid)
804 	    return ucInvalidHash;
805 	p1 = uni_pagedir_str[ucs >> 11];
806     }
807 
808     if (p1 &&
809 	(p2 = p1[(ucs >> 6) & 0x1f]) &&
810 	(h = p2[ucs & 0x3f])) {
811 	StrNCpy(outbuf, h, (buflen - 1));
812 	return 1;		/* ok ! */
813     }
814 
815     /*
816      * Not found.
817      */
818     return ucNotFound;
819 }
820 
821 int UCInitialized = 0;
822 
823 /*
824  * [ original comment:  - KW ]
825  * This is called at sys_setup time, after memory and the console are
826  * initialized.  It must be possible to call kmalloc(..., GFP_KERNEL)
827  * from this function, hence the call from sys_setup.
828  */
UCconsole_map_init(void)829 static void UCconsole_map_init(void)
830 {
831     con_set_default_unimap();
832     UCInitialized = 1;
833 }
834 
835 /*
836  * OK now, finally, some stuff that is more specifically for Lynx:  - KW
837  */
/*
 * Translate a Unicode value to a single character (glyph position) of the
 * display charset charset_out.
 *
 * If the charset has no chartrans table and a negative codepage, values
 * below 128 are returned unchanged and anything else yields that negative
 * codepage value.  If it has no table but a non-negative codepage, the
 * default table is used (ucCannotOutput if there is none).
 *
 * Otherwise the charset's own table is tried first (unless it is marked
 * as default), then the default table if the charset is default or asks
 * for it with trydefault.  When the character is not found, U+FFFD is
 * looked up in the same tables instead.
 *
 * Returns the translated value (>= 0) or a negative uc* code.
 */
int UCTransUniChar(UCode_t unicode,
		   int charset_out)
{
    int UChndl_out = LYCharSet_UC[charset_out].UChndl;
    int isdefault;
    int trydefault = 0;
    int usedefault;
    int rc = 0;

    if (UChndl_out >= 0) {
	isdefault = UCInfo[UChndl_out].replacedesc.isdefault;
	trydefault = UCInfo[UChndl_out].replacedesc.trydefault;
    } else {
	int codepage = LYCharSet_UC[charset_out].codepage;

	if (codepage < 0) {
	    return (unicode < 128) ? (int) unicode : codepage;
	}
	UChndl_out = default_UChndl;
	if (UChndl_out < 0) {
	    return ucCannotOutput;
	}
	isdefault = 1;
    }
    usedefault = (isdefault || trydefault);

    if (!isdefault) {
	/*
	 * Switch the current table over to this charset if needed.
	 */
	if (UCInfo[UChndl_out].unitable != UC_current_unitable) {
	    rc = UC_con_set_unimap(UChndl_out, 1);
	    if (rc < 0) {
		return rc;
	    }
	}
	rc = conv_uni_to_pc(unicode, 0);
	if (rc >= 0) {
	    return rc;
	}
    }
    if (usedefault) {
	rc = conv_uni_to_pc(unicode, 1);
	if (rc >= 0) {
	    return rc;
	}
    }

    /*
     * Character not found:  try U+FFFD (REPLACEMENT CHARACTER) instead.
     */
    if (!isdefault && (rc == ucNotFound)) {
	rc = conv_uni_to_pc(0xfffdL, 0);
    }
    if (usedefault && (rc == ucNotFound)) {
	rc = conv_uni_to_pc(0xfffdL, 1);
    }
    return rc;
}
891 
892 /*
893  * Returns string length, or negative value for error.
894  */
UCTransUniCharStr(char * outbuf,int buflen,UCode_t unicode,int charset_out,int chk_single_flag)895 int UCTransUniCharStr(char *outbuf,
896 		      int buflen,
897 		      UCode_t unicode,
898 		      int charset_out,
899 		      int chk_single_flag)
900 {
901     int rc = ucUnknown, src = 0;
902     int UChndl_out;
903     int isdefault, trydefault = 0;
904     struct unimapdesc_str *repl;
905     const u16 *ut;
906 
907     if (buflen < 2)
908 	return ucBufferTooSmall;
909 
910     if ((UChndl_out = LYCharSet_UC[charset_out].UChndl) < 0) {
911 	if (LYCharSet_UC[charset_out].codepage < 0)
912 	    return LYCharSet_UC[charset_out].codepage;
913 	if ((UChndl_out = default_UChndl) < 0)
914 	    return ucCannotOutput;
915 	isdefault = 1;
916     } else {
917 	isdefault = UCInfo[UChndl_out].replacedesc.isdefault;
918 	trydefault = UCInfo[UChndl_out].replacedesc.trydefault;
919     }
920 
921     if (chk_single_flag) {
922 	if (!isdefault) {
923 	    ut = UCInfo[UChndl_out].unitable;
924 	    if (ut != UC_current_unitable) {
925 		src = UC_con_set_unimap(UChndl_out, 1);
926 		if (src < 0) {
927 		    return src;
928 		}
929 	    }
930 	}
931 	src = conv_uni_to_pc(unicode, isdefault);
932 	if (src >= 32) {
933 	    outbuf[0] = (char) src;
934 	    outbuf[1] = '\0';
935 	    return 1;
936 	}
937     }
938 
939     repl = &(UCInfo[UChndl_out].replacedesc);
940     if (!isdefault) {
941 	if (repl != UC_current_unitable_str) {
942 	    con_clear_unimap_str(0);
943 	    (void) UC_con_set_unimap_str(repl->entry_ct, repl->entries, 0);
944 	    UC_current_unitable_str = repl;
945 	}
946 	rc = conv_uni_to_str(outbuf, buflen, unicode, 0);
947 	if (rc >= 0)
948 	    return (int) strlen(outbuf);
949     }
950     if (trydefault && chk_single_flag) {
951 	src = conv_uni_to_pc(unicode, 1);
952 	if (src >= 32) {
953 	    outbuf[0] = (char) src;
954 	    outbuf[1] = '\0';
955 	    return 1;
956 	}
957     }
958     if (isdefault || trydefault) {
959 #ifdef EXP_JAPANESEUTF8_SUPPORT
960 	if (LYCharSet_UC[charset_out].codepage == 0 &&
961 	    LYCharSet_UC[charset_out].codepoints == 0) {
962 	    iconv_t cd;
963 	    char str[3], *pin, *pout;
964 	    size_t inleft, outleft;
965 	    char *tocode = NULL;
966 
967 	    str[0] = (char) (unicode >> 8);
968 	    str[1] = (char) (unicode & 0xFF);
969 	    str[2] = 0;
970 	    pin = str;
971 	    inleft = 2;
972 	    pout = outbuf;
973 	    outleft = (size_t) buflen;
974 	    /*
975 	     * Try TRANSLIT first, since it is an extension which can provide
976 	     * translations when there is no available exact translation to
977 	     * the target character set.
978 	     */
979 	    HTSprintf0(&tocode, "%s//TRANSLIT", LYCharSet_UC[charset_out].MIMEname);
980 	    cd = iconv_open(tocode, "UTF-16BE");
981 	    if (cd == (iconv_t) -1) {
982 		/*
983 		 * Try again, without TRANSLIT
984 		 */
985 		HTSprintf0(&tocode, "%s", LYCharSet_UC[charset_out].MIMEname);
986 		cd = iconv_open(tocode, "UTF-16BE");
987 
988 		if (cd == (iconv_t) -1) {
989 		    CTRACE((tfp,
990 			    "Warning: Cannot transcode form charset %s to %s!\n",
991 			    "UTF-16BE", tocode));
992 		}
993 	    }
994 	    FREE(tocode);
995 
996 	    if (cd != (iconv_t) -1) {
997 		rc = (int) iconv(cd, (ICONV_CONST char **) &pin, &inleft,
998 				 &pout, &outleft);
999 		iconv_close(cd);
1000 		if ((pout - outbuf) == 3) {
1001 		    CTRACE((tfp,
1002 			    "It seems to be a JIS X 0201 code(%" PRI_UCode_t
1003 			    "). Not supported.\n", unicode));
1004 		    pin = str;
1005 		    inleft = 2;
1006 		    pout = outbuf;
1007 		    outleft = (size_t) buflen;
1008 		} else if (rc >= 0) {
1009 		    *pout = '\0';
1010 		    return (int) strlen(outbuf);
1011 		}
1012 	    }
1013 	}
1014 #endif
1015 	rc = conv_uni_to_str(outbuf, buflen, unicode, 1);
1016 	if (rc >= 0)
1017 	    return (int) strlen(outbuf);
1018     }
1019     if (rc == ucNotFound) {
1020 	if (!isdefault)
1021 	    rc = conv_uni_to_str(outbuf, buflen, 0xfffdL, 0);
1022 	if ((rc == ucNotFound) && (isdefault || trydefault))
1023 	    rc = conv_uni_to_str(outbuf, buflen, 0xfffdL, 1);
1024 	if (rc >= 0)
1025 	    return (int) strlen(outbuf);
1026     }
1027     if (chk_single_flag && src == ucNotFound) {
1028 	if (!isdefault)
1029 	    rc = conv_uni_to_pc(0xfffdL, 0);
1030 	if ((rc == ucNotFound) && (isdefault || trydefault))
1031 	    rc = conv_uni_to_pc(0xfffdL, 1);
1032 	if (rc >= 32) {
1033 	    outbuf[0] = (char) rc;
1034 	    outbuf[1] = '\0';
1035 	    return 1;
1036 	}
1037 	return rc;
1038     }
1039     return ucNotFound;
1040 }
1041 
1042 static int UC_lastautoGN = 0;
1043 
UC_MapGN(int UChndl,int update_flag)1044 static int UC_MapGN(int UChndl,
1045 		    int update_flag)
1046 {
1047     int i, Gn, found, lasthndl;
1048 
1049     found = 0;
1050     Gn = -1;
1051     for (i = 0; i < 4 && Gn < 0; i++) {
1052 	if (UC_GNhandles[i] < 0) {
1053 	    Gn = i;
1054 	} else if (UC_GNhandles[i] == UChndl) {
1055 	    Gn = i;
1056 	    found = 1;
1057 	}
1058     }
1059     if (found)
1060 	return Gn;
1061     if (Gn >= 0) {
1062 	UCInfo[UChndl].GN = Gn;
1063 	UC_GNhandles[Gn] = UChndl;
1064     } else {
1065 	if (UC_lastautoGN == GRAF_MAP) {
1066 	    Gn = IBMPC_MAP;
1067 	} else {
1068 	    Gn = GRAF_MAP;
1069 	}
1070 	UC_lastautoGN = Gn;
1071 	lasthndl = UC_GNhandles[Gn];
1072 	UCInfo[lasthndl].GN = -1;
1073 	UCInfo[UChndl].GN = Gn;
1074 	UC_GNhandles[Gn] = UChndl;
1075     }
1076     CTRACE((tfp, "UC_MapGN: Using %d <- %d (%s)\n",
1077 	    Gn, UChndl, UCInfo[UChndl].MIMEname));
1078     UC_con_set_trans(UChndl, Gn, update_flag);
1079     return Gn;
1080 }
1081 
UCTransChar(int ch_in,int charset_in,int charset_out)1082 int UCTransChar(int ch_in,
1083 		int charset_in,
1084 		int charset_out)
1085 {
1086     UCode_t unicode;
1087     int Gn;
1088     int rc = ucNotFound;
1089     int UChndl_in, UChndl_out;
1090     int isdefault, trydefault = 0;
1091     const u16 *ut;
1092     int upd = 0;
1093 
1094     if (charset_in == charset_out)
1095 	return UCH(ch_in);
1096     if (charset_in < 0)
1097 	return ucCannotConvert;
1098     if ((UChndl_in = LYCharSet_UC[charset_in].UChndl) < 0)
1099 	return ucCannotConvert;
1100     if ((UChndl_out = LYCharSet_UC[charset_out].UChndl) < 0) {
1101 	if (LYCharSet_UC[charset_out].codepage < 0)
1102 	    return LYCharSet_UC[charset_out].codepage;
1103 	if ((UChndl_out = default_UChndl) < 0)
1104 	    return ucCannotOutput;
1105 	isdefault = 1;
1106     } else {
1107 	isdefault = UCInfo[UChndl_out].replacedesc.isdefault;
1108 	trydefault = UCInfo[UChndl_out].replacedesc.trydefault;
1109     }
1110     if (!UCInfo[UChndl_in].num_uni)
1111 	return ucCannotConvert;
1112     if ((Gn = UCInfo[UChndl_in].GN) < 0) {
1113 	Gn = UC_MapGN(UChndl_in, 0);
1114 	upd = 1;
1115     }
1116 
1117     ut = UCInfo[UChndl_out].unitable;
1118     if (!isdefault) {
1119 	if (ut == UC_current_unitable) {
1120 	    if (upd) {
1121 		set_inverse_transl(Gn);
1122 	    }
1123 	} else {
1124 	    rc = UC_con_set_unimap(UChndl_out, 1);
1125 	    if (rc > 0) {
1126 		set_inverse_transl(Gn);
1127 	    } else if (rc < 0) {
1128 		return rc;
1129 	    }
1130 	}
1131     }
1132     UC_translate = set_translate(Gn);
1133     unicode = UC_translate[UCH(ch_in)];
1134     if (!isdefault) {
1135 	rc = conv_uni_to_pc(unicode, 0);
1136 	if (rc >= 0)
1137 	    return rc;
1138     }
1139     if ((rc == ucNotFound) && (isdefault || trydefault)) {
1140 	rc = conv_uni_to_pc(unicode, 1);
1141     }
1142     if ((rc == ucNotFound) && !isdefault) {
1143 	rc = conv_uni_to_pc(0xfffdL, 0);
1144     }
1145     if ((rc == ucNotFound) && (isdefault || trydefault)) {
1146 	rc = conv_uni_to_pc(0xfffdL, 1);
1147     }
1148     return rc;
1149 }
1150 
#ifdef EXP_JAPANESEUTF8_SUPPORT
/*
 * Convert the first two bytes of inbuf from the charset charset_in (a Lynx
 * charset handle, whose MIME name is handed to iconv) to one UTF-16BE code
 * unit.
 *
 * buflen is used as the output space offered to iconv; the conversion is
 * only reported as successful when both input bytes and all of that output
 * space were consumed.
 *
 * Returns the resulting 16-bit value, or ucCannotConvert on failure.
 */
UCode_t UCTransJPToUni(char *inbuf,
		       int buflen,
		       int charset_in)
{
    char outbuf[3], *pin, *pout;
    size_t ilen, olen;
    iconv_t cd;

    pin = inbuf;
    pout = outbuf;
    ilen = 2;
    olen = (size_t) buflen;
    /*
     * Never let iconv write beyond the local buffer, whatever the caller
     * passed as length.
     */
    if (olen > sizeof(outbuf))
	olen = sizeof(outbuf);

    cd = iconv_open("UTF-16BE", LYCharSet_UC[charset_in].MIMEname);
    if (cd == (iconv_t) -1) {
	/* no such conversion; the descriptor must not be used or closed */
	return ucCannotConvert;
    }
    (void) iconv(cd, (ICONV_CONST char **) &pin, &ilen, &pout, &olen);
    iconv_close(cd);
    if ((ilen == 0) && (olen == 0)) {
	return (((unsigned char) outbuf[0]) << 8) + (unsigned char) outbuf[1];
    }
    return ucCannotConvert;
}
#endif
1174 
1175 /*
1176  * Translate a character to Unicode.  If additional bytes are needed, this
1177  * returns ucNeedMore, based on its internal state.  To reset the state,
1178  * call this with charset_in < 0.
1179  */
UCTransToUni(int ch_in,int charset_in)1180 UCode_t UCTransToUni(int ch_in,
1181 		     int charset_in)
1182 {
1183     static char buffer[10];
1184     static unsigned inx = 0;
1185 
1186     UCode_t unicode;
1187     int Gn;
1188     unsigned char ch_iu = UCH(ch_in);
1189     int UChndl_in;
1190 
1191     /*
1192      * Reset saved-state.
1193      */
1194     if (charset_in < 0) {
1195 	inx = 0;
1196 	return ucCannotConvert;
1197     } else if (charset_in == LATIN1) {
1198 	return ch_iu;
1199     } else if (charset_in == UTF8_handle) {
1200 	if (is8bits(ch_iu)) {
1201 	    unsigned need;
1202 	    char *ptr;
1203 
1204 	    buffer[inx++] = (char) ch_iu;
1205 	    buffer[inx] = '\0';
1206 	    need = (unsigned) utf8_length(TRUE, buffer);
1207 	    if (need && (need + 1) == inx) {
1208 		inx = 0;
1209 		ptr = buffer;
1210 		return UCGetUniFromUtf8String(&ptr);
1211 	    } else if (inx < sizeof(buffer) - 1) {
1212 		return ucNeedMore;
1213 	    } else {
1214 		inx = 0;
1215 	    }
1216 	} else {
1217 	    inx = 0;
1218 	}
1219     }
1220 #ifdef EXP_JAPANESEUTF8_SUPPORT
1221     if ((strcmp(LYCharSet_UC[charset_in].MIMEname, "shift_jis") == 0) ||
1222 	(strcmp(LYCharSet_UC[charset_in].MIMEname, "euc-jp") == 0)) {
1223 	char obuffer[3], *pin, *pout;
1224 	size_t ilen, olen;
1225 	iconv_t cd;
1226 
1227 	pin = buffer;
1228 	pout = obuffer;
1229 	ilen = olen = 2;
1230 	if (strcmp(LYCharSet_UC[charset_in].MIMEname, "shift_jis") == 0) {
1231 	    if (inx == 0) {
1232 		if (IS_SJIS_HI1(ch_iu) ||
1233 		    IS_SJIS_HI2(ch_iu)) {
1234 		    buffer[0] = (char) ch_in;
1235 		    inx = 1;
1236 		    return ucNeedMore;
1237 		}
1238 	    } else {
1239 		if (IS_SJIS_LO(ch_iu)) {
1240 		    buffer[1] = (char) ch_in;
1241 		    buffer[2] = 0;
1242 
1243 		    cd = iconv_open("UTF-16BE", "Shift_JIS");
1244 		    (void) iconv(cd, (ICONV_CONST char **) &pin, &ilen, &pout, &olen);
1245 		    iconv_close(cd);
1246 		    inx = 0;
1247 		    if ((ilen == 0) && (olen == 0)) {
1248 			return (UCH(obuffer[0]) << 8) + UCH(obuffer[1]);
1249 		    }
1250 		}
1251 	    }
1252 	}
1253 	if (strcmp(LYCharSet_UC[charset_in].MIMEname, "euc-jp") == 0) {
1254 	    if (inx == 0) {
1255 		if (IS_EUC_HI(ch_iu)) {
1256 		    buffer[0] = (char) ch_in;
1257 		    inx = 1;
1258 		    return ucNeedMore;
1259 		}
1260 	    } else {
1261 		if (IS_EUC_LOX(ch_iu)) {
1262 		    buffer[1] = (char) ch_in;
1263 		    buffer[2] = 0;
1264 
1265 		    cd = iconv_open("UTF-16BE", "EUC-JP");
1266 		    (void) iconv(cd, (ICONV_CONST char **) &pin, &ilen, &pout, &olen);
1267 		    iconv_close(cd);
1268 		    inx = 0;
1269 		    if ((ilen == 0) && (olen == 0)) {
1270 			return (UCH(obuffer[0]) << 8) + UCH(obuffer[1]);
1271 		    }
1272 		}
1273 	    }
1274 	}
1275 	inx = 0;
1276     }
1277 #endif
1278     if (ch_iu < 128 && ch_iu >= 32)
1279 	return ch_iu;
1280 
1281     if (ch_iu < 32 &&
1282 	LYCharSet_UC[charset_in].enc != UCT_ENC_8BIT_C0) {
1283 	/*
1284 	 * Don't translate C0 chars except for specific charsets.
1285 	 */
1286 	return ch_iu;
1287     } else if ((UChndl_in = LYCharSet_UC[charset_in].UChndl) < 0) {
1288 	return ucCannotConvert;
1289     } else if (!UCInfo[UChndl_in].num_uni) {
1290 	return ucCannotConvert;
1291     }
1292 
1293     if ((Gn = UCInfo[UChndl_in].GN) < 0) {
1294 	Gn = UC_MapGN(UChndl_in, 1);
1295     }
1296 
1297     UC_translate = set_translate(Gn);
1298     unicode = UC_translate[ch_iu];
1299 
1300     return unicode;
1301 }
1302 
UCReverseTransChar(int ch_out,int charset_in,int charset_out)1303 int UCReverseTransChar(int ch_out,
1304 		       int charset_in,
1305 		       int charset_out)
1306 {
1307     int Gn;
1308     int rc = ucError;
1309     int UChndl_in, UChndl_out;
1310     int isdefault;
1311     int i_ch = UCH(ch_out);
1312     const u16 *ut;
1313 
1314     if (charset_in == charset_out)
1315 	return UCH(ch_out);
1316     if (charset_in < 0)
1317 	return ucCannotConvert;
1318     if ((UChndl_in = LYCharSet_UC[charset_in].UChndl) < 0)
1319 	return ucCannotConvert;
1320     if (!UCInfo[UChndl_in].num_uni)
1321 	return ucCannotConvert;
1322     if (charset_out < 0)
1323 	return ucCannotOutput;
1324     if ((UChndl_out = LYCharSet_UC[charset_out].UChndl) < 0) {
1325 	if (LYCharSet_UC[charset_out].codepage < 0)
1326 	    return LYCharSet_UC[charset_out].codepage;
1327 	if ((UChndl_out = default_UChndl) < 0)
1328 	    return ucCannotOutput;
1329 	isdefault = 1;
1330     } else {
1331 	isdefault = UCInfo[UChndl_out].replacedesc.isdefault;
1332     }
1333 
1334     if (!isdefault) {
1335 	/*
1336 	 * Try to use the inverse table if charset_out is not equivalent
1337 	 * to using just the default table.  If it is, it should have
1338 	 * just ASCII chars and trying to back-translate those should
1339 	 * not give anything but themselves.  - kw
1340 	 */
1341 	ut = UCInfo[UChndl_out].unitable;
1342 	if (ut == UC_current_unitable) {
1343 	    if ((Gn = UCInfo[UChndl_in].GN) < 0) {
1344 		Gn = UC_MapGN(UChndl_in, 1);
1345 	    }
1346 	    UC_translate = set_translate(Gn);
1347 	    if (inv_translate)
1348 		rc = inv_translate[i_ch];
1349 	    if (rc >= 32) {
1350 		return rc;
1351 	    }
1352 	}
1353     }
1354     return UCTransChar(ch_out, charset_out, charset_in);
1355 }
1356 
1357 /*
1358  * Returns string length, or negative value for error.
1359  */
UCTransCharStr(char * outbuf,int buflen,int ch_in,int charset_in,int charset_out,int chk_single_flag)1360 int UCTransCharStr(char *outbuf,
1361 		   int buflen,
1362 		   int ch_in,
1363 		   int charset_in,
1364 		   int charset_out,
1365 		   int chk_single_flag)
1366 {
1367     UCode_t unicode;
1368     int Gn;
1369     int rc = ucUnknown, src = 0;
1370     int UChndl_in, UChndl_out;
1371     int isdefault, trydefault = 0;
1372     struct unimapdesc_str *repl;
1373     const u16 *ut;
1374     int upd = 0;
1375 
1376     if (buflen < 2)
1377 	return ucBufferTooSmall;
1378     if (chk_single_flag && charset_in == charset_out) {
1379 	outbuf[0] = (char) ch_in;
1380 	outbuf[1] = '\0';
1381 	return 1;
1382     }
1383     if (charset_in < 0)
1384 	return ucCannotConvert;
1385     if ((UChndl_in = LYCharSet_UC[charset_in].UChndl) < 0)
1386 	return ucCannotConvert;
1387     if (!UCInfo[UChndl_in].num_uni)
1388 	return ucCannotConvert;
1389     if ((UChndl_out = LYCharSet_UC[charset_out].UChndl) < 0) {
1390 	if (LYCharSet_UC[charset_out].codepage < 0)
1391 	    return LYCharSet_UC[charset_out].codepage;
1392 	if ((UChndl_out = default_UChndl) < 0)
1393 	    return ucCannotOutput;
1394 	isdefault = 1;
1395     } else {
1396 	isdefault = UCInfo[UChndl_out].replacedesc.isdefault;
1397 	trydefault = UCInfo[UChndl_out].replacedesc.trydefault;
1398     }
1399     if ((Gn = UCInfo[UChndl_in].GN) < 0) {
1400 	Gn = UC_MapGN(UChndl_in, !chk_single_flag);
1401 	upd = chk_single_flag;
1402     }
1403 
1404     UC_translate = set_translate(Gn);
1405     unicode = UC_translate[UCH(ch_in)];
1406 
1407     if (chk_single_flag) {
1408 	if (!isdefault) {
1409 	    ut = UCInfo[UChndl_out].unitable;
1410 	    if (ut == UC_current_unitable) {
1411 		if (upd)
1412 		    set_inverse_transl(Gn);
1413 	    } else {
1414 		src = UC_con_set_unimap(UChndl_out, 1);
1415 		if (src > 0) {
1416 		    set_inverse_transl(Gn);
1417 		} else if (src < 0) {
1418 		    return src;
1419 		}
1420 	    }
1421 	}
1422 	src = conv_uni_to_pc(unicode, isdefault);
1423 	if (src >= 32) {
1424 	    outbuf[0] = (char) src;
1425 	    outbuf[1] = '\0';
1426 	    return 1;
1427 	}
1428     }
1429 
1430     repl = &(UCInfo[UChndl_out].replacedesc);
1431     if (!isdefault) {
1432 	if (repl != UC_current_unitable_str) {
1433 	    con_clear_unimap_str(0);
1434 	    (void) UC_con_set_unimap_str(repl->entry_ct, repl->entries, 0);
1435 	    UC_current_unitable_str = repl;
1436 	}
1437 	rc = conv_uni_to_str(outbuf, buflen, unicode, 0);
1438 	if (rc >= 0)
1439 	    return (int) strlen(outbuf);
1440     }
1441     if (trydefault && chk_single_flag) {
1442 	src = conv_uni_to_pc(unicode, 1);
1443 	if (src >= 32) {
1444 	    outbuf[0] = (char) src;
1445 	    outbuf[1] = '\0';
1446 	    return 1;
1447 	}
1448     }
1449     if (isdefault || trydefault) {
1450 	rc = conv_uni_to_str(outbuf, buflen, unicode, 1);
1451 	if (rc >= 0)
1452 	    return (int) strlen(outbuf);
1453     }
1454     if (rc == ucNotFound) {
1455 	if (!isdefault)
1456 	    rc = conv_uni_to_str(outbuf, buflen, 0xfffdL, 0);
1457 	if ((rc == ucNotFound) && (isdefault || trydefault))
1458 	    rc = conv_uni_to_str(outbuf, buflen, 0xfffdL, 1);
1459 	if (rc >= 0)
1460 	    return (int) strlen(outbuf);
1461     }
1462     if (chk_single_flag && src == ucNotFound) {
1463 	if (!isdefault)
1464 	    rc = conv_uni_to_pc(0xfffdL, 0);
1465 	if ((rc == ucNotFound) && (isdefault || trydefault))
1466 	    rc = conv_uni_to_pc(0xfffdL, 1);
1467 	if (rc >= 32) {
1468 	    outbuf[0] = (char) rc;
1469 	    outbuf[1] = '\0';
1470 	    return 1;
1471 	} else if (rc <= 0) {
1472 	    outbuf[0] = '\0';
1473 	    return rc;
1474 	}
1475 	return rc;
1476     }
1477     return ucNotFound;
1478 }
1479 
UC_FindGN_byMIME(const char * UC_MIMEcharset)1480 static int UC_FindGN_byMIME(const char *UC_MIMEcharset)
1481 {
1482     int i;
1483 
1484     for (i = 0; i < 4; i++) {
1485 	if (!strcmp(UC_MIMEcharset, UC_GNsetMIMEnames[i])) {
1486 	    return i;
1487 	}
1488     }
1489     return ucError;
1490 }
1491 
UCGetRawUniMode_byLYhndl(int i)1492 int UCGetRawUniMode_byLYhndl(int i)
1493 {
1494     if (i < 0)
1495 	return 0;
1496     return LYCharSet_UC[i].enc;
1497 }
1498 
1499 /*
1500  * Construct a new charset name, given prefix and codepage.  This introduces
1501  * potentially unchecked recursion into UCGetLYhntl_byMIME if neither the "cp"
1502  * nor "windows-" prefixes are configured, so we check it here.
1503  */
getLYhndl_byCP(const char * prefix,const char * codepage)1504 static int getLYhndl_byCP(const char *prefix,
1505 			  const char *codepage)
1506 {
1507     static int nested;
1508     int result = ucError;
1509 
1510     if (!nested++) {
1511 	char *cptmp = NULL;
1512 
1513 	StrAllocCopy(cptmp, prefix);
1514 	StrAllocCat(cptmp, codepage);
1515 	result = UCGetLYhndl_byMIME(cptmp);
1516 	FREE(cptmp);
1517     }
1518     nested--;
1519     return result;
1520 }
1521 
1522 /*
1523  * Get Lynx internal charset handler from MIME name,
1524  * return -1 if we got NULL or did not recognize value.
1525  * According to RFC, MIME headers should match case-insensitively.
1526  */
UCGetLYhndl_byMIME(const char * value)1527 int UCGetLYhndl_byMIME(const char *value)
1528 {
1529     int i;
1530     int LYhndl = -1;
1531 
1532     if (!value || !(*value)) {
1533 	CTRACE((tfp,
1534 		"UCGetLYhndl_byMIME: NULL argument instead of MIME name.\n"));
1535 	return ucError;
1536     }
1537 
1538     for (i = 0;
1539 	 (i < MAXCHARSETS && i < LYNumCharsets &&
1540 	  LYchar_set_names[i]); i++) {
1541 	if (LYCharSet_UC[i].MIMEname &&
1542 	    !strcasecomp(value, LYCharSet_UC[i].MIMEname)) {
1543 	    return i;
1544 	}
1545     }
1546 
1547     /*
1548      * Not yet found, try synonyms.  - FM
1549      */
1550 #if !NO_CHARSET_utf_8
1551     if (!strcasecomp(value, "unicode-1-1-utf-8") ||
1552 	!strcasecomp(value, "utf8")) {
1553 	/*
1554 	 * Treat these as synonyms for the IANA registered name.  - FM
1555 	 */
1556 	return UCGetLYhndl_byMIME("utf-8");
1557     }
1558 #endif
1559     if (!strncasecomp(value, "iso", 3) && !StrNCmp(value + 3, "8859", 4)) {
1560 	return getLYhndl_byCP("iso-", value + 3);
1561     }
1562     if (!strcasecomp(value, "iso-8859-8-i") ||
1563 	!strcasecomp(value, "iso-8859-8-e")) {
1564 	return UCGetLYhndl_byMIME("iso-8859-8");
1565     }
1566 #if !NO_CHARSET_euc_jp
1567     if (!strcasecomp(value, "x-euc-jp") ||
1568 	!strcasecomp(value, "eucjp")) {
1569 	return UCGetLYhndl_byMIME("euc-jp");
1570     }
1571 #endif
1572 #if !NO_CHARSET_shift_jis
1573     if ((!strcasecomp(value, "x-shift-jis")) ||
1574 	(!strcasecomp(value, "x-sjis")) ||
1575 	(!strcasecomp(value, "pck"))) {
1576 	return UCGetLYhndl_byMIME("shift_jis");
1577     }
1578 #endif
1579 #if !NO_CHARSET_euc_kr
1580     if (!strcasecomp(value, "iso-2022-kr")) {
1581 	return UCGetLYhndl_byMIME("euc-kr");
1582     }
1583 #endif
1584 #if !NO_CHARSET_euc_cn
1585     if (!strcasecomp(value, "gb2312") ||
1586 	!strncasecomp(value, "cn-gb", 5) ||
1587 	!strcasecomp(value, "iso-2022-cn")) {
1588 	return UCGetLYhndl_byMIME("euc-cn");
1589     }
1590 #endif
1591 #if !NO_CHARSET_big5
1592     if (!strcasecomp(value, "cn-big5")) {
1593 	return UCGetLYhndl_byMIME("big5");
1594     }
1595 #endif
1596 #if !NO_CHARSET_macintosh
1597     if (!strcasecomp(value, "x-mac-roman") ||
1598 	!strcasecomp(value, "mac-roman")) {
1599 	return UCGetLYhndl_byMIME("macintosh");
1600     }
1601 #endif
1602 #if !NO_CHARSET_next
1603     if (!strcasecomp(value, "x-next") ||
1604 	!strcasecomp(value, "nextstep") ||
1605 	!strcasecomp(value, "x-nextstep")) {
1606 	return UCGetLYhndl_byMIME("next");
1607     }
1608 #endif
1609 #if !NO_CHARSET_windows_1252
1610     if (!strcasecomp(value, "iso-8859-1-windows-3.1-latin-1") ||
1611 	!strcasecomp(value, "cp1252") ||
1612 	!strcasecomp(value, "cp-1252") ||
1613 	!strcasecomp(value, "ibm1252") ||
1614 	!strcasecomp(value, "iso-8859-1-windows-3.0-latin-1")) {
1615 	/*
1616 	 * Treat these as synonyms for windows-1252, which is more
1617 	 * commonly used than the IANA registered name.  - FM
1618 	 */
1619 	return UCGetLYhndl_byMIME("windows-1252");
1620     }
1621 #endif
1622 #if !NO_CHARSET_windows_1251
1623     if (!strcasecomp(value, "ansi-1251")) {
1624 	return UCGetLYhndl_byMIME("windows-1251");
1625     }
1626 #endif
1627 #if !NO_CHARSET_windows_1250
1628     if (!strcasecomp(value, "iso-8859-2-windows-latin-2") ||
1629 	!strcasecomp(value, "cp1250") ||
1630 	!strcasecomp(value, "cp-1250") ||
1631 	!strcasecomp(value, "ibm1250")) {
1632 	/*
1633 	 * Treat these as synonyms for windows-1250.  - FM
1634 	 */
1635 	return UCGetLYhndl_byMIME("windows-1250");
1636     }
1637 #endif
1638     if ((!strncasecomp(value, "ibm", 3) ||
1639 	 !strncasecomp(value, "cp-", 3)) &&
1640 	isdigit(UCH(value[3])) &&
1641 	isdigit(UCH(value[4])) &&
1642 	isdigit(UCH(value[5]))) {
1643 	/*
1644 	 * For "ibmNNN<...>" or "cp-NNN", try "cpNNN<...>"
1645 	 * if not yet found.  - KW & FM
1646 	 */
1647 	if ((LYhndl = getLYhndl_byCP("cp", value + 3)) >= 0)
1648 	    return LYhndl;
1649 	/*
1650 	 * Try windows-NNN<...> if not yet found.  - FM
1651 	 */
1652 	return getLYhndl_byCP("windows-", value + 3);
1653     }
1654     if (!strncasecomp(value, "windows-", 8) &&
1655 	isdigit(UCH(value[8])) &&
1656 	isdigit(UCH(value[9])) &&
1657 	isdigit(UCH(value[10]))) {
1658 	/*
1659 	 * For "windows-NNN<...>", try "cpNNN<...>" - FM
1660 	 */
1661 	return getLYhndl_byCP("cp", value + 8);
1662     }
1663 #if !NO_CHARSET_koi8_r
1664     if (!strcasecomp(value, "koi-8")) {		/* accentsoft bugosity */
1665 	return UCGetLYhndl_byMIME("koi8-r");
1666     }
1667 #endif
1668     if (!strcasecomp(value, "ANSI_X3.4-1968")) {
1669 	return US_ASCII;
1670     }
1671     /* no more synonyms if come here... */
1672 
1673     CTRACE((tfp, "UCGetLYhndl_byMIME: unrecognized MIME name \"%s\"\n", value));
1674     return ucError;		/* returns -1 if no charset found by that MIME name */
1675 }
1676 
1677 /*
1678  * Function UC_setup_LYCharSets_repl() tries to set up a subtable in
1679  * LYCharSets[] appropriate for this new charset, for compatibility with the
1680  * "old method".  Maybe not nice (maybe not even necessary any more), but it
1681  * works (as far as it goes..).
1682  *
1683  * We try to be conservative and only allocate new memory for this if needed.
1684  * If not needed, just point to SevenBitApproximations[i].  [Could do the same
1685  * for ISO_Latin1[] if it's identical to that, but would make it even *more*
 * messy than it already is...] This is the only function in this file that knows,
1687  * or cares, about the HTMLDTD or details of LYCharSets[] subtables (and
1688  * therefore somewhat violates the idea that this file should be independent of
1689  * those).  As in other places, we rely on ISO_Latin1 being the *first* table
1690  * in LYCharSets.  - KW
1691  */
1692 
1693 /*
1694  * We need to remember which ones were allocated and which are static.
1695  */
1696 static STRING2PTR remember_allocated_LYCharSets[MAXCHARSETS];
1697 
UCreset_allocated_LYCharSets(void)1698 static void UCreset_allocated_LYCharSets(void)
1699 {
1700     int i = 0;
1701 
1702     for (; i < MAXCHARSETS; i++) {
1703 	remember_allocated_LYCharSets[i] = NULL;
1704     }
1705 }
1706 
#ifdef LY_FIND_LEAKS
/*
 * Release every table recorded in remember_allocated_LYCharSets[].
 */
static void UCfree_allocated_LYCharSets(void)
{
    int slot = 0;

    while (slot < MAXCHARSETS) {
	if (remember_allocated_LYCharSets[slot] != NULL) {
	    FREE(remember_allocated_LYCharSets[slot]);
	}
	slot++;
    }
}
#endif
1719 
UC_setup_LYCharSets_repl(int UC_charset_in_hndl,unsigned lowest8)1720 static STRING2PTR UC_setup_LYCharSets_repl(int UC_charset_in_hndl,
1721 					   unsigned lowest8)
1722 {
1723     STRING2PTR ISO_Latin1 = LYCharSets[0];
1724     const char **p;
1725     char **prepl;
1726     const u16 *pp;
1727     const char **tp;
1728     const char *s7;
1729     const char *s8;
1730     size_t i;
1731     int j, changed;
1732     u16 k;
1733     u8 *ti;
1734 
1735     /*
1736      * Create a temporary table for reverse lookup of latin1 codes:
1737      */
1738     tp = (const char **) malloc(96 * sizeof(char *));
1739 
1740     if (!tp)
1741 	return NULL;
1742     for (i = 0; i < 96; i++)
1743 	tp[i] = NULL;
1744     ti = (u8 *) malloc(96 * sizeof(u8));
1745     if (!ti) {
1746 	FREE(tp);
1747 	return NULL;
1748     }
1749     for (i = 0; i < 96; i++)
1750 	ti[i] = 0;
1751 
1752     pp = UCInfo[UC_charset_in_hndl].unitable;
1753 
1754     /*
1755      * Determine if we have any mapping of a Unicode in the range 160-255
1756      * to an allowed code point > 0x80 in our new charset...
1757      * Store any mappings found in ti[].
1758      */
1759     if (UCInfo[UC_charset_in_hndl].num_uni > 0) {
1760 	for (i = 0; i < 256; i++) {
1761 	    if ((j = UCInfo[UC_charset_in_hndl].unicount[i])) {
1762 		if ((k = *pp) >= 160 && k < 256 && i >= lowest8) {
1763 		    ti[k - 160] = UCH(i);
1764 		}
1765 		for (; j; j--) {
1766 		    pp++;
1767 		}
1768 	    }
1769 	}
1770     } {
1771 	u16 ct;
1772 	struct unipair_str *list;
1773 
1774 	/*
1775 	 * Determine if we have any mapping of a Unicode in the range
1776 	 * 160-255 to a replacement string for our new charset...
1777 	 * Store any mappings found in tp[].
1778 	 */
1779 	ct = UCInfo[UC_charset_in_hndl].replacedesc.entry_ct;
1780 	list = UCInfo[UC_charset_in_hndl].replacedesc.entries;
1781 	while (ct--) {
1782 	    if ((k = list->unicode) >= 160 && k < 256) {
1783 		tp[k - 160] = list->replace_str;
1784 	    }
1785 	    list++;
1786 	}
1787     }
1788     /*
1789      * Now allocate a new table compatible with LYCharSets[]
1790      * and with the HTMLDTD for entities.
1791      * We don't know yet whether we'll keep it around.
1792      */
1793     prepl = (char **) malloc(HTML_dtd.number_of_entities * sizeof(char *));
1794 
1795     if (!prepl) {
1796 	FREE(tp);
1797 	FREE(ti);
1798 	return 0;
1799     }
1800 
1801     p = (const char **) prepl;
1802     changed = 0;
1803     for (i = 0; i < HTML_dtd.number_of_entities; i++, p++) {
1804 	/*
1805 	 * For each of those entities, we check what the "old method"
1806 	 * ISO_Latin1[] mapping does with them.  If it is nothing we
1807 	 * want to use, just point to the SevenBitApproximations[] string.
1808 	 */
1809 	s7 = SevenBitApproximations[i];
1810 	s8 = ISO_Latin1[i];
1811 	*p = s7;
1812 	if (s8 && UCH(*s8) >= 160 && s8[1] == '\0') {
1813 	    /*
1814 	     * We have an entity that is mapped to
1815 	     * one valid eightbit latin1 char.
1816 	     */
1817 	    if (ti[UCH(*s8) - 160] >= UCH(lowest8) &&
1818 		!(UCH(s7[0]) == ti[UCH(*s8) - 160] &&
1819 		  s7[1] == '\0')) {
1820 		/*
1821 		 * ...which in turn is mapped, by our "new method",
1822 		 * to another valid eightbit char for this new
1823 		 * charset:  either to itself...
1824 		 */
1825 		if (ti[UCH(*s8) - 160] == UCH(*s8)) {
1826 		    *p = s8;
1827 		} else {
1828 		    /*
1829 		     * make those 1-char strings
1830 		     * into HTAtoms, so they will be cleaned up
1831 		     * at exit...  all for the sake of preventing
1832 		     * memory leaks, sigh.
1833 		     */
1834 		    static char dummy[2];	/* one char dummy string */
1835 
1836 		    dummy[0] = (char) ti[UCH(*s8) - 160];
1837 		    *p = HTAtom_name(HTAtom_for(dummy));
1838 		}
1839 		changed = 1;
1840 	    } else if (tp[UCH(*s8) - 160] &&
1841 		       strcmp(s7, tp[UCH(*s8) - 160])) {
1842 		/*
1843 		 * ...or which is mapped, by our "new method",
1844 		 * to a replacement string for this new charset.
1845 		 */
1846 		*p = tp[UCH(*s8) - 160];
1847 		changed = 1;
1848 	    }
1849 	}
1850     }
1851     FREE(tp);
1852     FREE(ti);
1853     if (!changed) {
1854 	FREE(prepl);
1855 	return NULL;
1856     }
1857     return (STRING2PTR) prepl;
1858 }
1859 
1860 /*
1861  * "New method" meets "Old method" ...
1862  */
UC_Register_with_LYCharSets(int s,const char * UC_MIMEcharset,const char * UC_LYNXcharset,int lowest_eightbit)1863 static int UC_Register_with_LYCharSets(int s,
1864 				       const char *UC_MIMEcharset,
1865 				       const char *UC_LYNXcharset,
1866 				       int lowest_eightbit)
1867 {
1868     int i, LYhndl, found;
1869     STRING2PTR repl;
1870 
1871     LYhndl = -1;
1872     if (LYNumCharsets == 0) {
1873 	/*
1874 	 * Initialize here; so whoever changes
1875 	 * LYCharSets.c doesn't have to count...
1876 	 */
1877 	for (i = 0; (i < MAXCHARSETS) && LYchar_set_names[i]; i++) {
1878 	    LYNumCharsets = i + 1;
1879 	}
1880     }
1881 
1882     /*
1883      * Search by MIME name, (LYchar_set_names may differ...)
1884      */
1885     for (i = 0; i < MAXCHARSETS && LYchar_set_names[i] && LYhndl < 0; i++) {
1886 	if (LYCharSet_UC[i].MIMEname &&
1887 	    !strcmp(UC_MIMEcharset, LYCharSet_UC[i].MIMEname)) {
1888 	    LYhndl = i;
1889 	}
1890     }
1891 
1892     if (LYhndl < 0) {		/* not found */
1893 	found = 0;
1894 	if (LYNumCharsets >= MAXCHARSETS) {
1895 	    CTRACE((tfp,
1896 		    "UC_Register_with_LYCharSets: Too many.  Ignoring %s/%s.",
1897 		    UC_MIMEcharset, UC_LYNXcharset));
1898 	    return ucError;
1899 	}
1900 	/*
1901 	 * Add to LYCharSets.c lists.
1902 	 */
1903 	LYhndl = LYNumCharsets;
1904 	LYNumCharsets++;
1905 	LYlowest_eightbit[LYhndl] = 999;
1906 	LYCharSets[LYhndl] = SevenBitApproximations;
1907 	/*
1908 	 * Hmm, try to be conservative here.
1909 	 */
1910 	LYchar_set_names[LYhndl] = UC_LYNXcharset;
1911 	LYchar_set_names[LYhndl + 1] = NULL;
1912 	/*
1913 	 * Terminating NULL may be looked for by Lynx code.
1914 	 */
1915     } else {
1916 	found = 1;
1917     }
1918     LYCharSet_UC[LYhndl].UChndl = s;
1919     /*
1920      * Can we just copy the pointer?  Hope so...
1921      */
1922     LYCharSet_UC[LYhndl].MIMEname = UC_MIMEcharset;
1923     LYCharSet_UC[LYhndl].enc = UCInfo[s].enc;
1924     LYCharSet_UC[LYhndl].codepage = UCInfo[s].codepage;
1925 
1926     /*
1927      * @@@ We really SHOULD get more info from the table files,
1928      * and set relevant flags in the LYCharSet_UC[] entry with
1929      * that info...  For now, let's try it without.  - KW
1930      */
1931     if (lowest_eightbit < LYlowest_eightbit[LYhndl]) {
1932 	LYlowest_eightbit[LYhndl] = lowest_eightbit;
1933     } else if (lowest_eightbit > LYlowest_eightbit[LYhndl]) {
1934 	UCInfo[s].lowest_eight = LYlowest_eightbit[LYhndl];
1935     }
1936 
1937     if (!found && LYhndl > 0) {
1938 	repl = UC_setup_LYCharSets_repl(s, (unsigned) UCInfo[s].lowest_eight);
1939 	if (repl) {
1940 	    LYCharSets[LYhndl] = repl;
1941 	    /*
1942 	     * Remember to FREE at exit.
1943 	     */
1944 	    remember_allocated_LYCharSets[LYhndl] = repl;
1945 	}
1946     }
1947     return LYhndl;
1948 }
1949 
1950 /*
1951  * This only sets up the structure - no initialization of the tables
1952  * is done here yet.
1953  */
UC_Charset_Setup(const char * UC_MIMEcharset,const char * UC_LYNXcharset,const u8 * unicount,const u16 * unitable,int nnuni,struct unimapdesc_str replacedesc,int lowest_eight,int UC_rawuni,int codepage)1954 void UC_Charset_Setup(const char *UC_MIMEcharset,
1955 		      const char *UC_LYNXcharset,
1956 		      const u8 * unicount,
1957 		      const u16 * unitable,
1958 		      int nnuni,
1959 		      struct unimapdesc_str replacedesc,
1960 		      int lowest_eight,
1961 		      int UC_rawuni,
1962 		      int codepage)
1963 {
1964     int s, Gn;
1965     int i, status = 0, found;
1966 
1967     /*
1968      * Get (new?) slot.
1969      */
1970     found = -1;
1971     for (i = 0; i < UCNumCharsets && found < 0; i++) {
1972 	if (!strcmp(UCInfo[i].MIMEname, UC_MIMEcharset)) {
1973 	    found = i;
1974 	}
1975     }
1976     if (found >= 0) {
1977 	s = found;
1978     } else {
1979 	if (UCNumCharsets >= MAXCHARSETS) {
1980 	    CTRACE((tfp, "UC_Charset_Setup: Too many.  Ignoring %s/%s.",
1981 		    UC_MIMEcharset, UC_LYNXcharset));
1982 	    return;
1983 	}
1984 	s = UCNumCharsets;
1985 	UCInfo[s].MIMEname = UC_MIMEcharset;
1986     }
1987     UCInfo[s].LYNXname = UC_LYNXcharset;
1988     UCInfo[s].unicount = unicount;
1989     UCInfo[s].unitable = unitable;
1990     UCInfo[s].num_uni = nnuni;
1991     UCInfo[s].replacedesc = replacedesc;
1992     if (replacedesc.isdefault) {
1993 	default_UChndl = s;
1994     }
1995     Gn = UC_FindGN_byMIME(UC_MIMEcharset);
1996     if (Gn >= 0)
1997 	UC_GNhandles[Gn] = s;
1998     UCInfo[s].GN = Gn;
1999     if (UC_rawuni == UCT_ENC_UTF8)
2000 	lowest_eight = 128;	/* cheat here */
2001     UCInfo[s].lowest_eight = lowest_eight;
2002     UCInfo[s].enc = UC_rawuni;
2003     UCInfo[s].codepage = codepage;
2004     UCInfo[s].LYhndl = UC_Register_with_LYCharSets(s,
2005 						   UC_MIMEcharset,
2006 						   UC_LYNXcharset,
2007 						   lowest_eight);
2008     CTRACE2(TRACE_CFG, (tfp, "registered charset %d mime \"%s\" lynx \"%s\"\n",
2009 			s, UC_MIMEcharset, UC_LYNXcharset));
2010     UCInfo[s].uc_status = status;
2011     if (found < 0)
2012 	UCNumCharsets++;
2013     return;
2014 }
2015 
2016 /*
2017  * UC_NoUctb_Register_with_LYCharSets, UC_Charset_NoUctb_Setup -
2018  * Alternative functions for adding character set info to the lists
2019  * kept in LYCharSets.c.
2020  *
2021  * These are for character sets without any real tables of their own.
2022  * We don't keep an entry in UCinfo[] for them.
2023  */
UC_NoUctb_Register_with_LYCharSets(const char * UC_MIMEcharset,const char * UC_LYNXcharset,int lowest_eightbit,int UC_rawuni,int codepage)2024 static int UC_NoUctb_Register_with_LYCharSets(const char *UC_MIMEcharset,
2025 					      const char *UC_LYNXcharset,
2026 					      int lowest_eightbit,
2027 					      int UC_rawuni,
2028 					      int codepage)
2029 {
2030     int i, LYhndl = -1;
2031 
2032     if (LYNumCharsets == 0) {
2033 	/*
2034 	 * Initialize here; so whoever changes
2035 	 * LYCharSets.c doesn't have to count...
2036 	 */
2037 	for (i = 0; (i < MAXCHARSETS) && LYchar_set_names[i]; i++) {
2038 	    LYNumCharsets = i + 1;
2039 	}
2040     }
2041 
2042     /*
2043      * Search by MIME name, (LYchar_set_names may differ...)
2044      * ignore if already present!
2045      */
2046     for (i = 0; i < MAXCHARSETS && LYchar_set_names[i] && LYhndl < 0; i++) {
2047 	if (LYCharSet_UC[i].MIMEname &&
2048 	    !strcmp(UC_MIMEcharset, LYCharSet_UC[i].MIMEname)) {
2049 	    return ucError;
2050 	}
2051     }
2052 
2053     /* not found */
2054     if (LYNumCharsets >= MAXCHARSETS) {
2055 	CTRACE((tfp,
2056 		"UC_NoUctb_Register_with_LYCharSets: Too many.  Ignoring %s/%s.",
2057 		UC_MIMEcharset, UC_LYNXcharset));
2058 	return ucError;
2059     }
2060     /*
2061      * Add to LYCharSets.c lists.
2062      */
2063     LYhndl = LYNumCharsets;
2064     LYNumCharsets++;
2065     LYlowest_eightbit[LYhndl] = lowest_eightbit;
2066     LYCharSets[LYhndl] = SevenBitApproximations;
2067     LYchar_set_names[LYhndl] = UC_LYNXcharset;
2068     LYchar_set_names[LYhndl + 1] = NULL;
2069     /*
2070      * Terminating NULL may be looked for by Lynx code.
2071      */
2072 
2073     LYCharSet_UC[LYhndl].UChndl = -1;	/* no corresponding UChndl ! */
2074     LYCharSet_UC[LYhndl].MIMEname = UC_MIMEcharset;
2075     LYCharSet_UC[LYhndl].enc = UC_rawuni;
2076     LYCharSet_UC[LYhndl].codepage = codepage;
2077 
2078     /*
2079      * @@@ We really SHOULD get more info from the table files,
2080      * and set relevant flags in the LYCharSet_UC[] entry with
2081      * that info...  For now, let's try it without.  - KW
2082      */
2083 
2084     return LYhndl;
2085 }
2086 
2087 /*
2088  * A wrapper for the previous function.
2089  */
UC_Charset_NoUctb_Setup(const char * UC_MIMEcharset,const char * UC_LYNXcharset,int trydefault,int lowest_eight,int UC_rawuni,int codepage)2090 static void UC_Charset_NoUctb_Setup(const char *UC_MIMEcharset,
2091 				    const char *UC_LYNXcharset,
2092 				    int trydefault,
2093 				    int lowest_eight,
2094 				    int UC_rawuni,
2095 				    int codepage)
2096 {
2097     int i;
2098 
2099     /*
2100      * Ignore completely if already in slot.
2101      */
2102     for (i = 0; i < UCNumCharsets; i++) {
2103 	if (!strcmp(UCInfo[i].MIMEname, UC_MIMEcharset)) {
2104 	    return;
2105 	}
2106     }
2107     if (UC_rawuni == UCT_ENC_UTF8)
2108 	lowest_eight = 128;	/* cheat here */
2109     /* 'codepage' doubles as a flag for 'do not try any table
2110      * lookup, not even default' when negative.  The value will
2111      * be returned immediately by UCTrans* functions.
2112      */
2113     if (!trydefault && codepage == 0)
2114 	codepage = ucCannotOutput;	/* if not already set; any negative should do. */
2115     UC_NoUctb_Register_with_LYCharSets(UC_MIMEcharset,
2116 				       UC_LYNXcharset,
2117 				       lowest_eight,
2118 				       UC_rawuni,
2119 				       codepage);
2120     return;
2121 }
2122 
2123 #ifdef LY_FIND_LEAKS
UCcleanup_mem(void)2124 static void UCcleanup_mem(void)
2125 {
2126     int i;
2127 
2128     UCfree_allocated_LYCharSets();
2129     con_clear_unimap_str(0);
2130     con_clear_unimap_str(1);
2131     con_clear_unimap(0);
2132     con_clear_unimap(1);
2133     for (i = 1; i < 4; i++) {	/* first one is static! */
2134 	FREE(inverse_translations[i]);
2135     }
2136 }
2137 #endif /* LY_FIND_LEAKS */
2138 
2139 #ifdef EXP_CHARTRANS_AUTOSWITCH
2140 #ifdef CAN_AUTODETECT_DISPLAY_CHARSET
2141 #  ifdef __EMX__
/*
 * CpOrdinal - make an "AutoDetect" alias for codepage 'cp' and return its
 * LYCharSets handle.
 *
 * The alias is named "auto-cp<N>" ("auto2-cp<N>" when 'other' is nonzero)
 * and duplicates the UCInfo[] record whose MIME name is "cp<N>".  If the
 * alias is already registered its handle is looked up and returned; if no
 * "cp<N>" record exists, ucError is returned.  The handle of the real
 * charset is stored in real_charsets[].
 */
static int CpOrdinal(const unsigned UCode_t cp, const int other)
{
    char autoLynxName[80];
    char autoMimeName[80];
    char *realMimeName;
    char *heapMime = NULL;
    char *heapLynx = NULL;
    int source = -1;
    int handle;
    int n;

    CTRACE((tfp, "CpOrdinal(cp=%lu, other=%d).\n", cp, other));
    sprintf(autoMimeName, "auto%s-cp%lu", (other ? "2" : ""), cp);
    /* The real name starts right after the "auto-" / "auto2-" prefix. */
    realMimeName = autoMimeName + (other ? 6 : 5);
    sprintf(autoLynxName, "AutoDetect%s (cp%lu)",
	    (other ? "-2" : ""), cp);

    /*
     * One pass: leave early if the alias exists, else remember the
     * (last) record that carries the real MIME name.
     */
    for (n = 0; n < UCNumCharsets; n++) {
	if (strcmp(UCInfo[n].LYNXname, autoLynxName) == 0)
	    return UCGetLYhndl_byMIME(autoMimeName);
	if (strcasecomp(UCInfo[n].MIMEname, realMimeName) == 0)
	    source = n;
    }
    if (source < 0)
	return ucError;

    /* Store the "real" charset info */
    real_charsets[other ? 1 : 0] = UCGetLYhndl_byMIME(realMimeName);

    /*
     * Duplicate the record under the alias names; the names are copied
     * to the heap because the local buffers go away on return.
     */
    StrAllocCopy(heapMime, autoMimeName);
    StrAllocCopy(heapLynx, autoLynxName);
    UC_Charset_Setup(heapMime, heapLynx,
		     UCInfo[source].unicount, UCInfo[source].unitable,
		     UCInfo[source].num_uni, UCInfo[source].replacedesc,
		     UCInfo[source].lowest_eight, UCInfo[source].enc,
		     UCInfo[source].codepage);

    handle = UCGetLYhndl_byMIME(autoMimeName);
    CTRACE((tfp, "Found %i.\n", handle));
    return handle;
}
2178 #  endif /* __EMX__ */
2179 #endif /* CAN_AUTODETECT_DISPLAY_CHARSET */
2180 #endif /* EXP_CHARTRANS_AUTOSWITCH */
2181 
/*
 * UCInit - one-time setup of the chartrans engine.
 *
 * Calls UCreset_allocated_LYCharSets(), registers UCcleanup_mem with
 * atexit() when LY_FIND_LEAKS is defined, calls UCconsole_map_init(),
 * runs the UC_CHARSET_SETUP* macros for every built-in charset, and
 * finally caches the handles of four frequently used charsets in LATIN1,
 * US_ASCII, UTF8_handle and TRANSPARENT.
 */
void UCInit(void)
{

    UCreset_allocated_LYCharSets();
#ifdef LY_FIND_LEAKS
    atexit(UCcleanup_mem);
#endif
    UCconsole_map_init();

    /*
     * The order of charset names visible in the Lynx Options menu
     * corresponds to the order of the lines below, except for the first
     * two, which are described in LYCharSets.c.
     *
     * Entries whose comment is marked with *** are declared in UCdomap.h,
     * others are based on the included tables - UCdomap.c, near the top.
     */

    UC_CHARSET_SETUP_iso_8859_1;	/* ISO Latin 1          */
    UC_CHARSET_SETUP_iso_8859_15;	/* ISO 8859-15 (Latin 9) */
    UC_CHARSET_SETUP_cp850;	/* DosLatin1 (cp850)    */
    UC_CHARSET_SETUP_windows_1252;	/* WinLatin1 (cp1252)   */
    UC_CHARSET_SETUP_cp437;	/* DosLatinUS (cp437)   */

    UC_CHARSET_SETUP_dec_mcs;	/* DEC Multinational    */
    UC_CHARSET_SETUP_macintosh;	/* Macintosh (8 bit)    */
    UC_CHARSET_SETUP_next;	/* NeXT character set   */
    UC_CHARSET_SETUP_hp_roman8;	/* HP Roman8            */

    UC_CHARSET_SETUP_euc_cn;		  /*** Chinese		    */
    UC_CHARSET_SETUP_euc_jp;		  /*** Japanese (EUC_JP)    */
    UC_CHARSET_SETUP_shift_jis;		  /*** Japanese (Shift_JIS) */
    UC_CHARSET_SETUP_euc_kr;		  /*** Korean		    */
    UC_CHARSET_SETUP_big5;		  /*** Taipei (Big5)	    */

    UC_CHARSET_SETUP_viscii;	/* Vietnamese (VISCII)  */
    UC_CHARSET_SETUP;		/* us-ascii */ /* 7 bit approximations */

    UC_CHARSET_SETUP_x_transparent;	  /*** Transparent	  */

    UC_CHARSET_SETUP_iso_8859_2;	/* ISO Latin 2          */
    UC_CHARSET_SETUP_cp852;	/* DosLatin2 (cp852)    */
    UC_CHARSET_SETUP_windows_1250;	/* WinLatin2 (cp1250)   */

    UC_CHARSET_SETUP_iso_8859_3;	/* ISO Latin 3          */
    UC_CHARSET_SETUP_iso_8859_4;	/* ISO Latin 4          */
    UC_CHARSET_SETUP_iso_8859_13;	/* ISO 8859-13 Baltic Rim */
    UC_CHARSET_SETUP_cp775;	/* DosBaltRim (cp775)   */
    UC_CHARSET_SETUP_windows_1257;	/* WinBaltRim (cp1257)  */
    UC_CHARSET_SETUP_iso_8859_5;	/* ISO 8859-5 Cyrillic  */
    UC_CHARSET_SETUP_cp866;	/* DosCyrillic (cp866)  */
    UC_CHARSET_SETUP_windows_1251;	/* WinCyrillic (cp1251) */
    UC_CHARSET_SETUP_koi8_r;	/* KOI8-R Cyrillic      */
    UC_CHARSET_SETUP_iso_8859_6;	/* ISO 8859-6 Arabic    */
    UC_CHARSET_SETUP_cp864;	/* DosArabic (cp864)    */
    UC_CHARSET_SETUP_windows_1256;	/* WinArabic (cp1256)   */
    UC_CHARSET_SETUP_iso_8859_14;	/* ISO 8859-14 Celtic   */
    UC_CHARSET_SETUP_iso_8859_7;	/* ISO 8859-7 Greek     */
    UC_CHARSET_SETUP_cp737;	/* DosGreek (cp737)     */
    UC_CHARSET_SETUP_cp869;	/* DosGreek2 (cp869)    */
    UC_CHARSET_SETUP_windows_1253;	/* WinGreek (cp1253)    */
    UC_CHARSET_SETUP_iso_8859_8;	/* ISO 8859-8 Hebrew    */
    UC_CHARSET_SETUP_cp862;	/* DosHebrew (cp862)    */
    UC_CHARSET_SETUP_windows_1255;	/* WinHebrew (cp1255)   */
    UC_CHARSET_SETUP_iso_8859_9;	/* ISO 8859-9 (Latin 5) */
    UC_CHARSET_SETUP_cp857;	/* DosTurkish (cp857) */
    UC_CHARSET_SETUP_iso_8859_10;	/* ISO 8859-10 North European */

    UC_CHARSET_SETUP_utf_8;		  /*** UNICODE UTF-8	  */
    UC_CHARSET_SETUP_mnemonic_ascii_0;	/* RFC 1345 w/o Intro   */
    UC_CHARSET_SETUP_mnemonic;	/* RFC 1345 Mnemonic    */
    UC_CHARSET_SETUP_cp866u;	/* Ukrainian Cyrillic (866) */
    UC_CHARSET_SETUP_koi8_u;	/* Ukrainian Cyrillic (koi8-u) */
    UC_CHARSET_SETUP_ptcp154;	/* Cyrillic-Asian (PT154) */

#ifdef EXP_CHARTRANS_AUTOSWITCH
#ifdef CAN_AUTODETECT_DISPLAY_CHARSET
#  ifdef __EMX__
    {
	/*
	 * Ask the system for its codepage list via DosQueryCp() and
	 * register "AutoDetect" aliases for the reported codepages.
	 */
	unsigned UCode_t lst[3];
	unsigned UCode_t len, rc;

	rc = DosQueryCp(sizeof(lst), lst, &len);
	if (rc == 0) {
	    if (len >= 1)
		auto_display_charset = CpOrdinal(lst[0], 0);
#    ifdef CAN_SWITCH_DISPLAY_CHARSET
	    if (len >= 3) {
		codepages[0] = lst[0];
		/* pick whichever of lst[1], lst[2] differs from lst[0] */
		codepages[1] = (lst[0] == lst[1] ? lst[2] : lst[1]);
		auto_other_display_charset = CpOrdinal(codepages[1], 1);
	    }
#    endif
	} else {
	    CTRACE((tfp, "DosQueryCp() returned %#lx=%lu.\n", rc, rc));
	}
    }
#  endif
#endif
#endif

/*
 * To add synonyms for any charset name, check function UCGetLYhndl_byMIME
 * in this file.
 */

/* for coding/performance - easy to type: */
    LATIN1 = UCGetLYhndl_byMIME("iso-8859-1");
    US_ASCII = UCGetLYhndl_byMIME("us-ascii");
    UTF8_handle = UCGetLYhndl_byMIME("utf-8");
    TRANSPARENT = UCGetLYhndl_byMIME("x-transparent");
}
2293 
2294 /*
2295  * Safe variant of UCGetLYhndl_byMIME, with blind recovery from typo in user
2296  * input:  lynx.cfg, userdefs.h, command line switches.
2297  */
safeUCGetLYhndl_byMIME(const char * value)2298 int safeUCGetLYhndl_byMIME(const char *value)
2299 {
2300     int i = UCGetLYhndl_byMIME(value);
2301 
2302     if (i == -1) {		/* was user's typo or not yet recognized value */
2303 	i = LATIN1;		/* error recovery? */
2304 	CTRACE((tfp, "safeUCGetLYhndl_byMIME: ISO-8859-1 assumed.\n"));
2305     }
2306 
2307     return (i);
2308 }
2309 
2310 #ifdef USE_LOCALE_CHARSET
2311 
2312 #if defined(USE_LOCALE_CHARSET) && !defined(HAVE_LANGINFO_CODESET)
2313 /*
2314  * This is a quick-and-dirty emulator of the nl_langinfo(CODESET)
2315  * function defined in the Single Unix Specification for those systems
2316  * (FreeBSD, etc.) that don't have one yet. It behaves as if it had
2317  * been called after setlocale(LC_CTYPE, ""), that is it looks at
2318  * the locale environment variables.
2319  *
2320  * http://www.opengroup.org/onlinepubs/7908799/xsh/langinfo.h.html
2321  *
2322  * Please extend it as needed and suggest improvements to the author.
2323  * This emulator will hopefully become redundant soon as
2324  * nl_langinfo(CODESET) becomes more widely implemented.
2325  *
2326  * Since the proposed Li18nux encoding name registry is still not mature,
2327  * the output follows the MIME registry where possible:
2328  *
2329  *   http://www.iana.org/assignments/character-sets
2330  *
2331  * A possible autoconf test for the availability of nl_langinfo(CODESET)
2332  * can be found in
2333  *
2334  *   http://www.cl.cam.ac.uk/~mgk25/unicode.html#activate
2335  *
2336  * Markus.Kuhn@cl.cam.ac.uk -- 2002-03-11
2337  * Permission to use, copy, modify, and distribute this software
2338  * for any purpose and without fee is hereby granted. The author
2339  * disclaims all warranties with regard to this software.
2340  *
2341  * Latest version:
2342  *
2343  *   http://www.cl.cam.ac.uk/~mgk25/ucs/langinfo.c
2344  */
2345 
2346 /*
2347 #include "langinfo.h"
2348 */
typedef int nl_item;

#define CODESET 1

#define C_CODESET "US-ASCII"	/* Return this as the encoding of the
				 * C/POSIX locale. Could as well one day
				 * become "UTF-8". */

#define digit(x) ((x) >= '0' && (x) <= '9')

static char buf[16];

/*
 * Stand-in for nl_langinfo(), supporting CODESET only (any other item
 * yields NULL).
 *
 * The locale name is taken from the first of LC_ALL, LC_CTYPE and LANG
 * for which LYGetEnv() returns a value; with none of them, C_CODESET is
 * returned.  The name is then matched against a fixed, ordered list of
 * substrings and the codeset of the first hit is returned, falling back
 * to "ISO-8859-1".  An "ISO-8859-<n>" answer is assembled in the static
 * buffer 'buf', so a later call may overwrite it.
 */
static char *nl_langinfo(nl_item item)
{
    /*
     * Substring -> codeset rules, tried top to bottom after the UTF and
     * 8859 checks below.  The order matters: specific fragments must stay
     * ahead of the shorter ones they contain (e.g. "zh_TW" before "zh").
     * The terminating entry carries the default answer.
     *
     * Send the original author further rules if you like, but don't
     * forget that we are *only* interested in locale naming conventions
     * on platforms that do not already provide an nl_langinfo(CODESET)
     * implementation.
     */
    static const struct locale_rule {
	const char *fragment;
	char *codeset;
    } rules[] = {
	/* encoding name fragments */
	{ "KOI8-R", "KOI8-R" },
	{ "KOI8-U", "KOI8-U" },
	{ "620", "TIS-620" },
	{ "2312", "GB2312" },
	{ "HKSCS", "Big5HKSCS" },	/* no MIME charset */
	{ "Big5", "Big5" },
	{ "BIG5", "Big5" },
	{ "GBK", "GBK" },		/* no MIME charset */
	{ "18030", "GB18030" },		/* no MIME charset */
	{ "Shift_JIS", "Shift_JIS" },
	{ "SJIS", "Shift_JIS" },
	/* conclusive modifier */
	{ "euro", "ISO-8859-15" },
	/* language (and perhaps country) codes */
	{ "zh_TW", "Big5" },
	{ "zh_HK", "Big5HKSCS" },	/* no MIME charset */
	{ "zh", "GB2312" },
	{ "ja", "EUC-JP" },
	{ "ko", "EUC-KR" },
	{ "ru", "KOI8-R" },
	{ "uk", "KOI8-U" },
	{ "pl", "ISO-8859-2" },
	{ "hr", "ISO-8859-2" },
	{ "hu", "ISO-8859-2" },
	{ "cs", "ISO-8859-2" },
	{ "sk", "ISO-8859-2" },
	{ "sl", "ISO-8859-2" },
	{ "eo", "ISO-8859-3" },
	{ "mt", "ISO-8859-3" },
	{ "el", "ISO-8859-7" },
	{ "he", "ISO-8859-8" },
	{ "tr", "ISO-8859-9" },
	{ "th", "TIS-620" },		/* or ISO-8859-11 */
	{ "lt", "ISO-8859-13" },
	{ "cy", "ISO-8859-14" },
	{ "ro", "ISO-8859-2" },		/* or ISO-8859-16 */
	{ "am", "UTF-8" },
	{ "vi", "UTF-8" },
	{ NULL, "ISO-8859-1" }		/* should perhaps be "UTF-8" instead */
    };
    const struct locale_rule *rule;
    char *l, *p;

    if (item != CODESET)
	return NULL;

    /*
     * Same precedence as setlocale(LC_CTYPE, "") would apply.
     */
    l = LYGetEnv("LC_ALL");
    if (l == NULL)
	l = LYGetEnv("LC_CTYPE");
    if (l == NULL)
	l = LYGetEnv("LANG");
    if (l == NULL)
	return C_CODESET;

    /* check standardized locales */
    if (strcmp(l, "C") == 0 || strcmp(l, "POSIX") == 0)
	return C_CODESET;

    /* check for encoding name fragment */
    if (strstr(l, "UTF") != NULL || strstr(l, "utf") != NULL)
	return "UTF-8";

    p = strstr(l, "8859-");
    if (p != NULL) {
	/*
	 * The two extra NULs clear the one or two digit positions, so a
	 * single-digit part number ends up properly terminated.
	 */
	memcpy(buf, "ISO-8859-\0\0", 12);
	p += 5;
	if (digit(p[0])) {
	    buf[9] = p[0];
	    if (digit(p[1]))
		buf[10] = p[1];
	    return buf;
	}
	/* no part number after "8859-": fall through to the other rules */
    }

    for (rule = rules; rule->fragment != NULL; rule++) {
	if (strstr(l, rule->fragment) != NULL)
	    break;
    }
    return rule->codeset;
}
2452 #endif /* defined(USE_LOCALE_CHARSET) && !defined(HAVE_LANGINFO_CODESET) */
2453 
2454 /*
2455  * If LYLocaleCharset is true, use the current locale to lookup a MIME name
2456  * that corresponds, and use that as the display charset.  This feature is
2457  * experimental because while nl_langinfo(CODESET) itself is standardized,
2458  * the return values and their relationship to the locale value is not.
2459  * GNU libiconv happens to give useful values, but other implementations are
2460  * not guaranteed to do this.
2461  *
2462  * Not all Linux versions provide useful information.  GNU libc 2.2 returns
2463  *	"ANSI_X3.4-1968"
2464  * whether locale is POSIX or en_US.UTF-8.
2465  *
2466  * Another possible thing to investigate is the locale_charset() function
2467  * provided in libiconv 1.5.1.
2468  */
LYFindLocaleCharset(void)2469 void LYFindLocaleCharset(void)
2470 {
2471     BOOL found = FALSE;
2472     char *name;
2473 
2474     CTRACE((tfp, "LYFindLocaleCharset(%d)\n", LYLocaleCharset));
2475     name = nl_langinfo(CODESET);
2476 
2477     if (name != 0) {
2478 	int value = UCGetLYhndl_byMIME(name);
2479 
2480 	if (value >= 0) {
2481 	    found = TRUE;
2482 	    linedrawing_char_set = value;
2483 	    CTRACE((tfp, "Found name \"%s\" -> %d\n", name, value));
2484 	} else {
2485 	    CTRACE((tfp, "Cannot find a handle for MIME name \"%s\"\n", name));
2486 	}
2487     } else {
2488 	CTRACE((tfp, "Cannot find a MIME name for locale\n"));
2489     }
2490 
2491     if (found && LYLocaleCharset) {
2492 	current_char_set = linedrawing_char_set;
2493     }
2494 }
2495 #endif /* USE_LOCALE_CHARSET */
2496 
/*
 * UCScanCode - read a numeric character code from a string.
 *
 * 'source' is converted with strtol(), in base 16 when isHex is true and
 * in base 10 otherwise.  *target is cleared first.  TRUE is returned, and
 * the value stored in *target, only if the number is not negative, at
 * least one character was converted, nothing follows the number, and the
 * range check passes (ERANGE detection where available, otherwise a limit
 * on the number of characters converted).  In every other case FALSE is
 * returned and *target stays 0.
 */
BOOL UCScanCode(UCode_t *target, const char *source, BOOL isHex)
{
    const int radix = isHex ? 16 : 10;
    long parsed;
    char *stop;

    *target = 0;
    errno = 0;
    parsed = strtol(source, &stop, radix);

    if (parsed < 0)
	return FALSE;		/* codes are never negative */
    if (stop == 0 || stop <= source)
	return FALSE;		/* nothing was converted */
#if defined(ERANGE) && defined(LONG_MAX) && defined(LONG_MIN)
    if (errno == ERANGE && (parsed == LONG_MAX || parsed == LONG_MIN))
	return FALSE;		/* value did not fit in a long */
#else
    if ((stop - source) >= (isHex ? 8 : 10))
	return FALSE;		/* too long to be sure it fits */
#endif
    if (*stop != '\0')
	return FALSE;		/* something follows the number */

    *target = (UCode_t) parsed;
    return TRUE;
}
2520