1 /*
2 * $LynxId: UCdomap.c,v 1.95 2012/02/23 01:05:42 tom Exp $
3 *
4 * UCdomap.c
5 * =========
6 *
7 * This is a Lynx chartrans engine, its external calls are in UCMap.h
8 *
9 * Derived from code in the Linux kernel console driver.
10 * The GNU Public Licence therefore applies, see
11 * the file COPYING in the top-level directory
12 * which should come with every Lynx distribution.
13 *
14 * [ original comment: - KW ]
15 *
16 * Mapping from internal code (such as Latin-1 or Unicode or IBM PC code)
17 * to font positions.
18 *
19 * aeb, 950210
20 */
21 #include <HTUtils.h>
22 #include <HTMLDTD.h>
23
24 #include <LYGlobalDefs.h>
25 #include <UCdomap.h>
26 #include <UCMap.h>
27 #include <UCAux.h>
28 #include <UCDefs.h>
29 #include <LYCharSets.h>
30 #include <LYStrings.h>
31 #include <LYUtils.h>
32
33 #if defined(USE_LOCALE_CHARSET) && defined(HAVE_LANGINFO_CODESET)
34 #include <langinfo.h>
35 #endif
36
37 #ifdef EXP_JAPANESEUTF8_SUPPORT
38 #include <iconv.h>
39 #endif
40
41 #include <LYLeaks.h>
42
43 /*
44 * Include chartrans tables:
45 */
46 #include <cp1250_uni.h> /* WinLatin2 (cp1250) */
47 #include <cp1251_uni.h> /* WinCyrillic (cp1251) */
48 #include <cp1252_uni.h> /* WinLatin1 (cp1252) */
49 #include <cp1253_uni.h> /* WinGreek (cp1253) */
50 #include <cp1255_uni.h> /* WinHebrew (cp1255) */
51 #include <cp1256_uni.h> /* WinArabic (cp1256) */
52 #include <cp1257_uni.h> /* WinBaltRim (cp1257) */
53 #include <cp437_uni.h> /* DosLatinUS (cp437) */
54 #include <cp737_uni.h> /* DosGreek (cp737) */
55 #include <cp775_uni.h> /* DosBaltRim (cp775) */
56 #include <cp850_uni.h> /* DosLatin1 (cp850) */
57 #include <cp852_uni.h> /* DosLatin2 (cp852) */
58 #include <cp857_uni.h> /* DosTurkish (cp857) */
59 #include <cp862_uni.h> /* DosHebrew (cp862) */
60 #include <cp864_uni.h> /* DosArabic (cp864) */
61 #include <cp866_uni.h> /* DosCyrillic (cp866) */
62 #include <cp869_uni.h> /* DosGreek2 (cp869) */
63 #include <def7_uni.h> /* 7 bit approximations */
64 #include <dmcs_uni.h> /* DEC Multinational */
65 #include <hp_uni.h> /* HP Roman8 */
66 #include <iso01_uni.h> /* ISO Latin 1 */
67 #include <iso02_uni.h> /* ISO Latin 2 */
68 #include <iso03_uni.h> /* ISO Latin 3 */
69 #include <iso04_uni.h> /* ISO Latin 4 */
70 #include <iso05_uni.h> /* ISO 8859-5 Cyrillic */
71 #include <iso06_uni.h> /* ISO 8859-6 Arabic */
72 #include <iso07_uni.h> /* ISO 8859-7 Greek */
73 #include <iso08_uni.h> /* ISO 8859-8 Hebrew */
74 #include <iso09_uni.h> /* ISO 8859-9 (Latin 5) */
75 #include <iso10_uni.h> /* ISO 8859-10 */
76 #include <iso13_uni.h> /* ISO 8859-13 (Latin 7) */
77 #include <iso14_uni.h> /* ISO 8859-14 (Latin 8) */
78 #include <iso15_uni.h> /* ISO 8859-15 (Latin 9) */
79 #include <koi8r_uni.h> /* KOI8-R Cyrillic */
80 #include <mac_uni.h> /* Macintosh (8 bit) */
81 #include <mnem2_suni.h> /* RFC 1345 Mnemonic */
82 #include <next_uni.h> /* NeXT character set */
83 #include <rfc_suni.h> /* RFC 1345 w/o Intro */
84 /* #include <utf8_uni.h> */ /* UNICODE UTF 8 */
85 #include <viscii_uni.h> /* Vietnamese (VISCII) */
86 #include <cp866u_uni.h> /* Ukrainian Cyrillic (866) */
87 #include <koi8u_uni.h> /* Ukrainian Cyrillic (koi8-u */
88 #include <pt154_uni.h> /* Cyrillic-Asian (PT154) */
89
90 #ifdef CAN_AUTODETECT_DISPLAY_CHARSET
91 int auto_display_charset = -1;
92 #endif
93
94 static const char *UC_GNsetMIMEnames[4] =
95 {
96 "iso-8859-1", "x-dec-graphics", "cp437", "x-transparent"
97 };
98
99 static int UC_GNhandles[4] =
100 {
101 -1, -1, -1, -1
102 };
103
104 /*
105 * Some of the code below, and some of the comments, are left in for
106 * historical reasons. Not all those tables below are currently
107 * really needed (and what with all those hardwired codepoints),
108 * but let's keep them around for now. They may come in handy if we
109 * decide to make more extended use of the mechanisms (including e.g.
110 * for chars < 127...). - KW
111 */
112
113 static u16 translations[][256] =
114 {
115 /*
116 * 8-bit Latin-1 mapped to Unicode -- trivial mapping.
117 */
118 {
119 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
120 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
121 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017,
122 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f,
123 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027,
124 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f,
125 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
126 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f,
127 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
128 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f,
129 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
130 0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f,
131 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
132 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f,
133 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
134 0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x007f,
135 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,
136 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,
137 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,
138 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,
139 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7,
140 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af,
141 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7,
142 0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf,
143 0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7,
144 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,
145 0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7,
146 0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df,
147 0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7,
148 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,
149 0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7,
150 0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff
151 },
152 /*
153 * VT100 graphics mapped to Unicode.
154 */
155 {
156 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
157 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
158 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017,
159 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f,
160 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027,
161 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f,
162 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
163 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f,
164 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
165 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f,
166 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
167 0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x00a0,
168 0x25c6, 0x2592, 0x2409, 0x240c, 0x240d, 0x240a, 0x00b0, 0x00b1,
169 0x2424, 0x240b, 0x2518, 0x2510, 0x250c, 0x2514, 0x253c, 0xf800,
170 0xf801, 0x2500, 0xf803, 0xf804, 0x251c, 0x2524, 0x2534, 0x252c,
171 0x2502, 0x2264, 0x2265, 0x03c0, 0x2260, 0x00a3, 0x00b7, 0x007f,
172 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,
173 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,
174 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,
175 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,
176 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7,
177 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af,
178 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7,
179 0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf,
180 0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7,
181 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,
182 0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7,
183 0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df,
184 0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7,
185 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,
186 0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7,
187 0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff
188 },
189 /*
190 * IBM Codepage 437 mapped to Unicode.
191 */
192 {
193 0x0000, 0x263a, 0x263b, 0x2665, 0x2666, 0x2663, 0x2660, 0x2022,
194 0x25d8, 0x25cb, 0x25d9, 0x2642, 0x2640, 0x266a, 0x266b, 0x263c,
195 0x25ba, 0x25c4, 0x2195, 0x203c, 0x00b6, 0x00a7, 0x25ac, 0x21a8,
196 0x2191, 0x2193, 0x2192, 0x2190, 0x221f, 0x2194, 0x25b2, 0x25bc,
197 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027,
198 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f,
199 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
200 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f,
201 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
202 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f,
203 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
204 0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f,
205 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
206 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f,
207 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
208 0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x2302,
209 0x00c7, 0x00fc, 0x00e9, 0x00e2, 0x00e4, 0x00e0, 0x00e5, 0x00e7,
210 0x00ea, 0x00eb, 0x00e8, 0x00ef, 0x00ee, 0x00ec, 0x00c4, 0x00c5,
211 0x00c9, 0x00e6, 0x00c6, 0x00f4, 0x00f6, 0x00f2, 0x00fb, 0x00f9,
212 0x00ff, 0x00d6, 0x00dc, 0x00a2, 0x00a3, 0x00a5, 0x20a7, 0x0192,
213 0x00e1, 0x00ed, 0x00f3, 0x00fa, 0x00f1, 0x00d1, 0x00aa, 0x00ba,
214 0x00bf, 0x2310, 0x00ac, 0x00bd, 0x00bc, 0x00a1, 0x00ab, 0x00bb,
215 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x2561, 0x2562, 0x2556,
216 0x2555, 0x2563, 0x2551, 0x2557, 0x255d, 0x255c, 0x255b, 0x2510,
217 0x2514, 0x2534, 0x252c, 0x251c, 0x2500, 0x253c, 0x255e, 0x255f,
218 0x255a, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x2567,
219 0x2568, 0x2564, 0x2565, 0x2559, 0x2558, 0x2552, 0x2553, 0x256b,
220 0x256a, 0x2518, 0x250c, 0x2588, 0x2584, 0x258c, 0x2590, 0x2580,
221 0x03b1, 0x00df, 0x0393, 0x03c0, 0x03a3, 0x03c3, 0x00b5, 0x03c4,
222 0x03a6, 0x0398, 0x03a9, 0x03b4, 0x221e, 0x03c6, 0x03b5, 0x2229,
223 0x2261, 0x00b1, 0x2265, 0x2264, 0x2320, 0x2321, 0x00f7, 0x2248,
224 0x00b0, 0x2219, 0x00b7, 0x221a, 0x207f, 0x00b2, 0x25a0, 0x00a0
225 },
226 /*
227 * User mapping -- default to codes for direct font mapping.
228 */
229 {
230 0xf000, 0xf001, 0xf002, 0xf003, 0xf004, 0xf005, 0xf006, 0xf007,
231 0xf008, 0xf009, 0xf00a, 0xf00b, 0xf00c, 0xf00d, 0xf00e, 0xf00f,
232 0xf010, 0xf011, 0xf012, 0xf013, 0xf014, 0xf015, 0xf016, 0xf017,
233 0xf018, 0xf019, 0xf01a, 0xf01b, 0xf01c, 0xf01d, 0xf01e, 0xf01f,
234 0xf020, 0xf021, 0xf022, 0xf023, 0xf024, 0xf025, 0xf026, 0xf027,
235 0xf028, 0xf029, 0xf02a, 0xf02b, 0xf02c, 0xf02d, 0xf02e, 0xf02f,
236 0xf030, 0xf031, 0xf032, 0xf033, 0xf034, 0xf035, 0xf036, 0xf037,
237 0xf038, 0xf039, 0xf03a, 0xf03b, 0xf03c, 0xf03d, 0xf03e, 0xf03f,
238 0xf040, 0xf041, 0xf042, 0xf043, 0xf044, 0xf045, 0xf046, 0xf047,
239 0xf048, 0xf049, 0xf04a, 0xf04b, 0xf04c, 0xf04d, 0xf04e, 0xf04f,
240 0xf050, 0xf051, 0xf052, 0xf053, 0xf054, 0xf055, 0xf056, 0xf057,
241 0xf058, 0xf059, 0xf05a, 0xf05b, 0xf05c, 0xf05d, 0xf05e, 0xf05f,
242 0xf060, 0xf061, 0xf062, 0xf063, 0xf064, 0xf065, 0xf066, 0xf067,
243 0xf068, 0xf069, 0xf06a, 0xf06b, 0xf06c, 0xf06d, 0xf06e, 0xf06f,
244 0xf070, 0xf071, 0xf072, 0xf073, 0xf074, 0xf075, 0xf076, 0xf077,
245 0xf078, 0xf079, 0xf07a, 0xf07b, 0xf07c, 0xf07d, 0xf07e, 0xf07f,
246 0xf080, 0xf081, 0xf082, 0xf083, 0xf084, 0xf085, 0xf086, 0xf087,
247 0xf088, 0xf089, 0xf08a, 0xf08b, 0xf08c, 0xf08d, 0xf08e, 0xf08f,
248 0xf090, 0xf091, 0xf092, 0xf093, 0xf094, 0xf095, 0xf096, 0xf097,
249 0xf098, 0xf099, 0xf09a, 0xf09b, 0xf09c, 0xf09d, 0xf09e, 0xf09f,
250 0xf0a0, 0xf0a1, 0xf0a2, 0xf0a3, 0xf0a4, 0xf0a5, 0xf0a6, 0xf0a7,
251 0xf0a8, 0xf0a9, 0xf0aa, 0xf0ab, 0xf0ac, 0xf0ad, 0xf0ae, 0xf0af,
252 0xf0b0, 0xf0b1, 0xf0b2, 0xf0b3, 0xf0b4, 0xf0b5, 0xf0b6, 0xf0b7,
253 0xf0b8, 0xf0b9, 0xf0ba, 0xf0bb, 0xf0bc, 0xf0bd, 0xf0be, 0xf0bf,
254 0xf0c0, 0xf0c1, 0xf0c2, 0xf0c3, 0xf0c4, 0xf0c5, 0xf0c6, 0xf0c7,
255 0xf0c8, 0xf0c9, 0xf0ca, 0xf0cb, 0xf0cc, 0xf0cd, 0xf0ce, 0xf0cf,
256 0xf0d0, 0xf0d1, 0xf0d2, 0xf0d3, 0xf0d4, 0xf0d5, 0xf0d6, 0xf0d7,
257 0xf0d8, 0xf0d9, 0xf0da, 0xf0db, 0xf0dc, 0xf0dd, 0xf0de, 0xf0df,
258 0xf0e0, 0xf0e1, 0xf0e2, 0xf0e3, 0xf0e4, 0xf0e5, 0xf0e6, 0xf0e7,
259 0xf0e8, 0xf0e9, 0xf0ea, 0xf0eb, 0xf0ec, 0xf0ed, 0xf0ee, 0xf0ef,
260 0xf0f0, 0xf0f1, 0xf0f2, 0xf0f3, 0xf0f4, 0xf0f5, 0xf0f6, 0xf0f7,
261 0xf0f8, 0xf0f9, 0xf0fa, 0xf0fb, 0xf0fc, 0xf0fd, 0xf0fe, 0xf0ff
262 }
263 };
264 static u16 *UC_translate = NULL;
265
266 static struct UC_charset UCInfo[MAXCHARSETS];
267
268 /*
269 * The standard kernel character-to-font mappings are not invertible
270 * -- this is just a best effort.
271 */
272 #define MAX_GLYPH 512 /* Max possible glyph value */
273
274 static unsigned char *inv_translate = NULL;
275 static unsigned char inv_norm_transl[MAX_GLYPH];
276 static unsigned char *inverse_translations[4] =
277 {NULL, NULL, NULL, NULL};
278
279 static void set_inverse_transl(int i);
280 static u16 *set_translate(int m);
281 static int UC_valid_UC_charset(int UC_charset_hndl);
282 static void UC_con_set_trans(int UC_charset_in_hndl, int Gn, int update_flag);
283 static int con_insert_unipair(unsigned unicode, unsigned fontpos, int fordefault);
284 static int con_insert_unipair_str(unsigned unicode, const char *replace_str, int fordefault);
285 static void con_clear_unimap(int fordefault);
286 static void con_clear_unimap_str(int fordefault);
287 static void con_set_default_unimap(void);
288 static int UC_con_set_unimap(int UC_charset_out_hndl, int update_flag);
289 static int UC_con_set_unimap_str(unsigned ct, struct unipair_str *list, int fordefault);
290 static int conv_uni_to_pc(long ucs, int usedefault);
291 static int conv_uni_to_str(char *outbuf, int buflen, UCode_t ucs, int usedefault);
292 static void UCconsole_map_init(void);
293 static int UC_MapGN(int UChndl, int update_flag);
294 static int UC_FindGN_byMIME(const char *UC_MIMEcharset);
295 static void UCreset_allocated_LYCharSets(void);
296 static STRING2PTR UC_setup_LYCharSets_repl(int UC_charset_in_hndl, unsigned lowest8);
297 static int UC_Register_with_LYCharSets(int s,
298 const char *UC_MIMEcharset,
299 const char *UC_LYNXcharset,
300 int lowest_eightbit);
301
302 #ifdef LY_FIND_LEAKS
303 static void UCfree_allocated_LYCharSets(void);
304 static void UCcleanup_mem(void);
305 #endif
306
307 static int default_UChndl = -1;
308
set_inverse_transl(int i)309 static void set_inverse_transl(int i)
310 {
311 int j, glyph;
312 u16 *p = translations[i];
313 unsigned char *q = inverse_translations[i];
314
315 if (!q) {
316 /*
317 * Slightly messy to avoid calling kmalloc too early.
318 */
319 q = inverse_translations[i] = ((i == LAT1_MAP) ?
320 inv_norm_transl :
321 typeMallocn(unsigned char, MAX_GLYPH));
322
323 if (!q)
324 return;
325 }
326 for (j = 0; j < MAX_GLYPH; j++)
327 q[j] = 0;
328
329 for (j = 0; j < E_TABSZ; j++) {
330 glyph = conv_uni_to_pc((long) p[j], 0);
331 if (glyph >= 0 && glyph < MAX_GLYPH && q[glyph] < 32) {
332 /*
333 * Prefer '-' above SHY etc.
334 */
335 q[glyph] = UCH(j);
336 }
337 }
338 }
339
set_translate(int m)340 static u16 *set_translate(int m)
341 {
342 if (!inverse_translations[m])
343 set_inverse_transl(m);
344 inv_translate = inverse_translations[m];
345 return translations[m];
346 }
347
UC_valid_UC_charset(int UC_charset_hndl)348 static int UC_valid_UC_charset(int UC_charset_hndl)
349 {
350 return (UC_charset_hndl >= 0 && UC_charset_hndl < UCNumCharsets);
351 }
352
UC_con_set_trans(int UC_charset_in_hndl,int Gn,int update_flag)353 static void UC_con_set_trans(int UC_charset_in_hndl,
354 int Gn,
355 int update_flag)
356 {
357 int i, j;
358 const u16 *p;
359 u16 *ptrans;
360
361 if (!UC_valid_UC_charset(UC_charset_in_hndl)) {
362 CTRACE((tfp, "UC_con_set_trans: Invalid charset handle %d.\n",
363 UC_charset_in_hndl));
364 return;
365 }
366 ptrans = translations[Gn];
367 p = UCInfo[UC_charset_in_hndl].unitable;
368 #if(0)
369 if (p == UC_current_unitable) { /* test whether pointers are equal */
370 return; /* nothing to be done */
371 }
372 /*
373 * The font is always 256 characters - so far.
374 */
375 con_clear_unimap();
376 #endif
377 for (i = 0; i < 256; i++) {
378 if ((j = UCInfo[UC_charset_in_hndl].unicount[i])) {
379 ptrans[i] = *p;
380 for (; j; j--) {
381 p++;
382 }
383 } else {
384 ptrans[i] = 0xfffd;
385 }
386 }
387 if (update_flag) {
388 set_inverse_transl(Gn); /* Update inverse translation for this one */
389 }
390 }
391
392 /*
393 * Unicode -> current font conversion
394 *
395 * A font has at most 512 chars, usually 256.
396 * But one font position may represent several Unicode chars.
397 * A hashtable is somewhat of a pain to deal with, so use a
398 * "paged table" instead. Simulation has shown the memory cost of
399 * this 3-level paged table scheme to be comparable to a hash table.
400 */
401 static int hashtable_contents_valid = 0; /* Use ASCII-only mode for bootup */
402 static int hashtable_str_contents_valid = 0;
403
404 static u16 **uni_pagedir[32] =
405 {
406 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
407 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
408 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
409 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
410 };
411
412 static char ***uni_pagedir_str[32] =
413 {
414 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
415 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
416 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
417 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
418 };
419
420 static const u16 *UC_current_unitable = NULL;
421 static struct unimapdesc_str *UC_current_unitable_str = NULL;
422
423 /*
424 * Keep a second set of structures for the translation designated
425 * as "default" - kw
426 */
427 static int unidefault_contents_valid = 0; /* Use ASCII-only mode for bootup */
428 static int unidefault_str_contents_valid = 0;
429
430 static u16 **unidefault_pagedir[32] =
431 {
432 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
433 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
434 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
435 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
436 };
437 static char ***unidefault_pagedir_str[32] =
438 {
439 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
440 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
441 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
442 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
443 };
444
445 static const u16 *UC_default_unitable = 0;
446 static const struct unimapdesc_str *UC_default_unitable_str = 0;
447
con_insert_unipair(unsigned unicode,unsigned fontpos,int fordefault)448 static int con_insert_unipair(unsigned unicode, unsigned fontpos, int fordefault)
449 {
450 int i;
451 unsigned n;
452 u16 **p1, *p2;
453
454 if (fordefault)
455 p1 = unidefault_pagedir[n = unicode >> 11];
456 else
457 p1 = uni_pagedir[n = unicode >> 11];
458 if (!p1) {
459 p1 = (u16 * *)malloc(32 * sizeof(u16 *));
460 if (fordefault)
461 unidefault_pagedir[n] = p1;
462 else
463 uni_pagedir[n] = p1;
464 if (!p1)
465 return ucError;
466
467 for (i = 0; i < 32; i++) {
468 p1[i] = NULL;
469 }
470 }
471
472 if (!(p2 = p1[n = (unicode >> 6) & 0x1f])) {
473 p2 = p1[n] = (u16 *) malloc(64 * sizeof(u16));
474 if (!p2)
475 return ucError;
476
477 for (i = 0; i < 64; i++) {
478 p2[i] = 0xffff; /* No glyph for this character (yet) */
479 }
480 }
481
482 p2[unicode & 0x3f] = (u16) fontpos;
483
484 return 0;
485 }
486
con_insert_unipair_str(unsigned unicode,const char * replace_str,int fordefault)487 static int con_insert_unipair_str(unsigned unicode, const char *replace_str,
488 int fordefault)
489 {
490 int i;
491 unsigned n;
492 char ***p1;
493 const char **p2;
494
495 if (fordefault)
496 p1 = unidefault_pagedir_str[n = unicode >> 11];
497 else
498 p1 = uni_pagedir_str[n = unicode >> 11];
499 if (!p1) {
500 p1 = (char ***) malloc(32 * sizeof(char **));
501
502 if (fordefault)
503 unidefault_pagedir_str[n] = p1;
504 else
505 uni_pagedir_str[n] = p1;
506 if (!p1)
507 return ucError;
508
509 for (i = 0; i < 32; i++) {
510 p1[i] = NULL;
511 }
512 }
513
514 n = ((unicode >> 6) & 0x1f);
515 if (!p1[n]) {
516 p1[n] = (char **) malloc(64 * sizeof(char *));
517
518 if (!p1[n])
519 return ucError;
520
521 p2 = (const char **) p1[n];
522 for (i = 0; i < 64; i++) {
523 p2[i] = NULL; /* No replace string this character (yet) */
524 }
525 }
526 p2 = (const char **) p1[n];
527
528 p2[unicode & 0x3f] = replace_str;
529
530 return 0;
531 }
532
533 /*
534 * ui arg was a leftover, deleted. - KW
535 */
con_clear_unimap(int fordefault)536 static void con_clear_unimap(int fordefault)
537 {
538 int i, j;
539 u16 **p1;
540
541 if (fordefault) {
542 for (i = 0; i < 32; i++) {
543 if ((p1 = unidefault_pagedir[i]) != NULL) {
544 for (j = 0; j < 32; j++) {
545 FREE(p1[j]);
546 }
547 FREE(p1);
548 }
549 unidefault_pagedir[i] = NULL;
550 }
551
552 unidefault_contents_valid = 1;
553 } else {
554 for (i = 0; i < 32; i++) {
555 if ((p1 = uni_pagedir[i]) != NULL) {
556 for (j = 0; j < 32; j++) {
557 FREE(p1[j]);
558 }
559 FREE(p1);
560 }
561 uni_pagedir[i] = NULL;
562 }
563
564 hashtable_contents_valid = 1;
565 }
566 }
567
con_clear_unimap_str(int fordefault)568 static void con_clear_unimap_str(int fordefault)
569 {
570 int i, j;
571 char ***p1;
572
573 if (fordefault) {
574 for (i = 0; i < 32; i++) {
575 if ((p1 = unidefault_pagedir_str[i]) != NULL) {
576 for (j = 0; j < 32; j++) {
577 FREE(p1[j]);
578 }
579 FREE(p1);
580 }
581 unidefault_pagedir_str[i] = NULL;
582 }
583
584 unidefault_str_contents_valid = 1; /* ??? probably no use... */
585 } else {
586 for (i = 0; i < 32; i++) {
587 if ((p1 = uni_pagedir_str[i]) != NULL) {
588 for (j = 0; j < 32; j++) {
589 FREE(p1[j]);
590 }
591 FREE(p1);
592 }
593 uni_pagedir_str[i] = NULL;
594 }
595
596 hashtable_str_contents_valid = 1; /* ??? probably no use... */
597 }
598 }
599
600 /*
601 * Loads the unimap for the hardware font, as defined in uni_hash.tbl.
602 * The representation used was the most compact I could come up
603 * with. This routine is executed at sys_setup time, and when the
604 * PIO_FONTRESET ioctl is called.
605 */
con_set_default_unimap(void)606 static void con_set_default_unimap(void)
607 {
608 int i, j;
609 const u16 *p;
610
611 /*
612 * The default font is always 256 characters.
613 */
614 con_clear_unimap(1);
615
616 p = dfont_unitable;
617 for (i = 0; i < 256; i++) {
618 for (j = dfont_unicount[i]; j; j--) {
619 con_insert_unipair(*(p++), (u16) i, 1);
620 }
621 }
622
623 UC_default_unitable = dfont_unitable;
624
625 con_clear_unimap_str(1);
626 UC_con_set_unimap_str(dfont_replacedesc.entry_ct, repl_map, 1);
627 UC_default_unitable_str = &dfont_replacedesc;
628 }
629
630 int UCNumCharsets = 0;
631
632 int UCLYhndl_HTFile_for_unspec = -1;
633 int UCLYhndl_HTFile_for_unrec = -1;
634 int UCLYhndl_for_unspec = -1;
635 int UCLYhndl_for_unrec = -1;
636
637 /* easy to type, will initialize later */
638 int LATIN1 = -1; /* UCGetLYhndl_byMIME("iso-8859-1") */
639 int US_ASCII = -1; /* UCGetLYhndl_byMIME("us-ascii") */
640 int UTF8_handle = -1; /* UCGetLYhndl_byMIME("utf-8") */
641 int TRANSPARENT = -1; /* UCGetLYhndl_byMIME("x-transparent") */
642
UC_con_set_unimap(int UC_charset_out_hndl,int update_flag)643 static int UC_con_set_unimap(int UC_charset_out_hndl,
644 int update_flag)
645 {
646 int i, j;
647 const u16 *p;
648
649 if (!UC_valid_UC_charset(UC_charset_out_hndl)) {
650 CTRACE((tfp, "UC_con_set_unimap: Invalid charset handle %d.\n",
651 UC_charset_out_hndl));
652 return ucError;
653 }
654
655 p = UCInfo[UC_charset_out_hndl].unitable;
656 if (p == UC_current_unitable) { /* test whether pointers are equal */
657 return update_flag; /* nothing to be done */
658 }
659 UC_current_unitable = p;
660
661 /*
662 * The font is always 256 characters - so far.
663 */
664 con_clear_unimap(0);
665
666 for (i = 0; i < 256; i++) {
667 for (j = UCInfo[UC_charset_out_hndl].unicount[i]; j; j--) {
668 con_insert_unipair(*(p++), (u16) i, 0);
669 }
670 }
671
672 if (update_flag) {
673 for (i = 0; i <= 3; i++) {
674 set_inverse_transl(i); /* Update all inverse translations */
675 }
676 }
677
678 return 0;
679 }
680
UC_con_set_unimap_str(unsigned ct,struct unipair_str * list,int fordefault)681 static int UC_con_set_unimap_str(unsigned ct, struct unipair_str *list,
682 int fordefault)
683 {
684 int err = 0, err1;
685
686 while (ct--) {
687 if ((err1 = con_insert_unipair_str(list->unicode,
688 list->replace_str,
689 fordefault)) != 0) {
690 err = err1;
691 }
692 list++;
693 }
694
695 /*
696 * No inverse translations for replacement strings!
697 */
698 if (!err) {
699 if (fordefault)
700 unidefault_str_contents_valid = 1;
701 else
702 hashtable_str_contents_valid = 1;
703 }
704
705 return err;
706 }
707
conv_uni_to_pc(long ucs,int usedefault)708 static int conv_uni_to_pc(long ucs,
709 int usedefault)
710 {
711 int h;
712 u16 **p1, *p2;
713
714 /*
715 * Only 16-bit codes supported at this time.
716 */
717 if (ucs > 0xffff) {
718 /*
719 * U+FFFD: REPLACEMENT CHARACTER.
720 */
721 ucs = 0xfffd;
722 } else if (ucs < 0x20 || ucs >= 0xfffe) {
723 /*
724 * Not a printable character.
725 */
726 return ucError;
727 } else if (ucs == 0xfeff || (ucs >= 0x200b && ucs <= 0x200f)) {
728 /*
729 * Zero-width space.
730 */
731 return ucZeroWidth;
732 } else if ((ucs & ~UNI_DIRECT_MASK) == UNI_DIRECT_BASE) {
733 /*
734 * UNI_DIRECT_BASE indicates the start of the region in the
735 * User Zone which always has a 1:1 mapping to the currently
736 * loaded font. The UNI_DIRECT_MASK indicates the bit span
737 * of the region.
738 */
739 return (ucs & UNI_DIRECT_MASK);
740 }
741
742 if (usedefault) {
743 if (!unidefault_contents_valid)
744 return ucInvalidHash;
745 p1 = unidefault_pagedir[ucs >> 11];
746 } else {
747 if (!hashtable_contents_valid)
748 return ucInvalidHash;
749 p1 = uni_pagedir[ucs >> 11];
750 }
751
752 if (p1 &&
753 (p2 = p1[(ucs >> 6) & 0x1f]) &&
754 (h = p2[ucs & 0x3f]) < MAX_GLYPH) {
755 return h;
756 }
757
758 /*
759 * Not found.
760 */
761 return ucNotFound;
762 }
763
764 /*
 * Note: the contents of outbuf are not changed when a negative value is returned!
766 */
/*
 * Look up the replacement string for code point ucs and copy it to outbuf.
 *
 * outbuf      destination buffer, written only on success.
 * buflen      size of outbuf in bytes; at most buflen - 1 bytes of the
 *             replacement are copied and the result is always
 *             NUL-terminated.
 * ucs         code point; values above 0xffff are looked up as U+FFFD.
 * usedefault  nonzero: use the default string directory; zero: use the
 *             directory of the currently loaded charset.
 *
 * Returns 1 when a string was stored, otherwise a negative status:
 *   ucError           ucs is below 0x20, or is 0xfffe / 0xffff
 *   ucZeroWidth       ucs is 0xfeff or in 0x200b..0x200f
 *   ucInvalidHash     the selected directory has not been marked valid
 *   ucBufferTooSmall  a string was found but buflen is less than 1
 *   ucNotFound        no string is stored for ucs
 */
static int conv_uni_to_str(char *outbuf,
                           int buflen,
                           UCode_t ucs,
                           int usedefault)
{
    char *h;
    char ***p1, **p2;

    /*
     * Only 16-bit codes supported at this time.
     */
    if (ucs > 0xffff) {
        /*
         * U+FFFD: REPLACEMENT CHARACTER.
         */
        ucs = 0xfffd;
        /*
         * Maybe the following two cases should be allowed here?? - KW
         */
    } else if (ucs < 0x20 || ucs >= 0xfffe) {
        /*
         * Not a printable character.
         */
        return ucError;
    } else if (ucs == 0xfeff || (ucs >= 0x200b && ucs <= 0x200f)) {
        /*
         * Zero-width space.
         */
        return ucZeroWidth;
    }

    if (usedefault) {
        if (!unidefault_str_contents_valid)
            return ucInvalidHash;
        p1 = unidefault_pagedir_str[ucs >> 11];
    } else {
        if (!hashtable_str_contents_valid)
            return ucInvalidHash;
        p1 = uni_pagedir_str[ucs >> 11];
    }

    if (p1 &&
        (p2 = p1[(ucs >> 6) & 0x1f]) &&
        (h = p2[ucs & 0x3f])) {
        /*
         * buflen - 1 is used as a byte count below; refuse sizes that
         * would make it negative.
         */
        if (buflen < 1)
            return ucBufferTooSmall;

        StrNCpy(outbuf, h, (buflen - 1));
        /*
         * StrNCpy is defined outside this file.  If it behaves like
         * strncpy, a replacement of buflen - 1 or more bytes is copied
         * without a terminator; callers run strlen() on outbuf, so
         * terminate explicitly.
         */
        outbuf[buflen - 1] = '\0';
        return 1;               /* ok ! */
    }

    /*
     * Not found.
     */
    return ucNotFound;
}
820
821 int UCInitialized = 0;
822
823 /*
824 * [ original comment: - KW ]
825 * This is called at sys_setup time, after memory and the console are
826 * initialized. It must be possible to call kmalloc(..., GFP_KERNEL)
827 * from this function, hence the call from sys_setup.
828 */
UCconsole_map_init(void)829 static void UCconsole_map_init(void)
830 {
831 con_set_default_unimap();
832 UCInitialized = 1;
833 }
834
835 /*
836 * OK now, finally, some stuff that is more specifically for Lynx: - KW
837 */
/*
 * Translate a Unicode value to a single character code of charset_out.
 *
 * unicode      code point to translate.
 * charset_out  index into LYCharSet_UC[] (not range-checked here).
 *
 * When the charset has no chartrans handle and a negative codepage, values
 * below 128 are returned unchanged and everything else yields that
 * (negative) codepage value.  When it has no handle but a non-negative
 * codepage, the default handle is used, or ucCannotOutput is returned if
 * there is none.
 *
 * Otherwise the lookup order is: the charset's own table (loaded on
 * demand), then the default table if the charset is the default or asks
 * for it via trydefault, then U+FFFD in the same order.  A non-negative
 * result is a character code; a negative result is a status value from
 * the helpers.
 */
int UCTransUniChar(UCode_t unicode,
                   int charset_out)
{
    int result = 0;
    int isdefault;
    int trydefault = 0;
    int hndl = LYCharSet_UC[charset_out].UChndl;

    if (hndl >= 0) {
        isdefault = UCInfo[hndl].replacedesc.isdefault;
        trydefault = UCInfo[hndl].replacedesc.trydefault;
    } else {
        if (LYCharSet_UC[charset_out].codepage < 0) {
            if (unicode < 128)
                return (int) unicode;
            return LYCharSet_UC[charset_out].codepage;
        }
        hndl = default_UChndl;
        if (hndl < 0)
            return ucCannotOutput;
        isdefault = 1;
    }

    if (!isdefault) {
        /* make sure the glyph directory belongs to this charset */
        if (UCInfo[hndl].unitable != UC_current_unitable) {
            result = UC_con_set_unimap(hndl, 1);
            if (result < 0)
                return result;
        }
        result = conv_uni_to_pc(unicode, 0);
        if (result >= 0)
            return result;
    }

    if (isdefault || trydefault) {
        result = conv_uni_to_pc(unicode, 1);
        if (result >= 0)
            return result;
    }

    /* nothing found: fall back to the replacement character */
    if (!isdefault && (result == ucNotFound))
        result = conv_uni_to_pc(0xfffdL, 0);

    if ((isdefault || trydefault) && (result == ucNotFound))
        result = conv_uni_to_pc(0xfffdL, 1);

    return result;
}
891
892 /*
893 * Returns string length, or negative value for error.
894 */
/*
 * Translate a Unicode value to a string in charset_out.
 *
 * outbuf           destination buffer.
 * buflen           size of outbuf; values below 2 yield ucBufferTooSmall.
 * unicode          code point to translate.
 * charset_out      index into LYCharSet_UC[] (not range-checked here).
 * chk_single_flag  when nonzero, single-character translations (codes
 *                  >= 32) are tried in addition to replacement strings.
 *
 * Lookup order: single character in the charset's own (or default) glyph
 * table, replacement string in the charset's own table, single character
 * in the default table, optional iconv conversion (only when built with
 * EXP_JAPANESEUTF8_SUPPORT and the charset has codepage and codepoints
 * both 0), replacement string in the default table, and finally the same
 * for U+FFFD.
 *
 * Returns the length of the string stored in outbuf, or a negative status
 * value.
 */
int UCTransUniCharStr(char *outbuf,
                      int buflen,
                      UCode_t unicode,
                      int charset_out,
                      int chk_single_flag)
{
    int rc = ucUnknown, src = 0;
    int UChndl_out;
    int isdefault, trydefault = 0;
    struct unimapdesc_str *repl;
    const u16 *ut;

    if (buflen < 2)
        return ucBufferTooSmall;

    if ((UChndl_out = LYCharSet_UC[charset_out].UChndl) < 0) {
        if (LYCharSet_UC[charset_out].codepage < 0)
            return LYCharSet_UC[charset_out].codepage;
        if ((UChndl_out = default_UChndl) < 0)
            return ucCannotOutput;
        isdefault = 1;
    } else {
        isdefault = UCInfo[UChndl_out].replacedesc.isdefault;
        trydefault = UCInfo[UChndl_out].replacedesc.trydefault;
    }

    if (chk_single_flag) {
        if (!isdefault) {
            ut = UCInfo[UChndl_out].unitable;
            if (ut != UC_current_unitable) {
                src = UC_con_set_unimap(UChndl_out, 1);
                if (src < 0) {
                    return src;
                }
            }
        }
        src = conv_uni_to_pc(unicode, isdefault);
        if (src >= 32) {
            outbuf[0] = (char) src;
            outbuf[1] = '\0';
            return 1;
        }
    }

    repl = &(UCInfo[UChndl_out].replacedesc);
    if (!isdefault) {
        /* reload the string directory when it belongs to another charset */
        if (repl != UC_current_unitable_str) {
            con_clear_unimap_str(0);
            (void) UC_con_set_unimap_str(repl->entry_ct, repl->entries, 0);
            UC_current_unitable_str = repl;
        }
        rc = conv_uni_to_str(outbuf, buflen, unicode, 0);
        if (rc >= 0)
            return (int) strlen(outbuf);
    }
    if (trydefault && chk_single_flag) {
        src = conv_uni_to_pc(unicode, 1);
        if (src >= 32) {
            outbuf[0] = (char) src;
            outbuf[1] = '\0';
            return 1;
        }
    }
    if (isdefault || trydefault) {
#ifdef EXP_JAPANESEUTF8_SUPPORT
        if (LYCharSet_UC[charset_out].codepage == 0 &&
            LYCharSet_UC[charset_out].codepoints == 0) {
            iconv_t cd;
            char str[3], *pin, *pout;
            size_t inleft, outleft;
            char *tocode = NULL;

            /* the low 16 bits of the code point as big-endian UTF-16 */
            str[0] = (char) (unicode >> 8);
            str[1] = (char) (unicode & 0xFF);
            str[2] = 0;
            pin = str;
            inleft = 2;
            pout = outbuf;
            /*
             * Keep one byte in reserve: on success a terminating NUL is
             * stored at *pout, which would lie one past the end of outbuf
             * if iconv were allowed to fill all buflen bytes.  buflen is
             * at least 2 here.
             */
            outleft = (size_t) (buflen - 1);
            /*
             * Try TRANSLIT first, since it is an extension which can provide
             * translations when there is no available exact translation to
             * the target character set.
             */
            HTSprintf0(&tocode, "%s//TRANSLIT", LYCharSet_UC[charset_out].MIMEname);
            cd = iconv_open(tocode, "UTF-16BE");
            if (cd == (iconv_t) -1) {
                /*
                 * Try again, without TRANSLIT
                 */
                HTSprintf0(&tocode, "%s", LYCharSet_UC[charset_out].MIMEname);
                cd = iconv_open(tocode, "UTF-16BE");

                if (cd == (iconv_t) -1) {
                    CTRACE((tfp,
                            "Warning: Cannot transcode form charset %s to %s!\n",
                            "UTF-16BE", tocode));
                }
            }
            FREE(tocode);

            if (cd != (iconv_t) -1) {
                rc = (int) iconv(cd, (ICONV_CONST char **) &pin, &inleft,
                                 &pout, &outleft);
                iconv_close(cd);
                if ((pout - outbuf) == 3) {
                    /*
                     * Three output bytes are treated as unsupported; fall
                     * through to the replacement-string lookup below.
                     */
                    CTRACE((tfp,
                            "It seems to be a JIS X 0201 code(%" PRI_UCode_t
                            "). Not supported.\n", unicode));
                } else if (rc >= 0) {
                    *pout = '\0';
                    return (int) strlen(outbuf);
                }
            }
        }
#endif
        rc = conv_uni_to_str(outbuf, buflen, unicode, 1);
        if (rc >= 0)
            return (int) strlen(outbuf);
    }
    if (rc == ucNotFound) {
        /* no replacement string: try one for U+FFFD instead */
        if (!isdefault)
            rc = conv_uni_to_str(outbuf, buflen, 0xfffdL, 0);
        if ((rc == ucNotFound) && (isdefault || trydefault))
            rc = conv_uni_to_str(outbuf, buflen, 0xfffdL, 1);
        if (rc >= 0)
            return (int) strlen(outbuf);
    }
    if (chk_single_flag && src == ucNotFound) {
        /* last resort: a single-character glyph for U+FFFD */
        if (!isdefault)
            rc = conv_uni_to_pc(0xfffdL, 0);
        if ((rc == ucNotFound) && (isdefault || trydefault))
            rc = conv_uni_to_pc(0xfffdL, 1);
        if (rc >= 32) {
            outbuf[0] = (char) rc;
            outbuf[1] = '\0';
            return 1;
        }
        return rc;
    }
    return ucNotFound;
}
1041
1042 static int UC_lastautoGN = 0;
1043
UC_MapGN(int UChndl,int update_flag)1044 static int UC_MapGN(int UChndl,
1045 int update_flag)
1046 {
1047 int i, Gn, found, lasthndl;
1048
1049 found = 0;
1050 Gn = -1;
1051 for (i = 0; i < 4 && Gn < 0; i++) {
1052 if (UC_GNhandles[i] < 0) {
1053 Gn = i;
1054 } else if (UC_GNhandles[i] == UChndl) {
1055 Gn = i;
1056 found = 1;
1057 }
1058 }
1059 if (found)
1060 return Gn;
1061 if (Gn >= 0) {
1062 UCInfo[UChndl].GN = Gn;
1063 UC_GNhandles[Gn] = UChndl;
1064 } else {
1065 if (UC_lastautoGN == GRAF_MAP) {
1066 Gn = IBMPC_MAP;
1067 } else {
1068 Gn = GRAF_MAP;
1069 }
1070 UC_lastautoGN = Gn;
1071 lasthndl = UC_GNhandles[Gn];
1072 UCInfo[lasthndl].GN = -1;
1073 UCInfo[UChndl].GN = Gn;
1074 UC_GNhandles[Gn] = UChndl;
1075 }
1076 CTRACE((tfp, "UC_MapGN: Using %d <- %d (%s)\n",
1077 Gn, UChndl, UCInfo[UChndl].MIMEname));
1078 UC_con_set_trans(UChndl, Gn, update_flag);
1079 return Gn;
1080 }
1081
UCTransChar(int ch_in,int charset_in,int charset_out)1082 int UCTransChar(int ch_in,
1083 int charset_in,
1084 int charset_out)
1085 {
1086 UCode_t unicode;
1087 int Gn;
1088 int rc = ucNotFound;
1089 int UChndl_in, UChndl_out;
1090 int isdefault, trydefault = 0;
1091 const u16 *ut;
1092 int upd = 0;
1093
1094 if (charset_in == charset_out)
1095 return UCH(ch_in);
1096 if (charset_in < 0)
1097 return ucCannotConvert;
1098 if ((UChndl_in = LYCharSet_UC[charset_in].UChndl) < 0)
1099 return ucCannotConvert;
1100 if ((UChndl_out = LYCharSet_UC[charset_out].UChndl) < 0) {
1101 if (LYCharSet_UC[charset_out].codepage < 0)
1102 return LYCharSet_UC[charset_out].codepage;
1103 if ((UChndl_out = default_UChndl) < 0)
1104 return ucCannotOutput;
1105 isdefault = 1;
1106 } else {
1107 isdefault = UCInfo[UChndl_out].replacedesc.isdefault;
1108 trydefault = UCInfo[UChndl_out].replacedesc.trydefault;
1109 }
1110 if (!UCInfo[UChndl_in].num_uni)
1111 return ucCannotConvert;
1112 if ((Gn = UCInfo[UChndl_in].GN) < 0) {
1113 Gn = UC_MapGN(UChndl_in, 0);
1114 upd = 1;
1115 }
1116
1117 ut = UCInfo[UChndl_out].unitable;
1118 if (!isdefault) {
1119 if (ut == UC_current_unitable) {
1120 if (upd) {
1121 set_inverse_transl(Gn);
1122 }
1123 } else {
1124 rc = UC_con_set_unimap(UChndl_out, 1);
1125 if (rc > 0) {
1126 set_inverse_transl(Gn);
1127 } else if (rc < 0) {
1128 return rc;
1129 }
1130 }
1131 }
1132 UC_translate = set_translate(Gn);
1133 unicode = UC_translate[UCH(ch_in)];
1134 if (!isdefault) {
1135 rc = conv_uni_to_pc(unicode, 0);
1136 if (rc >= 0)
1137 return rc;
1138 }
1139 if ((rc == ucNotFound) && (isdefault || trydefault)) {
1140 rc = conv_uni_to_pc(unicode, 1);
1141 }
1142 if ((rc == ucNotFound) && !isdefault) {
1143 rc = conv_uni_to_pc(0xfffdL, 0);
1144 }
1145 if ((rc == ucNotFound) && (isdefault || trydefault)) {
1146 rc = conv_uni_to_pc(0xfffdL, 1);
1147 }
1148 return rc;
1149 }
1150
1151 #ifdef EXP_JAPANESEUTF8_SUPPORT
UCTransJPToUni(char * inbuf,int buflen,int charset_in)1152 UCode_t UCTransJPToUni(char *inbuf,
1153 int buflen,
1154 int charset_in)
1155 {
1156 char outbuf[3], *pin, *pout;
1157 size_t ilen, olen;
1158 iconv_t cd;
1159
1160 pin = inbuf;
1161 pout = outbuf;
1162 ilen = 2;
1163 olen = (size_t) buflen;
1164
1165 cd = iconv_open("UTF-16BE", LYCharSet_UC[charset_in].MIMEname);
1166 (void) iconv(cd, (ICONV_CONST char **) &pin, &ilen, &pout, &olen);
1167 iconv_close(cd);
1168 if ((ilen == 0) && (olen == 0)) {
1169 return (((unsigned char) outbuf[0]) << 8) + (unsigned char) outbuf[1];
1170 }
1171 return ucCannotConvert;
1172 }
1173 #endif
1174
1175 /*
1176 * Translate a character to Unicode. If additional bytes are needed, this
1177 * returns ucNeedMore, based on its internal state. To reset the state,
1178 * call this with charset_in < 0.
1179 */
UCTransToUni(int ch_in,int charset_in)1180 UCode_t UCTransToUni(int ch_in,
1181 int charset_in)
1182 {
1183 static char buffer[10];
1184 static unsigned inx = 0;
1185
1186 UCode_t unicode;
1187 int Gn;
1188 unsigned char ch_iu = UCH(ch_in);
1189 int UChndl_in;
1190
1191 /*
1192 * Reset saved-state.
1193 */
1194 if (charset_in < 0) {
1195 inx = 0;
1196 return ucCannotConvert;
1197 } else if (charset_in == LATIN1) {
1198 return ch_iu;
1199 } else if (charset_in == UTF8_handle) {
1200 if (is8bits(ch_iu)) {
1201 unsigned need;
1202 char *ptr;
1203
1204 buffer[inx++] = (char) ch_iu;
1205 buffer[inx] = '\0';
1206 need = (unsigned) utf8_length(TRUE, buffer);
1207 if (need && (need + 1) == inx) {
1208 inx = 0;
1209 ptr = buffer;
1210 return UCGetUniFromUtf8String(&ptr);
1211 } else if (inx < sizeof(buffer) - 1) {
1212 return ucNeedMore;
1213 } else {
1214 inx = 0;
1215 }
1216 } else {
1217 inx = 0;
1218 }
1219 }
1220 #ifdef EXP_JAPANESEUTF8_SUPPORT
1221 if ((strcmp(LYCharSet_UC[charset_in].MIMEname, "shift_jis") == 0) ||
1222 (strcmp(LYCharSet_UC[charset_in].MIMEname, "euc-jp") == 0)) {
1223 char obuffer[3], *pin, *pout;
1224 size_t ilen, olen;
1225 iconv_t cd;
1226
1227 pin = buffer;
1228 pout = obuffer;
1229 ilen = olen = 2;
1230 if (strcmp(LYCharSet_UC[charset_in].MIMEname, "shift_jis") == 0) {
1231 if (inx == 0) {
1232 if (IS_SJIS_HI1(ch_iu) ||
1233 IS_SJIS_HI2(ch_iu)) {
1234 buffer[0] = (char) ch_in;
1235 inx = 1;
1236 return ucNeedMore;
1237 }
1238 } else {
1239 if (IS_SJIS_LO(ch_iu)) {
1240 buffer[1] = (char) ch_in;
1241 buffer[2] = 0;
1242
1243 cd = iconv_open("UTF-16BE", "Shift_JIS");
1244 (void) iconv(cd, (ICONV_CONST char **) &pin, &ilen, &pout, &olen);
1245 iconv_close(cd);
1246 inx = 0;
1247 if ((ilen == 0) && (olen == 0)) {
1248 return (UCH(obuffer[0]) << 8) + UCH(obuffer[1]);
1249 }
1250 }
1251 }
1252 }
1253 if (strcmp(LYCharSet_UC[charset_in].MIMEname, "euc-jp") == 0) {
1254 if (inx == 0) {
1255 if (IS_EUC_HI(ch_iu)) {
1256 buffer[0] = (char) ch_in;
1257 inx = 1;
1258 return ucNeedMore;
1259 }
1260 } else {
1261 if (IS_EUC_LOX(ch_iu)) {
1262 buffer[1] = (char) ch_in;
1263 buffer[2] = 0;
1264
1265 cd = iconv_open("UTF-16BE", "EUC-JP");
1266 (void) iconv(cd, (ICONV_CONST char **) &pin, &ilen, &pout, &olen);
1267 iconv_close(cd);
1268 inx = 0;
1269 if ((ilen == 0) && (olen == 0)) {
1270 return (UCH(obuffer[0]) << 8) + UCH(obuffer[1]);
1271 }
1272 }
1273 }
1274 }
1275 inx = 0;
1276 }
1277 #endif
1278 if (ch_iu < 128 && ch_iu >= 32)
1279 return ch_iu;
1280
1281 if (ch_iu < 32 &&
1282 LYCharSet_UC[charset_in].enc != UCT_ENC_8BIT_C0) {
1283 /*
1284 * Don't translate C0 chars except for specific charsets.
1285 */
1286 return ch_iu;
1287 } else if ((UChndl_in = LYCharSet_UC[charset_in].UChndl) < 0) {
1288 return ucCannotConvert;
1289 } else if (!UCInfo[UChndl_in].num_uni) {
1290 return ucCannotConvert;
1291 }
1292
1293 if ((Gn = UCInfo[UChndl_in].GN) < 0) {
1294 Gn = UC_MapGN(UChndl_in, 1);
1295 }
1296
1297 UC_translate = set_translate(Gn);
1298 unicode = UC_translate[ch_iu];
1299
1300 return unicode;
1301 }
1302
UCReverseTransChar(int ch_out,int charset_in,int charset_out)1303 int UCReverseTransChar(int ch_out,
1304 int charset_in,
1305 int charset_out)
1306 {
1307 int Gn;
1308 int rc = ucError;
1309 int UChndl_in, UChndl_out;
1310 int isdefault;
1311 int i_ch = UCH(ch_out);
1312 const u16 *ut;
1313
1314 if (charset_in == charset_out)
1315 return UCH(ch_out);
1316 if (charset_in < 0)
1317 return ucCannotConvert;
1318 if ((UChndl_in = LYCharSet_UC[charset_in].UChndl) < 0)
1319 return ucCannotConvert;
1320 if (!UCInfo[UChndl_in].num_uni)
1321 return ucCannotConvert;
1322 if (charset_out < 0)
1323 return ucCannotOutput;
1324 if ((UChndl_out = LYCharSet_UC[charset_out].UChndl) < 0) {
1325 if (LYCharSet_UC[charset_out].codepage < 0)
1326 return LYCharSet_UC[charset_out].codepage;
1327 if ((UChndl_out = default_UChndl) < 0)
1328 return ucCannotOutput;
1329 isdefault = 1;
1330 } else {
1331 isdefault = UCInfo[UChndl_out].replacedesc.isdefault;
1332 }
1333
1334 if (!isdefault) {
1335 /*
1336 * Try to use the inverse table if charset_out is not equivalent
1337 * to using just the default table. If it is, it should have
1338 * just ASCII chars and trying to back-translate those should
1339 * not give anything but themselves. - kw
1340 */
1341 ut = UCInfo[UChndl_out].unitable;
1342 if (ut == UC_current_unitable) {
1343 if ((Gn = UCInfo[UChndl_in].GN) < 0) {
1344 Gn = UC_MapGN(UChndl_in, 1);
1345 }
1346 UC_translate = set_translate(Gn);
1347 if (inv_translate)
1348 rc = inv_translate[i_ch];
1349 if (rc >= 32) {
1350 return rc;
1351 }
1352 }
1353 }
1354 return UCTransChar(ch_out, charset_out, charset_in);
1355 }
1356
1357 /*
1358 * Returns string length, or negative value for error.
1359 */
UCTransCharStr(char * outbuf,int buflen,int ch_in,int charset_in,int charset_out,int chk_single_flag)1360 int UCTransCharStr(char *outbuf,
1361 int buflen,
1362 int ch_in,
1363 int charset_in,
1364 int charset_out,
1365 int chk_single_flag)
1366 {
1367 UCode_t unicode;
1368 int Gn;
1369 int rc = ucUnknown, src = 0;
1370 int UChndl_in, UChndl_out;
1371 int isdefault, trydefault = 0;
1372 struct unimapdesc_str *repl;
1373 const u16 *ut;
1374 int upd = 0;
1375
1376 if (buflen < 2)
1377 return ucBufferTooSmall;
1378 if (chk_single_flag && charset_in == charset_out) {
1379 outbuf[0] = (char) ch_in;
1380 outbuf[1] = '\0';
1381 return 1;
1382 }
1383 if (charset_in < 0)
1384 return ucCannotConvert;
1385 if ((UChndl_in = LYCharSet_UC[charset_in].UChndl) < 0)
1386 return ucCannotConvert;
1387 if (!UCInfo[UChndl_in].num_uni)
1388 return ucCannotConvert;
1389 if ((UChndl_out = LYCharSet_UC[charset_out].UChndl) < 0) {
1390 if (LYCharSet_UC[charset_out].codepage < 0)
1391 return LYCharSet_UC[charset_out].codepage;
1392 if ((UChndl_out = default_UChndl) < 0)
1393 return ucCannotOutput;
1394 isdefault = 1;
1395 } else {
1396 isdefault = UCInfo[UChndl_out].replacedesc.isdefault;
1397 trydefault = UCInfo[UChndl_out].replacedesc.trydefault;
1398 }
1399 if ((Gn = UCInfo[UChndl_in].GN) < 0) {
1400 Gn = UC_MapGN(UChndl_in, !chk_single_flag);
1401 upd = chk_single_flag;
1402 }
1403
1404 UC_translate = set_translate(Gn);
1405 unicode = UC_translate[UCH(ch_in)];
1406
1407 if (chk_single_flag) {
1408 if (!isdefault) {
1409 ut = UCInfo[UChndl_out].unitable;
1410 if (ut == UC_current_unitable) {
1411 if (upd)
1412 set_inverse_transl(Gn);
1413 } else {
1414 src = UC_con_set_unimap(UChndl_out, 1);
1415 if (src > 0) {
1416 set_inverse_transl(Gn);
1417 } else if (src < 0) {
1418 return src;
1419 }
1420 }
1421 }
1422 src = conv_uni_to_pc(unicode, isdefault);
1423 if (src >= 32) {
1424 outbuf[0] = (char) src;
1425 outbuf[1] = '\0';
1426 return 1;
1427 }
1428 }
1429
1430 repl = &(UCInfo[UChndl_out].replacedesc);
1431 if (!isdefault) {
1432 if (repl != UC_current_unitable_str) {
1433 con_clear_unimap_str(0);
1434 (void) UC_con_set_unimap_str(repl->entry_ct, repl->entries, 0);
1435 UC_current_unitable_str = repl;
1436 }
1437 rc = conv_uni_to_str(outbuf, buflen, unicode, 0);
1438 if (rc >= 0)
1439 return (int) strlen(outbuf);
1440 }
1441 if (trydefault && chk_single_flag) {
1442 src = conv_uni_to_pc(unicode, 1);
1443 if (src >= 32) {
1444 outbuf[0] = (char) src;
1445 outbuf[1] = '\0';
1446 return 1;
1447 }
1448 }
1449 if (isdefault || trydefault) {
1450 rc = conv_uni_to_str(outbuf, buflen, unicode, 1);
1451 if (rc >= 0)
1452 return (int) strlen(outbuf);
1453 }
1454 if (rc == ucNotFound) {
1455 if (!isdefault)
1456 rc = conv_uni_to_str(outbuf, buflen, 0xfffdL, 0);
1457 if ((rc == ucNotFound) && (isdefault || trydefault))
1458 rc = conv_uni_to_str(outbuf, buflen, 0xfffdL, 1);
1459 if (rc >= 0)
1460 return (int) strlen(outbuf);
1461 }
1462 if (chk_single_flag && src == ucNotFound) {
1463 if (!isdefault)
1464 rc = conv_uni_to_pc(0xfffdL, 0);
1465 if ((rc == ucNotFound) && (isdefault || trydefault))
1466 rc = conv_uni_to_pc(0xfffdL, 1);
1467 if (rc >= 32) {
1468 outbuf[0] = (char) rc;
1469 outbuf[1] = '\0';
1470 return 1;
1471 } else if (rc <= 0) {
1472 outbuf[0] = '\0';
1473 return rc;
1474 }
1475 return rc;
1476 }
1477 return ucNotFound;
1478 }
1479
UC_FindGN_byMIME(const char * UC_MIMEcharset)1480 static int UC_FindGN_byMIME(const char *UC_MIMEcharset)
1481 {
1482 int i;
1483
1484 for (i = 0; i < 4; i++) {
1485 if (!strcmp(UC_MIMEcharset, UC_GNsetMIMEnames[i])) {
1486 return i;
1487 }
1488 }
1489 return ucError;
1490 }
1491
UCGetRawUniMode_byLYhndl(int i)1492 int UCGetRawUniMode_byLYhndl(int i)
1493 {
1494 if (i < 0)
1495 return 0;
1496 return LYCharSet_UC[i].enc;
1497 }
1498
1499 /*
1500 * Construct a new charset name, given prefix and codepage. This introduces
1501 * potentially unchecked recursion into UCGetLYhndl_byMIME if neither the "cp"
1502 * nor "windows-" prefixes are configured, so we check it here.
1503 */
getLYhndl_byCP(const char * prefix,const char * codepage)1504 static int getLYhndl_byCP(const char *prefix,
1505 const char *codepage)
1506 {
1507 static int nested;
1508 int result = ucError;
1509
1510 if (!nested++) {
1511 char *cptmp = NULL;
1512
1513 StrAllocCopy(cptmp, prefix);
1514 StrAllocCat(cptmp, codepage);
1515 result = UCGetLYhndl_byMIME(cptmp);
1516 FREE(cptmp);
1517 }
1518 nested--;
1519 return result;
1520 }
1521
1522 /*
1523 * Get Lynx internal charset handler from MIME name,
1524 * return -1 if we got NULL or did not recognize value.
1525 * According to RFC, MIME headers should match case-insensitively.
1526 */
/*
 * value  MIME charset name to look up; may be NULL or empty.
 *
 * The name is first compared case-insensitively with the MIMEname of every
 * registered charset.  If that fails, a fixed list of synonyms is tried in
 * order; most of them are only compiled in when the corresponding
 * NO_CHARSET_* symbol is false.  A synonym is resolved by calling this
 * function again with the canonical name, or via getLYhndl_byCP() for the
 * "iso8859", "ibm"/"cp-" and "windows-" patterns.
 *
 * Returns the charset index, or ucError if the name is not recognized.
 */
int UCGetLYhndl_byMIME(const char *value)
{
    int i;
    int LYhndl = -1;

    if (!value || !(*value)) {
        CTRACE((tfp,
                "UCGetLYhndl_byMIME: NULL argument instead of MIME name.\n"));
        return ucError;
    }

    /* direct match against the registered MIME names */
    for (i = 0;
         (i < MAXCHARSETS && i < LYNumCharsets &&
          LYchar_set_names[i]); i++) {
        if (LYCharSet_UC[i].MIMEname &&
            !strcasecomp(value, LYCharSet_UC[i].MIMEname)) {
            return i;
        }
    }

    /*
     * Not yet found, try synonyms. - FM
     */
#if !NO_CHARSET_utf_8
    if (!strcasecomp(value, "unicode-1-1-utf-8") ||
        !strcasecomp(value, "utf8")) {
        /*
         * Treat these as synonyms for the IANA registered name. - FM
         */
        return UCGetLYhndl_byMIME("utf-8");
    }
#endif
    /* "iso8859<...>" (no hyphen after "iso"): retry as "iso-8859<...>" */
    if (!strncasecomp(value, "iso", 3) && !StrNCmp(value + 3, "8859", 4)) {
        return getLYhndl_byCP("iso-", value + 3);
    }
    if (!strcasecomp(value, "iso-8859-8-i") ||
        !strcasecomp(value, "iso-8859-8-e")) {
        return UCGetLYhndl_byMIME("iso-8859-8");
    }
#if !NO_CHARSET_euc_jp
    if (!strcasecomp(value, "x-euc-jp") ||
        !strcasecomp(value, "eucjp")) {
        return UCGetLYhndl_byMIME("euc-jp");
    }
#endif
#if !NO_CHARSET_shift_jis
    if ((!strcasecomp(value, "x-shift-jis")) ||
        (!strcasecomp(value, "x-sjis")) ||
        (!strcasecomp(value, "pck"))) {
        return UCGetLYhndl_byMIME("shift_jis");
    }
#endif
#if !NO_CHARSET_euc_kr
    if (!strcasecomp(value, "iso-2022-kr")) {
        return UCGetLYhndl_byMIME("euc-kr");
    }
#endif
#if !NO_CHARSET_euc_cn
    if (!strcasecomp(value, "gb2312") ||
        !strncasecomp(value, "cn-gb", 5) ||
        !strcasecomp(value, "iso-2022-cn")) {
        return UCGetLYhndl_byMIME("euc-cn");
    }
#endif
#if !NO_CHARSET_big5
    if (!strcasecomp(value, "cn-big5")) {
        return UCGetLYhndl_byMIME("big5");
    }
#endif
#if !NO_CHARSET_macintosh
    if (!strcasecomp(value, "x-mac-roman") ||
        !strcasecomp(value, "mac-roman")) {
        return UCGetLYhndl_byMIME("macintosh");
    }
#endif
#if !NO_CHARSET_next
    if (!strcasecomp(value, "x-next") ||
        !strcasecomp(value, "nextstep") ||
        !strcasecomp(value, "x-nextstep")) {
        return UCGetLYhndl_byMIME("next");
    }
#endif
#if !NO_CHARSET_windows_1252
    if (!strcasecomp(value, "iso-8859-1-windows-3.1-latin-1") ||
        !strcasecomp(value, "cp1252") ||
        !strcasecomp(value, "cp-1252") ||
        !strcasecomp(value, "ibm1252") ||
        !strcasecomp(value, "iso-8859-1-windows-3.0-latin-1")) {
        /*
         * Treat these as synonyms for windows-1252, which is more
         * commonly used than the IANA registered name. - FM
         */
        return UCGetLYhndl_byMIME("windows-1252");
    }
#endif
#if !NO_CHARSET_windows_1251
    if (!strcasecomp(value, "ansi-1251")) {
        return UCGetLYhndl_byMIME("windows-1251");
    }
#endif
#if !NO_CHARSET_windows_1250
    if (!strcasecomp(value, "iso-8859-2-windows-latin-2") ||
        !strcasecomp(value, "cp1250") ||
        !strcasecomp(value, "cp-1250") ||
        !strcasecomp(value, "ibm1250")) {
        /*
         * Treat these as synonyms for windows-1250. - FM
         */
        return UCGetLYhndl_byMIME("windows-1250");
    }
#endif
    /*
     * The isdigit() tests run left to right and stop at the first failure,
     * so no byte beyond the string's terminator is examined.
     */
    if ((!strncasecomp(value, "ibm", 3) ||
         !strncasecomp(value, "cp-", 3)) &&
        isdigit(UCH(value[3])) &&
        isdigit(UCH(value[4])) &&
        isdigit(UCH(value[5]))) {
        /*
         * For "ibmNNN<...>" or "cp-NNN", try "cpNNN<...>"
         * if not yet found. - KW & FM
         */
        if ((LYhndl = getLYhndl_byCP("cp", value + 3)) >= 0)
            return LYhndl;
        /*
         * Try windows-NNN<...> if not yet found. - FM
         */
        return getLYhndl_byCP("windows-", value + 3);
    }
    if (!strncasecomp(value, "windows-", 8) &&
        isdigit(UCH(value[8])) &&
        isdigit(UCH(value[9])) &&
        isdigit(UCH(value[10]))) {
        /*
         * For "windows-NNN<...>", try "cpNNN<...>" - FM
         */
        return getLYhndl_byCP("cp", value + 8);
    }
#if !NO_CHARSET_koi8_r
    if (!strcasecomp(value, "koi-8")) { /* accentsoft bugosity */
        return UCGetLYhndl_byMIME("koi8-r");
    }
#endif
    if (!strcasecomp(value, "ANSI_X3.4-1968")) {
        return US_ASCII;
    }
    /* no more synonyms if come here... */

    CTRACE((tfp, "UCGetLYhndl_byMIME: unrecognized MIME name \"%s\"\n", value));
    return ucError;             /* returns -1 if no charset found by that MIME name */
}
1676
1677 /*
1678 * Function UC_setup_LYCharSets_repl() tries to set up a subtable in
1679 * LYCharSets[] appropriate for this new charset, for compatibility with the
1680 * "old method". Maybe not nice (maybe not even necessary any more), but it
1681 * works (as far as it goes..).
1682 *
1683 * We try to be conservative and only allocate new memory for this if needed.
1684 * If not needed, just point to SevenBitApproximations[i]. [Could do the same
1685 * for ISO_Latin1[] if it's identical to that, but would make it even *more*
1686 * messy than it already is...] This is the only function in this file that knows,
1687 * or cares, about the HTMLDTD or details of LYCharSets[] subtables (and
1688 * therefore somewhat violates the idea that this file should be independent of
1689 * those). As in other places, we rely on ISO_Latin1 being the *first* table
1690 * in LYCharSets. - KW
1691 */
1692
1693 /*
1694 * We need to remember which ones were allocated and which are static.
1695 */
1696 static STRING2PTR remember_allocated_LYCharSets[MAXCHARSETS];
1697
UCreset_allocated_LYCharSets(void)1698 static void UCreset_allocated_LYCharSets(void)
1699 {
1700 int i = 0;
1701
1702 for (; i < MAXCHARSETS; i++) {
1703 remember_allocated_LYCharSets[i] = NULL;
1704 }
1705 }
1706
1707 #ifdef LY_FIND_LEAKS
UCfree_allocated_LYCharSets(void)1708 static void UCfree_allocated_LYCharSets(void)
1709 {
1710 int i = 0;
1711
1712 for (; i < MAXCHARSETS; i++) {
1713 if (remember_allocated_LYCharSets[i] != NULL) {
1714 FREE(remember_allocated_LYCharSets[i]);
1715 }
1716 }
1717 }
1718 #endif
1719
UC_setup_LYCharSets_repl(int UC_charset_in_hndl,unsigned lowest8)1720 static STRING2PTR UC_setup_LYCharSets_repl(int UC_charset_in_hndl,
1721 unsigned lowest8)
1722 {
1723 STRING2PTR ISO_Latin1 = LYCharSets[0];
1724 const char **p;
1725 char **prepl;
1726 const u16 *pp;
1727 const char **tp;
1728 const char *s7;
1729 const char *s8;
1730 size_t i;
1731 int j, changed;
1732 u16 k;
1733 u8 *ti;
1734
1735 /*
1736 * Create a temporary table for reverse lookup of latin1 codes:
1737 */
1738 tp = (const char **) malloc(96 * sizeof(char *));
1739
1740 if (!tp)
1741 return NULL;
1742 for (i = 0; i < 96; i++)
1743 tp[i] = NULL;
1744 ti = (u8 *) malloc(96 * sizeof(u8));
1745 if (!ti) {
1746 FREE(tp);
1747 return NULL;
1748 }
1749 for (i = 0; i < 96; i++)
1750 ti[i] = 0;
1751
1752 pp = UCInfo[UC_charset_in_hndl].unitable;
1753
1754 /*
1755 * Determine if we have any mapping of a Unicode in the range 160-255
1756 * to an allowed code point > 0x80 in our new charset...
1757 * Store any mappings found in ti[].
1758 */
1759 if (UCInfo[UC_charset_in_hndl].num_uni > 0) {
1760 for (i = 0; i < 256; i++) {
1761 if ((j = UCInfo[UC_charset_in_hndl].unicount[i])) {
1762 if ((k = *pp) >= 160 && k < 256 && i >= lowest8) {
1763 ti[k - 160] = UCH(i);
1764 }
1765 for (; j; j--) {
1766 pp++;
1767 }
1768 }
1769 }
1770 } {
1771 u16 ct;
1772 struct unipair_str *list;
1773
1774 /*
1775 * Determine if we have any mapping of a Unicode in the range
1776 * 160-255 to a replacement string for our new charset...
1777 * Store any mappings found in tp[].
1778 */
1779 ct = UCInfo[UC_charset_in_hndl].replacedesc.entry_ct;
1780 list = UCInfo[UC_charset_in_hndl].replacedesc.entries;
1781 while (ct--) {
1782 if ((k = list->unicode) >= 160 && k < 256) {
1783 tp[k - 160] = list->replace_str;
1784 }
1785 list++;
1786 }
1787 }
1788 /*
1789 * Now allocate a new table compatible with LYCharSets[]
1790 * and with the HTMLDTD for entities.
1791 * We don't know yet whether we'll keep it around.
1792 */
1793 prepl = (char **) malloc(HTML_dtd.number_of_entities * sizeof(char *));
1794
1795 if (!prepl) {
1796 FREE(tp);
1797 FREE(ti);
1798 return 0;
1799 }
1800
1801 p = (const char **) prepl;
1802 changed = 0;
1803 for (i = 0; i < HTML_dtd.number_of_entities; i++, p++) {
1804 /*
1805 * For each of those entities, we check what the "old method"
1806 * ISO_Latin1[] mapping does with them. If it is nothing we
1807 * want to use, just point to the SevenBitApproximations[] string.
1808 */
1809 s7 = SevenBitApproximations[i];
1810 s8 = ISO_Latin1[i];
1811 *p = s7;
1812 if (s8 && UCH(*s8) >= 160 && s8[1] == '\0') {
1813 /*
1814 * We have an entity that is mapped to
1815 * one valid eightbit latin1 char.
1816 */
1817 if (ti[UCH(*s8) - 160] >= UCH(lowest8) &&
1818 !(UCH(s7[0]) == ti[UCH(*s8) - 160] &&
1819 s7[1] == '\0')) {
1820 /*
1821 * ...which in turn is mapped, by our "new method",
1822 * to another valid eightbit char for this new
1823 * charset: either to itself...
1824 */
1825 if (ti[UCH(*s8) - 160] == UCH(*s8)) {
1826 *p = s8;
1827 } else {
1828 /*
1829 * make those 1-char strings
1830 * into HTAtoms, so they will be cleaned up
1831 * at exit... all for the sake of preventing
1832 * memory leaks, sigh.
1833 */
1834 static char dummy[2]; /* one char dummy string */
1835
1836 dummy[0] = (char) ti[UCH(*s8) - 160];
1837 *p = HTAtom_name(HTAtom_for(dummy));
1838 }
1839 changed = 1;
1840 } else if (tp[UCH(*s8) - 160] &&
1841 strcmp(s7, tp[UCH(*s8) - 160])) {
1842 /*
1843 * ...or which is mapped, by our "new method",
1844 * to a replacement string for this new charset.
1845 */
1846 *p = tp[UCH(*s8) - 160];
1847 changed = 1;
1848 }
1849 }
1850 }
1851 FREE(tp);
1852 FREE(ti);
1853 if (!changed) {
1854 FREE(prepl);
1855 return NULL;
1856 }
1857 return (STRING2PTR) prepl;
1858 }
1859
1860 /*
1861 * "New method" meets "Old method" ...
1862 */
UC_Register_with_LYCharSets(int s,const char * UC_MIMEcharset,const char * UC_LYNXcharset,int lowest_eightbit)1863 static int UC_Register_with_LYCharSets(int s,
1864 const char *UC_MIMEcharset,
1865 const char *UC_LYNXcharset,
1866 int lowest_eightbit)
1867 {
1868 int i, LYhndl, found;
1869 STRING2PTR repl;
1870
1871 LYhndl = -1;
1872 if (LYNumCharsets == 0) {
1873 /*
1874 * Initialize here; so whoever changes
1875 * LYCharSets.c doesn't have to count...
1876 */
1877 for (i = 0; (i < MAXCHARSETS) && LYchar_set_names[i]; i++) {
1878 LYNumCharsets = i + 1;
1879 }
1880 }
1881
1882 /*
1883 * Search by MIME name, (LYchar_set_names may differ...)
1884 */
1885 for (i = 0; i < MAXCHARSETS && LYchar_set_names[i] && LYhndl < 0; i++) {
1886 if (LYCharSet_UC[i].MIMEname &&
1887 !strcmp(UC_MIMEcharset, LYCharSet_UC[i].MIMEname)) {
1888 LYhndl = i;
1889 }
1890 }
1891
1892 if (LYhndl < 0) { /* not found */
1893 found = 0;
1894 if (LYNumCharsets >= MAXCHARSETS) {
1895 CTRACE((tfp,
1896 "UC_Register_with_LYCharSets: Too many. Ignoring %s/%s.",
1897 UC_MIMEcharset, UC_LYNXcharset));
1898 return ucError;
1899 }
1900 /*
1901 * Add to LYCharSets.c lists.
1902 */
1903 LYhndl = LYNumCharsets;
1904 LYNumCharsets++;
1905 LYlowest_eightbit[LYhndl] = 999;
1906 LYCharSets[LYhndl] = SevenBitApproximations;
1907 /*
1908 * Hmm, try to be conservative here.
1909 */
1910 LYchar_set_names[LYhndl] = UC_LYNXcharset;
1911 LYchar_set_names[LYhndl + 1] = NULL;
1912 /*
1913 * Terminating NULL may be looked for by Lynx code.
1914 */
1915 } else {
1916 found = 1;
1917 }
1918 LYCharSet_UC[LYhndl].UChndl = s;
1919 /*
1920 * Can we just copy the pointer? Hope so...
1921 */
1922 LYCharSet_UC[LYhndl].MIMEname = UC_MIMEcharset;
1923 LYCharSet_UC[LYhndl].enc = UCInfo[s].enc;
1924 LYCharSet_UC[LYhndl].codepage = UCInfo[s].codepage;
1925
1926 /*
1927 * @@@ We really SHOULD get more info from the table files,
1928 * and set relevant flags in the LYCharSet_UC[] entry with
1929 * that info... For now, let's try it without. - KW
1930 */
1931 if (lowest_eightbit < LYlowest_eightbit[LYhndl]) {
1932 LYlowest_eightbit[LYhndl] = lowest_eightbit;
1933 } else if (lowest_eightbit > LYlowest_eightbit[LYhndl]) {
1934 UCInfo[s].lowest_eight = LYlowest_eightbit[LYhndl];
1935 }
1936
1937 if (!found && LYhndl > 0) {
1938 repl = UC_setup_LYCharSets_repl(s, (unsigned) UCInfo[s].lowest_eight);
1939 if (repl) {
1940 LYCharSets[LYhndl] = repl;
1941 /*
1942 * Remember to FREE at exit.
1943 */
1944 remember_allocated_LYCharSets[LYhndl] = repl;
1945 }
1946 }
1947 return LYhndl;
1948 }
1949
1950 /*
1951 * This only sets up the structure - no initialization of the tables
1952 * is done here yet.
1953 */
UC_Charset_Setup(const char * UC_MIMEcharset,const char * UC_LYNXcharset,const u8 * unicount,const u16 * unitable,int nnuni,struct unimapdesc_str replacedesc,int lowest_eight,int UC_rawuni,int codepage)1954 void UC_Charset_Setup(const char *UC_MIMEcharset,
1955 const char *UC_LYNXcharset,
1956 const u8 * unicount,
1957 const u16 * unitable,
1958 int nnuni,
1959 struct unimapdesc_str replacedesc,
1960 int lowest_eight,
1961 int UC_rawuni,
1962 int codepage)
1963 {
1964 int s, Gn;
1965 int i, status = 0, found;
1966
1967 /*
1968 * Get (new?) slot.
1969 */
1970 found = -1;
1971 for (i = 0; i < UCNumCharsets && found < 0; i++) {
1972 if (!strcmp(UCInfo[i].MIMEname, UC_MIMEcharset)) {
1973 found = i;
1974 }
1975 }
1976 if (found >= 0) {
1977 s = found;
1978 } else {
1979 if (UCNumCharsets >= MAXCHARSETS) {
1980 CTRACE((tfp, "UC_Charset_Setup: Too many. Ignoring %s/%s.",
1981 UC_MIMEcharset, UC_LYNXcharset));
1982 return;
1983 }
1984 s = UCNumCharsets;
1985 UCInfo[s].MIMEname = UC_MIMEcharset;
1986 }
1987 UCInfo[s].LYNXname = UC_LYNXcharset;
1988 UCInfo[s].unicount = unicount;
1989 UCInfo[s].unitable = unitable;
1990 UCInfo[s].num_uni = nnuni;
1991 UCInfo[s].replacedesc = replacedesc;
1992 if (replacedesc.isdefault) {
1993 default_UChndl = s;
1994 }
1995 Gn = UC_FindGN_byMIME(UC_MIMEcharset);
1996 if (Gn >= 0)
1997 UC_GNhandles[Gn] = s;
1998 UCInfo[s].GN = Gn;
1999 if (UC_rawuni == UCT_ENC_UTF8)
2000 lowest_eight = 128; /* cheat here */
2001 UCInfo[s].lowest_eight = lowest_eight;
2002 UCInfo[s].enc = UC_rawuni;
2003 UCInfo[s].codepage = codepage;
2004 UCInfo[s].LYhndl = UC_Register_with_LYCharSets(s,
2005 UC_MIMEcharset,
2006 UC_LYNXcharset,
2007 lowest_eight);
2008 CTRACE2(TRACE_CFG, (tfp, "registered charset %d mime \"%s\" lynx \"%s\"\n",
2009 s, UC_MIMEcharset, UC_LYNXcharset));
2010 UCInfo[s].uc_status = status;
2011 if (found < 0)
2012 UCNumCharsets++;
2013 return;
2014 }
2015
2016 /*
2017 * UC_NoUctb_Register_with_LYCharSets, UC_Charset_NoUctb_Setup -
2018 * Alternative functions for adding character set info to the lists
2019 * kept in LYCharSets.c.
2020 *
2021 * These are for character sets without any real tables of their own.
2022 * We don't keep an entry in UCInfo[] for them.
2023 */
UC_NoUctb_Register_with_LYCharSets(const char * UC_MIMEcharset,const char * UC_LYNXcharset,int lowest_eightbit,int UC_rawuni,int codepage)2024 static int UC_NoUctb_Register_with_LYCharSets(const char *UC_MIMEcharset,
2025 const char *UC_LYNXcharset,
2026 int lowest_eightbit,
2027 int UC_rawuni,
2028 int codepage)
2029 {
2030 int i, LYhndl = -1;
2031
2032 if (LYNumCharsets == 0) {
2033 /*
2034 * Initialize here; so whoever changes
2035 * LYCharSets.c doesn't have to count...
2036 */
2037 for (i = 0; (i < MAXCHARSETS) && LYchar_set_names[i]; i++) {
2038 LYNumCharsets = i + 1;
2039 }
2040 }
2041
2042 /*
2043 * Search by MIME name, (LYchar_set_names may differ...)
2044 * ignore if already present!
2045 */
2046 for (i = 0; i < MAXCHARSETS && LYchar_set_names[i] && LYhndl < 0; i++) {
2047 if (LYCharSet_UC[i].MIMEname &&
2048 !strcmp(UC_MIMEcharset, LYCharSet_UC[i].MIMEname)) {
2049 return ucError;
2050 }
2051 }
2052
2053 /* not found */
2054 if (LYNumCharsets >= MAXCHARSETS) {
2055 CTRACE((tfp,
2056 "UC_NoUctb_Register_with_LYCharSets: Too many. Ignoring %s/%s.",
2057 UC_MIMEcharset, UC_LYNXcharset));
2058 return ucError;
2059 }
2060 /*
2061 * Add to LYCharSets.c lists.
2062 */
2063 LYhndl = LYNumCharsets;
2064 LYNumCharsets++;
2065 LYlowest_eightbit[LYhndl] = lowest_eightbit;
2066 LYCharSets[LYhndl] = SevenBitApproximations;
2067 LYchar_set_names[LYhndl] = UC_LYNXcharset;
2068 LYchar_set_names[LYhndl + 1] = NULL;
2069 /*
2070 * Terminating NULL may be looked for by Lynx code.
2071 */
2072
2073 LYCharSet_UC[LYhndl].UChndl = -1; /* no corresponding UChndl ! */
2074 LYCharSet_UC[LYhndl].MIMEname = UC_MIMEcharset;
2075 LYCharSet_UC[LYhndl].enc = UC_rawuni;
2076 LYCharSet_UC[LYhndl].codepage = codepage;
2077
2078 /*
2079 * @@@ We really SHOULD get more info from the table files,
2080 * and set relevant flags in the LYCharSet_UC[] entry with
2081 * that info... For now, let's try it without. - KW
2082 */
2083
2084 return LYhndl;
2085 }
2086
2087 /*
2088 * A wrapper for the previous function.
2089 */
UC_Charset_NoUctb_Setup(const char * UC_MIMEcharset,const char * UC_LYNXcharset,int trydefault,int lowest_eight,int UC_rawuni,int codepage)2090 static void UC_Charset_NoUctb_Setup(const char *UC_MIMEcharset,
2091 const char *UC_LYNXcharset,
2092 int trydefault,
2093 int lowest_eight,
2094 int UC_rawuni,
2095 int codepage)
2096 {
2097 int i;
2098
2099 /*
2100 * Ignore completely if already in slot.
2101 */
2102 for (i = 0; i < UCNumCharsets; i++) {
2103 if (!strcmp(UCInfo[i].MIMEname, UC_MIMEcharset)) {
2104 return;
2105 }
2106 }
2107 if (UC_rawuni == UCT_ENC_UTF8)
2108 lowest_eight = 128; /* cheat here */
2109 /* 'codepage' doubles as a flag for 'do not try any table
2110 * lookup, not even default' when negative. The value will
2111 * be returned immediately by UCTrans* functions.
2112 */
2113 if (!trydefault && codepage == 0)
2114 codepage = ucCannotOutput; /* if not already set; any negative should do. */
2115 UC_NoUctb_Register_with_LYCharSets(UC_MIMEcharset,
2116 UC_LYNXcharset,
2117 lowest_eight,
2118 UC_rawuni,
2119 codepage);
2120 return;
2121 }
2122
2123 #ifdef LY_FIND_LEAKS
UCcleanup_mem(void)2124 static void UCcleanup_mem(void)
2125 {
2126 int i;
2127
2128 UCfree_allocated_LYCharSets();
2129 con_clear_unimap_str(0);
2130 con_clear_unimap_str(1);
2131 con_clear_unimap(0);
2132 con_clear_unimap(1);
2133 for (i = 1; i < 4; i++) { /* first one is static! */
2134 FREE(inverse_translations[i]);
2135 }
2136 }
2137 #endif /* LY_FIND_LEAKS */
2138
2139 #ifdef EXP_CHARTRANS_AUTOSWITCH
2140 #ifdef CAN_AUTODETECT_DISPLAY_CHARSET
2141 # ifdef __EMX__
/*
 * Return the Lynx charset handle of the "AutoDetect" alias for code page
 * 'cp', creating that alias on first use.
 *
 * The alias gets the MIME name "auto-cp<N>" (or "auto2-cp<N>" when 'other'
 * is nonzero) and is a copy of the UCInfo[] record whose MIME name is
 * "cp<N>" (compared ignoring case).  Returns ucError when no such record
 * exists.  As a side effect the handle of the real "cp<N>" charset is
 * remembered in real_charsets[].
 */
static int CpOrdinal(const unsigned UCode_t cp, const int other)
{
    char displayName[80];
    char autoMime[80];
    char *realMime;
    char *mimeCopy = NULL;
    char *nameCopy = NULL;
    int idx;
    int source = -1;
    int handle;

    CTRACE((tfp, "CpOrdinal(cp=%lu, other=%d).\n", cp, other));

    sprintf(autoMime, "auto%s-cp%lu", (other ? "2" : ""), cp);
    /* Skip the "auto-" (or "auto2-") prefix to get the plain "cp<N>". */
    realMime = &autoMime[5 + (other != 0)];
    sprintf(displayName, "AutoDetect%s (cp%lu)",
            (other ? "-2" : ""), cp);

    /*
     * Scan all slots: an existing alias wins immediately, otherwise
     * remember the (last) slot holding the real code page.
     */
    for (idx = 0; idx < UCNumCharsets; idx++) {
        if (strcmp(UCInfo[idx].LYNXname, displayName) == 0) {
            return UCGetLYhndl_byMIME(autoMime);
        }
        if (strcasecomp(UCInfo[idx].MIMEname, realMime) == 0) {
            source = idx;
        }
    }
    if (source < 0) {
        return ucError;
    }

    /* Store the "real" charset info */
    real_charsets[other != 0] = UCGetLYhndl_byMIME(realMime);

    /* Duplicate the record under the alias names. */
    StrAllocCopy(mimeCopy, autoMime);
    StrAllocCopy(nameCopy, displayName);
    UC_Charset_Setup(mimeCopy, nameCopy,
                     UCInfo[source].unicount, UCInfo[source].unitable,
                     UCInfo[source].num_uni, UCInfo[source].replacedesc,
                     UCInfo[source].lowest_eight, UCInfo[source].enc,
                     UCInfo[source].codepage);

    handle = UCGetLYhndl_byMIME(autoMime);
    CTRACE((tfp, "Found %i.\n", handle));
    return handle;
}
2178 # endif /* __EMX__ */
2179 #endif /* CAN_AUTODETECT_DISPLAY_CHARSET */
2180 #endif /* EXP_CHARTRANS_AUTOSWITCH */
2181
UCInit(void)2182 void UCInit(void)
2183 {
2184
2185 UCreset_allocated_LYCharSets();
2186 #ifdef LY_FIND_LEAKS
2187 atexit(UCcleanup_mem);
2188 #endif
2189 UCconsole_map_init();
2190
2191 /*
2192 * The order of charset names visible in Lynx Options menu correspond to
2193 * the order of lines below, except the first two described in LYCharSet.c
2194 *
2195 * Entries whose comment is marked with *** are declared in UCdomap.h,
2196 * others are based on the included tables - UCdomap.c, near the top.
2197 */
2198
2199 UC_CHARSET_SETUP_iso_8859_1; /* ISO Latin 1 */
2200 UC_CHARSET_SETUP_iso_8859_15; /* ISO 8859-15 (Latin 9) */
2201 UC_CHARSET_SETUP_cp850; /* DosLatin1 (cp850) */
2202 UC_CHARSET_SETUP_windows_1252; /* WinLatin1 (cp1252) */
2203 UC_CHARSET_SETUP_cp437; /* DosLatinUS (cp437) */
2204
2205 UC_CHARSET_SETUP_dec_mcs; /* DEC Multinational */
2206 UC_CHARSET_SETUP_macintosh; /* Macintosh (8 bit) */
2207 UC_CHARSET_SETUP_next; /* NeXT character set */
2208 UC_CHARSET_SETUP_hp_roman8; /* HP Roman8 */
2209
2210 UC_CHARSET_SETUP_euc_cn; /*** Chinese */
2211 UC_CHARSET_SETUP_euc_jp; /*** Japanese (EUC_JP) */
2212 UC_CHARSET_SETUP_shift_jis; /*** Japanese (Shift_JIS) */
2213 UC_CHARSET_SETUP_euc_kr; /*** Korean */
2214 UC_CHARSET_SETUP_big5; /*** Taipei (Big5) */
2215
2216 UC_CHARSET_SETUP_viscii; /* Vietnamese (VISCII) */
2217 UC_CHARSET_SETUP; /* us-ascii */ /* 7 bit approximations */
2218
2219 UC_CHARSET_SETUP_x_transparent; /*** Transparent */
2220
2221 UC_CHARSET_SETUP_iso_8859_2; /* ISO Latin 2 */
2222 UC_CHARSET_SETUP_cp852; /* DosLatin2 (cp852) */
2223 UC_CHARSET_SETUP_windows_1250; /* WinLatin2 (cp1250) */
2224
2225 UC_CHARSET_SETUP_iso_8859_3; /* ISO Latin 3 */
2226 UC_CHARSET_SETUP_iso_8859_4; /* ISO Latin 4 */
2227 UC_CHARSET_SETUP_iso_8859_13; /* ISO 8859-13 Baltic Rim */
2228 UC_CHARSET_SETUP_cp775; /* DosBaltRim (cp775) */
2229 UC_CHARSET_SETUP_windows_1257; /* WinBaltRim (cp1257) */
2230 UC_CHARSET_SETUP_iso_8859_5; /* ISO 8859-5 Cyrillic */
2231 UC_CHARSET_SETUP_cp866; /* DosCyrillic (cp866) */
2232 UC_CHARSET_SETUP_windows_1251; /* WinCyrillic (cp1251) */
2233 UC_CHARSET_SETUP_koi8_r; /* KOI8-R Cyrillic */
2234 UC_CHARSET_SETUP_iso_8859_6; /* ISO 8869-6 Arabic */
2235 UC_CHARSET_SETUP_cp864; /* DosArabic (cp864) */
2236 UC_CHARSET_SETUP_windows_1256; /* WinArabic (cp1256) */
2237 UC_CHARSET_SETUP_iso_8859_14; /* ISO 8859-14 Celtic */
2238 UC_CHARSET_SETUP_iso_8859_7; /* ISO 8859-7 Greek */
2239 UC_CHARSET_SETUP_cp737; /* DosGreek (cp737) */
2240 UC_CHARSET_SETUP_cp869; /* DosGreek2 (cp869) */
2241 UC_CHARSET_SETUP_windows_1253; /* WinGreek (cp1253) */
2242 UC_CHARSET_SETUP_iso_8859_8; /* ISO 8859-8 Hebrew */
2243 UC_CHARSET_SETUP_cp862; /* DosHebrew (cp862) */
2244 UC_CHARSET_SETUP_windows_1255; /* WinHebrew (cp1255) */
2245 UC_CHARSET_SETUP_iso_8859_9; /* ISO 8859-9 (Latin 5) */
2246 UC_CHARSET_SETUP_cp857; /* DosTurkish (cp857) */
2247 UC_CHARSET_SETUP_iso_8859_10; /* ISO 8859-10 North European */
2248
2249 UC_CHARSET_SETUP_utf_8; /*** UNICODE UTF-8 */
2250 UC_CHARSET_SETUP_mnemonic_ascii_0; /* RFC 1345 w/o Intro */
2251 UC_CHARSET_SETUP_mnemonic; /* RFC 1345 Mnemonic */
2252 UC_CHARSET_SETUP_cp866u; /* Ukrainian Cyrillic (866) */
2253 UC_CHARSET_SETUP_koi8_u; /* Ukrainian Cyrillic (koi8-u) */
2254 UC_CHARSET_SETUP_ptcp154; /* Cyrillic-Asian (PT154) */
2255
2256 #ifdef EXP_CHARTRANS_AUTOSWITCH
2257 #ifdef CAN_AUTODETECT_DISPLAY_CHARSET
2258 # ifdef __EMX__
2259 {
2260 unsigned UCode_t lst[3];
2261 unsigned UCode_t len, rc;
2262
2263 rc = DosQueryCp(sizeof(lst), lst, &len);
2264 if (rc == 0) {
2265 if (len >= 1)
2266 auto_display_charset = CpOrdinal(lst[0], 0);
2267 # ifdef CAN_SWITCH_DISPLAY_CHARSET
2268 if (len >= 3) {
2269 codepages[0] = lst[0];
2270 codepages[1] = (lst[0] == lst[1] ? lst[2] : lst[1]);
2271 auto_other_display_charset = CpOrdinal(codepages[1], 1);
2272 }
2273 # endif
2274 } else {
2275 CTRACE((tfp, "DosQueryCp() returned %#lx=%lu.\n", rc, rc));
2276 }
2277 }
2278 # endif
2279 #endif
2280 #endif
2281
2282 /*
2283 * To add synonyms for any charset name check function UCGetLYhndl_byMIME in
2284 * this file.
2285 */
2286
2287 /* for coding/performance - easy to type: */
2288 LATIN1 = UCGetLYhndl_byMIME("iso-8859-1");
2289 US_ASCII = UCGetLYhndl_byMIME("us-ascii");
2290 UTF8_handle = UCGetLYhndl_byMIME("utf-8");
2291 TRANSPARENT = UCGetLYhndl_byMIME("x-transparent");
2292 }
2293
2294 /*
2295 * Safe variant of UCGetLYhndl_byMIME, with blind recovery from typo in user
2296 * input: lynx.cfg, userdefs.h, command line switches.
2297 */
safeUCGetLYhndl_byMIME(const char * value)2298 int safeUCGetLYhndl_byMIME(const char *value)
2299 {
2300 int i = UCGetLYhndl_byMIME(value);
2301
2302 if (i == -1) { /* was user's typo or not yet recognized value */
2303 i = LATIN1; /* error recovery? */
2304 CTRACE((tfp, "safeUCGetLYhndl_byMIME: ISO-8859-1 assumed.\n"));
2305 }
2306
2307 return (i);
2308 }
2309
2310 #ifdef USE_LOCALE_CHARSET
2311
2312 #if defined(USE_LOCALE_CHARSET) && !defined(HAVE_LANGINFO_CODESET)
2313 /*
2314 * This is a quick-and-dirty emulator of the nl_langinfo(CODESET)
2315 * function defined in the Single Unix Specification for those systems
2316 * (FreeBSD, etc.) that don't have one yet. It behaves as if it had
2317 * been called after setlocale(LC_CTYPE, ""), that is it looks at
2318 * the locale environment variables.
2319 *
2320 * http://www.opengroup.org/onlinepubs/7908799/xsh/langinfo.h.html
2321 *
2322 * Please extend it as needed and suggest improvements to the author.
2323 * This emulator will hopefully become redundant soon as
2324 * nl_langinfo(CODESET) becomes more widely implemented.
2325 *
2326 * Since the proposed Li18nux encoding name registry is still not mature,
2327 * the output follows the MIME registry where possible:
2328 *
2329 * http://www.iana.org/assignments/character-sets
2330 *
2331 * A possible autoconf test for the availability of nl_langinfo(CODESET)
2332 * can be found in
2333 *
2334 * http://www.cl.cam.ac.uk/~mgk25/unicode.html#activate
2335 *
2336 * Markus.Kuhn@cl.cam.ac.uk -- 2002-03-11
2337 * Permission to use, copy, modify, and distribute this software
2338 * for any purpose and without fee is hereby granted. The author
2339 * disclaims all warranties with regard to this software.
2340 *
2341 * Latest version:
2342 *
2343 * http://www.cl.cam.ac.uk/~mgk25/ucs/langinfo.c
2344 */
2345
2346 /*
2347 #include "langinfo.h"
2348 */
typedef int nl_item;

#define CODESET 1

#define C_CODESET "US-ASCII"    /* Return this as the encoding of the
                                 * C/POSIX locale. Could as well one day
                                 * become "UTF-8". */

#define digit(x) ((x) >= '0' && (x) <= '9')

/* Holds the "ISO-8859-<n>" name assembled by nl_langinfo(). */
static char buf[16];

/*
 * Emulation of nl_langinfo(CODESET): guess the character encoding from the
 * first non-empty result of LYGetEnv() for LC_ALL, LC_CTYPE and LANG.
 *
 * Returns NULL for any item other than CODESET.  Otherwise the result is a
 * string constant, or the static buffer 'buf' for "ISO-8859-<n>" names, so
 * the caller must neither modify nor free it.  C_CODESET is returned when
 * none of the variables yields a value.
 */
static char *nl_langinfo(nl_item item)
{
    /*
     * Substring -> encoding guesses, tried strictly in this order; the
     * first fragment found anywhere in the locale string wins.
     */
    static const struct {
        const char *fragment;
        const char *codeset;
    } hints[] = {
        /* encoding name fragments */
        { "KOI8-R",    "KOI8-R" },
        { "KOI8-U",    "KOI8-U" },
        { "620",       "TIS-620" },
        { "2312",      "GB2312" },
        { "HKSCS",     "Big5HKSCS" },   /* no MIME charset */
        { "Big5",      "Big5" },
        { "BIG5",      "Big5" },
        { "GBK",       "GBK" },         /* no MIME charset */
        { "18030",     "GB18030" },     /* no MIME charset */
        { "Shift_JIS", "Shift_JIS" },
        { "SJIS",      "Shift_JIS" },
        /* conclusive modifier */
        { "euro",      "ISO-8859-15" },
        /* language (and perhaps country) codes */
        { "zh_TW",     "Big5" },
        { "zh_HK",     "Big5HKSCS" },   /* no MIME charset */
        { "zh",        "GB2312" },
        { "ja",        "EUC-JP" },
        { "ko",        "EUC-KR" },
        { "ru",        "KOI8-R" },
        { "uk",        "KOI8-U" },
        { "pl",        "ISO-8859-2" },
        { "hr",        "ISO-8859-2" },
        { "hu",        "ISO-8859-2" },
        { "cs",        "ISO-8859-2" },
        { "sk",        "ISO-8859-2" },
        { "sl",        "ISO-8859-2" },
        { "eo",        "ISO-8859-3" },
        { "mt",        "ISO-8859-3" },
        { "el",        "ISO-8859-7" },
        { "he",        "ISO-8859-8" },
        { "tr",        "ISO-8859-9" },
        { "th",        "TIS-620" },     /* or ISO-8859-11 */
        { "lt",        "ISO-8859-13" },
        { "cy",        "ISO-8859-14" },
        { "ro",        "ISO-8859-2" },  /* or ISO-8859-16 */
        { "am",        "UTF-8" },
        { "vi",        "UTF-8" }
    };
    char *locale;
    char *part;
    size_t n;

    if (item != CODESET)
        return NULL;

    /* Same precedence as setlocale(): LC_ALL, then LC_CTYPE, then LANG. */
    locale = LYGetEnv("LC_ALL");
    if (locale == 0)
        locale = LYGetEnv("LC_CTYPE");
    if (locale == 0)
        locale = LYGetEnv("LANG");
    if (locale == 0)
        return C_CODESET;

    /* standardized locales */
    if (strcmp(locale, "C") == 0 || strcmp(locale, "POSIX") == 0)
        return C_CODESET;

    /* UTF-8 takes precedence over every other fragment */
    if (strstr(locale, "UTF") != 0 || strstr(locale, "utf") != 0)
        return "UTF-8";

    /*
     * "8859-" followed by one or two digits: assemble "ISO-8859-<n>" in the
     * static buffer.  The 12 bytes copied include two explicit NULs plus the
     * literal's own terminator, so the name stays terminated whether one or
     * two digits are appended.  Without a digit, fall through to the table.
     */
    part = strstr(locale, "8859-");
    if (part != 0) {
        memcpy(buf, "ISO-8859-\0\0", 12);
        part += 5;
        if (digit(*part)) {
            buf[9] = *part++;
            if (digit(*part))
                buf[10] = *part++;
            return buf;
        }
    }

    for (n = 0; n < sizeof(hints) / sizeof(hints[0]); n++) {
        if (strstr(locale, hints[n].fragment) != 0)
            return (char *) hints[n].codeset;
    }

    /* Send me further rules if you like, but don't forget that we are
     * *only* interested in locale naming conventions on platforms
     * that do not already provide an nl_langinfo(CODESET) implementation. */
    return "ISO-8859-1";        /* should perhaps be "UTF-8" instead */
}
2452 #endif /* defined(USE_LOCALE_CHARSET) && !defined(HAVE_LANGINFO_CODESET) */
2453
2454 /*
2455 * If LYLocaleCharset is true, use the current locale to lookup a MIME name
2456 * that corresponds, and use that as the display charset. This feature is
2457 * experimental because while nl_langinfo(CODESET) itself is standardized,
2458 * the return values and their relationship to the locale value is not.
2459 * GNU libiconv happens to give useful values, but other implementations are
2460 * not guaranteed to do this.
2461 *
2462 * Not all Linux versions provide useful information. GNU libc 2.2 returns
2463 * "ANSI_X3.4-1968"
2464 * whether locale is POSIX or en_US.UTF-8.
2465 *
2466 * Another possible thing to investigate is the locale_charset() function
2467 * provided in libiconv 1.5.1.
2468 */
LYFindLocaleCharset(void)2469 void LYFindLocaleCharset(void)
2470 {
2471 BOOL found = FALSE;
2472 char *name;
2473
2474 CTRACE((tfp, "LYFindLocaleCharset(%d)\n", LYLocaleCharset));
2475 name = nl_langinfo(CODESET);
2476
2477 if (name != 0) {
2478 int value = UCGetLYhndl_byMIME(name);
2479
2480 if (value >= 0) {
2481 found = TRUE;
2482 linedrawing_char_set = value;
2483 CTRACE((tfp, "Found name \"%s\" -> %d\n", name, value));
2484 } else {
2485 CTRACE((tfp, "Cannot find a handle for MIME name \"%s\"\n", name));
2486 }
2487 } else {
2488 CTRACE((tfp, "Cannot find a MIME name for locale\n"));
2489 }
2490
2491 if (found && LYLocaleCharset) {
2492 current_char_set = linedrawing_char_set;
2493 }
2494 }
2495 #endif /* USE_LOCALE_CHARSET */
2496
/*
 * Parse 'source' as a non-negative number (base 16 if isHex, else base 10)
 * and store it in *target.
 *
 * Returns TRUE only if strtol() consumed at least one character, stopped
 * exactly at the end of the string, produced a value >= 0 and did not
 * overflow.  On failure FALSE is returned and *target is left at 0.
 */
BOOL UCScanCode(UCode_t *target, const char *source, BOOL isHex)
{
    char *stop;
    long parsed;

    errno = 0;                  /* so that ERANGE can be detected below */
    *target = 0;
    parsed = strtol(source, &stop, isHex ? 16 : 10);

    if (parsed < 0) {
        return FALSE;
    }
    if (stop <= source) {       /* nothing was converted */
        return FALSE;
    }
#if defined(ERANGE) && defined(LONG_MAX) && defined(LONG_MIN)
    /* strtol() clamps to LONG_MAX/LONG_MIN and sets ERANGE on overflow */
    if (errno == ERANGE && (parsed == LONG_MAX || parsed == LONG_MIN)) {
        return FALSE;
    }
#else
    /* no overflow reporting available: limit the number of characters */
    if ((stop - source) >= (isHex ? 8 : 10)) {
        return FALSE;
    }
#endif
    if (stop == 0 || *stop != '\0') {   /* trailing garbage */
        return FALSE;
    }

    *target = (UCode_t) parsed;
    return TRUE;
}
2520