1 /*
2 * Copyright (C) 1984-2002 Mark Nudelman
3 *
4 * You may distribute under the terms of either the GNU General Public
5 * License or the Less License, as specified in the README file.
6 *
7 * For more information about less, or for information on how to
8 * contact the author, see the README file.
9 */
10
11
12 /*
13 * Functions to define the character set
14 * and do things specific to the character set.
15 */
16
17 #include "less.h"
18 #if HAVE_LOCALE
19 #include <locale.h>
20 #include <ctype.h>
21 #endif
22 #if HAVE_UTF8_LOCALE
23 #include <langinfo.h>
24 #include <locale.h>
25 #endif
26
27 __RCSID("$MirOS: src/usr.bin/less/charset.c,v 1.3 2007/07/05 23:09:42 tg Exp $");
28
/*
 * Flag for UTF-8 mode.  Starts at 0; icharset() sets it to 1 through the
 * p_flag pointer of the "utf-8" entry in the charsets table.
 */
public int utf_mode = 0;
30
31 #if !SMALL
32 /*
33 * Predefined character sets,
34 * selected by the LESSCHARSET environment variable.
35 */
36 struct charset {
37 char *name;
38 int *p_flag;
39 char *desc;
40 } charsets[] = {
41 { "ascii", NULL, "8bcccbcc18b95.b" },
42 { "dos", NULL, "8bcccbcc12bc5b223.b" },
43 { "ebcdic", NULL, "5bc6bcc7bcc41b.9b7.9b5.b..8b6.10b6.b9.7b9.8b8.17b3.3b9.7b9.8b8.6b10.b.b.b." },
44 { "IBM-1047", NULL, "4cbcbc3b9cbccbccbb4c6bcc5b3cbbc4bc4bccbc191.b" },
45 { "iso8859", NULL, "8bcccbcc18b95.33b." },
46 { "koi8-r", NULL, "8bcccbcc18b95.b128." },
47 { "next", NULL, "8bcccbcc18b95.bb125.bb" },
48 { "utf-8", &utf_mode, "8bcccbcc18b." },
49 { NULL, NULL, NULL }
50 };
51
/*
 * Alternate names for entries in the charsets table.
 * icharset() replaces a matching "name" with "oname" before the lookup.
 * The table ends with an entry whose name is NULL.
 */
struct cs_alias {
	char *name;	/* alias as given by the user */
	char *oname;	/* charset name it stands for */
};

struct cs_alias cs_aliases[] = {
	{ "latin1",	"iso8859" },
	{ "latin9",	"iso8859" },
	{ NULL,		NULL }
};
60
/* Flag bits stored per character in chardef[]. */
#define IS_BINARY_CHAR 01	/* tested by binary_char() */
#define IS_CONTROL_CHAR 02	/* tested by control_char() */

/* Classification flags for each of the 256 character codes, indexed by code. */
static char chardef[256];
/* Format string prchar() hands to snprintf() as its last resort; set by setbinfmt(). */
static char *binfmt = NULL;
/* Attribute chosen by setbinfmt() from a leading "*x" selector; standout initially. */
public int binattr = AT_STANDOUT;
67
68
69 /*
70 * Define a charset, given a description string.
71 * The string consists of 256 letters,
72 * one for each character in the charset.
73 * If the string is shorter than 256 letters, missing letters
74 * are taken to be identical to the last one.
75 * A decimal number followed by a letter is taken to be a
76 * repetition of the letter.
77 *
78 * Each letter is one of:
79 * . normal character
80 * b binary character
81 * c control character
82 */
83 static void
ichardef(s)84 ichardef(s)
85 char *s;
86 {
87 register char *cp;
88 register int n;
89 register char v;
90
91 n = 0;
92 v = 0;
93 cp = chardef;
94 while (*s != '\0')
95 {
96 switch (*s++)
97 {
98 case '.':
99 v = 0;
100 break;
101 case 'c':
102 v = IS_CONTROL_CHAR;
103 break;
104 case 'b':
105 v = IS_BINARY_CHAR|IS_CONTROL_CHAR;
106 break;
107
108 case '0': case '1': case '2': case '3': case '4':
109 case '5': case '6': case '7': case '8': case '9':
110 n = (10 * n) + (s[-1] - '0');
111 continue;
112
113 default:
114 error("invalid chardef", NULL_PARG);
115 quit(QUIT_ERROR);
116 /*NOTREACHED*/
117 }
118
119 do
120 {
121 if (cp >= chardef + sizeof(chardef))
122 {
123 error("chardef longer than 256", NULL_PARG);
124 quit(QUIT_ERROR);
125 /*NOTREACHED*/
126 }
127 *cp++ = v;
128 } while (--n > 0);
129 n = 0;
130 }
131
132 while (cp < chardef + sizeof(chardef))
133 *cp++ = v;
134 }
135
136 /*
137 * Define a charset, given a charset name.
138 * The valid charset names are listed in the "charsets" array.
139 */
140 static int
icharset(name)141 icharset(name)
142 register char *name;
143 {
144 register struct charset *p;
145 register struct cs_alias *a;
146
147 if (name == NULL || *name == '\0')
148 return (0);
149
150 /* First see if the name is an alias. */
151 for (a = cs_aliases; a->name != NULL; a++)
152 {
153 if (strcmp(name, a->name) == 0)
154 {
155 name = a->oname;
156 break;
157 }
158 }
159
160 for (p = charsets; p->name != NULL; p++)
161 {
162 if (strcmp(name, p->name) == 0)
163 {
164 ichardef(p->desc);
165 if (p->p_flag != NULL)
166 *(p->p_flag) = 1;
167 return (1);
168 }
169 }
170
171 error("invalid charset name", NULL_PARG);
172 quit(QUIT_ERROR);
173 /*NOTREACHED*/
174 return (0);
175 }
176
177 #if HAVE_LOCALE
178 /*
179 * Define a charset, given a locale name.
180 */
181 static void
ilocale()182 ilocale()
183 {
184 register int c;
185
186 #ifndef __MirBSD__
187 setlocale(LC_ALL, "");
188 #endif
189 for (c = 0; c < (int) sizeof(chardef); c++)
190 {
191 if (isprint(c))
192 chardef[c] = 0;
193 else if (iscntrl(c))
194 chardef[c] = IS_CONTROL_CHAR;
195 else
196 chardef[c] = IS_BINARY_CHAR|IS_CONTROL_CHAR;
197 }
198 }
199 #endif
200
201 /*
202 * Define the printing format for control chars.
203 */
204 public void
setbinfmt(s)205 setbinfmt(s)
206 char *s;
207 {
208 if (s == NULL || *s == '\0')
209 s = "*s<%X>";
210 /*
211 * Select the attributes if it starts with "*".
212 */
213 if (*s == '*')
214 {
215 switch (s[1])
216 {
217 case 'd': binattr = AT_BOLD; break;
218 case 'k': binattr = AT_BLINK; break;
219 case 's': binattr = AT_STANDOUT; break;
220 case 'u': binattr = AT_UNDERLINE; break;
221 default: binattr = AT_NORMAL; break;
222 }
223 s += 2;
224 }
225 binfmt = s;
226 }
227
228 /*
229 * Initialize charset data structures.
230 */
231 public void
init_charset()232 init_charset()
233 {
234 #ifdef __MirBSD__
235 icharset("utf-8");
236 #else
237 register char *s;
238
239 s = lgetenv("LESSBINFMT");
240 setbinfmt(s);
241
242 /*
243 * See if environment variable LESSCHARSET is defined.
244 */
245 s = lgetenv("LESSCHARSET");
246 if (icharset(s))
247 return;
248 /*
249 * LESSCHARSET is not defined: try LESSCHARDEF.
250 */
251 s = lgetenv("LESSCHARDEF");
252 if (s != NULL && *s != '\0')
253 {
254 ichardef(s);
255 return;
256 }
257
258 #if HAVE_STRSTR
259 /*
260 * Check whether LC_ALL, LC_CTYPE or LANG look like UTF-8 is used.
261 */
262 #if HAVE_UTF8_LOCALE
263 #ifdef LC_CTYPE
264 if ((s = setlocale(LC_CTYPE, "")) != NULL)
265 #else
266 if ((s = setlocale(LC_ALL, "")) != NULL)
267 #endif
268 {
269 if (strstr(s, "UTF-8") != NULL || strstr(s, "utf-8") != NULL)
270 if (icharset("utf-8"))
271 return;
272 #ifdef CODESET
273 if ((s = nl_langinfo(CODESET)) != NULL)
274 if (strstr(s, "UTF-8") != NULL || strstr(s, "utf-8") != NULL)
275 if (icharset("utf-8"))
276 return;
277 #endif
278 }
279 #endif
280 if ((s = lgetenv("LC_ALL")) != NULL ||
281 (s = lgetenv("LC_CTYPE")) != NULL ||
282 (s = lgetenv("LANG")) != NULL)
283 {
284 if (strstr(s, "UTF-8") != NULL || strstr(s, "utf-8") != NULL)
285 if (icharset("utf-8"))
286 return;
287 }
288 #endif
289
290 #if HAVE_LOCALE
291 /*
292 * Use setlocale.
293 */
294 ilocale();
295 #else
296 #if MSDOS_COMPILER
297 /*
298 * Default to "dos".
299 */
300 (void) icharset("dos");
301 #else
302 /*
303 * Default to "latin1".
304 */
305 (void) icharset("latin1");
306 #endif
307 #endif
308 #endif
309 }
310
311 /*
312 * Is a given character a "binary" character?
313 */
314 public int
binary_char(c)315 binary_char(c)
316 unsigned char c;
317 {
318 c &= 0377;
319 return (chardef[c] & IS_BINARY_CHAR);
320 }
321
322 /*
323 * Is a given character a "control" character?
324 */
325 public int
control_char(c)326 control_char(c)
327 int c;
328 {
329 c &= 0377;
330 return (chardef[c] & IS_CONTROL_CHAR);
331 }
332
333 /*
334 * Return the printable form of a character.
335 * For example, in the "ascii" charset '\3' is printed as "^C".
336 */
337 public char *
prchar(c)338 prchar(c)
339 int c;
340 {
341 static char buf[8];
342
343 c &= 0377;
344 if (!control_char(c))
345 snprintf(buf, sizeof(buf), "%c", c);
346 else if (c == ESC)
347 snprintf(buf, sizeof(buf), "ESC");
348 #if IS_EBCDIC_HOST
349 else if (!binary_char(c) && c < 64)
350 snprintf(buf, sizeof(buf), "^%c",
351 /*
352 * This array roughly inverts CONTROL() #defined in less.h,
353 * and should be kept in sync with CONTROL() and IBM-1047.
354 */
355 "@ABC.I.?...KLMNO"
356 "PQRS.JH.XY.."
357 "\\]^_"
358 "......W[.....EFG"
359 "..V....D....TU.Z"[c]);
360 #else
361 else if (c < 128 && !control_char(c ^ 0100))
362 snprintf(buf, sizeof(buf), "^%c", c ^ 0100);
363 #endif
364 else
365 snprintf(buf, sizeof(buf), binfmt, c);
366 return (buf);
367 }
368
369 #else /* SMALL */
370
/* Display attribute for binary characters; fixed at standout in SMALL builds, where nothing in this file changes it. */
public int binattr = AT_STANDOUT;
372
373 public void
init_charset()374 init_charset()
375 {
376 return;
377 }
378
379 /*
380 * Is a given character a "binary" character?
381 */
382 public int
binary_char(c)383 binary_char(c)
384 unsigned char c;
385 {
386 return (!isprint(c) && !isspace(c));
387 }
388
389 /*
390 * Is a given character a "control" character?
391 */
392 public int
control_char(c)393 control_char(c)
394 int c;
395 {
396 return (iscntrl(c));
397 }
398
399 /*
400 * Return the printable form of a character.
401 * For example, in the "ascii" charset '\3' is printed as "^C".
402 */
403 public char *
prchar(c)404 prchar(c)
405 int c;
406 {
407 static char buf[8];
408
409 c &= 0377;
410 if (!iscntrl(c))
411 snprintf(buf, sizeof(buf), "%c", c);
412 else if (c == ESC)
413 snprintf(buf, sizeof(buf), "ESC");
414 else if (c < 128 && !iscntrl(c ^ 0100))
415 snprintf(buf, sizeof(buf), "^%c", c ^ 0100);
416 else
417 snprintf(buf, sizeof(buf), "*s<%X>", c);
418 return (buf);
419 }
420 #endif /* SMALL */
421