1 /*
2  * Copyright (C) 1984-2002  Mark Nudelman
3  *
4  * You may distribute under the terms of either the GNU General Public
5  * License or the Less License, as specified in the README file.
6  *
7  * For more information about less, or for information on how to
8  * contact the author, see the README file.
9  */
10 
11 
12 /*
13  * Functions to define the character set
14  * and do things specific to the character set.
15  */
16 
17 #include "less.h"
18 #if HAVE_LOCALE
19 #include <locale.h>
20 #include <ctype.h>
21 #endif
22 #if HAVE_UTF8_LOCALE
23 #include <langinfo.h>
24 #include <locale.h>
25 #endif
26 
27 __RCSID("$MirOS: src/usr.bin/less/charset.c,v 1.3 2007/07/05 23:09:42 tg Exp $");
28 
29 public int utf_mode = 0;
30 
31 #if !SMALL
32 /*
33  * Predefined character sets,
34  * selected by the LESSCHARSET environment variable.
35  */
36 struct charset {
37 	char *name;
38 	int *p_flag;
39 	char *desc;
40 } charsets[] = {
41 	{ "ascii",	NULL,       "8bcccbcc18b95.b" },
42 	{ "dos",	NULL,       "8bcccbcc12bc5b223.b" },
43 	{ "ebcdic",	NULL,       "5bc6bcc7bcc41b.9b7.9b5.b..8b6.10b6.b9.7b9.8b8.17b3.3b9.7b9.8b8.6b10.b.b.b." },
44 	{ "IBM-1047",	NULL,       "4cbcbc3b9cbccbccbb4c6bcc5b3cbbc4bc4bccbc191.b" },
45 	{ "iso8859",	NULL,       "8bcccbcc18b95.33b." },
46 	{ "koi8-r",	NULL,       "8bcccbcc18b95.b128." },
47 	{ "next",	NULL,       "8bcccbcc18b95.bb125.bb" },
48 	{ "utf-8",	&utf_mode,  "8bcccbcc18b." },
49 	{ NULL, NULL, NULL }
50 };
51 
/*
 * Alternate names for entries of the charsets table.
 *
 *	name	the alias as the user may spell it
 *	oname	the charset name the alias stands for
 *
 * The table is terminated by an entry whose name is NULL.
 */
struct cs_alias {
	char *name;
	char *oname;
};

struct cs_alias cs_aliases[] = {
	{ "latin1", "iso8859" },
	{ "latin9", "iso8859" },
	{ NULL,     NULL }
};
60 
61 #define	IS_BINARY_CHAR	01
62 #define	IS_CONTROL_CHAR	02
63 
64 static char chardef[256];
65 static char *binfmt = NULL;
66 public int binattr = AT_STANDOUT;
67 
68 
69 /*
70  * Define a charset, given a description string.
71  * The string consists of 256 letters,
72  * one for each character in the charset.
73  * If the string is shorter than 256 letters, missing letters
74  * are taken to be identical to the last one.
75  * A decimal number followed by a letter is taken to be a
76  * repetition of the letter.
77  *
78  * Each letter is one of:
79  *	. normal character
80  *	b binary character
81  *	c control character
82  */
83 	static void
ichardef(s)84 ichardef(s)
85 	char *s;
86 {
87 	register char *cp;
88 	register int n;
89 	register char v;
90 
91 	n = 0;
92 	v = 0;
93 	cp = chardef;
94 	while (*s != '\0')
95 	{
96 		switch (*s++)
97 		{
98 		case '.':
99 			v = 0;
100 			break;
101 		case 'c':
102 			v = IS_CONTROL_CHAR;
103 			break;
104 		case 'b':
105 			v = IS_BINARY_CHAR|IS_CONTROL_CHAR;
106 			break;
107 
108 		case '0': case '1': case '2': case '3': case '4':
109 		case '5': case '6': case '7': case '8': case '9':
110 			n = (10 * n) + (s[-1] - '0');
111 			continue;
112 
113 		default:
114 			error("invalid chardef", NULL_PARG);
115 			quit(QUIT_ERROR);
116 			/*NOTREACHED*/
117 		}
118 
119 		do
120 		{
121 			if (cp >= chardef + sizeof(chardef))
122 			{
123 				error("chardef longer than 256", NULL_PARG);
124 				quit(QUIT_ERROR);
125 				/*NOTREACHED*/
126 			}
127 			*cp++ = v;
128 		} while (--n > 0);
129 		n = 0;
130 	}
131 
132 	while (cp < chardef + sizeof(chardef))
133 		*cp++ = v;
134 }
135 
136 /*
137  * Define a charset, given a charset name.
138  * The valid charset names are listed in the "charsets" array.
139  */
140 	static int
icharset(name)141 icharset(name)
142 	register char *name;
143 {
144 	register struct charset *p;
145 	register struct cs_alias *a;
146 
147 	if (name == NULL || *name == '\0')
148 		return (0);
149 
150 	/* First see if the name is an alias. */
151 	for (a = cs_aliases;  a->name != NULL;  a++)
152 	{
153 		if (strcmp(name, a->name) == 0)
154 		{
155 			name = a->oname;
156 			break;
157 		}
158 	}
159 
160 	for (p = charsets;  p->name != NULL;  p++)
161 	{
162 		if (strcmp(name, p->name) == 0)
163 		{
164 			ichardef(p->desc);
165 			if (p->p_flag != NULL)
166 				*(p->p_flag) = 1;
167 			return (1);
168 		}
169 	}
170 
171 	error("invalid charset name", NULL_PARG);
172 	quit(QUIT_ERROR);
173 	/*NOTREACHED*/
174 	return (0);
175 }
176 
#if HAVE_LOCALE
/*
 * Fill in chardef[] from the <ctype.h> classification of each
 * byte value: printable chars are normal, other control chars
 * are control, and everything else is binary.
 * Except on MirBSD, the locale is first set from the environment.
 */
	static void
ilocale()
{
	register int ch;

#ifndef __MirBSD__
	setlocale(LC_ALL, "");
#endif
	for (ch = 0;  ch < (int) sizeof(chardef);  ch++)
	{
		char flags;

		if (isprint(ch))
			flags = 0;
		else if (iscntrl(ch))
			flags = IS_CONTROL_CHAR;
		else
			flags = IS_BINARY_CHAR|IS_CONTROL_CHAR;
		chardef[ch] = flags;
	}
}
#endif
200 
201 /*
202  * Define the printing format for control chars.
203  */
204    	public void
setbinfmt(s)205 setbinfmt(s)
206 	char *s;
207 {
208 	if (s == NULL || *s == '\0')
209 		s = "*s<%X>";
210 	/*
211 	 * Select the attributes if it starts with "*".
212 	 */
213 	if (*s == '*')
214 	{
215 		switch (s[1])
216 		{
217 		case 'd':  binattr = AT_BOLD;      break;
218 		case 'k':  binattr = AT_BLINK;     break;
219 		case 's':  binattr = AT_STANDOUT;  break;
220 		case 'u':  binattr = AT_UNDERLINE; break;
221 		default:   binattr = AT_NORMAL;    break;
222 		}
223 		s += 2;
224 	}
225 	binfmt = s;
226 }
227 
228 /*
229  * Initialize charset data structures.
230  */
231 	public void
init_charset()232 init_charset()
233 {
234 #ifdef __MirBSD__
235 	icharset("utf-8");
236 #else
237 	register char *s;
238 
239 	s = lgetenv("LESSBINFMT");
240 	setbinfmt(s);
241 
242 	/*
243 	 * See if environment variable LESSCHARSET is defined.
244 	 */
245 	s = lgetenv("LESSCHARSET");
246 	if (icharset(s))
247 		return;
248 	/*
249 	 * LESSCHARSET is not defined: try LESSCHARDEF.
250 	 */
251 	s = lgetenv("LESSCHARDEF");
252 	if (s != NULL && *s != '\0')
253 	{
254 		ichardef(s);
255 		return;
256 	}
257 
258 #if HAVE_STRSTR
259 	/*
260 	 * Check whether LC_ALL, LC_CTYPE or LANG look like UTF-8 is used.
261 	 */
262 #if HAVE_UTF8_LOCALE
263 #ifdef LC_CTYPE
264 	if ((s = setlocale(LC_CTYPE, "")) != NULL)
265 #else
266 	if ((s = setlocale(LC_ALL, "")) != NULL)
267 #endif
268 	{
269 		if (strstr(s, "UTF-8") != NULL || strstr(s, "utf-8") != NULL)
270 			if (icharset("utf-8"))
271 				return;
272 #ifdef CODESET
273 		if ((s = nl_langinfo(CODESET)) != NULL)
274 		if (strstr(s, "UTF-8") != NULL || strstr(s, "utf-8") != NULL)
275 			if (icharset("utf-8"))
276 				return;
277 #endif
278 	}
279 #endif
280 	if ((s = lgetenv("LC_ALL")) != NULL ||
281 	    (s = lgetenv("LC_CTYPE")) != NULL ||
282 	    (s = lgetenv("LANG")) != NULL)
283 	{
284 		if (strstr(s, "UTF-8") != NULL || strstr(s, "utf-8") != NULL)
285 			if (icharset("utf-8"))
286 				return;
287 	}
288 #endif
289 
290 #if HAVE_LOCALE
291 	/*
292 	 * Use setlocale.
293 	 */
294 	ilocale();
295 #else
296 #if MSDOS_COMPILER
297 	/*
298 	 * Default to "dos".
299 	 */
300 	(void) icharset("dos");
301 #else
302 	/*
303 	 * Default to "latin1".
304 	 */
305 	(void) icharset("latin1");
306 #endif
307 #endif
308 #endif
309 }
310 
311 /*
312  * Is a given character a "binary" character?
313  */
314 	public int
binary_char(c)315 binary_char(c)
316 	unsigned char c;
317 {
318 	c &= 0377;
319 	return (chardef[c] & IS_BINARY_CHAR);
320 }
321 
322 /*
323  * Is a given character a "control" character?
324  */
325 	public int
control_char(c)326 control_char(c)
327 	int c;
328 {
329 	c &= 0377;
330 	return (chardef[c] & IS_CONTROL_CHAR);
331 }
332 
333 /*
334  * Return the printable form of a character.
335  * For example, in the "ascii" charset '\3' is printed as "^C".
336  */
337 	public char *
prchar(c)338 prchar(c)
339 	int c;
340 {
341 	static char buf[8];
342 
343 	c &= 0377;
344 	if (!control_char(c))
345 		snprintf(buf, sizeof(buf), "%c", c);
346 	else if (c == ESC)
347 		snprintf(buf, sizeof(buf), "ESC");
348 #if IS_EBCDIC_HOST
349 	else if (!binary_char(c) && c < 64)
350 		snprintf(buf, sizeof(buf), "^%c",
351 		/*
352 		 * This array roughly inverts CONTROL() #defined in less.h,
353 	 	 * and should be kept in sync with CONTROL() and IBM-1047.
354  	 	 */
355 		"@ABC.I.?...KLMNO"
356 		"PQRS.JH.XY.."
357 		"\\]^_"
358 		"......W[.....EFG"
359 		"..V....D....TU.Z"[c]);
360 #else
361   	else if (c < 128 && !control_char(c ^ 0100))
362   		snprintf(buf, sizeof(buf), "^%c", c ^ 0100);
363 #endif
364 	else
365 		snprintf(buf, sizeof(buf), binfmt, c);
366 	return (buf);
367 }
368 
369 #else /* SMALL */
370 
371 public int binattr = AT_STANDOUT;
372 
373 	public void
init_charset()374 init_charset()
375 {
376 	return;
377 }
378 
379 /*
380  * Is a given character a "binary" character?
381  */
382 	public int
binary_char(c)383 binary_char(c)
384 	unsigned char c;
385 {
386 	return (!isprint(c) && !isspace(c));
387 }
388 
389 /*
390  * Is a given character a "control" character?
391  */
392 	public int
control_char(c)393 control_char(c)
394 	int c;
395 {
396 	return (iscntrl(c));
397 }
398 
399 /*
400  * Return the printable form of a character.
401  * For example, in the "ascii" charset '\3' is printed as "^C".
402  */
403 	public char *
prchar(c)404 prchar(c)
405 	int c;
406 {
407 	static char buf[8];
408 
409 	c &= 0377;
410 	if (!iscntrl(c))
411 		snprintf(buf, sizeof(buf), "%c", c);
412 	else if (c == ESC)
413 		snprintf(buf, sizeof(buf), "ESC");
414   	else if (c < 128 && !iscntrl(c ^ 0100))
415   		snprintf(buf, sizeof(buf), "^%c", c ^ 0100);
416 	else
417 		snprintf(buf, sizeof(buf), "*s<%X>", c);
418 	return (buf);
419 }
420 #endif /* SMALL */
421