1 /*
2  * $LynxId: UCAux.c,v 1.44 2010/11/07 21:21:09 tom Exp $
3  */
4 #include <HTUtils.h>
5 
6 #include <HTCJK.h>
7 #include <UCMap.h>
8 #include <UCDefs.h>
9 #include <HTStream.h>
10 #include <UCAux.h>
11 #include <LYCharSets.h>
12 #include <LYCurses.h>
13 #include <LYStrings.h>
14 
UCCanUniTranslateFrom(int from)15 BOOL UCCanUniTranslateFrom(int from)
16 {
17     if (from < 0)
18 	return NO;
19 #ifndef EXP_JAPANESEUTF8_SUPPORT
20     if (LYCharSet_UC[from].enc == UCT_ENC_CJK)
21 	return NO;
22 #endif
23     if (!strcmp(LYCharSet_UC[from].MIMEname, "x-transparent"))
24 	return NO;
25 
26     /* others YES */
27     return YES;
28 }
29 
UCCanTranslateUniTo(int to)30 BOOL UCCanTranslateUniTo(int to)
31 {
32     if (to < 0)
33 	return NO;
34 /*???
35     if (!strcmp(LYCharSet_UC[to].MIMEname, "x-transparent"))
36        return NO;
37 */
38 
39     return YES;			/* well at least some characters... */
40 }
41 
UCCanTranslateFromTo(int from,int to)42 BOOL UCCanTranslateFromTo(int from,
43 			  int to)
44 {
45     if (from == to)
46 	return YES;
47     if (from < 0 || to < 0)
48 	return NO;
49     if (from == LATIN1)
50 	return UCCanTranslateUniTo(to);
51     if (to == LATIN1 || LYCharSet_UC[to].enc == UCT_ENC_UTF8)
52 	return UCCanUniTranslateFrom(from);
53     {
54 	const char *fromname = LYCharSet_UC[from].MIMEname;
55 	const char *toname = LYCharSet_UC[to].MIMEname;
56 
57 	if (!strcmp(fromname, "x-transparent") ||
58 	    !strcmp(toname, "x-transparent")) {
59 	    return YES;		/* ??? */
60 	} else if (!strcmp(fromname, "us-ascii")) {
61 	    return YES;
62 	}
63 	if (LYCharSet_UC[from].enc == UCT_ENC_CJK) {
64 	    /*
65 	     * CJK mode may be off (i.e., !IS_CJK_TTY) because the current
66 	     * document is not CJK, but the check may be for capability in
67 	     * relation to another document, for which CJK mode might be turned
68 	     * on when retrieved.  Thus, when the from charset is CJK, check if
69 	     * the to charset is CJK, and return NO or YES in relation to that.
70 	     * - FM
71 	     */
72 	    if (LYCharSet_UC[to].enc != UCT_ENC_CJK)
73 		return NO;
74 	    if ((!strcmp(toname, "euc-jp") ||
75 		 !strcmp(toname, "shift_jis")) &&
76 		(!strcmp(fromname, "euc-jp") ||
77 		 !strcmp(fromname, "shift_jis")))
78 		return YES;
79 	    /*
80 	     * The euc-cn and euc-kr charsets were handled by the (from == to)
81 	     * above, so we need not check those.  - FM
82 	     */
83 	    return NO;
84 	}
85     }
86     return YES;			/* others YES */
87 }
88 
89 /*
90  *  Returns YES if no translation necessary (because
91  *  charsets are equal, are equivalent, etc.).
92  */
UCNeedNotTranslate(int from,int to)93 BOOL UCNeedNotTranslate(int from,
94 			int to)
95 {
96     const char *fromname;
97     const char *toname;
98 
99     if (from == to)
100 	return YES;
101     if (from < 0)
102 	return NO;		/* ??? */
103     if (LYCharSet_UC[from].enc == UCT_ENC_7BIT) {
104 	return YES;		/* Only 7bit chars. */
105     }
106     fromname = LYCharSet_UC[from].MIMEname;
107     if (!strcmp(fromname, "x-transparent") ||
108 	!strcmp(fromname, "us-ascii")) {
109 	return YES;
110     }
111     if (to < 0)
112 	return NO;		/* ??? */
113     if (to == LATIN1) {
114 	if (LYCharSet_UC[from].codepoints & (UCT_CP_SUBSETOF_LAT1))
115 	    return YES;
116     }
117     toname = LYCharSet_UC[to].MIMEname;
118     if (!strcmp(toname, "x-transparent")) {
119 	return YES;
120     }
121     if (LYCharSet_UC[to].enc == UCT_ENC_UTF8) {
122 	return NO;
123     }
124     if (from == LATIN1) {
125 	if (LYCharSet_UC[from].codepoints & (UCT_CP_SUPERSETOF_LAT1))
126 	    return YES;
127     }
128     if (LYCharSet_UC[from].enc == UCT_ENC_CJK) {
129 	if (!IS_CJK_TTY)	/* Use that global flag, for now. */
130 	    return NO;
131 	if (HTCJK == JAPANESE &&
132 	    (!strcmp(fromname, "euc-jp") ||
133 	     !strcmp(fromname, "shift_jis")))
134 	    return YES;		/* translate internally by lynx, no unicode */
135 	return NO;		/* If not handled by (from == to) above. */
136     }
137     return NO;
138 }
139 
140 /*
141  *  The idea here is that any stage of the stream pipe which is interested
142  *  in some charset dependent processing will call this function.
143  *  Given input and output charsets, this function will set various flags
144  *  in a UCTransParams structure that _suggest_ to the caller what to do.
145  *
146  *  Should be called once when a stage starts processing text (and the
147  *  input and output charsets are known), or whenever one of input or
148  *  output charsets has changed (e.g., by SGML.c stage after HTML.c stage
149  *  has processed a META tag).
150  *  The global flags (LYRawMode, HTPassEightBitRaw etc.) are currently
151  *  not taken into account here (except for HTCJK, somewhat), it's still
152  *  up to the caller to do something about them. - KW
153  */
UCSetTransParams(UCTransParams * pT,int cs_in,const LYUCcharset * p_in,int cs_out,const LYUCcharset * p_out)154 void UCSetTransParams(UCTransParams * pT, int cs_in,
155 		      const LYUCcharset *p_in,
156 		      int cs_out,
157 		      const LYUCcharset *p_out)
158 {
159     CTRACE((tfp, "UCSetTransParams: from %s(%d) to %s(%d)\n",
160 	    p_in->MIMEname, UCGetLYhndl_byMIME(p_in->MIMEname),
161 	    p_out->MIMEname, UCGetLYhndl_byMIME(p_out->MIMEname)));
162 
163     /*
164      * Initialize this element to FALSE, and set it TRUE below if we're dealing
165      * with VISCII.  - FM
166      */
167     pT->trans_C0_to_uni = FALSE;
168 
169     /*
170      * The "transparent" display character set is a "super raw mode".  - FM
171      */
172     pT->transp = (BOOL) (!strcmp(p_in->MIMEname, "x-transparent") ||
173 			 !strcmp(p_out->MIMEname, "x-transparent"));
174 
175     /*
176      * UCS-2 is handled as a special case in SGML_write().
177      */
178     pT->ucs_mode = 0;
179 
180     if (pT->transp) {
181 	/*
182 	 * Set up the structure for "transparent".  - FM
183 	 */
184 	pT->do_cjk = FALSE;
185 	pT->decode_utf8 = FALSE;
186 	pT->output_utf8 = FALSE;	/* We may, but won't know about it. - KW */
187 	pT->do_8bitraw = TRUE;
188 	pT->use_raw_char_in = TRUE;
189 	pT->strip_raw_char_in = FALSE;
190 	pT->pass_160_173_raw = TRUE;
191 	pT->repl_translated_C0 = (BOOL) (p_out->enc == UCT_ENC_8BIT_C0);
192 	pT->trans_C0_to_uni = (BOOL) (p_in->enc == UCT_ENC_8BIT_C0 ||
193 				      p_out->enc == UCT_ENC_8BIT_C0);
194     } else {
195 	/*
196 	 * Initialize local flags.  - FM
197 	 */
198 	BOOL intm_ucs = FALSE;
199 	BOOL use_ucs = FALSE;
200 
201 	/*
202 	 * Set this element if we want to treat the input as CJK.  - FM
203 	 */
204 	pT->do_cjk = (BOOL) ((p_in->enc == UCT_ENC_CJK) && IS_CJK_TTY);
205 	/*
206 	 * Set these elements based on whether we are dealing with UTF-8.  - FM
207 	 */
208 	pT->decode_utf8 = (BOOL) (p_in->enc == UCT_ENC_UTF8);
209 	pT->output_utf8 = (BOOL) (p_out->enc == UCT_ENC_UTF8);
210 	if (pT->do_cjk) {
211 	    /*
212 	     * Set up the structure for a CJK input with
213 	     * a CJK output (IS_CJK_TTY).  - FM
214 	     */
215 	    pT->trans_to_uni = FALSE;
216 	    pT->do_8bitraw = FALSE;
217 	    pT->pass_160_173_raw = TRUE;
218 	    pT->use_raw_char_in = FALSE;	/* Not used for CJK. - KW */
219 	    pT->repl_translated_C0 = FALSE;
220 	    pT->trans_from_uni = FALSE;		/* Not used for CJK. - KW */
221 	} else {
222 	    /*
223 	     * Set up for all other charset combinations.  The intm_ucs flag is
224 	     * set TRUE if the input charset is iso-8859-1 or UTF-8, or largely
225 	     * equivalent to them, i.e., if we have UCS without having to do a
226 	     * table translation.
227 	     */
228 	    intm_ucs = (BOOL) (cs_in == LATIN1 || pT->decode_utf8 ||
229 			       (p_in->codepoints &
230 				(UCT_CP_SUBSETOF_LAT1 | UCT_CP_SUBSETOF_UCS2)));
231 	    /*
232 	     * pT->trans_to_uni is set TRUE if we do not have that as input
233 	     * already, and we can translate to Unicode.  Note that UTF-8
234 	     * always is converted to Unicode in functions that use the
235 	     * transformation structure, so it is treated as already Unicode
236 	     * here.
237 	     */
238 	    pT->trans_to_uni = (BOOL) (!intm_ucs &&
239 				       UCCanUniTranslateFrom(cs_in));
240 	    /*
241 	     * We set this if we are translating to Unicode and what normally
242 	     * are low value control characters in fact are encoding octets for
243 	     * the input charset (presently, this applies to VISCII).  - FM
244 	     */
245 	    pT->trans_C0_to_uni = (BOOL) (pT->trans_to_uni &&
246 					  p_in->enc == UCT_ENC_8BIT_C0);
247 	    /*
248 	     * We set this, presently, for VISCII.  - FM
249 	     */
250 	    pT->repl_translated_C0 = (BOOL) (p_out->enc == UCT_ENC_8BIT_C0);
251 	    /*
252 	     * Currently unused for any charset combination.
253 	     * Should always be FALSE
254 	     */
255 	    pT->strip_raw_char_in = FALSE;
256 	    /*
257 	     * use_ucs should be set TRUE if we have or will create Unicode
258 	     * values for input octets or UTF multibytes.  - FM
259 	     */
260 	    use_ucs = (BOOL) (intm_ucs || pT->trans_to_uni);
261 	    /*
262 	     * This is set TRUE if use_ucs was set FALSE.  It is complementary
263 	     * to the HTPassEightBitRaw flag, which is set TRUE or FALSE
264 	     * elsewhere based on the raw mode setting in relation to the
265 	     * current Display Character Set.  - FM
266 	     */
267 	    pT->do_8bitraw = (BOOL) (!use_ucs);
268 	    /*
269 	     * This is set TRUE when 160 and 173 should not be treated as nbsp
270 	     * and shy, respectively.  - FM
271 	     */
272 	    pT->pass_160_173_raw = (BOOL) (!use_ucs &&
273 					   !(p_in->like8859 & UCT_R_8859SPECL));
274 	    /*
275 	     * This is set when the input and output charsets match, and they
276 	     * are not ones which should go through a Unicode translation
277 	     * process anyway.  - FM
278 	     */
279 	    pT->use_raw_char_in = (BOOL) (!pT->output_utf8 &&
280 					  cs_in == cs_out &&
281 					  !pT->trans_C0_to_uni);
282 	    /*
283 	     * This should be set TRUE when we expect to have done translation
284 	     * to Unicode or had the equivalent as input, can translate it to
285 	     * our output charset, and normally want to do so.  The latter
286 	     * depends on the pT->do_8bitraw and pT->use_raw_char_in values set
287 	     * above, but also on HTPassEightBitRaw in any functions which use
288 	     * the transformation structure..  - FM
289 	     */
290 	    pT->trans_from_uni = (BOOL) (use_ucs && !pT->do_8bitraw &&
291 					 !pT->use_raw_char_in &&
292 					 UCCanTranslateUniTo(cs_out));
293 	}
294     }
295 }
296 
297 /*
298  *  This function initializes the transformation
299  *  structure by setting all its elements to
300  *  FALSE. - KW
301  */
UCTransParams_clear(UCTransParams * pT)302 void UCTransParams_clear(UCTransParams * pT)
303 {
304     pT->transp = FALSE;
305     pT->do_cjk = FALSE;
306     pT->decode_utf8 = FALSE;
307     pT->output_utf8 = FALSE;
308     pT->do_8bitraw = FALSE;
309     pT->use_raw_char_in = FALSE;
310     pT->strip_raw_char_in = FALSE;
311     pT->pass_160_173_raw = FALSE;
312     pT->trans_to_uni = FALSE;
313     pT->trans_C0_to_uni = FALSE;
314     pT->repl_translated_C0 = FALSE;
315     pT->trans_from_uni = FALSE;
316 }
317 
318 /*
319  * If terminal is in UTF-8 mode, it probably cannot understand box drawing
320  * chars as the 8-bit (n)curses handles them.  (This may also be true for other
321  * display character sets, but isn't currently checked.) In that case set the
322  * chars for horizontal and vertical drawing chars to displayable ASCII chars
323  * if '0' was requested.  They'll stay as they are otherwise.  -KW, TD
324  *
325  * If we're able to obtain a character set based on the locale settings,
326  * assume that the user has setup $TERM and the fonts already so line-drawing
327  * works.
328  */
UCSetBoxChars(int cset,int * pvert_out,int * phori_out,int vert_in,int hori_in)329 void UCSetBoxChars(int cset,
330 		   int *pvert_out,
331 		   int *phori_out,
332 		   int vert_in,
333 		   int hori_in)
334 {
335     BOOL fix_lines = FALSE;
336 
337     if (cset >= 0) {
338 #ifndef WIDEC_CURSES
339 	if (LYCharSet_UC[cset].enc == UCT_ENC_UTF8) {
340 	    fix_lines = TRUE;
341 	}
342 #endif
343 	/*
344 	 * If we've identified a charset that works, require it.
345 	 * This is important if we have loaded a font, which would
346 	 * confuse curses.
347 	 */
348 	/* US-ASCII vs Latin-1 is safe (usually) */
349 	if ((cset == US_ASCII
350 	     || cset == LATIN1)
351 	    && (linedrawing_char_set == US_ASCII
352 		|| linedrawing_char_set == LATIN1)) {
353 #if (defined(FANCY_CURSES) && defined(A_ALTCHARSET)) || defined(USE_SLANG)
354 	    vert_in = 0;
355 	    hori_in = 0;
356 #else
357 	    ;
358 #endif
359 	}
360 #ifdef EXP_CHARTRANS_AUTOSWITCH
361 #if defined(NCURSES_VERSION) || defined(HAVE_TIGETSTR)
362 	else {
363 	    static BOOL first = TRUE;
364 	    static int last_cset = -99;
365 	    static BOOL last_result = TRUE;
366 	    /* *INDENT-OFF* */
367 	    static struct {
368 		int mapping;
369 		UCode_t internal;
370 		int external;
371 	    } table[] = {
372 		{ 'j', 0x2518, 0 }, /* BOX DRAWINGS LIGHT UP AND LEFT */
373 		{ 'k', 0x2510, 0 }, /* BOX DRAWINGS LIGHT DOWN AND LEFT */
374 		{ 'l', 0x250c, 0 }, /* BOX DRAWINGS LIGHT DOWN AND RIGHT */
375 		{ 'm', 0x2514, 0 }, /* BOX DRAWINGS LIGHT UP AND RIGHT */
376 		{ 'n', 0x253c, 0 }, /* BOX DRAWINGS LIGHT VERTICAL AND HORIZONTAL */
377 		{ 'q', 0x2500, 0 }, /* BOX DRAWINGS LIGHT HORIZONTAL */
378 		{ 't', 0x251c, 0 }, /* BOX DRAWINGS LIGHT VERTICAL AND RIGHT */
379 		{ 'u', 0x2524, 0 }, /* BOX DRAWINGS LIGHT VERTICAL AND LEFT */
380 		{ 'v', 0x2534, 0 }, /* BOX DRAWINGS LIGHT UP AND HORIZONTAL */
381 		{ 'w', 0x252c, 0 }, /* BOX DRAWINGS LIGHT DOWN AND HORIZONTAL */
382 		{ 'x', 0x2502, 0 }, /* BOX DRAWINGS LIGHT VERTICAL */
383 	    };
384 	    /* *INDENT-ON* */
385 
386 	    unsigned n;
387 
388 	    if (first) {
389 		static char acsc_name[] = "acsc";
390 		char *map = tigetstr(acsc_name);
391 
392 		if (map != 0) {
393 		    CTRACE((tfp, "build terminal line-drawing map\n"));
394 		    while (map[0] != 0 && map[1] != 0) {
395 			for (n = 0; n < TABLESIZE(table); ++n) {
396 			    if (table[n].mapping == map[0]) {
397 				table[n].external = UCH(map[1]);
398 				CTRACE((tfp,
399 					"  map[%c] %#" PRI_UCode_t " -> %#x\n",
400 					table[n].mapping,
401 					table[n].internal,
402 					table[n].external));
403 				break;
404 			    }
405 			}
406 			map += 2;
407 		    }
408 		}
409 		first = FALSE;
410 	    }
411 
412 	    if (cset == last_cset) {
413 		fix_lines = last_result;
414 	    } else if (cset == UTF8_handle) {
415 		last_result = FALSE;
416 		last_cset = cset;
417 	    } else {
418 		CTRACE((tfp, "check terminal line-drawing map\n"));
419 		for (n = 0; n < TABLESIZE(table); ++n) {
420 		    int test = UCTransUniChar(table[n].internal, cset);
421 
422 		    if (test != table[n].external) {
423 			CTRACE((tfp,
424 				"line-drawing map %c mismatch (have %#x, want %#x)\n",
425 				table[n].mapping,
426 				test, table[n].external));
427 			fix_lines = TRUE;
428 			break;
429 		    }
430 		}
431 		last_result = fix_lines;
432 		last_cset = cset;
433 	    }
434 	}
435 #else
436 	else if (cset != linedrawing_char_set && linedrawing_char_set >= 0) {
437 	    fix_lines = TRUE;
438 	}
439 #endif
440 #endif
441     }
442     if (fix_lines) {
443 	if (!vert_in)
444 	    vert_in = '|';
445 	if (!hori_in)
446 	    hori_in = '-';
447     }
448     *pvert_out = vert_in;
449     *phori_out = hori_in;
450 }
451 
452 /*
453  *  Given an output target HTStream* (can also be a HTStructured* via
454  *  typecast), the target stream's put_character method, and a Unicode
455  *  character,  CPutUtf8_charstring() will either output the UTF8
456  *  encoding of the Unicode and return YES, or do nothing and return
457  *  NO (if conversion would be unnecessary or the Unicode character is
458  *  considered invalid).
459  *
460  *  [Could be used more generally, but is currently only used for &#nnnnn
461  *  stuff - generation of UTF8 from 8-bit encoded charsets not yet done
462  *  by SGML.c etc.]
463  */
464 #define PUTC(ch) ((*myPutc)(target, (char)(ch)))
465 #define PUTC2(ch) ((*myPutc)(target,(char)(0x80|(0x3f &(ch)))))
466 
UCPutUtf8_charstring(HTStream * target,putc_func_t * myPutc,UCode_t code)467 BOOL UCPutUtf8_charstring(HTStream *target, putc_func_t *myPutc, UCode_t code)
468 {
469     if (code < 128)
470 	return NO;		/* indicate to caller we didn't handle it */
471     else if (code < 0x800L) {
472 	PUTC(0xc0 | (code >> 6));
473 	PUTC2(code);
474     } else if (code < 0x10000L) {
475 	PUTC(0xe0 | (code >> 12));
476 	PUTC2(code >> 6);
477 	PUTC2(code);
478     } else if (code < 0x200000L) {
479 	PUTC(0xf0 | (code >> 18));
480 	PUTC2(code >> 12);
481 	PUTC2(code >> 6);
482 	PUTC2(code);
483     } else if (code < 0x4000000L) {
484 	PUTC(0xf8 | (code >> 24));
485 	PUTC2(code >> 18);
486 	PUTC2(code >> 12);
487 	PUTC2(code >> 6);
488 	PUTC2(code);
489     } else if (code <= 0x7fffffffL) {
490 	PUTC(0xfc | (code >> 30));
491 	PUTC2(code >> 24);
492 	PUTC2(code >> 18);
493 	PUTC2(code >> 12);
494 	PUTC2(code >> 6);
495 	PUTC2(code);
496     } else
497 	return NO;
498     return YES;
499 }
500 
501 /*
502  *  This function converts a Unicode (UCode_t) value
503  *  to a multibyte UTF-8 character, which is loaded
504  *  into the buffer received as an argument.  The
505  *  buffer should be large enough to hold at least
506  *  seven characters (but should be declared as 8
507  *  to minimize byte alignment problems with some
508  *  compilers). - FM
509  */
UCConvertUniToUtf8(UCode_t code,char * buffer)510 BOOL UCConvertUniToUtf8(UCode_t code, char *buffer)
511 {
512     char *ch = buffer;
513 
514     if (!ch)
515 	return NO;
516 
517     if (code <= 0 || code > 0x7fffffffL) {
518 	*ch = '\0';
519 	return NO;
520     }
521 
522     if (code < 0x800L) {
523 	*ch++ = (char) (0xc0 | (code >> 6));
524 	*ch++ = (char) (0x80 | (0x3f & (code)));
525 	*ch = '\0';
526     } else if (code < 0x10000L) {
527 	*ch++ = (char) (0xe0 | (code >> 12));
528 	*ch++ = (char) (0x80 | (0x3f & (code >> 6)));
529 	*ch++ = (char) (0x80 | (0x3f & (code)));
530 	*ch = '\0';
531     } else if (code < 0x200000L) {
532 	*ch++ = (char) (0xf0 | (code >> 18));
533 	*ch++ = (char) (0x80 | (0x3f & (code >> 12)));
534 	*ch++ = (char) (0x80 | (0x3f & (code >> 6)));
535 	*ch++ = (char) (0x80 | (0x3f & (code)));
536 	*ch = '\0';
537     } else if (code < 0x4000000L) {
538 	*ch++ = (char) (0xf8 | (code >> 24));
539 	*ch++ = (char) (0x80 | (0x3f & (code >> 18)));
540 	*ch++ = (char) (0x80 | (0x3f & (code >> 12)));
541 	*ch++ = (char) (0x80 | (0x3f & (code >> 6)));
542 	*ch++ = (char) (0x80 | (0x3f & (code)));
543 	*ch = '\0';
544     } else {
545 	*ch++ = (char) (0xfc | (code >> 30));
546 	*ch++ = (char) (0x80 | (0x3f & (code >> 24)));
547 	*ch++ = (char) (0x80 | (0x3f & (code >> 18)));
548 	*ch++ = (char) (0x80 | (0x3f & (code >> 12)));
549 	*ch++ = (char) (0x80 | (0x3f & (code >> 6)));
550 	*ch++ = (char) (0x80 | (0x3f & (code)));
551 	*ch = '\0';
552     }
553     return YES;
554 }
555 
556 /*
557  * Get UCS character code for one character from UTF-8 encoded string.
558  *
559  * On entry:
560  *	*ppuni should point to beginning of UTF-8 encoding character
561  * On exit:
562  *	*ppuni is advanced to point to the last byte of UTF-8 sequence,
563  *		if there was a valid one; otherwise unchanged.
564  * returns the UCS value
565  * returns negative value on error (invalid UTF-8 sequence)
566  */
UCGetUniFromUtf8String(char ** ppuni)567 UCode_t UCGetUniFromUtf8String(char **ppuni)
568 {
569     UCode_t uc_out = 0;
570     char *p = *ppuni;
571     int utf_count, i;
572 
573     if (!(**ppuni & 0x80))
574 	return (UCode_t) **ppuni;	/* ASCII range character */
575     else if (!(**ppuni & 0x40))
576 	return (-1);		/* not a valid UTF-8 start */
577     if ((*p & 0xe0) == 0xc0) {
578 	utf_count = 1;
579     } else if ((*p & 0xf0) == 0xe0) {
580 	utf_count = 2;
581     } else if ((*p & 0xf8) == 0xf0) {
582 	utf_count = 3;
583     } else if ((*p & 0xfc) == 0xf8) {
584 	utf_count = 4;
585     } else if ((*p & 0xfe) == 0xfc) {
586 	utf_count = 5;
587     } else {			/* garbage */
588 	return (-1);
589     }
590     for (p = *ppuni, i = 0; i < utf_count; i++) {
591 	if ((*(++p) & 0xc0) != 0x80)
592 	    return (-1);
593     }
594     p = *ppuni;
595     switch (utf_count) {
596     case 1:
597 	uc_out = (((*p & 0x1f) << 6) |
598 		  (*(p + 1) & 0x3f));
599 	break;
600     case 2:
601 	uc_out = (((((*p & 0x0f) << 6) |
602 		    (*(p + 1) & 0x3f)) << 6) |
603 		  (*(p + 2) & 0x3f));
604 	break;
605     case 3:
606 	uc_out = (((((((*p & 0x07) << 6) |
607 		      (*(p + 1) & 0x3f)) << 6) |
608 		    (*(p + 2) & 0x3f)) << 6) |
609 		  (*(p + 3) & 0x3f));
610 	break;
611     case 4:
612 	uc_out = (((((((((*p & 0x03) << 6) |
613 			(*(p + 1) & 0x3f)) << 6) |
614 		      (*(p + 2) & 0x3f)) << 6) |
615 		    (*(p + 3) & 0x3f)) << 6) |
616 		  (*(p + 4) & 0x3f));
617 	break;
618     case 5:
619 	uc_out = (((((((((((*p & 0x01) << 6) |
620 			  (*(p + 1) & 0x3f)) << 6) |
621 			(*(p + 2) & 0x3f)) << 6) |
622 		      (*(p + 3) & 0x3f)) << 6) |
623 		    (*(p + 4) & 0x3f)) << 6) |
624 		  (*(p + 5) & 0x3f));
625 	break;
626     }
627     *ppuni = p + utf_count;
628     return uc_out;
629 }
630