1 /*
2 * $LynxId: UCAux.c,v 1.44 2010/11/07 21:21:09 tom Exp $
3 */
4 #include <HTUtils.h>
5
6 #include <HTCJK.h>
7 #include <UCMap.h>
8 #include <UCDefs.h>
9 #include <HTStream.h>
10 #include <UCAux.h>
11 #include <LYCharSets.h>
12 #include <LYCurses.h>
13 #include <LYStrings.h>
14
UCCanUniTranslateFrom(int from)15 BOOL UCCanUniTranslateFrom(int from)
16 {
17 if (from < 0)
18 return NO;
19 #ifndef EXP_JAPANESEUTF8_SUPPORT
20 if (LYCharSet_UC[from].enc == UCT_ENC_CJK)
21 return NO;
22 #endif
23 if (!strcmp(LYCharSet_UC[from].MIMEname, "x-transparent"))
24 return NO;
25
26 /* others YES */
27 return YES;
28 }
29
UCCanTranslateUniTo(int to)30 BOOL UCCanTranslateUniTo(int to)
31 {
32 if (to < 0)
33 return NO;
34 /*???
35 if (!strcmp(LYCharSet_UC[to].MIMEname, "x-transparent"))
36 return NO;
37 */
38
39 return YES; /* well at least some characters... */
40 }
41
UCCanTranslateFromTo(int from,int to)42 BOOL UCCanTranslateFromTo(int from,
43 int to)
44 {
45 if (from == to)
46 return YES;
47 if (from < 0 || to < 0)
48 return NO;
49 if (from == LATIN1)
50 return UCCanTranslateUniTo(to);
51 if (to == LATIN1 || LYCharSet_UC[to].enc == UCT_ENC_UTF8)
52 return UCCanUniTranslateFrom(from);
53 {
54 const char *fromname = LYCharSet_UC[from].MIMEname;
55 const char *toname = LYCharSet_UC[to].MIMEname;
56
57 if (!strcmp(fromname, "x-transparent") ||
58 !strcmp(toname, "x-transparent")) {
59 return YES; /* ??? */
60 } else if (!strcmp(fromname, "us-ascii")) {
61 return YES;
62 }
63 if (LYCharSet_UC[from].enc == UCT_ENC_CJK) {
64 /*
65 * CJK mode may be off (i.e., !IS_CJK_TTY) because the current
66 * document is not CJK, but the check may be for capability in
67 * relation to another document, for which CJK mode might be turned
68 * on when retrieved. Thus, when the from charset is CJK, check if
69 * the to charset is CJK, and return NO or YES in relation to that.
70 * - FM
71 */
72 if (LYCharSet_UC[to].enc != UCT_ENC_CJK)
73 return NO;
74 if ((!strcmp(toname, "euc-jp") ||
75 !strcmp(toname, "shift_jis")) &&
76 (!strcmp(fromname, "euc-jp") ||
77 !strcmp(fromname, "shift_jis")))
78 return YES;
79 /*
80 * The euc-cn and euc-kr charsets were handled by the (from == to)
81 * above, so we need not check those. - FM
82 */
83 return NO;
84 }
85 }
86 return YES; /* others YES */
87 }
88
89 /*
90 * Returns YES if no translation necessary (because
91 * charsets are equal, are equivalent, etc.).
92 */
UCNeedNotTranslate(int from,int to)93 BOOL UCNeedNotTranslate(int from,
94 int to)
95 {
96 const char *fromname;
97 const char *toname;
98
99 if (from == to)
100 return YES;
101 if (from < 0)
102 return NO; /* ??? */
103 if (LYCharSet_UC[from].enc == UCT_ENC_7BIT) {
104 return YES; /* Only 7bit chars. */
105 }
106 fromname = LYCharSet_UC[from].MIMEname;
107 if (!strcmp(fromname, "x-transparent") ||
108 !strcmp(fromname, "us-ascii")) {
109 return YES;
110 }
111 if (to < 0)
112 return NO; /* ??? */
113 if (to == LATIN1) {
114 if (LYCharSet_UC[from].codepoints & (UCT_CP_SUBSETOF_LAT1))
115 return YES;
116 }
117 toname = LYCharSet_UC[to].MIMEname;
118 if (!strcmp(toname, "x-transparent")) {
119 return YES;
120 }
121 if (LYCharSet_UC[to].enc == UCT_ENC_UTF8) {
122 return NO;
123 }
124 if (from == LATIN1) {
125 if (LYCharSet_UC[from].codepoints & (UCT_CP_SUPERSETOF_LAT1))
126 return YES;
127 }
128 if (LYCharSet_UC[from].enc == UCT_ENC_CJK) {
129 if (!IS_CJK_TTY) /* Use that global flag, for now. */
130 return NO;
131 if (HTCJK == JAPANESE &&
132 (!strcmp(fromname, "euc-jp") ||
133 !strcmp(fromname, "shift_jis")))
134 return YES; /* translate internally by lynx, no unicode */
135 return NO; /* If not handled by (from == to) above. */
136 }
137 return NO;
138 }
139
/*
 * The idea here is that any stage of the stream pipe which is interested
 * in some charset dependent processing will call this function.
 * Given input and output charsets, this function will set various flags
 * in a UCTransParams structure that _suggest_ to the caller what to do.
 *
 * Should be called once when a stage starts processing text (and the
 * input and output charsets are known), or whenever one of input or
 * output charsets has changed (e.g., by SGML.c stage after HTML.c stage
 * has processed a META tag).
 * The global flags (LYRawMode, HTPassEightBitRaw etc.) are currently
 * not taken into account here (except for HTCJK, somewhat), it's still
 * up to the caller to do something about them.  - KW
 *
 * Parameters:
 *   pT     - structure whose flags are written
 *   cs_in  - handle of the input charset; compared with LATIN1 and cs_out,
 *            and passed to UCCanUniTranslateFrom()
 *   p_in   - descriptor of the input charset (dereferenced unconditionally)
 *   cs_out - handle of the output charset; compared with cs_in and passed
 *            to UCCanTranslateUniTo()
 *   p_out  - descriptor of the output charset (dereferenced unconditionally)
 *
 * NOTE(review): not every branch assigns every flag.  The "transparent"
 * branch does not assign trans_to_uni or trans_from_uni, and the CJK branch
 * does not assign strip_raw_char_in; those fields keep whatever value they
 * had on entry.  Presumably callers run UCTransParams_clear() first -
 * confirm before relying on those fields after a charset change.
 */
void UCSetTransParams(UCTransParams * pT, int cs_in,
		      const LYUCcharset *p_in,
		      int cs_out,
		      const LYUCcharset *p_out)
{
    CTRACE((tfp, "UCSetTransParams: from %s(%d) to %s(%d)\n",
	    p_in->MIMEname, UCGetLYhndl_byMIME(p_in->MIMEname),
	    p_out->MIMEname, UCGetLYhndl_byMIME(p_out->MIMEname)));

    /*
     * Initialize this element to FALSE, and set it TRUE below if we're dealing
     * with VISCII.  - FM
     */
    pT->trans_C0_to_uni = FALSE;

    /*
     * The "transparent" display character set is a "super raw mode".  It is
     * selected when either the input or the output charset is named
     * "x-transparent".  - FM
     */
    pT->transp = (BOOL) (!strcmp(p_in->MIMEname, "x-transparent") ||
			 !strcmp(p_out->MIMEname, "x-transparent"));

    /*
     * UCS-2 is handled as a special case in SGML_write().
     */
    pT->ucs_mode = 0;

    if (pT->transp) {
	/*
	 * Set up the structure for "transparent".  - FM
	 */
	pT->do_cjk = FALSE;
	pT->decode_utf8 = FALSE;
	pT->output_utf8 = FALSE;	/* We may, but won't know about it. - KW */
	pT->do_8bitraw = TRUE;
	pT->use_raw_char_in = TRUE;
	pT->strip_raw_char_in = FALSE;
	pT->pass_160_173_raw = TRUE;
	pT->repl_translated_C0 = (BOOL) (p_out->enc == UCT_ENC_8BIT_C0);
	/* Here either side being an 8BIT_C0 charset turns the flag on. */
	pT->trans_C0_to_uni = (BOOL) (p_in->enc == UCT_ENC_8BIT_C0 ||
				      p_out->enc == UCT_ENC_8BIT_C0);
    } else {
	/*
	 * Initialize local flags.  - FM
	 */
	BOOL intm_ucs = FALSE;
	BOOL use_ucs = FALSE;

	/*
	 * Set this element if we want to treat the input as CJK.  - FM
	 */
	pT->do_cjk = (BOOL) ((p_in->enc == UCT_ENC_CJK) && IS_CJK_TTY);
	/*
	 * Set these elements based on whether we are dealing with UTF-8.  - FM
	 */
	pT->decode_utf8 = (BOOL) (p_in->enc == UCT_ENC_UTF8);
	pT->output_utf8 = (BOOL) (p_out->enc == UCT_ENC_UTF8);
	if (pT->do_cjk) {
	    /*
	     * Set up the structure for a CJK input with
	     * a CJK output (IS_CJK_TTY).  - FM
	     */
	    pT->trans_to_uni = FALSE;
	    pT->do_8bitraw = FALSE;
	    pT->pass_160_173_raw = TRUE;
	    pT->use_raw_char_in = FALSE;	/* Not used for CJK. - KW */
	    pT->repl_translated_C0 = FALSE;
	    pT->trans_from_uni = FALSE;	/* Not used for CJK. - KW */
	} else {
	    /*
	     * Set up for all other charset combinations.  The intm_ucs flag is
	     * set TRUE if the input charset is iso-8859-1 or UTF-8, or largely
	     * equivalent to them, i.e., if we have UCS without having to do a
	     * table translation.
	     */
	    intm_ucs = (BOOL) (cs_in == LATIN1 || pT->decode_utf8 ||
			       (p_in->codepoints &
				(UCT_CP_SUBSETOF_LAT1 | UCT_CP_SUBSETOF_UCS2)));
	    /*
	     * pT->trans_to_uni is set TRUE if we do not have that as input
	     * already, and we can translate to Unicode.  Note that UTF-8
	     * always is converted to Unicode in functions that use the
	     * transformation structure, so it is treated as already Unicode
	     * here.
	     */
	    pT->trans_to_uni = (BOOL) (!intm_ucs &&
				       UCCanUniTranslateFrom(cs_in));
	    /*
	     * We set this if we are translating to Unicode and what normally
	     * are low value control characters in fact are encoding octets for
	     * the input charset (presently, this applies to VISCII).  - FM
	     * Must come after trans_to_uni has been computed.
	     */
	    pT->trans_C0_to_uni = (BOOL) (pT->trans_to_uni &&
					  p_in->enc == UCT_ENC_8BIT_C0);
	    /*
	     * We set this, presently, for VISCII.  - FM
	     */
	    pT->repl_translated_C0 = (BOOL) (p_out->enc == UCT_ENC_8BIT_C0);
	    /*
	     * Currently unused for any charset combination.
	     * Should always be FALSE
	     */
	    pT->strip_raw_char_in = FALSE;
	    /*
	     * use_ucs should be set TRUE if we have or will create Unicode
	     * values for input octets or UTF multibytes.  - FM
	     */
	    use_ucs = (BOOL) (intm_ucs || pT->trans_to_uni);
	    /*
	     * This is set TRUE if use_ucs was set FALSE.  It is complementary
	     * to the HTPassEightBitRaw flag, which is set TRUE or FALSE
	     * elsewhere based on the raw mode setting in relation to the
	     * current Display Character Set.  - FM
	     */
	    pT->do_8bitraw = (BOOL) (!use_ucs);
	    /*
	     * This is set TRUE when 160 and 173 should not be treated as nbsp
	     * and shy, respectively.  - FM
	     */
	    pT->pass_160_173_raw = (BOOL) (!use_ucs &&
					   !(p_in->like8859 & UCT_R_8859SPECL));
	    /*
	     * This is set when the input and output charsets match, and they
	     * are not ones which should go through a Unicode translation
	     * process anyway.  - FM
	     */
	    pT->use_raw_char_in = (BOOL) (!pT->output_utf8 &&
					  cs_in == cs_out &&
					  !pT->trans_C0_to_uni);
	    /*
	     * This should be set TRUE when we expect to have done translation
	     * to Unicode or had the equivalent as input, can translate it to
	     * our output charset, and normally want to do so.  The latter
	     * depends on the pT->do_8bitraw and pT->use_raw_char_in values set
	     * above, but also on HTPassEightBitRaw in any functions which use
	     * the transformation structure.  - FM
	     */
	    pT->trans_from_uni = (BOOL) (use_ucs && !pT->do_8bitraw &&
					 !pT->use_raw_char_in &&
					 UCCanTranslateUniTo(cs_out));
	}
    }
}
296
297 /*
298 * This function initializes the transformation
299 * structure by setting all its elements to
300 * FALSE. - KW
301 */
UCTransParams_clear(UCTransParams * pT)302 void UCTransParams_clear(UCTransParams * pT)
303 {
304 pT->transp = FALSE;
305 pT->do_cjk = FALSE;
306 pT->decode_utf8 = FALSE;
307 pT->output_utf8 = FALSE;
308 pT->do_8bitraw = FALSE;
309 pT->use_raw_char_in = FALSE;
310 pT->strip_raw_char_in = FALSE;
311 pT->pass_160_173_raw = FALSE;
312 pT->trans_to_uni = FALSE;
313 pT->trans_C0_to_uni = FALSE;
314 pT->repl_translated_C0 = FALSE;
315 pT->trans_from_uni = FALSE;
316 }
317
/*
 * If terminal is in UTF-8 mode, it probably cannot understand box drawing
 * chars as the 8-bit (n)curses handles them.  (This may also be true for other
 * display character sets, but isn't currently checked.)  In that case set the
 * chars for horizontal and vertical drawing chars to displayable ASCII chars
 * if '0' was requested.  They'll stay as they are otherwise.  -KW, TD
 *
 * If we're able to obtain a character set based on the locale settings,
 * assume that the user has setup $TERM and the fonts already so line-drawing
 * works.
 *
 * Parameters:
 *   cset      - handle of the display charset; a negative value skips all
 *               charset-dependent checks
 *   pvert_out - receives the vertical line character to use
 *   phori_out - receives the horizontal line character to use
 *   vert_in   - requested vertical character (0 means "use the default")
 *   hori_in   - requested horizontal character (0 means "use the default")
 *
 * When the function decides that line drawing must be "fixed", a requested
 * value of 0 is replaced by '|' (vertical) or '-' (horizontal); non-zero
 * requests are passed through unchanged.
 *
 * NOTE(review): in the terminfo-based branch, the (cset == UTF8_handle) case
 * caches last_result = FALSE without assigning fix_lines, so the first call
 * keeps whatever the WIDEC_CURSES test above decided while later calls with
 * the same cset take the cached FALSE.  Confirm whether that difference
 * between first and subsequent calls is intended.
 */
void UCSetBoxChars(int cset,
		   int *pvert_out,
		   int *phori_out,
		   int vert_in,
		   int hori_in)
{
    BOOL fix_lines = FALSE;

    if (cset >= 0) {
#ifndef WIDEC_CURSES
	/* Without wide-character curses, a UTF-8 display needs the fallback. */
	if (LYCharSet_UC[cset].enc == UCT_ENC_UTF8) {
	    fix_lines = TRUE;
	}
#endif
	/*
	 * If we've identified a charset that works, require it.
	 * This is important if we have loaded a font, which would
	 * confuse curses.
	 */
	/* US-ASCII vs Latin-1 is safe (usually) */
	if ((cset == US_ASCII
	     || cset == LATIN1)
	    && (linedrawing_char_set == US_ASCII
		|| linedrawing_char_set == LATIN1)) {
#if (defined(FANCY_CURSES) && defined(A_ALTCHARSET)) || defined(USE_SLANG)
	    /* Discard the requested characters, i.e. ask for the defaults. */
	    vert_in = 0;
	    hori_in = 0;
#else
	    ;
#endif
	}
#ifdef EXP_CHARTRANS_AUTOSWITCH
#if defined(NCURSES_VERSION) || defined(HAVE_TIGETSTR)
	else {
	    /*
	     * State kept across calls: whether the terminal's "acsc" map has
	     * been read yet, and the result computed for the last charset.
	     */
	    static BOOL first = TRUE;
	    static int last_cset = -99;
	    static BOOL last_result = TRUE;
	    /* *INDENT-OFF* */
	    /*
	     * mapping:  key character looked up in the "acsc" string
	     * internal: Unicode value of the corresponding box-drawing char
	     * external: octet the terminal maps that key to (0 if not listed)
	     */
	    static struct {
		int mapping;
		UCode_t internal;
		int external;
	    } table[] = {
		{ 'j', 0x2518, 0 }, /* BOX DRAWINGS LIGHT UP AND LEFT */
		{ 'k', 0x2510, 0 }, /* BOX DRAWINGS LIGHT DOWN AND LEFT */
		{ 'l', 0x250c, 0 }, /* BOX DRAWINGS LIGHT DOWN AND RIGHT */
		{ 'm', 0x2514, 0 }, /* BOX DRAWINGS LIGHT UP AND RIGHT */
		{ 'n', 0x253c, 0 }, /* BOX DRAWINGS LIGHT VERTICAL AND HORIZONTAL */
		{ 'q', 0x2500, 0 }, /* BOX DRAWINGS LIGHT HORIZONTAL */
		{ 't', 0x251c, 0 }, /* BOX DRAWINGS LIGHT VERTICAL AND RIGHT */
		{ 'u', 0x2524, 0 }, /* BOX DRAWINGS LIGHT VERTICAL AND LEFT */
		{ 'v', 0x2534, 0 }, /* BOX DRAWINGS LIGHT UP AND HORIZONTAL */
		{ 'w', 0x252c, 0 }, /* BOX DRAWINGS LIGHT DOWN AND HORIZONTAL */
		{ 'x', 0x2502, 0 }, /* BOX DRAWINGS LIGHT VERTICAL */
	    };
	    /* *INDENT-ON* */

	    unsigned n;

	    if (first) {
		/* One-time setup: read the terminal's "acsc" capability. */
		static char acsc_name[] = "acsc";
		char *map = tigetstr(acsc_name);

		if (map != 0) {
		    CTRACE((tfp, "build terminal line-drawing map\n"));
		    /* The string is consumed two characters at a time. */
		    while (map[0] != 0 && map[1] != 0) {
			for (n = 0; n < TABLESIZE(table); ++n) {
			    if (table[n].mapping == map[0]) {
				table[n].external = UCH(map[1]);
				CTRACE((tfp,
					" map[%c] %#" PRI_UCode_t " -> %#x\n",
					table[n].mapping,
					table[n].internal,
					table[n].external));
				break;
			    }
			}
			map += 2;
		    }
		}
		first = FALSE;
	    }

	    if (cset == last_cset) {
		/* Same charset as last time: reuse the cached answer. */
		fix_lines = last_result;
	    } else if (cset == UTF8_handle) {
		last_result = FALSE;
		last_cset = cset;
	    } else {
		/*
		 * Translate each box-drawing Unicode value to the display
		 * charset and compare with what the terminal expects; any
		 * mismatch means the ASCII fallback is needed.
		 */
		CTRACE((tfp, "check terminal line-drawing map\n"));
		for (n = 0; n < TABLESIZE(table); ++n) {
		    int test = UCTransUniChar(table[n].internal, cset);

		    if (test != table[n].external) {
			CTRACE((tfp,
				"line-drawing map %c mismatch (have %#x, want %#x)\n",
				table[n].mapping,
				test, table[n].external));
			fix_lines = TRUE;
			break;
		    }
		}
		last_result = fix_lines;
		last_cset = cset;
	    }
	}
#else
	/* No terminfo access: any charset other than the known one is fixed. */
	else if (cset != linedrawing_char_set && linedrawing_char_set >= 0) {
	    fix_lines = TRUE;
	}
#endif
#endif
    }
    if (fix_lines) {
	/* Only fill in characters that were left at the "default" value 0. */
	if (!vert_in)
	    vert_in = '|';
	if (!hori_in)
	    hori_in = '-';
    }
    *pvert_out = vert_in;
    *phori_out = hori_in;
}
451
452 /*
453 * Given an output target HTStream* (can also be a HTStructured* via
454 * typecast), the target stream's put_character method, and a Unicode
455 * character, CPutUtf8_charstring() will either output the UTF8
456 * encoding of the Unicode and return YES, or do nothing and return
457 * NO (if conversion would be unnecessary or the Unicode character is
458 * considered invalid).
459 *
460 * [Could be used more generally, but is currently only used for &#nnnnn
461 * stuff - generation of UTF8 from 8-bit encoded charsets not yet done
462 * by SGML.c etc.]
463 */
464 #define PUTC(ch) ((*myPutc)(target, (char)(ch)))
465 #define PUTC2(ch) ((*myPutc)(target,(char)(0x80|(0x3f &(ch)))))
466
UCPutUtf8_charstring(HTStream * target,putc_func_t * myPutc,UCode_t code)467 BOOL UCPutUtf8_charstring(HTStream *target, putc_func_t *myPutc, UCode_t code)
468 {
469 if (code < 128)
470 return NO; /* indicate to caller we didn't handle it */
471 else if (code < 0x800L) {
472 PUTC(0xc0 | (code >> 6));
473 PUTC2(code);
474 } else if (code < 0x10000L) {
475 PUTC(0xe0 | (code >> 12));
476 PUTC2(code >> 6);
477 PUTC2(code);
478 } else if (code < 0x200000L) {
479 PUTC(0xf0 | (code >> 18));
480 PUTC2(code >> 12);
481 PUTC2(code >> 6);
482 PUTC2(code);
483 } else if (code < 0x4000000L) {
484 PUTC(0xf8 | (code >> 24));
485 PUTC2(code >> 18);
486 PUTC2(code >> 12);
487 PUTC2(code >> 6);
488 PUTC2(code);
489 } else if (code <= 0x7fffffffL) {
490 PUTC(0xfc | (code >> 30));
491 PUTC2(code >> 24);
492 PUTC2(code >> 18);
493 PUTC2(code >> 12);
494 PUTC2(code >> 6);
495 PUTC2(code);
496 } else
497 return NO;
498 return YES;
499 }
500
501 /*
502 * This function converts a Unicode (UCode_t) value
503 * to a multibyte UTF-8 character, which is loaded
504 * into the buffer received as an argument. The
505 * buffer should be large enough to hold at least
506 * seven characters (but should be declared as 8
507 * to minimize byte alignment problems with some
508 * compilers). - FM
509 */
UCConvertUniToUtf8(UCode_t code,char * buffer)510 BOOL UCConvertUniToUtf8(UCode_t code, char *buffer)
511 {
512 char *ch = buffer;
513
514 if (!ch)
515 return NO;
516
517 if (code <= 0 || code > 0x7fffffffL) {
518 *ch = '\0';
519 return NO;
520 }
521
522 if (code < 0x800L) {
523 *ch++ = (char) (0xc0 | (code >> 6));
524 *ch++ = (char) (0x80 | (0x3f & (code)));
525 *ch = '\0';
526 } else if (code < 0x10000L) {
527 *ch++ = (char) (0xe0 | (code >> 12));
528 *ch++ = (char) (0x80 | (0x3f & (code >> 6)));
529 *ch++ = (char) (0x80 | (0x3f & (code)));
530 *ch = '\0';
531 } else if (code < 0x200000L) {
532 *ch++ = (char) (0xf0 | (code >> 18));
533 *ch++ = (char) (0x80 | (0x3f & (code >> 12)));
534 *ch++ = (char) (0x80 | (0x3f & (code >> 6)));
535 *ch++ = (char) (0x80 | (0x3f & (code)));
536 *ch = '\0';
537 } else if (code < 0x4000000L) {
538 *ch++ = (char) (0xf8 | (code >> 24));
539 *ch++ = (char) (0x80 | (0x3f & (code >> 18)));
540 *ch++ = (char) (0x80 | (0x3f & (code >> 12)));
541 *ch++ = (char) (0x80 | (0x3f & (code >> 6)));
542 *ch++ = (char) (0x80 | (0x3f & (code)));
543 *ch = '\0';
544 } else {
545 *ch++ = (char) (0xfc | (code >> 30));
546 *ch++ = (char) (0x80 | (0x3f & (code >> 24)));
547 *ch++ = (char) (0x80 | (0x3f & (code >> 18)));
548 *ch++ = (char) (0x80 | (0x3f & (code >> 12)));
549 *ch++ = (char) (0x80 | (0x3f & (code >> 6)));
550 *ch++ = (char) (0x80 | (0x3f & (code)));
551 *ch = '\0';
552 }
553 return YES;
554 }
555
556 /*
557 * Get UCS character code for one character from UTF-8 encoded string.
558 *
559 * On entry:
560 * *ppuni should point to beginning of UTF-8 encoding character
561 * On exit:
562 * *ppuni is advanced to point to the last byte of UTF-8 sequence,
563 * if there was a valid one; otherwise unchanged.
564 * returns the UCS value
565 * returns negative value on error (invalid UTF-8 sequence)
566 */
UCGetUniFromUtf8String(char ** ppuni)567 UCode_t UCGetUniFromUtf8String(char **ppuni)
568 {
569 UCode_t uc_out = 0;
570 char *p = *ppuni;
571 int utf_count, i;
572
573 if (!(**ppuni & 0x80))
574 return (UCode_t) **ppuni; /* ASCII range character */
575 else if (!(**ppuni & 0x40))
576 return (-1); /* not a valid UTF-8 start */
577 if ((*p & 0xe0) == 0xc0) {
578 utf_count = 1;
579 } else if ((*p & 0xf0) == 0xe0) {
580 utf_count = 2;
581 } else if ((*p & 0xf8) == 0xf0) {
582 utf_count = 3;
583 } else if ((*p & 0xfc) == 0xf8) {
584 utf_count = 4;
585 } else if ((*p & 0xfe) == 0xfc) {
586 utf_count = 5;
587 } else { /* garbage */
588 return (-1);
589 }
590 for (p = *ppuni, i = 0; i < utf_count; i++) {
591 if ((*(++p) & 0xc0) != 0x80)
592 return (-1);
593 }
594 p = *ppuni;
595 switch (utf_count) {
596 case 1:
597 uc_out = (((*p & 0x1f) << 6) |
598 (*(p + 1) & 0x3f));
599 break;
600 case 2:
601 uc_out = (((((*p & 0x0f) << 6) |
602 (*(p + 1) & 0x3f)) << 6) |
603 (*(p + 2) & 0x3f));
604 break;
605 case 3:
606 uc_out = (((((((*p & 0x07) << 6) |
607 (*(p + 1) & 0x3f)) << 6) |
608 (*(p + 2) & 0x3f)) << 6) |
609 (*(p + 3) & 0x3f));
610 break;
611 case 4:
612 uc_out = (((((((((*p & 0x03) << 6) |
613 (*(p + 1) & 0x3f)) << 6) |
614 (*(p + 2) & 0x3f)) << 6) |
615 (*(p + 3) & 0x3f)) << 6) |
616 (*(p + 4) & 0x3f));
617 break;
618 case 5:
619 uc_out = (((((((((((*p & 0x01) << 6) |
620 (*(p + 1) & 0x3f)) << 6) |
621 (*(p + 2) & 0x3f)) << 6) |
622 (*(p + 3) & 0x3f)) << 6) |
623 (*(p + 4) & 0x3f)) << 6) |
624 (*(p + 5) & 0x3f));
625 break;
626 }
627 *ppuni = p + utf_count;
628 return uc_out;
629 }
630