1
2 #include "EXTERN.h"
3 #include "perl.h"
4 #include "XSUB.h"
5
6 /* These 5 files are prepared by mkheader */
7 #include "unfcmb.h"
8 #include "unfcan.h"
9 #include "unfcpt.h"
10 #include "unfcmp.h"
11 #include "unfexc.h"
12
/*
 * Compatibility shims: when the running perl does not provide the
 * uvuni-style names (Perl 5.6.1, presumably), fall back to the older
 * uv_to_utf8 / utf8_to_uv names.
 */
#if !defined(uvuni_to_utf8)
#  define uvuni_to_utf8 uv_to_utf8
#endif /* uvuni_to_utf8 */

#if !defined(utf8n_to_uvuni)
#  define utf8n_to_uvuni utf8_to_uv
#endif /* utf8n_to_uvuni */

/*
 * Flag set handed to every utf8n_to_uvuni() call in this file.
 * UTF8_ALLOW_BOM is only used before Perl 5.8.0, so it is included
 * only when the headers define it.
 */
#if defined(UTF8_ALLOW_BOM)
#  define AllowAnyUTF (UTF8_ALLOW_SURROGATE|UTF8_ALLOW_BOM|UTF8_ALLOW_FFFF)
#else
#  define AllowAnyUTF (UTF8_ALLOW_SURROGATE|UTF8_ALLOW_FFFF)
#endif
29
/* croak() message used when utf8n_to_uvuni() reports a character length of 0 */
#define ErrRetlenIsZero "panic (Unicode::Normalize): zero-length character"

/* croak() message used when utf8_hop() lands before the start of the buffer
 * (possibly a sign of broken UTF-8) */
#define ErrHopBeforeStart "panic (Unicode::Normalize): hopping before start"

/*
 * Largest code point covered by the lookup helpers.  Anything above it is
 * treated as having no properties and is passed through without complaint.
 */
#define VALID_UTF_MAX (0x10ffff)
#define OVER_UTF_MAX(uv) ((uv) > VALID_UTF_MAX)
39
/* HANGUL_H */

/* Leading consonants (L) */
#define Hangul_LBase  0x1100
#define Hangul_LFinal 0x1112
#define Hangul_LCount 19

/* Vowels (V) */
#define Hangul_VBase  0x1161
#define Hangul_VFinal 0x1175
#define Hangul_VCount 21

/* Trailing consonants (T); TBase itself stands for "no trailing consonant" */
#define Hangul_TBase  0x11A7
#define Hangul_TFinal 0x11C2
#define Hangul_TCount 28

/* Syllables sharing one leading consonant: VCount * TCount == 588 */
#define Hangul_NCount (Hangul_VCount * Hangul_TCount)

/* Precomposed syllables (S): LCount * NCount == 11172 of them */
#define Hangul_SBase  0xAC00
#define Hangul_SFinal 0xD7A3
#define Hangul_SCount (Hangul_LCount * Hangul_NCount)

/* Range predicates.  Each one evaluates its argument more than once. */
#define Hangul_IsS(u)  (((u) >= Hangul_SBase) && ((u) <= Hangul_SFinal))
#define Hangul_IsN(u)  ((((u) - Hangul_SBase) % Hangul_TCount) == 0)
#define Hangul_IsLV(u) (Hangul_IsS(u) && Hangul_IsN(u))
#define Hangul_IsL(u)  (((u) >= Hangul_LBase) && ((u) <= Hangul_LFinal))
#define Hangul_IsV(u)  (((u) >= Hangul_VBase) && ((u) <= Hangul_VFinal))
/* strictly greater than TBase: TBase is not a trailing consonant itself */
#define Hangul_IsT(u)  (((u) > Hangul_TBase) && ((u) <= Hangul_TFinal))

/* HANGUL_H */
66
67 /* this is used for canonical ordering of combining characters (c.c.). */
68 typedef struct {
69 U8 cc; /* combining class */
70 UV uv; /* codepoint */
71 STRLEN pos; /* position */
72 } UNF_cc;
73
compare_cc(const void * a,const void * b)74 static int compare_cc (const void *a, const void *b)
75 {
76 int ret_cc;
77 ret_cc = ((UNF_cc*) a)->cc - ((UNF_cc*) b)->cc;
78 if (ret_cc)
79 return ret_cc;
80
81 return ( ((UNF_cc*) a)->pos > ((UNF_cc*) b)->pos )
82 - ( ((UNF_cc*) a)->pos < ((UNF_cc*) b)->pos );
83 }
84
dec_canonical(UV uv)85 static U8* dec_canonical (UV uv)
86 {
87 U8 ***plane, **row;
88 if (OVER_UTF_MAX(uv))
89 return NULL;
90 plane = (U8***)UNF_canon[uv >> 16];
91 if (! plane)
92 return NULL;
93 row = plane[(uv >> 8) & 0xff];
94 return row ? row[uv & 0xff] : NULL;
95 }
96
dec_compat(UV uv)97 static U8* dec_compat (UV uv)
98 {
99 U8 ***plane, **row;
100 if (OVER_UTF_MAX(uv))
101 return NULL;
102 plane = (U8***)UNF_compat[uv >> 16];
103 if (! plane)
104 return NULL;
105 row = plane[(uv >> 8) & 0xff];
106 return row ? row[uv & 0xff] : NULL;
107 }
108
/*
 * Return the primary composite of the pair (uv, uv2), or 0 if there is none.
 *
 * Hangul is handled arithmetically: L + V gives an LV syllable, and an LV
 * syllable + T gives an LVT syllable.  Every other pair is looked up in
 * the UNF_compos table, whose cells are lists terminated by an entry with
 * nextchar == 0.  A zero uv2, or either code point above VALID_UTF_MAX,
 * never composes.
 */
static UV composite_uv (UV uv, UV uv2)
{
    UNF_complist ***plane;
    UNF_complist **row;
    UNF_complist *entry;

    if (uv2 == 0)
        return 0;
    if (OVER_UTF_MAX(uv) || OVER_UTF_MAX(uv2))
        return 0;

    if (Hangul_IsL(uv) && Hangul_IsV(uv2)) {
        UV lindex = uv - Hangul_LBase;
        UV vindex = uv2 - Hangul_VBase;
        return Hangul_SBase + (lindex * Hangul_VCount + vindex) * Hangul_TCount;
    }

    if (Hangul_IsLV(uv) && Hangul_IsT(uv2)) {
        UV tindex = uv2 - Hangul_TBase;
        return uv + tindex;
    }

    plane = UNF_compos[uv >> 16];
    if (plane == NULL)
        return 0;

    row = plane[(uv >> 8) & 0xff];
    if (row == NULL)
        return 0;

    entry = row[uv & 0xff];
    if (entry == NULL)
        return 0;

    /* scan the candidates for uv until the zero terminator */
    while (entry->nextchar) {
        if (entry->nextchar == uv2)
            return entry->composite;
        entry++;
    }
    return 0;
}
140
getCombinClass(UV uv)141 static U8 getCombinClass (UV uv)
142 {
143 U8 **plane, *row;
144 if (OVER_UTF_MAX(uv))
145 return 0;
146 plane = (U8**)UNF_combin[uv >> 16];
147 if (! plane)
148 return 0;
149 row = plane[(uv >> 8) & 0xff];
150 return row ? row[uv & 0xff] : 0;
151 }
152
/*
 * Append the jamo decomposition of the Hangul syllable uv to sv.
 *
 * The syllable is split arithmetically into a leading consonant, a vowel
 * and, when present, a trailing consonant; each is encoded as UTF-8 and
 * the result is appended with sv_catpvn().  Does nothing when uv is not
 * a Hangul syllable.
 */
static void sv_cat_decompHangul (SV* sv, UV uv)
{
    UV syllable, lead, vowel, trail;
    U8 buf[3 * UTF8_MAXLEN + 1];   /* up to three characters plus NUL */
    U8 *end;

    if (! Hangul_IsS(uv))
        return;

    syllable = uv - Hangul_SBase;
    lead  = syllable / Hangul_NCount;
    vowel = (syllable % Hangul_NCount) / Hangul_TCount;
    trail = syllable % Hangul_TCount;

    end = uvuni_to_utf8(buf, (lead + Hangul_LBase));
    end = uvuni_to_utf8(end, (vowel + Hangul_VBase));
    if (trail != 0)   /* index 0 means there is no trailing consonant */
        end = uvuni_to_utf8(end, (trail + Hangul_TBase));
    *end = '\0';

    sv_catpvn(sv, (char *)buf, end - buf);
}
174
/*
 * Append the single code point uv, encoded as UTF-8, to sv.
 */
static void sv_cat_uvuni (SV* sv, UV uv)
{
    U8 buf[UTF8_MAXLEN + 1];   /* one character plus NUL */
    U8 *end;

    end = uvuni_to_utf8(buf, uv);
    *end = '\0';
    sv_catpvn(sv, (char *)buf, end - buf);
}
184
/*
 * Return the string value of sv as UTF-8 and store its byte length in *lp.
 *
 * When sv already carries the UTF-8 flag its own buffer is returned.
 * Otherwise a mortal copy is made, forced to a string if necessary,
 * upgraded to UTF-8, and the copy's buffer is returned; sv itself is not
 * upgraded.
 */
static char * sv_2pvunicode(SV *sv, STRLEN *lp)
{
    STRLEN len;
    char *str;
    SV *upgraded;

    str = (char*)SvPV(sv,len);
    if (SvUTF8(sv)) {
        *lp = len;
        return str;
    }

    /* work on a mortal copy so the caller's scalar is left untouched */
    upgraded = sv_mortalcopy(sv);
    if (!SvPOK(upgraded))
        (void)sv_pvn_force(upgraded,&len);
    sv_utf8_upgrade(upgraded);
    str = (char*)SvPV(upgraded,len);

    *lp = len;
    return str;
}
200
201 MODULE = Unicode::Normalize PACKAGE = Unicode::Normalize
202
SV*
decompose(src, compat = &PL_sv_no)
    SV * src
    SV * compat
  PROTOTYPE: $;$
  PREINIT:
    SV *dst;
    STRLEN srclen, retlen;
    U8 *s, *e, *p, *r;
    UV uv;
    bool iscompat;
  CODE:
    /*
     * Return a new UTF-8 scalar in which every character of src is
     * replaced by its decomposition mapping: the compatibility mapping
     * when compat is true, the canonical one otherwise.  Hangul syllables
     * are decomposed arithmetically; characters without a mapping are
     * copied unchanged.  Croaks if a zero-length character is decoded.
     */
    iscompat = SvTRUE(compat);
    s = (U8*)sv_2pvunicode(src,&srclen);
    e = s + srclen;

    /*
     * Start from a real empty string rather than newSV(1)+SvPOK_only():
     * the latter leaves the buffer without a trailing NUL, which is what
     * the caller would get back for an empty src.
     */
    dst = newSVpvn("", 0);
    SvUTF8_on(dst);

    for (p = s; p < e; p += retlen) {
        uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
        if (!retlen) {
            SvREFCNT_dec(dst);  /* croak() does not return; don't leak dst */
            croak(ErrRetlenIsZero);
        }

        if (Hangul_IsS(uv))
            sv_cat_decompHangul(dst, uv);
        else {
            r = iscompat ? dec_compat(uv) : dec_canonical(uv);
            if (r)
                sv_catpv(dst, (char *)r);
            else
                sv_cat_uvuni(dst, uv);
        }
    }
    RETVAL = dst;
  OUTPUT:
    RETVAL
241
242
243
SV*
reorder(src)
    SV * src
  PROTOTYPE: $
  PREINIT:
    SV *dst;
    STRLEN srclen, dstlen, retlen, stk_cc_max;
    U8 *s, *e, *p, *d, curCC;
    UV uv, uvlast;
    UNF_cc * stk_cc;
    STRLEN i, cc_pos;
    bool valid_uvlast;
  CODE:
    /*
     * Return a new UTF-8 scalar holding src with every run of characters
     * of non-zero combining class sorted by combining class.  Characters
     * of equal class keep their relative order (see compare_cc()).
     * Characters of class 0 are copied through unchanged.
     * Croaks if a zero-length character is decoded; the work buffers are
     * released first so that the error path does not leak.
     */
    s = (U8*)sv_2pvunicode(src,&srclen);
    e = s + srclen;

    dstlen = srclen + 1;
    dst = newSV(dstlen);
    (void)SvPOK_only(dst);
    SvUTF8_on(dst);
    d = (U8*)SvPVX(dst);

    stk_cc_max = 10; /* initial capacity; doubled on demand below */
    New(0, stk_cc, stk_cc_max, UNF_cc);

    for (p = s; p < e;) {
        uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
        if (!retlen) {
            Safefree(stk_cc);
            SvREFCNT_dec(dst);
            croak(ErrRetlenIsZero);
        }
        p += retlen;

        curCC = getCombinClass(uv);
        if (curCC == 0) {
            d = uvuni_to_utf8(d, uv);
            continue;
        }

        /* first combining character of a run */
        cc_pos = 0;
        stk_cc[cc_pos].cc  = curCC;
        stk_cc[cc_pos].uv  = uv;
        stk_cc[cc_pos].pos = cc_pos;

        /* collect the rest of the run, up to the next class-0 character */
        valid_uvlast = FALSE;
        while (p < e) {
            uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
            if (!retlen) {
                Safefree(stk_cc);
                SvREFCNT_dec(dst);
                croak(ErrRetlenIsZero);
            }
            p += retlen;

            curCC = getCombinClass(uv);
            if (curCC == 0) {
                uvlast = uv;
                valid_uvlast = TRUE;
                break;
            }

            cc_pos++;
            if (stk_cc_max <= cc_pos) {
                /* grow geometrically instead of one element per Renew */
                stk_cc_max *= 2;
                Renew(stk_cc, stk_cc_max, UNF_cc);
            }
            stk_cc[cc_pos].cc  = curCC;
            stk_cc[cc_pos].uv  = uv;
            stk_cc[cc_pos].pos = cc_pos;
        }

        /* reordering is only needed when the run has two or more c.c.'s */
        if (cc_pos) {
            qsort((void*)stk_cc, cc_pos + 1, sizeof(UNF_cc), compare_cc);
        }

        for (i = 0; i <= cc_pos; i++) {
            d = uvuni_to_utf8(d, stk_cc[i].uv);
        }
        if (valid_uvlast) {
            /* the class-0 character that terminated the run */
            d = uvuni_to_utf8(d, uvlast);
        }
    }
    *d = '\0';
    SvCUR_set(dst, d - (U8*)SvPVX(dst));
    Safefree(stk_cc);
    RETVAL = dst;
  OUTPUT:
    RETVAL
329
330
331
SV*
compose(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    composeContiguous = 1
  PREINIT:
    SV *dst, *tmp;
    U8 *s, *p, *e, *d, *t, *tmp_start, curCC, preCC;
    UV uv = 0, uvS, uvComp;
    STRLEN srclen, dstlen, tmplen, retlen;
    bool beginning = TRUE;
  CODE:
    /*
     * Return a new UTF-8 scalar in which each starter of src is combined
     * with the following characters wherever composite_uv() yields a
     * composite that is not a composition exclusion.
     *
     *   ix == 0 (compose): a character may combine with the starter as
     *       long as it is not blocked by an earlier uncomposed character
     *       (preCC <= curCC, and not the same non-zero class as the
     *       previous uncomposed character).
     *   ix == 1 (composeContiguous): a character may combine only while
     *       no uncomposed character has been buffered since the starter.
     *
     * uv is initialised because the assignment "uvS = uv" at the bottom
     * of the outer loop is reached even when the inner loop never ran
     * (for instance, a string consisting of a single starter).
     * Croaks if a zero-length character is decoded; dst is released first
     * (tmp is mortal and needs no explicit release).
     */
    s = (U8*)sv_2pvunicode(src,&srclen);
    e = s + srclen;

    dstlen = srclen + 1;
    dst = newSV(dstlen);
    (void)SvPOK_only(dst);
    SvUTF8_on(dst);
    d = (U8*)SvPVX(dst);

    /* scratch buffer for combining characters that stay uncomposed */
    tmp = sv_2mortal(newSV(dstlen));
    (void)SvPOK_only(tmp);
    SvUTF8_on(tmp);

    for (p = s; p < e;) {
        if (beginning) {
            uvS = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
            if (!retlen) {
                SvREFCNT_dec(dst);
                croak(ErrRetlenIsZero);
            }
            p += retlen;

            if (getCombinClass(uvS)) { /* no Starter found yet */
                d = uvuni_to_utf8(d, uvS);
                continue;
            }
            beginning = FALSE;
        }

        /* uvS now holds a Starter */
        t = tmp_start = (U8*)SvPVX(tmp);
        preCC = 0;

        /* scan up to the next Starter */
        while (p < e) {
            uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
            if (!retlen) {
                SvREFCNT_dec(dst);
                croak(ErrRetlenIsZero);
            }
            p += retlen;

            curCC = getCombinClass(uv);

            if (preCC && preCC == curCC) {
                /* same class as the previous uncomposed one: blocked */
                preCC = curCC;
                t = uvuni_to_utf8(t, uv);
            } else {
                uvComp = composite_uv(uvS, uv);

                if (uvComp && ! isExclusion(uvComp) &&
                    (ix ? (t == tmp_start) : (preCC <= curCC))) {
                    STRLEN leftcur, rightcur, dstcur;
                    leftcur  = UNISKIP(uvComp);
                    rightcur = UNISKIP(uvS) + UNISKIP(uv);

                    /* the composite may need more bytes than its parts */
                    if (leftcur > rightcur) {
                        dstcur = d - (U8*)SvPVX(dst);
                        dstlen += leftcur - rightcur;
                        d = (U8*)SvGROW(dst,dstlen) + dstcur;
                    }
                    /* preCC not changed to curCC */
                    uvS = uvComp;
                } else if (! curCC && p < e) { /* blocked by the next Starter */
                    break;
                } else {
                    preCC = curCC;
                    t = uvuni_to_utf8(t, uv);
                }
            }
        }
        d = uvuni_to_utf8(d, uvS); /* starter (composed or not) */
        tmplen = t - tmp_start;
        if (tmplen) { /* flush the uncomposed combining characters */
            t = (U8*)SvPVX(tmp);
            while (tmplen--)
                *d++ = *t++;
        }
        uvS = uv; /* the Starter that ended the scan, if any */
    } /* for */
    *d = '\0';
    SvCUR_set(dst, d - (U8*)SvPVX(dst));
    RETVAL = dst;
  OUTPUT:
    RETVAL
427
428
void
checkNFD(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    checkNFKD = 1
  PREINIT:
    STRLEN srclen, retlen;
    U8 *s, *e, *p, curCC, preCC;
    U8 *mapping;
    UV uv;
  CODE:
    /*
     * Quick check for NFD (ix == 0) or NFKD (ix == 1).
     * Returns NO as soon as a character is out of canonical order, is a
     * Hangul syllable, or has a decomposition mapping of the selected
     * kind; returns YES when the end of the string is reached.
     */
    s = (U8*)sv_2pvunicode(src,&srclen);
    e = s + srclen;

    p = s;
    preCC = 0;
    while (p < e) {
        uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
        if (retlen == 0)
            croak(ErrRetlenIsZero);
        p += retlen;

        curCC = getCombinClass(uv);
        if (curCC != 0 && curCC < preCC) { /* canonical ordering violated */
            XSRETURN_NO;
        }

        if (Hangul_IsS(uv)) {
            XSRETURN_NO;
        }
        mapping = ix ? dec_compat(uv) : dec_canonical(uv);
        if (mapping) {
            XSRETURN_NO;
        }

        preCC = curCC;
    }
    XSRETURN_YES;
457
458
459
void
checkNFC(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    checkNFKC = 1
  PREINIT:
    STRLEN srclen, retlen;
    U8 *s, *e, *p, curCC, preCC;
    UV uv;
    bool isMAYBE;
  CODE:
    /*
     * Quick check for NFC (ix == 0) or NFKC (ix == 1).
     * Returns NO on a canonical-ordering violation or on a character that
     * can never appear in the composed form, undef ("maybe") when a
     * character that can be the second part of a composition was seen,
     * and YES otherwise.
     */
    s = (U8*)sv_2pvunicode(src,&srclen);
    e = s + srclen;

    p = s;
    preCC = 0;
    isMAYBE = FALSE;
    while (p < e) {
        uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
        if (retlen == 0)
            croak(ErrRetlenIsZero);
        p += retlen;

        curCC = getCombinClass(uv);
        if (curCC != 0 && curCC < preCC) { /* canonical ordering violated */
            XSRETURN_NO;
        }

        /* Hangul syllables are canonical composites: always YES */
        if (! Hangul_IsS(uv)) {
            if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) {
                XSRETURN_NO;
            }
            if (isComp2nd(uv)) {
                isMAYBE = TRUE;
            }
            else if (ix) {
                /* NFKC_NO when having a compatibility mapping that
                 * differs from (or lacks) a canonical one */
                char *canon  = (char *) dec_canonical(uv);
                char *compat = (char *) dec_compat(uv);
                if (compat && (!canon || strNE(canon, compat))) {
                    XSRETURN_NO;
                }
            }
        }

        preCC = curCC;
    }

    if (isMAYBE) {
        XSRETURN_UNDEF;
    }
    XSRETURN_YES;
509
510
511
void
checkFCD(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    checkFCC = 1
  PREINIT:
    STRLEN srclen, retlen, canlen, canret;
    U8 *s, *e, *p, curCC, preCC;
    UV uv, uvLead, uvTrail;
    U8 *sCan, *pCan, *eCan;
    bool isMAYBE;
  CODE:
    /*
     * Quick check for FCD (ix == 0) or FCC (ix == 1).
     * For each character, the combining class of the first character of
     * its canonical decomposition must not be lower than the class of the
     * last character of the previous character's decomposition, unless it
     * is 0.  For FCC the composition tests of checkNFC are applied as
     * well.  Returns NO, undef ("maybe", FCC only) or YES.
     *
     * Decodes of the table strings are checked for a zero length just
     * like decodes of the source string, instead of silently using the
     * returned value.
     */
    s = (U8*)sv_2pvunicode(src,&srclen);
    e = s + srclen;

    preCC = 0;
    isMAYBE = FALSE;
    canlen = 0;
    for (p = s; p < e; p += retlen) {
        uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
        if (!retlen)
            croak(ErrRetlenIsZero);

        sCan = (U8*) dec_canonical(uv);

        if (sCan) {
            canlen = (STRLEN)strlen((char *) sCan);
            uvLead = utf8n_to_uvuni(sCan, canlen, &canret, AllowAnyUTF);
            if (!canret)
                croak(ErrRetlenIsZero);
        }
        else {
            uvLead = uv;
        }

        curCC = getCombinClass(uvLead);

        if (curCC != 0 && curCC < preCC) /* canonical ordering violated */
            XSRETURN_NO;

        if (ix) {
            if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv))
                XSRETURN_NO;
            else if (isComp2nd(uv))
                isMAYBE = TRUE;
        }

        if (sCan) {
            /* class of the last character of the decomposition */
            eCan = sCan + canlen;
            pCan = utf8_hop(eCan, -1);
            if (pCan < sCan)
                croak(ErrHopBeforeStart);
            uvTrail = utf8n_to_uvuni(pCan, eCan - pCan, &canret, AllowAnyUTF);
            if (!canret)
                croak(ErrRetlenIsZero);
            preCC = getCombinClass(uvTrail);
        }
        else {
            preCC = curCC;
        }
    }
    if (isMAYBE)
        XSRETURN_UNDEF;
    else
        XSRETURN_YES;
573
574
575
U8
getCombinClass(uv)
    UV uv
  PROTOTYPE: $
  CODE:
    /* Perl-visible wrapper around the C helper of the same name. */
    RETVAL = getCombinClass(uv);
  OUTPUT:
    RETVAL
580
bool
isExclusion(uv)
    UV uv
  PROTOTYPE: $
  CODE:
    /* Perl-visible wrapper; forwards to the C-level isExclusion(). */
    RETVAL = isExclusion(uv);
  OUTPUT:
    RETVAL
585
bool
isSingleton(uv)
    UV uv
  PROTOTYPE: $
  CODE:
    /* Perl-visible wrapper; forwards to the C-level isSingleton(). */
    RETVAL = isSingleton(uv);
  OUTPUT:
    RETVAL
590
bool
isNonStDecomp(uv)
    UV uv
  PROTOTYPE: $
  CODE:
    /* Perl-visible wrapper; forwards to the C-level isNonStDecomp(). */
    RETVAL = isNonStDecomp(uv);
  OUTPUT:
    RETVAL
595
bool
isComp2nd(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    isNFC_MAYBE = 1
    isNFKC_MAYBE = 2
  CODE:
    /*
     * Perl-visible wrapper; forwards to the C-level isComp2nd().
     * All three names give the same answer: the alias index is not
     * consulted.
     */
    RETVAL = isComp2nd(uv);
  OUTPUT:
    RETVAL
603
604
605
void
isNFD_NO(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    isNFKD_NO = 1
  PREINIT:
    U8 *mapping;
  CODE:
    /*
     * YES when uv cannot occur in NFD (ix == 0) or NFKD (ix == 1): it is
     * a Hangul syllable or has a decomposition mapping of the selected
     * kind.  NO otherwise.
     */
    if (Hangul_IsS(uv)) {
        XSRETURN_YES; /* NFD_NO or NFKD_NO */
    }
    mapping = ix ? dec_compat(uv) : dec_canonical(uv);
    if (mapping) {
        XSRETURN_YES; /* NFD_NO or NFKD_NO */
    }
    XSRETURN_NO;
617
618
619
void
isComp_Ex(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    isNFC_NO = 0
    isNFKC_NO = 1
  CODE:
    /*
     * YES when uv is excluded from composition: it is a composition
     * exclusion, a singleton or a non-starter decomposition.  Under the
     * isNFKC_NO alias (ix == 1) it is also YES when uv has a
     * compatibility mapping that differs from, or exists without, a
     * canonical mapping.  NO otherwise.
     */
    if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) {
        XSRETURN_YES; /* NFC_NO or NFKC_NO */
    }
    if (ix) {
        char *canon  = (char *) dec_canonical(uv);
        char *compat = (char *) dec_compat(uv);
        if (compat && (!canon || strNE(canon, compat))) {
            XSRETURN_YES; /* NFC_NO or NFKC_NO */
        }
    }
    XSRETURN_NO;
641
642
643
SV*
getComposite(uv, uv2)
    UV uv
    UV uv2
  PROTOTYPE: $$
  PREINIT:
    UV composite;
  CODE:
    /*
     * Return the composite of the pair (uv, uv2) as an unsigned integer
     * scalar, or undef when composite_uv() reports no composite (0).
     */
    composite = composite_uv(uv, uv2);
    if (composite)
        RETVAL = newSVuv(composite);
    else
        RETVAL = &PL_sv_undef;
  OUTPUT:
    RETVAL
656
657
658
SV*
getCanon(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    getCompat = 1
  PREINIT:
    U8 * rstr;
  CODE:
    /*
     * Return the canonical (ix == 0) or compatibility (ix == 1)
     * decomposition of uv as a UTF-8 string, or undef when uv has none.
     * Hangul syllables are decomposed arithmetically under either name.
     */
    if (Hangul_IsS(uv)) {
        RETVAL = newSV(1);
        (void)SvPOK_only(RETVAL);
        sv_cat_decompHangul(RETVAL, uv);
    }
    else {
        rstr = ix ? dec_compat(uv) : dec_canonical(uv);
        if (rstr == NULL) {
            XSRETURN_UNDEF;
        }
        RETVAL = newSVpvn((char *)rstr, strlen((char *)rstr));
    }
    SvUTF8_on(RETVAL);
  OUTPUT:
    RETVAL
683
684
void
splitOnLastStarter(src)
    SV * src
  PREINIT:
    SV *head, *tail;
    STRLEN srclen, retlen;
    U8 *s, *e, *p;
    UV uv;
  PPCODE:
    /*
     * Split src in front of its last character of combining class 0 and
     * push two UTF-8 strings: everything before that character, and the
     * rest starting with it.  When no such character exists the first
     * string is empty and the second is the whole of src.
     */
    s = (U8*)sv_2pvunicode(src,&srclen);
    e = s + srclen;

    /* walk backwards one character at a time */
    p = e;
    while (p > s) {
        p = utf8_hop(p, -1);
        if (p < s)
            croak(ErrHopBeforeStart);
        uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
        if (! getCombinClass(uv)) /* Last Starter found */
            break;
    }

    head = sv_2mortal(newSVpvn((char*)s, p - s));
    SvUTF8_on(head);
    XPUSHs(head);

    tail = sv_2mortal(newSVpvn((char*)p, e - p));
    SvUTF8_on(tail);
    XPUSHs(tail);
713
714