package Unicode::Normalize;

# Refuse to load unless pack('U', 0x41) yields the one-character string "A".
BEGIN {
    if (pack('U', 0x41) ne "A") {
        die "Unicode::Normalize cannot stringify a Unicode code point\n";
    }
}

use 5.006;
use strict;
use warnings;
use Carp;

no warnings 'utf8';

our $VERSION = '0.32';
our $PACKAGE = __PACKAGE__;

require Exporter;
require DynaLoader;

our @ISA = qw(Exporter DynaLoader);

# Exported by default.
our @EXPORT = qw(NFC NFD NFKC NFKD);

# Exported on request only.
our @EXPORT_OK = (
    qw(normalize decompose reorder compose),
    qw(checkNFD checkNFKD checkNFC checkNFKC check),
    qw(getCanon getCompat getComposite getCombinClass),
    qw(isExclusion isSingleton isNonStDecomp isComp2nd isComp_Ex),
    qw(isNFD_NO isNFC_NO isNFC_MAYBE isNFKD_NO isNFKC_NO isNFKC_MAYBE),
    qw(FCD checkFCD FCC checkFCC composeContiguous),
    qw(splitOnLastStarter),
);

our %EXPORT_TAGS = (
    all       => [ @EXPORT, @EXPORT_OK ],
    normalize => [ @EXPORT, qw(normalize decompose reorder compose) ],
    check     => [ qw(checkNFD checkNFKD checkNFC checkNFKC check) ],
    fast      => [ qw(FCD checkFCD FCC checkFCC composeContiguous) ],
);

######

# Load the compiled part of the module via DynaLoader (inherited through @ISA).
Unicode::Normalize->bootstrap($VERSION);

######

# Builds a string from a list of code points.
sub pack_U {
    return pack 'U*', @_;
}

# Returns the list of code points of the string given as first argument.
# The argument is appended to the result of an empty pack('U*') before
# being unpacked.
# NOTE(review): presumably this forces the argument into Perl's Unicode
# string representation -- confirm against perlfunc (pack/unpack).
sub unpack_U {
    my $empty = pack('U*');
    return unpack('U*', $empty . shift);
}


##
## normalization forms
##

# Second argument for decompose(): request the compatibility mapping.
use constant COMPAT => 1;

# NFD/NFKD: decompose (canonical or compatibility), then reorder.
# NFC/NFKC: the same, followed by compose().
sub NFD  ($) { return reorder(decompose($_[0])) }
sub NFKD ($) { return reorder(decompose($_[0], COMPAT)) }
sub NFC  ($) { return compose(reorder(decompose($_[0]))) }
sub NFKC ($) { return compose(reorder(decompose($_[0], COMPAT))) }

# FCD: a string for which checkFCD() is true is returned as is;
# any other string is converted with NFD().
sub FCD ($) {
    my ($str) = @_;
    return $str if checkFCD($str);
    return NFD($str);
}

# FCC: canonical decomposition, reordering, then composeContiguous().
sub FCC ($) { return composeContiguous(reorder(decompose($_[0]))) }

# Form name => normalizing function, as accepted by normalize().
our %formNorm = (
    NFC  => \&NFC,
    C    => \&NFC,
    NFD  => \&NFD,
    D    => \&NFD,
    NFKC => \&NFKC,
    KC   => \&NFKC,
    NFKD => \&NFKD,
    KD   => \&NFKD,
    FCD  => \&FCD,
    FCC  => \&FCC,
);

# normalize($form, $str): normalizes $str with the function registered
# for $form in %formNorm; croaks when $form is not a known form name.
sub normalize($$)
{
    my ($form, $str) = @_;

    croak $PACKAGE."::normalize: invalid form name: $form"
        unless exists $formNorm{$form};

    return $formNorm{$form}->($str);
}


##
## quick check
##

# Form name => quick-check function, as accepted by check().
our %formCheck = (
    NFC  => \&checkNFC,
    C    => \&checkNFC,
    NFD  => \&checkNFD,
    D    => \&checkNFD,
    NFKC => \&checkNFKC,
    KC   => \&checkNFKC,
    NFKD => \&checkNFKD,
    KD   => \&checkNFKD,
    FCD  => \&checkFCD,
    FCC  => \&checkFCC,
);

# check($form, $str): runs the quick check registered for $form in
# %formCheck on $str; croaks when $form is not a known form name.
sub check($$)
{
    my ($form, $str) = @_;

    croak $PACKAGE."::check: invalid form name: $form"
        unless exists $formCheck{$form};

    return $formCheck{$form}->($str);
}

1;
__END__

=head1 NAME

Unicode::Normalize - Unicode Normalization Forms

=head1 SYNOPSIS

(1) using function names exported by default:

  use Unicode::Normalize;

  $NFD_string  = NFD($string);  # Normalization Form D
  $NFC_string  = NFC($string);  # Normalization Form C
  $NFKD_string = NFKD($string); # Normalization Form KD
  $NFKC_string = NFKC($string); # Normalization Form KC

(2) using function names exported on request:

  use Unicode::Normalize 'normalize';

  $NFD_string  = normalize('D',  $string);  # Normalization Form D
  $NFC_string  = normalize('C',  $string);  # Normalization Form C
  $NFKD_string = normalize('KD', $string);  # Normalization Form KD
  $NFKC_string = normalize('KC', $string);  # Normalization Form KC

=head1 DESCRIPTION

Parameters:

C<$string> is used as a string under character semantics
(see F<perlunicode>).

C<$codepoint> should be an unsigned integer
representing a Unicode code point.

Note: Between XSUB and pure Perl, there is an incompatibility
about the interpretation of C<$codepoint> as a decimal number.
XSUB converts C<$codepoint> to an unsigned integer, but pure Perl does not.
Do not use a floating point nor a negative sign in C<$codepoint>.

=head2 Normalization Forms

=over 4

=item C<$NFD_string = NFD($string)>

returns the Normalization Form D (formed by canonical decomposition).

=item C<$NFC_string = NFC($string)>

returns the Normalization Form C (formed by canonical decomposition
followed by canonical composition).

=item C<$NFKD_string = NFKD($string)>

returns the Normalization Form KD (formed by compatibility decomposition).

=item C<$NFKC_string = NFKC($string)>

returns the Normalization Form KC (formed by compatibility decomposition
followed by B<canonical> composition).

=item C<$FCD_string = FCD($string)>

If the given string is in FCD ("Fast C or D" form; cf. UTN #5),
returns it without modification; otherwise returns an FCD string.

Note: FCD is not always unique, so several different forms may be
equivalent to each other. C<FCD()> will return one of these equivalent forms.

=item C<$FCC_string = FCC($string)>

returns the FCC form ("Fast C Contiguous"; cf. UTN #5).

Note: FCC is unique, as are the four normalization forms (NF*).

=item C<$normalized_string = normalize($form_name, $string)>

As C<$form_name>, one of the following names must be given.

  'C'  or 'NFC'  for Normalization Form C  (UAX #15)
  'D'  or 'NFD'  for Normalization Form D  (UAX #15)
  'KC' or 'NFKC' for Normalization Form KC (UAX #15)
  'KD' or 'NFKD' for Normalization Form KD (UAX #15)

  'FCD'          for "Fast C or D" Form  (UTN #5)
  'FCC'          for "Fast C Contiguous" (UTN #5)

=back

=head2 Decomposition and Composition

=over 4

=item C<$decomposed_string = decompose($string)>

=item C<$decomposed_string = decompose($string, $useCompatMapping)>

Decomposes the specified string and returns the result.

If the second parameter (a boolean) is omitted or false, decomposes it
using the Canonical Decomposition Mapping.
If true, decomposes it using the Compatibility Decomposition Mapping.

The string returned is not always in NFD/NFKD.
Reordering may be required.

    $NFD_string  = reorder(decompose($string));       # eq. to NFD()
    $NFKD_string = reorder(decompose($string, TRUE)); # eq. to NFKD()

=item C<$reordered_string = reorder($string)>

Reorders the combining characters and the like in the canonical ordering
and returns the result.

E.g., when you have a list of NFD/NFKD strings,
you can get the concatenated NFD/NFKD string from them by writing

    $concat_NFD  = reorder(join '', @NFD_strings);
    $concat_NFKD = reorder(join '', @NFKD_strings);

=item C<$composed_string = compose($string)>

Returns the string where composable pairs are composed.

E.g., when you have an NFD/NFKD string,
you can get its NFC/NFKC string by writing

    $NFC_string  = compose($NFD_string);
    $NFKC_string = compose($NFKD_string);

=back

=head2 Quick Check

(see Annex 8, UAX #15; and F<DerivedNormalizationProps.txt>)

The following functions check whether the string is in the given
normalization form.

The result returned will be:

    YES     The string is in that normalization form.
    NO      The string is not in that normalization form.
    MAYBE   Dubious. Maybe yes, maybe no.

=over 4

=item C<$result = checkNFD($string)>

returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>.

=item C<$result = checkNFC($string)>

returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
C<undef> if C<MAYBE>.

=item C<$result = checkNFKD($string)>

returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>.

=item C<$result = checkNFKC($string)>

returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
C<undef> if C<MAYBE>.

=item C<$result = checkFCD($string)>

returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>.

=item C<$result = checkFCC($string)>

returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
C<undef> if C<MAYBE>.

If a string is not in FCD, it cannot be in FCC.
So C<checkFCC($not_FCD_string)> should return C<NO>.

=item C<$result = check($form_name, $string)>

returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
C<undef> if C<MAYBE>.

As C<$form_name>, one of the following names must be given.

  'C'  or 'NFC'  for Normalization Form C  (UAX #15)
  'D'  or 'NFD'  for Normalization Form D  (UAX #15)
  'KC' or 'NFKC' for Normalization Form KC (UAX #15)
  'KD' or 'NFKD' for Normalization Form KD (UAX #15)

  'FCD'          for "Fast C or D" Form  (UTN #5)
  'FCC'          for "Fast C Contiguous" (UTN #5)

=back

B<Note>

In the cases of NFD, NFKD, and FCD, the answer must be
either C<YES> or C<NO>. The answer C<MAYBE> may be returned
in the cases of NFC, NFKC, and FCC.

A C<MAYBE> string should contain at least one combining character
or the like. For example, C<COMBINING ACUTE ACCENT> has
the MAYBE_NFC/MAYBE_NFKC property.

Both C<checkNFC("A\N{COMBINING ACUTE ACCENT}")>
and C<checkNFC("B\N{COMBINING ACUTE ACCENT}")> will return C<MAYBE>.
C<"A\N{COMBINING ACUTE ACCENT}"> is not in NFC
(its NFC is C<"\N{LATIN CAPITAL LETTER A WITH ACUTE}">),
while C<"B\N{COMBINING ACUTE ACCENT}"> is in NFC.

If you want to check exactly, compare the string with its NFC/NFKC/FCC.

    if ($string eq NFC($string)) {
        # $string is exactly normalized in NFC;
    } else {
        # $string is not normalized in NFC;
    }

    if ($string eq NFKC($string)) {
        # $string is exactly normalized in NFKC;
    } else {
        # $string is not normalized in NFKC;
    }

=head2 Character Data

These functions are an interface to the character data used internally.
If you only want to get Unicode normalization forms, you don't need
to call them yourself.

=over 4

=item C<$canonical_decomposed = getCanon($codepoint)>

If the character of the specified codepoint is canonically
decomposable (including Hangul Syllables),
returns the B<completely decomposed> string canonically equivalent to it.

If it is not decomposable, returns C<undef>.

=item C<$compatibility_decomposed = getCompat($codepoint)>

If the character of the specified codepoint is compatibility
decomposable (including Hangul Syllables),
returns the B<completely decomposed> string compatibility equivalent to it.

If it is not decomposable, returns C<undef>.

=item C<$codepoint_composite = getComposite($codepoint_here, $codepoint_next)>

If two characters here and next (as codepoints) are composable
(including Hangul Jamo/Syllables and Composition Exclusions),
returns the codepoint of the composite.

If they are not composable, returns C<undef>.

=item C<$combining_class = getCombinClass($codepoint)>

Returns the combining class of the character as an integer.

=item C<$is_exclusion = isExclusion($codepoint)>

Returns a boolean indicating whether the character of the specified
codepoint is a composition exclusion.

=item C<$is_singleton = isSingleton($codepoint)>

Returns a boolean indicating whether the character of the specified
codepoint is a singleton.

=item C<$is_non_starter_decomposition = isNonStDecomp($codepoint)>

Returns a boolean indicating whether the canonical decomposition
of the character of the specified codepoint
is a Non-Starter Decomposition.

=item C<$may_be_composed_with_prev_char = isComp2nd($codepoint)>

Returns a boolean indicating whether the character of the specified
codepoint may be composed with the previous one in a certain composition
(including Hangul Compositions, but excluding
Composition Exclusions and Non-Starter Decompositions).

=back

=head1 EXPORT

C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default.

C<normalize> and some other functions: on request.

=head1 CAVEATS

=over 4

=item Perl's version vs. Unicode version

Since this module refers to perl core's Unicode database in the directory
F</lib/unicore> (or formerly F</lib/unicode>), the Unicode version of
normalization implemented by this module depends on your perl's version.

    perl's version        implemented Unicode version
       5.6.1              3.0.1
       5.7.2              3.1.0
       5.7.3              3.1.1 (same normalized form as that of 3.1.0)
       5.8.0              3.2.0
     5.8.1-5.8.3          4.0.0
     5.8.4-5.8.6 (latest) 4.0.1 (same normalized form as that of 4.0.0)

=item Correction of decomposition mapping

In older Unicode versions, a small number of characters (all of which are
CJK compatibility ideographs as far as they have been found) may have
an erroneous decomposition mapping (see F<NormalizationCorrections.txt>).
Anyhow, this module will neither refer to F<NormalizationCorrections.txt>
nor provide any specific version of normalization. Therefore this module
running on an older perl with an older Unicode database may use
the erroneous decomposition mapping blindly conforming to the Unicode database.

=item Revised definition of canonical composition

In Unicode 4.1.0, the definition D2 of canonical composition (which
affects NFC and NFKC) has been changed (see Public Review Issue #29
and recent UAX #15). This module has used the newer definition
since the version 0.07 (Oct 31, 2001).
This module does not support normalization according to the older
definition, even if the Unicode version implemented by perl is
lower than 4.1.0.

=back

=head1 AUTHOR

SADAHIRO Tomoyuki <SADAHIRO@cpan.org>

Copyright(C) 2001-2005, SADAHIRO Tomoyuki. Japan. All rights reserved.

This module is free software; you can redistribute it
and/or modify it under the same terms as Perl itself.

=head1 SEE ALSO

=over 4

=item http://www.unicode.org/reports/tr15/

Unicode Normalization Forms - UAX #15

=item http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt

Derived Normalization Properties

=item http://www.unicode.org/Public/UNIDATA/NormalizationCorrections.txt

Normalization Corrections

=item http://www.unicode.org/review/pr-29.html

Public Review Issue #29: Normalization Issue

=item http://www.unicode.org/notes/tn5/

Canonical Equivalence in Applications - UTN #5

=back

=cut