1# $Id: encoding.pm,v 2.2 2005/09/08 14:17:17 dankogai Exp dankogai $ 2package encoding; 3our $VERSION = do { my @r = (q$Revision: 2.2 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; 4 5use Encode; 6use strict; 7 8sub DEBUG () { 0 } 9 10BEGIN { 11 if (ord("A") == 193) { 12 require Carp; 13 Carp::croak("encoding: pragma does not support EBCDIC platforms"); 14 } 15} 16 17our $HAS_PERLIO = 0; 18eval { require PerlIO::encoding }; 19unless ($@){ 20 $HAS_PERLIO = (PerlIO::encoding->VERSION >= 0.02); 21} 22 23sub _exception{ 24 my $name = shift; 25 $] > 5.008 and return 0; # 5.8.1 or higher then no 26 my %utfs = map {$_=>1} 27 qw(utf8 UCS-2BE UCS-2LE UTF-16 UTF-16BE UTF-16LE 28 UTF-32 UTF-32BE UTF-32LE); 29 $utfs{$name} or return 0; # UTFs or no 30 require Config; Config->import(); our %Config; 31 return $Config{perl_patchlevel} ? 0 : 1 # maintperl then no 32} 33 34sub in_locale { $^H & ($locale::hint_bits || 0)} 35 36sub _get_locale_encoding { 37 my $locale_encoding; 38 39 # I18N::Langinfo isn't available everywhere 40 eval { 41 require I18N::Langinfo; 42 I18N::Langinfo->import(qw(langinfo CODESET)); 43 $locale_encoding = langinfo(CODESET()); 44 }; 45 46 my $country_language; 47 48 no warnings 'uninitialized'; 49 50 if (not $locale_encoding && in_locale()) { 51 if ($ENV{LC_ALL} =~ /^([^.]+)\.([^.]+)$/) { 52 ($country_language, $locale_encoding) = ($1, $2); 53 } elsif ($ENV{LANG} =~ /^([^.]+)\.([^.]+)$/) { 54 ($country_language, $locale_encoding) = ($1, $2); 55 } 56 # LANGUAGE affects only LC_MESSAGES only on glibc 57 } elsif (not $locale_encoding) { 58 if ($ENV{LC_ALL} =~ /\butf-?8\b/i || 59 $ENV{LANG} =~ /\butf-?8\b/i) { 60 $locale_encoding = 'utf8'; 61 } 62 # Could do more heuristics based on the country and language 63 # parts of LC_ALL and LANG (the parts before the dot (if any)), 64 # since we have Locale::Country and Locale::Language available. 65 # TODO: get a database of Language -> Encoding mappings 66 # (the Estonian database at http://www.eki.ee/letter/ 67 # would be excellent!) --jhi 68 } 69 if (defined $locale_encoding && 70 lc($locale_encoding) eq 'euc' && 71 defined $country_language) { 72 if ($country_language =~ /^ja_JP|japan(?:ese)?$/i) { 73 $locale_encoding = 'euc-jp'; 74 } elsif ($country_language =~ /^ko_KR|korean?$/i) { 75 $locale_encoding = 'euc-kr'; 76 } elsif ($country_language =~ /^zh_CN|chin(?:a|ese)?$/i) { 77 $locale_encoding = 'euc-cn'; 78 } elsif ($country_language =~ /^zh_TW|taiwan(?:ese)?$/i) { 79 $locale_encoding = 'euc-tw'; 80 } else { 81 require Carp; 82 Carp::croak("encoding: Locale encoding '$locale_encoding' too ambiguous"); 83 } 84 } 85 86 return $locale_encoding; 87} 88 89sub import { 90 my $class = shift; 91 my $name = shift; 92 if ($name eq ':_get_locale_encoding') { # used by lib/open.pm 93 my $caller = caller(); 94 { 95 no strict 'refs'; 96 *{"${caller}::_get_locale_encoding"} = \&_get_locale_encoding; 97 } 98 return; 99 } 100 $name = _get_locale_encoding() if $name eq ':locale'; 101 my %arg = @_; 102 $name = $ENV{PERL_ENCODING} unless defined $name; 103 my $enc = find_encoding($name); 104 unless (defined $enc) { 105 require Carp; 106 Carp::croak("encoding: Unknown encoding '$name'"); 107 } 108 $name = $enc->name; # canonize 109 unless ($arg{Filter}) { 110 DEBUG and warn "_exception($name) = ", _exception($name); 111 _exception($name) or ${^ENCODING} = $enc; 112 $HAS_PERLIO or return 1; 113 }else{ 114 defined(${^ENCODING}) and undef ${^ENCODING}; 115 # implicitly 'use utf8' 116 require utf8; # to fetch $utf8::hint_bits; 117 $^H |= $utf8::hint_bits; 118 eval { 119 require Filter::Util::Call ; 120 Filter::Util::Call->import ; 121 filter_add(sub{ 122 my $status = filter_read(); 123 if ($status > 0){ 124 $_ = $enc->decode($_, 1); 125 DEBUG and warn $_; 126 } 127 $status ; 128 }); 129 }; 130 $@ eq '' and DEBUG and warn "Filter installed"; 131 } 132 defined ${^UNICODE} and ${^UNICODE} != 0 and return 1; 133 for my $h (qw(STDIN STDOUT)){ 134 if ($arg{$h}){ 135 unless (defined find_encoding($arg{$h})) { 136 require Carp; 137 Carp::croak("encoding: Unknown encoding for $h, '$arg{$h}'"); 138 } 139 eval { binmode($h, ":raw :encoding($arg{$h})") }; 140 }else{ 141 unless (exists $arg{$h}){ 142 eval { 143 no warnings 'uninitialized'; 144 binmode($h, ":raw :encoding($name)"); 145 }; 146 } 147 } 148 if ($@){ 149 require Carp; 150 Carp::croak($@); 151 } 152 } 153 return 1; # I doubt if we need it, though 154} 155 156sub unimport{ 157 no warnings; 158 undef ${^ENCODING}; 159 if ($HAS_PERLIO){ 160 binmode(STDIN, ":raw"); 161 binmode(STDOUT, ":raw"); 162 }else{ 163 binmode(STDIN); 164 binmode(STDOUT); 165 } 166 if ($INC{"Filter/Util/Call.pm"}){ 167 eval { filter_del() }; 168 } 169} 170 1711; 172__END__ 173 174=pod 175 176=head1 NAME 177 178encoding - allows you to write your script in non-ascii or non-utf8 179 180=head1 SYNOPSIS 181 182 use encoding "greek"; # Perl like Greek to you? 183 use encoding "euc-jp"; # Jperl! 184 185 # or you can even do this if your shell supports your native encoding 186 187 perl -Mencoding=latin2 -e '...' # Feeling centrally European? 188 perl -Mencoding=euc-kr -e '...' # Or Korean? 189 190 # more control 191 192 # A simple euc-cn => utf-8 converter 193 use encoding "euc-cn", STDOUT => "utf8"; while(<>){print}; 194 195 # "no encoding;" supported (but not scoped!) 196 no encoding; 197 198 # an alternate way, Filter 199 use encoding "euc-jp", Filter=>1; 200 # now you can use kanji identifiers -- in euc-jp! 201 202 # switch on locale - 203 # note that this probably means that unless you have a complete control 204 # over the environments the application is ever going to be run, you should 205 # NOT use the feature of encoding pragma allowing you to write your script 206 # in any recognized encoding because changing locale settings will wreck 207 # the script; you can of course still use the other features of the pragma. 208 use encoding ':locale'; 209 210=head1 ABSTRACT 211 212Let's start with a bit of history: Perl 5.6.0 introduced Unicode 213support. You could apply C<substr()> and regexes even to complex CJK 214characters -- so long as the script was written in UTF-8. But back 215then, text editors that supported UTF-8 were still rare and many users 216instead chose to write scripts in legacy encodings, giving up a whole 217new feature of Perl 5.6. 218 219Rewind to the future: starting from perl 5.8.0 with the B<encoding> 220pragma, you can write your script in any encoding you like (so long 221as the C<Encode> module supports it) and still enjoy Unicode support. 222This pragma achieves that by doing the following: 223 224=over 225 226=item * 227 228Internally converts all literals (C<q//,qq//,qr//,qw///, qx//>) from 229the encoding specified to utf8. In Perl 5.8.1 and later, literals in 230C<tr///> and C<DATA> pseudo-filehandle are also converted. 231 232=item * 233 234Changing PerlIO layers of C<STDIN> and C<STDOUT> to the encoding 235 specified. 236 237=back 238 239=head2 Literal Conversions 240 241You can write code in EUC-JP as follows: 242 243 my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji 244 #<-char-><-char-> # 4 octets 245 s/\bCamel\b/$Rakuda/; 246 247And with C<use encoding "euc-jp"> in effect, it is the same thing as 248the code in UTF-8: 249 250 my $Rakuda = "\x{99F1}\x{99DD}"; # two Unicode Characters 251 s/\bCamel\b/$Rakuda/; 252 253=head2 PerlIO layers for C<STD(IN|OUT)> 254 255The B<encoding> pragma also modifies the filehandle layers of 256STDIN and STDOUT to the specified encoding. Therefore, 257 258 use encoding "euc-jp"; 259 my $message = "Camel is the symbol of perl.\n"; 260 my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji 261 $message =~ s/\bCamel\b/$Rakuda/; 262 print $message; 263 264Will print "\xF1\xD1\xF1\xCC is the symbol of perl.\n", 265not "\x{99F1}\x{99DD} is the symbol of perl.\n". 266 267You can override this by giving extra arguments; see below. 268 269=head2 Implicit upgrading for byte strings 270 271By default, if strings operating under byte semantics and strings 272with Unicode character data are concatenated, the new string will 273be created by decoding the byte strings as I<ISO 8859-1 (Latin-1)>. 274 275The B<encoding> pragma changes this to use the specified encoding 276instead. For example: 277 278 use encoding 'utf8'; 279 my $string = chr(20000); # a Unicode string 280 utf8::encode($string); # now it's a UTF-8 encoded byte string 281 # concatenate with another Unicode string 282 print length($string . chr(20000)); 283 284Will print C<2>, because C<$string> is upgraded as UTF-8. Without 285C<use encoding 'utf8';>, it will print C<4> instead, since C<$string> 286is three octets when interpreted as Latin-1. 287 288=head1 FEATURES THAT REQUIRE 5.8.1 289 290Some of the features offered by this pragma requires perl 5.8.1. Most 291of these are done by Inaba Hiroto. Any other features and changes 292are good for 5.8.0. 293 294=over 295 296=item "NON-EUC" doublebyte encodings 297 298Because perl needs to parse script before applying this pragma, such 299encodings as Shift_JIS and Big-5 that may contain '\' (BACKSLASH; 300\x5c) in the second byte fails because the second byte may 301accidentally escape the quoting character that follows. Perl 5.8.1 302or later fixes this problem. 303 304=item tr// 305 306C<tr//> was overlooked by Perl 5 porters when they released perl 5.8.0 307See the section below for details. 308 309=item DATA pseudo-filehandle 310 311Another feature that was overlooked was C<DATA>. 312 313=back 314 315=head1 USAGE 316 317=over 4 318 319=item use encoding [I<ENCNAME>] ; 320 321Sets the script encoding to I<ENCNAME>. And unless ${^UNICODE} 322exists and non-zero, PerlIO layers of STDIN and STDOUT are set to 323":encoding(I<ENCNAME>)". 324 325Note that STDERR WILL NOT be changed. 326 327Also note that non-STD file handles remain unaffected. Use C<use 328open> or C<binmode> to change layers of those. 329 330If no encoding is specified, the environment variable L<PERL_ENCODING> 331is consulted. If no encoding can be found, the error C<Unknown encoding 332'I<ENCNAME>'> will be thrown. 333 334=item use encoding I<ENCNAME> [ STDIN =E<gt> I<ENCNAME_IN> ...] ; 335 336You can also individually set encodings of STDIN and STDOUT via the 337C<< STDIN => I<ENCNAME> >> form. In this case, you cannot omit the 338first I<ENCNAME>. C<< STDIN => undef >> turns the IO transcoding 339completely off. 340 341When ${^UNICODE} exists and non-zero, these options will completely 342ignored. ${^UNICODE} is a variable introduced in perl 5.8.1. See 343L<perlrun> see L<perlvar/"${^UNICODE}"> and L<perlrun/"-C"> for 344details (perl 5.8.1 and later). 345 346=item use encoding I<ENCNAME> Filter=E<gt>1; 347 348This turns the encoding pragma into a source filter. While the 349default approach just decodes interpolated literals (in qq() and 350qr()), this will apply a source filter to the entire source code. See 351L</"The Filter Option"> below for details. 352 353=item no encoding; 354 355Unsets the script encoding. The layers of STDIN, STDOUT are 356reset to ":raw" (the default unprocessed raw stream of bytes). 357 358=back 359 360=head1 The Filter Option 361 362The magic of C<use encoding> is not applied to the names of 363identifiers. In order to make C<${"\x{4eba}"}++> ($human++, where human 364is a single Han ideograph) work, you still need to write your script 365in UTF-8 -- or use a source filter. That's what 'Filter=>1' does. 366 367What does this mean? Your source code behaves as if it is written in 368UTF-8 with 'use utf8' in effect. So even if your editor only supports 369Shift_JIS, for example, you can still try examples in Chapter 15 of 370C<Programming Perl, 3rd Ed.>. For instance, you can use UTF-8 371identifiers. 372 373This option is significantly slower and (as of this writing) non-ASCII 374identifiers are not very stable WITHOUT this option and with the 375source code written in UTF-8. 376 377=head2 Filter-related changes at Encode version 1.87 378 379=over 380 381=item * 382 383The Filter option now sets STDIN and STDOUT like non-filter options. 384And C<< STDIN=>I<ENCODING> >> and C<< STDOUT=>I<ENCODING> >> work like 385non-filter version. 386 387=item * 388 389C<use utf8> is implicitly declared so you no longer have to C<use 390utf8> to C<${"\x{4eba}"}++>. 391 392=back 393 394=head1 CAVEATS 395 396=head2 NOT SCOPED 397 398The pragma is a per script, not a per block lexical. Only the last 399C<use encoding> or C<no encoding> matters, and it affects 400B<the whole script>. However, the <no encoding> pragma is supported and 401B<use encoding> can appear as many times as you want in a given script. 402The multiple use of this pragma is discouraged. 403 404By the same reason, the use this pragma inside modules is also 405discouraged (though not as strongly discouraged as the case above. 406See below). 407 408If you still have to write a module with this pragma, be very careful 409of the load order. See the codes below; 410 411 # called module 412 package Module_IN_BAR; 413 use encoding "bar"; 414 # stuff in "bar" encoding here 415 1; 416 417 # caller script 418 use encoding "foo" 419 use Module_IN_BAR; 420 # surprise! use encoding "bar" is in effect. 421 422The best way to avoid this oddity is to use this pragma RIGHT AFTER 423other modules are loaded. i.e. 424 425 use Module_IN_BAR; 426 use encoding "foo"; 427 428=head2 DO NOT MIX MULTIPLE ENCODINGS 429 430Notice that only literals (string or regular expression) having only 431legacy code points are affected: if you mix data like this 432 433 \xDF\x{100} 434 435the data is assumed to be in (Latin 1 and) Unicode, not in your native 436encoding. In other words, this will match in "greek": 437 438 "\xDF" =~ /\x{3af}/ 439 440but this will not 441 442 "\xDF\x{100}" =~ /\x{3af}\x{100}/ 443 444since the C<\xDF> (ISO 8859-7 GREEK SMALL LETTER IOTA WITH TONOS) on 445the left will B<not> be upgraded to C<\x{3af}> (Unicode GREEK SMALL 446LETTER IOTA WITH TONOS) because of the C<\x{100}> on the left. You 447should not be mixing your legacy data and Unicode in the same string. 448 449This pragma also affects encoding of the 0x80..0xFF code point range: 450normally characters in that range are left as eight-bit bytes (unless 451they are combined with characters with code points 0x100 or larger, 452in which case all characters need to become UTF-8 encoded), but if 453the C<encoding> pragma is present, even the 0x80..0xFF range always 454gets UTF-8 encoded. 455 456After all, the best thing about this pragma is that you don't have to 457resort to \x{....} just to spell your name in a native encoding. 458So feel free to put your strings in your encoding in quotes and 459regexes. 460 461=head2 tr/// with ranges 462 463The B<encoding> pragma works by decoding string literals in 464C<q//,qq//,qr//,qw///, qx//> and so forth. In perl 5.8.0, this 465does not apply to C<tr///>. Therefore, 466 467 use encoding 'euc-jp'; 468 #.... 469 $kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/; 470 # -------- -------- -------- -------- 471 472Does not work as 473 474 $kana =~ tr/\x{3041}-\x{3093}/\x{30a1}-\x{30f3}/; 475 476=over 477 478=item Legend of characters above 479 480 utf8 euc-jp charnames::viacode() 481 ----------------------------------------- 482 \x{3041} \xA4\xA1 HIRAGANA LETTER SMALL A 483 \x{3093} \xA4\xF3 HIRAGANA LETTER N 484 \x{30a1} \xA5\xA1 KATAKANA LETTER SMALL A 485 \x{30f3} \xA5\xF3 KATAKANA LETTER N 486 487=back 488 489This counterintuitive behavior has been fixed in perl 5.8.1. 490 491=head3 workaround to tr///; 492 493In perl 5.8.0, you can work around as follows; 494 495 use encoding 'euc-jp'; 496 # .... 497 eval qq{ \$kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/ }; 498 499Note the C<tr//> expression is surrounded by C<qq{}>. The idea behind 500is the same as classic idiom that makes C<tr///> 'interpolate'. 501 502 tr/$from/$to/; # wrong! 503 eval qq{ tr/$from/$to/ }; # workaround. 504 505Nevertheless, in case of B<encoding> pragma even C<q//> is affected so 506C<tr///> not being decoded was obviously against the will of Perl5 507Porters so it has been fixed in Perl 5.8.1 or later. 508 509=head1 EXAMPLE - Greekperl 510 511 use encoding "iso 8859-7"; 512 513 # \xDF in ISO 8859-7 (Greek) is \x{3af} in Unicode. 514 515 $a = "\xDF"; 516 $b = "\x{100}"; 517 518 printf "%#x\n", ord($a); # will print 0x3af, not 0xdf 519 520 $c = $a . $b; 521 522 # $c will be "\x{3af}\x{100}", not "\x{df}\x{100}". 523 524 # chr() is affected, and ... 525 526 print "mega\n" if ord(chr(0xdf)) == 0x3af; 527 528 # ... ord() is affected by the encoding pragma ... 529 530 print "tera\n" if ord(pack("C", 0xdf)) == 0x3af; 531 532 # ... as are eq and cmp ... 533 534 print "peta\n" if "\x{3af}" eq pack("C", 0xdf); 535 print "exa\n" if "\x{3af}" cmp pack("C", 0xdf) == 0; 536 537 # ... but pack/unpack C are not affected, in case you still 538 # want to go back to your native encoding 539 540 print "zetta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf; 541 542=head1 KNOWN PROBLEMS 543 544=over 545 546=item literals in regex that are longer than 127 bytes 547 548For native multibyte encodings (either fixed or variable length), 549the current implementation of the regular expressions may introduce 550recoding errors for regular expression literals longer than 127 bytes. 551 552=item EBCDIC 553 554The encoding pragma is not supported on EBCDIC platforms. 555(Porters who are willing and able to remove this limitation are 556welcome.) 557 558=item format 559 560This pragma doesn't work well with format because PerlIO does not 561get along very well with it. When format contains non-ascii 562characters it prints funny or gets "wide character warnings". 563To understand it, try the code below. 564 565 # Save this one in utf8 566 # replace *non-ascii* with a non-ascii string 567 my $camel; 568 format STDOUT = 569 *non-ascii*@>>>>>>> 570 $camel 571 . 572 $camel = "*non-ascii*"; 573 binmode(STDOUT=>':encoding(utf8)'); # bang! 574 write; # funny 575 print $camel, "\n"; # fine 576 577Without binmode this happens to work but without binmode, print() 578fails instead of write(). 579 580At any rate, the very use of format is questionable when it comes to 581unicode characters since you have to consider such things as character 582width (i.e. double-width for ideographs) and directions (i.e. BIDI for 583Arabic and Hebrew). 584 585=back 586 587=head2 The Logic of :locale 588 589The logic of C<:locale> is as follows: 590 591=over 4 592 593=item 1. 594 595If the platform supports the langinfo(CODESET) interface, the codeset 596returned is used as the default encoding for the open pragma. 597 598=item 2. 599 600If 1. didn't work but we are under the locale pragma, the environment 601variables LC_ALL and LANG (in that order) are matched for encodings 602(the part after C<.>, if any), and if any found, that is used 603as the default encoding for the open pragma. 604 605=item 3. 606 607If 1. and 2. didn't work, the environment variables LC_ALL and LANG 608(in that order) are matched for anything looking like UTF-8, and if 609any found, C<:utf8> is used as the default encoding for the open 610pragma. 611 612=back 613 614If your locale environment variables (LC_ALL, LC_CTYPE, LANG) 615contain the strings 'UTF-8' or 'UTF8' (case-insensitive matching), 616the default encoding of your STDIN, STDOUT, and STDERR, and of 617B<any subsequent file open>, is UTF-8. 618 619=head1 HISTORY 620 621This pragma first appeared in Perl 5.8.0. For features that require 6225.8.1 and better, see above. 623 624The C<:locale> subpragma was implemented in 2.01, or Perl 5.8.6. 625 626=head1 SEE ALSO 627 628L<perlunicode>, L<Encode>, L<open>, L<Filter::Util::Call>, 629 630Ch. 15 of C<Programming Perl (3rd Edition)> 631by Larry Wall, Tom Christiansen, Jon Orwant; 632O'Reilly & Associates; ISBN 0-596-00027-8 633 634=cut 635