#
# $Id: Encode.pm,v 2.12 2005/09/08 14:17:17 dankogai Exp dankogai $
#
package Encode;
use strict;
our $VERSION = sprintf "%d.%02d", q$Revision: 2.12 $ =~ /(\d+)/g;
sub DEBUG () { 0 }    # constant switch guarding the diagnostic warn() calls
use XSLoader ();
XSLoader::load(__PACKAGE__, $VERSION);

require Exporter;
use base qw/Exporter/;

# Public, encouraged API is exported by default

our @EXPORT = qw(
  decode  decode_utf8  encode  encode_utf8
  encodings  find_encoding clone_encoding
);

our @FB_FLAGS  = qw(DIE_ON_ERR WARN_ON_ERR RETURN_ON_ERR LEAVE_SRC
                    PERLQQ HTMLCREF XMLCREF STOP_AT_PARTIAL);
our @FB_CONSTS = qw(FB_DEFAULT FB_CROAK FB_QUIET FB_WARN
                    FB_PERLQQ FB_HTMLCREF FB_XMLCREF);

our @EXPORT_OK =
    (
     qw(
       _utf8_off _utf8_on define_encoding from_to is_16bit is_8bit
       is_utf8 perlio_ok resolve_alias utf8_downgrade utf8_upgrade
      ),
     @FB_FLAGS, @FB_CONSTS,
    );

our %EXPORT_TAGS =
    (
     all          =>  [ @EXPORT, @EXPORT_OK ],
     fallbacks    =>  [ @FB_CONSTS ],
     fallback_all =>  [ @FB_CONSTS, @FB_FLAGS ],
    );

# Documentation moved after __END__ for speed - NI-S

our $ON_EBCDIC = (ord("A") == 193);

use Encode::Alias;

# Make a %Encoding package variable to allow a certain amount of cheating
our %Encoding;     # encoding name => encoding object (filled by define_encoding)
our %ExtModule;    # encoding name => module that provides it when loaded
require Encode::Config;
eval { require Encode::ConfigLocal };    # optional; failure is ignored

#
# Encode->encodings([":all" | MODULE, ...])
#
# Class method returning a case-insensitively sorted list of encoding names.
#   - ":all"      : names from both %Encoding and %ExtModule.
#   - MODULE list : names in %Encoding plus every %ExtModule entry whose
#                   module equals one of the given names ("Encode::" is
#                   prepended to names that contain no "::").
#   - no argument : names in %Encoding only.
# The names Internal, Unicode and Guess are always filtered out.
#
sub encodings
{
    my $class = shift;
    my %enc;
    if (@_ and $_[0] eq ":all"){
        %enc = ( %Encoding, %ExtModule );
    }else{
        %enc = %Encoding;
        for my $mod (map {m/::/o ? $_ : "Encode::$_" } @_){
            DEBUG and warn $mod;
            for my $enc (keys %ExtModule){
                $ExtModule{$enc} eq $mod and $enc{$enc} = $mod;
            }
        }
    }
    return
        sort { lc $a cmp lc $b }
             grep {!/^(?:Internal|Unicode|Guess)$/o} keys %enc;
}

#
# perlio_ok(ENCODING)
#
# ENCODING is either an encoding object (any reference) or a name that is
# resolved with find_encoding().  Returns the result of the object's own
# perlio_ok() method when it has one, 0 otherwise.  An unknown encoding
# name also yields 0 instead of dying on a method call on undef.
#
sub perlio_ok{
    my $obj = ref($_[0]) ? $_[0] : find_encoding($_[0]);
    defined $obj or return 0;    # name could not be resolved
    $obj->can("perlio_ok") and return $obj->perlio_ok();
    return 0; # safety net
}

#
# define_encoding(OBJ, NAME [, ALIAS ...])
#
# Registers OBJ in %Encoding under NAME.  When NAME is not already all
# lowercase its lowercased form is registered as an alias, as is every
# extra ALIAS argument (define_alias is presumably the function imported
# from Encode::Alias above).  Returns OBJ.
#
sub define_encoding
{
    my $obj  = shift;
    my $name = shift;
    $Encoding{$name} = $obj;
    my $lc = lc($name);
    define_alias($lc => $obj) unless $lc eq $name;
    while (@_){
        my $alias = shift;
        define_alias($alias, $obj);
    }
    return $obj;
}

#
# CLASS->getEncoding(NAME [, SKIP_EXTERNAL])
#
# Resolves NAME to an encoding object.  Lookup order:
#   1. NAME is already an object that can('renew')  -> returned as is
#   2. %Encoding under NAME, then under lc(NAME)
#   3. CLASS->find_alias(NAME), then CLASS->find_alias(lc NAME)
#   4. unless SKIP_EXTERNAL is true: require the module listed in
#      %ExtModule for NAME or lc(NAME) and look in %Encoding again
# Returns nothing (undef / empty list) when NAME cannot be resolved.
#
sub getEncoding
{
    my ($class, $name, $skip_external) = @_;

    ref($name) && $name->can('renew') and return $name;
    exists $Encoding{$name} and return $Encoding{$name};
    my $lc = lc $name;
    exists $Encoding{$lc} and return $Encoding{$lc};

    my $oc = $class->find_alias($name);
    defined($oc) and return $oc;
    $lc ne $name and $oc = $class->find_alias($lc);
    defined($oc) and return $oc;

    unless ($skip_external)
    {
        if (my $mod = $ExtModule{$name} || $ExtModule{$lc}){
            $mod =~ s,::,/,g ; $mod .= '.pm';
            eval{ require $mod; };    # a load failure simply means "not found"
            exists $Encoding{$name} and return $Encoding{$name};
            # The module may have been selected through the lowercased key,
            # so the freshly registered encoding can live under that key too.
            exists $Encoding{$lc} and return $Encoding{$lc};
        }
    }
    return;
}

#
# find_encoding(NAME [, SKIP_EXTERNAL])
#
# Function-style wrapper around Encode->getEncoding().
#
sub find_encoding($;$)
{
    my ($name, $skip_external) = @_;
    return __PACKAGE__->getEncoding($name,$skip_external);
}

#
# resolve_alias(NAME)
#
# Returns the ->name of the encoding NAME resolves to, or nothing when
# NAME is unknown.
#
sub resolve_alias($){
    my $obj = find_encoding(shift);
    defined $obj and return $obj->name;
    return;
}

#
# clone_encoding(NAME)
#
# Returns a Storable::dclone() copy of the encoding object for NAME.
# Returns nothing when NAME does not resolve to a reference or when
# Storable cannot be loaded.
#
sub clone_encoding($){
    my $obj = find_encoding(shift);
    ref $obj or return;
    eval { require Storable };
    $@ and return;
    return Storable::dclone($obj);
}

#
# encode(ENCODING, STRING [, CHECK])
#
# Looks ENCODING up with find_encoding() and returns the result of its
# ->encode(STRING, CHECK).  Returns undef when STRING is undef; croaks
# with "Unknown encoding '...'" when ENCODING cannot be resolved.
# A reference passed as STRING is stringified first; CHECK defaults to 0.
#
# When CHECK is a true bitmask without LEAVE_SRC, the caller's STRING
# argument is overwritten with whatever the encoding object left in its
# copy.  A code-ref CHECK is not a bitmask: applying "&" to it would test
# bits of the reference address, so no write-back is attempted for it.
#
sub encode($$;$)
{
    my ($name, $string, $check) = @_;
    return undef unless defined $string;
    $string .= '' if ref $string; # stringify;
    $check ||=0;
    my $enc = find_encoding($name);
    unless(defined $enc){
        require Carp;
        Carp::croak("Unknown encoding '$name'");
    }
    my $octets = $enc->encode($string,$check);
    $_[1] = $string if $check and !ref $check and !($check & LEAVE_SRC());
    return $octets;
}

#
# decode(ENCODING, OCTETS [, CHECK])
#
# Mirror image of encode(): returns ->decode(OCTETS, CHECK) of the
# resolved encoding object.  Returns undef when OCTETS is undef; croaks
# on an unknown ENCODING.  The write-back to the caller's OCTETS argument
# follows the same rule as in encode(), including the code-ref exemption.
#
sub decode($$;$)
{
    my ($name,$octets,$check) = @_;
    return undef unless defined $octets;
    $octets .= '' if ref $octets;
    $check ||=0;
    my $enc = find_encoding($name);
    unless(defined $enc){
        require Carp;
        Carp::croak("Unknown encoding '$name'");
    }
    my $string = $enc->decode($octets,$check);
    $_[1] = $octets if $check and !ref $check and !($check & LEAVE_SRC());
    return $string;
}

#
# from_to(OCTETS, FROM_ENC, TO_ENC [, CHECK])
#
# Converts OCTETS in place: decodes with FROM_ENC, encodes the result with
# TO_ENC and stores it back into the caller's first argument.  Returns the
# length of the converted string, or undef when OCTETS is undef or when
# CHECK is true and either step left unprocessed data in its source copy.
# Croaks when either encoding name cannot be resolved.
#
sub from_to($$$;$)
{
    my ($string,$from,$to,$check) = @_;
    return undef unless defined $string;
    $check ||=0;
    my $f = find_encoding($from);
    unless (defined $f){
        require Carp;
        Carp::croak("Unknown encoding '$from'");
    }
    my $t = find_encoding($to);
    unless (defined $t){
        require Carp;
        Carp::croak("Unknown encoding '$to'");
    }
    my $uni = $f->decode($string,$check);
    return undef if ($check && length($string));    # input not fully consumed
    $string = $t->encode($uni,$check);
    return undef if ($check && length($uni));       # ditto for the encode step
    return defined($_[0] = $string) ? length($string) : undef ;
}

#
# encode_utf8(STRING)
#
# Returns a copy of STRING after utf8::encode() has been applied to it;
# the caller's variable is not modified.
#
sub encode_utf8($)
{
    my ($str) = @_;
    utf8::encode($str);
    return $str;
}

#
# decode_utf8(OCTETS [, CHECK])
#
# Shorthand for decode("utf8", OCTETS [, CHECK]); CHECK is only passed on
# when it is true.
#
sub decode_utf8($;$)
{
    my ($str, $check) = @_;
    if ($check){
        return decode("utf8", $str, $check);
    }else{
        return decode("utf8", $str);
    }
}

predefine_encodings(1);

#
# This is to restore %Encoding if really needed;
#
# predefine_encodings(USE_XS)
#
# Installs the built-in encodings into %Encode::Encoding:
#   - "Unicode": an Encode::UTF_EBCDIC object on EBCDIC platforms, an
#     Encode::Internal object elsewhere.
#   - "utf8" and "utf-8-strict": Encode::utf8 objects (the latter carries
#     strict_utf8 => 1).
# With a true USE_XS, Encode::utf8::decode/encode are aliased to
# decode_xs/encode_xs (presumably provided by the XS code loaded above -
# they are not defined in this file); otherwise pure-Perl closures are
# installed.
#
sub predefine_encodings{
    use Encode::Encoding;
    no warnings 'redefine';
    my $use_xs = shift;
    if ($ON_EBCDIC) {
        # was in Encode::UTF_EBCDIC
        package Encode::UTF_EBCDIC;
        push @Encode::UTF_EBCDIC::ISA, 'Encode::Encoding';
        *decode = sub{
            my ($obj,$str,$chk) = @_;
            my $res = '';
            for (my $i = 0; $i < length($str); $i++) {
                $res .=
                    chr(utf8::unicode_to_native(ord(substr($str,$i,1))));
            }
            $_[1] = '' if $chk;    # source fully consumed
            return $res;
        };
        *encode = sub{
            my ($obj,$str,$chk) = @_;
            my $res = '';
            for (my $i = 0; $i < length($str); $i++) {
                $res .=
                    chr(utf8::native_to_unicode(ord(substr($str,$i,1))));
            }
            $_[1] = '' if $chk;    # source fully consumed
            return $res;
        };
        $Encode::Encoding{Unicode} =
            bless {Name => "UTF_EBCDIC"} => "Encode::UTF_EBCDIC";
    } else {
        package Encode::Internal;
        push @Encode::Internal::ISA, 'Encode::Encoding';
        *decode = sub{
            my ($obj,$str,$chk) = @_;
            utf8::upgrade($str);
            $_[1] = '' if $chk;    # source fully consumed
            return $str;
        };
        *encode = \&decode;    # both directions are the same upgrade
        $Encode::Encoding{Unicode} =
            bless {Name => "Internal"} => "Encode::Internal";
    }

    {
        # was in Encode::utf8
        package Encode::utf8;
        push @Encode::utf8::ISA, 'Encode::Encoding';
        #
        if ($use_xs){
            Encode::DEBUG and warn __PACKAGE__, " XS on";
            *decode = \&decode_xs;
            *encode = \&encode_xs;
        }else{
            Encode::DEBUG and warn __PACKAGE__, " XS off";
            # NOTE(review): Encode::decode_utf8() calls decode("utf8", ...),
            # which dispatches to the "utf8" object's decode method - i.e.
            # this very closure once it is installed.  That looks like
            # unbounded recursion in the non-XS configuration; confirm
            # before relying on predefine_encodings(0).
            *decode = sub{
                my ($obj,$octets,$chk) = @_;
                my $str = Encode::decode_utf8($octets);
                if (defined $str) {
                    $_[1] = '' if $chk;
                    return $str;
                }
                return undef;
            };
            *encode = sub {
                my ($obj,$string,$chk) = @_;
                my $octets = Encode::encode_utf8($string);
                $_[1] = '' if $chk;
                return $octets;
            };
        }
        # Appends to $dst the bytes of $src from $pos up to and including
        # the first occurrence of $trm, advancing $pos past it.  Returns 1
        # when the terminator was found; otherwise appends the rest of
        # $src, moves $pos to its end and returns ''.
        *cat_decode = sub{ # ($obj, $dst, $src, $pos, $trm, $chk)
            my ($obj, undef, undef, $pos, $trm) = @_; # currently ignores $chk
            # references into @_ so the caller's variables are updated
            my ($rdst, $rsrc, $rpos) = \@_[1,2,3];
            use bytes;
            if ((my $npos = index($$rsrc, $trm, $pos)) >= 0) {
                $$rdst .= substr($$rsrc, $pos, $npos - $pos + length($trm));
                $$rpos = $npos + length($trm);
                return 1;
            }
            $$rdst .= substr($$rsrc, $pos);
            $$rpos = length($$rsrc);
            return '';
        };
        $Encode::Encoding{utf8} =
            bless {Name => "utf8"} => "Encode::utf8";
        $Encode::Encoding{"utf-8-strict"} =
            bless {Name => "utf-8-strict", strict_utf8 => 1 } => "Encode::utf8";
    }
}

1;

__END__

=head1 NAME

Encode - character encodings

=head1 SYNOPSIS

    use Encode;

=head2 Table of Contents

Encode consists of a collection of modules whose details are too big
to fit in one document.  This POD itself explains the top-level APIs
and general topics at a glance.  For other topics and more details,
see the PODs below:

  Name                  Description
  --------------------------------------------------------
  Encode::Alias         Alias definitions to encodings
  Encode::Encoding      Encode Implementation Base Class
  Encode::Supported     List of Supported Encodings
  Encode::CN            Simplified Chinese Encodings
  Encode::JP            Japanese Encodings
  Encode::KR            Korean Encodings
  Encode::TW            Traditional Chinese Encodings
  --------------------------------------------------------

=head1 DESCRIPTION

The C<Encode> module provides the interfaces between Perl's strings
and the rest of the system.  Perl strings are sequences of
B<characters>.

The repertoire of characters that Perl can represent is at least that
defined by the Unicode Consortium. On most platforms the ordinal
value of a character (as returned by C<ord(ch)>) is the "Unicode
codepoint" for the character (the exceptions are those platforms where
the legacy encoding is some variant of EBCDIC rather than a super-set
of ASCII - see L<perlebcdic>).

Traditionally, computer data has been moved around in 8-bit chunks
often called "bytes". These chunks are also known as "octets" in
networking standards. Perl is widely used to manipulate data of many
types - not only strings of characters representing human or computer
languages but also "binary" data being the machine's representation of
numbers, pixels in an image - or just about anything.

When Perl is processing "binary data", the programmer wants Perl to
process "sequences of bytes". This is not a problem for Perl - as a
byte has 256 possible values, it easily fits in Perl's much larger
"logical character".

=head2 TERMINOLOGY

=over 2

=item *

I<character>: a character in the range 0..(2**32-1) (or more).
(What Perl's strings are made of.)

=item *

I<byte>: a character in the range 0..255.
(A special case of a Perl character.)

=item *

I<octet>: 8 bits of data, with ordinal values 0..255.
(Term for bytes passed to or from a non-Perl context, e.g. a disk file.)

=back

=head1 PERL ENCODING API

=over 2

=item $octets  = encode(ENCODING, $string [, CHECK])

Encodes a string from Perl's internal form into I<ENCODING> and returns
a sequence of octets.  ENCODING can be either a canonical name or
an alias.  For encoding names and aliases, see L</"Defining Aliases">.
For CHECK, see L</"Handling Malformed Data">.

For example, to convert a string from Perl's internal format to
iso-8859-1 (also known as Latin1),

  $octets = encode("iso-8859-1", $string);

B<CAVEAT>: When you run C<$octets = encode("utf8", $string)>, then $octets
B<may not be equal to> $string.  Though they both contain the same data,
the utf8 flag for $octets is B<always> off.  When you encode anything, the
utf8 flag of the result is always off, even when it contains a completely
valid utf8 string. See L</"The UTF-8 flag"> below.

If the $string is C<undef> then C<undef> is returned.

=item $string = decode(ENCODING, $octets [, CHECK])

Decodes a sequence of octets assumed to be in I<ENCODING> into Perl's
internal form and returns the resulting string.  As in encode(),
ENCODING can be either a canonical name or an alias. For encoding names
and aliases, see L</"Defining Aliases">.  For CHECK, see
L</"Handling Malformed Data">.

For example, to convert ISO-8859-1 data to a string in Perl's internal format:

  $string = decode("iso-8859-1", $octets);

B<CAVEAT>: When you run C<$string = decode("utf8", $octets)>, then $string
B<may not be equal to> $octets.  Though they both contain the same data,
the utf8 flag for $string is on unless $octets entirely consists of
ASCII data (or EBCDIC on EBCDIC machines).  See L</"The UTF-8 flag">
below.

If $octets is C<undef> then C<undef> is returned.

=item [$length =] from_to($octets, FROM_ENC, TO_ENC [, CHECK])

Converts B<in-place> data between two encodings. The data in $octets
must be encoded as octets and not as characters in Perl's internal
format. For example, to convert ISO-8859-1 data to Microsoft's CP1250
encoding:

  from_to($octets, "iso-8859-1", "cp1250");

and to convert it back:

  from_to($octets, "cp1250", "iso-8859-1");

Note that because the conversion happens in place, the data to be
converted cannot be a string constant; it must be a scalar variable.

from_to() returns the length of the converted string in octets on
success, I<undef> on error.

B<CAVEAT>: The following operations look the same but are not quite so;

  from_to($data, "iso-8859-1", "utf8"); #1
  $data = decode("iso-8859-1", $data);  #2

Both #1 and #2 make $data consist of a completely valid UTF-8 string
but only #2 turns the utf8 flag on.  #1 is equivalent to

  $data = encode("utf8", decode("iso-8859-1", $data));

See L</"The UTF-8 flag"> below.

=item $octets = encode_utf8($string);

Equivalent to C<$octets = encode("utf8", $string);> The characters
that comprise $string are encoded in Perl's internal format and the
result is returned as a sequence of octets. All possible
characters have a UTF-8 representation so this function cannot fail.


=item $string = decode_utf8($octets [, CHECK]);

Equivalent to C<$string = decode("utf8", $octets [, CHECK])>.
The sequence of octets represented by
$octets is decoded from UTF-8 into a sequence of logical
characters. Not all sequences of octets form valid UTF-8 encodings, so
it is possible for this call to fail.  For CHECK, see
L</"Handling Malformed Data">.

=back

=head2 Listing available encodings

  use Encode;
  @list = Encode->encodings();

Returns a list of the canonical names of the available encodings that
are loaded.  To get a list of all available encodings including the
ones that are not loaded yet, say

  @all_encodings = Encode->encodings(":all");

Or you can give the name of a specific module.

  @with_jp = Encode->encodings("Encode::JP");

When "::" is not in the name, "Encode::" is assumed.

  @ebcdic = Encode->encodings("EBCDIC");

To find out in detail which encodings are supported by this package,
see L<Encode::Supported>.

=head2 Defining Aliases

To add a new alias to a given encoding, use:

  use Encode;
  use Encode::Alias;
  define_alias(newName => ENCODING);

After that, newName can be used as an alias for ENCODING.
ENCODING may be either the name of an encoding or an
I<encoding object>.

But before you do so, make sure the alias is nonexistent with
C<resolve_alias()>, which returns the canonical name thereof.
For example:

  Encode::resolve_alias("latin1") eq "iso-8859-1" # true
  Encode::resolve_alias("iso-8859-12")   # false; nonexistent
  Encode::resolve_alias($name) eq $name  # true if $name is canonical

resolve_alias() does not need C<use Encode::Alias>; it can be
exported via C<use Encode qw(resolve_alias)>.

See L<Encode::Alias> for details.

=head1 Encoding via PerlIO

If your perl supports I<PerlIO> (which is the default), you can use a
PerlIO layer to decode and encode directly via a filehandle.  The
following two examples are totally identical in their functionality.

  # via PerlIO
  open my $in,  "<:encoding(shiftjis)", $infile  or die;
  open my $out, ">:encoding(euc-jp)",   $outfile or die;
  while(<$in>){ print $out $_; }

  # via from_to
  open my $in,  "<", $infile  or die;
  open my $out, ">", $outfile or die;
  while(<$in>){
    from_to($_, "shiftjis", "euc-jp", 1);
    print $out $_;
  }

Unfortunately, not all encodings are PerlIO-savvy.  You can check
if your encoding is supported by PerlIO by calling the C<perlio_ok>
method.

  Encode::perlio_ok("hz");             # False
  find_encoding("euc-cn")->perlio_ok;  # True where PerlIO is available

  use Encode qw(perlio_ok);            # exported upon request
  perlio_ok("euc-jp")

Fortunately, all encodings that come with Encode core are PerlIO-savvy
except for hz and ISO-2022-kr.  For gory details, see
L<Encode::Encoding> and L<Encode::PerlIO>.

=head1 Handling Malformed Data

The optional I<CHECK> argument tells Encode what to do when it
encounters malformed data.  Without CHECK, Encode::FB_DEFAULT ( == 0 )
is assumed.

As of version 2.12 Encode supports coderef values for CHECK.  See below.

=over 2

=item B<NOTE:> Not all encodings support this feature

Some encodings ignore the I<CHECK> argument.  For example,
L<Encode::Unicode> ignores I<CHECK> and it always croaks on error.

=back

Now here is the list of I<CHECK> values available:

=over 2

=item I<CHECK> = Encode::FB_DEFAULT ( == 0)

If I<CHECK> is 0, (en|de)code will put a I<substitution character> in
place of a malformed character.  When you encode, E<lt>subcharE<gt>
will be used.  When you decode the code point C<0xFFFD> is used.
If the data is supposed to be UTF-8, an optional lexical warning
(category utf8) is given.

=item I<CHECK> = Encode::FB_CROAK ( == 1)

If I<CHECK> is 1, methods will die on error immediately with an error
message.  Therefore, when I<CHECK> is set to 1, you should trap the
error with eval{} unless you really want to let it die.

=item I<CHECK> = Encode::FB_QUIET

If I<CHECK> is set to Encode::FB_QUIET, (en|de)code will immediately
return the portion of the data that has been processed so far when an
error occurs. The data argument will be overwritten with everything
after that point (that is, the unprocessed part of data).  This is
handy when you have to call decode repeatedly in the case where your
source data may contain partial multi-byte character sequences,
(i.e. you are reading with a fixed-width buffer). Here is some sample
code that does exactly this:

  my $buffer = ''; my $string = '';
  while(read $fh, $buffer, 256, length($buffer)){
    $string .= decode($encoding, $buffer, Encode::FB_QUIET);
    # $buffer now contains the unprocessed partial character
  }

=item I<CHECK> = Encode::FB_WARN

This is the same as above, except that it warns on error.  Handy when
you are debugging the mode above.

=item perlqq mode (I<CHECK> = Encode::FB_PERLQQ)

=item HTML charref mode (I<CHECK> = Encode::FB_HTMLCREF)

=item XML charref mode (I<CHECK> = Encode::FB_XMLCREF)

For encodings that are implemented by Encode::XS, CHECK ==
Encode::FB_PERLQQ turns (en|de)code into C<perlqq> fallback mode.

When you decode, C<\xI<HH>> will be inserted for a malformed character,
where I<HH> is the hex representation of the octet that could not be
decoded to utf8.  And when you encode, C<\x{I<HHHH>}> will be inserted,
where I<HHHH> is the Unicode ID of the character that cannot be found
in the character repertoire of the encoding.

HTML/XML character reference modes are about the same; in place of
C<\x{I<HHHH>}>, HTML uses C<&#I<NNN>;> where I<NNN> is a decimal number and
XML uses C<&#xI<HHHH>;> where I<HHHH> is the hexadecimal number.

In Encode 2.10 or later, C<LEAVE_SRC> is also implied.

=item The bitmask

These modes are actually set via a bitmask.  Here is how the FB_XX
constants are laid out.  You can import the FB_XX constants via
C<use Encode qw(:fallbacks)>; you can import the generic bitmask
constants via C<use Encode qw(:fallback_all)>.

                     FB_DEFAULT FB_CROAK FB_QUIET FB_WARN  FB_PERLQQ
 DIE_ON_ERR    0x0001             X
 WARN_ON_ERR   0x0002                               X
 RETURN_ON_ERR 0x0004                      X        X
 LEAVE_SRC     0x0008                                        X
 PERLQQ        0x0100                                        X
 HTMLCREF      0x0200
 XMLCREF       0x0400

=back

=head2 coderef for CHECK

As of Encode 2.12 CHECK can also be a code reference which takes the
ord value of the unmapped character as an argument and returns a string
that represents the fallback character.  For instance,

  $ascii = encode("ascii", $utf8, sub{ sprintf "<U+%04X>", shift });

Acts like FB_PERLQQ but E<lt>U+I<XXXX>E<gt> is used instead of
\x{I<XXXX>}.

=head1 Defining Encodings

To define a new encoding, use:

    use Encode qw(define_encoding);
    define_encoding($object, 'canonicalName' [, alias...]);

I<canonicalName> will be associated with I<$object>.  The object
should provide the interface described in L<Encode::Encoding>.
If more than two arguments are provided then additional
arguments are taken as aliases for I<$object>.

See L<Encode::Encoding> for more details.

=head1 The UTF-8 flag

Before the introduction of utf8 support in perl, the C<eq> operator
just compared the strings represented by two scalars. Beginning with
perl 5.8, C<eq> compares two strings with simultaneous consideration
of I<the utf8 flag>.
To explain why we made it so, I will quote page
402 of C<Programming Perl, 3rd ed.>:

=over 2

=item Goal #1:

Old byte-oriented programs should not spontaneously break on the old
byte-oriented data they used to work on.

=item Goal #2:

Old byte-oriented programs should magically start working on the new
character-oriented data when appropriate.

=item Goal #3:

Programs should run just as fast in the new character-oriented mode
as in the old byte-oriented mode.

=item Goal #4:

Perl should remain one language, rather than forking into a
byte-oriented Perl and a character-oriented Perl.

=back

Back when C<Programming Perl, 3rd ed.> was written, not even Perl 5.6.0
was born and many features documented in the book remained
unimplemented for a long time.  Perl 5.8 corrected this and the introduction
of the UTF-8 flag is one of them.  You can think of this perl notion as
being a byte-oriented mode (utf8 flag off) and a character-oriented mode
(utf8 flag on).

Here is how Encode takes care of the utf8 flag.

=over 2

=item *

When you encode, the resulting utf8 flag is always off.

=item *

When you decode, the resulting utf8 flag is on unless you can
unambiguously represent data.  Here is the definition of
dis-ambiguity.

After C<$utf8 = decode('foo', $octet);>,

  When $octet is...   The utf8 flag in $utf8 is
  ---------------------------------------------
  In ASCII only (or EBCDIC only)            OFF
  In ISO-8859-1                              ON
  In any other Encoding                      ON
  ---------------------------------------------

As you see, there is one exception: ASCII.  That way you can assume
Goal #1.  And with Encode Goal #2 is assumed but you still have to be
careful in the cases mentioned in the B<CAVEAT> paragraphs.

This utf8 flag is not visible in perl scripts, exactly for the same
reason you cannot (or you I<don't have to>) see if a scalar contains a
string, integer, or floating point number.   But you can still peek
and poke these if you will.  See the section below.

=back

=head2 Messing with Perl's Internals

The following APIs use parts of Perl's internals in the current
implementation.  As such, they are efficient but may change.

=over 2

=item is_utf8(STRING [, CHECK])

[INTERNAL] Tests whether the UTF-8 flag is turned on in the STRING.
If CHECK is true, also checks the data in STRING for being well-formed
UTF-8.  Returns true if successful, false otherwise.

As of perl 5.8.1, L<utf8> also has utf8::is_utf8().

=item _utf8_on(STRING)

[INTERNAL] Turns on the UTF-8 flag in STRING.  The data in STRING is
B<not> checked for being well-formed UTF-8.  Do not use unless you
B<know> that the STRING is well-formed UTF-8.  Returns the previous
state of the UTF-8 flag (so please don't treat the return value as
indicating success or failure), or C<undef> if STRING is not a string.

=item _utf8_off(STRING)

[INTERNAL] Turns off the UTF-8 flag in STRING.  Do not use frivolously.
Returns the previous state of the UTF-8 flag (so please don't treat the
return value as indicating success or failure), or C<undef> if STRING is
not a string.

=back

=head1 UTF-8 vs. utf8

  ....We now view strings not as sequences of bytes, but as sequences
  of numbers in the range 0 .. 2**32-1 (or in the case of 64-bit
  computers, 0 .. 2**64-1) -- Programming Perl, 3rd ed.

That has been perl's notion of UTF-8, but official UTF-8 is more
strict: its range is much narrower (0 .. 0x10FFFF), and some sequences
are not allowed (e.g. those used in surrogate pairs, 0xFFFE, et al).

Now that is overruled by Larry Wall himself.

  From: Larry Wall <larry@wall.org>
  Date: December 04, 2004 11:51:58 JST
  To: perl-unicode@perl.org
  Subject: Re: Make Encode.pm support the real UTF-8
  Message-Id: <20041204025158.GA28754@wall.org>

  On Fri, Dec 03, 2004 at 10:12:12PM +0000, Tim Bunce wrote:
  : I've no problem with 'utf8' being perl's unrestricted uft8 encoding,
  : but "UTF-8" is the name of the standard and should give the
  : corresponding behaviour.

  For what it's worth, that's how I've always kept them straight in my
  head.

  Also for what it's worth, Perl 6 will mostly default to strict but
  make it easy to switch back to lax.

  Larry

Do you copy?  As of Perl 5.8.7, B<UTF-8> means strict, official UTF-8
while B<utf8> means the liberal, lax version thereof.  And Encode version
2.10 or later thus groks the difference between C<UTF-8> and C<utf8>.

  encode("utf8",  "\x{FFFF_FFFF}", 1); # okay
  encode("UTF-8", "\x{FFFF_FFFF}", 1); # croaks

C<UTF-8> in Encode is actually a canonical name for C<utf-8-strict>.
Yes, the hyphen between "UTF" and "8" is important.  Without it Encode
goes "liberal":

  find_encoding("UTF-8")->name # is 'utf-8-strict'
  find_encoding("utf-8")->name # ditto. names are case insensitive
  find_encoding("utf_8")->name # ditto. "_" are treated as "-"
  find_encoding("UTF8")->name  # is 'utf8'.


=head1 SEE ALSO

L<Encode::Encoding>,
L<Encode::Supported>,
L<Encode::PerlIO>,
L<encoding>,
L<perlebcdic>,
L<perlfunc/open>,
L<perlunicode>,
L<utf8>,
the Perl Unicode Mailing List E<lt>perl-unicode@perl.orgE<gt>

=head1 MAINTAINER

This project was originated by Nick Ing-Simmons and later maintained
by Dan Kogai E<lt>dankogai@dan.co.jpE<gt>.  See AUTHORS for a full
list of people involved.  For any questions, use
E<lt>perl-unicode@perl.orgE<gt> so we can all share.

=cut
