1package Encode::Alias; 2use strict; 3no warnings 'redefine'; 4use Encode; 5our $VERSION = do { my @r = (q$Revision: 2.4 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; 6sub DEBUG () { 0 } 7 8use base qw(Exporter); 9 10# Public, encouraged API is exported by default 11 12our @EXPORT = 13 qw ( 14 define_alias 15 find_alias 16 ); 17 18our @Alias; # ordered matching list 19our %Alias; # cached known aliases 20 21sub find_alias{ 22 my $class = shift; 23 my $find = shift; 24 unless (exists $Alias{$find}) { 25 $Alias{$find} = undef; # Recursion guard 26 for (my $i=0; $i < @Alias; $i += 2){ 27 my $alias = $Alias[$i]; 28 my $val = $Alias[$i+1]; 29 my $new; 30 if (ref($alias) eq 'Regexp' && $find =~ $alias){ 31 DEBUG and warn "eval $val"; 32 $new = eval $val; 33 DEBUG and $@ and warn "$val, $@"; 34 }elsif (ref($alias) eq 'CODE'){ 35 DEBUG and warn "$alias", "->", "($find)"; 36 $new = $alias->($find); 37 }elsif (lc($find) eq lc($alias)){ 38 $new = $val; 39 } 40 if (defined($new)){ 41 next if $new eq $find; # avoid (direct) recursion on bugs 42 DEBUG and warn "$alias, $new"; 43 my $enc = (ref($new)) ? $new : Encode::find_encoding($new); 44 if ($enc){ 45 $Alias{$find} = $enc; 46 last; 47 } 48 } 49 } 50 # case insensitive search when canonical is not in all lowercase 51 # RT ticket #7835 52 unless ($Alias{$find}){ 53 my $lcfind = lc($find); 54 for my $name (keys %Encode::Encoding, keys %Encode::ExtModule){ 55 $lcfind eq lc($name) or next; 56 $Alias{$find} = Encode::find_encoding($name); 57 DEBUG and warn "$find => $name"; 58 } 59 } 60 } 61 if (DEBUG){ 62 my $name; 63 if (my $e = $Alias{$find}){ 64 $name = $e->name; 65 }else{ 66 $name = ""; 67 } 68 warn "find_alias($class, $find)->name = $name"; 69 } 70 return $Alias{$find}; 71} 72 73sub define_alias{ 74 while (@_){ 75 my ($alias,$name) = splice(@_,0,2); 76 unshift(@Alias, $alias => $name); # newer one has precedence 77 if (ref($alias)){ 78 # clear %Alias cache to allow overrides 79 my @a = keys %Alias; 80 for my $k (@a){ 81 if (ref($alias) eq 'Regexp' && $k =~ $alias){ 82 DEBUG and warn "delete \$Alias\{$k\}"; 83 delete $Alias{$k}; 84 } 85 elsif (ref($alias) eq 'CODE'){ 86 DEBUG and warn "delete \$Alias\{$k\}"; 87 delete $Alias{$alias->($name)}; 88 } 89 } 90 }else{ 91 DEBUG and warn "delete \$Alias\{$alias\}"; 92 delete $Alias{$alias}; 93 } 94 } 95} 96 97# Allow latin-1 style names as well 98# 0 1 2 3 4 5 6 7 8 9 10 99our @Latin2iso = ( 0, 1, 2, 3, 4, 9, 10, 13, 14, 15, 16 ); 100# Allow winlatin1 style names as well 101our %Winlatin2cp = ( 102 'latin1' => 1252, 103 'latin2' => 1250, 104 'cyrillic' => 1251, 105 'greek' => 1253, 106 'turkish' => 1254, 107 'hebrew' => 1255, 108 'arabic' => 1256, 109 'baltic' => 1257, 110 'vietnamese' => 1258, 111 ); 112 113init_aliases(); 114 115sub undef_aliases{ 116 @Alias = (); 117 %Alias = (); 118} 119 120sub init_aliases 121{ 122 undef_aliases(); 123 # Try all-lower-case version should all else fails 124 define_alias( qr/^(.*)$/ => '"\L$1"' ); 125 126 # UTF/UCS stuff 127 define_alias( qr/^UTF-?7$/i => '"UTF-7"'); 128 define_alias( qr/^UCS-?2-?LE$/i => '"UCS-2LE"' ); 129 define_alias( qr/^UCS-?2-?(BE)?$/i => '"UCS-2BE"', 130 qr/^UCS-?4-?(BE|LE)?$/i => 'uc("UTF-32$1")', 131 qr/^iso-10646-1$/i => '"UCS-2BE"' ); 132 define_alias( qr/^UTF-?(16|32)-?BE$/i => '"UTF-$1BE"', 133 qr/^UTF-?(16|32)-?LE$/i => '"UTF-$1LE"', 134 qr/^UTF-?(16|32)$/i => '"UTF-$1"', 135 ); 136 # ASCII 137 define_alias(qr/^(?:US-?)ascii$/i => '"ascii"'); 138 define_alias('C' => 'ascii'); 139 define_alias(qr/\bISO[-_]?646[-_]?US$/i => '"ascii"'); 140 define_alias('646' => 'ascii'); 141 142 # Allow variants of iso-8859-1 etc. 143 define_alias( qr/\biso[-_]?(\d+)[-_](\d+)$/i => '"iso-$1-$2"' ); 144 145 # At least HP-UX has these. 146 define_alias( qr/\biso8859(\d+)$/i => '"iso-8859-$1"' ); 147 148 # More HP stuff. 149 define_alias( qr/\b(?:hp-)?(arabic|greek|hebrew|kana|roman|thai|turkish)8$/i => '"${1}8"' ); 150 151 # The Official name of ASCII. 152 define_alias( qr/\bANSI[-_]?X3\.4[-_]?1968$/i => '"ascii"' ); 153 154 # This is a font issue, not an encoding issue. 155 # (The currency symbol of the Latin 1 upper half 156 # has been redefined as the euro symbol.) 157 define_alias( qr/^(.+)\@euro$/i => '"$1"' ); 158 159 define_alias( qr/\b(?:iso[-_]?)?latin[-_]?(\d+)$/i 160 => 'defined $Encode::Alias::Latin2iso[$1] ? "iso-8859-$Encode::Alias::Latin2iso[$1]" : undef' ); 161 162 define_alias( qr/\bwin(latin[12]|cyrillic|baltic|greek|turkish| 163 hebrew|arabic|baltic|vietnamese)$/ix => 164 '"cp" . $Encode::Alias::Winlatin2cp{lc($1)}' ); 165 166 # Common names for non-latin preferred MIME names 167 define_alias( 'ascii' => 'US-ascii', 168 'cyrillic' => 'iso-8859-5', 169 'arabic' => 'iso-8859-6', 170 'greek' => 'iso-8859-7', 171 'hebrew' => 'iso-8859-8', 172 'thai' => 'iso-8859-11', 173 'tis620' => 'iso-8859-11', 174 ); 175 176 # At least AIX has IBM-NNN (surprisingly...) instead of cpNNN. 177 # And Microsoft has their own naming (again, surprisingly). 178 # And windows-* is registered in IANA! 179 define_alias( qr/\b(?:cp|ibm|ms|windows)[-_ ]?(\d{2,4})$/i => '"cp$1"'); 180 181 # Sometimes seen with a leading zero. 182 # define_alias( qr/\bcp037\b/i => '"cp37"'); 183 184 # Mac Mappings 185 # predefined in *.ucm; unneeded 186 # define_alias( qr/\bmacIcelandic$/i => '"macIceland"'); 187 define_alias( qr/^mac_(.*)$/i => '"mac$1"'); 188 # Ououououou. gone. They are differente! 189 # define_alias( qr/\bmacRomanian$/i => '"macRumanian"'); 190 191 # Standardize on the dashed versions. 192 define_alias( qr/\bkoi8[\s\-_]*([ru])$/i => '"koi8-$1"' ); 193 194 unless ($Encode::ON_EBCDIC){ 195 # for Encode::CN 196 define_alias( qr/\beuc.*cn$/i => '"euc-cn"' ); 197 define_alias( qr/\bcn.*euc$/i => '"euc-cn"' ); 198 # define_alias( qr/\bGB[- ]?(\d+)$/i => '"euc-cn"' ) 199 # CP936 doesn't have vendor-addon for GBK, so they're identical. 200 define_alias( qr/^gbk$/i => '"cp936"'); 201 # This fixes gb2312 vs. euc-cn confusion, practically 202 define_alias( qr/\bGB[-_ ]?2312(?!-?raw)/i => '"euc-cn"' ); 203 # for Encode::JP 204 define_alias( qr/\bjis$/i => '"7bit-jis"' ); 205 define_alias( qr/\beuc.*jp$/i => '"euc-jp"' ); 206 define_alias( qr/\bjp.*euc$/i => '"euc-jp"' ); 207 define_alias( qr/\bujis$/i => '"euc-jp"' ); 208 define_alias( qr/\bshift.*jis$/i => '"shiftjis"' ); 209 define_alias( qr/\bsjis$/i => '"shiftjis"' ); 210 define_alias( qr/\bwindows-31j$/i => '"cp932"' ); 211 # for Encode::KR 212 define_alias( qr/\beuc.*kr$/i => '"euc-kr"' ); 213 define_alias( qr/\bkr.*euc$/i => '"euc-kr"' ); 214 # This fixes ksc5601 vs. euc-kr confusion, practically 215 define_alias( qr/(?:x-)?uhc$/i => '"cp949"' ); 216 define_alias( qr/(?:x-)?windows-949$/i => '"cp949"' ); 217 define_alias( qr/\bks_c_5601-1987$/i => '"cp949"' ); 218 # for Encode::TW 219 define_alias( qr/\bbig-?5$/i => '"big5-eten"' ); 220 define_alias( qr/\bbig5-?et(?:en)?$/i => '"big5-eten"' ); 221 define_alias( qr/\btca[-_]?big5$/i => '"big5-eten"' ); 222 define_alias( qr/\bbig5-?hk(?:scs)?$/i => '"big5-hkscs"' ); 223 define_alias( qr/\bhk(?:scs)?[-_]?big5$/i => '"big5-hkscs"' ); 224 } 225 # utf8 is blessed :) 226 define_alias( qr/^UTF-8$/i => '"utf-8-strict"'); 227 # At last, Map white space and _ to '-' 228 define_alias( qr/^(\S+)[\s_]+(.*)$/i => '"$1-$2"' ); 229} 230 2311; 232__END__ 233 234# TODO: HP-UX '8' encodings arabic8 greek8 hebrew8 kana8 thai8 turkish8 235# TODO: HP-UX '15' encodings japanese15 korean15 roi15 236# TODO: Cyrillic encoding ISO-IR-111 (useful?) 237# TODO: Armenian encoding ARMSCII-8 238# TODO: Hebrew encoding ISO-8859-8-1 239# TODO: Thai encoding TCVN 240# TODO: Vietnamese encodings VPS 241# TODO: Mac Asian+African encodings: Arabic Armenian Bengali Burmese 242# ChineseSimp ChineseTrad Devanagari Ethiopic ExtArabic 243# Farsi Georgian Gujarati Gurmukhi Hebrew Japanese 244# Kannada Khmer Korean Laotian Malayalam Mongolian 245# Oriya Sinhalese Symbol Tamil Telugu Tibetan Vietnamese 246 247=head1 NAME 248 249Encode::Alias - alias definitions to encodings 250 251=head1 SYNOPSIS 252 253 use Encode; 254 use Encode::Alias; 255 define_alias( newName => ENCODING); 256 257=head1 DESCRIPTION 258 259Allows newName to be used as an alias for ENCODING. ENCODING may be 260either the name of an encoding or an encoding object (as described 261in L<Encode>). 262 263Currently I<newName> can be specified in the following ways: 264 265=over 4 266 267=item As a simple string. 268 269=item As a qr// compiled regular expression, e.g.: 270 271 define_alias( qr/^iso8859-(\d+)$/i => '"iso-8859-$1"' ); 272 273In this case, if I<ENCODING> is not a reference, it is C<eval>-ed 274in order to allow C<$1> etc. to be substituted. The example is one 275way to alias names as used in X11 fonts to the MIME names for the 276iso-8859-* family. Note the double quotes inside the single quotes. 277 278(or, you don't have to do this yourself because this example is predefined) 279 280If you are using a regex here, you have to use the quotes as shown or 281it won't work. Also note that regex handling is tricky even for the 282experienced. Use this feature with caution. 283 284=item As a code reference, e.g.: 285 286 define_alias( sub {shift =~ /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } ); 287 288The same effect as the example above in a different way. The coderef 289takes the alias name as an argument and returns a canonical name on 290success or undef if not. Note the second argument is not required. 291Use this with even more caution than the regex version. 292 293=back 294 295=head3 Changes in code reference aliasing 296 297As of Encode 1.87, the older form 298 299 define_alias( sub { return /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } ); 300 301no longer works. 302 303Encode up to 1.86 internally used "local $_" to implement ths older 304form. But consider the code below; 305 306 use Encode; 307 $_ = "eeeee" ; 308 while (/(e)/g) { 309 my $utf = decode('aliased-encoding-name', $1); 310 print "position:",pos,"\n"; 311 } 312 313Prior to Encode 1.86 this fails because of "local $_". 314 315=head2 Alias overloading 316 317You can override predefined aliases by simply applying define_alias(). 318The new alias is always evaluated first, and when necessary, 319define_alias() flushes the internal cache to make the new definition 320available. 321 322 # redirect SHIFT_JIS to MS/IBM Code Page 932, which is a 323 # superset of SHIFT_JIS 324 325 define_alias( qr/shift.*jis$/i => '"cp932"' ); 326 define_alias( qr/sjis$/i => '"cp932"' ); 327 328If you want to zap all predefined aliases, you can use 329 330 Encode::Alias->undef_aliases; 331 332to do so. And 333 334 Encode::Alias->init_aliases; 335 336gets the factory settings back. 337 338=head1 SEE ALSO 339 340L<Encode>, L<Encode::Supported> 341 342=cut 343 344