1package Encode::Alias;
2use strict;
3no warnings 'redefine';
4use Encode;
5our $VERSION = do { my @r = (q$Revision: 2.4 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
6sub DEBUG () { 0 }
7
8use base qw(Exporter);
9
10# Public, encouraged API is exported by default
11
12our @EXPORT =
13    qw (
14	define_alias
15	find_alias
16	);
17
18our @Alias;  # ordered matching list
19our %Alias;  # cached known aliases
20
21sub find_alias{
22    my $class = shift;
23    my $find = shift;
24    unless (exists $Alias{$find}) {
25        $Alias{$find} = undef; # Recursion guard
26	for (my $i=0; $i < @Alias; $i += 2){
27	    my $alias = $Alias[$i];
28	    my $val   = $Alias[$i+1];
29	    my $new;
30	    if (ref($alias) eq 'Regexp' && $find =~ $alias){
31		DEBUG and warn "eval $val";
32		$new = eval $val;
33		DEBUG and $@ and warn "$val, $@";
34	    }elsif (ref($alias) eq 'CODE'){
35		DEBUG and warn "$alias", "->", "($find)";
36		$new = $alias->($find);
37	    }elsif (lc($find) eq lc($alias)){
38		$new = $val;
39	    }
40	    if (defined($new)){
41		next if $new eq $find; # avoid (direct) recursion on bugs
42		DEBUG and warn "$alias, $new";
43		my $enc = (ref($new)) ? $new : Encode::find_encoding($new);
44		if ($enc){
45		    $Alias{$find} = $enc;
46		    last;
47		}
48	    }
49	}
50	# case insensitive search when canonical is not in all lowercase
51	# RT ticket #7835
52	unless ($Alias{$find}){
53	    my $lcfind = lc($find);
54	    for my $name (keys %Encode::Encoding, keys %Encode::ExtModule){
55		$lcfind eq lc($name) or next;
56		$Alias{$find} =  Encode::find_encoding($name);
57		DEBUG and warn "$find => $name";
58	    }
59	}
60    }
61    if (DEBUG){
62	my $name;
63	if (my $e = $Alias{$find}){
64	    $name = $e->name;
65	}else{
66	    $name = "";
67	}
68	warn "find_alias($class, $find)->name = $name";
69    }
70    return $Alias{$find};
71}
72
73sub define_alias{
74    while (@_){
75	my ($alias,$name) = splice(@_,0,2);
76	unshift(@Alias, $alias => $name);   # newer one has precedence
77	if (ref($alias)){
78	    # clear %Alias cache to allow overrides
79	    my @a = keys %Alias;
80	    for my $k (@a){
81		if (ref($alias) eq 'Regexp' && $k =~ $alias){
82		    DEBUG and warn "delete \$Alias\{$k\}";
83		    delete $Alias{$k};
84		}
85		elsif (ref($alias) eq 'CODE'){
86		    DEBUG and warn "delete \$Alias\{$k\}";
87		    delete $Alias{$alias->($name)};
88		}
89	    }
90	}else{
91	    DEBUG and warn "delete \$Alias\{$alias\}";
92	    delete $Alias{$alias};
93	}
94    }
95}
96
97# Allow latin-1 style names as well
98# 0  1  2  3  4  5   6   7   8   9  10
99our @Latin2iso = ( 0, 1, 2, 3, 4, 9, 10, 13, 14, 15, 16 );
100# Allow winlatin1 style names as well
101our %Winlatin2cp   = (
102		      'latin1'     => 1252,
103		      'latin2'     => 1250,
104		      'cyrillic'   => 1251,
105		      'greek'      => 1253,
106		      'turkish'    => 1254,
107		      'hebrew'     => 1255,
108		      'arabic'     => 1256,
109		      'baltic'     => 1257,
110		      'vietnamese' => 1258,
111		     );
112
113init_aliases();
114
115sub undef_aliases{
116    @Alias = ();
117    %Alias = ();
118}
119
120sub init_aliases
121{
122    undef_aliases();
123    # Try all-lower-case version should all else fails
124    define_alias( qr/^(.*)$/ => '"\L$1"' );
125
126    # UTF/UCS stuff
127    define_alias( qr/^UTF-?7$/i           => '"UTF-7"');
128    define_alias( qr/^UCS-?2-?LE$/i       => '"UCS-2LE"' );
129    define_alias( qr/^UCS-?2-?(BE)?$/i    => '"UCS-2BE"',
130                  qr/^UCS-?4-?(BE|LE)?$/i => 'uc("UTF-32$1")',
131		  qr/^iso-10646-1$/i      => '"UCS-2BE"' );
132    define_alias( qr/^UTF-?(16|32)-?BE$/i   => '"UTF-$1BE"',
133		  qr/^UTF-?(16|32)-?LE$/i   => '"UTF-$1LE"',
134		  qr/^UTF-?(16|32)$/i       => '"UTF-$1"',
135		);
136    # ASCII
137    define_alias(qr/^(?:US-?)ascii$/i => '"ascii"');
138    define_alias('C' => 'ascii');
139    define_alias(qr/\bISO[-_]?646[-_]?US$/i => '"ascii"');
140    define_alias('646' => 'ascii');
141
142    # Allow variants of iso-8859-1 etc.
143    define_alias( qr/\biso[-_]?(\d+)[-_](\d+)$/i => '"iso-$1-$2"' );
144
145    # At least HP-UX has these.
146    define_alias( qr/\biso8859(\d+)$/i => '"iso-8859-$1"' );
147
148    # More HP stuff.
149    define_alias( qr/\b(?:hp-)?(arabic|greek|hebrew|kana|roman|thai|turkish)8$/i => '"${1}8"' );
150
151    # The Official name of ASCII.
152    define_alias( qr/\bANSI[-_]?X3\.4[-_]?1968$/i => '"ascii"' );
153
154    # This is a font issue, not an encoding issue.
155    # (The currency symbol of the Latin 1 upper half
156    #  has been redefined as the euro symbol.)
157    define_alias( qr/^(.+)\@euro$/i => '"$1"' );
158
159    define_alias( qr/\b(?:iso[-_]?)?latin[-_]?(\d+)$/i
160		  => 'defined $Encode::Alias::Latin2iso[$1] ? "iso-8859-$Encode::Alias::Latin2iso[$1]" : undef' );
161
162    define_alias( qr/\bwin(latin[12]|cyrillic|baltic|greek|turkish|
163			 hebrew|arabic|baltic|vietnamese)$/ix =>
164		  '"cp" . $Encode::Alias::Winlatin2cp{lc($1)}' );
165
166    # Common names for non-latin preferred MIME names
167    define_alias( 'ascii'    => 'US-ascii',
168		  'cyrillic' => 'iso-8859-5',
169		  'arabic'   => 'iso-8859-6',
170		  'greek'    => 'iso-8859-7',
171		  'hebrew'   => 'iso-8859-8',
172		  'thai'     => 'iso-8859-11',
173		  'tis620'   => 'iso-8859-11',
174		  );
175
176    # At least AIX has IBM-NNN (surprisingly...) instead of cpNNN.
177    # And Microsoft has their own naming (again, surprisingly).
178    # And windows-* is registered in IANA!
179    define_alias( qr/\b(?:cp|ibm|ms|windows)[-_ ]?(\d{2,4})$/i => '"cp$1"');
180
181    # Sometimes seen with a leading zero.
182    # define_alias( qr/\bcp037\b/i => '"cp37"');
183
184    # Mac Mappings
185    # predefined in *.ucm; unneeded
186    # define_alias( qr/\bmacIcelandic$/i => '"macIceland"');
187    define_alias( qr/^mac_(.*)$/i => '"mac$1"');
188    # Ououououou. gone.  They are differente!
189    # define_alias( qr/\bmacRomanian$/i => '"macRumanian"');
190
191    # Standardize on the dashed versions.
192    define_alias( qr/\bkoi8[\s\-_]*([ru])$/i => '"koi8-$1"' );
193
194    unless ($Encode::ON_EBCDIC){
195        # for Encode::CN
196	define_alias( qr/\beuc.*cn$/i        => '"euc-cn"' );
197	define_alias( qr/\bcn.*euc$/i        => '"euc-cn"' );
198	# define_alias( qr/\bGB[- ]?(\d+)$/i => '"euc-cn"' )
199	# CP936 doesn't have vendor-addon for GBK, so they're identical.
200	define_alias( qr/^gbk$/i => '"cp936"');
201	# This fixes gb2312 vs. euc-cn confusion, practically
202	define_alias( qr/\bGB[-_ ]?2312(?!-?raw)/i => '"euc-cn"' );
203	# for Encode::JP
204	define_alias( qr/\bjis$/i            => '"7bit-jis"' );
205	define_alias( qr/\beuc.*jp$/i        => '"euc-jp"' );
206	define_alias( qr/\bjp.*euc$/i        => '"euc-jp"' );
207	define_alias( qr/\bujis$/i           => '"euc-jp"' );
208	define_alias( qr/\bshift.*jis$/i     => '"shiftjis"' );
209	define_alias( qr/\bsjis$/i           => '"shiftjis"' );
210	define_alias( qr/\bwindows-31j$/i    => '"cp932"' );
211        # for Encode::KR
212	define_alias( qr/\beuc.*kr$/i        => '"euc-kr"' );
213	define_alias( qr/\bkr.*euc$/i        => '"euc-kr"' );
214	# This fixes ksc5601 vs. euc-kr confusion, practically
215        define_alias( qr/(?:x-)?uhc$/i            => '"cp949"' );
216        define_alias( qr/(?:x-)?windows-949$/i    => '"cp949"' );
217        define_alias( qr/\bks_c_5601-1987$/i      => '"cp949"' );
218        # for Encode::TW
219	define_alias( qr/\bbig-?5$/i		  => '"big5-eten"' );
220	define_alias( qr/\bbig5-?et(?:en)?$/i	  => '"big5-eten"' );
221	define_alias( qr/\btca[-_]?big5$/i	  => '"big5-eten"' );
222	define_alias( qr/\bbig5-?hk(?:scs)?$/i	  => '"big5-hkscs"' );
223	define_alias( qr/\bhk(?:scs)?[-_]?big5$/i  => '"big5-hkscs"' );
224    }
225    # utf8 is blessed :)
226    define_alias( qr/^UTF-8$/i => '"utf-8-strict"');
227    # At last, Map white space and _ to '-'
228    define_alias( qr/^(\S+)[\s_]+(.*)$/i => '"$1-$2"' );
229}
230
2311;
232__END__
233
234# TODO: HP-UX '8' encodings arabic8 greek8 hebrew8 kana8 thai8 turkish8
235# TODO: HP-UX '15' encodings japanese15 korean15 roi15
236# TODO: Cyrillic encoding ISO-IR-111 (useful?)
237# TODO: Armenian encoding ARMSCII-8
238# TODO: Hebrew encoding ISO-8859-8-1
239# TODO: Thai encoding TCVN
240# TODO: Vietnamese encodings VPS
241# TODO: Mac Asian+African encodings: Arabic Armenian Bengali Burmese
242#       ChineseSimp ChineseTrad Devanagari Ethiopic ExtArabic
243#       Farsi Georgian Gujarati Gurmukhi Hebrew Japanese
244#       Kannada Khmer Korean Laotian Malayalam Mongolian
245#       Oriya Sinhalese Symbol Tamil Telugu Tibetan Vietnamese
246
247=head1 NAME
248
249Encode::Alias - alias definitions to encodings
250
251=head1 SYNOPSIS
252
253  use Encode;
254  use Encode::Alias;
255  define_alias( newName => ENCODING);
256
257=head1 DESCRIPTION
258
259Allows newName to be used as an alias for ENCODING. ENCODING may be
260either the name of an encoding or an encoding object (as described
261in L<Encode>).
262
263Currently I<newName> can be specified in the following ways:
264
265=over 4
266
267=item As a simple string.
268
269=item As a qr// compiled regular expression, e.g.:
270
271  define_alias( qr/^iso8859-(\d+)$/i => '"iso-8859-$1"' );
272
273In this case, if I<ENCODING> is not a reference, it is C<eval>-ed
274in order to allow C<$1> etc. to be substituted.  The example is one
275way to alias names as used in X11 fonts to the MIME names for the
276iso-8859-* family.  Note the double quotes inside the single quotes.
277
278(or, you don't have to do this yourself because this example is predefined)
279
280If you are using a regex here, you have to use the quotes as shown or
281it won't work.  Also note that regex handling is tricky even for the
282experienced.  Use this feature with caution.
283
284=item As a code reference, e.g.:
285
286  define_alias( sub {shift =~ /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } );
287
288The same effect as the example above in a different way.  The coderef
289takes the alias name as an argument and returns a canonical name on
290success or undef if not.  Note the second argument is not required.
291Use this with even more caution than the regex version.
292
293=back
294
295=head3 Changes in code reference aliasing
296
297As of Encode 1.87, the older form
298
299  define_alias( sub { return  /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } );
300
301no longer works.
302
303Encode up to 1.86 internally used "local $_" to implement ths older
304form.  But consider the code below;
305
306  use Encode;
307  $_ = "eeeee" ;
308  while (/(e)/g) {
309    my $utf = decode('aliased-encoding-name', $1);
310    print "position:",pos,"\n";
311  }
312
313Prior to Encode 1.86 this fails because of "local $_".
314
315=head2 Alias overloading
316
317You can override predefined aliases by simply applying define_alias().
318The new alias is always evaluated first, and when necessary,
319define_alias() flushes the internal cache to make the new definition
320available.
321
322  # redirect SHIFT_JIS to MS/IBM Code Page 932, which is a
323  # superset of SHIFT_JIS
324
325  define_alias( qr/shift.*jis$/i  => '"cp932"' );
326  define_alias( qr/sjis$/i        => '"cp932"' );
327
328If you want to zap all predefined aliases, you can use
329
330  Encode::Alias->undef_aliases;
331
332to do so.  And
333
334  Encode::Alias->init_aliases;
335
336gets the factory settings back.
337
338=head1 SEE ALSO
339
340L<Encode>, L<Encode::Supported>
341
342=cut
343
344