1package utf8; 2use strict; 3use warnings; 4 5sub DEBUG () { 0 } 6 7sub DESTROY {} 8 9my %Cache; 10 11our (%PropertyAlias, %PA_reverse, %PropValueAlias, %PVA_reverse, %PVA_abbr_map); 12 13sub croak { require Carp; Carp::croak(@_) } 14 15## 16## "SWASH" == "SWATCH HASH". A "swatch" is a swatch of the Unicode landscape. 17## It's a data structure that encodes a set of Unicode characters. 18## 19 20sub SWASHNEW { 21 my ($class, $type, $list, $minbits, $none) = @_; 22 local $^D = 0 if $^D; 23 24 print STDERR "SWASHNEW @_\n" if DEBUG; 25 26 ## 27 ## Get the list of codepoints for the type. 28 ## Called from utf8.c 29 ## 30 ## Given a $type, our goal is to fill $list with the set of codepoint 31 ## ranges. 32 ## 33 ## To make the parsing of $type clear, this code takes the a rather 34 ## unorthodox approach of last'ing out of the block once we have the 35 ## info we need. Were this to be a subroutine, the 'last' would just 36 ## be a 'return'. 37 ## 38 my $file; ## file to load data from, and also part of the %Cache key. 39 my $ListSorted = 0; 40 41 if ($type) 42 { 43 $type =~ s/^\s+//; 44 $type =~ s/\s+$//; 45 46 print "type = $type\n" if DEBUG; 47 48 GETFILE: 49 { 50 ## 51 ## It could be a user-defined property. 52 ## 53 54 my $caller1 = $type =~ s/(.+)::// ? $1 : caller(1); 55 56 if (defined $caller1 && $type =~ /^(?:\w+)$/) { 57 my $prop = "${caller1}::$type"; 58 if (exists &{$prop}) { 59 no strict 'refs'; 60 61 $list = &{$prop}; 62 last GETFILE; 63 } 64 } 65 66 my $wasIs; 67 68 ($wasIs = $type =~ s/^Is(?:\s+|[-_])?//i) 69 or 70 $type =~ s/^(?:(?:General(?:\s+|_)?)?Category|gc)\s*[:=]\s*//i 71 or 72 $type =~ s/^(?:Script|sc)\s*[:=]\s*//i 73 or 74 $type =~ s/^Block\s*[:=]\s*/In/i; 75 76 77 ## 78 ## See if it's in some enumeration. 79 ## 80 require "unicore/PVA.pl"; 81 if ($type =~ /^([\w\s]+)[:=]\s*(.*)/) { 82 my ($enum, $val) = (lc $1, lc $2); 83 $enum =~ tr/ _-//d; 84 $val =~ tr/ _-//d; 85 86 my $pa = $PropertyAlias{$enum} ? $enum : $PA_reverse{$enum}; 87 my $f = $PropValueAlias{$pa}{$val} ? $val : $PVA_reverse{$pa}{lc $val}; 88 89 if ($pa and $f) { 90 $pa = "gc_sc" if $pa eq "gc" or $pa eq "sc"; 91 $file = "unicore/lib/$pa/$PVA_abbr_map{$pa}{lc $f}.pl"; 92 last GETFILE; 93 } 94 } 95 else { 96 my $t = lc $type; 97 $t =~ tr/ _-//d; 98 99 if ($PropValueAlias{gc}{$t} or $PropValueAlias{sc}{$t}) { 100 $file = "unicore/lib/gc_sc/$PVA_abbr_map{gc_sc}{$t}.pl"; 101 last GETFILE; 102 } 103 } 104 105 ## 106 ## See if it's in the direct mapping table. 107 ## 108 require "unicore/Exact.pl"; 109 if (my $base = $utf8::Exact{$type}) { 110 $file = "unicore/lib/gc_sc/$base.pl"; 111 last GETFILE; 112 } 113 114 ## 115 ## If not there exactly, try the canonical form. The canonical 116 ## form is lowercased, with any separators (\s+|[-_]) removed. 117 ## 118 my $canonical = lc $type; 119 $canonical =~ s/(?<=[a-z\d])(?:\s+|[-_])(?=[a-z\d])//g; 120 print "canonical = $canonical\n" if DEBUG; 121 122 require "unicore/Canonical.pl"; 123 if (my $base = ($utf8::Canonical{$canonical} || $utf8::Canonical{ lc $utf8::PropertyAlias{$canonical} })) { 124 $file = "unicore/lib/gc_sc/$base.pl"; 125 last GETFILE; 126 } 127 128 ## 129 ## See if it's a user-level "To". 130 ## 131 132 my $caller0 = caller(0); 133 134 if (defined $caller0 && $type =~ /^To(?:\w+)$/) { 135 my $map = $caller0 . "::" . $type; 136 137 if (exists &{$map}) { 138 no strict 'refs'; 139 140 $list = &{$map}; 141 last GETFILE; 142 } 143 } 144 145 ## 146 ## Last attempt -- see if it's a standard "To" name 147 ## (e.g. "ToLower") ToTitle is used by ucfirst(). 148 ## The user-level way to access ToDigit() and ToFold() 149 ## is to use Unicode::UCD. 150 ## 151 if ($type =~ /^To(Digit|Fold|Lower|Title|Upper)$/) 152 { 153 $file = "unicore/To/$1.pl"; 154 ## would like to test to see if $file actually exists.... 155 last GETFILE; 156 } 157 158 ## 159 ## If we reach this line, it's because we couldn't figure 160 ## out what to do with $type. Ouch. 161 ## 162 163 return $type; 164 } 165 166 if (defined $file) { 167 print "found it (file='$file')\n" if DEBUG; 168 169 ## 170 ## If we reach here, it was due to a 'last GETFILE' above 171 ## (exception: user-defined properties and mappings), so we 172 ## have a filename, so now we load it if we haven't already. 173 ## If we have, return the cached results. The cache key is the 174 ## file to load. 175 ## 176 if ($Cache{$file} and ref($Cache{$file}) eq $class) 177 { 178 print "Returning cached '$file' for \\p{$type}\n" if DEBUG; 179 return $Cache{$class, $file}; 180 } 181 182 $list = do $file; 183 } 184 185 $ListSorted = 1; ## we know that these lists are sorted 186 } 187 188 my $extras; 189 my $bits = 0; 190 191 my $ORIG = $list; 192 if ($list) { 193 my @tmp = split(/^/m, $list); 194 my %seen; 195 no warnings; 196 $extras = join '', grep /^[^0-9a-fA-F]/, @tmp; 197 $list = join '', 198 map { $_->[1] } 199 sort { $a->[0] <=> $b->[0] } 200 map { /^([0-9a-fA-F]+)/; [ hex($1), $_ ] } 201 grep { /^([0-9a-fA-F]+)/ and not $seen{$1}++ } @tmp; # XXX doesn't do ranges right 202 } 203 204 if ($none) { 205 my $hextra = sprintf "%04x", $none + 1; 206 $list =~ s/\tXXXX$/\t$hextra/mg; 207 } 208 209 if ($minbits < 32) { 210 my $top = 0; 211 while ($list =~ /^([0-9a-fA-F]+)(?:[\t]([0-9a-fA-F]+)?)(?:[ \t]([0-9a-fA-F]+))?/mg) { 212 my $min = hex $1; 213 my $max = defined $2 ? hex $2 : $min; 214 my $val = defined $3 ? hex $3 : 0; 215 $val += $max - $min if defined $3; 216 $top = $val if $val > $top; 217 } 218 $bits = 219 $top > 0xffff ? 32 : 220 $top > 0xff ? 16 : 221 $top > 1 ? 8 : 1 222 } 223 $bits = $minbits if $bits < $minbits; 224 225 my @extras; 226 for my $x ($extras) { 227 pos $x = 0; 228 while ($x =~ /^([^0-9a-fA-F\n])(.*)/mg) { 229 my $char = $1; 230 my $name = $2; 231 print STDERR "$1 => $2\n" if DEBUG; 232 if ($char =~ /[-+!&]/) { 233 my ($c,$t) = split(/::/, $name, 2); # bogus use of ::, really 234 my $subobj; 235 if ($c eq 'utf8') { 236 $subobj = utf8->SWASHNEW($t, "", 0, 0, 0); 237 } 238 elsif (exists &$name) { 239 $subobj = utf8->SWASHNEW($name, "", 0, 0, 0); 240 } 241 elsif ($c =~ /^([0-9a-fA-F]+)/) { 242 $subobj = utf8->SWASHNEW("", $c, 0, 0, 0); 243 } 244 return $subobj unless ref $subobj; 245 push @extras, $name => $subobj; 246 $bits = $subobj->{BITS} if $bits < $subobj->{BITS}; 247 } 248 } 249 } 250 251 print STDERR "CLASS = $class, TYPE => $type, BITS => $bits, NONE => $none\nEXTRAS =>\n$extras\nLIST =>\n$list\n" if DEBUG; 252 253 my $SWASH = bless { 254 TYPE => $type, 255 BITS => $bits, 256 EXTRAS => $extras, 257 LIST => $list, 258 NONE => $none, 259 @extras, 260 } => $class; 261 262 if ($file) { 263 $Cache{$class, $file} = $SWASH; 264 } 265 266 return $SWASH; 267} 268 269# NOTE: utf8.c:swash_init() assumes entries are never modified once generated. 270 271sub SWASHGET { 272 # See utf8.c:Perl_swash_fetch for problems with this interface. 273 my ($self, $start, $len) = @_; 274 local $^D = 0 if $^D; 275 my $type = $self->{TYPE}; 276 my $bits = $self->{BITS}; 277 my $none = $self->{NONE}; 278 print STDERR "SWASHGET @_ [$type/$bits/$none]\n" if DEBUG; 279 my $end = $start + $len; 280 my $swatch = ""; 281 my $key; 282 vec($swatch, $len - 1, $bits) = 0; # Extend to correct length. 283 if ($none) { 284 for $key (0 .. $len - 1) { vec($swatch, $key, $bits) = $none } 285 } 286 287 for ($self->{LIST}) { 288 pos $_ = 0; 289 if ($bits > 1) { 290 LINE: 291 while (/^([0-9a-fA-F]+)(?:[ \t]([0-9a-fA-F]+)?)?(?:[ \t]([0-9a-fA-F]+))?/mg) { 292 chomp; 293 my ($a, $b, $c) = ($1, $2, $3); 294 croak "$type: illegal mapping '$_'" 295 if $type =~ /^To/ && 296 !(defined $a && defined $c); 297 my $min = hex $a; 298 my $max = defined $b ? hex $b : $min; 299 my $val = defined $c ? hex $c : 0; 300 next if $max < $start; 301 print "$min $max $val\n" if DEBUG; 302 if ($none) { 303 if ($min < $start) { 304 $val += $start - $min if $val < $none; 305 $min = $start; 306 } 307 for ($key = $min; $key <= $max; $key++) { 308 last LINE if $key >= $end; 309 print STDERR "$key => $val\n" if DEBUG; 310 vec($swatch, $key - $start, $bits) = $val; 311 ++$val if $val < $none; 312 } 313 } 314 else { 315 if ($min < $start) { 316 $val += $start - $min; 317 $min = $start; 318 } 319 for ($key = $min; $key <= $max; $key++, $val++) { 320 last LINE if $key >= $end; 321 print STDERR "$key => $val\n" if DEBUG; 322 vec($swatch, $key - $start, $bits) = $val; 323 } 324 } 325 } 326 } 327 else { 328 LINE: 329 while (/^([0-9a-fA-F]+)(?:[ \t]+([0-9a-fA-F]+))?/mg) { 330 chomp; 331 my $min = hex $1; 332 my $max = defined $2 ? hex $2 : $min; 333 next if $max < $start; 334 if ($min < $start) { 335 $min = $start; 336 } 337 for ($key = $min; $key <= $max; $key++) { 338 last LINE if $key >= $end; 339 print STDERR "$key => 1\n" if DEBUG; 340 vec($swatch, $key - $start, 1) = 1; 341 } 342 } 343 } 344 } 345 for my $x ($self->{EXTRAS}) { 346 pos $x = 0; 347 while ($x =~ /^([-+!&])(.*)/mg) { 348 my $char = $1; 349 my $name = $2; 350 print STDERR "INDIRECT $1 $2\n" if DEBUG; 351 my $otherbits = $self->{$name}->{BITS}; 352 croak("SWASHGET size mismatch") if $bits < $otherbits; 353 my $other = $self->{$name}->SWASHGET($start, $len); 354 if ($char eq '+') { 355 if ($bits == 1 and $otherbits == 1) { 356 $swatch |= $other; 357 } 358 else { 359 for ($key = 0; $key < $len; $key++) { 360 vec($swatch, $key, $bits) = vec($other, $key, $otherbits); 361 } 362 } 363 } 364 elsif ($char eq '!') { 365 if ($bits == 1 and $otherbits == 1) { 366 $swatch |= ~$other; 367 } 368 else { 369 for ($key = 0; $key < $len; $key++) { 370 if (!vec($other, $key, $otherbits)) { 371 vec($swatch, $key, $bits) = 1; 372 } 373 } 374 } 375 } 376 elsif ($char eq '-') { 377 if ($bits == 1 and $otherbits == 1) { 378 $swatch &= ~$other; 379 } 380 else { 381 for ($key = 0; $key < $len; $key++) { 382 if (vec($other, $key, $otherbits)) { 383 vec($swatch, $key, $bits) = 0; 384 } 385 } 386 } 387 } 388 elsif ($char eq '&') { 389 if ($bits == 1 and $otherbits == 1) { 390 $swatch &= $other; 391 } 392 else { 393 for ($key = 0; $key < $len; $key++) { 394 if (!vec($other, $key, $otherbits)) { 395 vec($swatch, $key, $bits) = 0; 396 } 397 } 398 } 399 } 400 } 401 } 402 if (DEBUG) { 403 print STDERR "CELLS "; 404 for ($key = 0; $key < $len; $key++) { 405 print STDERR vec($swatch, $key, $bits), " "; 406 } 407 print STDERR "\n"; 408 } 409 $swatch; 410} 411 4121; 413