1#!perl 2# 3# This auxiliary script makes five header files 4# used for building XSUB of Unicode::Normalize. 5# 6# Usage: 7# <do 'mkheader'> in perl, or <perl mkheader> in command line 8# 9# Input files: 10# unicore/CombiningClass.pl (or unicode/CombiningClass.pl) 11# unicore/Decomposition.pl (or unicode/Decomposition.pl) 12# unicore/CompositionExclusions.txt (or unicode/CompExcl.txt) 13# 14# Output files: 15# unfcan.h 16# unfcpt.h 17# unfcmb.h 18# unfcmp.h 19# unfexc.h 20# 21use 5.006; 22use strict; 23use warnings; 24use Carp; 25use File::Spec; 26 27BEGIN { 28 unless ("A" eq pack('U', 0x41)) { 29 die "Unicode::Normalize cannot stringify a Unicode code point\n"; 30 } 31} 32 33our $PACKAGE = 'Unicode::Normalize, mkheader'; 34 35our $Combin = do "unicore/CombiningClass.pl" 36 || do "unicode/CombiningClass.pl" 37 || croak "$PACKAGE: CombiningClass.pl not found"; 38 39our $Decomp = do "unicore/Decomposition.pl" 40 || do "unicode/Decomposition.pl" 41 || croak "$PACKAGE: Decomposition.pl not found"; 42 43our %Combin; # $codepoint => $number : combination class 44our %Canon; # $codepoint => \@codepoints : canonical decomp. 45our %Compat; # $codepoint => \@codepoints : compat. decomp. 46# after _U_stringify(), ($codepoint => $hexstring) for %Canon and %Compat 47our %Exclus; # $codepoint => 1 : composition exclusions 48our %Single; # $codepoint => 1 : singletons 49our %NonStD; # $codepoint => 1 : non-starter decompositions 50 51our %Comp1st; # $codepoint => $listname : may be composed with a next char. 52our %Comp2nd; # $codepoint => 1 : may be composed with a prev char. 53our %CompList; # $listname,$2nd => $codepoint : composite 54 55our $prefix = "UNF_"; 56our $structname = "${prefix}complist"; 57 58########## definition of Hangul constants ########## 59use constant SBase => 0xAC00; 60use constant SFinal => 0xD7A3; # SBase -1 + SCount 61use constant SCount => 11172; # LCount * NCount 62use constant NCount => 588; # VCount * TCount 63use constant LBase => 0x1100; 64use constant LFinal => 0x1112; 65use constant LCount => 19; 66use constant VBase => 0x1161; 67use constant VFinal => 0x1175; 68use constant VCount => 21; 69use constant TBase => 0x11A7; 70use constant TFinal => 0x11C2; 71use constant TCount => 28; 72 73sub decomposeHangul { 74 my $SIndex = $_[0] - SBase; 75 my $LIndex = int( $SIndex / NCount); 76 my $VIndex = int(($SIndex % NCount) / TCount); 77 my $TIndex = $SIndex % TCount; 78 my @ret = ( 79 LBase + $LIndex, 80 VBase + $VIndex, 81 $TIndex ? (TBase + $TIndex) : (), 82 ); 83 wantarray ? @ret : pack('U*', @ret); 84 # any element in @ret greater than 0xFF, so no need of u2n conversion. 85} 86 87########## getting full decomposion ########## 88{ 89 my($f, $fh); 90 foreach my $d (@INC) { 91 $f = File::Spec->catfile($d, "unicore", "CompositionExclusions.txt"); 92 last if open($fh, $f); 93 $f = File::Spec->catfile($d, "unicode", "CompExcl.txt"); 94 last if open($fh, $f); 95 $f = undef; 96 } 97 croak "$PACKAGE: neither unicore/CompositionExclusions.txt " 98 . "nor unicode/CompExcl.txt is found in @INC" unless defined $f; 99 100 while (<$fh>) { 101 next if /^#/ or /^$/; 102 s/#.*//; 103 $Exclus{ hex($1) } = 1 if /([0-9A-Fa-f]+)/; 104 } 105 close $fh; 106} 107 108## 109## converts string "hhhh hhhh hhhh" to a numeric list 110## 111sub _getHexArray { map hex, $_[0] =~ /([0-9A-Fa-f]+)/g } 112 113while ($Combin =~ /(.+)/g) { 114 my @tab = split /\t/, $1; 115 my $ini = hex $tab[0]; 116 if ($tab[1] eq '') { 117 $Combin{ $ini } = $tab[2]; 118 } else { 119 $Combin{ $_ } = $tab[2] foreach $ini .. hex($tab[1]); 120 } 121} 122 123while ($Decomp =~ /(.+)/g) { 124 my @tab = split /\t/, $1; 125 my $compat = $tab[2] =~ s/<[^>]+>//; 126 my $dec = [ _getHexArray($tab[2]) ]; # decomposition 127 my $ini = hex($tab[0]); # initial decomposable character 128 129 my $listname = 130 @$dec == 2 ? sprintf("${structname}_%06x", $dec->[0]) : 'USELESS'; 131 # %04x is bad since it'd place _3046 after _1d157. 132 133 if ($tab[1] eq '') { 134 $Compat{ $ini } = $dec; 135 136 if (! $compat) { 137 $Canon{ $ini } = $dec; 138 139 if (@$dec == 2) { 140 if ($Combin{ $dec->[0] }) { 141 $NonStD{ $ini } = 1; 142 } else { 143 $CompList{ $listname }{ $dec->[1] } = $ini; 144 $Comp1st{ $dec->[0] } = $listname; 145 $Comp2nd{ $dec->[1] } = 1 if ! $Exclus{$ini}; 146 } 147 } elsif (@$dec == 1) { 148 $Single{ $ini } = 1; 149 } else { 150 croak("Weird Canonical Decomposition of U+$tab[0]"); 151 } 152 } 153 } else { 154 foreach my $u ($ini .. hex($tab[1])) { 155 $Compat{ $u } = $dec; 156 157 if (! $compat) { 158 $Canon{ $u } = $dec; 159 160 if (@$dec == 2) { 161 if ($Combin{ $dec->[0] }) { 162 $NonStD{ $u } = 1; 163 } else { 164 $CompList{ $listname }{ $dec->[1] } = $u; 165 $Comp1st{ $dec->[0] } = $listname; 166 $Comp2nd{ $dec->[1] } = 1 if ! $Exclus{$u}; 167 } 168 } elsif (@$dec == 1) { 169 $Single{ $u } = 1; 170 } else { 171 croak("Weird Canonical Decomposition of U+$tab[0]"); 172 } 173 } 174 } 175 } 176} 177 178# modern HANGUL JUNGSEONG and HANGUL JONGSEONG jamo 179foreach my $j (0x1161..0x1175, 0x11A8..0x11C2) { 180 $Comp2nd{$j} = 1; 181} 182 183sub getCanonList { 184 my @src = @_; 185 my @dec = map { 186 (SBase <= $_ && $_ <= SFinal) ? decomposeHangul($_) 187 : $Canon{$_} ? @{ $Canon{$_} } : $_ 188 } @src; 189 return join(" ",@src) eq join(" ",@dec) ? @dec : getCanonList(@dec); 190 # condition @src == @dec is not ok. 191} 192 193sub getCompatList { 194 my @src = @_; 195 my @dec = map { 196 (SBase <= $_ && $_ <= SFinal) ? decomposeHangul($_) 197 : $Compat{$_} ? @{ $Compat{$_} } : $_ 198 } @src; 199 return join(" ",@src) eq join(" ",@dec) ? @dec : getCompatList(@dec); 200 # condition @src == @dec is not ok. 201} 202 203# exhaustive decomposition 204foreach my $key (keys %Canon) { 205 $Canon{$key} = [ getCanonList($key) ]; 206} 207 208# exhaustive decomposition 209foreach my $key (keys %Compat) { 210 $Compat{$key} = [ getCompatList($key) ]; 211} 212 213sub _pack_U { 214 return pack('U*', @_); 215} 216 217sub _U_stringify { 218 sprintf '"%s"', join '', 219 map sprintf("\\x%02x", $_), unpack 'C*', _pack_U(@_); 220} 221 222foreach my $hash (\%Canon, \%Compat) { 223 foreach my $key (keys %$hash) { 224 $hash->{$key} = _U_stringify( @{ $hash->{$key} } ); 225 } 226} 227 228########## writing header files ########## 229 230my @boolfunc = ( 231 { 232 name => "Exclusion", 233 type => "bool", 234 hash => \%Exclus, 235 }, 236 { 237 name => "Singleton", 238 type => "bool", 239 hash => \%Single, 240 }, 241 { 242 name => "NonStDecomp", 243 type => "bool", 244 hash => \%NonStD, 245 }, 246 { 247 name => "Comp2nd", 248 type => "bool", 249 hash => \%Comp2nd, 250 }, 251); 252 253my $file = "unfexc.h"; 254open FH, ">$file" or croak "$PACKAGE: $file can't be made"; 255binmode FH; select FH; 256 257 print << 'EOF'; 258/* 259 * This file is auto-generated by mkheader. 260 * Any changes here will be lost! 261 */ 262EOF 263 264foreach my $tbl (@boolfunc) { 265 my @temp = sort {$a <=> $b} keys %{$tbl->{hash}}; 266 my $type = $tbl->{type}; 267 my $name = $tbl->{name}; 268 print "$type is$name (UV uv)\n{\nreturn\n\t"; 269 270 while (@temp) { 271 my $cur = shift @temp; 272 if (@temp && $cur + 1 == $temp[0]) { 273 print "($cur <= uv && uv <= "; 274 while (@temp && $cur + 1 == $temp[0]) { 275 $cur = shift @temp; 276 } 277 print "$cur)"; 278 print "\n\t|| " if @temp; 279 } else { 280 print "uv == $cur"; 281 print "\n\t|| " if @temp; 282 } 283 } 284 print "\n\t? TRUE : FALSE;\n}\n\n"; 285} 286 287close FH; 288 289#################################### 290 291my $compinit = 292 "typedef struct { UV nextchar; UV composite; } $structname;\n\n"; 293 294foreach my $i (sort keys %CompList) { 295 $compinit .= "$structname $i [] = {\n"; 296 $compinit .= join ",\n", 297 map sprintf("\t{ %d, %d }", $_, $CompList{$i}{$_}), 298 sort {$a <=> $b } keys %{ $CompList{$i} }; 299 $compinit .= ",\n{0,0}\n};\n\n"; # with sentinel 300} 301 302my @tripletable = ( 303 { 304 file => "unfcmb", 305 name => "combin", 306 type => "STDCHAR", 307 hash => \%Combin, 308 null => 0, 309 }, 310 { 311 file => "unfcan", 312 name => "canon", 313 type => "char*", 314 hash => \%Canon, 315 null => "NULL", 316 }, 317 { 318 file => "unfcpt", 319 name => "compat", 320 type => "char*", 321 hash => \%Compat, 322 null => "NULL", 323 }, 324 { 325 file => "unfcmp", 326 name => "compos", 327 type => "$structname *", 328 hash => \%Comp1st, 329 null => "NULL", 330 init => $compinit, 331 }, 332); 333 334foreach my $tbl (@tripletable) { 335 my $file = "$tbl->{file}.h"; 336 my $head = "${prefix}$tbl->{name}"; 337 my $type = $tbl->{type}; 338 my $hash = $tbl->{hash}; 339 my $null = $tbl->{null}; 340 my $init = $tbl->{init}; 341 342 open FH, ">$file" or croak "$PACKAGE: $file can't be made"; 343 binmode FH; select FH; 344 my %val; 345 346 print FH << 'EOF'; 347/* 348 * This file is auto-generated by mkheader. 349 * Any changes here will be lost! 350 */ 351EOF 352 353 print $init if defined $init; 354 355 foreach my $uv (keys %$hash) { 356 croak sprintf("a Unicode code point 0x%04X over 0x10FFFF.", $uv) 357 unless $uv <= 0x10FFFF; 358 my @c = unpack 'CCCC', pack 'N', $uv; 359 $val{ $c[1] }{ $c[2] }{ $c[3] } = $hash->{$uv}; 360 } 361 362 foreach my $p (sort { $a <=> $b } keys %val) { 363 next if ! $val{ $p }; 364 for (my $r = 0; $r < 256; $r++) { 365 next if ! $val{ $p }{ $r }; 366 printf "$type ${head}_%02x_%02x [256] = {\n", $p, $r; 367 for (my $c = 0; $c < 256; $c++) { 368 print "\t", defined $val{$p}{$r}{$c} 369 ? "($type)".$val{$p}{$r}{$c} 370 : $null; 371 print ',' if $c != 255; 372 print "\n" if $c % 8 == 7; 373 } 374 print "};\n\n"; 375 } 376 } 377 foreach my $p (sort { $a <=> $b } keys %val) { 378 next if ! $val{ $p }; 379 printf "$type* ${head}_%02x [256] = {\n", $p; 380 for (my $r = 0; $r < 256; $r++) { 381 print $val{ $p }{ $r } 382 ? sprintf("${head}_%02x_%02x", $p, $r) 383 : "NULL"; 384 print ',' if $r != 255; 385 print "\n" if $val{ $p }{ $r } || ($r+1) % 8 == 0; 386 } 387 print "};\n\n"; 388 } 389 print "$type** $head [] = {\n"; 390 for (my $p = 0; $p <= 0x10; $p++) { 391 print $val{ $p } ? sprintf("${head}_%02x", $p) : "NULL"; 392 print ',' if $p != 0x10; 393 print "\n"; 394 } 395 print "};\n\n"; 396 close FH; 397} 398 3991; 400__END__ 401