1#!perl
2#
3# This auxiliary script makes five header files
4# used for building XSUB of Unicode::Normalize.
5#
6# Usage:
7#    <do 'mkheader'> in perl, or <perl mkheader> in command line
8#
9# Input files:
10#    unicore/CombiningClass.pl (or unicode/CombiningClass.pl)
11#    unicore/Decomposition.pl (or unicode/Decomposition.pl)
12#    unicore/CompositionExclusions.txt (or unicode/CompExcl.txt)
13#
14# Output files:
15#    unfcan.h
16#    unfcpt.h
17#    unfcmb.h
18#    unfcmp.h
19#    unfexc.h
20#
21use 5.006;
22use strict;
23use warnings;
24use Carp;
25use File::Spec;
26
27BEGIN {
28    unless ("A" eq pack('U', 0x41)) {
29	die "Unicode::Normalize cannot stringify a Unicode code point\n";
30    }
31}
32
33our $PACKAGE = 'Unicode::Normalize, mkheader';
34
35our $Combin = do "unicore/CombiningClass.pl"
36    || do "unicode/CombiningClass.pl"
37    || croak "$PACKAGE: CombiningClass.pl not found";
38
39our $Decomp = do "unicore/Decomposition.pl"
40    || do "unicode/Decomposition.pl"
41    || croak "$PACKAGE: Decomposition.pl not found";
42
43our %Combin;	# $codepoint => $number    : combination class
44our %Canon;	# $codepoint => \@codepoints : canonical decomp.
45our %Compat;	# $codepoint => \@codepoints : compat. decomp.
46# after _U_stringify(), ($codepoint => $hexstring) for %Canon and %Compat
47our %Exclus;	# $codepoint => 1          : composition exclusions
48our %Single;	# $codepoint => 1          : singletons
49our %NonStD;	# $codepoint => 1          : non-starter decompositions
50
51our %Comp1st;	# $codepoint => $listname  : may be composed with a next char.
52our %Comp2nd;	# $codepoint => 1          : may be composed with a prev char.
53our %CompList;	# $listname,$2nd  => $codepoint : composite
54
55our $prefix = "UNF_";
56our $structname = "${prefix}complist";
57
58########## definition of Hangul constants ##########
59use constant SBase  => 0xAC00;
60use constant SFinal => 0xD7A3; # SBase -1 + SCount
61use constant SCount =>  11172; # LCount * NCount
62use constant NCount =>    588; # VCount * TCount
63use constant LBase  => 0x1100;
64use constant LFinal => 0x1112;
65use constant LCount =>     19;
66use constant VBase  => 0x1161;
67use constant VFinal => 0x1175;
68use constant VCount =>     21;
69use constant TBase  => 0x11A7;
70use constant TFinal => 0x11C2;
71use constant TCount =>     28;
72
73sub decomposeHangul {
74    my $SIndex = $_[0] - SBase;
75    my $LIndex = int( $SIndex / NCount);
76    my $VIndex = int(($SIndex % NCount) / TCount);
77    my $TIndex =      $SIndex % TCount;
78    my @ret = (
79       LBase + $LIndex,
80       VBase + $VIndex,
81      $TIndex ? (TBase + $TIndex) : (),
82    );
83    wantarray ? @ret : pack('U*', @ret);
84     # any element in @ret greater than 0xFF, so no need of u2n conversion.
85}
86
87########## getting full decomposion ##########
88{
89    my($f, $fh);
90    foreach my $d (@INC) {
91	$f = File::Spec->catfile($d, "unicore", "CompositionExclusions.txt");
92	last if open($fh, $f);
93	$f = File::Spec->catfile($d, "unicode", "CompExcl.txt");
94	last if open($fh, $f);
95	$f = undef;
96    }
97    croak "$PACKAGE: neither unicore/CompositionExclusions.txt "
98	. "nor unicode/CompExcl.txt is found in @INC" unless defined $f;
99
100    while (<$fh>) {
101	next if /^#/ or /^$/;
102	s/#.*//;
103	$Exclus{ hex($1) } = 1 if /([0-9A-Fa-f]+)/;
104    }
105    close $fh;
106}
107
108##
109## converts string "hhhh hhhh hhhh" to a numeric list
110##
111sub _getHexArray { map hex, $_[0] =~ /([0-9A-Fa-f]+)/g }
112
113while ($Combin =~ /(.+)/g) {
114    my @tab = split /\t/, $1;
115    my $ini = hex $tab[0];
116    if ($tab[1] eq '') {
117	$Combin{ $ini } = $tab[2];
118    } else {
119	$Combin{ $_ } = $tab[2] foreach $ini .. hex($tab[1]);
120    }
121}
122
123while ($Decomp =~ /(.+)/g) {
124    my @tab = split /\t/, $1;
125    my $compat = $tab[2] =~ s/<[^>]+>//;
126    my $dec = [ _getHexArray($tab[2]) ]; # decomposition
127    my $ini = hex($tab[0]); # initial decomposable character
128
129    my $listname =
130	@$dec == 2 ? sprintf("${structname}_%06x", $dec->[0]) : 'USELESS';
131		# %04x is bad since it'd place _3046 after _1d157.
132
133    if ($tab[1] eq '') {
134	$Compat{ $ini } = $dec;
135
136	if (! $compat) {
137	    $Canon{ $ini } = $dec;
138
139	    if (@$dec == 2) {
140		if ($Combin{ $dec->[0] }) {
141		    $NonStD{ $ini } = 1;
142		} else {
143		    $CompList{ $listname }{ $dec->[1] } = $ini;
144		    $Comp1st{ $dec->[0] } = $listname;
145		    $Comp2nd{ $dec->[1] } = 1 if ! $Exclus{$ini};
146		}
147	    } elsif (@$dec == 1) {
148		$Single{ $ini } = 1;
149	    } else {
150		croak("Weird Canonical Decomposition of U+$tab[0]");
151	    }
152	}
153    } else {
154	foreach my $u ($ini .. hex($tab[1])) {
155	    $Compat{ $u } = $dec;
156
157	    if (! $compat) {
158		$Canon{ $u } = $dec;
159
160		if (@$dec == 2) {
161		    if ($Combin{ $dec->[0] }) {
162			$NonStD{ $u } = 1;
163		    } else {
164			$CompList{ $listname }{ $dec->[1] } = $u;
165			$Comp1st{ $dec->[0] } = $listname;
166			$Comp2nd{ $dec->[1] } = 1 if ! $Exclus{$u};
167		    }
168		} elsif (@$dec == 1) {
169		    $Single{ $u } = 1;
170		} else {
171		    croak("Weird Canonical Decomposition of U+$tab[0]");
172		}
173	    }
174	}
175    }
176}
177
178# modern HANGUL JUNGSEONG and HANGUL JONGSEONG jamo
179foreach my $j (0x1161..0x1175, 0x11A8..0x11C2) {
180    $Comp2nd{$j} = 1;
181}
182
183sub getCanonList {
184    my @src = @_;
185    my @dec = map {
186	(SBase <= $_ && $_ <= SFinal) ? decomposeHangul($_)
187	    : $Canon{$_} ? @{ $Canon{$_} } : $_
188		} @src;
189    return join(" ",@src) eq join(" ",@dec) ? @dec : getCanonList(@dec);
190    # condition @src == @dec is not ok.
191}
192
193sub getCompatList {
194    my @src = @_;
195    my @dec = map {
196	(SBase <= $_ && $_ <= SFinal) ? decomposeHangul($_)
197	    : $Compat{$_} ? @{ $Compat{$_} } : $_
198		} @src;
199    return join(" ",@src) eq join(" ",@dec) ? @dec : getCompatList(@dec);
200    # condition @src == @dec is not ok.
201}
202
203# exhaustive decomposition
204foreach my $key (keys %Canon) {
205    $Canon{$key}  = [ getCanonList($key) ];
206}
207
208# exhaustive decomposition
209foreach my $key (keys %Compat) {
210    $Compat{$key} = [ getCompatList($key) ];
211}
212
213sub _pack_U {
214    return pack('U*', @_);
215}
216
217sub _U_stringify {
218    sprintf '"%s"', join '',
219	map sprintf("\\x%02x", $_), unpack 'C*', _pack_U(@_);
220}
221
222foreach my $hash (\%Canon, \%Compat) {
223    foreach my $key (keys %$hash) {
224	$hash->{$key} = _U_stringify( @{ $hash->{$key} } );
225    }
226}
227
228########## writing header files ##########
229
230my @boolfunc = (
231    {
232	name => "Exclusion",
233	type => "bool",
234	hash => \%Exclus,
235    },
236    {
237	name => "Singleton",
238	type => "bool",
239	hash => \%Single,
240    },
241    {
242	name => "NonStDecomp",
243	type => "bool",
244	hash => \%NonStD,
245    },
246    {
247	name => "Comp2nd",
248	type => "bool",
249	hash => \%Comp2nd,
250    },
251);
252
253my $file = "unfexc.h";
254open FH, ">$file" or croak "$PACKAGE: $file can't be made";
255binmode FH; select FH;
256
257    print << 'EOF';
258/*
259 * This file is auto-generated by mkheader.
260 * Any changes here will be lost!
261 */
262EOF
263
264foreach my $tbl (@boolfunc) {
265    my @temp = sort {$a <=> $b} keys %{$tbl->{hash}};
266    my $type = $tbl->{type};
267    my $name = $tbl->{name};
268    print "$type is$name (UV uv)\n{\nreturn\n\t";
269
270    while (@temp) {
271	my $cur = shift @temp;
272	if (@temp && $cur + 1 == $temp[0]) {
273	    print "($cur <= uv && uv <= ";
274	    while (@temp && $cur + 1 == $temp[0]) {
275		$cur = shift @temp;
276	    }
277	    print "$cur)";
278	    print "\n\t|| " if @temp;
279	} else {
280	    print "uv == $cur";
281	    print "\n\t|| " if @temp;
282	}
283    }
284    print "\n\t? TRUE : FALSE;\n}\n\n";
285}
286
287close FH;
288
289####################################
290
291my $compinit =
292    "typedef struct { UV nextchar; UV composite; } $structname;\n\n";
293
294foreach my $i (sort keys %CompList) {
295    $compinit .= "$structname $i [] = {\n";
296    $compinit .= join ",\n",
297	map sprintf("\t{ %d, %d }", $_, $CompList{$i}{$_}),
298	    sort {$a <=> $b } keys %{ $CompList{$i} };
299    $compinit .= ",\n{0,0}\n};\n\n"; # with sentinel
300}
301
302my @tripletable = (
303    {
304	file => "unfcmb",
305	name => "combin",
306	type => "STDCHAR",
307	hash => \%Combin,
308	null =>  0,
309    },
310    {
311	file => "unfcan",
312	name => "canon",
313	type => "char*",
314	hash => \%Canon,
315	null => "NULL",
316    },
317    {
318	file => "unfcpt",
319	name => "compat",
320	type => "char*",
321	hash => \%Compat,
322	null => "NULL",
323    },
324    {
325	file => "unfcmp",
326	name => "compos",
327	type => "$structname *",
328	hash => \%Comp1st,
329	null => "NULL",
330	init => $compinit,
331    },
332);
333
334foreach my $tbl (@tripletable) {
335    my $file = "$tbl->{file}.h";
336    my $head = "${prefix}$tbl->{name}";
337    my $type = $tbl->{type};
338    my $hash = $tbl->{hash};
339    my $null = $tbl->{null};
340    my $init = $tbl->{init};
341
342    open FH, ">$file" or croak "$PACKAGE: $file can't be made";
343    binmode FH; select FH;
344    my %val;
345
346    print FH << 'EOF';
347/*
348 * This file is auto-generated by mkheader.
349 * Any changes here will be lost!
350 */
351EOF
352
353    print $init if defined $init;
354
355    foreach my $uv (keys %$hash) {
356	croak sprintf("a Unicode code point 0x%04X over 0x10FFFF.", $uv)
357	    unless $uv <= 0x10FFFF;
358	my @c = unpack 'CCCC', pack 'N', $uv;
359	$val{ $c[1] }{ $c[2] }{ $c[3] } = $hash->{$uv};
360    }
361
362    foreach my $p (sort { $a <=> $b } keys %val) {
363	next if ! $val{ $p };
364	for (my $r = 0; $r < 256; $r++) {
365	    next if ! $val{ $p }{ $r };
366	    printf "$type ${head}_%02x_%02x [256] = {\n", $p, $r;
367	    for (my $c = 0; $c < 256; $c++) {
368		print "\t", defined $val{$p}{$r}{$c}
369		    ? "($type)".$val{$p}{$r}{$c}
370		    : $null;
371		print ','  if $c != 255;
372		print "\n" if $c % 8 == 7;
373	    }
374	    print "};\n\n";
375	}
376    }
377    foreach my $p (sort { $a <=> $b } keys %val) {
378	next if ! $val{ $p };
379	printf "$type* ${head}_%02x [256] = {\n", $p;
380	for (my $r = 0; $r < 256; $r++) {
381	    print $val{ $p }{ $r }
382		? sprintf("${head}_%02x_%02x", $p, $r)
383		: "NULL";
384	    print ','  if $r != 255;
385	    print "\n" if $val{ $p }{ $r } || ($r+1) % 8 == 0;
386	}
387	print "};\n\n";
388    }
389    print "$type** $head [] = {\n";
390    for (my $p = 0; $p <= 0x10; $p++) {
391	print $val{ $p } ? sprintf("${head}_%02x", $p) : "NULL";
392	print ','  if $p != 0x10;
393	print "\n";
394    }
395    print "};\n\n";
396    close FH;
397}
398
3991;
400__END__
401