1#!/usr/local/bin/perl 2# 3# $Id: ucmlint,v 2.0 2004/05/16 20:55:16 dankogai Exp $ 4# 5 6use strict; 7our $VERSION = do { my @r = (q$Revision: 2.0 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; 8 9use Getopt::Std; 10our %Opt; 11getopts("Dehfv", \%Opt); 12 13if ($Opt{e}){ 14 eval{ require Encode; }; 15 $@ and die "can't load Encode : $@"; 16} 17 18$Opt{h} and help(); 19@ARGV or help(); 20 21sub help{ 22 print <<""; 23$0 -[Dehfv] [ucm files ...] 24 -D debug mode on 25 -e test with Encode module also (requires perl 5.7.3 or higher) 26 -h shows this message 27 -f forces roundtrip check even for |[123] 28 -v verbose mode 29 30} 31 32$| = 1; 33my (%Hdr, %U2E, %E2U); 34my $in_charmap = 0; 35my $nerror = 0; 36my $nwarning = 0; 37 38sub nit($;$){ 39 my ($msg, $level) = @_; 40 my $lstr; 41 if ($level == 2){ 42 $lstr = 'notice'; 43 }elsif ($level == 1){ 44 $lstr = 'warning'; $nwarning++; 45 }else{ 46 $lstr = 'error'; $nerror++; 47 } 48 print "$ARGV:$lstr in line $.: $msg\n"; 49} 50 51for $ARGV (@ARGV){ 52 open UCM, $ARGV or die "$ARGV:$!"; 53 %Hdr = %U2E = %E2U = (); 54 $in_charmap = $nerror = $nwarning = 0; 55 $. = 0; 56 while(<UCM>){ 57 chomp; 58 s/\s*#.*$//o; /^$/ and next; 59 if ($_ eq "CHARMAP"){ 60 $in_charmap = 1; 61 for my $must (qw/code_set_name mb_cur_min mb_cur_max/){ 62 exists $Hdr{$must} or nit "<$must> nonexistent"; 63 } 64 $Hdr{mb_cur_min} > $Hdr{mb_cur_max} 65 and nit sprintf("mb_cur_min(%d) > mb_cur_max(%d)", 66 $Hdr{mb_cur_min},$Hdr{mb_cur_max}); 67 $in_charmap = 1; 68 next; 69 } 70 unless ($in_charmap){ 71 my($hkey, $hvalue) = /^<(\S+)>\s+[\"\']?([^\"\']+)/o or next; 72 $Opt{D} and warn "$hkey => $hvalue"; 73 if ($hkey eq "code_set_name"){ # name check 74 exists $Hdr{code_set_name} 75 and nit "Duplicate <code_set_name>: $hkey"; 76 } 77 if ($hkey eq "code_set_alias"){ # alias check 78 $hvalue eq $Hdr{code_set_name} 79 and nit qq(alias "$hvalue" is already in <code_set_name>); 80 } 81 $Hdr{$hkey} = $hvalue; 82 }else{ 83 my $name = $Hdr{code_set_name}; 84 my($unistr, $encstr, $fb) = /^(\S+)\s+(\S+)\s(\S+)/o or next; 85 $Opt{v} and nit $_, 2; 86 my $uni = uniparse($unistr); 87 my $enc = encparse($encstr); 88 $fb =~ /^\|([0123])$/ or nit "malformed fallback: $fb"; 89 $fb = $1; 90 $Opt{f} and $fb = 0; 91 unless ($fb == 1){ # check uni -> enc 92 if (exists $U2E{$uni}){ 93 nit "dupe encode map: U$uni => $U2E{$uni} and $enc", 1; 94 }else{ 95 $U2E{$uni} = $enc; 96 if ($Opt{e} and $fb != 3) { 97 my $e = hex2enc($enc); 98 my $u = hex2uni($uni); 99 my $eu = Encode::encode($name, $u); 100 $e eq $eu 101 or nit qq(encode('$name', $uni) != $enc); 102 } 103 } 104 } 105 unless ($fb == 3){ # check enc -> uni 106 if (exists $E2U{$enc}){ 107 nit "dupe decode map: $enc => U$E2U{$enc} and U$uni", 1; 108 }else{ 109 $E2U{$enc} = $uni; 110 if ($Opt{e} and $fb != 1) { 111 my $e = hex2enc($enc); 112 my $u = hex2uni($uni); 113 $Opt{D} and warn "$uni, $enc"; 114 my $de = Encode::decode($name, $e); 115 $de eq $u 116 or nit qq(decode('$name', $enc) != $uni); 117 } 118 } 119 } 120 # warn "$uni, $enc, $fb"; 121 } 122 } 123 $in_charmap or nit "Where is CHARMAP?"; 124 checkRT(); 125 printf ("$ARGV: %s error%s found\n", 126 ($nerror == 0 ? 'no' : $nerror), 127 ($nerror > 1 ? 's' : '')); 128} 129 130exit; 131 132sub hex2enc{ 133 pack("C*", map {hex($_)} split(",", shift)); 134} 135sub hex2uni{ 136 join("", map { chr(hex($_)) } split(",", shift)); 137} 138 139sub checkRT{ 140 for my $uni (keys %E2U){ 141 my $enc = $U2E{$uni} or next; # okay 142 $E2U{$U2E{$uni}} eq $uni or 143 nit "RT failure: U$uni => $enc =>U$E2U{$U2E{$uni}}"; 144 } 145 for my $enc (keys %E2U){ 146 my $uni = $E2U{$enc} or next; # okay 147 $U2E{$E2U{$enc}} eq $enc or 148 nit "RT failure: $enc => U$uni => $U2E{$E2U{$enc}}"; 149 } 150} 151 152 153sub uniparse{ 154 my $str = shift; 155 my @u; 156 push @u, $1 while($str =~ /\G<U(.*?)>/ig); 157 for my $u (@u){ 158 $u =~ /^([0-9A-Za-z]+)$/o 159 or nit "malformed Unicode character: $u"; 160 } 161 return join(',', @u); 162} 163 164sub encparse{ 165 my $str = shift; 166 my @e; 167 for my $e (split /\\x/io, $str){ 168 $e or next; # first \x 169 $e =~ /^([0-9A-Za-z]{1,2})$/io 170 or nit "Hex $e in $str is bogus"; 171 push @e, $1; 172 } 173 return join(',', @e); 174} 175 176 177 178__END__ 179 180A UCM file looks like this. 181 182 # 183 # Comments 184 # 185 <code_set_name> "US-ascii" # Required 186 <code_set_alias> "ascii" # Optional 187 <mb_cur_min> 1 # Required; usually 1 188 <mb_cur_max> 1 # Max. # of bytes/char 189 <subchar> \x3F # Substitution char 190 # 191 CHARMAP 192 <U0000> \x00 |0 # <control> 193 <U0001> \x01 |0 # <control> 194 <U0002> \x02 |0 # <control> 195 .... 196 <U007C> \x7C |0 # VERTICAL LINE 197 <U007D> \x7D |0 # RIGHT CURLY BRACKET 198 <U007E> \x7E |0 # TILDE 199 <U007F> \x7F |0 # <control> 200 END CHARMAP 201 202