1 |
#!/usr/bin/perl |
2 |
|
3 |
print <<EOF; |
4 |
unicode block statistics of general character categories, |
5 |
decomposition and uppercase mappings based on |
6 |
Blocks.txt and UnicodeData.txt |
7 |
> http://unicode.org/Public/UNIDATA/ |
8 |
|
9 |
|
10 |
EOF |
11 |
|
12 |
|
13 |
$dir = $ARGV[0] || '.'; |
14 |
open UDATA, $dir.'/UnicodeData.txt'; |
15 |
open BLOCKS, $dir.'/Blocks.txt'; |
16 |
|
17 |
$end = -1; |
18 |
$gap = 0; |
19 |
# get next block |
20 |
sub nblock { |
21 |
while (<BLOCKS>) { |
22 |
last unless /^#/; |
23 |
} |
24 |
chomp; |
25 |
$lend = $end; |
26 |
($beg,$end,$block) = /([0-9A-Z]*)..([0-9A-Z]*); (.*)/; |
27 |
$h = {}; |
28 |
$h->{beg} = $beg; |
29 |
$h->{end} = $end; |
30 |
$beg = hex $beg; |
31 |
$end = hex $end; |
32 |
$gap += $beg - 1 - $lend if 65536 > $beg; |
33 |
$h->{len} = $end - $beg + 1; |
34 |
$h->{nam} = $block; |
35 |
# print "$beg..$end($h->{len}): $block\n"; |
36 |
push @blocks, $h; |
37 |
} |
38 |
|
39 |
nblock; |
40 |
|
41 |
|
42 |
# UnicodeData.txt: |
43 |
# 0 number (hex) |
44 |
# 1 name S |
45 |
# 2 general category E |
46 |
# 3 canonical combining class N |
47 |
# 4 bidi class E |
48 |
# 5 decomposition <type> mapping <E>S |
49 |
# 6-8 numeric type and value E/N |
50 |
# 9 bidi_mirrored B |
51 |
# 10 old name S |
52 |
# 11 comment S |
53 |
# 12-14 upper/lower/titlecase mapping S |
54 |
while (<UDATA>) { |
55 |
@l = split /;/; |
56 |
# last if '10000' eq $l[0]; # BMP only |
57 |
$num = hex $l[0]; |
58 |
while ($num > $end) { nblock; } |
59 |
die "$_ not in any block!" if $num < $beg; |
60 |
# <CJK Ideograph, First> |
61 |
if ( $l[1] =~ /, First>$/ ) { |
62 |
$n = <UDATA>; |
63 |
($e) = split /;/,$n; |
64 |
$e = hex($e); |
65 |
# print stderr "$l[1]..Last: $num .. $e\n"; |
66 |
die "end $e of $l[1] ($num) not in block $h->{nam}" if $e > $end; |
67 |
$h->{$l[2]} += $e - $num +1; |
68 |
next; |
69 |
} |
70 |
# gen cat |
71 |
$h->{$l[2]}++; |
72 |
# decomp |
73 |
if ($l[5]) { |
74 |
($type) = ($l[5] =~ /<(.*)>/); |
75 |
$deco{$type||'(canonical)'}++; |
76 |
if ( ! $type ) { # decomp Canon |
77 |
$h->{'dC'}++; |
78 |
} |
79 |
} |
80 |
# has Upper |
81 |
if ($l[12]) { |
82 |
$h->{'uC'}++; |
83 |
} |
84 |
} |
85 |
|
86 |
|
87 |
print <<EOF; |
88 |
*\tdecomposition mappings |
89 |
see |
90 |
> http://unicode.org/Public/UNIDATA/UCD.html#Character_Decomposition_Mappings |
91 |
|
92 |
\$ |
93 |
EOF |
94 |
for (sort keys %deco) { print join("\t",$_,$deco{$_}),"\n"; } |
95 |
print "\$\n\n\n"; |
96 |
|
97 |
|
98 |
# 30 general categories + 2 specials |
99 |
@cat = ( |
100 |
'Cn','Lu','Ll','Lt','Lm','Lo','Mn','Me','Mc','Nd','Nl','No','Zs','Zl','Zp', |
101 |
'Cc','Cf','Co','Cs','Pc','Pd','Ps','Pe','Pi','Pf','Po','Sm','Sc','Sk','So' |
102 |
); |
103 |
@add = ( 'uC','dC' ); |
104 |
|
105 |
|
106 |
# table major categories to blocks |
107 |
print "\n*\tmajor category/block table\n"; |
108 |
print <<EOF; |
109 |
Categories are letter, mark, numeric, punctuation, symbol, separator and other. |
110 |
Additional columns give number of characters which have an uppercase and |
111 |
canonical decomposition mapping, resp. |
112 |
Final columns give begin and end, block length and name. |
113 |
|
114 |
\$ |
115 |
EOF |
116 |
# headers |
117 |
@mcat = ('L','M','N','P','S','Z','C'); |
118 |
print join("\t", |
119 |
'Let','Mar','Num','Pun','Sym','Sep','Oth', |
120 |
'upC', 'deC', |
121 |
'beg','end','len','block' |
122 |
),"\n"; |
123 |
for $h (@blocks) { |
124 |
$ass = 0; |
125 |
%maj = (); |
126 |
for (@cat) { |
127 |
$ass += $h->{$_}; |
128 |
$tot{$_} += $h->{$_}; |
129 |
$maj{substr($_,0,1)} += $h->{$_}; |
130 |
} |
131 |
$maj{'C'} += $h->{'Cn'} = $h->{len} - $ass; # unassigned |
132 |
$Cn += $h->{'Cn'} if 65536 > hex($h->{beg}); # in BMP |
133 |
for (@mcat) { |
134 |
$tot{$_} += $maj{$_}; |
135 |
print $maj{$_},"\t"; |
136 |
} |
137 |
for (@add) { |
138 |
$tot{$_} += $h->{$_}; |
139 |
print $h->{$_}||'0',"\t"; |
140 |
} |
141 |
print join("\t",$h->{beg},$h->{end},$h->{len},$h->{nam}),"\n"; |
142 |
} |
143 |
for (@mcat,@add) { print $tot{$_},"\t"; } print "\n"; |
144 |
print "\$\n"; |
145 |
print "BMP: nonblock $gap unassigned $Cn\n\n\n"; |
146 |
|
147 |
# list blocks to categories |
148 |
print <<EOF; |
149 |
*\tdetailled block stats |
150 |
see |
151 |
> http://unicode.org/Public/UNIDATA/UCD.html#General_Category_Values |
152 |
|
153 |
\$ |
154 |
EOF |
155 |
for $h (@blocks) { |
156 |
print join("\t",$h->{nam},'b'.$h->{beg},'l'.$h->{len}),"\t"; |
157 |
for (@cat) { |
158 |
if ($h->{$_}) { |
159 |
print $_,$h->{$_},"\t"; |
160 |
} |
161 |
} |
162 |
print "\n"; |
163 |
} |
164 |
print "\$\n\n\n"; |