Contents of /trunk2/openisis/unistat

#!/usr/bin/perl

print <<EOF;
unicode block statistics of general character categories,
decomposition and uppercase mappings based on
Blocks.txt and UnicodeData.txt
>       http://unicode.org/Public/UNIDATA/


EOF


$dir = $ARGV[0] || '.';
open UDATA, $dir.'/UnicodeData.txt';
open BLOCKS, $dir.'/Blocks.txt';

$end = -1;
$gap = 0;
# get next block
sub nblock {
        while (<BLOCKS>) {
                last unless /^#/;
        }
        chomp;
        $lend = $end;
        ($beg,$end,$block) = /([0-9A-Z]*)..([0-9A-Z]*); (.*)/;
        $h = {};
        $h->{beg} = $beg;
        $h->{end} = $end;
        $beg = hex $beg;
        $end = hex $end;
        $gap += $beg - 1 - $lend if 65536 > $beg;
        $h->{len} = $end - $beg + 1;
        $h->{nam} = $block;
        # print "$beg..$end($h->{len}): $block\n";
        push @blocks, $h;
}

nblock;


# UnicodeData.txt:
# 0 number (hex)
# 1 name S
# 2 general category E
# 3 canonical combining class N
# 4 bidi class E
# 5 decomposition <type> mapping <E>S
# 6-8 numeric type and value E/N
# 9 bidi_mirrored B
# 10 old name S
# 11 comment S
# 12-14 upper/lower/titlecase mapping S
while (<UDATA>) {
        @l = split /;/;
        # last if '10000' eq $l[0]; # BMP only
        $num = hex $l[0];
        while ($num > $end) { nblock; }
        die "$_ not in any block!" if $num < $beg;
        # <CJK Ideograph, First>
        if ( $l[1] =~ /, First>$/ ) {
                $n = <UDATA>;
                ($e) = split /;/,$n;
                $e = hex($e);
                # print stderr "$l[1]..Last: $num .. $e\n";
                die "end $e of $l[1] ($num) not in block $h->{nam}" if $e > $end;
                $h->{$l[2]} += $e - $num +1;
                next;
        }
        # gen cat
        $h->{$l[2]}++;
        # decomp
        if ($l[5]) {
                ($type) = ($l[5] =~ /<(.*)>/);
                $deco{$type||'(canonical)'}++;
                if ( ! $type ) { # decomp Canon
                        $h->{'dC'}++;
                }
        }
        # has Upper
        if ($l[12]) {
                $h->{'uC'}++;
        }
}


print <<EOF;
*\tdecomposition mappings
see
>       http://unicode.org/Public/UNIDATA/UCD.html#Character_Decomposition_Mappings

\$
EOF
for (sort keys %deco) { print join("\t",$_,$deco{$_}),"\n"; }
print "\$\n\n\n";


# 30 general categories + 2 specials
@cat = (
        'Cn','Lu','Ll','Lt','Lm','Lo','Mn','Me','Mc','Nd','Nl','No','Zs','Zl','Zp',
        'Cc','Cf','Co','Cs','Pc','Pd','Ps','Pe','Pi','Pf','Po','Sm','Sc','Sk','So'
);
@add = ( 'uC','dC' );


# table major categories to blocks
print "\n*\tmajor category/block table\n";
print <<EOF;
Categories are letter, mark, numeric, punctuation, symbol, separator and other.
Additional columns give number of characters which have an uppercase and
canonical decomposition mapping, resp.
Final columns give begin and end, block length and name. 

\$
EOF
# headers
@mcat = ('L','M','N','P','S','Z','C');
print join("\t",
        'Let','Mar','Num','Pun','Sym','Sep','Oth',
        'upC', 'deC',
        'beg','end','len','block'
),"\n";
for $h (@blocks) {
        $ass = 0;
        %maj = ();
        for (@cat) {
                $ass += $h->{$_};
                $tot{$_} += $h->{$_};
                $maj{substr($_,0,1)} += $h->{$_};
        }
        $maj{'C'} += $h->{'Cn'} = $h->{len} - $ass; # unassigned
        $Cn += $h->{'Cn'} if 65536 > hex($h->{beg}); # in BMP
        for (@mcat) {
                $tot{$_} += $maj{$_};
                print $maj{$_},"\t";
        }
        for (@add) {
                $tot{$_} += $h->{$_};
                print $h->{$_}||'0',"\t";
        }
        print join("\t",$h->{beg},$h->{end},$h->{len},$h->{nam}),"\n";
}
for (@mcat,@add) { print $tot{$_},"\t"; } print "\n";
print "\$\n";
print "BMP: nonblock $gap unassigned $Cn\n\n\n";

# list blocks to categories
print <<EOF;
*\tdetailled block stats
see
>       http://unicode.org/Public/UNIDATA/UCD.html#General_Category_Values

\$
EOF
for $h (@blocks) {
        print join("\t",$h->{nam},'b'.$h->{beg},'l'.$h->{len}),"\t";
        for (@cat) {
                if ($h->{$_}) {
                        print $_,$h->{$_},"\t";
                }
        }
        print "\n";
}
print "\$\n\n\n";
1	#!/usr/bin/perl
2
3	print <<EOF;
4	unicode block statistics of general character categories,
5	decomposition and uppercase mappings based on
6	Blocks.txt and UnicodeData.txt
7	> http://unicode.org/Public/UNIDATA/
8
9
10	EOF
11
12
13	$dir = $ARGV[0] \|\| '.';
14	open UDATA, $dir.'/UnicodeData.txt';
15	open BLOCKS, $dir.'/Blocks.txt';
16
17	$end = -1;
18	$gap = 0;
19	# get next block
20	sub nblock {
21	while (<BLOCKS>) {
22	last unless /^#/;
23	}
24	chomp;
25	$lend = $end;
26	($beg,$end,$block) = /([0-9A-Z])..([0-9A-Z]); (.*)/;
27	$h = {};
28	$h->{beg} = $beg;
29	$h->{end} = $end;
30	$beg = hex $beg;
31	$end = hex $end;
32	$gap += $beg - 1 - $lend if 65536 > $beg;
33	$h->{len} = $end - $beg + 1;
34	$h->{nam} = $block;
35	# print "$beg..$end($h->{len}): $block\n";
36	push @blocks, $h;
37	}
38
39	nblock;
40
41
42	# UnicodeData.txt:
43	# 0 number (hex)
44	# 1 name S
45	# 2 general category E
46	# 3 canonical combining class N
47	# 4 bidi class E
48	# 5 decomposition <type> mapping <E>S
49	# 6-8 numeric type and value E/N
50	# 9 bidi_mirrored B
51	# 10 old name S
52	# 11 comment S
53	# 12-14 upper/lower/titlecase mapping S
54	while (<UDATA>) {
55	@l = split /;/;
56	# last if '10000' eq $l[0]; # BMP only
57	$num = hex $l[0];
58	while ($num > $end) { nblock; }
59	die "$_ not in any block!" if $num < $beg;
60	# <CJK Ideograph, First>
61	if ( $l[1] =~ /, First>$/ ) {
62	$n = <UDATA>;
63	($e) = split /;/,$n;
64	$e = hex($e);
65	# print stderr "$l[1]..Last: $num .. $e\n";
66	die "end $e of $l[1] ($num) not in block $h->{nam}" if $e > $end;
67	$h->{$l[2]} += $e - $num +1;
68	next;
69	}
70	# gen cat
71	$h->{$l[2]}++;
72	# decomp
73	if ($l[5]) {
74	($type) = ($l[5] =~ /<(.*)>/);
75	$deco{$type\|\|'(canonical)'}++;
76	if ( ! $type ) { # decomp Canon
77	$h->{'dC'}++;
78	}
79	}
80	# has Upper
81	if ($l[12]) {
82	$h->{'uC'}++;
83	}
84	}
85
86
87	print <<EOF;
88	*\tdecomposition mappings
89	see
90	> http://unicode.org/Public/UNIDATA/UCD.html#Character_Decomposition_Mappings
91
92	\$
93	EOF
94	for (sort keys %deco) { print join("\t",$_,$deco{$_}),"\n"; }
95	print "\$\n\n\n";
96
97
98	# 30 general categories + 2 specials
99	@cat = (
100	'Cn','Lu','Ll','Lt','Lm','Lo','Mn','Me','Mc','Nd','Nl','No','Zs','Zl','Zp',
101	'Cc','Cf','Co','Cs','Pc','Pd','Ps','Pe','Pi','Pf','Po','Sm','Sc','Sk','So'
102	);
103	@add = ( 'uC','dC' );
104
105
106	# table major categories to blocks
107	print "\n*\tmajor category/block table\n";
108	print <<EOF;
109	Categories are letter, mark, numeric, punctuation, symbol, separator and other.
110	Additional columns give number of characters which have an uppercase and
111	canonical decomposition mapping, resp.
112	Final columns give begin and end, block length and name.
113
114	\$
115	EOF
116	# headers
117	@mcat = ('L','M','N','P','S','Z','C');
118	print join("\t",
119	'Let','Mar','Num','Pun','Sym','Sep','Oth',
120	'upC', 'deC',
121	'beg','end','len','block'
122	),"\n";
123	for $h (@blocks) {
124	$ass = 0;
125	%maj = ();
126	for (@cat) {
127	$ass += $h->{$_};
128	$tot{$_} += $h->{$_};
129	$maj{substr($_,0,1)} += $h->{$_};
130	}
131	$maj{'C'} += $h->{'Cn'} = $h->{len} - $ass; # unassigned
132	$Cn += $h->{'Cn'} if 65536 > hex($h->{beg}); # in BMP
133	for (@mcat) {
134	$tot{$_} += $maj{$_};
135	print $maj{$_},"\t";
136	}
137	for (@add) {
138	$tot{$_} += $h->{$_};
139	print $h->{$_}\|\|'0',"\t";
140	}
141	print join("\t",$h->{beg},$h->{end},$h->{len},$h->{nam}),"\n";
142	}
143	for (@mcat,@add) { print $tot{$_},"\t"; } print "\n";
144	print "\$\n";
145	print "BMP: nonblock $gap unassigned $Cn\n\n\n";
146
147	# list blocks to categories
148	print <<EOF;
149	*\tdetailled block stats
150	see
151	> http://unicode.org/Public/UNIDATA/UCD.html#General_Category_Values
152
153	\$
154	EOF
155	for $h (@blocks) {
156	print join("\t",$h->{nam},'b'.$h->{beg},'l'.$h->{len}),"\t";
157	for (@cat) {
158	if ($h->{$_}) {
159	print $_,$h->{$_},"\t";
160	}
161	}
162	print "\n";
163	}
164	print "\$\n\n\n";