15 |
|
|
16 |
use strict; |
use strict; |
17 |
use Getopt::Std; |
use Getopt::Std; |
|
use Lingua::Spelling::Alternative; |
|
18 |
require Unicode::Map8; |
require Unicode::Map8; |
19 |
|
use GDBM_File; |
20 |
|
|
21 |
my $sadrzaj=0; |
my $sadrzaj=0; |
22 |
my $nr=0; |
my $nr=0; |
27 |
my $aname; ## ancor name na originalnim stranicama |
my $aname; ## ancor name na originalnim stranicama |
28 |
|
|
29 |
my $nn_dir="../"; # dir u kojem su wget-ani fileovi |
my $nn_dir="../"; # dir u kojem su wget-ani fileovi |
30 |
my $url="http://www.nn.hr/CijeliBrojS.asp?god=%d&br=%s&mid=%s#%d"; |
#my $path_fmt="http://www.nn.hr/CijeliBrojS.asp?god=%d&br=%s&mid=%s#%d"; |
31 |
|
|
32 |
|
# configure gdbm files here |
33 |
|
my $gdbm_brzakona="$nn_dir/swish/brzakona.gdbm"; |
34 |
|
my $gdbm_file2title="$nn_dir/swish/file2title.gdbm"; |
35 |
|
|
36 |
|
# where to drop full text URLs |
37 |
|
my $full_url_list="$nn_dir/sluzbeno/in.url"; |
38 |
|
|
39 |
|
# URL to original site |
40 |
|
my $full_url="http://www.nn.hr/clanci/sluzbeno/"; |
41 |
|
my $full_filename_fmt="%04d/%04s.htm"; |
42 |
|
my $path_fmt = $full_filename_fmt; |
43 |
|
|
44 |
my %opts; |
my %opts; |
45 |
getopts("vqdl:", \%opts); |
getopts("vqdl:", \%opts); |
46 |
|
|
47 |
my $brojeva=0; |
my $brojeva=0; |
48 |
my $zakona=0; |
my $zakona=0; |
49 |
|
my $zak_u_broju; |
50 |
|
|
|
my $hr = new Lingua::Spelling::Alternative( DEBUG => $opts{d} ); |
|
|
#$hr->load_affix("$nn_dir/search/croatian.aff"); |
|
|
$hr->load_findaffix("$nn_dir/prvih_50.txt"); |
|
51 |
|
|
52 |
my $l2_map = Unicode::Map8->new("ISO-8859-2") || die; |
my $l2_map = Unicode::Map8->new("ISO-8859-2") || die; |
53 |
|
|
54 |
|
my %br_zakona; |
55 |
|
tie %br_zakona, 'GDBM_File', $gdbm_brzakona.".temp", &GDBM_NEWDB, 0644; |
56 |
|
my %file2title; |
57 |
|
tie %file2title, 'GDBM_File', $gdbm_file2title.".temp", &GDBM_NEWDB, 0644; |
58 |
|
|
59 |
#-------------------------------------------------------------------- |
#-------------------------------------------------------------------- |
60 |
|
|
61 |
|
sub save_br_zak { |
62 |
|
my $god = shift || return; |
63 |
|
my $br = shift || return; |
64 |
|
my $zak_u_broju = shift || return; |
65 |
|
print STDERR "[$god/$br: $zak_u_broju zakona]\n" if (! $opts{q}); |
66 |
|
if (! $br_zakona{sprintf("%04d",$god)}) { |
67 |
|
$br_zakona{sprintf("%04d",$god)} = $zak_u_broju; |
68 |
|
} else { |
69 |
|
$br_zakona{sprintf("%04d",$god)} += $zak_u_broju; |
70 |
|
} |
71 |
|
} |
72 |
|
|
73 |
|
#-------------------------------------------------------------------- |
74 |
|
|
75 |
sub dump_to_swish { |
sub dump_to_swish { |
76 |
my $xml = shift @_; |
my $xml = shift @_; |
77 |
my ($god,$br,$nr,$aname) = @_; |
my ($god,$br,$nr,$aname) = @_; |
78 |
|
|
79 |
use utf8; |
use utf8; |
80 |
|
|
81 |
print "Path-Name: ".sprintf($url,$god,$br,$nr,$aname)."\n". |
# print "Path-Name: ".sprintf($path_fmt,$god,$br,$nr,$aname)."\n". |
82 |
|
print "Path-Name: ".sprintf($path_fmt,$god,$nr)."\n". |
83 |
"Content-Length: ".length($xml)."\n". |
"Content-Length: ".length($xml)."\n". |
84 |
"Document-Type: XML\n". |
"Document-Type: XML\n". |
85 |
"\n$xml"; |
"\n$xml"; |
87 |
|
|
88 |
#-------------------------------------------------------------------- |
#-------------------------------------------------------------------- |
89 |
|
|
90 |
|
open(URL,"> $full_url_list") || warn "can't open URL list file '$full_url_list': $!"; |
91 |
|
|
92 |
opendir(DIR,$nn_dir) || warn "opendir: $!"; |
opendir(DIR,$nn_dir) || warn "opendir: $!"; |
93 |
my @files; |
my @files; |
100 |
} |
} |
101 |
closedir(DIR); |
closedir(DIR); |
102 |
|
|
103 |
foreach my $file (@files) { |
foreach my $file (sort @files) { |
104 |
open(IN,"$nn_dir/$file") || die "can't open '$nn_dir/$file': $!"; |
open(IN,"$nn_dir/$file") || die "can't open '$nn_dir/$file': $!"; |
105 |
|
|
106 |
if ($file=~m/god=(\d+)\&br=(\d+)/) { |
if ($file=~m/god=(\d+)\&br=(\d+)/) { |
107 |
|
save_br_zak($god,$br,$zak_u_broju); |
108 |
|
print STDERR "$file " if (! $opts{q}); |
109 |
($br,$god) = ($2,$1); |
($br,$god) = ($2,$1); |
|
print STDERR "$file -- $2 -- $1\n" if (! $opts{q}); |
|
110 |
$brojeva++; |
$brojeva++; |
111 |
|
$zak_u_broju = 0; |
112 |
} |
} |
113 |
|
|
114 |
while(<IN>) { |
while(<IN>) { |
131 |
my $naslov_czs = lc($naslov); |
my $naslov_czs = lc($naslov); |
132 |
$naslov_czs =~ tr/¹©ðÐèÈæƾ®/sSdDcCcCzZ/; |
$naslov_czs =~ tr/¹©ðÐèÈæƾ®/sSdDcCcCzZ/; |
133 |
$naslov_czs =~ tr/a-zA-Z/ /cs; # non a-z -> space |
$naslov_czs =~ tr/a-zA-Z/ /cs; # non a-z -> space |
|
$naslov_czs = join(" ",$hr->alternatives(split(/ /,$naslov_czs))); |
|
134 |
# $naslov_czs = $hr->minimal(split(/ /,$naslov_czs)); |
# $naslov_czs = $hr->minimal(split(/ /,$naslov_czs)); |
135 |
my $xml="<nn>\n<br>$br</br>\n<god>$god</god>\n<nr>$nr</nr>\n<aname>$aname</aname>\n"; |
my $xml="<nn>\n<br>$br</br>\n<god>$god</god>\n<nr>$nr</nr>\n<aname>$aname</aname>\n"; |
136 |
my $naslov_utf=$l2_map->tou($naslov)->utf8; |
my $naslov_utf=$l2_map->tou($naslov)->utf8; |
143 |
$xml.="<naslov>$naslov_utf</naslov>\n"; |
$xml.="<naslov>$naslov_utf</naslov>\n"; |
144 |
$xml.="<naslov_czs>$naslov_czs</naslov_czs>\n</nn>\n\n"; |
$xml.="<naslov_czs>$naslov_czs</naslov_czs>\n</nn>\n\n"; |
145 |
dump_to_swish($xml,$god,$br,$nr,$aname); |
dump_to_swish($xml,$god,$br,$nr,$aname); |
146 |
|
|
147 |
|
my $file = sprintf($full_filename_fmt,$god,$nr); |
148 |
|
print URL $full_url.$file."\n"; |
149 |
|
$file2title{$file}="$god $br $nr $aname $naslov"; |
150 |
|
|
151 |
$naslov=""; |
$naslov=""; |
152 |
$nr=0; |
$nr=0; |
153 |
$zakona++; |
$zakona++; |
154 |
|
$zak_u_broju++; |
155 |
} |
} |
156 |
|
|
157 |
if ($sadrzaj) { |
if ($sadrzaj) { |
173 |
close(IN); |
close(IN); |
174 |
} |
} |
175 |
|
|
176 |
|
save_br_zak($god,$br,$zak_u_broju); |
177 |
print STDERR "Ukupno $brojeva brojeva NN, sa $zakona zakona...\n" if (! $opts{q}); |
print STDERR "Ukupno $brojeva brojeva NN, sa $zakona zakona...\n" if (! $opts{q}); |
178 |
|
|
179 |
|
untie %br_zakona; |
180 |
|
|
181 |
|
# rename temp gdbm files |
182 |
|
rename $gdbm_brzakona.".temp",$gdbm_brzakona || die "can't rename $gdbm_brzakona: $!"; |
183 |
|
rename $gdbm_file2title.".temp",$gdbm_file2title || die "can't rename $gdbm_file2title: $!"; |