1 |
#!/usr/bin/perl -w |
2 |
|
3 |
# indexer, Dobrica Pavlinusic <dpavlin@rot13.org> 2001-01-28
# options: -q quiet
#          -d debug
#          -v verbose
7 |
|
8 |
use strict; |
9 |
use DBI; |
10 |
use Getopt::Std; |
11 |
use Lingua::Spelling::Alternative; |
12 |
|
13 |
# --- parser state, updated by the main loop while it scans each file ---
my $sadrzaj = 0;     # >0 while inside a <div class=sadrzaj> section
my $nr      = 0;     # number of the entry currently being collected
my $naslov  = "";    # title text accumulated for the current entry

my $br;              # issue number of NN (taken from the file name)
my $god;             # year of NN (taken from the file name)
my $aname;           # anchor name on the original pages

my $nn_dir = ".";    # directory holding the wget-fetched files

# Command-line switches: -v verbose, -q quiet, -d debug.
my %opts;
getopts("vqd", \%opts);

# --- counters reported in the final summary ---
my $brojeva = 0;     # files whose name yielded a year/issue pair
my $zakona  = 0;     # rows inserted into the nn table

# Generator of spelling alternatives, applied to the normalised titles.
# Direct method call rather than the indirect-object "new Class(...)" form,
# which is ambiguous to the parser.
my $hr = Lingua::Spelling::Alternative->new( DEBUG => $opts{d} );
#$hr->load_affix("$nn_dir/search/croatian.aff");
$hr->load_findaffix("$nn_dir/prvih_50.txt");
32 |
|
33 |
|
34 |
#-------------------------------------------------------------------- |
35 |
|
36 |
# Collect the input files first, so that a missing or unreadable directory
# aborts the run before the existing index is wiped below.
opendir(my $dir_handle, $nn_dir) or die "opendir $nn_dir: $!";
my @files = grep { /^CijeliBrojS/ && -f "$nn_dir/$_" } readdir($dir_handle);
closedir($dir_handle);

my $dbh = DBI->connect("DBI:Pg:dbname=nn","","") or die $DBI::errstr;

# The nn table is rebuilt from scratch on every run.
$dbh->do("delete from nn") or die $dbh->errstr();

# One row per entry: issue number, year, entry number, anchor name, the
# cleaned title, and its normalised/expanded form.
my $sth = $dbh->prepare("insert into nn (br,god,nr,aname,title,title_czs) values (?,?,?,?,?,?)") or die $dbh->errstr();
45 |
|
46 |
|
47 |
# Main loop: scan every downloaded file, pull the entries out of its
# <div class=sadrzaj> sections and insert one row per entry into nn.
foreach my $file (@files) {

    # Year and issue number come from the file name.  Without them rows
    # would be stored under stale (or undefined) values, so skip the file.
    unless ($file =~ m/god=(\d+)\&br=(\d+)/) {
        warn "skipping $file: no god=/br= in file name\n";
        next;
    }
    ($god, $br) = ($1, $2);
    print "$file -- $br -- $god\n" if (! $opts{q});
    $brojeva++;

    # @files holds names relative to $nn_dir, so open the full path
    # (three-argument open, lexical handle).
    my $path = "$nn_dir/$file";
    open(my $in, '<', $path) or die "can't open $path: $!";

    # Do not let an unterminated section of the previous file leak in here.
    $sadrzaj = 0;
    $naslov  = "";
    $nr      = 0;

    while (my $line = <$in>) {
        chomp $line;
        $line =~ s/\015//g;    # kill cr

        # 1250 -> iso8859-2.  Only s/z with caron (both cases) sit at
        # different code points in the two encodings; written as hex
        # escapes so the mapping does not depend on how this source file
        # itself is encoded.
        $line =~ tr/\x9a\x9e\x8a\x8e/\xb9\xbe\xa9\xae/;

        # Start of a contents section.
        if ($line =~ m,<div class=sadrzaj>,) {
            $sadrzaj++;
            next;
        }

        # End of a contents section: clean up the collected title and
        # store the entry.
        if ($sadrzaj && $line =~ m,</div>,) {
            $sadrzaj--;
            $naslov =~ s/\s+/ /g;        # collapse whitespace runs
            $naslov =~ s/<[^>]+>//g;     # strip remaining HTML tags
            $naslov =~ s/^\s+//g;
            $naslov =~ s/\s+$//g;
            print "$god $br $nr: $naslov\n" if ($opts{v});

            # Normalised form: ASCII letters only, diacritics folded.
            # NOTE(review): lc() runs before the fold, so upper-case
            # diacritics become upper-case ASCII (S, D, C, Z).  Left as is
            # because whatever queries title_czs may depend on it -- confirm.
            my $naslov_czs = lc($naslov);
            $naslov_czs =~ tr/\xb9\xa9\xf0\xd0\xe8\xc8\xe6\xc6\xbe\xae/sSdDcCcCzZ/;
            $naslov_czs =~ tr/a-zA-Z/ /cs;    # non a-z -> space
            $naslov_czs = join(" ", $hr->alternatives(split(/ /, $naslov_czs)));

            $sth->execute($br, $god, $nr, $aname, $naslov, $naslov_czs)
                or die $sth->errstr();

            $naslov = "";
            $nr     = 0;
            $zakona++;
        }

        # Inside a contents section every line must carry one of the two
        # known link forms; the link is cut out and the rest of the line
        # is appended to the title.
        if ($sadrzaj) {
            if ($line =~ s/<a href="#([^"]+)">\s*(\d+)\.\s*<[^>]+>//i) {
                ($aname, $nr) = ($1, $2);
            } elsif ($line =~ s/<a href="Javascript:Mojdok\((\d+),(\d+),(\w+),(\d+)\)[^>]*>//i) {
                my ($link_god, $link_br) = ($1, $2);
                ($nr, $aname) = ($3, $4);
                # The link repeats year and issue; they must agree with
                # the values taken from the file name.
                die "conflict in godina: $link_god != $god" if ($god != $link_god);
                die "conflict in broj: $link_br != $br" if ($br != $link_br);
            } else {
                die "can't find nr in line: $line";
            }
            $naslov .= $line;
            # Drop a leading "<nr>." from the title; \Q..\E keeps $nr literal.
            $naslov =~ s/^\s*\Q$nr\E\.*\s*//g;
        }
    }

    close($in);
}
101 |
|
102 |
# Run VACUUM after the delete-and-reinsert cycle performed above.
$dbh->do("vacuum") or die $dbh->errstr();
print "Ukupno $brojeva brojeva NN, sa $zakona zakona...\n" if (! $opts{q});

# Close the connection explicitly instead of leaving it to global destruction.
$dbh->disconnect;
104 |
|