viewvc.cgi/nn/find3.pl

#!/usr/bin/perl -w

# indexer, Dobrica Pavlinusic <dpavlin@rot13.org> 2002-01-28
# options:      -q quiet
#               -d debug
#               -v verbose

use strict;
use DBI;
use Getopt::Std;

my $sadrzaj=0;
my $nr=0;
my $naslov="";

my $br;         ## broj NN
my $god;        ## godina NN
my $aname;      ## ancor name na originalnim stranicama

my $nn_dir=".";         # dir u kojem su wget-ani fileovi

my %opts;
getopts("vqd", \%opts);

my $brojeva=0;
my $zakona=0;

#--------------------------------------------------------------------

my @affix_regexp;
my @affix_add;
my @affix_sub;

sub load_affix {
        my ($filename) = @_;

        my $suffixes=0;

        my ($regexp,$add,$sub);

        open (A,$filename) || die "$filename: $!";
        while(<A>) {
                chomp;
                next if (/^#|^[\s\t\n\r]*$/);

                if (/^suffixes/i) {
                        $suffixes++;
                        next;
                }

                next if (! $suffixes);

                if (/^flag[\s\t]+\*{0,1}(.):/i) {
                        undef $regexp;
                        undef $add;
                        undef $sub;
                        next;
                }

                if (/^[\s\t]*([^>#]+)>[\s\t]+-([^\,\s\t]+),([^\s\t]+)/) {
                        $regexp = $1;
                        $add = $2;
                        $sub = $3 if ($3 ne "-");
                } elsif (/^[\s\t]*([^>#]+)>[\s\t]+([^\s\t\#]+)/) {
                        $regexp = $1;
                        $sub = $2;
                }

                sub nuke_s {
                        my $tmp = $_[0];
                        return if (!$tmp);
                        $tmp=~s/^ *//g;
                        $tmp=~s/ *$//g;
                        $tmp=~s/ *//g;
                        return $tmp;
                }

                push @affix_regexp,nuke_s($regexp);
                push @affix_add,nuke_s($add);
                push @affix_sub,nuke_s($sub);
        }
}

sub normalize_word {
        my @out;
        foreach my $word (@_) {
                push @out,$word;                # save original word
                next if (length($word) < 3);    # cludge: preskoci kratke
                for(my $i=0; $i<=$#affix_regexp; $i++) {
                        my $regexp = $affix_regexp[$i];
                        my $add = $affix_add[$i];
                        my $sub = $affix_sub[$i];
                        next if length($word) < length($sub);
                        my $tmp_word = $word;
                        if ($sub) {
                                next if ($word !~ m/$sub$/i);
                                if ($add) {
                                        $tmp_word =~ s/$sub$/$add/i;
                                } else {
                                        $tmp_word =~ s/$sub$//i;
                                }
                        } else {
                                $tmp_word = $word.$add;
                        }
                        if ($tmp_word =~ m/$regexp/ix) {
                                print "$word -> $tmp_word\t-$sub, +$add, regexp: $regexp\n" if ($opts{d});
                                push @out,lc($tmp_word);
                        }
                }
        }
        return @out;
}
                

load_affix("$nn_dir/search/croatian.aff");


#--------------------------------------------------------------------

my $dbh = DBI->connect("DBI:Pg:dbname=nn","","") || die $DBI::errstr;

$dbh->do("delete from nn") || die $dbh->errstr();

opendir(DIR,$nn_dir) || warn "opendir: $!";
my @files = grep { /^CijeliBrojS/ && -f "$nn_dir/$_" } readdir(DIR);
closedir(DIR);

my $sth=$dbh->prepare("insert into nn (br,god,nr,aname,title,title_czs) values (?,?,?,?,?,?)") || die $dbh->errstr();


foreach my $file (@files) {
        open(IN,$file) || die "can't open $file: $!";

        if ($file=~m/god=(\d+)\&br=(\d+)/) {
                ($br,$god) = ($2,$1);
                print "$file -- $2 -- $1\n" if (! $opts{q});
                $brojeva++;
        }

        while(<IN>) {
                chomp;
                s/\015//g;      # kill cr
                tr/ðèæÐÈÆ/¹ð¾èæ©Ð®ÈÆ/;      # 1250 -> iso8859-2

                if (m,<div class=sadrzaj>,) {
                        $sadrzaj++;
                        next;
                }

                if ($sadrzaj && m,</div>,) {
                        $sadrzaj--;
                        $naslov=~s/\s+/ /g;
                        $naslov=~s/<[^>]+>//g;
                        $naslov=~s/^\s+//g;
                        $naslov=~s/\s+$//g;
                        print "$god $br $nr: $naslov\n" if ($opts{v});
                        my $naslov_czs = lc($naslov);
                        $naslov_czs =~ tr/¹©ðÐèÈæÆ¾®/sSdDcCcCzZ/;
                        $naslov_czs =~ tr/a-zA-Z/ /cs;  # non a-z  -> space
                        $naslov_czs = join(" ",normalize_word(split(/ /,$naslov_czs)));
                        $sth->execute($br,$god,$nr,$aname,$naslov,$naslov_czs) || die $dbh->errstr();
                        $naslov="";
                        $nr=0;
                        $zakona++;
                }

                if ($sadrzaj) {
                        if (s/<a href="#([^"]+)">\s*(\d+)\.\s*<[^>]+>//i) {
                                ($aname,$nr) = ($1,$2);
                        } elsif (s/<a href="Javascript:Mojdok\((\d+),(\d+),(\w+),(\d+)\)[^>]*>//i) {
                                ($nr,$aname) = ($3,$4);
                                die "conflict in godina: $1 != $god"  if ($god != $1);
                                die "conflict in broj: $2 != $br"  if ($br != $2);
                        } else {
                                die "can't find nr in line: $_";
                        }
                        $naslov.=$_;
                        $naslov=~s/^\s*$nr\.*\s*//g;
                }
                
        }

        close(IN);
}

$dbh->do("vacuum") || die $dbh->errstr();
print "Ukupno $brojeva brojeva NN, sa $zakona zakona...\n" if (! $opts{q});
1	#!/usr/bin/perl -w
2
3	# indexer, Dobrica Pavlinusic <dpavlin@rot13.org> 2002-01-28
4	# options: -q quiet
5	# -d debug
6	# -v verbose
7
8	use strict;
9	use DBI;
10	use Getopt::Std;
11
12	my $sadrzaj=0;
13	my $nr=0;
14	my $naslov="";
15
16	my $br; ## broj NN
17	my $god; ## godina NN
18	my $aname; ## ancor name na originalnim stranicama
19
20	my $nn_dir="."; # dir u kojem su wget-ani fileovi
21
22	my %opts;
23	getopts("vqd", \%opts);
24
25	my $brojeva=0;
26	my $zakona=0;
27
28	#--------------------------------------------------------------------
29
30	my @affix_regexp;
31	my @affix_add;
32	my @affix_sub;
33
34	sub load_affix {
35	my ($filename) = @_;
36
37	my $suffixes=0;
38
39	my ($regexp,$add,$sub);
40
41	open (A,$filename) \|\| die "$filename: $!";
42	while(<A>) {
43	chomp;
44	next if (/^#\|^[\s\t\n\r]*$/);
45
46	if (/^suffixes/i) {
47	$suffixes++;
48	next;
49	}
50
51	next if (! $suffixes);
52
53	if (/^flag[\s\t]+\*{0,1}(.):/i) {
54	undef $regexp;
55	undef $add;
56	undef $sub;
57	next;
58	}
59
60	if (/^[\s\t]*([^>#]+)>[\s\t]+-([^\,\s\t]+),([^\s\t]+)/) {
61	$regexp = $1;
62	$add = $2;
63	$sub = $3 if ($3 ne "-");
64	} elsif (/^[\s\t]*([^>#]+)>[\s\t]+([^\s\t\#]+)/) {
65	$regexp = $1;
66	$sub = $2;
67	}
68
69	sub nuke_s {
70	my $tmp = $_[0];
71	return if (!$tmp);
72	$tmp=~s/^ *//g;
73	$tmp=~s/ *$//g;
74	$tmp=~s/ *//g;
75	return $tmp;
76	}
77
78	push @affix_regexp,nuke_s($regexp);
79	push @affix_add,nuke_s($add);
80	push @affix_sub,nuke_s($sub);
81	}
82	}
83
84	sub normalize_word {
85	my @out;
86	foreach my $word (@_) {
87	push @out,$word; # save original word
88	next if (length($word) < 3); # cludge: preskoci kratke
89	for(my $i=0; $i<=$#affix_regexp; $i++) {
90	my $regexp = $affix_regexp[$i];
91	my $add = $affix_add[$i];
92	my $sub = $affix_sub[$i];
93	next if length($word) < length($sub);
94	my $tmp_word = $word;
95	if ($sub) {
96	next if ($word !~ m/$sub$/i);
97	if ($add) {
98	$tmp_word =~ s/$sub$/$add/i;
99	} else {
100	$tmp_word =~ s/$sub$//i;
101	}
102	} else {
103	$tmp_word = $word.$add;
104	}
105	if ($tmp_word =~ m/$regexp/ix) {
106	print "$word -> $tmp_word\t-$sub, +$add, regexp: $regexp\n" if ($opts{d});
107	push @out,lc($tmp_word);
108	}
109	}
110	}
111	return @out;
112	}
113
114
115	load_affix("$nn_dir/search/croatian.aff");
116
117
118	#--------------------------------------------------------------------
119
120	my $dbh = DBI->connect("DBI:Pg:dbname=nn","","") \|\| die $DBI::errstr;
121
122	$dbh->do("delete from nn") \|\| die $dbh->errstr();
123
124	opendir(DIR,$nn_dir) \|\| warn "opendir: $!";
125	my @files = grep { /^CijeliBrojS/ && -f "$nn_dir/$_" } readdir(DIR);
126	closedir(DIR);
127
128	my $sth=$dbh->prepare("insert into nn (br,god,nr,aname,title,title_czs) values (?,?,?,?,?,?)") \|\| die $dbh->errstr();
129
130
131	foreach my $file (@files) {
132	open(IN,$file) \|\| die "can't open $file: $!";
133
134	if ($file=~m/god=(\d+)\&br=(\d+)/) {
135	($br,$god) = ($2,$1);
136	print "$file -- $2 -- $1\n" if (! $opts{q});
137	$brojeva++;
138	}
139
140	while(<IN>) {
141	chomp;
142	s/\015//g; # kill cr
143	tr/ðèæÐÈÆ/¹ð¾èæ©Ð®ÈÆ/; # 1250 -> iso8859-2
144
145	if (m,<div class=sadrzaj>,) {
146	$sadrzaj++;
147	next;
148	}
149
150	if ($sadrzaj && m,</div>,) {
151	$sadrzaj--;
152	$naslov=~s/\s+/ /g;
153	$naslov=~s/<[^>]+>//g;
154	$naslov=~s/^\s+//g;
155	$naslov=~s/\s+$//g;
156	print "$god $br $nr: $naslov\n" if ($opts{v});
157	my $naslov_czs = lc($naslov);
158	$naslov_czs =~ tr/¹©ðÐèÈæÆ¾®/sSdDcCcCzZ/;
159	$naslov_czs =~ tr/a-zA-Z/ /cs; # non a-z -> space
160	$naslov_czs = join(" ",normalize_word(split(/ /,$naslov_czs)));
161	$sth->execute($br,$god,$nr,$aname,$naslov,$naslov_czs) \|\| die $dbh->errstr();
162	$naslov="";
163	$nr=0;
164	$zakona++;
165	}
166
167	if ($sadrzaj) {
168	if (s/<a href="#([^"]+)">\s(\d+)\.\s<[^>]+>//i) {
169	($aname,$nr) = ($1,$2);
170	} elsif (s/<a href="Javascript:Mojdok\((\d+),(\d+),(\w+),(\d+)\)[^>]*>//i) {
171	($nr,$aname) = ($3,$4);
172	die "conflict in godina: $1 != $god" if ($god != $1);
173	die "conflict in broj: $2 != $br" if ($br != $2);
174	} else {
175	die "can't find nr in line: $_";
176	}
177	$naslov.=$_;
178	$naslov=~s/^\s$nr\.\s*//g;
179	}
180
181	}
182
183	close(IN);
184	}
185
186	$dbh->do("vacuum") \|\| die $dbh->errstr();
187	print "Ukupno $brojeva brojeva NN, sa $zakona zakona...\n" if (! $opts{q});