nn.old/trunk/find3.pl

#!/usr/bin/perl -w

# indexer, Dobrica Pavlinusic <dpavlin@rot13.org> 2001-01-28
# options:      -q quiet
#               -d debug
#               -v verbose

use strict;
use DBI;
use Getopt::Std;

# Parser state shared by the main loop.
my $sadrzaj=0;          # non-zero while inside a <div class=sadrzaj> ("contents") block
my $nr=0;               # number of the item currently being collected
my $naslov="";          # title text accumulated for the current item

my $br;         ## NN issue number
my $god;        ## NN year
my $aname;      ## anchor name on the original pages

my $nn_dir=".";         # directory holding the wget-fetched files

# -q, -d and -v are plain on/off switches, so they must be parsed with
# getopts(), which stores a true value for every switch that is present.
# getopt() treats each listed letter as an option that takes a value:
# "-v" would then swallow the following argument, or be left undefined
# when it is the last one, and the switch would never take effect.
my %opts;
getopts("vqd", \%opts);

#--------------------------------------------------------------------

# Parallel rule tables filled by load_affix(): element $i of each array
# belongs to the same suffix rule, so all three must always have the same
# number of elements.
my @affix_regexp;       # condition the rebuilt word has to match
my @affix_add;          # text put back after the suffix is removed (or undef)
my @affix_sub;          # suffix removed from the inflected word (or undef)

# Remove every space character from the argument.
# Returns nothing (undef in scalar context) when the argument is undefined,
# empty or "0".
sub nuke_s {
        my $tmp = $_[0];
        return if (!$tmp);
        $tmp =~ s/ +//g;
        return $tmp;
}

# Read the suffix rules of an ispell-style affix file into the rule tables.
#
# Argument: path of the affix file.  Dies if the file cannot be opened.
#
# Everything before the "suffixes" line is ignored.  After it, two rule
# forms are recognised:
#       condition > -X,Y        store X as "add" and Y as "sub" (Y may be "-")
#       condition > Y           store Y as "sub", no "add"
# Every rule is built from fresh variables, so a field missing from one rule
# can no longer inherit the value of the rule before it, and lines that are
# not rules (for example indented comments) are skipped instead of storing
# the previous rule a second time.
sub load_affix {
        my ($filename) = @_;

        my $suffixes=0;

        local $_;
        open(my $fh, '<', $filename) or die "$filename: $!";
        while(<$fh>) {
                chomp;
                next if (/^#|^[\s\t\n\r]*$/);

                if (/^suffixes/i) {
                        $suffixes++;
                        next;
                }

                next if (! $suffixes);

                # a flag header starts a new group and carries no rule itself
                next if (/^flag[\s\t]+\*{0,1}(.):/i);

                my ($regexp,$add,$sub);
                if (/^[\s\t]*([^>#]+)>[\s\t]+-([^\,\s\t]+),([^\s\t]+)/) {
                        ($regexp,$add) = ($1,$2);
                        $sub = $3 if ($3 ne "-");
                } elsif (/^[\s\t]*([^>#]+)>[\s\t]+([^\s\t\#]+)/) {
                        ($regexp,$sub) = ($1,$2);
                } else {
                        next;
                }

                # scalar() turns the empty list nuke_s() returns for a missing
                # field into an undef placeholder, keeping the arrays aligned
                push @affix_regexp,scalar(nuke_s($regexp));
                push @affix_add,scalar(nuke_s($add));
                push @affix_sub,scalar(nuke_s($sub));
        }
        close($fh);
        return;
}

# Expand words with the candidate base forms produced by the affix rules.
#
# Arguments: a list of words.
# Returns: for every input word the word itself, followed by the lower-cased
# form produced by each rule that applies to it.  Words shorter than three
# characters are passed through without expansion.
#
# A rule applies when the word ends in the rule's "sub" suffix (if it has
# one) and the rebuilt form -- suffix removed, "add" appended -- ends in the
# rule's condition.
sub normalize_word {
        my @out;
        foreach my $word (@_) {
                push @out,$word;                # save original word
                next if (length($word) < 3);    # kludge: skip short words
                for my $i (0 .. $#affix_regexp) {
                        my $regexp = $affix_regexp[$i];
                        # a missing field counts as an empty string
                        my $add = defined($affix_add[$i]) ? $affix_add[$i] : "";
                        my $sub = defined($affix_sub[$i]) ? $affix_sub[$i] : "";

                        # An empty pattern would make Perl reuse the last
                        # successfully matched regex, so rules without a
                        # condition are skipped.
                        next if (!defined($regexp) || $regexp eq "");
                        next if length($word) < length($sub);

                        my $tmp_word = $word;
                        if (length($sub)) {
                                # the suffix is literal text, not a pattern
                                next if ($tmp_word !~ s/\Q$sub\E$/$add/i);
                        } else {
                                $tmp_word = $word.$add;
                        }

                        # The condition describes how the base form ends, so
                        # it is anchored to the end of the rebuilt word.
                        if ($tmp_word =~ m/(?:$regexp)$/ix) {
                                print "$word -> $tmp_word\t-$sub, +$add, regexp: $regexp\n" if ($opts{d});
                                push @out,lc($tmp_word);
                        }
                }
        }
        return @out;
}
                

load_affix("$nn_dir/search/croatian.aff");


#--------------------------------------------------------------------
# Rebuild the "nn" table from the downloaded pages.
#
# Every file in $nn_dir whose name starts with "CijeliBrojS" is scanned.
# Between a <div class=sadrzaj> line and the next line containing </div>,
# each line must carry an item link; the link gives the item number and
# anchor, and the text left on the line is collected as the title.  One row
# is inserted when the closing </div> is reached.

# Collect the input files before touching the database, so an unreadable
# directory aborts the run before the table is emptied.
opendir(my $dir, $nn_dir) or die "opendir $nn_dir: $!";
my @files = grep { /^CijeliBrojS/ && -f "$nn_dir/$_" } readdir($dir);
closedir($dir);

my $dbh = DBI->connect("DBI:Pg:dbname=nn","","") || die $DBI::errstr;

# the table is refilled from scratch on every run
$dbh->do("delete from nn") || die $dbh->errstr();

my $sth=$dbh->prepare("insert into nn (br,god,nr,aname,title,title_czs) values (?,?,?,?,?,?)") || die $dbh->errstr();


foreach my $file (@files) {
        # Year and issue number come from the file name.  Without them the
        # rows would silently be stored under the previous file's values.
        if ($file=~m/god=(\d+)\&br=(\d+)/) {
                ($br,$god) = ($2,$1);
                print "$file -- $2 -- $1\n" if (! $opts{q});
        } else {
                warn "skipping $file: no god=...&br=... in file name\n";
                next;
        }

        # the names were read from $nn_dir, so open them there
        open(my $in, '<', "$nn_dir/$file") or die "can't open $file: $!";

        # an unterminated block in one file must not leak into the next
        ($sadrzaj,$nr,$naslov) = (0,0,"");

        while(<$in>) {
                chomp;
                s/\015//g;      # kill cr

                # cp1250 -> iso8859-2.  Only s-caron and z-caron (both cases)
                # have different codes in the two encodings; hex escapes keep
                # the table intact whatever encoding this file is saved in.
                tr/\x9A\x9E\x8A\x8E/\xB9\xBE\xA9\xAE/;

                if (m,<div class=sadrzaj>,) {
                        $sadrzaj++;
                        next;
                }

                if ($sadrzaj && m,</div>,) {
                        $sadrzaj--;
                        $naslov=~s/\s+/ /g;
                        $naslov=~s/<[^>]+>//g;
                        $naslov=~s/^\s+//g;
                        $naslov=~s/\s+$//g;
                        print "$god $br $nr: $naslov\n" if ($opts{v});

                        # Search form of the title: iso8859-2 letters with
                        # diacritics become plain ASCII.  lc() runs after the
                        # transliteration because it does not touch the
                        # high-bit capitals, which would otherwise come out
                        # as upper-case S/D/C/Z.
                        my $naslov_czs = $naslov;
                        $naslov_czs =~ tr/\xB9\xA9\xF0\xD0\xE8\xC8\xE6\xC6\xBE\xAE/sSdDcCcCzZ/;
                        $naslov_czs = lc($naslov_czs);
                        $naslov_czs =~ tr/a-zA-Z/ /cs;  # non a-z  -> space
                        $naslov_czs = join(" ",normalize_word(split(/ /,$naslov_czs)));
                        $sth->execute($br,$god,$nr,$aname,$naslov,$naslov_czs) || die $sth->errstr();
                        $naslov="";
                        $nr=0;
                }

                if ($sadrzaj) {
                        # two link styles occur: an in-page "#anchor" link
                        # followed by "N." and a Javascript:Mojdok(...) call
                        if (s/<a href="#([^"]+)">\s*(\d+)\.\s*<[^>]+>//i) {
                                ($aname,$nr) = ($1,$2);
                        } elsif (s/<a href="Javascript:Mojdok\((\d+),(\d+),(\w+),(\d+)\)[^>]*>//i) {
                                ($nr,$aname) = ($3,$4);
                                # the link repeats year and issue; they must
                                # agree with the file name
                                die "conflict in godina: $1 != $god"  if ($god != $1);
                                die "conflict in broj: $2 != $br"  if ($br != $2);
                        } else {
                                die "can't find nr in line: $_";
                        }
                        $naslov.=$_;
                }

        }

        close($in);
}

$dbh->do("vacuum") || die $dbh->errstr();
$dbh->disconnect();
1	#!/usr/bin/perl -w
2
3	# indexer, Dobrica Pavlinusic <dpavlin@rot13.org> 2001-01-28
4	# options: -q quiet
5	# -d debug
6	# -v verbose
7
8	use strict;
9	use DBI;
10	use Getopt::Std;
11
12	my $sadrzaj=0;
13	my $nr=0;
14	my $naslov="";
15
16	my $br; ## broj NN
17	my $god; ## godina NN
18	my $aname; ## ancor name na originalnim stranicama
19
20	my $nn_dir="."; # dir u kojem su wget-ani fileovi
21
22	my %opts;
23	getopt("vqd", \%opts);
24
25	#--------------------------------------------------------------------
26
27	my @affix_regexp;
28	my @affix_add;
29	my @affix_sub;
30
31	sub load_affix {
32	my ($filename) = @_;
33
34	my $suffixes=0;
35
36	my ($regexp,$add,$sub);
37
38	open (A,$filename) \|\| die "$filename: $!";
39	while(<A>) {
40	chomp;
41	next if (/^#\|^[\s\t\n\r]*$/);
42
43	if (/^suffixes/i) {
44	$suffixes++;
45	next;
46	}
47
48	next if (! $suffixes);
49
50	if (/^flag[\s\t]+\*{0,1}(.):/i) {
51	undef $regexp;
52	undef $add;
53	undef $sub;
54	next;
55	}
56
57	if (/^[\s\t]*([^>#]+)>[\s\t]+-([^\,\s\t]+),([^\s\t]+)/) {
58	$regexp = $1;
59	$add = $2;
60	$sub = $3 if ($3 ne "-");
61	} elsif (/^[\s\t]*([^>#]+)>[\s\t]+([^\s\t\#]+)/) {
62	$regexp = $1;
63	$sub = $2;
64	}
65
66	sub nuke_s {
67	my $tmp = $_[0];
68	return if (!$tmp);
69	$tmp=~s/^ *//g;
70	$tmp=~s/ *$//g;
71	$tmp=~s/ *//g;
72	return $tmp;
73	}
74
75	push @affix_regexp,nuke_s($regexp);
76	push @affix_add,nuke_s($add);
77	push @affix_sub,nuke_s($sub);
78	}
79	}
80
81	sub normalize_word {
82	my @out;
83	foreach my $word (@_) {
84	push @out,$word; # save original word
85	next if (length($word) < 3); # cludge: preskoci kratke
86	for(my $i=0; $i<=$#affix_regexp; $i++) {
87	my $regexp = $affix_regexp[$i];
88	my $add = $affix_add[$i];
89	my $sub = $affix_sub[$i];
90	next if length($word) < length($sub);
91	my $tmp_word = $word;
92	if ($sub) {
93	next if ($word !~ m/$sub$/i);
94	if ($add) {
95	$tmp_word =~ s/$sub$/$add/i;
96	} else {
97	$tmp_word =~ s/$sub$//i;
98	}
99	} else {
100	$tmp_word = $word.$add;
101	}
102	if ($tmp_word =~ m/$regexp/ix) {
103	print "$word -> $tmp_word\t-$sub, +$add, regexp: $regexp\n" if ($opts{d});
104	push @out,lc($tmp_word);
105	}
106	}
107	}
108	return @out;
109	}
110
111
112	load_affix("$nn_dir/search/croatian.aff");
113
114
115	#--------------------------------------------------------------------
116
117	my $dbh = DBI->connect("DBI:Pg:dbname=nn","","") \|\| die $DBI::errstr;
118
119	$dbh->do("delete from nn") \|\| die $dbh->errstr();
120
121	opendir(DIR,$nn_dir) \|\| warn "opendir: $!";
122	my @files = grep { /^CijeliBrojS/ && -f "$nn_dir/$_" } readdir(DIR);
123	closedir(DIR);
124
125	my $sth=$dbh->prepare("insert into nn (br,god,nr,aname,title,title_czs) values (?,?,?,?,?,?)") \|\| die $dbh->errstr();
126
127
128	foreach my $file (@files) {
129	open(IN,$file) \|\| die "can't open $file: $!";
130
131	if ($file=~m/god=(\d+)\&br=(\d+)/) {
132	($br,$god) = ($2,$1);
133	print "$file -- $2 -- $1\n" if (! $opts{q});
134	}
135
136	while(<IN>) {
137	chomp;
138	s/\015//g; # kill cr
139	tr/ðèæÐÈÆ/¹ð¾èæ©Ð®ÈÆ/; # 1250 -> iso8859-2
140
141	if (m,<div class=sadrzaj>,) {
142	$sadrzaj++;
143	next;
144	}
145
146	if ($sadrzaj && m,</div>,) {
147	$sadrzaj--;
148	$naslov=~s/\s+/ /g;
149	$naslov=~s/<[^>]+>//g;
150	$naslov=~s/^\s+//g;
151	$naslov=~s/\s+$//g;
152	print "$god $br $nr: $naslov\n" if ($opts{v});
153	my $naslov_czs = lc($naslov);
154	$naslov_czs =~ tr/¹©ðÐèÈæÆ¾®/sSdDcCcCzZ/;
155	$naslov_czs =~ tr/a-zA-Z/ /cs; # non a-z -> space
156	$naslov_czs = join(" ",normalize_word(split(/ /,$naslov_czs)));
157	$sth->execute($br,$god,$nr,$aname,$naslov,$naslov_czs) \|\| die $dbh->errstr();
158	$naslov="";
159	$nr=0;
160	}
161
162	if ($sadrzaj) {
163	if (s/<a href="#([^"]+)">\s(\d+)\.\s<[^>]+>//i) {
164	($aname,$nr) = ($1,$2);
165	} elsif (s/<a href="Javascript:Mojdok\((\d+),(\d+),(\w+),(\d+)\)[^>]*>//i) {
166	($nr,$aname) = ($3,$4);
167	die "conflict in godina: $1 != $god" if ($god != $1);
168	die "conflict in broj: $2 != $br" if ($br != $2);
169	} else {
170	die "can't find nr in line: $_";
171	}
172	$naslov.=$_;
173	}
174
175	}
176
177	close(IN);
178	}
179
180	$dbh->do("vacuum") \|\| die $dbh->errstr();
Name	Value
cvs2svn:cvs-rev	1.3
svn:executable	*
svn:mime-type	application/octet-stream