--- find3.pl 2002/02/05 12:58:30 1.2 +++ find3.pl 2002/02/05 14:31:49 1.4 @@ -1,7 +1,13 @@ #!/usr/bin/perl -w +# indexer, Dobrica Pavlinusic 2001-01-28 +# options: -q quiet +# -d debug +# -v verbose + use strict; use DBI; +use Getopt::Std; my $sadrzaj=0; my $nr=0; @@ -13,6 +19,11 @@ my $nn_dir="."; # dir u kojem su wget-ani fileovi +my %opts; +getopts("vqd", \%opts); + +my $brojeva=0; +my $zakona=0; #-------------------------------------------------------------------- @@ -92,7 +103,7 @@ $tmp_word = $word.$add; } if ($tmp_word =~ m/$regexp/ix) { -# print "$word -> $tmp_word\t-$sub, +$add, regexp: $regexp\n"; + print "$word -> $tmp_word\t-$sub, +$add, regexp: $regexp\n" if ($opts{d}); push @out,lc($tmp_word); } } @@ -114,12 +125,16 @@ my @files = grep { /^CijeliBrojS/ && -f "$nn_dir/$_" } readdir(DIR); closedir(DIR); +my $sth=$dbh->prepare("insert into nn (br,god,nr,aname,title,title_czs) values (?,?,?,?,?,?)") || die $dbh->errstr(); + + foreach my $file (@files) { open(IN,$file) || die "can't open $file: $!"; if ($file=~m/god=(\d+)\&br=(\d+)/) { ($br,$god) = ($2,$1); - print "$file -- $2 -- $1\n"; + print "$file -- $2 -- $1\n" if (! $opts{q}); + $brojeva++; } while() { @@ -138,20 +153,21 @@ $naslov=~s/<[^>]+>//g; $naslov=~s/^\s+//g; $naslov=~s/\s+$//g; - print "$god $br $nr: $naslov\n"; + print "$god $br $nr: $naslov\n" if ($opts{v}); my $naslov_czs = lc($naslov); $naslov_czs =~ tr/¹©ðÐèÈæƾ®/sSdDcCcCzZ/; $naslov_czs =~ tr/a-zA-Z/ /cs; # non a-z -> space $naslov_czs = join(" ",normalize_word(split(/ /,$naslov_czs))); - $dbh->do("insert into nn (br,god,nr,aname,title,title_czs) values ($br,$god,$nr,'$aname','$naslov','$naslov_czs')") || die $dbh->errstr(); + $sth->execute($br,$god,$nr,$aname,$naslov,$naslov_czs) || die $dbh->errstr(); $naslov=""; $nr=0; + $zakona++; } if ($sadrzaj) { if (s/\s*(\d+)\.\s*<[^>]+>//i) { ($aname,$nr) = ($1,$2); - } elsif (s/]*>//i) { ($nr,$aname) = ($3,$4); die "conflict in godina: $1 != $god" if ($god != $1); die "conflict in broj: $2 != $br" if ($br != $2); @@ -159,6 +175,7 @@ die "can't find nr in line: $_"; } $naslov.=$_; + $naslov=~s/^\s*$nr\.*\s*//g; } } @@ -167,3 +184,4 @@ } $dbh->do("vacuum") || die $dbh->errstr(); +print "Ukupno $brojeva brojeva NN, sa $zakona zakona...\n" if (! $opts{q});