Patch for EPrints/index.pl (revision 13 -> 18).

Summary of what the patch does:
  * drops the StemHR dependency; stemming now goes through EPrints->stem
  * restricts the indexed rows to lang = 'hr'
  * builds the indexed body per part (title, keywords, abstract), each with
    a weight, controlled by the $use feature switches
  * skips records that lack any of the three required parts
  * feeds every indexed record to a KinoSearch::Simple index as well

--- EPrints/index.pl	2007/06/29 18:46:45	13
+++ EPrints/index.pl	2007/07/02 12:55:49	18
@@ -5,11 +5,15 @@
 
 use Data::Dump qw/dump/;
 use EPrints qw/_x/;
-
-use lib '/home/dpavlin/stem-hr/';
-use StemHR;
+use KinoSearch::Simple;
 
 my $debug = shift @ARGV;
+# Feature switches for building the indexed body text (true enables).
+my $use = {
+	score => 1,	# repeat each part's text as many times as its weight
+	stem => 1,	# replace each part's text with the output of EPrints->stem
+	slogovi => 1,	# also append the output of EPrints->slogovi for each part
+};
 
 my $dbh = EPrints->dbh;
 my $sth = $dbh->prepare(qq{
@@ -17,6 +21,8 @@
 	archive_title.eprintid as id,
 	title
 FROM
 	archive_title
+WHERE
+	lang = 'hr'
 }) || die $dbh->errstr();
 $sth->execute() || die $sth->errstr();
@@ -36,28 +42,72 @@
 
 my $total = 0;
 
+# Second index, filled with the same records that go to $indexer.
+my $kino = KinoSearch::Simple->new(
+	path => 'kinoindex/',
+	language => 'ru',	# NOTE(review): rows are selected with lang = 'hr' -- confirm 'ru' is intended
+);
+
 while (my $row = $sth->fetchrow_hashref ) {
-	EPrints->id( $row->{id} );
-	my ( $title, $keywords, $abstract ) = (
-		_x( $row->{title} ),
-		EPrints->lookup( 'keywords' ),
-		EPrints->lookup( 'abstract' )
-	);
-	my @body = split( /\W*\s+\W*/, "$title $title $title $keywords $keywords $abstract" );
-	my $body = '';
-	foreach my $word ( @body ) {
-		$body .= StemHR->stem( $word ) . ' ';
+	my $id = $row->{id};
+	EPrints->id( $id );
+
+	# Each part is [ text, weight ]; the weight is the repeat count used when $use->{score} is set.
+	my $parts = {
+		title => [ _x( $row->{title} ), 3 ],
+		keywords => [ EPrints->lookup( 'keywords' ), 2 ],
+		abstract => [ EPrints->lookup( 'abstract' ), 1 ],
+#		content => [ EPrints->fulltext_content, 1 ],
+	};
+
+	# Title, keywords and abstract are all required; skip the record if any is missing or empty.
+	my $skip = 0;
+	foreach my $part ( qw/title keywords abstract/ ) {
+		if ( ! $parts->{$part}->[0] ) {
+			warn "skipped $id doesn't have required part $part\n";
+			$skip = 1;
+			last;
+		}
 	}
+	next if $skip;
+
+	my $body = '';
 
-	$body .= EPrints::slogovi( "$title $keywords $abstract" );
+	foreach my $part ( qw/title keywords abstract content/ ) {
+		my $content = $parts->{$part}->[0] || next;
 
-	warn "body: $body\n" if $debug;
+		if ( $use->{slogovi} ) {
+			$body .= ' ' . EPrints->slogovi( $content );
+		}
+
+		if ( $use->{stem} ) {
+			my $stem = EPrints->stem( $content );
+			warn "stem of '$content' didn't return anything\n" unless $stem;
+			# Keep the unstemmed text when stemming fails, so the part is not lost from the index.
+			$content = $stem if $stem;
+		}
+
+
+		if ( $use->{score} ) {
+			$body .= "$content " x $parts->{$part}->[1];	# weight = number of repetitions
+		} else {
+			$body .= "$content ";
+		}
 
-	$body .= EPrints->fulltext_content;
+		warn ">>> $body <<<\n" if $debug;
+	}
 
-	$indexer->index( $row->{id}, join(" ", @body, $body ) );
+	$indexer->index( $row->{id}, $body );
+	# KinoSearch gets the plain text of each field, not the [ text, weight ] pair.
+	$kino->add_doc({
+		id => $id,
+		title => $parts->{title}->[0],
+		keywords => $parts->{keywords}->[0],
+		abstract => $parts->{abstract}->[0],
+	});
+
 	$total++;
 
-	print STDERR _x( $row->{id}, " ", $row->{title} ), "\n";
+	print STDERR "$total: ", $row->{id}, " ", _x( $row->{title} ), " - ", length($body), " bytes\n";
 
 }