--- EPrints/EPrints.pm	2007/06/29 14:52:31	6
+++ EPrints/EPrints.pm	2007/06/30 13:46:51	17
@@ -8,6 +8,11 @@
 use Encode qw/from_to decode_utf8 decode/;
 use Data::Dump qw/dump/;
 use DBI;
+use URI::Escape;
+use Carp qw/confess/;
+
+use lib '/home/dpavlin/stem-hr/';
+use StemHR;
 
 use strict;
 use warnings;
@@ -15,6 +20,8 @@
 my $debug = 0;
 
 my $connect = "DBI:mysql:dbname=eprints";
+# path to eprints installation
+my $eprints_archive = '/data/eprints2/archives/ffzg/documents/disk0/';
 
 my $dbh = DBI->connect($connect,"dpavlin","") || die $DBI::errstr;
 
@@ -37,11 +44,18 @@
 sub lookup {
 	my $self = shift;
 	my $field = shift;
+	my $table = shift;
+	my $where = '';
+
+	if ( ! $table ) {
+		$table = "archive_$field";
+		$where = " and lang = 'hr'";
+	}
 
 	my $sql = qq{
 		SELECT $field
-		FROM archive_$field
-		WHERE eprintid = $id and lang = 'hr'
+		FROM $table
+		WHERE eprintid = $id $where
 	};
 	warn "# sql: $sql\n" if $debug;
 	my @results = map { _x( $_->{$field} ) } @{ $dbh->selectall_arrayref($sql, { Slice => {} }) };
@@ -50,6 +64,44 @@
 	return join(" ", @results);
 }
 
+sub fulltext {
+	my $self = shift;
+	my $fulltext = EPrints->lookup( 'fileinfo', 'archive' );
+	$fulltext =~ s/\s+$//;
+	return split(/;/, $fulltext);
+}
+
+sub fulltext_content {
+	my $self = shift;
+
+	my $path = $eprints_archive;
+
+	my ( $type, $uri ) = EPrints->fulltext;
+	$uri =~ s!http://[^/]+/!!;
+	$uri = uri_unescape($uri);
+	if ( $uri =~ s|^(\d+)/|| ) {
+		my $nr = sprintf("%08d", $1);
+		$nr =~ s!(\d\d)!$1/!g;
+		$path .= "/$nr/$uri";
+	} else {
+		warn "can't find ID in $uri";
+		return;
+	}
+	$path =~ s!//+!/!g;
+	if ( -r $path ) {
+		print "+ $path ", -s $path, " bytes\n";
+		open(my $pdf, "pdftotext $path - | iconv -f utf-8 -t iso-8859-2 -c |") || die "can't open pdftotext $path: $!";
+		local $/;
+		my $content = <$pdf>;
+		print "\t>>", length( $content ), " text bytes\n";
+		close($pdf); # || die "can't close $path: $!";
+		return $content;
+	} else {
+		warn "ERROR: $path: $!\n";
+	}
+
+}
+
 sub _x {
 	my $v = join(" ", @_);
 	decode_utf8( $v );
@@ -59,9 +111,10 @@
 }
 
 sub slogovi {
-	my $text = shift;
+	my $self = shift;
+	my $text = shift || confess "no text?";
 
-	my $count = 2;
+	my $count = 3;
 	my $out = '';
 
 	foreach my $w ( split(/\W*\s+\W*/, $text ) ) {
@@ -81,4 +134,16 @@
 	return $out;
 }
 
+sub stem {
+	my $self = shift;
+	my $text = shift || confess "no text?";
+
+	my $body = '';
+	foreach my $w ( split(/\W*\s+\W*/, $text ) ) {
+		$body .= StemHR->stem( $w ) . ' ';
+	}
+
+	return $body;
+}
+
 1;