--- EPrints/EPrints.pm	2007/06/29 16:58:42	11
+++ EPrints/EPrints.pm	2007/06/30 13:46:51	17
@@ -8,6 +8,11 @@
 use Encode qw/from_to decode_utf8 decode/;
 use Data::Dump qw/dump/;
 use DBI;
+use URI::Escape;
+use Carp qw/confess/;
+
+use lib '/home/dpavlin/stem-hr/';
+use StemHR;
 
 use strict;
 use warnings;
@@ -15,6 +20,8 @@
 my $debug = 0;
 
 my $connect = "DBI:mysql:dbname=eprints";
+# path to eprints installation
+my $eprints_archive = '/data/eprints2/archives/ffzg/documents/disk0/';
 
 my $dbh = DBI->connect($connect,"dpavlin","") || die $DBI::errstr;
 
@@ -57,6 +64,58 @@
 	return join(" ", @results);
 }
 
+# Split the 'fileinfo' lookup result on ';' and return the fields as a list.
+sub fulltext {
+	my $self = shift;
+	my $fulltext = EPrints->lookup( 'fileinfo', 'archive' );
+	$fulltext =~ s/\s+$//;
+	return split(/;/, $fulltext);
+}
+
+# Locate the document referenced by EPrints->fulltext below $eprints_archive
+# and return its text as produced by pdftotext, recoded to ISO-8859-2.
+# Returns nothing when no URI is available, the URI carries no numeric ID,
+# or the file is not readable.
+sub fulltext_content {
+	my $self = shift;
+
+	my $path = $eprints_archive;
+
+	my ( $type, $uri ) = EPrints->fulltext;
+	unless ( defined $uri ) {
+		warn "no fulltext URI";
+		return;
+	}
+	$uri =~ s!http://[^/]+/!!;
+	$uri = uri_unescape($uri);
+	if ( $uri =~ s|^(\d+)/|| ) {
+		# numeric ID, zero-padded to 8 digits, becomes two-digit directories
+		my $nr = sprintf("%08d", $1);
+		$nr =~ s!(\d\d)!$1/!g;
+		$path .= "/$nr/$uri";
+	} else {
+		warn "can't find ID in $uri";
+		return;
+	}
+	$path =~ s!//+!/!g;
+	if ( -r $path ) {
+		print "+ $path ", -s $path, " bytes\n";
+		# $path is built from an unescaped URI: quote it for the shell
+		( my $quoted = $path ) =~ s/'/'\\''/g;
+		open(my $pdf, '-|', "pdftotext '$quoted' - | iconv -f utf-8 -t iso-8859-2 -c") || die "can't open pdftotext $path: $!";
+		local $/;
+		my $content = <$pdf>;
+		print "\t>>", length( $content ), " text bytes\n";
+		close($pdf);		# || die "can't close $path: $!";
+		return $content;
+	} else {
+		warn "ERROR: $path: $!\n";
+		# explicit return: otherwise the sub would yield warn's true value
+		return;
+	}
+
+}
+
 sub _x {
 	my $v = join(" ", @_);
 	decode_utf8( $v );
@@ -66,9 +125,10 @@
 }
 
 sub slogovi {
-	my $text = shift;
+	my $self = shift;
+	my $text = shift || confess "no text?";
 
-	my $count = 2;
+	my $count = 3;
 	my $out = '';
 
 	foreach my $w ( split(/\W*\s+\W*/, $text ) ) {
@@ -88,4 +148,18 @@
 	return $out;
 }
 
+# Split $text on whitespace (with adjoining non-word characters), pass each
+# piece to StemHR->stem and return the results, each followed by a space.
+sub stem {
+	my $self = shift;
+	my $text = shift || confess "no text?";
+
+	my $body = '';
+	foreach my $w ( split(/\W*\s+\W*/, $text ) ) {
+		$body .= StemHR->stem( $w ) . ' ';
+	}
+
+	return $body;
+}
+
 1;