8 |
use Encode qw/from_to decode_utf8 decode/; |
use Encode qw/from_to decode_utf8 decode/; |
9 |
use Data::Dump qw/dump/; |
use Data::Dump qw/dump/; |
10 |
use DBI; |
use DBI; |
11 |
|
use URI::Escape; |
12 |
|
use Carp qw/confess/; |
13 |
|
|
14 |
|
use lib '/home/dpavlin/stem-hr/'; |
15 |
|
use StemHR; |
16 |
|
|
17 |
use strict; |
use strict; |
18 |
use warnings; |
use warnings; |
20 |
my $debug = 0; |
my $debug = 0; |
21 |
|
|
22 |
my $connect = "DBI:mysql:dbname=eprints"; |
my $connect = "DBI:mysql:dbname=eprints"; |
23 |
|
# path to eprints installation |
24 |
|
my $eprints_archive = '/data/eprints2/archives/ffzg/documents/disk0/'; |
25 |
|
|
26 |
my $dbh = DBI->connect($connect,"dpavlin","") || die $DBI::errstr; |
my $dbh = DBI->connect($connect,"dpavlin","") || die $DBI::errstr; |
27 |
|
|
64 |
return join(" ", @results); |
return join(" ", @results); |
65 |
} |
} |
66 |
|
|
67 |
|
sub fulltext { |
68 |
|
my $self = shift; |
69 |
|
my $fulltext = EPrints->lookup( 'fileinfo', 'archive' ); |
70 |
|
$fulltext =~ s/\s+$//; |
71 |
|
return split(/;/, $fulltext); |
72 |
|
} |
73 |
|
|
74 |
|
sub fulltext_content { |
75 |
|
my $self = shift; |
76 |
|
|
77 |
|
my $path = $eprints_archive; |
78 |
|
|
79 |
|
my ( $type, $uri ) = EPrints->fulltext; |
80 |
|
$uri =~ s!http://[^/]+/!!; |
81 |
|
$uri = uri_unescape($uri); |
82 |
|
if ( $uri =~ s|^(\d+)/|| ) { |
83 |
|
my $nr = sprintf("%08d", $1); |
84 |
|
$nr =~ s!(\d\d)!$1/!g; |
85 |
|
$path .= "/$nr/$uri"; |
86 |
|
} else { |
87 |
|
warn "can't find ID in $uri"; |
88 |
|
return; |
89 |
|
} |
90 |
|
$path =~ s!//+!/!g; |
91 |
|
if ( -r $path ) { |
92 |
|
print "+ $path ", -s $path, " bytes\n"; |
93 |
|
open(my $pdf, "pdftotext $path - | iconv -f utf-8 -t iso-8859-2 -c |") || die "can't open pdftotext $path: $!"; |
94 |
|
local $/; |
95 |
|
my $content = <$pdf>; |
96 |
|
print "\t>>", length( $content ), " text bytes\n"; |
97 |
|
close($pdf); # || die "can't close $path: $!"; |
98 |
|
return $content; |
99 |
|
} else { |
100 |
|
warn "ERROR: $path: $!\n"; |
101 |
|
} |
102 |
|
|
103 |
|
} |
104 |
|
|
105 |
sub _x { |
sub _x { |
106 |
my $v = join(" ", @_); |
my $v = join(" ", @_); |
107 |
decode_utf8( $v ); |
decode_utf8( $v ); |
111 |
} |
} |
112 |
|
|
113 |
sub slogovi { |
sub slogovi { |
114 |
my $text = shift; |
my $self = shift; |
115 |
|
my $text = shift || confess "no text?"; |
116 |
|
|
117 |
my $count = 2; |
my $count = 3; |
118 |
my $out = ''; |
my $out = ''; |
119 |
|
|
120 |
foreach my $w ( split(/\W*\s+\W*/, $text ) ) { |
foreach my $w ( split(/\W*\s+\W*/, $text ) ) { |
134 |
return $out; |
return $out; |
135 |
} |
} |
136 |
|
|
137 |
|
sub stem { |
138 |
|
my $self = shift; |
139 |
|
my $text = shift || confess "no text?"; |
140 |
|
|
141 |
|
my $body = ''; |
142 |
|
foreach my $w ( split(/\W*\s+\W*/, $text ) ) { |
143 |
|
$body .= StemHR->stem( $w ) . ' '; |
144 |
|
} |
145 |
|
|
146 |
|
return $body; |
147 |
|
} |
148 |
|
|
149 |
1; |
1; |