use Encode qw/from_to decode_utf8 decode/;
use Data::Dump qw/dump/;
use DBI;
use URI::Escape;

use strict;
use warnings;

# debug flag, off by default
my $debug = 0;

# DBI data source name of the eprints MySQL database
my $connect = "DBI:mysql:dbname=eprints";

# path to eprints installation
my $eprints_archive = '/data/eprints2/archives/ffzg/documents/disk0/';

# database handle; connection failure is fatal
my $dbh = DBI->connect($connect,"dpavlin","") || die $DBI::errstr;

# Return the 'archive' value of the 'fileinfo' lookup as a list.
#
# The value is fetched with EPrints->lookup, stripped of trailing
# whitespace and split on ';'.  fulltext_content unpacks the first two
# elements as ( $type, $uri ).
#
# Returns an empty list when the lookup yields no value.
sub fulltext {
	my $self = shift;	# unused: the lookup is done on the EPrints class

	my $fulltext = EPrints->lookup( 'fileinfo', 'archive' );
	return unless defined $fulltext;

	# trailing whitespace would otherwise end up in the last field
	$fulltext =~ s/\s+$//;

	return split(/;/, $fulltext);
}

# Extract the text of the fulltext document with pdftotext.
#
# Takes the ( $type, $uri ) pair from EPrints->fulltext, maps the URI onto
# a file below $eprints_archive, runs pdftotext on that file and converts
# its UTF-8 output to iso-8859-2, dropping characters that cannot be
# represented.  Progress is printed to STDOUT.
#
# Returns the converted text as a byte string, or nothing (undef / empty
# list) when there is no URI, the URI carries no numeric ID, or the file
# is not readable.  Dies if pdftotext cannot be started.
sub fulltext_content {
	my $self = shift;	# unused: fulltext is called on the EPrints class

	my ( $type, $uri ) = EPrints->fulltext;
	unless ( defined $uri ) {
		warn "no fulltext uri";
		return;
	}

	# drop scheme and host, then undo URL escaping
	$uri =~ s!https?://[^/]+/!!;
	$uri = uri_unescape($uri);

	unless ( $uri =~ s|^(\d+)/|| ) {
		warn "can't find ID in $uri";
		return;
	}

	# zero-pad the ID to 8 digits and turn every digit pair into a
	# directory level: 123 -> 00/00/01/23/
	my $nr = sprintf("%08d", $1);
	$nr =~ s!(\d\d)!$1/!g;

	my $path = "$eprints_archive/$nr/$uri";
	$path =~ s!//+!/!g;

	unless ( -r $path ) {
		warn "ERROR: $path: $!\n";
		# explicit return, otherwise the true value of warn leaks out
		return;
	}

	print "+ $path ", -s $path, " bytes\n";

	# list-form pipe open: $path never passes through a shell, so spaces
	# and metacharacters in unescaped file names are harmless
	open(my $pdf, '-|', 'pdftotext', $path, '-') || die "can't open pdftotext $path: $!";
	my $utf8 = do { local $/; <$pdf> };
	$utf8 = '' unless defined $utf8;
	# best effort: a failing pdftotext is reported but not fatal
	close($pdf) || warn "pdftotext $path failed (exit status $?)\n";

	# same conversion as `iconv -f utf-8 -t iso-8859-2 -c`: malformed
	# input and unmappable characters are dropped
	my $content = Encode::encode( 'iso-8859-2', Encode::decode( 'UTF-8', $utf8 ), sub { '' } );

	print "\t>>", length( $content ), " text bytes\n";

	return $content;
}
100 |
|
|
101 |
sub _x { |
sub _x { |