1 |
# Dobrica Pavlinusic, <dpavlin@rot13.org> 06/28/07 23:28:21 CEST |
2 |
|
3 |
package EPrints; |
4 |
|
5 |
use Exporter 'import'; |
6 |
@EXPORT_OK = qw(_x slogovi); |
7 |
|
8 |
use Encode qw/from_to decode_utf8 decode/; |
9 |
use Data::Dump qw/dump/; |
10 |
use DBI; |
11 |
use URI::Escape; |
12 |
|
13 |
use strict; |
14 |
use warnings; |
15 |
|
16 |
# set to a true value to get diagnostic messages via warn()
my $debug = 0;

# DBI data source name used for the database connection below
my $connect = 'DBI:mysql:dbname=eprints';

# path to eprints installation (document files are looked up below it)
my $eprints_archive = '/data/eprints2/archives/ffzg/documents/disk0/';

# one database handle shared by the whole module; it is opened as soon as
# the module is loaded and loading fails if the connection cannot be made
my $dbh = DBI->connect( $connect, 'dpavlin', '' )
    or die $DBI::errstr;
23 |
|
24 |
# Accessor for the module-wide database handle.
# The invocant (class name or object) is accepted but not used.
sub dbh {
    return $dbh;
}
28 |
|
29 |
# eprint id that lookup() works on; module-wide state
my $id;

# Combined getter/setter for the current eprint id.
#
#   EPrints->id( 42 );    # select eprint 42, returns 42
#   EPrints->id;          # returns the currently selected id
#
# An undefined argument leaves the stored id untouched.
sub id {
    my ( $class, $new_id ) = @_;

    return $id unless defined $new_id;

    $id = $new_id;
    warn "# id = $id\n" if $debug;
    return $id;
}
39 |
|
40 |
# Fetch one column for the eprint selected with id().
#
#   EPrints->lookup( $field );            # reads table "archive_$field",
#                                         # only rows with lang = 'hr'
#   EPrints->lookup( $field, $table );    # reads $table, no lang filter
#
# Every matching value is passed through _x() and the results are joined
# with a single space; the joined string is returned (empty string when
# no row matches).
#
# Dies when no eprint id has been set or when the query fails.
sub lookup {
    my $self  = shift;
    my $field = shift;
    my $table = shift;
    my $where = '';

    die "lookup( $field ): no eprint id set, call id() first\n"
        unless defined $id;

    if ( !$table ) {
        $table = "archive_$field";
        $where = " and lang = 'hr'";
    }

    # $field and $table are SQL identifiers and cannot be bound, so they
    # are still interpolated; the id is a value and goes in as a placeholder.
    my $sql = qq{
        SELECT $field
        FROM $table
        WHERE eprintid = ? $where
    };
    warn "# sql: $sql\n" if $debug;

    # selectall_arrayref returns undef on failure; report the database error
    # instead of failing later on an undefined array reference
    my $rows = $dbh->selectall_arrayref( $sql, { Slice => {} }, $id )
        or die "lookup( $field, $id ) failed: ", $dbh->errstr, "\n";

    # NULL columns come back as undef; treat them as empty strings
    my @results =
        map { _x( defined $_->{$field} ? $_->{$field} : '' ) } @$rows;

    warn "# loookup( $field, $id ) = ", Data::Dump::dump( @results ), $/ if $debug;
    return join( " ", @results );
}
62 |
|
63 |
# Return the parts of the "fileinfo" column of the "archive" table for the
# current eprint: trailing whitespace is removed and the value is split
# on ';'. fulltext_content() in this module reads the first two parts as
# type and URI.
sub fulltext {
    my ($class) = @_;

    ( my $fileinfo = EPrints->lookup( 'fileinfo', 'archive' ) ) =~ s/\s+$//;

    return split /;/, $fileinfo;
}
69 |
|
70 |
# Return the text of the current eprint's full-text document.
#
# The URI reported by fulltext() is mapped to a file below $eprints_archive
# (the leading numeric id becomes an eight-digit, two-digits-per-directory
# path), the file is run through pdftotext and the output is converted
# from UTF-8 to ISO-8859-2 with iconv.
#
# Returns the converted text, or nothing (undef / empty list) when the
# URI is missing, carries no numeric id, or the file is not readable.
# Progress is printed to STDOUT. Dies if the pdftotext pipe cannot be
# started.
sub fulltext_content {
    my $self = shift;

    my ( $type, $uri ) = EPrints->fulltext;
    unless ( defined $uri ) {
        warn "can't find fulltext URI\n";
        return;
    }

    # strip scheme and host, then undo URI escaping of the remaining path
    $uri =~ s!http://[^/]+/!!;
    $uri = uri_unescape($uri);

    my $path = $eprints_archive;
    if ( $uri =~ s|^(\d+)/|| ) {

        # id 1234 is stored under 00/00/12/34/
        my $nr = sprintf( "%08d", $1 );
        $nr =~ s!(\d\d)!$1/!g;
        $path .= "/$nr/$uri";
    }
    else {
        warn "can't find ID in $uri";
        return;
    }
    $path =~ s!//+!/!g;

    unless ( -r $path ) {
        warn "ERROR: $path: $!\n";

        # explicit return: falling off the end would hand warn()'s true
        # return value back to the caller as if it were document text
        return;
    }

    print "+ $path ", -s $path, " bytes\n";

    # The command goes through the shell because of the pipe to iconv, so
    # the path is single-quoted (embedded quotes escaped) to survive spaces
    # and shell metacharacters that uri_unescape may have produced.
    ( my $quoted = $path ) =~ s/'/'\\''/g;
    open( my $pdf, '-|',
        "pdftotext '$quoted' - | iconv -f utf-8 -t iso-8859-2 -c" )
        or die "can't open pdftotext $path: $!";

    # slurp the whole output; $/ is only undefined inside the do block
    my $content = do { local $/; <$pdf> };
    $content = '' unless defined $content;
    print "\t>>", length($content), " text bytes\n";

    # Exit status of the pipeline is not checked, as in the original code.
    # NOTE(review): presumably because iconv -c exits non-zero whenever it
    # drops a character - confirm before turning this into an error.
    close($pdf);

    return $content;
}
100 |
|
101 |
# Join all arguments with a single space, convert the resulting bytes in
# place from UTF-8 to ISO-8859-2 and return them followed by one trailing
# space. Undefined arguments are treated as empty strings.
sub _x {
    my $v = join( " ", map { defined $_ ? $_ : '' } @_ );

    # from_to() recodes $v in place. The former decode_utf8( $v ) call in
    # void context was dropped: its result was discarded and, without a
    # CHECK argument, it does not modify its argument.
    from_to( $v, 'utf-8', 'iso-8859-2' );

    warn "_x($v)\n" if $debug;
    return "$v ";
}
108 |
|
109 |
# Break a text into overlapping two-syllable chunks.
#
# The text is split into words on whitespace (together with any non-word
# characters next to it). Each word is cut into pieces that end with one
# of the vowels a, e, i, o, u (case-insensitive); whatever is left after
# the last vowel is a final piece. Every run of two adjacent pieces of
# a word is appended to the result, followed by a space. Words with
# fewer than two pieces contribute nothing.
#
#   slogovi("banana")    # "bana nana "
#
# Returns the resulting string ('' for undefined or empty input).
#
# NOTE(review): \W is applied without locale or Unicode semantics here;
# check how non-ASCII letters at word edges are treated for the encoding
# callers pass in.
sub slogovi {
    my $text = shift;
    return '' unless defined $text;

    my $count = 2;     # number of pieces per emitted chunk
    my $out   = '';

    foreach my $w ( split( /\W*\s+\W*/, $text ) ) {
        warn "w: $w\n" if $debug;

        my @s;
        while ( $w =~ s/^([^aeiou]*[aeiou])//i ) {
            push @s, $1;
        }

        # keep the remainder after the last vowel; test the length so that
        # a remainder of "0" is not thrown away as false
        push @s, $w if length $w;
        warn "slogovi = ", Data::Dump::dump( @s ), $/ if $debug;

        # sliding window of $count pieces over the word
        foreach my $p ( 0 .. ( $#s - $count + 1 ) ) {
            $out .= join( '', @s[ $p .. $p + $count - 1 ] ) . ' ';
        }
    }

    warn "$out\n" if $debug;
    return $out;
}
131 |
|
132 |
1; |