1 |
dpavlin |
1 |
# Dobrica Pavlinusic, <dpavlin@rot13.org> 06/28/07 23:28:21 CEST |
2 |
|
|
|
3 |
|
|
package EPrints; |
4 |
|
|
|
5 |
|
|
use Exporter 'import'; |
6 |
dpavlin |
4 |
@EXPORT_OK = qw(_x slogovi); |
7 |
dpavlin |
1 |
|
8 |
|
|
use Encode qw/from_to decode_utf8 decode/; |
9 |
|
|
use Data::Dump qw/dump/; |
10 |
dpavlin |
3 |
use DBI; |
11 |
dpavlin |
13 |
use URI::Escape; |
12 |
dpavlin |
1 |
|
13 |
|
|
use strict; |
14 |
|
|
use warnings; |
15 |
|
|
|
16 |
|
|
# set to a true value to get diagnostic messages through warn
my $debug = 0;

# DBI data source used for the connection below
my $connect = "DBI:mysql:dbname=eprints";
# path to eprints installation
my $eprints_archive = '/data/eprints2/archives/ffzg/documents/disk0/';

# One shared handle, opened as soon as this file is compiled; loading the
# module dies if the database is unreachable. The data source is included
# in the message so the failing connection can be identified.
my $dbh = DBI->connect($connect,"dpavlin","")
    || die "can't connect to $connect: $DBI::errstr";
23 |
|
|
|
24 |
|
|
# Accessor for the file-level database handle.
# Called as EPrints->dbh; the invocant is accepted but not used.
# Returns the handle stored in $dbh.
sub dbh {
    my ( $invocant ) = @_;
    return $dbh;
}
28 |
|
|
|
29 |
|
|
# eprint id currently selected; shared by every sub in this file that reads $id
my $id;

# Combined getter/setter for the current eprint id.
#
#   EPrints->id( 42 );      # store 42 and return it
#   my $cur = EPrints->id;  # return the stored value (undef if never set)
#
# An undefined argument is treated the same as no argument, so the stored
# value cannot be reset to undef through this sub.
sub id {
    my ( $self, $new_id ) = @_;

    # plain read: nothing to store
    return $id unless defined $new_id;

    $id = $new_id;
    warn "# id = $id\n" if $debug;
    return $id;
}
39 |
|
|
|
40 |
|
|
# Fetch one column for the current eprint and return it as a single string.
#
#   EPrints->lookup( $field )          # SELECT $field FROM archive_$field,
#                                      # restricted to rows with lang = 'hr'
#   EPrints->lookup( $field, $table )  # SELECT $field FROM $table, no lang filter
#
# The value of every matching row is passed through _x() and the results are
# joined with a single space.
#
# $field and $table are interpolated into the statement as identifiers, so
# they must come from trusted code. The eprint id is passed as a bind value.
#
# Dies if no id has been set yet or if the query fails.
sub lookup {
    my $self  = shift;
    my $field = shift;
    my $table = shift;
    my $where = '';

    die "lookup( $field ): no eprint id set, call id() first"
        unless defined $id;

    if ( ! $table ) {
        $table = "archive_$field";
        $where = " and lang = 'hr'";
    }

    my $sql = qq{
        SELECT $field
        FROM $table
        WHERE eprintid = ? $where
    };
    warn "# sql: $sql [eprintid = $id]\n" if $debug;

    # selectall_arrayref returns undef on failure; report the DBI error
    # instead of failing later on the dereference
    my $rows = $dbh->selectall_arrayref( $sql, { Slice => {} }, $id )
        or die "lookup( $field ) failed: " . $dbh->errstr;

    my @results = map { _x( $_->{$field} ) } @$rows;

    # fully qualified so it can never be taken for the builtin CORE::dump
    warn "# lookup( $field, $id ) = ", Data::Dump::dump( @results ), $/ if $debug;
    return join(" ", @results);
}
62 |
|
|
|
63 |
dpavlin |
12 |
# Return the fileinfo value of the current eprint, split on ';'.
#
# The value is read with lookup( 'fileinfo', 'archive' ), trailing whitespace
# is removed and the rest is split into fields. fulltext_content() treats the
# first two fields as ( $type, $uri ).
sub fulltext {
    my ( $self ) = @_;

    ( my $fileinfo = EPrints->lookup( 'fileinfo', 'archive' ) ) =~ s/\s+$//;

    return split /;/, $fileinfo;
}
69 |
|
|
|
70 |
dpavlin |
13 |
# Return the text of the current eprint's fulltext document.
#
# The document URI comes from fulltext(). Its scheme and host are dropped,
# the rest is URI-unescaped, and the leading numeric id is turned into the
# on-disk directory layout below $eprints_archive: the id is zero-padded to
# eight digits and split into two-digit directories (id 123 becomes
# 00/00/01/23/). The file is then run through pdftotext and iconv
# (utf-8 to iso-8859-2, dropping characters that can't be converted).
#
# Progress lines are printed to STDOUT.
#
# Returns the extracted text, or nothing (after a warning) if there is no
# URI, the URI has no leading numeric id, or the file is not readable.
# Dies if the pdftotext pipeline can't be started.
sub fulltext_content {
    my $self = shift;

    my ( $type, $uri ) = EPrints->fulltext;
    if ( ! defined $uri ) {
        warn "no fulltext uri found\n";
        return;
    }

    $uri =~ s!https?://[^/]+/!!;
    $uri = uri_unescape($uri);

    my $path = $eprints_archive;
    if ( $uri =~ s|^(\d+)/|| ) {
        my $nr = sprintf("%08d", $1);
        $nr =~ s!(\d\d)!$1/!g;
        $path .= "/$nr/$uri";
    } else {
        warn "can't find ID in $uri";
        return;
    }
    $path =~ s!//+!/!g;

    if ( ! -r $path ) {
        warn "ERROR: $path: $!\n";
        # explicit empty return: falling off the end here would hand the
        # return value of warn (1) to the caller as if it were content
        return;
    }

    print "+ $path ", -s $path, " bytes\n";

    # $path is built from unescaped database content, so quote it for the
    # shell: wrap in single quotes and escape any embedded single quote
    ( my $quoted = $path ) =~ s/'/'\\''/g;

    open(my $pdf, '-|', "pdftotext '$quoted' - | iconv -f utf-8 -t iso-8859-2 -c")
        || die "can't open pdftotext $path: $!";

    local $/;    # slurp everything the pipeline produces
    my $content = <$pdf>;
    print "\t>>", length( $content ), " text bytes\n";

    # exit status of the pipeline is deliberately not checked
    close($pdf);

    return $content;
}
100 |
|
|
|
101 |
dpavlin |
1 |
# Join all arguments with a single space, convert the resulting octets from
# utf-8 to iso-8859-2 and return them followed by one trailing space.
#
# The earlier void-context decode_utf8( $v ) call was dropped: its return
# value was thrown away and it does not modify its argument, so it had no
# effect on the result.
sub _x {
    my $v = join(" ", @_);
    from_to( $v, 'utf-8', 'iso-8859-2' );    # converts $v in place
    warn "_x($v)\n" if $debug;
    return "$v ";
}
108 |
|
|
|
109 |
dpavlin |
4 |
# Split text into words, cut each word into syllable-like chunks and return
# all runs of $count (2) neighbouring chunks, each followed by a space.
#
# A chunk is any run of non-vowels ending with one vowel (a, e, i, o, u,
# case-insensitive); whatever is left after the last vowel forms a final
# chunk. Words with fewer than $count chunks produce no output.
#
#   slogovi("banana split")   # "bana nana split "
#
# Returns an empty string for undefined or empty input.
sub slogovi {
    my $text = shift;
    return '' unless defined $text;

    my $count = 2;
    my $out = '';

    foreach my $w ( split(/\W*\s+\W*/, $text ) ) {
        warn "w: $w\n" if $debug;

        my @s;
        while ( $w =~ s/^([^aeiou]*[aeiou])//i ) {
            push @s, $1;
        }
        # length, not truth: a remainder of "0" is still a chunk
        push @s, $w if length $w;

        # fully qualified so it can never be taken for the builtin CORE::dump
        warn "slogovi = ", Data::Dump::dump( @s ), $/ if $debug;

        # sliding window of $count chunks over the word
        foreach my $p ( 0 .. ( $#s - $count + 1 ) ) {
            $out .= join( '', @s[ $p .. $p + $count - 1 ] ) . ' ';
        }
    }

    warn "$out\n" if $debug;
    return $out;
}
131 |
|
|
|
132 |
dpavlin |
1 |
1; |