1 |
dpavlin |
1 |
# Dobrica Pavlinusic, <dpavlin@rot13.org> 06/28/07 23:28:21 CEST |
2 |
|
|
|
3 |
|
|
package EPrints; |
4 |
|
|
|
5 |
|
|
use Exporter 'import'; |
6 |
dpavlin |
4 |
@EXPORT_OK = qw(_x slogovi); |
7 |
dpavlin |
1 |
|
8 |
|
|
use Encode qw/from_to decode_utf8 decode/; |
9 |
|
|
use Data::Dump qw/dump/; |
10 |
dpavlin |
3 |
use DBI; |
11 |
dpavlin |
13 |
use URI::Escape; |
12 |
dpavlin |
17 |
use Carp qw/confess/; |
13 |
dpavlin |
1 |
|
14 |
dpavlin |
17 |
use lib '/home/dpavlin/stem-hr/'; |
15 |
|
|
use StemHR; |
16 |
|
|
|
17 |
dpavlin |
1 |
use strict; |
18 |
|
|
use warnings; |
19 |
|
|
|
20 |
|
|
my $debug = 0; |
21 |
|
|
|
22 |
|
|
my $connect = "DBI:mysql:dbname=eprints"; |
23 |
dpavlin |
13 |
# path to eprints installation |
24 |
|
|
my $eprints_archive = '/data/eprints2/archives/ffzg/documents/disk0/'; |
25 |
dpavlin |
1 |
|
26 |
|
|
my $dbh = DBI->connect($connect,"dpavlin","") || die $DBI::errstr; |
27 |
|
|
|
28 |
|
|
sub dbh { |
29 |
|
|
my $self = shift; |
30 |
|
|
return $dbh; |
31 |
|
|
} |
32 |
|
|
|
33 |
|
|
my $id; |
34 |
|
|
|
35 |
|
|
sub id { |
36 |
|
|
my $self = shift; |
37 |
|
|
if ( defined( $_[0] ) ) { |
38 |
|
|
$id = $_[0]; |
39 |
|
|
warn "# id = $id\n" if $debug; |
40 |
|
|
} |
41 |
|
|
return $id; |
42 |
|
|
} |
43 |
|
|
|
44 |
|
|
sub lookup { |
45 |
|
|
my $self = shift; |
46 |
|
|
my $field = shift; |
47 |
dpavlin |
11 |
my $table = shift; |
48 |
|
|
my $where = ''; |
49 |
dpavlin |
1 |
|
50 |
dpavlin |
11 |
if ( ! $table ) { |
51 |
|
|
$table = "archive_$field"; |
52 |
|
|
$where = " and lang = 'hr'"; |
53 |
|
|
} |
54 |
|
|
|
55 |
dpavlin |
1 |
my $sql = qq{ |
56 |
|
|
SELECT $field |
57 |
dpavlin |
11 |
FROM $table |
58 |
|
|
WHERE eprintid = $id $where |
59 |
dpavlin |
1 |
}; |
60 |
|
|
warn "# sql: $sql\n" if $debug; |
61 |
|
|
my @results = map { _x( $_->{$field} ) } @{ $dbh->selectall_arrayref($sql, { Slice => {} }) }; |
62 |
|
|
|
63 |
|
|
warn "# loookup( $field, $id ) = ", dump( @results ),$/ if $debug; |
64 |
|
|
return join(" ", @results); |
65 |
|
|
} |
66 |
|
|
|
67 |
dpavlin |
12 |
sub fulltext { |
68 |
|
|
my $self = shift; |
69 |
dpavlin |
13 |
my $fulltext = EPrints->lookup( 'fileinfo', 'archive' ); |
70 |
|
|
$fulltext =~ s/\s+$//; |
71 |
|
|
return split(/;/, $fulltext); |
72 |
dpavlin |
12 |
} |
73 |
|
|
|
74 |
dpavlin |
13 |
sub fulltext_content { |
75 |
|
|
my $self = shift; |
76 |
|
|
|
77 |
|
|
my $path = $eprints_archive; |
78 |
|
|
|
79 |
|
|
my ( $type, $uri ) = EPrints->fulltext; |
80 |
|
|
$uri =~ s!http://[^/]+/!!; |
81 |
|
|
$uri = uri_unescape($uri); |
82 |
|
|
if ( $uri =~ s|^(\d+)/|| ) { |
83 |
|
|
my $nr = sprintf("%08d", $1); |
84 |
|
|
$nr =~ s!(\d\d)!$1/!g; |
85 |
|
|
$path .= "/$nr/$uri"; |
86 |
|
|
} else { |
87 |
|
|
warn "can't find ID in $uri"; |
88 |
|
|
return; |
89 |
|
|
} |
90 |
|
|
$path =~ s!//+!/!g; |
91 |
|
|
if ( -r $path ) { |
92 |
|
|
print "+ $path ", -s $path, " bytes\n"; |
93 |
|
|
open(my $pdf, "pdftotext $path - | iconv -f utf-8 -t iso-8859-2 -c |") || die "can't open pdftotext $path: $!"; |
94 |
|
|
local $/; |
95 |
|
|
my $content = <$pdf>; |
96 |
|
|
print "\t>>", length( $content ), " text bytes\n"; |
97 |
|
|
close($pdf); # || die "can't close $path: $!"; |
98 |
|
|
return $content; |
99 |
|
|
} else { |
100 |
|
|
warn "ERROR: $path: $!\n"; |
101 |
|
|
} |
102 |
|
|
|
103 |
|
|
} |
104 |
|
|
|
105 |
dpavlin |
1 |
sub _x { |
106 |
|
|
my $v = join(" ", @_); |
107 |
|
|
decode_utf8( $v ); |
108 |
|
|
from_to( $v, 'utf-8', 'iso-8859-2' ); |
109 |
|
|
warn "_x($v)\n" if $debug; |
110 |
|
|
return "$v "; |
111 |
|
|
} |
112 |
|
|
|
113 |
dpavlin |
4 |
sub slogovi { |
114 |
dpavlin |
14 |
my $self = shift; |
115 |
dpavlin |
17 |
my $text = shift || confess "no text?"; |
116 |
dpavlin |
6 |
|
117 |
dpavlin |
17 |
my $count = 3; |
118 |
dpavlin |
6 |
my $out = ''; |
119 |
|
|
|
120 |
dpavlin |
4 |
foreach my $w ( split(/\W*\s+\W*/, $text ) ) { |
121 |
dpavlin |
6 |
warn "w: $w\n" if $debug; |
122 |
|
|
my @s; |
123 |
dpavlin |
4 |
while ( $w =~ s/^([^aeiou]*[aeiou])//i ) { |
124 |
dpavlin |
6 |
push @s, $1; |
125 |
dpavlin |
4 |
} |
126 |
dpavlin |
6 |
push @s, $w if $w; |
127 |
dpavlin |
4 |
warn "slogovi = ", dump( @s ), $/ if $debug; |
128 |
dpavlin |
6 |
foreach my $p ( 0 .. ( $#s - $count + 1 ) ) { |
129 |
|
|
map { $out .= $s[ $p + $_ ] } 0 .. $count - 1; |
130 |
|
|
$out .= ' '; |
131 |
|
|
} |
132 |
dpavlin |
4 |
} |
133 |
dpavlin |
6 |
warn "$out\n" if $debug; |
134 |
|
|
return $out; |
135 |
dpavlin |
4 |
} |
136 |
|
|
|
137 |
dpavlin |
17 |
sub stem { |
138 |
|
|
my $self = shift; |
139 |
|
|
my $text = shift || confess "no text?"; |
140 |
|
|
|
141 |
|
|
my $body = ''; |
142 |
|
|
foreach my $w ( split(/\W*\s+\W*/, $text ) ) { |
143 |
|
|
$body .= StemHR->stem( $w ) . ' '; |
144 |
|
|
} |
145 |
|
|
|
146 |
|
|
return $body; |
147 |
|
|
} |
148 |
|
|
|
149 |
dpavlin |
1 |
1; |