/[Semantic-Engine]/EPrints/EPrints.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /EPrints/EPrints.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 17 - (show annotations)
Sat Jun 30 13:46:51 2007 UTC (16 years, 10 months ago) by dpavlin
File size: 2928 byte(s)
move stem to EPrints and make it actually work, completly new
index code (mopefully less nosify in indexed data)
1 # Dobrica Pavlinusic, <dpavlin@rot13.org> 06/28/07 23:28:21 CEST
2
3 package EPrints;
4
5 use Exporter 'import';
6 @EXPORT_OK = qw(_x slogovi);
7
8 use Encode qw/from_to decode_utf8 decode/;
9 use Data::Dump qw/dump/;
10 use DBI;
11 use URI::Escape;
12 use Carp qw/confess/;
13
14 use lib '/home/dpavlin/stem-hr/';
15 use StemHR;
16
17 use strict;
18 use warnings;
19
20 my $debug = 0;
21
22 my $connect = "DBI:mysql:dbname=eprints";
23 # path to eprints installation
24 my $eprints_archive = '/data/eprints2/archives/ffzg/documents/disk0/';
25
26 my $dbh = DBI->connect($connect,"dpavlin","") || die $DBI::errstr;
27
28 sub dbh {
29 my $self = shift;
30 return $dbh;
31 }
32
33 my $id;
34
35 sub id {
36 my $self = shift;
37 if ( defined( $_[0] ) ) {
38 $id = $_[0];
39 warn "# id = $id\n" if $debug;
40 }
41 return $id;
42 }
43
44 sub lookup {
45 my $self = shift;
46 my $field = shift;
47 my $table = shift;
48 my $where = '';
49
50 if ( ! $table ) {
51 $table = "archive_$field";
52 $where = " and lang = 'hr'";
53 }
54
55 my $sql = qq{
56 SELECT $field
57 FROM $table
58 WHERE eprintid = $id $where
59 };
60 warn "# sql: $sql\n" if $debug;
61 my @results = map { _x( $_->{$field} ) } @{ $dbh->selectall_arrayref($sql, { Slice => {} }) };
62
63 warn "# loookup( $field, $id ) = ", dump( @results ),$/ if $debug;
64 return join(" ", @results);
65 }
66
67 sub fulltext {
68 my $self = shift;
69 my $fulltext = EPrints->lookup( 'fileinfo', 'archive' );
70 $fulltext =~ s/\s+$//;
71 return split(/;/, $fulltext);
72 }
73
74 sub fulltext_content {
75 my $self = shift;
76
77 my $path = $eprints_archive;
78
79 my ( $type, $uri ) = EPrints->fulltext;
80 $uri =~ s!http://[^/]+/!!;
81 $uri = uri_unescape($uri);
82 if ( $uri =~ s|^(\d+)/|| ) {
83 my $nr = sprintf("%08d", $1);
84 $nr =~ s!(\d\d)!$1/!g;
85 $path .= "/$nr/$uri";
86 } else {
87 warn "can't find ID in $uri";
88 return;
89 }
90 $path =~ s!//+!/!g;
91 if ( -r $path ) {
92 print "+ $path ", -s $path, " bytes\n";
93 open(my $pdf, "pdftotext $path - | iconv -f utf-8 -t iso-8859-2 -c |") || die "can't open pdftotext $path: $!";
94 local $/;
95 my $content = <$pdf>;
96 print "\t>>", length( $content ), " text bytes\n";
97 close($pdf); # || die "can't close $path: $!";
98 return $content;
99 } else {
100 warn "ERROR: $path: $!\n";
101 }
102
103 }
104
105 sub _x {
106 my $v = join(" ", @_);
107 decode_utf8( $v );
108 from_to( $v, 'utf-8', 'iso-8859-2' );
109 warn "_x($v)\n" if $debug;
110 return "$v ";
111 }
112
113 sub slogovi {
114 my $self = shift;
115 my $text = shift || confess "no text?";
116
117 my $count = 3;
118 my $out = '';
119
120 foreach my $w ( split(/\W*\s+\W*/, $text ) ) {
121 warn "w: $w\n" if $debug;
122 my @s;
123 while ( $w =~ s/^([^aeiou]*[aeiou])//i ) {
124 push @s, $1;
125 }
126 push @s, $w if $w;
127 warn "slogovi = ", dump( @s ), $/ if $debug;
128 foreach my $p ( 0 .. ( $#s - $count + 1 ) ) {
129 map { $out .= $s[ $p + $_ ] } 0 .. $count - 1;
130 $out .= ' ';
131 }
132 }
133 warn "$out\n" if $debug;
134 return $out;
135 }
136
137 sub stem {
138 my $self = shift;
139 my $text = shift || confess "no text?";
140
141 my $body = '';
142 foreach my $w ( split(/\W*\s+\W*/, $text ) ) {
143 $body .= StemHR->stem( $w ) . ' ';
144 }
145
146 return $body;
147 }
148
149 1;

  ViewVC Help
Powered by ViewVC 1.1.26