/[Semantic-Engine]/EPrints/EPrints.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /EPrints/EPrints.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 11 by dpavlin, Fri Jun 29 16:58:42 2007 UTC revision 17 by dpavlin, Sat Jun 30 13:46:51 2007 UTC
# Line 8  use Exporter 'import'; Line 8  use Exporter 'import';
8  use Encode qw/from_to decode_utf8 decode/;  use Encode qw/from_to decode_utf8 decode/;
9  use Data::Dump qw/dump/;  use Data::Dump qw/dump/;
10  use DBI;  use DBI;
11    use URI::Escape;
12    use Carp qw/confess/;
13    
14    use lib '/home/dpavlin/stem-hr/';
15    use StemHR;
16    
17  use strict;  use strict;
18  use warnings;  use warnings;
# Line 15  use warnings; Line 20  use warnings;
20  my $debug = 0;  my $debug = 0;
21    
22  my $connect = "DBI:mysql:dbname=eprints";  my $connect = "DBI:mysql:dbname=eprints";
23    # path to eprints installation
24    my $eprints_archive = '/data/eprints2/archives/ffzg/documents/disk0/';
25    
26  my $dbh = DBI->connect($connect,"dpavlin","") || die $DBI::errstr;  my $dbh = DBI->connect($connect,"dpavlin","") || die $DBI::errstr;
27    
# Line 57  sub lookup { Line 64  sub lookup {
64          return join("   ", @results);          return join("   ", @results);
65  }  }
66    
67    sub fulltext {
68            my $self = shift;
69            my $fulltext = EPrints->lookup( 'fileinfo', 'archive' );
70            $fulltext =~ s/\s+$//;
71            return split(/;/, $fulltext);
72    }
73    
74    sub fulltext_content {
75            my $self = shift;
76    
77            my $path = $eprints_archive;
78    
79            my ( $type, $uri ) = EPrints->fulltext;
80            $uri =~ s!http://[^/]+/!!;
81            $uri = uri_unescape($uri);
82            if ( $uri =~ s|^(\d+)/|| ) {
83                    my $nr = sprintf("%08d", $1);
84                    $nr =~ s!(\d\d)!$1/!g;
85                    $path .= "/$nr/$uri";
86            } else {
87                    warn "can't find ID in $uri";
88                    return;
89            }
90            $path =~ s!//+!/!g;
91            if ( -r $path ) {
92                    print "+ $path ", -s $path, " bytes\n";
93                    open(my $pdf, "pdftotext $path - | iconv -f utf-8 -t iso-8859-2 -c |") || die "can't open pdftotext $path: $!";
94                    local $/;
95                    my $content = <$pdf>;
96                    print "\t>>", length( $content ), " text bytes\n";
97                    close($pdf); # || die "can't close $path: $!";
98                    return $content;
99            } else {
100                    warn "ERROR: $path: $!\n";
101            }
102    
103    }
104    
105  sub _x {  sub _x {
106          my $v = join(" ", @_);          my $v = join(" ", @_);
107          decode_utf8( $v );          decode_utf8( $v );
# Line 66  sub _x { Line 111  sub _x {
111  }  }
112    
113  sub slogovi {  sub slogovi {
114          my $text = shift;          my $self = shift;
115            my $text = shift || confess "no text?";
116    
117          my $count = 2;          my $count = 3;
118          my $out = '';          my $out = '';
119    
120          foreach my $w ( split(/\W*\s+\W*/, $text ) ) {          foreach my $w ( split(/\W*\s+\W*/, $text ) ) {
# Line 88  sub slogovi { Line 134  sub slogovi {
134          return $out;          return $out;
135  }  }
136    
137    sub stem {
138            my $self = shift;
139            my $text = shift || confess "no text?";
140    
141            my $body = '';
142            foreach my $w ( split(/\W*\s+\W*/, $text ) ) {
143                    $body .= StemHR->stem( $w ) . ' ';
144            }
145    
146            return $body;
147    }
148    
149  1;  1;

Legend:
Removed from v.11  
changed lines
  Added in v.17

  ViewVC Help
Powered by ViewVC 1.1.26