/[Semantic-Engine]/EPrints/index.pl
This is a repository of my old source code, which is no longer updated. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /EPrints/index.pl

Parent Directory | Revision Log | View Patch

revision 13 by dpavlin, Fri Jun 29 18:46:45 2007 UTC revision 18 by dpavlin, Mon Jul 2 12:55:49 2007 UTC
# Line 5  use Semantic::API; Line 5  use Semantic::API;
5  use Data::Dump qw/dump/;  use Data::Dump qw/dump/;
6    
7  use EPrints qw/_x/;  use EPrints qw/_x/;
8    use KinoSearch::Simple;
 use lib '/home/dpavlin/stem-hr/';  
 use StemHR;  
9    
10  my $debug = shift @ARGV;  my $debug = shift @ARGV;
11    my $use = {
12            score => 1,
13            stem => 1,
14            slogovi => 1,
15    };
16    
17  my $dbh = EPrints->dbh;  my $dbh = EPrints->dbh;
18  my $sth = $dbh->prepare(qq{  my $sth = $dbh->prepare(qq{
# Line 17  SELECT Line 20  SELECT
20          archive_title.eprintid as id,          archive_title.eprintid as id,
21          title          title
22  FROM archive_title  FROM archive_title
23    WHERE
24            lang = 'hr'
25  }) || die $dbh->errstr();  }) || die $dbh->errstr();
26  $sth->execute() || die $sth->errstr();  $sth->execute() || die $sth->errstr();
27    
# Line 36  $indexer->set_default_encoding( "iso-885 Line 41  $indexer->set_default_encoding( "iso-885
41    
42  my $total = 0;  my $total = 0;
43    
44    my $kino = KinoSearch::Simple->new(
45            path => 'kinoindex/',
46            language => 'ru',
47    );
48    
49  while (my $row = $sth->fetchrow_hashref ) {  while (my $row = $sth->fetchrow_hashref ) {
50          EPrints->id( $row->{id} );          my $id = $row->{id};
51          my ( $title, $keywords, $abstract ) = (          EPrints->id( $id );
52                  _x( $row->{title} ),  
53                  EPrints->lookup( 'keywords' ),          my $parts = {
54                  EPrints->lookup( 'abstract' )                  title => [ _x( $row->{title} ), 3 ],
55          );                  keywords => [ EPrints->lookup( 'keywords' ), 2 ],
56          my @body = split( /\W*\s+\W*/, "$title $title $title $keywords $keywords $abstract" );                  abstract => [ EPrints->lookup( 'abstract' ), 1 ],
57          my $body = '';  #               content => [ EPrints->fulltext_content, 1 ],
58          foreach my $word ( @body ) {          };
59                  $body .= StemHR->stem( $word ) . ' ';  
60            my $skip = 0;
61            foreach my $part ( qw/title keywords abstract/ ) {
62                    if ( ! $parts->{$part}->[0] ) {
63                            warn "skipped $id doesn't have required part $part\n";
64                            $skip = 1;
65                            last;
66                    }
67          }          }
68            next if $skip;
69    
70            my $body = '';
71    
72          $body .= EPrints::slogovi( "$title $keywords $abstract" );          foreach my $part ( qw/title keywords abstract content/ ) {
73                    my $content = $parts->{$part}->[0] || next;
74    
75          warn "body: $body\n" if $debug;                  if ( $use->{slogovi} ) {
76                            $body .= ' ' . EPrints->slogovi( $content );
77                    }
78    
79                    if ( $use->{stem} ) {
80                            my $stem = EPrints->stem( $content );
81                            warn "stem of '$content' didn't return anything\n" unless $stem;
82                            $content = $stem;
83                    }
84    
85    
86                    if ( $use->{score} ) {
87                            map { $body .= "$content " } 1 .. $parts->{$part}->[1];
88                    } else {
89                            $body .= "$content ";
90                    }
91    
92          $body .= EPrints->fulltext_content;                  warn ">>> $body <<<\n" if $debug;
93            }
94    
95          $indexer->index( $row->{id}, join(" ", @body, $body ) );          $indexer->index( $row->{id}, $body );
96            $kino->add_doc({
97                    id => $id,
98                    title => $parts->{title},
99                    keywords => $parts->{keywords},
100                    abstract => $parts->{abstract},
101            });
102            
103          $total++;          $total++;
104          print STDERR _x( $row->{id}, " ", $row->{title} ), "\n";          print STDERR "$total: ", $row->{id}, " ", _x( $row->{title} ), " - ", length($body), " bytes\n";
105  }  }
106    
107    

Legend:
Removed from v.13  
changed lines
  Added in v.18

  ViewVC Help
Powered by ViewVC 1.1.26