/[Semantic-Engine]/EPrints/index.pl
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /EPrints/index.pl

Parent Directory | Revision Log | View Patch

revision 4 by dpavlin, Fri Jun 29 09:52:53 2007 UTC revision 17 by dpavlin, Sat Jun 30 13:46:51 2007 UTC
# Line 3  Line 3 
3  use strict;  use strict;
4  use Semantic::API;  use Semantic::API;
5  use Data::Dump qw/dump/;  use Data::Dump qw/dump/;
 use EPrints qw/_x/;  
 use lib '/home/dpavlin/stem-hr/';  
 use StemHR;  
6    
7  warn dump( StemHR->stem('kućni') );  use EPrints qw/_x/;
8    
9  my $debug = shift @ARGV;  my $debug = shift @ARGV;
10    my $use = {
11  my $type = 'slogovi';          score => 1,
12            stem => 1,
13            slogovi => 1,
14    };
15    
16  my $dbh = EPrints->dbh;  my $dbh = EPrints->dbh;
17  my $sth = $dbh->prepare(qq{  my $sth = $dbh->prepare(qq{
# Line 19  SELECT Line 19  SELECT
19          archive_title.eprintid as id,          archive_title.eprintid as id,
20          title          title
21  FROM archive_title  FROM archive_title
22    WHERE
23            lang = 'hr'
24  }) || die $dbh->errstr();  }) || die $dbh->errstr();
25  $sth->execute() || die $sth->errstr();  $sth->execute() || die $sth->errstr();
26    
# Line 34  $indexer->add_word_filters( minimum_leng Line 36  $indexer->add_word_filters( minimum_leng
36                              maximum_word_length => 15 );                              maximum_word_length => 15 );
37    
38  # use this encoding for any incoming text  # use this encoding for any incoming text
39  #$indexer->set_default_encoding( "utf8");  $indexer->set_default_encoding( "iso-8859-2" );
40    
41  my $total = 0;  my $total = 0;
42    
43  while (my $row = $sth->fetchrow_hashref ) {  while (my $row = $sth->fetchrow_hashref ) {
44          EPrints->id( $row->{id} );          my $id = $row->{id};
45          my ( $title, $keywords, $abstract ) = (          EPrints->id( $id );
46                  _x( $row->{title} ),          my $parts = {
47                  EPrints->lookup( 'keywords' ),                  title => [ _x( $row->{title} ), 3 ],
48                  EPrints->lookup( 'abstract' )                  keywords => [ EPrints->lookup( 'keywords' ), 2 ],
49          );                  abstract => [ EPrints->lookup( 'abstract' ), 1 ],
50          my @body = split( /\W*\s+\W*/, "$title $title $title $keywords $keywords $abstract" );  #               content => [ EPrints->fulltext_content, 1 ],
51          my $body;          };
52          foreach my $word ( @body ) {  
53  #               $body .= StemHR->stem( $word ) . ' ';          my $skip = 0;
54                  $body .= join(" ",EPrints::slogovi( $word )) . ' ';          foreach my $part ( qw/title keywords abstract/ ) {
55                    if ( ! $parts->{$part}->[0] ) {
56                            warn "skipped $id doesn't have required part $part\n";
57                            $skip = 1;
58                            last;
59                    }
60          }          }
61            next if $skip;
62    
63            my $body = '';
64    
65            foreach my $part ( qw/title keywords abstract content/ ) {
66                    my $content = $parts->{$part}->[0] || next;
67    
68          warn "body: $body\n" if $debug;                  if ( $use->{slogovi} ) {
69                            $body .= ' ' . EPrints->slogovi( $content );
70                    }
71    
72                    if ( $use->{stem} ) {
73                            my $stem = EPrints->stem( $content );
74                            warn "stem of '$content' didn't return anything\n" unless $stem;
75                            $content = $stem;
76                    }
77    
78    
79                    if ( $use->{score} ) {
80                            map { $body .= "$content " } 1 .. $parts->{$part}->[1];
81                    } else {
82                            $body .= "$content ";
83                    }
84    
85                    warn ">>> $body <<<\n" if $debug;
86            }
87    
88      $indexer->index( $row->{id}, join(" ", @body, $body ) );          $indexer->index( $row->{id}, $body );
89          $total++;          $total++;
90          print STDERR _x( $row->{id}, " ", $row->{title} ), "\n";          print STDERR "$total: ", $row->{id}, " ", _x( $row->{title} ), " - ", length($body), " bytes\n";
91  }  }
92    
93    

Legend:
Removed from v.4  
changed lines
  Added in v.17

  ViewVC Help
Powered by ViewVC 1.1.26