/[Semantic-Engine]/EPrints/index.pl
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /EPrints/index.pl

Parent Directory | Revision Log


Revision 19 - (hide annotations)
Mon Jul 2 18:12:02 2007 UTC (16 years, 9 months ago) by dpavlin
File MIME type: text/plain
File size: 2317 byte(s)
add content to KinoSearch
#!/usr/bin/perl -w

# Index EPrints records into a Semantic::API index and a KinoSearch index.
#
# Rows (id, title) are read from archive_title where lang = 'hr'. For every
# row the keywords and abstract are fetched through EPrints->lookup, a
# weighted text body is built from title, keywords and abstract and handed
# to Semantic::API, and the unmodified parts are added to KinoSearch.
#
# Usage: index.pl [debug]
#   A true first argument prints the body text to STDERR while it is built.

use strict;
use Semantic::API;
use Data::Dump qw/dump/;

use EPrints qw/_x/;
use KinoSearch::Simple;

my $debug = shift @ARGV;

# Switches controlling how the Semantic::API body text is assembled:
#   score   - repeat each part's text as many times as its weight
#   stem    - replace each part's text with the result of EPrints->stem
#   slogovi - additionally append the result of EPrints->slogovi
#             (presumably syllable splitting -- confirm in EPrints.pm)
my $use = {
    score   => 1,
    stem    => 1,
    slogovi => 1,
};

my $dbh = EPrints->dbh;
my $sth = $dbh->prepare(qq{
    SELECT
        archive_title.eprintid as id,
        title
    FROM archive_title
    WHERE
        lang = 'hr'
}) or die $dbh->errstr();
$sth->execute() or die $sth->errstr();

my $indexer = Semantic::API::Index->new(
    storage    => 'sqlite',
    database   => 'eprints.db',
    collection => 'EPrints',
);

$indexer->add_word_filters(
    minimum_length      => 3,
    too_many_numbers    => 10,
    maximum_word_length => 15,
);

# use this encoding for any incoming text
$indexer->set_default_encoding( "iso-8859-2" );

my $total = 0;

# NOTE(review): the rows are selected with lang = 'hr' but the KinoSearch
# language is 'ru' -- confirm that this is intentional.
my $kino = KinoSearch::Simple->new(
    path     => 'kinoindex/',
    language => 'ru',
);

ROW:
while ( my $row = $sth->fetchrow_hashref ) {
    my $id = $row->{id};
    EPrints->id( $id );

    # Capture only the first value each call returns. Calling these directly
    # inside the [ text, weight ] constructors puts them in list context, so
    # an empty return would slide the weight into the text slot (defeating
    # the required-part check below) and an extra value would displace the
    # weight.
    my ($title)    = _x( $row->{title} );
    my ($keywords) = EPrints->lookup( 'keywords' );
    my ($abstract) = EPrints->lookup( 'abstract' );

    # part name => [ text, weight ]
    my $parts = {
        title    => [ $title,    3 ],
        keywords => [ $keywords, 2 ],
        abstract => [ $abstract, 1 ],
#       content  => [ EPrints->fulltext_content, 1 ],
    };

    # Every record must have all three parts; otherwise it is not indexed.
    foreach my $part ( qw/title keywords abstract/ ) {
        next if $parts->{$part}->[0];
        warn "skipped $id doesn't have required part $part\n";
        next ROW;
    }

    my $body = '';

    foreach my $part ( qw/title keywords abstract content/ ) {
        # 'content' is currently disabled above; skip parts that are not
        # defined without autovivifying an entry for them.
        my $pair = $parts->{$part} or next;
        my ( $content, $weight ) = @$pair;
        next unless $content;

        if ( $use->{slogovi} ) {
            $body .= ' ' . EPrints->slogovi( $content );
        }

        if ( $use->{stem} ) {
            my $stem = EPrints->stem( $content );
            warn "stem of '$content' didn't return anything\n" unless $stem;
            # Keep the body text defined even when stemming yields nothing,
            # so the warning above is the only one emitted.
            $content = defined $stem ? $stem : '';
        }

        # With scoring enabled the text is repeated once per weight unit.
        $body .= "$content " x ( $use->{score} ? $weight : 1 );

        warn ">>> $body <<<\n" if $debug;
    }

    $indexer->index( $id, $body );

    # KinoSearch receives the original (unstemmed, unweighted) parts.
    $kino->add_doc({
        id       => $id,
        title    => $parts->{title}->[0],
        keywords => $parts->{keywords}->[0],
        abstract => $parts->{abstract}->[0],
    });

    $total++;
    print STDERR "$total: ", $id, " ", _x( $row->{title} ), " - ", length($body), " bytes\n";
}


print STDERR "\nNow adding $total items to the database...";
$indexer->finish();
print STDERR "done!\n";


Properties

Name Value
svn:executable

  ViewVC Help
Powered by ViewVC 1.1.26