/[Semantic-Engine]/EPrints/index.pl
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /EPrints/index.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 23 - (show annotations)
Mon Jul 2 19:59:45 2007 UTC (16 years, 9 months ago) by dpavlin
File MIME type: text/plain
File size: 2357 byte(s)
index full text with KinoSearch
1 #!/usr/bin/perl -w
2
3 use strict;
4 use Semantic::API;
5 use Data::Dump qw/dump/;
6
7 use EPrints qw/_x/;
8 use KinoSearch::Simple;
9
# Optional first command-line argument; any true value makes the indexing
# loop dump the text body it is assembling after every part.
my $debug = shift @ARGV;

# Switches that control how the text body for the semantic indexer is built:
#   score   - repeat each part as many times as its weight
#   stem    - replace each part with the result of EPrints->stem
#   slogovi - additionally append the result of EPrints->slogovi
#             ("slogovi" is Croatian for "syllables")
my $use = { map { $_ => 1 } qw/score stem slogovi/ };

# Select the id and title of every record that has a title stored with
# lang = 'hr'.
my $dbh = EPrints->dbh;
my $sth = $dbh->prepare(qq{
    SELECT
        archive_title.eprintid as id,
        title
    FROM archive_title
    WHERE
        lang = 'hr'
}) or die $dbh->errstr();
$sth->execute() or die $sth->errstr();

# Semantic::API index kept in the SQLite database file eprints.db.
my %index_options = (
    storage    => 'sqlite',
    database   => 'eprints.db',
    collection => 'EPrints',
);
my $indexer = Semantic::API::Index->new( %index_options );

# Word filters handed to the indexer (limits on word length and on digits).
my %word_filters = (
    minimum_length      => 3,
    too_many_numbers    => 10,
    maximum_word_length => 15,
);
$indexer->add_word_filters( %word_filters );

# use this encoding for any incoming text
$indexer->set_default_encoding( "iso-8859-2" );

# Number of records indexed so far.
my $total = 0;

# Full-text index stored in the kinoindex/ directory.
# NOTE(review): the records are selected with lang = 'hr', yet the analyzer
# language here is 'ru' -- confirm this is intentional.
my $kino = KinoSearch::Simple->new(
    path     => 'kinoindex/',
    language => 'ru',
);
48
# Main indexing loop.  For every row returned by the query:
#   1. collect the title, keywords and abstract of the record,
#   2. skip the record (with a warning) if any of the three is missing,
#   3. build a text body from the parts, honouring the $use switches,
#   4. hand the body to the semantic indexer and the individual fields
#      (plus the full text) to KinoSearch.
ROW:
while ( my $row = $sth->fetchrow_hashref ) {
    my $id = $row->{id};
    # presumably selects the record that the EPrints-> calls below operate
    # on -- confirm against the EPrints module
    EPrints->id( $id );

    # Take the first value each helper returns explicitly.  Calling the
    # helpers directly inside [ ... ] evaluates them in list context, so an
    # empty return would shift the weight into slot 0 and make a missing
    # part look present.
    my ($title)    = _x( $row->{title} );
    my ($keywords) = EPrints->lookup( 'keywords' );
    my ($abstract) = EPrints->lookup( 'abstract' );

    # part name => [ text, weight ]
    my $parts = {
        title    => [ $title,    3 ],
        keywords => [ $keywords, 2 ],
        abstract => [ $abstract, 1 ],
#       content  => [ EPrints->fulltext_content, 1 ],
    };

    # All three parts are required; skip the whole record otherwise.
    foreach my $part ( qw/title keywords abstract/ ) {
        next if $parts->{$part}->[0];
        warn "skipped $id doesn't have required part $part\n";
        next ROW;
    }

    my $body = '';

    PART:
    foreach my $part ( qw/title keywords abstract content/ ) {
        # 'content' has no entry while it is commented out above.
        my $entry = $parts->{$part} or next PART;
        my ( $content, $weight ) = @$entry;
        next PART unless $content;

        if ( $use->{slogovi} ) {
            $body .= ' ' . EPrints->slogovi( $content );
        }

        if ( $use->{stem} ) {
            my $stem = EPrints->stem( $content );
            unless ( defined $stem && length $stem ) {
                warn "stem of '$content' didn't return anything\n";
                # Nothing usable to append; appending the empty result would
                # only add blanks and trigger "uninitialized" warnings.
                next PART;
            }
            $content = $stem;
        }

        # With scoring enabled a part is repeated as many times as its
        # weight; otherwise it is appended exactly once.
        my $repeat = $use->{score} ? $weight : 1;
        $body .= "$content " x $repeat;

        warn ">>> $body <<<\n" if $debug;
    }

    $indexer->index( $id, $body );

    # Fetched into a scalar first so that an empty return cannot misalign
    # the key/value pairs of the document hash below.
    my ($fulltext) = EPrints->fulltext_content;
    $kino->add_doc({
        id       => $id,
        title    => $title,
        keywords => $keywords,
        abstract => $abstract,
        content  => $fulltext,
    });

    $total++;
    print STDERR "$total: ", $id, " ", $title, " - ", length($body), " bytes\n";
}
107
108
# Finalise the semantic index and report progress on STDERR.
# NOTE(review): the message suggests finish() is where the queued items are
# actually written to the database -- confirm against Semantic::API.
printf STDERR "\nNow adding %s items to the database...", $total;
$indexer->finish();
print STDERR "done!", "\n";
112

Properties

Name Value
svn:executable

  ViewVC Help
Powered by ViewVC 1.1.26