/[Semantic-Engine]/EPrints/index.pl
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /EPrints/index.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 17 - (show annotations)
Sat Jun 30 13:46:51 2007 UTC (16 years, 10 months ago) by dpavlin
File MIME type: text/plain
File size: 2061 byte(s)
move stem to EPrints and make it actually work, completely new
index code (hopefully less noise in indexed data)
#!/usr/bin/perl -w

# index.pl - build a Semantic::API index from EPrints records.
#
# For every row of archive_title with lang = 'hr' the script collects the
# title, keywords and abstract, optionally passes each of them through
# EPrints->slogovi and EPrints->stem, repeats each part according to its
# weight and hands the resulting text to the indexer under the eprint id.
#
# Usage: index.pl [debug]
#   A true first argument makes the script dump the body text to STDERR
#   after every part that is appended to it.

use strict;
use warnings;
use Semantic::API;
use Data::Dump qw/dump/;

use EPrints qw/_x/;

my $debug = shift @ARGV;

# Switches for the optional processing applied to every part.
my $use = {
    score   => 1,    # repeat a part as many times as its weight
    stem    => 1,    # index the result of EPrints->stem instead of the raw text
    slogovi => 1,    # additionally append the result of EPrints->slogovi
};

my $dbh = EPrints->dbh;
my $sth = $dbh->prepare(qq{
    SELECT
        archive_title.eprintid as id,
        title
    FROM archive_title
    WHERE
        lang = 'hr'
}) or die $dbh->errstr();
$sth->execute() or die $sth->errstr();

my $indexer = Semantic::API::Index->new(
    storage    => 'sqlite',
    database   => 'eprints.db',
    collection => 'EPrints',
);

$indexer->add_word_filters(
    minimum_length      => 3,
    too_many_numbers    => 10,
    maximum_word_length => 15,
);

# use this encoding for any incoming text
$indexer->set_default_encoding("iso-8859-2");

my $total = 0;

ROW:
while ( my $row = $sth->fetchrow_hashref ) {
    my $id = $row->{id};

    # NOTE(review): presumably selects the record that the
    # EPrints->lookup calls below read from -- confirm in EPrints.pm.
    EPrints->id($id);

    # Keep only the first value each accessor returns.  Calling them
    # directly inside the [ ... ] constructors would let an empty return
    # list move the weight into the content slot, so a missing part would
    # look present and be indexed as the text "3", "2" or "1".
    my ($title)    = _x( $row->{title} );
    my ($keywords) = EPrints->lookup('keywords');
    my ($abstract) = EPrints->lookup('abstract');

    # part name => [ text, weight ]
    my $parts = {
        title    => [ $title,    3 ],
        keywords => [ $keywords, 2 ],
        abstract => [ $abstract, 1 ],
        # content => [ EPrints->fulltext_content, 1 ],
    };

    # A record is indexed only when all three parts are non-empty; the
    # first missing part is reported and the row is skipped.
    for my $part (qw/title keywords abstract/) {
        next if $parts->{$part}->[0];
        warn "skipped $id doesn't have required part $part\n";
        next ROW;
    }

    my $body = '';

    for my $part (qw/title keywords abstract content/) {
        # 'content' is listed for when the commented-out entry above is
        # enabled; until then it has no entry and is skipped here.
        my $entry = $parts->{$part} or next;
        my ( $content, $weight ) = @{$entry};
        next unless $content;

        if ( $use->{slogovi} ) {
            $body .= ' ' . EPrints->slogovi($content);
        }

        if ( $use->{stem} ) {
            my $stem = EPrints->stem($content);
            warn "stem of '$content' didn't return anything\n" unless $stem;

            # Fall back to an empty string so that undef is never
            # interpolated into the body below.
            $content = defined $stem ? $stem : '';
        }

        if ( $use->{score} ) {
            # Weighting is done by plain repetition of the text.
            $body .= "$content " x $weight;
        }
        else {
            $body .= "$content ";
        }

        warn ">>> $body <<<\n" if $debug;
    }

    $indexer->index( $row->{id}, $body );
    $total++;
    print STDERR "$total: ", $row->{id}, " ", _x( $row->{title} ), " - ",
        length($body), " bytes\n";
}

# fetchrow_hashref returns undef both at the end of the result set and on
# a fetch error; report the latter instead of silently indexing less.
warn "fetching rows stopped early: ", $sth->errstr, "\n" if $sth->err;

print STDERR "\nNow adding $total items to the database...";
$indexer->finish();
print STDERR "done!\n";


Properties

Name Value
svn:executable

  ViewVC Help
Powered by ViewVC 1.1.26