Semantic-Engine/EPrints/index.pl

#!/usr/bin/perl -w

use strict;
use Semantic::API;
use Data::Dump qw/dump/;
use EPrints qw/_x/;
use lib '/home/dpavlin/stem-hr/';
use StemHR;

# Build the Semantic::API index for EPrints records.
#
# For every row of archive_title the script combines the title, keywords
# and abstract into one text (title weighted x3, keywords x2), appends the
# StemHR stem of every word plus the output of EPrints::slogovi, and hands
# the result to the indexer under the record's eprint id.
#
# Usage: index.pl [debug]
#   debug - any true value prints the generated body text for each record.

my $debug = shift @ARGV;

my $dbh = EPrints->dbh;
my $sth = $dbh->prepare(qq{
SELECT
        archive_title.eprintid as id,
        title
FROM archive_title 
}) || die $dbh->errstr();
$sth->execute() || die $sth->errstr();

my $indexer = Semantic::API::Index->new(
        storage => 'sqlite',
        database => 'eprints.db',
        collection => 'EPrints'
);


$indexer->add_word_filters( minimum_length => 3,
                            too_many_numbers => 10,
                            maximum_word_length => 15 );

# use this encoding for any incoming text
$indexer->set_default_encoding( "iso-8859-2" );

# Word separator: whitespace plus any ASCII punctuation touching it.
# A plain \W must not be used here: on iso-8859-2 byte strings (no locale,
# no unicode_strings) \W matches every byte >= 0x80, so accented letters at
# the start or end of a word would be swallowed by the separator.
my $word_separator = qr/[^\w\x80-\xff]*\s+[^\w\x80-\xff]*/;

my $total = 0;

while (my $row = $sth->fetchrow_hashref ) {
        # Select the record before the lookup() calls below.
        # NOTE(review): presumably lookup() reads the id set here -- confirm
        # against EPrints.pm.
        EPrints->id( $row->{id} );

        # NOTE(review): lookup() is called in list context; if it can return
        # an empty list the values below shift by one position -- confirm.
        my ( $title, $keywords, $abstract ) = (
                _x( $row->{title} ),
                EPrints->lookup( 'keywords' ),
                EPrints->lookup( 'abstract' )
        );

        # A record may lack some of these fields; fall back to an empty
        # string so the interpolations below do not warn under -w.
        foreach my $field ( $title, $keywords, $abstract ) {
                $field = '' unless defined $field;
        }

        # Repeating a field raises its weight in the indexed text.
        my @body = split( $word_separator, "$title $title $title $keywords $keywords $abstract" );

        # Stemmed form of every word, space separated.
        my $body = '';
        foreach my $word ( @body ) {
                $body .= StemHR->stem( $word ) . ' ';
        }

        # NOTE(review): "slogovi" is Croatian for "syllables"; presumably
        # this adds syllable-based tokens -- confirm against EPrints.pm.
        $body .= EPrints::slogovi( "$title $keywords $abstract" );

        warn "body: $body\n" if $debug;

        # Index the raw words together with the stemmed/derived text.
        $indexer->index( $row->{id}, join(" ", @body, $body ) );
        $total++;
        print STDERR _x( $row->{id}, " ", $row->{title} ), "\n";
}


print STDERR "\nNow adding $total items to the database...";
$indexer->finish(); 
print STDERR "done!\n"; 

1	#!/usr/bin/perl -w
2
3	use strict;
4	use Semantic::API;
5	use Data::Dump qw/dump/;
6	use EPrints qw/_x/;
7	use lib '/home/dpavlin/stem-hr/';
8	use StemHR;
9
10	my $debug = shift @ARGV;
11
12	my $dbh = EPrints->dbh;
13	my $sth = $dbh->prepare(qq{
14	SELECT
15	archive_title.eprintid as id,
16	title
17	FROM archive_title
18	}) || die $dbh->errstr();
19	$sth->execute() || die $sth->errstr();
20
21	my $indexer = Semantic::API::Index->new(
22	storage => 'sqlite',
23	database => 'eprints.db',
24	collection => 'EPrints'
25	);
26
27
28	$indexer->add_word_filters( minimum_length => 3,
29	too_many_numbers => 10,
30	maximum_word_length => 15 );
31
32	# use this encoding for any incoming text
33	$indexer->set_default_encoding( "iso-8859-2" );
34
35	my $total = 0;
36
37	while (my $row = $sth->fetchrow_hashref ) {
38	EPrints->id( $row->{id} );
39	my ( $title, $keywords, $abstract ) = (
40	_x( $row->{title} ),
41	EPrints->lookup( 'keywords' ),
42	EPrints->lookup( 'abstract' )
43	);
44	my @body = split( /\W*\s+\W*/, "$title $title $title $keywords $keywords $abstract" );
45	my $body = '';
46	foreach my $word ( @body ) {
47	$body .= StemHR->stem( $word ) . ' ';
48	}
49
50	$body .= EPrints::slogovi( "$title $keywords $abstract" );
51
52	warn "body: $body\n" if $debug;
53
54	$indexer->index( $row->{id}, join(" ", @body, $body ) );
55	$total++;
56	print STDERR _x( $row->{id}, " ", $row->{title} ), "\n";
57	}
58
59
60	print STDERR "\nNow adding $total items to the database...";
61	$indexer->finish();
62	print STDERR "done!\n";
63