Semantic-Engine/EPrints/index.pl

#!/usr/bin/perl -w

use strict;
use Semantic::API;
use DBI;
use Data::Dump qw/dump/;
use EPrints qw/_x/;
use lib '/home/dpavlin/stem-hr/';
use StemHR;

# Build a Semantic::API index from EPrints records.
#
# For every row of archive_title the script combines the title, keywords
# and abstract, appends the stemmed form of every word, and hands the
# resulting text to the indexer under the record id.
#
# Usage: index.pl [debug]
#   A true first argument enables the per-record "body:" line on STDERR.

# Dumps the stem of one sample word on every run.
# NOTE(review): looks like a leftover StemHR smoke test -- confirm whether
# it should stay unconditional.
warn dump( StemHR->stem('kuæni') );

my $debug = shift @ARGV;

# Fetch the id and title of every record; die with the DBI error text if
# the statement cannot be prepared or executed.
my $dbh = EPrints->dbh;
my $sth = $dbh->prepare(qq{
SELECT
        archive_title.eprintid as id,
        title
FROM archive_title 
}) or die $dbh->errstr();
$sth->execute() or die $sth->errstr();

my $indexer = Semantic::API::Index->new(
        storage    => 'sqlite',
        database   => 'eprints.db',
        collection => 'EPrints',
);

$indexer->add_word_filters(
        minimum_length      => 3,
        too_many_numbers    => 10,
        maximum_word_length => 15,
);

# use this encoding for any incoming text
$indexer->set_default_encoding( "utf8" );

# Number of records handed to the indexer.
my $total = 0;

while ( my $row = $sth->fetchrow_hashref ) {
        # presumably selects the record that the lookup() calls below read
        # from -- verify against the EPrints module
        EPrints->id( $row->{id} );

        my ( $title, $keywords, $abstract ) = (
                _x( $row->{title} ),
                EPrints->lookup( 'keywords' ),
                EPrints->lookup( 'abstract' )
        );

        # The title appears three times and the keywords twice in the text
        # being split (presumably to weight them -- TODO confirm).
        my @words = split( /\W*\s+\W*/, "$title $title $title $keywords $keywords $abstract" );

        # Stemmed form of every word, each followed by one space.  Starts as
        # '' rather than undef so that a record producing no words does not
        # trigger "uninitialized value" warnings in the debug output and in
        # the join below.
        my $stemmed = '';
        foreach my $word ( @words ) {
                $stemmed .= StemHR->stem( $word ) . ' ';
        }

        warn "body: $stemmed\n" if $debug;

        # Index the original words followed by their stemmed forms.
        $indexer->index( $row->{id}, join( " ", @words, $stemmed ) );
        $total++;

        # Progress line: record id and title.
        print STDERR _x( $row->{id}, " ", $row->{title} ), "\n";
}


print STDERR "\nNow adding $total items to the database...";
$indexer->finish();
print STDERR "done!\n";

1	#!/usr/bin/perl -w
2
3	use strict;
4	use Semantic::API;
5	use DBI;
6	use Data::Dump qw/dump/;
7	use EPrints qw/_x/;
8	use lib '/home/dpavlin/stem-hr/';
9	use StemHR;
10
11	warn dump( StemHR->stem('kuæni') );
12
13	my $debug = shift @ARGV;
14
15	my $dbh = EPrints->dbh;
16	my $sth = $dbh->prepare(qq{
17	SELECT
18	archive_title.eprintid as id,
19	title
20	FROM archive_title
21	}) || die $dbh->errstr();
22	$sth->execute() || die $sth->errstr();
23
24	my $indexer = Semantic::API::Index->new(
25	storage => 'sqlite',
26	database => 'eprints.db',
27	collection => 'EPrints'
28	);
29
30
31	$indexer->add_word_filters( minimum_length => 3,
32	too_many_numbers => 10,
33	maximum_word_length => 15 );
34
35	# use this encoding for any incoming text
36	$indexer->set_default_encoding( "utf8");
37
38	my $total = 0;
39
40	while (my $row = $sth->fetchrow_hashref ) {
41	EPrints->id( $row->{id} );
42	my ( $title, $keywords, $abstract ) = (
43	_x( $row->{title} ),
44	EPrints->lookup( 'keywords' ),
45	EPrints->lookup( 'abstract' )
46	);
47	my @body = split( /\W*\s+\W*/, "$title $title $title $keywords $keywords $abstract" );
48	my $body;
49	foreach my $word ( @body ) {
50	$body .= StemHR->stem( $word ) . ' ';
51	}
52
53	warn "body: $body\n" if $debug;
54
55	$indexer->index( $row->{id}, join(" ", @body, $body ) );
56	$total++;
57	print STDERR _x( $row->{id}, " ", $row->{title} ), "\n";
58	}
59
60
61	print STDERR "\nNow adding $total items to the database...";
62	$indexer->finish();
63	print STDERR "done!\n";
64