--- trunk/lib/WebPAC/Output/KinoSearch.pm	2006/08/01 17:26:55	609
+++ trunk/lib/WebPAC/Output/KinoSearch.pm	2007/10/31 12:29:57	939
@@ -3,12 +3,21 @@
 use warnings;
 use strict;
 
-use base qw/WebPAC::Common/;
+use base qw/WebPAC::Common WebPAC::Output Class::Accessor/;
+__PACKAGE__->mk_accessors(qw(
+	path
+	database
+	input
+	encoding
+	clean
 
-use KinoSearch::InvIndexer;
-use KinoSearch::Analysis::PolyAnalyzer;
+	index
+));
+
+use KinoSearch::Simple;
+use File::Path;
 use Encode qw/from_to/;
-use Data::Dumper;
+use Data::Dump qw/dump/;
 use Storable;
 
 =head1 NAME
@@ -17,11 +26,11 @@
 
 =head1 VERSION
 
-Version 0.03
+Version 0.05
 
 =cut
 
-our $VERSION = '0.03';
+our $VERSION = '0.05';
 
 =head1 SYNOPSIS
 
@@ -34,35 +43,25 @@
 
 Open KinoSearch index
 
- my $est = new WebPAC::Output::KinoSearch(
- 	index_path => '/path/to/invindex',
- 	fields => qw/name of all filelds used/,
+ my $out = WebPAC::Output::KinoSearch->new({
+ 	path => '/path/to/invindex',
  	database => 'demo',
- 	label => 'node label',
  	encoding => 'iso-8859-2',
  	clean => 1,
- );
+ });
 
 Options are:
 
 =over 4
 
-=item index_path
+=item path
 
 path to KinoSearch index to use
 
-=item fields
-
-name of all fields used in this index
-
 =item database
 
 name of database from which data comes
 
-=item label
-
-label for node (optional)
-
 =item encoding
 
 character encoding of C if it's differenet than C
@@ -71,147 +70,93 @@
 
 =back
 
+=head2 init
+
+ $out->init;
+
 =cut
 
-sub new {
-	my $class = shift;
-	my $self = {@_};
-	bless($self, $class);
+sub init {
+	my $self = shift;
 
 	my $log = $self->_get_logger;
 
-	#$log->debug("self: ", sub { Dumper($self) });
+	#$log->debug("self: ", sub { dump($self) });
 
-	foreach my $p (qw/index_path fields database/) {
-		$log->logdie("need $p") unless ($self->{$p});
+	foreach my $p (qw/path database/) {
+		$log->logdie("need $p") unless ($self->$p);
 	}
 
-	$log->logdie("fields is not ARRAY") unless (ref($self->{fields}) eq 'ARRAY');
+#	$log->logdie("fields is not ARRAY") unless (ref($self->{fields}) eq 'ARRAY');
 
-	$self->{encoding} ||= 'ISO-8859-2';
+	$self->encoding( 'ISO-8859-2' ) unless $self->encoding;
 
-	$self->{clean} = 1 if (! -e $self->{index_path} . '/segments');
+	## FIXME we shouldn't re-create whole KinoSearch index every time!
+	$self->clean( 1 );
 
-	$log->info("using", $self->{clean} ? ' new' : '', " index $self->{index_path} with encoding $self->{encoding}");
+	# Parenthesised calls with low-precedence "or": with "||" the logdie would
+	# bind to the $self->path argument and never guard mkpath/rmtree itself.
+	if ( ! -e $self->path ) {
+		mkpath( $self->path ) or $log->logdie("can't create ", $self->path,": $!");
+		$log->info("created ", $self->path);
+	} elsif ( $self->clean ) {
+		$log->info("removing existing ", $self->path);
+		rmtree( $self->path ) or $log->logdie("can't remove ", $self->path,": $!");
+		mkpath( $self->path ) or $log->logdie("can't create ", $self->path,": $!");
+	}
 
-	my $analyzer = KinoSearch::Analysis::PolyAnalyzer->new( language => 'en' );
+	my $path = $self->path . '/' . $self->database;
 
-	$self->{invindex} = KinoSearch::InvIndexer->new(
-		invindex => $self->{index_path},
-		create   => $self->{clean},
-		analyzer => $analyzer,
+	$log->info("using index $path with encoding ", $self->encoding);
+
+	my $index = KinoSearch::Simple->new(
+		path => $path,
+		language => 'en',
 	);
 
-	my $fields_path = $self->{index_path} . '/fields.storable';
-	$fields_path =~ s#//#/#g;
-	if (-e $fields_path) {
-		$self->{fields} = retrieve($fields_path) ||
-			$log->warn("can't open $fields_path: $!");
-	} else {
-		$log->error("This will be dummy run since no fields statistics are found!");
-		$log->error("You will have to re-run indexing to get search results!");
-		$self->{dummy_run} = 1;
-	}
-	$self->{fields_path} = $fields_path;
+	$log->logdie("can't open $path: $!") unless $index;
 
-	foreach my $f (@{ $self->{fields} }) {
-		$self->{invindex}->spec_field(
-			name => $f,
-#			boost => 10,
-			stored => 1,
-			indexed => 1,
-			vectorized => 0,
-		);
-	}
+	$self->index( $index );
 
-	$self ? return $self : return undef;
 }
 
 
 =head2 add
 
-Adds one entry to database.
-
- $est->add(
- 	id => 42,
- 	ds => $ds,
- 	type => 'display',
- 	text => 'optional text from which snippet is created',
- );
-
-This function will create entries in index using following URI format:
+Adds one entry
 
- C
-
-Each tag in C with specified C will create one
-attribute and corresponding hidden text (used for search).
+ $out->add( 42, $ds );
 
 =cut
 
 sub add {
 	my $self = shift;
 
-	my $args = {@_};
+	my ( $id, $ds ) = @_;
 
 	my $log = $self->_get_logger;
+	$log->logdie("need id") unless defined $id;
+	$log->logdie("need ds") unless $ds;
 
-	my $database = $self->{'database'} || $log->logconfess('no database in $self');
-	$log->logconfess('need invindex in object') unless ($self->{'invindex'});
-
-	foreach my $p (qw/id ds type/) {
-		$log->logdie("need $p") unless ($args->{$p});
-	}
-
-	my $type = $args->{'type'};
-	my $id = $args->{'id'};
-
-	my $uri = "file:///$type/$database/$id";
-	$log->debug("creating $uri");
+	$log->debug("id: $id ds = ", sub { dump($ds) });
 
-	my $doc = $self->{invindex}->new_doc( $uri ) || $log->logdie("can't create new_doc( $uri )");
+	my $hash = $self->ds_to_hash( $ds, 'search' ) || return;
 
-	sub add_value($$$$$) {
-		my ($self,$log,$doc,$n,$v) = @_;
-		return unless ($v);
+	$hash->{id} ||= $id;
+	$hash->{database} ||= $self->database;
+	$hash->{input} ||= $self->input;
 
-		$self->{value_usage}->{$n}++;
-		return if ($self->{dummy_run});
-
-		eval { $doc->set_value($n, $self->convert($v) ) };
-		$log->warn("can't insert: $n = $v") if ($@);
+	foreach my $f ( keys %$hash ) {
+		if ( ref($hash->{$f}) eq 'ARRAY' ) {
+			$hash->{$f} = join(' <*> ', @{ $hash->{$f} });
+		}
 	}
 
-	add_value($self,$log,$doc, 'uri', $uri);
-
-	$log->debug("ds = ", sub { Dumper($args->{'ds'}) } );
+	$log->debug("add( $id, ", sub { dump($ds) }," ) => ", sub { dump( $hash ) });
 
-	# filter all tags which have type defined
-	my @tags = grep {
-		ref($args->{'ds'}->{$_}) eq 'HASH' && defined( $args->{'ds'}->{$_}->{$type} )
-	} keys %{ $args->{'ds'} };
+	$self->index->add_doc( $hash );
 
-	$log->debug("tags = ", join(",", @tags));
-
-	return unless (@tags);
-
-	foreach my $tag (@tags) {
-
-		my $vals = join(" ", @{ $args->{'ds'}->{$tag}->{$type} });
-
-		next if (! $vals);
-
-		$vals = $self->convert( $vals ) or
-			$log->logdie("can't convert '$vals' to UTF-8");
-
-		add_value($self, $log, $doc, $tag, $vals );
-	}
-
-	if (my $text = $args->{'text'}) {
-		add_value($self, $log, $doc, 'bodytext', $text );
-	}
-
-	#$log->debug("adding ", sub { $doc->dump_draft } );
-	$self->{invindex}->add_doc($doc) || $log->warn("can't add document $uri");
+	$self->{count}++;
 
 	return 1;
 }
@@ -220,7 +165,7 @@
 
 Close index
 
- $index->finish;
+ $out->finish;
 
 =cut
 
@@ -229,32 +174,9 @@
 
 	my $log = $self->_get_logger();
 
-	$log->info("finish index writing to disk");
-	$self->{invindex}->finish;
-
-	$log->info("writing value usage file");
-
-	# add fields from last run
-	map { $self->{value_usage}->{$_}++ } @{ $self->{fields} };
-
-	my @fields = keys %{ $self->{value_usage} };
-	store \@fields, $self->{fields_path} ||
-		$log->warn("can't write $self->{fields_path}: $!");
-
-}
-
-=head2 convert
-
- my $utf8_string = $self->convert('string in codepage');
-
-=cut
-
-sub convert {
-	my $self = shift;
+	# add() is the only visible place that sets count; default to 0 when nothing was indexed
+	$log->info("indexed ", $self->{count} || 0, " records");
 
-	my $text = shift || return;
-	from_to($text, $self->{encoding}, 'UTF-8');
-	return $text;
 }
 
 =head1 AUTHOR
@@ -263,7 +185,7 @@
 
 =head1 COPYRIGHT & LICENSE
 
-Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
+Copyright 2005-2007 Dobrica Pavlinusic, All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it
 under the same terms as Perl itself.