/[webpac2]/trunk/lib/WebPAC/Output/KinoSearch.pm
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/lib/WebPAC/Output/KinoSearch.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 913 by dpavlin, Mon Sep 3 15:26:46 2007 UTC revision 914 by dpavlin, Tue Oct 30 20:11:04 2007 UTC
# Line 3  package WebPAC::Output::KinoSearch; Line 3  package WebPAC::Output::KinoSearch;
3  use warnings;  use warnings;
4  use strict;  use strict;
5    
6  use base qw/WebPAC::Common/;  use base qw/WebPAC::Common WebPAC::Output Class::Accessor/;
7    __PACKAGE__->mk_accessors(qw(
8            path
9            database
10            encoding
11            clean
12    
13  use KinoSearch::InvIndexer;          index
14  use KinoSearch::Analysis::PolyAnalyzer;  ));
15    
16    use KinoSearch::Simple;
17    use File::Path;
18  use Encode qw/from_to/;  use Encode qw/from_to/;
19  use Data::Dump qw/dump/;  use Data::Dump qw/dump/;
20  use Storable;  use Storable;
# Line 17  WebPAC::Output::KinoSearch - Create Kino Line 25  WebPAC::Output::KinoSearch - Create Kino
25    
26  =head1 VERSION  =head1 VERSION
27    
28  Version 0.03  Version 0.04
29    
30  =cut  =cut
31    
32  our $VERSION = '0.03';  our $VERSION = '0.04';
33    
34  =head1 SYNOPSIS  =head1 SYNOPSIS
35    
# Line 34  type C<search>. Line 42  type C<search>.
42    
43  Open KinoSearch index  Open KinoSearch index
44    
45   my $est = new WebPAC::Output::KinoSearch(   my $est = new WebPAC::Output::KinoSearch({
46          index_path => '/path/to/invindex',          path => '/path/to/invindex',
         fields => qw/name of all filelds used/,  
47          database => 'demo',          database => 'demo',
         label => 'node label',  
48          encoding => 'iso-8859-2',          encoding => 'iso-8859-2',
49          clean => 1,          clean => 1,
50   );   });
51    
52  Options are:  Options are:
53    
54  =over 4  =over 4
55    
56  =item index_path  =item path
57    
58  path to KinoSearch index to use  path to KinoSearch index to use
59    
 =item fields  
   
 name of all fields used in this index  
   
60  =item database  =item database
61    
62  name of database from which data comes  name of database from which data comes
63    
 =item label  
   
 label for node (optional)  
   
64  =item encoding  =item encoding
65    
66  character encoding of C<data_structure> if it's different than C<ISO-8859-2>  character encoding of C<data_structure> if it's different than C<ISO-8859-2>
# Line 73  index. Line 71  index.
71    
72  =cut  =cut
73    
74  sub new {  sub init {
75          my $class = shift;          my $self = shift;
         my $self = {@_};  
         bless($self, $class);  
76    
77          my $log = $self->_get_logger;          my $log = $self->_get_logger;
78    
79          #$log->debug("self: ", sub { dump($self) });          #$log->debug("self: ", sub { dump($self) });
80    
81          foreach my $p (qw/index_path fields database/) {          foreach my $p (qw/path database/) {
82                  $log->logdie("need $p") unless ($self->{$p});                  $log->logdie("need $p") unless ($self->$p);
83          }          }
84    
85          $log->logdie("fields is not ARRAY") unless (ref($self->{fields}) eq 'ARRAY');  #       $log->logdie("fields is not ARRAY") unless (ref($self->{fields}) eq 'ARRAY');
   
         $self->{encoding} ||= 'ISO-8859-2';  
86    
87          $self->{index_path} .= '/' . $self->{database};          $self->encoding( 'ISO-8859-2' ) unless $self->encoding;
88    
89          $self->{clean} = 1 if (! -e $self->{index_path} . '/segments');          if ( ! -e $self->path ) {
90                    mkpath $self->path || $log->logdie("can't create ", $self->path,": $!");
91                    $log->info("created ", $self->path);
92            }
93    
94          $log->info("using", $self->{clean} ? ' new' : '', " index $self->{index_path} with encoding $self->{encoding}");          my $path = $self->path . '/' . $self->database;
95    
96          my $analyzer = KinoSearch::Analysis::PolyAnalyzer->new( language => 'en' );          $log->info("using index $path with encoding ", $self->encoding);
97    
98          $self->{invindex} = KinoSearch::InvIndexer->new(          my $index = KinoSearch::Simple->new(
99                  invindex => $self->{index_path},                  path => $path,
100                  create   => $self->{clean},                  language => 'en',
                 analyzer => $analyzer,  
101          );          );
102    
103          my $fields_path = $self->{index_path} . '/fields.storable';          $log->logdie("can't open $path: $!") unless $index;
         $fields_path =~ s#//#/#g;  
         if (-e $fields_path) {  
                 $self->{fields} = retrieve($fields_path) ||  
                         $log->warn("can't open $fields_path: $!");  
         } else {  
                 $log->error("This will be dummy run since no fields statistics are found!");  
                 $log->error("You will have to re-run indexing to get search results!");  
                 $self->{dummy_run} = 1;  
         }  
         $self->{fields_path} = $fields_path;  
104    
105          foreach my $f (@{ $self->{fields} }) {          $self->index( $index );
                 $self->{invindex}->spec_field(  
                         name  => $f,  
 #                       boost => 10,  
                         stored => 1,  
                         indexed => 1,  
                         vectorized => 0,  
                 );  
         }  
106    
         $self ? return $self : return undef;  
107  }  }
108    
109    
110  =head2 add  =head2 add
111    
112  Adds one entry to database.  Adds one entry
   
   $est->add(  
         id => 42,  
         ds => $ds,  
         type => 'display',  
         text => 'optional text from which snippet is created',  
   );  
   
 This function will create  entries in index using following URI format:  
113    
114    C<file:///type/database%20name/000>    $est->add( 42, $ds );
   
 Each tag in C<data_structure> with specified C<type> will create one  
 attribute and corresponding hidden text (used for search).  
115    
116  =cut  =cut
117    
118  sub add {  sub add {
119          my $self = shift;          my $self = shift;
120    
121          my $args = {@_};          my ( $id, $ds ) = @_;
122    
123          my $log = $self->_get_logger;          my $log = $self->_get_logger;
124            $log->logdie("need id") unless defined $id;
125            $log->logdie("need ds") unless $ds;
126    
127          my $database = $self->{'database'} || $log->logconfess('no database in $self');          $log->debug("id: $id ds = ",dump($ds));
         $log->logconfess('need invindex in object') unless ($self->{'invindex'});  
   
         foreach my $p (qw/id ds type/) {  
                 $log->logdie("need $p") unless ($args->{$p});  
         }  
   
         my $type = $args->{'type'};  
         my $id = $args->{'id'};  
   
         my $uri = "file:///$type/$database/$id";  
         $log->debug("creating $uri");  
   
         my $doc = $self->{invindex}->new_doc( $uri ) || $log->logdie("can't create new_doc( $uri )");  
128    
129          sub _add_value($$$$$) {          my $hash = $self->ds_to_hash( $ds, 'search' ) || return;
                 my ($self,$log,$doc,$n,$v) = @_;  
                 return unless ($v);  
130    
131                  $self->{value_usage}->{$n}++;          warn "add( $id, ",dump($ds)," ) => ", dump( $hash );
                 return if ($self->{dummy_run});  
132    
133                  eval { $doc->set_value($n, $self->convert($v) ) };          $self->index->add_doc( $hash );
                 $log->warn("can't insert: $n = $v") if ($@);  
         }  
   
         _add_value($self,$log,$doc, 'uri', $uri);  
   
         $log->debug("ds = ", sub { dump($args->{'ds'}) } );  
   
         # filter all tags which have type defined  
         my @tags = grep {  
                 ref($args->{'ds'}->{$_}) eq 'HASH' && defined( $args->{'ds'}->{$_}->{$type} )  
         } keys %{ $args->{'ds'} };  
   
         $log->debug("tags = ", join(",", @tags));  
   
         return unless (@tags);  
   
         foreach my $tag (@tags) {  
   
                 my $vals = join(" ", @{ $args->{'ds'}->{$tag}->{$type} });  
   
                 next if (! $vals);  
   
                 $vals = $self->convert( $vals ) or  
                         $log->logdie("can't convert '$vals' to UTF-8");  
   
                 _add_value($self, $log, $doc, $tag, $vals );  
         }  
   
         if (my $text = $args->{'text'}) {  
                 _add_value($self, $log, $doc, 'bodytext', $text );  
         }  
   
         #$log->debug("adding ", sub { $doc->dump_draft } );  
         $self->{invindex}->add_doc($doc) || $log->warn("can't add document $uri");  
134    
135          return 1;          return 1;
136  }  }
# Line 231  sub finish { Line 148  sub finish {
148    
149          my $log = $self->_get_logger();          my $log = $self->_get_logger();
150    
151          $log->info("finish index writing to disk");          $log->info("dummy finish");
         $self->{invindex}->finish;  
   
         $log->info("writing value usage file");  
   
         # add fields from last run  
         map { $self->{value_usage}->{$_}++ } @{ $self->{fields} };  
   
         my @fields = keys %{ $self->{value_usage} };  
         store \@fields, $self->{fields_path} ||  
                 $log->warn("can't write $self->{fields_path}: $!");  
152    
153  }  }
154    
# Line 265  Dobrica Pavlinusic, C<< <dpavlin@rot13.o Line 172  Dobrica Pavlinusic, C<< <dpavlin@rot13.o
172    
173  =head1 COPYRIGHT & LICENSE  =head1 COPYRIGHT & LICENSE
174    
175  Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.  Copyright 2005-2007 Dobrica Pavlinusic, All Rights Reserved.
176    
177  This program is free software; you can redistribute it and/or modify it  This program is free software; you can redistribute it and/or modify it
178  under the same terms as Perl itself.  under the same terms as Perl itself.

Legend:
Removed from v.913  
changed lines
  Added in v.914

  ViewVC Help
Powered by ViewVC 1.1.26