/[webpac2]/trunk/lib/WebPAC/Output/KinoSearch.pm
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/lib/WebPAC/Output/KinoSearch.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 609 by dpavlin, Tue Aug 1 17:26:55 2006 UTC revision 949 by dpavlin, Thu Nov 1 00:16:48 2007 UTC
# Line 3  package WebPAC::Output::KinoSearch; Line 3  package WebPAC::Output::KinoSearch;
3  use warnings;  use warnings;
4  use strict;  use strict;
5    
6  use base qw/WebPAC::Common/;  use base qw/WebPAC::Common WebPAC::Output Class::Accessor/;
7    __PACKAGE__->mk_accessors(qw(
8  use KinoSearch::InvIndexer;          path
9  use KinoSearch::Analysis::PolyAnalyzer;          database
10  use Encode qw/from_to/;          input
11  use Data::Dumper;          encoding
12            clean
13    
14            index
15    ));
16    
17    use KinoSearch::Simple;
18    use File::Path;
19    use Encode qw/decode/;
20    use Data::Dump qw/dump/;
21  use Storable;  use Storable;
22    
23  =head1 NAME  =head1 NAME
# Line 17  WebPAC::Output::KinoSearch - Create Kino Line 26  WebPAC::Output::KinoSearch - Create Kino
26    
27  =head1 VERSION  =head1 VERSION
28    
29  Version 0.03  Version 0.05
30    
31  =cut  =cut
32    
33  our $VERSION = '0.03';  our $VERSION = '0.05';
34    
35  =head1 SYNOPSIS  =head1 SYNOPSIS
36    
# Line 34  type C<search>. Line 43  type C<search>.
43    
44  Open KinoSearch index  Open KinoSearch index
45    
46   my $est = new WebPAC::Output::KinoSearch(   my $out = new WebPAC::Output::KinoSearch({
47          index_path => '/path/to/invindex',          path => '/path/to/invindex',
         fields => qw/name of all fields used/,  
48          database => 'demo',          database => 'demo',
         label => 'node label',  
49          encoding => 'iso-8859-2',          encoding => 'iso-8859-2',
50          clean => 1,          clean => 1,
51   );   });
52    
53  Options are:  Options are:
54    
55  =over 4  =over 4
56    
57  =item index_path  =item path
58    
59  path to KinoSearch index to use  path to KinoSearch index to use
60    
 =item fields  
   
 name of all fields used in this index  
   
61  =item database  =item database
62    
63  name of database from which data comes  name of database from which data comes
64    
 =item label  
   
 label for node (optional)  
   
65  =item encoding  =item encoding
66    
67  character encoding of C<data_structure> if it's different than C<ISO-8859-2>  character encoding of C<data_structure> if it's different than C<ISO-8859-2>
# Line 71  index. Line 70  index.
70    
71  =back  =back
72    
73    =head2 init
74    
75      $out->init;
76    
77  =cut  =cut
78    
79  sub new {  sub init {
80          my $class = shift;          my $self = shift;
         my $self = {@_};  
         bless($self, $class);  
81    
82          my $log = $self->_get_logger;          my $log = $self->_get_logger;
83    
84          #$log->debug("self: ", sub { Dumper($self) });          #$log->debug("self: ", sub { dump($self) });
85    
86          foreach my $p (qw/index_path fields database/) {          foreach my $p (qw/path database/) {
87                  $log->logdie("need $p") unless ($self->{$p});                  $log->logdie("need $p") unless ($self->$p);
88          }          }
89    
90          $log->logdie("fields is not ARRAY") unless (ref($self->{fields}) eq 'ARRAY');  #       $log->logdie("fields is not ARRAY") unless (ref($self->{fields}) eq 'ARRAY');
91    
92          $self->{encoding} ||= 'ISO-8859-2';          $self->encoding( 'ISO-8859-2' ) unless $self->encoding;
93    
94          $self->{clean} = 1 if (! -e $self->{index_path} . '/segments');          ## FIXME we shouldn't re-create whole KinoSearch index every time!
95            $self->clean( 1 );
96    
97          $log->info("using", $self->{clean} ? ' new' : '', " index $self->{index_path} with encoding $self->{encoding}");          if ( ! -e $self->path ) {
98                    mkpath $self->path || $log->logdie("can't create ", $self->path,": $!");
99                    $log->info("created ", $self->path);
100            } elsif ( $self->clean ) {
101                    $log->info("removing existing ", $self->path);
102                    rmtree $self->path || $log->logdie("can't remove ", $self->path,": $!");
103                    mkpath $self->path || $log->logdie("can't create ", $self->path,": $!");
104            }
105    
106          my $analyzer = KinoSearch::Analysis::PolyAnalyzer->new( language => 'en' );          my $path = $self->path . '/' . $self->database;
107    
108          $self->{invindex} = KinoSearch::InvIndexer->new(          $log->info("using index $path with encoding ", $self->encoding);
109                  invindex => $self->{index_path},  
110                  create   => $self->{clean},          my $index = KinoSearch::Simple->new(
111                  analyzer => $analyzer,                  path => $path,
112                    language => 'en',
113          );          );
114    
115          my $fields_path = $self->{index_path} . '/fields.storable';          $log->logdie("can't open $path: $!") unless $index;
         $fields_path =~ s#//#/#g;  
         if (-e $fields_path) {  
                 $self->{fields} = retrieve($fields_path) ||  
                         $log->warn("can't open $fields_path: $!");  
         } else {  
                 $log->error("This will be dummy run since no fields statistics are found!");  
                 $log->error("You will have to re-run indexing to get search results!");  
                 $self->{dummy_run} = 1;  
         }  
         $self->{fields_path} = $fields_path;  
116    
117          foreach my $f (@{ $self->{fields} }) {          $self->index( $index );
                 $self->{invindex}->spec_field(  
                         name  => $f,  
 #                       boost => 10,  
                         stored => 1,  
                         indexed => 1,  
                         vectorized => 0,  
                 );  
         }  
118    
         $self ? return $self : return undef;  
119  }  }
120    
121    
122  =head2 add  =head2 add
123    
124  Adds one entry to database.  Adds one entry
   
   $est->add(  
         id => 42,  
         ds => $ds,  
         type => 'display',  
         text => 'optional text from which snippet is created',  
   );  
   
 This function will create  entries in index using following URI format:  
125    
126    C<file:///type/database%20name/000>    $out->add( 42, $ds );
   
 Each tag in C<data_structure> with specified C<type> will create one  
 attribute and corresponding hidden text (used for search).  
127    
128  =cut  =cut
129    
130  sub add {  sub add {
131          my $self = shift;          my $self = shift;
132    
133          my $args = {@_};          my ( $id, $ds ) = @_;
134    
135          my $log = $self->_get_logger;          my $log = $self->_get_logger;
136            $log->logdie("need id") unless defined $id;
137            $log->logdie("need ds") unless $ds;
138    
139          my $database = $self->{'database'} || $log->logconfess('no database in $self');          $log->debug("id: $id ds = ", sub { dump($ds) });
         $log->logconfess('need invindex in object') unless ($self->{'invindex'});  
   
         foreach my $p (qw/id ds type/) {  
                 $log->logdie("need $p") unless ($args->{$p});  
         }  
   
         my $type = $args->{'type'};  
         my $id = $args->{'id'};  
   
         my $uri = "file:///$type/$database/$id";  
         $log->debug("creating $uri");  
140    
141          my $doc = $self->{invindex}->new_doc( $uri ) || $log->logdie("can't create new_doc( $uri )");          my $hash = $self->ds_to_hash( $ds, 'search' ) || return;
142    
143          sub add_value($$$$$) {          $hash->{id}       ||= $id;
144                  my ($self,$log,$doc,$n,$v) = @_;          $hash->{database} ||= $self->database;
145                  return unless ($v);          $hash->{input}    ||= $self->input;
146    
147                  $self->{value_usage}->{$n}++;          foreach my $f ( keys %$hash ) {
148                  return if ($self->{dummy_run});                  if ( ref($hash->{$f}) eq 'ARRAY' ) {
149                            $hash->{$f} = join(' <*> ', @{ $hash->{$f} });
150                  eval { $doc->set_value($n, $self->convert($v) ) };                  }
151                  $log->warn("can't insert: $n = $v") if ($@);  #               $hash->{$f} = decode( $self->encoding, $hash->{$f} );
152          }          }
153    
154          add_value($self,$log,$doc, 'uri', $uri);          $log->debug("add( $id, ", sub { dump($ds) }," ) => ", sub { dump( $hash ) });
   
         $log->debug("ds = ", sub { Dumper($args->{'ds'}) } );  
155    
156          # filter all tags which have type defined          $self->index->add_doc( $hash );
         my @tags = grep {  
                 ref($args->{'ds'}->{$_}) eq 'HASH' && defined( $args->{'ds'}->{$_}->{$type} )  
         } keys %{ $args->{'ds'} };  
157    
158          $log->debug("tags = ", join(",", @tags));          $self->{count}++;
   
         return unless (@tags);  
   
         foreach my $tag (@tags) {  
   
                 my $vals = join(" ", @{ $args->{'ds'}->{$tag}->{$type} });  
   
                 next if (! $vals);  
   
                 $vals = $self->convert( $vals ) or  
                         $log->logdie("can't convert '$vals' to UTF-8");  
   
                 add_value($self, $log, $doc, $tag, $vals );  
         }  
   
         if (my $text = $args->{'text'}) {  
                 add_value($self, $log, $doc, 'bodytext', $text );  
         }  
   
         #$log->debug("adding ", sub { $doc->dump_draft } );  
         $self->{invindex}->add_doc($doc) || $log->warn("can't add document $uri");  
159    
160          return 1;          return 1;
161  }  }
# Line 220  sub add { Line 164  sub add {
164    
165  Close index  Close index
166    
167   $index->finish;   $out->finish;
168    
169  =cut  =cut
170    
# Line 229  sub finish { Line 173  sub finish {
173    
174          my $log = $self->_get_logger();          my $log = $self->_get_logger();
175    
176          $log->info("finish index writing to disk");          $log->info("indexed ", $self->{count}, " records");
         $self->{invindex}->finish;  
   
         $log->info("writing value usage file");  
   
         # add fields from last run  
         map { $self->{value_usage}->{$_}++ } @{ $self->{fields} };  
   
         my @fields = keys %{ $self->{value_usage} };  
         store \@fields, $self->{fields_path} ||  
                 $log->warn("can't write $self->{fields_path}: $!");  
   
 }  
   
 =head2 convert  
   
  my $utf8_string = $self->convert('string in codepage');  
   
 =cut  
   
 sub convert {  
         my $self = shift;  
177    
         my $text = shift || return;  
         from_to($text, $self->{encoding}, 'UTF-8');  
         return $text;  
178  }  }
179    
180  =head1 AUTHOR  =head1 AUTHOR
# Line 263  Dobrica Pavlinusic, C<< <dpavlin@rot13.o Line 183  Dobrica Pavlinusic, C<< <dpavlin@rot13.o
183    
184  =head1 COPYRIGHT & LICENSE  =head1 COPYRIGHT & LICENSE
185    
186  Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.  Copyright 2005-2007 Dobrica Pavlinusic, All Rights Reserved.
187    
188  This program is free software; you can redistribute it and/or modify it  This program is free software; you can redistribute it and/or modify it
189  under the same terms as Perl itself.  under the same terms as Perl itself.

Legend:
Removed from v.609  
changed lines
  Added in v.949

  ViewVC Help
Powered by ViewVC 1.1.26