/[webpac2]/trunk/lib/WebPAC/Output/KinoSearch.pm
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/lib/WebPAC/Output/KinoSearch.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 536 by dpavlin, Mon Jun 26 16:39:51 2006 UTC revision 924 by dpavlin, Wed Oct 31 00:26:45 2007 UTC
# Line 3  package WebPAC::Output::KinoSearch; Line 3  package WebPAC::Output::KinoSearch;
3  use warnings;  use warnings;
4  use strict;  use strict;
5    
6  use base qw/WebPAC::Common/;  use base qw/WebPAC::Common WebPAC::Output Class::Accessor/;
7    __PACKAGE__->mk_accessors(qw(
8            path
9            database
10            encoding
11            clean
12    
13  use KinoSearch::InvIndexer;          index
14  use KinoSearch::Analysis::PolyAnalyzer;  ));
15    
16    use KinoSearch::Simple;
17    use File::Path;
18  use Encode qw/from_to/;  use Encode qw/from_to/;
19  use Data::Dumper;  use Data::Dump qw/dump/;
20  use Storable;  use Storable;
21    
22  =head1 NAME  =head1 NAME
# Line 17  WebPAC::Output::KinoSearch - Create Kino Line 25  WebPAC::Output::KinoSearch - Create Kino
25    
26  =head1 VERSION  =head1 VERSION
27    
28  Version 0.02  Version 0.05
29    
30  =cut  =cut
31    
32  our $VERSION = '0.02';  our $VERSION = '0.05';
33    
34  =head1 SYNOPSIS  =head1 SYNOPSIS
35    
# Line 34  type C<search>. Line 42  type C<search>.
42    
43  Open KinoSearch index  Open KinoSearch index
44    
45   my $est = new WebPAC::Output::KinoSearch(   my $out = new WebPAC::Output::KinoSearch({
46          index_path => '/path/to/invindex',          path => '/path/to/invindex',
         fields => qw/name of all filelds used/,  
47          database => 'demo',          database => 'demo',
         label => 'node label',  
48          encoding => 'iso-8859-2',          encoding => 'iso-8859-2',
49          clean => 1,          clean => 1,
50   );   });
51    
52  Options are:  Options are:
53    
54  =over 4  =over 4
55    
56  =item index_path  =item path
57    
58  path to KinoSearch index to use  path to KinoSearch index to use
59    
 =item fields  
   
 name of all fields used in this index  
   
60  =item database  =item database
61    
62  name of database from which data comes  name of database from which data comes
63    
 =item label  
   
 label for node (optional)  
   
64  =item encoding  =item encoding
65    
66  character encoding of C<data_structure> if it's different than C<ISO-8859-2>  character encoding of C<data_structure> if it's different than C<ISO-8859-2>
# Line 71  index. Line 69  index.
69    
70  =back  =back
71    
72    =head2 init
73    
74      $out->init;
75    
76  =cut  =cut
77    
78  sub new {  sub init {
79          my $class = shift;          my $self = shift;
         my $self = {@_};  
         bless($self, $class);  
80    
81          my $log = $self->_get_logger;          my $log = $self->_get_logger;
82    
83          #$log->debug("self: ", sub { Dumper($self) });          #$log->debug("self: ", sub { dump($self) });
84    
85          foreach my $p (qw/index_path fields database/) {          foreach my $p (qw/path database/) {
86                  $log->logdie("need $p") unless ($self->{$p});                  $log->logdie("need $p") unless ($self->$p);
87          }          }
88    
89          $log->logdie("fields is not ARRAY") unless (ref($self->{fields}) eq 'ARRAY');  #       $log->logdie("fields is not ARRAY") unless (ref($self->{fields}) eq 'ARRAY');
90    
91          $self->{encoding} ||= 'ISO-8859-2';          $self->encoding( 'ISO-8859-2' ) unless $self->encoding;
92    
93          $log->info("using index $self->{index_path} with encoding $self->{encoding}");          if ( ! -e $self->path ) {
94                    mkpath $self->path || $log->logdie("can't create ", $self->path,": $!");
95                    $log->info("created ", $self->path);
96            } elsif ( $self->clean ) {
97                    $log->info("removing existing ", $self->path);
98                    rmtree $self->path || $log->logdie("can't remove ", $self->path,": $!");
99                    mkpath $self->path || $log->logdie("can't create ", $self->path,": $!");
100            }
101    
102          my $analyzer = KinoSearch::Analysis::PolyAnalyzer->new( language => 'en' );          my $path = $self->path . '/' . $self->database;
103    
104          $self->{invindex} = KinoSearch::InvIndexer->new(          $log->info("using index $path with encoding ", $self->encoding);
105                  invindex => $self->{index_path},  
106                  create   => $self->{clean},          my $index = KinoSearch::Simple->new(
107                  analyzer => $analyzer,                  path => $path,
108                    language => 'en',
109          );          );
110    
111          my $fields_path = $self->{index_path} . '/fields.storable';          $log->logdie("can't open $path: $!") unless $index;
         $fields_path =~ s#//#/#g;  
         if (-e $fields_path) {  
                 $self->{fields} = retrieve($fields_path) ||  
                         $log->warn("can't open $fields_path: $!");  
         } else {  
                 $log->error("This will be dummy run since no fields statistics are found!");  
                 $log->error("You will have to re-run indexing to get search results!");  
                 $self->{dummy_run} = 1;  
         }  
         $self->{fields_path} = $fields_path;  
112    
113          foreach my $f (@{ $self->{fields} }) {          $self->index( $index );
                 $self->{invindex}->spec_field(  
                         name  => $f,  
 #                       boost => 10,  
                         stored => 1,  
                         indexed => 1,  
                         vectorized => 0,  
                 );  
         }  
114    
         $self ? return $self : return undef;  
115  }  }
116    
117    
118  =head2 add  =head2 add
119    
120  Adds one entry to database.  Adds one entry
   
   $est->add(  
         id => 42,  
         ds => $ds,  
         type => 'display',  
         text => 'optional text from which snippet is created',  
   );  
   
 This function will create  entries in index using following URI format:  
121    
122    C<file:///type/database%20name/000>    $out->add( 42, $ds );
   
 Each tag in C<data_structure> with specified C<type> will create one  
 attribute and corresponding hidden text (used for search).  
123    
124  =cut  =cut
125    
126  sub add {  sub add {
127          my $self = shift;          my $self = shift;
128    
129          my $args = {@_};          my ( $id, $ds ) = @_;
130    
131          my $log = $self->_get_logger;          my $log = $self->_get_logger;
132            $log->logdie("need id") unless defined $id;
133            $log->logdie("need ds") unless $ds;
134    
135          my $database = $self->{'database'} || $log->logconfess('no database in $self');          $log->debug("id: $id ds = ",dump($ds));
         $log->logconfess('need invindex in object') unless ($self->{'invindex'});  
   
         foreach my $p (qw/id ds type/) {  
                 $log->logdie("need $p") unless ($args->{$p});  
         }  
   
         my $type = $args->{'type'};  
         my $id = $args->{'id'};  
   
         my $uri = "file:///$type/$database/$id";  
         $log->debug("creating $uri");  
136    
137          my $doc = $self->{invindex}->new_doc( $uri ) || $log->logdie("can't create new_doc( $uri )");          my $hash = $self->ds_to_hash( $ds, 'search' ) || return;
138    
139          sub add_value($$$$$) {          $hash->{database} ||= $self->database;
140                  my ($self,$log,$doc,$n,$v) = @_;          $hash->{id} ||= $id;
                 return unless ($v);  
141    
142                  $self->{value_usage}->{$n}++;          foreach my $f ( keys %$hash ) {
143                  return if ($self->{dummy_run});                  if ( ref($hash->{$f}) eq 'ARRAY' ) {
144                            $hash->{$f} = join(' <*> ', @{ $hash->{$f} });
145                  eval { $doc->set_value($n, $self->convert($v) ) };                  }
                 $log->warn("can't insert: $n = $v") if ($@);  
146          }          }
147    
148          add_value($self,$log,$doc, 'uri', $uri);          $log->debug("add( $id, ", sub { dump($ds) }," ) => ", sub { dump( $hash ) });
   
         $log->debug("ds = ", sub { Dumper($args->{'ds'}) } );  
149    
150          # filter all tags which have type defined          $self->index->add_doc( $hash );
         my @tags = grep {  
                 ref($args->{'ds'}->{$_}) eq 'HASH' && defined( $args->{'ds'}->{$_}->{$type} )  
         } keys %{ $args->{'ds'} };  
151    
152          $log->debug("tags = ", join(",", @tags));          $self->{count}++;
   
         return unless (@tags);  
   
         foreach my $tag (@tags) {  
   
                 my $vals = join(" ", @{ $args->{'ds'}->{$tag}->{$type} });  
   
                 next if (! $vals);  
   
                 $vals = $self->convert( $vals ) or  
                         $log->logdie("can't convert '$vals' to UTF-8");  
   
                 add_value($self, $log, $doc, $tag, $vals );  
         }  
   
         if (my $text = $args->{'text'}) {  
                 add_value($self, $log, $doc, 'bodytext', $text );  
         }  
   
         #$log->debug("adding ", sub { $doc->dump_draft } );  
         $self->{invindex}->add_doc($doc) || $log->warn("can't add document $uri");  
153    
154          return 1;          return 1;
155  }  }
# Line 218  sub add { Line 158  sub add {
158    
159  Close index  Close index
160    
161   $index->finish;   $out->finish;
162    
163  =cut  =cut
164    
# Line 227  sub finish { Line 167  sub finish {
167    
168          my $log = $self->_get_logger();          my $log = $self->_get_logger();
169    
170          $log->info("finish index writing to disk");          $log->info("indexed ", $self->{count}, " records");
         $self->{invindex}->finish;  
   
         $log->info("writing value usage file");  
   
         # add fields from last run  
         map { $self->{value_usage}->{$_}++ } @{ $self->{fields} };  
   
         my @fields = keys %{ $self->{value_usage} };  
         store \@fields, $self->{fields_path} ||  
                 $log->warn("can't write $self->{fields_path}: $!");  
   
 }  
   
 =head2 convert  
   
  my $utf8_string = $self->convert('string in codepage');  
   
 =cut  
   
 sub convert {  
         my $self = shift;  
171    
         my $text = shift || return;  
         from_to($text, $self->{encoding}, 'UTF-8');  
         return $text;  
172  }  }
173    
174  =head1 AUTHOR  =head1 AUTHOR
# Line 261  Dobrica Pavlinusic, C<< <dpavlin@rot13.o Line 177  Dobrica Pavlinusic, C<< <dpavlin@rot13.o
177    
178  =head1 COPYRIGHT & LICENSE  =head1 COPYRIGHT & LICENSE
179    
180  Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.  Copyright 2005-2007 Dobrica Pavlinusic, All Rights Reserved.
181    
182  This program is free software; you can redistribute it and/or modify it  This program is free software; you can redistribute it and/or modify it
183  under the same terms as Perl itself.  under the same terms as Perl itself.

Legend:
Removed from v.536  
changed lines
  Added in v.924

  ViewVC Help
Powered by ViewVC 1.1.26