/[webpac2]/trunk/lib/WebPAC/Output/KinoSearch.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Output/KinoSearch.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 887 - (hide annotations)
Mon Sep 3 15:26:46 2007 UTC (16 years, 8 months ago) by dpavlin
File size: 5614 byte(s)
 r1322@llin:  dpavlin | 2007-09-03 16:44:01 +0200
 - replace Data::Dumper usage with Data::Dump
 - rewrite WebPAC::Store to use Class::Accessor

1 dpavlin 431 package WebPAC::Output::KinoSearch;
2    
3     use warnings;
4     use strict;
5    
6     use base qw/WebPAC::Common/;
7    
8     use KinoSearch::InvIndexer;
9     use KinoSearch::Analysis::PolyAnalyzer;
10     use Encode qw/from_to/;
11 dpavlin 887 use Data::Dump qw/dump/;
12 dpavlin 536 use Storable;
13 dpavlin 431
14     =head1 NAME
15    
16     WebPAC::Output::KinoSearch - Create KinoSearch full text index
17    
18     =head1 VERSION
19    
20 dpavlin 609 Version 0.03
21 dpavlin 431
22     =cut
23    
24 dpavlin 609 our $VERSION = '0.03';
25 dpavlin 431
26     =head1 SYNOPSIS
27    
28     Create full text index using KinoSearch index from data with
29     type C<search>.
30    
31     =head1 FUNCTIONS
32    
33     =head2 new
34    
35     Open KinoSearch index
36    
37     my $est = new WebPAC::Output::KinoSearch(
38     index_path => '/path/to/invindex',
39     fields => [ qw/name of all fields used/ ],
40     database => 'demo',
41     label => 'node label',
42     encoding => 'iso-8859-2',
43     clean => 1,
44     );
45    
46     Options are:
47    
48     =over 4
49    
50     =item index_path
51    
52     path to KinoSearch index to use
53    
54     =item fields
55    
56     name of all fields used in this index
57    
58     =item database
59    
60     name of database from which data comes
61    
62     =item label
63    
64     label for node (optional)
65    
66     =item encoding
67    
68     character encoding of C<data_structure> if it's different than C<ISO-8859-2>
69     (and it probably is). This encoding will be converted to C<UTF-8> for
70     index.
71    
72     =back
73    
74     =cut
75    
# Constructor. Takes a flat list of key/value options (see POD above),
# opens the KinoSearch inverted index below index_path/database and
# registers every known field with it.
#
# Required options: index_path, fields (ARRAY ref), database.
# Optional: encoding (defaults to ISO-8859-2), clean, label.
#
# Dies (via logger) if a required option is missing or fields is not an
# ARRAY ref. Returns the blessed object.
sub new {
    my $class = shift;
    my $self  = bless( {@_}, $class );

    my $log = $self->_get_logger;

    #$log->debug("self: ", sub { dump($self) });

    foreach my $p (qw/index_path fields database/) {
        $log->logdie("need $p") unless ( $self->{$p} );
    }

    $log->logdie("fields is not ARRAY") unless ( ref( $self->{fields} ) eq 'ARRAY' );

    $self->{encoding} ||= 'ISO-8859-2';

    # each database gets its own directory below the configured index_path
    $self->{index_path} .= '/' . $self->{database};

    # without a 'segments' file in that directory, ask the indexer to create
    # the index regardless of what the caller passed as clean
    $self->{clean} = 1 if ( !-e $self->{index_path} . '/segments' );

    $log->info("using", $self->{clean} ? ' new' : '', " index $self->{index_path} with encoding $self->{encoding}");

    my $analyzer = KinoSearch::Analysis::PolyAnalyzer->new( language => 'en' );

    $self->{invindex} = KinoSearch::InvIndexer->new(
        invindex => $self->{index_path},
        create   => $self->{clean},
        analyzer => $analyzer,
    );

    my $fields_path = $self->{index_path} . '/fields.storable';
    $fields_path =~ s#//#/#g;
    if ( -e $fields_path ) {
        # Field names written by finish() on a previous run replace the ones
        # passed by the caller. If the stored list can't be read, keep the
        # caller's list instead of overwriting it with the logger's return
        # value (which would break the spec_field loop below).
        my $stored = retrieve($fields_path);
        if ( ref($stored) eq 'ARRAY' ) {
            $self->{fields} = $stored;
        }
        else {
            $log->warn("can't open $fields_path: $!");
        }
    }
    else {
        $log->error("This will be dummy run since no fields statistics are found!");
        $log->error("You will have to re-run indexing to get search results!");
        $self->{dummy_run} = 1;
    }
    $self->{fields_path} = $fields_path;

    foreach my $f ( @{ $self->{fields} } ) {
        $self->{invindex}->spec_field(
            name => $f,
            # boost => 10,
            stored     => 1,
            indexed    => 1,
            vectorized => 0,
        );
    }

    return $self;
}
131    
132    
133     =head2 add
134    
135     Adds one entry to database.
136    
137     $est->add(
138     id => 42,
139     ds => $ds,
140     type => 'display',
141     text => 'optional text from which snippet is created',
142     );
143    
144     This function will create entries in index using following URI format:
145    
146     C<file:///type/database%20name/000>
147    
148     Each tag in C<data_structure> with specified C<type> will create one
149     attribute and corresponding hidden text (used for search).
150    
151     =cut
152    
# Private helper: record that field $n was used and, unless this is a
# dummy run, store value $v (converted to UTF-8 by $self->convert) in $doc.
# False values are skipped entirely. Failures from set_value/convert are
# logged as warnings, not propagated.
sub _add_value {
    my ( $self, $log, $doc, $n, $v ) = @_;
    return unless ($v);

    $self->{value_usage}->{$n}++;
    return if ( $self->{dummy_run} );

    eval { $doc->set_value( $n, $self->convert($v) ) };
    $log->warn("can't insert: $n = $v") if ($@);
}

# Add one record to the index.
#
# Named arguments: id, ds, type (all required), text (optional).
# For every tag in ds that is a HASH with a defined entry for the given
# type, the values are joined with a space and stored under the tag name.
# The document URI is stored as 'uri' and the optional text as 'bodytext'.
#
# Returns 1 after the document is handed to the indexer; returns nothing
# (and adds no document) when ds has no tags of the requested type.
# Dies via the logger when a required argument or object state is missing.
sub add {
    my $self = shift;

    my $args = {@_};

    my $log = $self->_get_logger;

    my $database = $self->{'database'} || $log->logconfess('no database in $self');
    $log->logconfess('need invindex in object') unless ( $self->{'invindex'} );

    foreach my $p (qw/id ds type/) {
        $log->logdie("need $p") unless ( $args->{$p} );
    }

    my $type = $args->{'type'};
    my $id   = $args->{'id'};

    my $uri = "file:///$type/$database/$id";
    $log->debug("creating $uri");

    my $doc = $self->{invindex}->new_doc($uri) || $log->logdie("can't create new_doc( $uri )");

    _add_value( $self, $log, $doc, 'uri', $uri );

    $log->debug( "ds = ", sub { dump( $args->{'ds'} ) } );

    # filter all tags which have type defined
    my @tags = grep {
        ref( $args->{'ds'}->{$_} ) eq 'HASH' && defined( $args->{'ds'}->{$_}->{$type} )
    } keys %{ $args->{'ds'} };

    $log->debug( "tags = ", join( ",", @tags ) );

    return unless (@tags);

    foreach my $tag (@tags) {

        my $vals = join( " ", @{ $args->{'ds'}->{$tag}->{$type} } );

        next if ( !$vals );

        # _add_value converts to UTF-8 itself; converting here as well would
        # run the value through the encoder twice and mangle non-ASCII text.
        _add_value( $self, $log, $doc, $tag, $vals );
    }

    if ( my $text = $args->{'text'} ) {
        _add_value( $self, $log, $doc, 'bodytext', $text );
    }

    #$log->debug("adding ", sub { $doc->dump_draft } );
    $self->{invindex}->add_doc($doc) || $log->warn("can't add document $uri");

    return 1;
}
220    
221 dpavlin 434 =head2 finish
222 dpavlin 431
223 dpavlin 434 Close index
224    
225     $index->finish;
226    
227     =cut
228    
# Flush the index to disk and persist the list of field names that were
# seen, so the next run can pick them up from fields_path.
#
# Returns the result of Storable's store() (true on success).
sub finish {
    my $self = shift;

    my $log = $self->_get_logger();

    $log->info("finish index writing to disk");
    $self->{invindex}->finish;

    $log->info("writing value usage file");

    # merge in the fields this object already knew about, so they are not
    # lost from the stored list when no value for them was added this run
    $self->{value_usage}->{$_}++ for @{ $self->{fields} || [] };

    my @fields = keys %{ $self->{value_usage} };

    # Parenthesised call: with the bare list form, "||" bound to the path
    # argument and a failed store() was never reported.
    my $stored = store( \@fields, $self->{fields_path} );
    $log->warn("can't write $self->{fields_path}: $!") unless ($stored);

    return $stored;
}
247    
248 dpavlin 431 =head2 convert
249    
250     my $utf8_string = $self->convert('string in codepage');
251    
252     =cut
253    
# Convert a byte string from $self->{encoding} to UTF-8.
#
# Returns the converted string, or nothing (undef in scalar context) when
# the argument is undefined or empty. The string "0" is a valid value and
# is converted like any other.
sub convert {
    my ( $self, $text ) = @_;

    return unless ( defined $text && length $text );

    # from_to converts in place, so work on our own copy ($text)
    from_to( $text, $self->{encoding}, 'UTF-8' );
    return $text;
}
261    
262     =head1 AUTHOR
263    
264     Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
265    
266     =head1 COPYRIGHT & LICENSE
267    
268     Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
269    
270     This program is free software; you can redistribute it and/or modify it
271     under the same terms as Perl itself.
272    
273     =cut
274    
275     1; # End of WebPAC::Output::KinoSearch

  ViewVC Help
Powered by ViewVC 1.1.26