/[webpac2]/trunk/lib/WebPAC/Output/KinoSearch.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WebPAC/Output/KinoSearch.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 536 - (show annotations)
Mon Jun 26 16:39:51 2006 UTC (17 years, 9 months ago) by dpavlin
File size: 5457 byte(s)
 r719@llin:  dpavlin | 2006-06-26 18:40:57 +0200
 big refacture: depriciate and remove all normalisation formats except .pl sets (but
 old code is still available in WebPAC::Lookup::Normalize because lookups use it) [2.20]

1 package WebPAC::Output::KinoSearch;
2
3 use warnings;
4 use strict;
5
6 use base qw/WebPAC::Common/;
7
8 use KinoSearch::InvIndexer;
9 use KinoSearch::Analysis::PolyAnalyzer;
10 use Encode qw/from_to/;
11 use Data::Dumper;
12 use Storable;
13
14 =head1 NAME
15
16 WebPAC::Output::KinoSearch - Create KinoSearch full text index
17
18 =head1 VERSION
19
20 Version 0.02
21
22 =cut
23
24 our $VERSION = '0.02';
25
26 =head1 SYNOPSIS
27
28 Create full text index using KinoSearch index from data with
29 type C<search>.
30
31 =head1 FUNCTIONS
32
33 =head2 new
34
35 Open KinoSearch index
36
37 my $est = WebPAC::Output::KinoSearch->new(
38 index_path => '/path/to/invindex',
39 fields => [ qw/name of all fields used/ ],
40 database => 'demo',
41 label => 'node label',
42 encoding => 'iso-8859-2',
43 clean => 1,
44 );
45
46 Options are:
47
48 =over 4
49
50 =item index_path
51
52 path to KinoSearch index to use
53
54 =item fields
55
56 name of all fields used in this index
57
58 =item database
59
60 name of database from which data comes
61
62 =item label
63
64 label for node (optional)
65
66 =item encoding
67
68 character encoding of C<data_structure> if it's different from C<ISO-8859-2>
69 (and it probably is). This encoding will be converted to C<UTF-8> for
70 index.
71
72 =back
73
74 =cut
75
# Constructor: opens the KinoSearch inverted index and registers the
# fields that will be indexed.
#
# Arguments are given as a key => value list and kept in the object:
#   index_path - path to the KinoSearch index (required)
#   fields     - ARRAY ref with names of fields (required)
#   database   - name of the database (required)
#   encoding   - input encoding, defaults to 'ISO-8859-2'
#   clean      - handed to KinoSearch::InvIndexer as its 'create' option
#
# Dies through the logger if a required argument is missing or if
# 'fields' is not an ARRAY ref.  Returns the blessed object.
sub new {
	my $class = shift;
	my $self = {@_};
	bless($self, $class);

	my $log = $self->_get_logger;

	#$log->debug("self: ", sub { Dumper($self) });

	foreach my $p (qw/index_path fields database/) {
		$log->logdie("need $p") unless ($self->{$p});
	}

	$log->logdie("fields is not ARRAY") unless (ref($self->{fields}) eq 'ARRAY');

	$self->{encoding} ||= 'ISO-8859-2';

	$log->info("using index $self->{index_path} with encoding $self->{encoding}");

	my $analyzer = KinoSearch::Analysis::PolyAnalyzer->new( language => 'en' );

	$self->{invindex} = KinoSearch::InvIndexer->new(
		invindex => $self->{index_path},
		create => $self->{clean},
		analyzer => $analyzer,
	);

	my $fields_path = $self->{index_path} . '/fields.storable';
	$fields_path =~ s#//#/#g;
	if (-e $fields_path) {
		# Field names written by finish() during a previous run replace
		# the ones passed to the constructor.
		my $stored_fields = retrieve($fields_path);
		if (ref($stored_fields) eq 'ARRAY') {
			$self->{fields} = $stored_fields;
		} else {
			# Keep the caller-supplied fields.  Assigning the logger's
			# return value here (as "retrieve(...) || warn" did) would
			# make the @{ $self->{fields} } loop below die.
			$log->warn("can't open $fields_path: $!");
		}
	} else {
		$log->error("This will be dummy run since no fields statistics are found!");
		$log->error("You will have to re-run indexing to get search results!");
		$self->{dummy_run} = 1;
	}
	$self->{fields_path} = $fields_path;

	foreach my $f (@{ $self->{fields} }) {
		$self->{invindex}->spec_field(
			name => $f,
#			boost => 10,
			stored => 1,
			indexed => 1,
			vectorized => 0,
		);
	}

	return $self;
}
127
128
129 =head2 add
130
131 Adds one entry to database.
132
133 $est->add(
134 id => 42,
135 ds => $ds,
136 type => 'display',
137 text => 'optional text from which snippet is created',
138 );
139
140 This function will create entries in index using following URI format:
141
142 C<file:///type/database%20name/000>
143
144 Each tag in C<data_structure> with specified C<type> will create one
145 attribute and corresponding hidden text (used for search).
146
147 =cut
148
# Add one record to the index.
#
# Arguments (key => value list):
#   id   - record identifier (required)
#   ds   - data structure hash ref; each tag is expected to be a hash
#          whose $type key holds an ARRAY ref of values (required)
#   type - which key of every tag to index (required)
#   text - optional text stored in the 'bodytext' field
#
# Dies through the logger when a required argument is missing or the
# document can't be created.  Returns nothing when no tag has data for
# the requested type (the document is not added in that case), and 1
# otherwise.
sub add {
	my $self = shift;

	my $args = {@_};

	my $log = $self->_get_logger;

	my $database = $self->{'database'} || $log->logconfess('no database in $self');
	$log->logconfess('need invindex in object') unless ($self->{'invindex'});

	foreach my $p (qw/id ds type/) {
		$log->logdie("need $p") unless ($args->{$p});
	}

	my $type = $args->{'type'};
	my $id = $args->{'id'};

	my $uri = "file:///$type/$database/$id";
	$log->debug("creating $uri");

	my $doc = $self->{invindex}->new_doc( $uri ) || $log->logdie("can't create new_doc( $uri )");

	add_value($self, $log, $doc, 'uri', $uri);

	$log->debug("ds = ", sub { Dumper($args->{'ds'}) } );

	# filter all tags which have type defined
	my @tags = grep {
		ref($args->{'ds'}->{$_}) eq 'HASH' && defined( $args->{'ds'}->{$_}->{$type} )
	} keys %{ $args->{'ds'} };

	$log->debug("tags = ", join(",", @tags));

	return unless (@tags);

	foreach my $tag (@tags) {

		my $vals = join(" ", @{ $args->{'ds'}->{$tag}->{$type} });

		next if (! $vals);

		# Pass the value in its original encoding: add_value() performs
		# the one and only conversion to UTF-8.  Converting here as well
		# would encode non-ASCII characters twice.
		add_value($self, $log, $doc, $tag, $vals);
	}

	if (my $text = $args->{'text'}) {
		add_value($self, $log, $doc, 'bodytext', $text);
	}

	#$log->debug("adding ", sub { $doc->dump_draft } );
	$self->{invindex}->add_doc($doc) || $log->warn("can't add document $uri");

	return 1;
}

# Helper for add(): count usage of field $n and store value $v, converted
# from the object's encoding to UTF-8, in document $doc.
#
# False values are ignored.  During a dummy run only the usage counter is
# updated.  A failing set_value is logged as a warning and otherwise
# ignored.
sub add_value {
	my ($self, $log, $doc, $n, $v) = @_;
	return unless ($v);

	$self->{value_usage}->{$n}++;
	return if ($self->{dummy_run});

	# Convert outside of eval so that a conversion problem (for example
	# an unknown encoding name) is not hidden behind the generic
	# "can't insert" warning.
	my $utf8 = $self->convert($v);
	$log->logdie("can't convert '$v' to UTF-8") unless (defined $utf8);

	eval { $doc->set_value($n, $utf8) };
	$log->warn("can't insert: $n = $v") if ($@);

	return;
}
216
217 =head2 finish
218
219 Close index
220
221 $index->finish;
222
223 =cut
224
# Finish the index and save the names of all fields seen so far.
#
# Calls finish on the inverted index, then merges the field names known
# at construction time with the ones counted in value_usage by add() and
# writes the resulting list to $self->{fields_path} using Storable.
#
# Returns the result of store(); a false result is logged as a warning.
sub finish {
	my $self = shift;

	my $log = $self->_get_logger();

	$log->info("finish index writing to disk");
	$self->{invindex}->finish;

	$log->info("writing value usage file");

	# add fields from last run
	foreach my $field (@{ $self->{fields} }) {
		$self->{value_usage}->{$field}++;
	}

	my @fields = keys %{ $self->{value_usage} };

	# Parenthesise the call: in "store \@fields, $path || warn(...)" the
	# || applies to $path, so a failed store was never reported.
	my $stored = store(\@fields, $self->{fields_path});
	$log->warn("can't write $self->{fields_path}: $!") unless ($stored);

	return $stored;
}
243
244 =head2 convert
245
246 my $utf8_string = $self->convert('string in codepage');
247
248 =cut
249
# Convert a string from the object's encoding ($self->{encoding}) to
# UTF-8 and return the converted copy.  A false argument (undef, empty
# string or "0") yields an empty return.
sub convert {
	my ($self, $text) = @_;

	return unless ($text);

	# from_to() rewrites its first argument in place
	from_to($text, $self->{encoding}, 'UTF-8');

	return $text;
}
257
258 =head1 AUTHOR
259
260 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
261
262 =head1 COPYRIGHT & LICENSE
263
264 Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
265
266 This program is free software; you can redistribute it and/or modify it
267 under the same terms as Perl itself.
268
269 =cut
270
271 1; # End of WebPAC::Output::KinoSearch

  ViewVC Help
Powered by ViewVC 1.1.26