1 |
package SWISH::Split; |
2 |
|
3 |
use 5.008; |
4 |
use strict; |
5 |
use warnings; |
6 |
|
7 |
our $VERSION = '0.00'; |
8 |
|
9 |
use SWISH::API; |
10 |
use Text::Iconv; |
11 |
use File::Temp qw/ :mktemp /; |
12 |
use Carp; |
13 |
use Digest::MD5 qw(md5_hex); |
14 |
use Memoize; |
15 |
use IPC::Run qw(start timeout pump finish); |
16 |
use File::Which; |
17 |
|
18 |
use Data::Dumper; |
19 |
|
20 |
=head1 NAME |
21 |
|
22 |
SWISH::Split - Perl interface to split index variant of Swish-e |
23 |
|
24 |
=head1 SYNOPSIS |
25 |
|
26 |
use SWISH::Split; |
27 |
|
28 |
|
29 |
=head1 DESCRIPTION |
30 |
|
31 |
This is alternative interface for indexing data with swish-e. It's designed |
32 |
to split indexes over multiple files (slices) to allow updates of records in index |
33 |
by reindexing just changed parts (slice). |
34 |
|
35 |
Data is stored in index using intrface which is somewhat similar to |
36 |
L<Plucene::Simple>. This could make your migration (or supporting two index |
37 |
engines) easier. |
38 |
|
39 |
In the background, it will fork swish-e binaries (one for each index slice) |
40 |
and produce UTF-8 encoded XML files for it. So, if your imput charset isn't |
41 |
C<ISO-8859-1> you will have to specify it. |
42 |
|
43 |
=head1 Methods used for indexing |
44 |
|
45 |
=head2 open |
46 |
|
47 |
Create new object for index. |
48 |
|
49 |
my $i = SWISH::Split->open({ |
50 |
index => '/path/to/index', |
51 |
slice_name => \&slice_on_path, |
52 |
slices => 30, |
53 |
merge => 0, |
54 |
codepage => 'ISO-8859-2', |
55 |
swish_config => qq{ |
56 |
PropertyNames from date |
57 |
PropertyNamesDate date |
58 |
}, |
59 |
memoize_to_xml => 0, |
60 |
); |
61 |
|
62 |
# split index on first component of path |
63 |
sub slice_on_path { |
64 |
return shift split(/\//,$_[0]); |
65 |
} |
66 |
|
67 |
Options to open are following: |
68 |
|
69 |
=over 5 |
70 |
|
71 |
=item C<index> |
72 |
|
73 |
path to (existing) directory in which index slices will be created. |
74 |
|
75 |
=item C<slice_name> |
76 |
|
77 |
coderef to function which provide slicing from path. |
78 |
|
79 |
=item C<slices> |
80 |
|
81 |
maximum number of index slices. See L<"in_slice"> for |
82 |
more explanation. |
83 |
|
84 |
=item C<merge> |
85 |
|
86 |
(planned) option to merge indexes into one at end. |
87 |
|
88 |
=item C<codepage> |
89 |
|
90 |
data codepage (needed for conversion to UTF-8). |
91 |
By default, it's C<ISO-8859-1>. |
92 |
|
93 |
=item C<swish_config> |
94 |
|
95 |
additional parametars which will be inserted into |
96 |
C<swish-e> configuration file. See L<swish-config>. |
97 |
|
98 |
=item C<memoize_to_xml> |
99 |
|
100 |
speed up repeatable data, see L<"to_xml">. |
101 |
|
102 |
=back |
103 |
|
104 |
=cut |
105 |
|
106 |
my $iso2utf = Text::Iconv->new('ISO-8859-1','UTF-8'); |
107 |
|
108 |
sub open { |
109 |
my $class = shift; |
110 |
my $self = {@_}; |
111 |
bless($self, $class); |
112 |
|
113 |
croak "need slice_name coderef" unless ref $self->{'slice_name'}; |
114 |
croak "need slices" unless $self->{'slices'}; |
115 |
|
116 |
croak "need index" unless $self->{'index'}; |
117 |
croak "index '",$self->{'index'},"' doesn't exist" unless -e $self->{'index'}; |
118 |
croak "index '",$self->{'index'},"' is not directory" unless -d $self->{'index'}; |
119 |
|
120 |
$iso2utf = Text::Iconv->new($self->{'codepage'},'UTF-8') if ($self->{'codepage'}); |
121 |
|
122 |
# speedup |
123 |
memoize('in_slice'); |
124 |
memoize('to_xml') if ($self->{'memoize_to_xml'}); |
125 |
|
126 |
$self ? return $self : return undef; |
127 |
|
128 |
} |
129 |
|
130 |
=head2 add |
131 |
|
132 |
Add document to index. |
133 |
|
134 |
$i->add($swishpath, { |
135 |
headline => 'foobar result', |
136 |
property => 'data', |
137 |
}) |
138 |
|
139 |
=cut |
140 |
|
141 |
sub add { |
142 |
my $self = shift; |
143 |
|
144 |
my $swishpath = shift || return; |
145 |
my $data = shift || return; |
146 |
|
147 |
my ($out,$err) = $self->put_slice($swishpath, $self->to_xml($data)); |
148 |
|
149 |
if ($err) { |
150 |
carp "$swishpath: $err"; |
151 |
return 0; |
152 |
} |
153 |
|
154 |
return 1; |
155 |
} |
156 |
|
157 |
=head2 delete |
158 |
|
159 |
Delete documents from index. |
160 |
|
161 |
$i->delete(@swishpath); |
162 |
|
163 |
=cut |
164 |
|
165 |
sub delete { |
166 |
my $self = shift; |
167 |
|
168 |
my @paths = @_ || return; |
169 |
|
170 |
return 42; |
171 |
} |
172 |
|
173 |
|
174 |
=head2 finish |
175 |
|
176 |
Finish indexing and close index file(s). |
177 |
|
178 |
$i->finish; |
179 |
|
180 |
This is most time-consuming operation. When it's called, it will re-index |
181 |
all entries which haven't changed in all slices. |
182 |
|
183 |
Returns number of slices updated. |
184 |
|
185 |
=cut |
186 |
|
187 |
sub finish { |
188 |
my $self = shift; |
189 |
|
190 |
my $ret = 0; |
191 |
|
192 |
foreach my $s (keys %{$self->{'slice'}}) { |
193 |
$ret += $self->close_slice($s); |
194 |
} |
195 |
|
196 |
return $ret; |
197 |
} |
198 |
|
199 |
|
200 |
|
201 |
=head1 Reporting methods |
202 |
|
203 |
This methods return statistics about your index. |
204 |
|
205 |
=head2 swishpaths |
206 |
|
207 |
Return array of C<swishpath>s in index. |
208 |
|
209 |
my @p = $i->swishpaths; |
210 |
|
211 |
=cut |
212 |
|
213 |
sub swishpaths { |
214 |
my $self = shift; |
215 |
} |
216 |
|
217 |
=head2 swishpaths_updated |
218 |
|
219 |
Return array with updated C<swishpath>s. |
220 |
|
221 |
my @d = $i->swishpaths_updated; |
222 |
|
223 |
=cut |
224 |
|
225 |
sub swishpaths_updated { |
226 |
my $self = shift; |
227 |
} |
228 |
|
229 |
|
230 |
=head2 swishpaths_deleted |
231 |
|
232 |
Return array with deleted C<swishpath>s. |
233 |
|
234 |
my $n = $i->swishpaths_deleted; |
235 |
|
236 |
=cut |
237 |
|
238 |
sub swishpaths_deleted { |
239 |
my $self = shift; |
240 |
} |
241 |
|
242 |
|
243 |
=head2 slices |
244 |
|
245 |
Return array with all slice names. |
246 |
|
247 |
my @s = $i->slices; |
248 |
|
249 |
=cut |
250 |
|
251 |
sub slices { |
252 |
my $self = shift; |
253 |
} |
254 |
|
255 |
=head1 Helper methods |
256 |
|
257 |
This methods are used internally, but they might be useful. |
258 |
|
259 |
=head2 in_slice |
260 |
|
261 |
Takes path and return slice in which this path belongs. |
262 |
|
263 |
my $s = $i->in_slice('path/to/document/in/index'); |
264 |
|
265 |
If there are C<slices> parametar to L<"open"> it will use |
266 |
MD5 hash to spread documents across slices. That will produce random |
267 |
distribution of your documents in slices, which might or might not be best |
268 |
for your data. If you have to re-index large number of slices on each |
269 |
run, think about creating your own C<slice> function and distributing |
270 |
documents manually across slices. |
271 |
|
272 |
Slice number must always be true value or various sanity checks will fail. |
273 |
|
274 |
This function is C<Memoize>ed for performance reasons. |
275 |
|
276 |
=cut |
277 |
|
278 |
sub in_slice { |
279 |
my $self = shift; |
280 |
|
281 |
my $path = shift || confess "need path"; |
282 |
|
283 |
confess "need slice_name function" unless ref ($self->{'slice_name'}); |
284 |
|
285 |
if ($self->{'slices'}) { |
286 |
# first, pass path through slice_name function |
287 |
my $slice = &{$self->{'slice_name'}}($path); |
288 |
# then calculate MD5 hash |
289 |
$slice = md5_hex($slice); |
290 |
# take first 8 chars to produce number |
291 |
# FIXME how random is this? |
292 |
$slice = hex(substr($slice,0,8)); |
293 |
|
294 |
$slice = ($slice % $self->{'slices'}) + 1; |
295 |
print "hash: $slice / ",$self->{'slices'}," => $slice\n"; |
296 |
return $slice; |
297 |
} else { |
298 |
return &{$self->{'split'}}($path); |
299 |
} |
300 |
} |
301 |
|
302 |
=head2 find_paths |
303 |
|
304 |
Return array of C<swishpath>s for given C<swish-e> query. |
305 |
|
306 |
my @p = $i->find_paths("headline=test*"); |
307 |
|
308 |
Useful for combining with L<"delete_documents"> to delete documents |
309 |
which hasn't changed a while (so, expired). |
310 |
|
311 |
=cut |
312 |
|
313 |
sub find_paths { |
314 |
my $self = shift; |
315 |
|
316 |
my $s = shift || return; |
317 |
} |
318 |
|
319 |
|
320 |
=head2 make_config |
321 |
|
322 |
Create C<swish-e> configuration file for given slice. |
323 |
|
324 |
my $config_filename = $i->make_config('slice name'); |
325 |
|
326 |
It returns configuration filename. If no C<swish_config> was defined in |
327 |
L<"open">, default swish-e configuration will be used. It will index all data for |
328 |
searching, but none for properties. |
329 |
|
330 |
If you want to see what is allready defined for swish-e in configuration |
331 |
take a look at source code for C<DEFAULT_SWISH_CONF>. |
332 |
|
333 |
It uses C<cat> utility to comunicate with C<swish-e>. Path is provided |
334 |
by C<File::Which>. Do Windows users have to change that to C<COPY /B> |
335 |
or something similar? |
336 |
|
337 |
=cut |
338 |
|
339 |
sub make_config { |
340 |
my $self = shift; |
341 |
|
342 |
|
343 |
my $index_file = $self->{'index'}."/"; |
344 |
$index_file .= shift || confess "need slice name"; |
345 |
|
346 |
my ($tmp_fh, $swish_config_filename) = mkstemp("/tmp/swishXXXXX"); |
347 |
|
348 |
# find cat on filesystem |
349 |
my $cat = which('cat'); |
350 |
|
351 |
print $tmp_fh <<"DEFAULT_SWISH_CONF"; |
352 |
# swish-e config file |
353 |
|
354 |
IndexDir cat |
355 |
#SwishProgParameters - |
356 |
|
357 |
# input file definition |
358 |
DefaultContents XML* |
359 |
|
360 |
# indexed metatags |
361 |
MetaNames xml swishdocpath |
362 |
|
363 |
|
364 |
#XMLClassAttributes type |
365 |
UndefinedMetaTags auto |
366 |
UndefinedXMLAttributes auto |
367 |
|
368 |
IndexFile $index_file |
369 |
|
370 |
# Croatian ISO-8859-2 characters to unaccented equivalents |
371 |
TranslateCharacters ¹©ðÐèÈæƾ® ssddcccczz |
372 |
|
373 |
|
374 |
# disable output |
375 |
ParserWarnLevel 0 |
376 |
IndexReport 1 |
377 |
|
378 |
DEFAULT_SWISH_CONF |
379 |
|
380 |
# add user parametars (like stored properties) |
381 |
print $tmp_fh $self->{'swish_config'} if ($self->{'swish_config'}); |
382 |
|
383 |
close($tmp_fh); |
384 |
|
385 |
return $swish_config_filename; |
386 |
} |
387 |
|
388 |
=head2 create_slice |
389 |
|
390 |
On first run, starts C<swish-e> using L<IPC::Run>. On subsequent calls just return |
391 |
it's handles using L<Memoize>. |
392 |
|
393 |
my $s = create_slice('/path/to/document'); |
394 |
|
395 |
You shouldn't need to call C<create_slice> directly because it will be called |
396 |
from L<"put_slice"> when needed. |
397 |
|
398 |
=cut |
399 |
|
400 |
sub create_slice { |
401 |
my $self = shift; |
402 |
|
403 |
my $path = shift || confess "create_slice need path!"; |
404 |
|
405 |
my $s = $self->in_slice($path) || confess "in_slice returned null"; |
406 |
|
407 |
return $s if (exists($self->{'slice'}->{$s})); |
408 |
|
409 |
my $swish_config = $self->make_config($s); |
410 |
|
411 |
print STDERR "creating slice $s\n"; # FIXME |
412 |
|
413 |
my @swish = qw(swish-e -S prog -c); |
414 |
push @swish, $swish_config; |
415 |
|
416 |
## Build the harness, open all pipes, and launch the subprocesses |
417 |
$self->{'slice'}->{$s}->{'h'} = start \@swish, |
418 |
\$self->{'slice'}->{$s}->{'in'}, |
419 |
\$self->{'slice'}->{$s}->{'out'}, |
420 |
\$self->{'slice'}->{$s}->{'err'}, |
421 |
timeout( 90 ); # FIXME |
422 |
|
423 |
$self->{'slice'}->{$s}->{'out_len'} = 0; |
424 |
$self->{'slice'}->{$s}->{'err_len'} = 0; |
425 |
|
426 |
$self->slice_output($s); |
427 |
|
428 |
return $s; |
429 |
} |
430 |
|
431 |
=head2 put_slice |
432 |
|
433 |
Pass XML data to swish and receive output and errors. |
434 |
|
435 |
my ($out,$err) = $i->put_slice('/swish/path', '<xml>data</xml>'); |
436 |
|
437 |
=cut |
438 |
|
439 |
sub put_slice { |
440 |
my $self = shift; |
441 |
|
442 |
my $path = shift || confess "need path"; |
443 |
my $xml = shift || confess "need xml"; |
444 |
|
445 |
$xml = $iso2utf->convert($xml) || carp "XML conversion error in $xml"; |
446 |
|
447 |
my $s = $self->create_slice($path) || confess "create_slice returned null"; |
448 |
|
449 |
confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s})); |
450 |
confess "no 'in' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'in'})); |
451 |
confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'})); |
452 |
|
453 |
$self->slice_output($s); |
454 |
|
455 |
use bytes; # as opposed to chars |
456 |
$self->{'slice'}->{$s}->{'in'} .= |
457 |
"Path-Name: $path\n". |
458 |
"Content-Length: ".(length($xml)+1)."\n". |
459 |
"Document-Type: XML\n\n$xml\n"; |
460 |
|
461 |
# do I/O |
462 |
$self->{'slice'}->{$s}->{'h'}->pump while length $self->{'slice'}->{$s}->{'in'} ; # wait for all input to go |
463 |
|
464 |
$self->slice_output($s); |
465 |
|
466 |
return $s; |
467 |
} |
468 |
|
469 |
=head2 slice_output |
470 |
|
471 |
Prints to STDERR output and errors from C<swish-e>. |
472 |
|
473 |
$i->slice_output($s); |
474 |
|
475 |
Normally, you don't need to call it. |
476 |
|
477 |
=cut |
478 |
|
479 |
sub slice_output { |
480 |
my $self = shift; |
481 |
|
482 |
my $s = shift || confess "slice_output needs slice"; |
483 |
|
484 |
confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s})); |
485 |
confess "no 'in' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'in'})); |
486 |
confess "no 'out' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'out'})); |
487 |
|
488 |
if (length $self->{'slice'}->{$s}->{'out'} > $self->{'slice'}->{$s}->{'out_len'}) { |
489 |
#print STDERR "swish-e OUT: ",$self->{'slice'}->{$s}->{'out'},"\n" if ($self->{'slice'}->{$s}->{'out'}); |
490 |
$self->{'slice'}->{$s}->{'out_len'} = length $self->{'slice'}->{$s}->{'out'}; |
491 |
return 1; |
492 |
} elsif (length $self->{'slice'}->{$s}->{'err'} > $self->{'slice'}->{$s}->{'err_len'}) { |
493 |
print STDERR "swish-e ERR: ",$self->{'slice'}->{$s}->{'err'},"\n" if ($self->{'slice'}->{$s}->{'err'}); |
494 |
$self->{'slice'}->{$s}->{'err_len'} = length $self->{'slice'}->{$s}->{'err'}; |
495 |
# this is fatal |
496 |
return 0; |
497 |
} |
498 |
|
499 |
return 1; |
500 |
} |
501 |
|
502 |
=head2 close_slice { |
503 |
|
504 |
Close slice (terminates swish-e process for that slice). |
505 |
|
506 |
my $i->close_slice($s); |
507 |
|
508 |
Returns true if slice is closed, false otherwise. |
509 |
|
510 |
=cut |
511 |
|
512 |
sub close_slice { |
513 |
my $self = shift; |
514 |
|
515 |
my $s = shift || confess "close_slice needs slice"; |
516 |
|
517 |
confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s})); |
518 |
confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'})); |
519 |
|
520 |
# pump rest of content (if any) |
521 |
$self->{'slice'}->{$s}->{'h'}->pump while length $self->{'slice'}->{$s}->{'in'}; |
522 |
|
523 |
$self->slice_output($s); |
524 |
|
525 |
# clean up |
526 |
$self->{'slice'}->{$s}->{'h'}->finish or confess "finish on slice $s returned: $?"; |
527 |
|
528 |
delete($self->{'slice'}->{$s}) && return 1; |
529 |
return 0; |
530 |
} |
531 |
|
532 |
=head2 to_xml |
533 |
|
534 |
Convert (binary safe, I hope) your data into XML for C<swish-e>. |
535 |
Data will not yet be recoded to UTF-8. L<"put_slice"> will do that. |
536 |
|
537 |
my $xml = $i->to_xml({ foo => 'bar' }); |
538 |
|
539 |
This function is extracted from L<"add"> method so that you can L<Memoize> it. |
540 |
If your data set has a lot of repeatable data, and memory is not a problem, you |
541 |
can add C<memoize_to_xml> option to L<"open">. |
542 |
|
543 |
=cut |
544 |
|
545 |
my %escape = ('<'=>'<', '>'=>'>', '&'=>'&', '"'=>'"'); |
546 |
my $escape_re = join '|' => keys %escape; |
547 |
|
548 |
sub to_xml { |
549 |
my $self = shift; |
550 |
|
551 |
my $data = shift || return; |
552 |
|
553 |
my $xml = qq{<xml>}; |
554 |
foreach my $tag (keys %$data) { |
555 |
my $content = $data->{$tag}; |
556 |
next if (! $content || $content eq ''); |
557 |
# save [cr/]lf before conversion to XML |
558 |
# $content =~ s/\n\r/##lf##/gs; |
559 |
# $content =~ s/\n/##lf##/gs; |
560 |
$content =~ s/($escape_re)/$escape{$1}/gs; |
561 |
$xml .= "<$tag><![CDATA[".$content."]]></$tag>"; |
562 |
} |
563 |
$xml .= qq{</xml>}; |
564 |
} |
565 |
|
566 |
1; |
567 |
__END__ |
568 |
|
569 |
|
570 |
=head2 Searching |
571 |
|
572 |
Searching is still conducted using L<SWISH::API>, but you have to glob |
573 |
index names. |
574 |
|
575 |
use SWISH::API; |
576 |
|
577 |
my $swish = SWISH::API->new( glob('index.swish-e/*') ); |
578 |
|
579 |
You can also alternativly create merged index (using C<merge> option) and |
580 |
not change your source code at all. |
581 |
|
582 |
That would also benefit performance, but it increases indexing time |
583 |
because merged indexes must be re-created on each indexing run. |
584 |
|
585 |
=head2 EXPORT |
586 |
|
587 |
Nothing by default. |
588 |
|
589 |
=head2 EXAMPLES |
590 |
|
591 |
Test script for this module uses all parts of API. It's also nice example |
592 |
how to use C<SWISH::Split>. |
593 |
|
594 |
=head1 SEE ALSO |
595 |
|
596 |
L<SWISH::API>, |
597 |
L<http://www.swish-e.org/> |
598 |
|
599 |
=head1 AUTHOR |
600 |
|
601 |
Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt> |
602 |
|
603 |
=head1 COPYRIGHT AND LICENSE |
604 |
|
605 |
Copyright (C) 2004 by Dobrica Pavlinusic |
606 |
|
607 |
This library is free software; you can redistribute it and/or modify |
608 |
it under the same terms as Perl itself, either Perl version 5.8.4 or, |
609 |
at your option, any later version of Perl 5 you may have available. |
610 |
|
611 |
|
612 |
=cut |