4 |
use strict; |
use strict; |
5 |
use warnings; |
use warnings; |
6 |
|
|
7 |
our $VERSION = '0.00'; |
our $VERSION = '0.03'; |
8 |
|
|
9 |
use SWISH::API; |
use SWISH::API; |
10 |
use Text::Iconv; |
use Text::Iconv; |
12 |
use Carp; |
use Carp; |
13 |
use Digest::MD5 qw(md5_hex); |
use Digest::MD5 qw(md5_hex); |
14 |
use Memoize; |
use Memoize; |
|
use IPC::Run qw(start timeout pump finish); |
|
15 |
use File::Which; |
use File::Which; |
16 |
|
|
17 |
use Data::Dumper; |
use Data::Dumper; |
18 |
|
|
19 |
|
use constant { |
20 |
|
ADDED => 1, |
21 |
|
DELETED => 2, |
22 |
|
}; |
23 |
|
|
24 |
=head1 NAME |
=head1 NAME |
25 |
|
|
26 |
SWISH::Split - Perl interface to split index variant of Swish-e |
SWISH::Split - Perl interface to split index variant of Swish-e |
41 |
engines) easier. |
engines) easier. |
42 |
|
|
43 |
In the background, it will fork swish-e binaries (one for each index slice) |
In the background, it will fork swish-e binaries (one for each index slice) |
44 |
and produce UTF-8 encoded XML files for it. So, if your imput charset isn't |
and produce UTF-8 encoded XML files for it. So, if your input charset isn't |
45 |
C<ISO-8859-1> you will have to specify it. |
C<ISO-8859-1> you will have to specify it. |
46 |
|
|
47 |
=head1 Methods used for indexing |
=head1 Methods used for indexing |
48 |
|
|
49 |
=head2 open |
=head2 open_index |
50 |
|
|
51 |
Create new object for index. |
Create new object for index. |
52 |
|
|
53 |
my $i = SWISH::Split->open({ |
my $i = SWISH::Split->open_index({ |
54 |
index => '/path/to/index', |
index => '/path/to/index', |
55 |
slice_name => \&slice_on_path, |
slice_name => \&slice_on_path, |
56 |
slices => 30, |
slices => 30, |
68 |
return shift split(/\//,$_[0]); |
return shift split(/\//,$_[0]); |
69 |
} |
} |
70 |
|
|
71 |
Options to open are following: |
Options to C<open_index> are following: |
72 |
|
|
73 |
=over 5 |
=over 5 |
74 |
|
|
97 |
=item C<swish_config> |
=item C<swish_config> |
98 |
|
|
99 |
additional parameters which will be inserted into |
additional parameters which will be inserted into |
100 |
C<swish-e> configuration file. See L<swish-config>. |
C<swish-e> configuration file. See C<swish-config>. |
101 |
|
|
102 |
=item C<memoize_to_xml> |
=item C<memoize_to_xml> |
103 |
|
|
109 |
|
|
110 |
my $iso2utf = Text::Iconv->new('ISO-8859-1','UTF-8'); |
my $iso2utf = Text::Iconv->new('ISO-8859-1','UTF-8'); |
111 |
|
|
112 |
sub open { |
sub open_index { |
113 |
my $class = shift; |
my $class = shift; |
114 |
my $self = {@_}; |
my $self = {@_}; |
115 |
bless($self, $class); |
bless($self, $class); |
148 |
my $swishpath = shift || return; |
my $swishpath = shift || return; |
149 |
my $data = shift || return; |
my $data = shift || return; |
150 |
|
|
151 |
my ($out,$err) = $self->put_slice($swishpath, $self->to_xml($data)); |
my $slice = $self->put_slice($swishpath, $self->to_xml($data)); |
|
|
|
|
if ($err) { |
|
|
carp "$swishpath: $err"; |
|
|
return 0; |
|
|
} |
|
152 |
|
|
153 |
return 1; |
return $slice; |
154 |
} |
} |
155 |
|
|
156 |
=head2 delete |
=head2 delete |
166 |
|
|
167 |
my @paths = @_ || return; |
my @paths = @_ || return; |
168 |
|
|
169 |
|
foreach my $path (@paths) { |
170 |
|
$self->{'paths'}->{$path} = DELETED; |
171 |
|
} |
172 |
|
|
173 |
return 42; |
return 42; |
174 |
} |
} |
175 |
|
|
176 |
|
|
177 |
=head2 finish |
=head2 done |
178 |
|
|
179 |
Finish indexing and close index file(s). |
Finish indexing and close index file(s). |
180 |
|
|
181 |
$i->finish; |
$i->done; |
182 |
|
|
183 |
This is the most time-consuming operation. When it's called, it will re-index |
This is the most time-consuming operation. When it's called, it will re-index |
184 |
all entries which haven't changed in all slices. |
all entries which haven't changed in all slices. |
185 |
|
|
186 |
Returns number of slices updated. |
Returns number of slices updated. |
187 |
|
|
188 |
|
This method should really be called close or finish, but both of those are |
189 |
|
already used. |
190 |
|
|
191 |
=cut |
=cut |
192 |
|
|
193 |
sub finish { |
sub done { |
194 |
my $self = shift; |
my $self = shift; |
195 |
|
|
196 |
my $ret = 0; |
my $ret = 0; |
197 |
|
|
198 |
foreach my $s (keys %{$self->{'slice'}}) { |
foreach my $s (keys %{$self->{'slice'}}) { |
199 |
|
$self->_debug("closing slice $s"); |
200 |
$ret += $self->close_slice($s); |
$ret += $self->close_slice($s); |
201 |
} |
} |
202 |
|
|
219 |
|
|
220 |
sub swishpaths { |
sub swishpaths { |
221 |
my $self = shift; |
my $self = shift; |
222 |
|
|
223 |
|
my $s = shift || return; |
224 |
|
return if (! exists($self->{'slice'}->{'s'})); |
225 |
|
|
226 |
|
return keys %{$self->{'slice'}->{'s'}}; |
227 |
} |
} |
228 |
|
|
229 |
=head2 swishpaths_updated |
=head2 swishpaths_updated |
274 |
|
|
275 |
my $s = $i->in_slice('path/to/document/in/index'); |
my $s = $i->in_slice('path/to/document/in/index'); |
276 |
|
|
277 |
If there is a C<slices> parameter to L<"open"> it will use |
If there is a C<slices> parameter to L<"open_index"> it will use |
278 |
MD5 hash to spread documents across slices. That will produce random |
MD5 hash to spread documents across slices. That will produce random |
279 |
distribution of your documents in slices, which might or might not be best |
distribution of your documents in slices, which might or might not be best |
280 |
for your data. If you have to re-index large number of slices on each |
for your data. If you have to re-index large number of slices on each |
298 |
# first, pass path through slice_name function |
# first, pass path through slice_name function |
299 |
my $slice = &{$self->{'slice_name'}}($path); |
my $slice = &{$self->{'slice_name'}}($path); |
300 |
# then calculate MD5 hash |
# then calculate MD5 hash |
301 |
$slice = md5_hex($slice); |
my $hash = md5_hex($slice); |
302 |
# take first 8 chars to produce number |
# take first 8 chars to produce number |
303 |
# FIXME how random is this? |
# FIXME how random is this? |
304 |
$slice = hex(substr($slice,0,8)); |
$hash = hex(substr($hash,0,8)); |
305 |
|
|
306 |
$slice = ($slice % $self->{'slices'}) + 1; |
$slice = ($hash % $self->{'slices'}) + 1; |
307 |
print "hash: $slice / ",$self->{'slices'}," => $slice\n"; |
$self->_debug("hash: $hash / ",$self->{'slices'}," => $slice"); |
308 |
return $slice; |
return $slice; |
309 |
} else { |
} else { |
310 |
return &{$self->{'split'}}($path); |
return &{$self->{'split'}}($path); |
325 |
sub find_paths { |
sub find_paths { |
326 |
my $self = shift; |
my $self = shift; |
327 |
|
|
|
my $s = shift || return; |
|
328 |
} |
} |
329 |
|
|
330 |
|
|
335 |
my $config_filename = $i->make_config('slice name'); |
my $config_filename = $i->make_config('slice name'); |
336 |
|
|
337 |
It returns configuration filename. If no C<swish_config> was defined in |
It returns configuration filename. If no C<swish_config> was defined in |
338 |
L<"open">, default swish-e configuration will be used. It will index all data for |
L<"open_index">, default swish-e configuration will be used. It will index all data for |
339 |
searching, but none for properties. |
searching, but none for properties. |
340 |
|
|
341 |
If you want to see what is already defined for swish-e in configuration |
If you want to see what is already defined for swish-e in configuration |
342 |
take a look at source code for C<DEFAULT_SWISH_CONF>. |
take a look at source code for C<DEFAULT_SWISH_CONF>. |
343 |
|
|
344 |
It uses C<cat> utility to communicate with C<swish-e>. Path is provided |
It uses C<stdin> as C<IndexDir> to communicate with C<swish-e>. |
|
by C<File::Which>. Do Windows users have to change that to C<COPY /B> |
|
|
or something similar? |
|
345 |
|
|
346 |
=cut |
=cut |
347 |
|
|
360 |
print $tmp_fh <<"DEFAULT_SWISH_CONF"; |
print $tmp_fh <<"DEFAULT_SWISH_CONF"; |
361 |
# swish-e config file |
# swish-e config file |
362 |
|
|
363 |
IndexDir cat |
IndexDir stdin |
|
#SwishProgParameters - |
|
364 |
|
|
365 |
# input file definition |
# input file definition |
366 |
DefaultContents XML* |
DefaultContents XML* |
395 |
|
|
396 |
=head2 create_slice |
=head2 create_slice |
397 |
|
|
398 |
On first run, starts C<swish-e> using L<IPC::Run>. On subsequent calls just return |
On first run, starts C<swish-e>. On subsequent calls just return |
399 |
its handles using L<Memoize>. |
its handles using C<Memoize>. |
400 |
|
|
401 |
my $s = create_slice('/path/to/document'); |
my $s = create_slice('/path/to/document'); |
402 |
|
|
416 |
|
|
417 |
my $swish_config = $self->make_config($s); |
my $swish_config = $self->make_config($s); |
418 |
|
|
419 |
print STDERR "creating slice $s\n"; # FIXME |
my $swish = qq{| swish-e }; |
420 |
|
if (-f $self->{'index'}.'/'.$s) { |
421 |
|
$swish .= qq{ -u }; |
422 |
|
$self->{'slice'}->{$s}->{'update_mode'}++; |
423 |
|
} |
424 |
|
$swish .= qq{ -S prog -c } . $swish_config; |
425 |
|
|
426 |
my @swish = qw(swish-e -S prog -c); |
$self->_debug("creating slice $s using $swish"); |
|
push @swish, $swish_config; |
|
427 |
|
|
428 |
## Build the harness, open all pipes, and launch the subprocesses |
## Build the harness, open all pipes, and launch the subprocesses |
429 |
$self->{'slice'}->{$s}->{'h'} = start \@swish, |
open(my $fh, $swish) || croak "can't open $swish: $!"; |
|
\$self->{'slice'}->{$s}->{'in'}, |
|
|
\$self->{'slice'}->{$s}->{'out'}, |
|
|
\$self->{'slice'}->{$s}->{'err'}, |
|
|
timeout( 90 ); # FIXME |
|
430 |
|
|
431 |
$self->{'slice'}->{$s}->{'out_len'} = 0; |
$self->{'slice'}->{$s}->{'h'} = $fh; |
|
$self->{'slice'}->{$s}->{'err_len'} = 0; |
|
432 |
|
|
433 |
$self->slice_output($s); |
$self->slice_output($s); |
434 |
|
|
437 |
|
|
438 |
=head2 put_slice |
=head2 put_slice |
439 |
|
|
440 |
Pass XML data to swish and receive output and errors. |
Pass XML data to swish. |
441 |
|
|
442 |
|
my $slice = $i->put_slice('/swish/path', '<xml>data</xml>'); |
443 |
|
|
444 |
my ($out,$err) = $i->put_slice('/swish/path', '<xml>data</xml>'); |
Returns slice in which XML ended up. |
445 |
|
|
446 |
=cut |
=cut |
447 |
|
|
456 |
my $s = $self->create_slice($path) || confess "create_slice returned null"; |
my $s = $self->create_slice($path) || confess "create_slice returned null"; |
457 |
|
|
458 |
confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s})); |
confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s})); |
|
confess "no 'in' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'in'})); |
|
459 |
confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'})); |
confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'})); |
460 |
|
|
461 |
$self->slice_output($s); |
$self->slice_output($s); |
462 |
|
|
463 |
use bytes; # as opposed to chars |
use bytes; # as opposed to chars |
464 |
$self->{'slice'}->{$s}->{'in'} .= |
my $fh = $self->{'slice'}->{$s}->{'h'} || confess "handle for slice $s undefined"; |
465 |
"Path-Name: $path\n". |
|
466 |
"Content-Length: ".(length($xml)+1)."\n". |
my $update_header = "Update-Mode: Index\n"; |
467 |
"Document-Type: XML\n\n$xml\n"; |
$update_header = '' unless ($self->{'slice'}->{$s}->{'update_mode'}); |
468 |
|
|
469 |
# do I/O |
print { $fh } "Path-Name: $path\n". |
470 |
$self->{'slice'}->{$s}->{'h'}->pump while length $self->{'slice'}->{$s}->{'in'} ; # wait for all input to go |
"Content-Length: ".(length($xml)+1)."\n" . $update_header . |
471 |
|
"Document-Type: XML\n\n$xml\n"; |
472 |
|
|
473 |
$self->slice_output($s); |
$self->slice_output($s); |
474 |
|
|
475 |
|
$self->_debug("dumping in slice $s: $path"); |
476 |
|
|
477 |
|
$self->{'paths'}->{$path} = ADDED; |
478 |
|
|
479 |
return $s; |
return $s; |
480 |
} |
} |
481 |
|
|
483 |
|
|
484 |
Prints to STDERR output and errors from C<swish-e>. |
Prints to STDERR output and errors from C<swish-e>. |
485 |
|
|
486 |
$i->slice_output($s); |
my $slice = $i->slice_output($s); |
487 |
|
|
488 |
Normally, you don't need to call it. |
Normally, you don't need to call it. |
489 |
|
|
490 |
|
B<This is dummy placeholder function for very old code that assumes this |
491 |
|
module is using C<IPC::Run> which it isn't any more.> |
492 |
|
|
493 |
=cut |
=cut |
494 |
|
|
495 |
sub slice_output { |
sub slice_output { |
498 |
my $s = shift || confess "slice_output needs slice"; |
my $s = shift || confess "slice_output needs slice"; |
499 |
|
|
500 |
confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s})); |
confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s})); |
|
confess "no 'in' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'in'})); |
|
|
confess "no 'out' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'out'})); |
|
501 |
|
|
502 |
if (length $self->{'slice'}->{$s}->{'out'} > $self->{'slice'}->{$s}->{'out_len'}) { |
# FIXME |
|
#print STDERR "swish-e OUT: ",$self->{'slice'}->{$s}->{'out'},"\n" if ($self->{'slice'}->{$s}->{'out'}); |
|
|
$self->{'slice'}->{$s}->{'out_len'} = length $self->{'slice'}->{$s}->{'out'}; |
|
|
return 1; |
|
|
} elsif (length $self->{'slice'}->{$s}->{'err'} > $self->{'slice'}->{$s}->{'err_len'}) { |
|
|
print STDERR "swish-e ERR: ",$self->{'slice'}->{$s}->{'err'},"\n" if ($self->{'slice'}->{$s}->{'err'}); |
|
|
$self->{'slice'}->{$s}->{'err_len'} = length $self->{'slice'}->{$s}->{'err'}; |
|
|
# this is fatal |
|
|
return 0; |
|
|
} |
|
503 |
|
|
504 |
return 1; |
return $s; |
505 |
} |
} |
506 |
|
|
507 |
=head2 close_slice { |
=head2 close_slice |
508 |
|
|
509 |
Close slice (terminates swish-e process for that slice). |
Close slice (terminates swish-e process for that slice). |
510 |
|
|
523 |
confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'})); |
confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'})); |
524 |
|
|
525 |
# pump rest of content (if any) |
# pump rest of content (if any) |
526 |
$self->{'slice'}->{$s}->{'h'}->pump while length $self->{'slice'}->{$s}->{'in'}; |
close $self->{'slice'}->{$s}->{'h'} || carp "can't close slice $s: $!"; |
527 |
|
|
528 |
$self->slice_output($s); |
$self->slice_output($s); |
529 |
|
|
530 |
# clean up |
undef $self->{'slice'}->{$s}->{'h'}; |
|
$self->{'slice'}->{$s}->{'h'}->finish or confess "finish on slice $s returned: $?"; |
|
531 |
|
|
532 |
delete($self->{'slice'}->{$s}) && return 1; |
delete($self->{'slice'}->{$s}) && return 1; |
533 |
return 0; |
return 0; |
540 |
|
|
541 |
my $xml = $i->to_xml({ foo => 'bar' }); |
my $xml = $i->to_xml({ foo => 'bar' }); |
542 |
|
|
543 |
This function is extracted from L<"add"> method so that you can L<Memoize> it. |
This function is extracted from L<"add"> method so that you can C<Memoize> it. |
544 |
If your data set has a lot of repeatable data, and memory is not a problem, you |
If your data set has a lot of repeatable data, and memory is not a problem, you |
545 |
can add C<memoize_to_xml> option to L<"open">. |
can add C<memoize_to_xml> option to L<"open_index">. |
546 |
|
|
547 |
=cut |
=cut |
548 |
|
|
567 |
$xml .= qq{</xml>}; |
$xml .= qq{</xml>}; |
568 |
} |
} |
569 |
|
|
570 |
|
sub _debug { |
571 |
|
my $self = shift; |
572 |
|
print STDERR "## ",@_,"\n" if ($self->{'debug'}); |
573 |
|
return; |
574 |
|
} |
575 |
|
|
576 |
1; |
1; |
577 |
__END__ |
__END__ |
578 |
|
|
579 |
|
|
580 |
=head2 Searching |
=head1 Searching |
581 |
|
|
582 |
Searching is still conducted using L<SWISH::API>, but you have to glob |
Searching is still conducted using L<SWISH::API>, but you have to glob |
583 |
index names. |
index names. |
592 |
That would also benefit performance, but it increases indexing time |
That would also benefit performance, but it increases indexing time |
593 |
because merged indexes must be re-created on each indexing run. |
because merged indexes must be re-created on each indexing run. |
594 |
|
|
595 |
=head2 EXPORT |
=head1 EXPORT |
596 |
|
|
597 |
Nothing by default. |
Nothing by default. |
598 |
|
|
599 |
=head2 EXAMPLES |
=head1 EXAMPLES |
600 |
|
|
601 |
Test script for this module uses all parts of API. It's also nice example |
Test script for this module uses all parts of API. It's also nice example |
602 |
how to use C<SWISH::Split>. |
how to use C<SWISH::Split>. |