4 |
use strict; |
use strict; |
5 |
use warnings; |
use warnings; |
6 |
|
|
7 |
our $VERSION = '0.00'; |
our $VERSION = '0.01'; |
8 |
|
|
9 |
use SWISH::API; |
use SWISH::API; |
10 |
use Text::Iconv; |
use Text::Iconv; |
17 |
|
|
18 |
use Data::Dumper; |
use Data::Dumper; |
19 |
|
|
20 |
|
use constant { |
21 |
|
ADDED => 1, |
22 |
|
DELETED => 2, |
23 |
|
}; |
24 |
|
|
25 |
=head1 NAME |
=head1 NAME |
26 |
|
|
27 |
SWISH::Split - Perl interface to split index variant of Swish-e |
SWISH::Split - Perl interface to split index variant of Swish-e |
42 |
engines) easier. |
engines) easier. |
43 |
|
|
44 |
In the background, it will fork swish-e binaries (one for each index slice) |
In the background, it will fork swish-e binaries (one for each index slice) |
45 |
and produce UTF-8 encoded XML files for it. So, if your imput charset isn't |
and produce UTF-8 encoded XML files for it. So, if your input charset isn't |
46 |
C<ISO-8859-1> you will have to specify it. |
C<ISO-8859-1> you will have to specify it. |
47 |
|
|
48 |
=head1 Methods used for indexing |
=head1 Methods used for indexing |
149 |
my $swishpath = shift || return; |
my $swishpath = shift || return; |
150 |
my $data = shift || return; |
my $data = shift || return; |
151 |
|
|
152 |
my ($out,$err) = $self->put_slice($swishpath, $self->to_xml($data)); |
my $slice = $self->put_slice($swishpath, $self->to_xml($data)); |
153 |
|
|
154 |
if ($err) { |
# if ($err) { |
155 |
carp "$swishpath: $err"; |
# carp "$swishpath: $err"; |
156 |
return 0; |
# return undef; |
157 |
} |
# } |
158 |
|
|
159 |
return 1; |
return $slice; |
160 |
} |
} |
161 |
|
|
162 |
=head2 delete |
=head2 delete |
172 |
|
|
173 |
my @paths = @_ || return; |
my @paths = @_ || return; |
174 |
|
|
175 |
|
foreach my $path (@paths) { |
176 |
|
$self->{'paths'}->{$path} = DELETED; |
177 |
|
} |
178 |
|
|
179 |
return 42; |
return 42; |
180 |
} |
} |
181 |
|
|
182 |
|
|
183 |
=head2 finish |
=head2 done |
184 |
|
|
185 |
Finish indexing and close index file(s). |
Finish indexing and close index file(s). |
186 |
|
|
187 |
$i->finish; |
$i->done; |
188 |
|
|
189 |
This is the most time-consuming operation. When it's called, it will re-index |
This is the most time-consuming operation. When it's called, it will re-index |
190 |
all entries which haven't changed in all slices. |
all entries which haven't changed in all slices. |
191 |
|
|
192 |
Returns number of slices updated. |
Returns number of slices updated. |
193 |
|
|
194 |
|
This method should really be called close or finish, but both of those are |
195 |
|
already used. |
196 |
|
|
197 |
=cut |
=cut |
198 |
|
|
199 |
sub finish { |
sub done { |
200 |
my $self = shift; |
my $self = shift; |
201 |
|
|
202 |
my $ret = 0; |
my $ret = 0; |
203 |
|
|
204 |
foreach my $s (keys %{$self->{'slice'}}) { |
foreach my $s (keys %{$self->{'slice'}}) { |
205 |
|
print STDERR "closing slice $s\n"; |
206 |
$ret += $self->close_slice($s); |
$ret += $self->close_slice($s); |
207 |
} |
} |
208 |
|
|
225 |
|
|
226 |
sub swishpaths { |
sub swishpaths { |
227 |
my $self = shift; |
my $self = shift; |
228 |
|
|
229 |
|
my $s = shift || return; |
230 |
|
return if (! exists($self->{'slice'}->{'s'})); |
231 |
|
|
232 |
|
return keys %{$self->{'slice'}->{'s'}}; |
233 |
} |
} |
234 |
|
|
235 |
=head2 swishpaths_updated |
=head2 swishpaths_updated |
331 |
sub find_paths { |
sub find_paths { |
332 |
my $self = shift; |
my $self = shift; |
333 |
|
|
|
my $s = shift || return; |
|
334 |
} |
} |
335 |
|
|
336 |
|
|
347 |
If you want to see what is already defined for swish-e in configuration |
If you want to see what is already defined for swish-e in configuration |
348 |
take a look at source code for C<DEFAULT_SWISH_CONF>. |
take a look at source code for C<DEFAULT_SWISH_CONF>. |
349 |
|
|
350 |
It uses C<cat> utility to comunicate with C<swish-e>. Path is provided |
It uses C<stdin> as C<IndexDir> to communicate with C<swish-e>. |
|
by C<File::Which>. Do Windows users have to change that to C<COPY /B> |
|
|
or something similar? |
|
351 |
|
|
352 |
=cut |
=cut |
353 |
|
|
366 |
print $tmp_fh <<"DEFAULT_SWISH_CONF"; |
print $tmp_fh <<"DEFAULT_SWISH_CONF"; |
367 |
# swish-e config file |
# swish-e config file |
368 |
|
|
369 |
IndexDir cat |
IndexDir stdin |
|
#SwishProgParameters - |
|
370 |
|
|
371 |
# input file definition |
# input file definition |
372 |
DefaultContents XML* |
DefaultContents XML* |
424 |
|
|
425 |
print STDERR "creating slice $s\n"; # FIXME |
print STDERR "creating slice $s\n"; # FIXME |
426 |
|
|
427 |
my @swish = qw(swish-e -S prog -c); |
my @swish = qw(swish-e -u -S prog -c); |
428 |
push @swish, $swish_config; |
push @swish, $swish_config; |
429 |
|
|
430 |
## Build the harness, open all pipes, and launch the subprocesses |
## Build the harness, open all pipes, and launch the subprocesses |
444 |
|
|
445 |
=head2 put_slice |
=head2 put_slice |
446 |
|
|
447 |
Pass XML data to swish and receive output and errors. |
Pass XML data to swish. |
448 |
|
|
449 |
my ($out,$err) = $i->put_slice('/swish/path', '<xml>data</xml>'); |
my $slice = $i->put_slice('/swish/path', '<xml>data</xml>'); |
450 |
|
|
451 |
|
Returns slice in which XML ended up. |
452 |
|
|
453 |
=cut |
=cut |
454 |
|
|
472 |
$self->{'slice'}->{$s}->{'in'} .= |
$self->{'slice'}->{$s}->{'in'} .= |
473 |
"Path-Name: $path\n". |
"Path-Name: $path\n". |
474 |
"Content-Length: ".(length($xml)+1)."\n". |
"Content-Length: ".(length($xml)+1)."\n". |
475 |
|
"Update-Mode: Index\n". |
476 |
"Document-Type: XML\n\n$xml\n"; |
"Document-Type: XML\n\n$xml\n"; |
477 |
|
|
478 |
# do I/O |
# do I/O |
480 |
|
|
481 |
$self->slice_output($s); |
$self->slice_output($s); |
482 |
|
|
483 |
|
$self->{'paths'}->{$path} = ADDED; |
484 |
|
|
485 |
return $s; |
return $s; |
486 |
} |
} |
487 |
|
|
489 |
|
|
490 |
Prints to STDERR output and errors from C<swish-e>. |
Prints to STDERR output and errors from C<swish-e>. |
491 |
|
|
492 |
$i->slice_output($s); |
my $slice = $i->slice_output($s); |
493 |
|
|
494 |
Normally, you don't need to call it. |
Normally, you don't need to call it. |
495 |
|
|
507 |
if (length $self->{'slice'}->{$s}->{'out'} > $self->{'slice'}->{$s}->{'out_len'}) { |
if (length $self->{'slice'}->{$s}->{'out'} > $self->{'slice'}->{$s}->{'out_len'}) { |
508 |
#print STDERR "swish-e OUT: ",$self->{'slice'}->{$s}->{'out'},"\n" if ($self->{'slice'}->{$s}->{'out'}); |
#print STDERR "swish-e OUT: ",$self->{'slice'}->{$s}->{'out'},"\n" if ($self->{'slice'}->{$s}->{'out'}); |
509 |
$self->{'slice'}->{$s}->{'out_len'} = length $self->{'slice'}->{$s}->{'out'}; |
$self->{'slice'}->{$s}->{'out_len'} = length $self->{'slice'}->{$s}->{'out'}; |
510 |
return 1; |
return $s; |
511 |
} elsif (length $self->{'slice'}->{$s}->{'err'} > $self->{'slice'}->{$s}->{'err_len'}) { |
} elsif (length $self->{'slice'}->{$s}->{'err'} > $self->{'slice'}->{$s}->{'err_len'}) { |
512 |
print STDERR "swish-e ERR: ",$self->{'slice'}->{$s}->{'err'},"\n" if ($self->{'slice'}->{$s}->{'err'}); |
print STDERR "swish-e ERR: ",$self->{'slice'}->{$s}->{'err'},"\n" if ($self->{'slice'}->{$s}->{'err'}); |
513 |
$self->{'slice'}->{$s}->{'err_len'} = length $self->{'slice'}->{$s}->{'err'}; |
$self->{'slice'}->{$s}->{'err_len'} = length $self->{'slice'}->{$s}->{'err'}; |
514 |
# this is fatal |
# this is fatal |
515 |
return 0; |
return undef; |
516 |
} |
} |
517 |
|
|
518 |
return 1; |
return $s; |
519 |
} |
} |
520 |
|
|
521 |
=head2 close_slice { |
=head2 close_slice |
522 |
|
|
523 |
Close slice (terminates swish-e process for that slice). |
Close slice (terminates swish-e process for that slice). |
524 |
|
|
542 |
$self->slice_output($s); |
$self->slice_output($s); |
543 |
|
|
544 |
# clean up |
# clean up |
545 |
$self->{'slice'}->{$s}->{'h'}->finish or confess "finish on slice $s returned: $?"; |
$self->{'slice'}->{$s}->{'h'}->finish or confess "finish on slice $s returned $?: $! -- ",$self->{'slice'}->{$s}->{'err'}; |
546 |
|
|
547 |
delete($self->{'slice'}->{$s}) && return 1; |
delete($self->{'slice'}->{$s}) && return 1; |
548 |
return 0; |
return 0; |
586 |
__END__ |
__END__ |
587 |
|
|
588 |
|
|
589 |
=head2 Searching |
=head1 Searching |
590 |
|
|
591 |
Searching is still conducted using L<SWISH::API>, but you have to glob |
Searching is still conducted using L<SWISH::API>, but you have to glob |
592 |
index names. |
index names. |
601 |
That would also benefit performance, but it increases indexing time |
That would also benefit performance, but it increases indexing time |
602 |
because merged indexes must be re-created on each indexing run. |
because merged indexes must be re-created on each indexing run. |
603 |
|
|
604 |
=head2 EXPORT |
=head1 EXPORT |
605 |
|
|
606 |
Nothing by default. |
Nothing by default. |
607 |
|
|
608 |
=head2 EXAMPLES |
=head1 EXAMPLES |
609 |
|
|
610 |
Test script for this module uses all parts of API. It's also a nice example of |
Test script for this module uses all parts of API. It's also a nice example of |
611 |
how to use C<SWISH::Split>. |
how to use C<SWISH::Split>. |