--- trunk/Split.pm 2004/08/08 19:22:56 4 +++ trunk/Split.pm 2004/12/19 03:06:01 8 @@ -4,7 +4,7 @@ use strict; use warnings; -our $VERSION = '0.00'; +our $VERSION = '0.02'; use SWISH::API; use Text::Iconv; @@ -12,11 +12,15 @@ use Carp; use Digest::MD5 qw(md5_hex); use Memoize; -use IPC::Run qw(start timeout pump finish); use File::Which; use Data::Dumper; +use constant { + ADDED => 1, + DELETED => 2, +}; + =head1 NAME SWISH::Split - Perl interface to split index variant of Swish-e @@ -37,16 +41,16 @@ engines) easier. In the background, it will fork swish-e binaries (one for each index slice) -and produce UTF-8 encoded XML files for it. So, if your imput charset isn't +and produce UTF-8 encoded XML files for it. So, if your input charset isn't C you will have to specify it. =head1 Methods used for indexing -=head2 open +=head2 open_index Create new object for index. - my $i = SWISH::Split->open({ + my $i = SWISH::Split->open_index({ index => '/path/to/index', slice_name => \&slice_on_path, slices => 30, @@ -64,7 +68,7 @@ return shift split(/\//,$_[0]); } -Options to open are following: +Options to C are following: =over 5 @@ -105,7 +109,7 @@ my $iso2utf = Text::Iconv->new('ISO-8859-1','UTF-8'); -sub open { +sub open_index { my $class = shift; my $self = {@_}; bless($self, $class); @@ -144,14 +148,9 @@ my $swishpath = shift || return; my $data = shift || return; - my ($out,$err) = $self->put_slice($swishpath, $self->to_xml($data)); - - if ($err) { - carp "$swishpath: $err"; - return 0; - } + my $slice = $self->put_slice($swishpath, $self->to_xml($data)); - return 1; + return $slice; } =head2 delete @@ -167,29 +166,37 @@ my @paths = @_ || return; + foreach my $path (@paths) { + $self->{'paths'}->{$path} = DELETED; + } + return 42; } -=head2 finish +=head2 done Finish indexing and close index file(s). - $i->finish; + $i->done; This is most time-consuming operation. When it's called, it will re-index all entries which haven't changed in all slices. 
Returns number of slices updated. +This method should really be called close or finish, but both of those are +already used. + =cut -sub finish { +sub done { my $self = shift; my $ret = 0; foreach my $s (keys %{$self->{'slice'}}) { + $self->_debug("closing slice $s"); $ret += $self->close_slice($s); } @@ -212,6 +219,11 @@ sub swishpaths { my $self = shift; + + my $s = shift || return; + return if (! exists($self->{'slice'}->{'s'})); + + return keys %{$self->{'slice'}->{'s'}}; } =head2 swishpaths_updated @@ -262,7 +274,7 @@ my $s = $i->in_slice('path/to/document/in/index'); -If there are C parametar to L<"open"> it will use +If there are C parameter to L<"open_index"> it will use MD5 hash to spread documents across slices. That will produce random distribution of your documents in slices, which might or might not be best for your data. If you have to re-index large number of slices on each @@ -286,13 +298,13 @@ # first, pass path through slice_name function my $slice = &{$self->{'slice_name'}}($path); # then calculate MD5 hash - $slice = md5_hex($slice); + my $hash = md5_hex($slice); # take first 8 chars to produce number # FIXME how random is this? - $slice = hex(substr($slice,0,8)); + $hash = hex(substr($hash,0,8)); - $slice = ($slice % $self->{'slices'}) + 1; - print "hash: $slice / ",$self->{'slices'}," => $slice\n"; + $slice = ($hash % $self->{'slices'}) + 1; + $self->_debug("hash: $hash / ",$self->{'slices'}," => $slice"); return $slice; } else { return &{$self->{'split'}}($path); @@ -313,7 +325,6 @@ sub find_paths { my $self = shift; - my $s = shift || return; } @@ -324,15 +335,13 @@ my $config_filename = $i->make_config('slice name'); It returns configuration filename. If no C was defined in -L<"open">, default swish-e configuration will be used. It will index all data for +L<"open_index">, default swish-e configuration will be used. It will index all data for searching, but none for properties.
If you want to see what is allready defined for swish-e in configuration take a look at source code for C. -It uses C utility to comunicate with C. Path is provided -by C. Do Windows users have to change that to C -or something similar? +It uses C as C to communicate with C. =cut @@ -351,8 +360,7 @@ print $tmp_fh <<"DEFAULT_SWISH_CONF"; # swish-e config file -IndexDir cat -#SwishProgParameters - +IndexDir stdin # input file definition DefaultContents XML* @@ -387,7 +395,7 @@ =head2 create_slice -On first run, starts C using L. On subsequent calls just return +On first run, starts C. On subsequent calls just return it's handles using L. my $s = create_slice('/path/to/document'); @@ -408,20 +416,16 @@ my $swish_config = $self->make_config($s); - print STDERR "creating slice $s\n"; # FIXME + my $swish = qq{| swish-e }; + $swish .= qq{ -u } if (-f $self->{'index'}.'/'.$s); + $swish .= qq{ -S prog -c } . $swish_config; - my @swish = qw(swish-e -S prog -c); - push @swish, $swish_config; + $self->_debug("creating slice $s using $swish"); ## Build the harness, open all pipes, and launch the subprocesses - $self->{'slice'}->{$s}->{'h'} = start \@swish, - \$self->{'slice'}->{$s}->{'in'}, - \$self->{'slice'}->{$s}->{'out'}, - \$self->{'slice'}->{$s}->{'err'}, - timeout( 90 ); # FIXME + open(my $fh, $swish) || croak "can't open $swish: $!"; - $self->{'slice'}->{$s}->{'out_len'} = 0; - $self->{'slice'}->{$s}->{'err_len'} = 0; + $self->{'slice'}->{$s}->{'h'} = $fh; $self->slice_output($s); @@ -430,9 +434,11 @@ =head2 put_slice -Pass XML data to swish and receive output and errors. +Pass XML data to swish. - my ($out,$err) = $i->put_slice('/swish/path', 'data'); + my $slice = $i->put_slice('/swish/path', 'data'); + +Returns slice in which XML ended up.
=cut @@ -447,22 +453,23 @@ my $s = $self->create_slice($path) || confess "create_slice returned null"; confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s})); - confess "no 'in' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'in'})); confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'})); $self->slice_output($s); use bytes; # as opposed to chars - $self->{'slice'}->{$s}->{'in'} .= - "Path-Name: $path\n". + my $fh = $self->{'slice'}->{$s}->{'h'} || confess "handle for slice $s undefined"; + print { $fh } "Path-Name: $path\n". "Content-Length: ".(length($xml)+1)."\n". + "Update-Mode: Index\n". "Document-Type: XML\n\n$xml\n"; - # do I/O - $self->{'slice'}->{$s}->{'h'}->pump while length $self->{'slice'}->{$s}->{'in'} ; # wait for all input to go - $self->slice_output($s); + $self->_debug("dumping in slice $s: $path"); + + $self->{'paths'}->{$path} = ADDED; + return $s; } @@ -470,10 +477,13 @@ Prints to STDERR output and errors from C. - $i->slice_output($s); + my $slice = $i->slice_output($s); Normally, you don't need to call it. 
+B which it isn't any more.> + =cut sub slice_output { @@ -482,24 +492,13 @@ my $s = shift || confess "slice_output needs slice"; confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s})); - confess "no 'in' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'in'})); - confess "no 'out' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'out'})); - if (length $self->{'slice'}->{$s}->{'out'} > $self->{'slice'}->{$s}->{'out_len'}) { - #print STDERR "swish-e OUT: ",$self->{'slice'}->{$s}->{'out'},"\n" if ($self->{'slice'}->{$s}->{'out'}); - $self->{'slice'}->{$s}->{'out_len'} = length $self->{'slice'}->{$s}->{'out'}; - return 1; - } elsif (length $self->{'slice'}->{$s}->{'err'} > $self->{'slice'}->{$s}->{'err_len'}) { - print STDERR "swish-e ERR: ",$self->{'slice'}->{$s}->{'err'},"\n" if ($self->{'slice'}->{$s}->{'err'}); - $self->{'slice'}->{$s}->{'err_len'} = length $self->{'slice'}->{$s}->{'err'}; - # this is fatal - return 0; - } + # FIXME - return 1; + return $s; } -=head2 close_slice { +=head2 close_slice Close slice (terminates swish-e process for that slice). @@ -518,12 +517,11 @@ confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'})); # pump rest of content (if any) - $self->{'slice'}->{$s}->{'h'}->pump while length $self->{'slice'}->{$s}->{'in'}; + close $self->{'slice'}->{$s}->{'h'} || carp "can't close slice $s: $!"; $self->slice_output($s); - # clean up - $self->{'slice'}->{$s}->{'h'}->finish or confess "finish on slice $s returned: $?"; + undef $self->{'slice'}->{$s}->{'h'}; delete($self->{'slice'}->{$s}) && return 1; return 0; @@ -538,7 +536,7 @@ This function is extracted from L<"add"> method so that you can L it. If your data set has a lot of repeatable data, and memory is not a problem, you -can add C option to L<"open">. +can add C option to L<"open_index">. 
=cut @@ -563,11 +561,17 @@ $xml .= qq{}; } +sub _debug { + my $self = shift; + print STDERR "## ",@_,"\n" if ($self->{'debug'}); + return; +} + 1; __END__ -=head2 Searching +=head1 Searching Searching is still conducted using L, but you have to glob index names. @@ -582,11 +586,11 @@ That would also benefit performance, but it increases indexing time because merged indexes must be re-created on each indexing run. -=head2 EXPORT +=head1 EXPORT Nothing by default. -=head2 EXAMPLES +=head1 EXAMPLES Test script for this module uses all parts of API. It's also nice example how to use C.