--- trunk/Split.pm 2004/08/11 14:28:40 5 +++ trunk/Split.pm 2005/04/29 22:50:16 11 @@ -4,7 +4,7 @@ use strict; use warnings; -our $VERSION = '0.00'; +our $VERSION = '0.03'; use SWISH::API; use Text::Iconv; @@ -12,7 +12,6 @@ use Carp; use Digest::MD5 qw(md5_hex); use Memoize; -use IPC::Run qw(start timeout pump finish); use File::Which; use Data::Dumper; @@ -42,16 +41,16 @@ engines) easier. In the background, it will fork swish-e binaries (one for each index slice) -and produce UTF-8 encoded XML files for it. So, if your imput charset isn't +and produce UTF-8 encoded XML files for it. So, if your input charset isn't C you will have to specify it. =head1 Methods used for indexing -=head2 open +=head2 open_index Create new object for index. - my $i = SWISH::Split->open({ + my $i = SWISH::Split->open_index({ index => '/path/to/index', slice_name => \&slice_on_path, slices => 30, @@ -69,7 +68,7 @@ return shift split(/\//,$_[0]); } -Options to open are following: +Options to C<open_index> are the following: =over 5 @@ -98,7 +97,7 @@ =item C additional parametars which will be inserted into -C configuration file. See L. +C configuration file. See C.
=item C @@ -110,7 +109,7 @@ my $iso2utf = Text::Iconv->new('ISO-8859-1','UTF-8'); -sub open { +sub open_index { my $class = shift; my $self = {@_}; bless($self, $class); @@ -149,14 +148,9 @@ my $swishpath = shift || return; my $data = shift || return; - my ($out,$err) = $self->put_slice($swishpath, $self->to_xml($data)); + my $slice = $self->put_slice($swishpath, $self->to_xml($data)); - if ($err) { - carp "$swishpath: $err"; - return 0; - } - - return 1; + return $slice; } =head2 delete @@ -202,7 +196,7 @@ my $ret = 0; foreach my $s (keys %{$self->{'slice'}}) { - print STDERR "closing slice $s\n"; + $self->_debug("closing slice $s"); $ret += $self->close_slice($s); } @@ -280,7 +274,7 @@ my $s = $i->in_slice('path/to/document/in/index'); -If there are C parametar to L<"open"> it will use +If there is a C<slices> parameter to L<"open_index"> it will use MD5 hash to spread documents across slices. That will produce random distribution of your documents in slices, which might or might not be best for your data. If you have to re-index large number of slices on each @@ -304,13 +298,13 @@ # first, pass path through slice_name function my $slice = &{$self->{'slice_name'}}($path); # then calculate MD5 hash - $slice = md5_hex($slice); + my $hash = md5_hex($slice); # take first 8 chars to produce number # FIXME how random is this? - $slice = hex(substr($slice,0,8)); + $hash = hex(substr($hash,0,8)); - $slice = ($slice % $self->{'slices'}) + 1; - print "hash: $slice / ",$self->{'slices'}," => $slice\n"; + $slice = ($hash % $self->{'slices'}) + 1; + $self->_debug("hash: $hash / ",$self->{'slices'}," => $slice"); return $slice; } else { return &{$self->{'split'}}($path); @@ -341,15 +335,13 @@ my $config_filename = $i->make_config('slice name'); It returns configuration filename. If no C was defined in -L<"open">, default swish-e configuration will be used. It will index all data for +L<"open_index">, default swish-e configuration will be used.
It will index all data for searching, but none for properties. If you want to see what is allready defined for swish-e in configuration take a look at source code for C. -It uses C utility to comunicate with C. Path is provided -by C. Do Windows users have to change that to C -or something similar? +It uses C<stdin> as C<IndexDir> to communicate with C<swish-e>. =cut @@ -368,8 +360,7 @@ print $tmp_fh <<"DEFAULT_SWISH_CONF"; # swish-e config file -IndexDir cat -#SwishProgParameters - +IndexDir stdin # input file definition DefaultContents XML* @@ -404,8 +395,8 @@ =head2 create_slice -On first run, starts C using L. On subsequent calls just return -it's handles using L. +On first run, starts C. On subsequent calls just return +its handles using C. my $s = create_slice('/path/to/document'); @@ -425,20 +416,19 @@ my $swish_config = $self->make_config($s); - print STDERR "creating slice $s\n"; # FIXME + my $swish = qq{| swish-e }; + if (-f $self->{'index'}.'/'.$s) { + $swish .= qq{ -u }; + $self->{'slice'}->{$s}->{'update_mode'}++; + } + $swish .= qq{ -S prog -c } . $swish_config; - my @swish = qw(swish-e -S prog -c); - push @swish, $swish_config; + $self->_debug("creating slice $s using $swish"); ## Build the harness, open all pipes, and launch the subprocesses - $self->{'slice'}->{$s}->{'h'} = start \@swish, - \$self->{'slice'}->{$s}->{'in'}, - \$self->{'slice'}->{$s}->{'out'}, - \$self->{'slice'}->{$s}->{'err'}, - timeout( 90 ); # FIXME + open(my $fh, $swish) || croak "can't open $swish: $!"; - $self->{'slice'}->{$s}->{'out_len'} = 0; - $self->{'slice'}->{$s}->{'err_len'} = 0; + $self->{'slice'}->{$s}->{'h'} = $fh; $self->slice_output($s); @@ -447,9 +437,11 @@ =head2 put_slice -Pass XML data to swish and receive output and errors. +Pass XML data to swish. + + my $slice = $i->put_slice('/swish/path', 'data'); - my ($out,$err) = $i->put_slice('/swish/path', 'data'); +Returns slice in which XML ended up.
=cut @@ -464,22 +456,23 @@ my $s = $self->create_slice($path) || confess "create_slice returned null"; confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s})); - confess "no 'in' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'in'})); confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'})); $self->slice_output($s); use bytes; # as opposed to chars - $self->{'slice'}->{$s}->{'in'} .= - "Path-Name: $path\n". - "Content-Length: ".(length($xml)+1)."\n". - "Document-Type: XML\n\n$xml\n"; + my $fh = $self->{'slice'}->{$s}->{'h'} || confess "handle for slice $s undefined"; + + my $update_header = "Update-Mode: Index\n"; + $update_header = '' unless ($self->{'slice'}->{$s}->{'update_mode'}); - # do I/O - $self->{'slice'}->{$s}->{'h'}->pump while length $self->{'slice'}->{$s}->{'in'} ; # wait for all input to go + print { $fh } "Path-Name: $path\n". + "Content-Length: ".(length($xml)+1)."\n" . $update_header . + "Document-Type: XML\n\n$xml\n"; $self->slice_output($s); + $self->_debug("dumping in slice $s: $path"); $self->{'paths'}->{$path} = ADDED; @@ -490,10 +483,13 @@ Prints to STDERR output and errors from C. - $i->slice_output($s); + my $slice = $i->slice_output($s); Normally, you don't need to call it. 
+B which it isn't any more.> + =cut sub slice_output { @@ -502,21 +498,10 @@ my $s = shift || confess "slice_output needs slice"; confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s})); - confess "no 'in' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'in'})); - confess "no 'out' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'out'})); - if (length $self->{'slice'}->{$s}->{'out'} > $self->{'slice'}->{$s}->{'out_len'}) { - #print STDERR "swish-e OUT: ",$self->{'slice'}->{$s}->{'out'},"\n" if ($self->{'slice'}->{$s}->{'out'}); - $self->{'slice'}->{$s}->{'out_len'} = length $self->{'slice'}->{$s}->{'out'}; - return 1; - } elsif (length $self->{'slice'}->{$s}->{'err'} > $self->{'slice'}->{$s}->{'err_len'}) { - print STDERR "swish-e ERR: ",$self->{'slice'}->{$s}->{'err'},"\n" if ($self->{'slice'}->{$s}->{'err'}); - $self->{'slice'}->{$s}->{'err_len'} = length $self->{'slice'}->{$s}->{'err'}; - # this is fatal - return 0; - } + # FIXME - return 1; + return $s; } =head2 close_slice @@ -538,12 +523,11 @@ confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'})); # pump rest of content (if any) - $self->{'slice'}->{$s}->{'h'}->pump while length $self->{'slice'}->{$s}->{'in'}; + close $self->{'slice'}->{$s}->{'h'} || carp "can't close slice $s: $!"; $self->slice_output($s); - # clean up - $self->{'slice'}->{$s}->{'h'}->finish or confess "finish on slice $s returned: $?"; + undef $self->{'slice'}->{$s}->{'h'}; delete($self->{'slice'}->{$s}) && return 1; return 0; @@ -556,9 +540,9 @@ my $xml = $i->to_xml({ foo => 'bar' }); -This function is extracted from L<"add"> method so that you can L it. +This function is extracted from L<"add"> method so that you can C it. If your data set has a lot of repeatable data, and memory is not a problem, you -can add C option to L<"open">. +can add C option to L<"open_index">. 
=cut @@ -583,6 +567,12 @@ $xml .= qq{}; } +sub _debug { + my $self = shift; + print STDERR "## ",@_,"\n" if ($self->{'debug'}); + return; +} + 1; __END__