--- trunk/Split.pm 2004/12/17 18:32:34 7 +++ trunk/Split.pm 2004/12/19 03:06:01 8 @@ -4,7 +4,7 @@ use strict; use warnings; -our $VERSION = '0.01'; +our $VERSION = '0.02'; use SWISH::API; use Text::Iconv; @@ -12,7 +12,6 @@ use Carp; use Digest::MD5 qw(md5_hex); use Memoize; -use IPC::Run qw(start timeout pump finish); use File::Which; use Data::Dumper; @@ -47,11 +46,11 @@ =head1 Methods used for indexing -=head2 open +=head2 open_index Create new object for index. - my $i = SWISH::Split->open({ + my $i = SWISH::Split->open_index({ index => '/path/to/index', slice_name => \&slice_on_path, slices => 30, @@ -69,7 +68,7 @@ return shift split(/\//,$_[0]); } -Options to open are following: +Options to C are following: =over 5 @@ -110,7 +109,7 @@ my $iso2utf = Text::Iconv->new('ISO-8859-1','UTF-8'); -sub open { +sub open_index { my $class = shift; my $self = {@_}; bless($self, $class); @@ -151,11 +150,6 @@ my $slice = $self->put_slice($swishpath, $self->to_xml($data)); -# if ($err) { -# carp "$swishpath: $err"; -# return undef; -# } - return $slice; } @@ -202,7 +196,7 @@ my $ret = 0; foreach my $s (keys %{$self->{'slice'}}) { - print STDERR "closing slice $s\n"; + $self->_debug("closing slice $s"); $ret += $self->close_slice($s); } @@ -280,7 +274,7 @@ my $s = $i->in_slice('path/to/document/in/index'); -If there are C parametar to L<"open"> it will use +If there are C parametar to L<"open_index"> it will use MD5 hash to spread documents across slices. That will produce random distribution of your documents in slices, which might or might not be best for your data. If you have to re-index large number of slices on each @@ -304,13 +298,13 @@ # first, pass path through slice_name function my $slice = &{$self->{'slice_name'}}($path); # then calculate MD5 hash - $slice = md5_hex($slice); + my $hash = md5_hex($slice); # take first 8 chars to produce number # FIXME how random is this? - $slice = hex(substr($slice,0,8)); + $hash = hex(substr($hash,0,8)); - $slice = ($slice % $self->{'slices'}) + 1; - print "hash: $slice / ",$self->{'slices'}," => $slice\n"; + $slice = ($hash % $self->{'slices'}) + 1; + $self->_debug("hash: $hash / ",$self->{'slices'}," => $slice"); return $slice; } else { return &{$self->{'split'}}($path); @@ -341,7 +335,7 @@ my $config_filename = $i->make_config('slice name'); It returns configuration filename. If no C was defined in -L<"open">, default swish-e configuration will be used. It will index all data for +L<"open_index">, default swish-e configuration will be used. It will index all data for searching, but none for properties. If you want to see what is allready defined for swish-e in configuration @@ -401,7 +395,7 @@ =head2 create_slice -On first run, starts C using L. On subsequent calls just return +On first run, starts C. On subsequent calls just return it's handles using L. my $s = create_slice('/path/to/document'); @@ -422,20 +416,16 @@ my $swish_config = $self->make_config($s); - print STDERR "creating slice $s\n"; # FIXME + my $swish = qq{| swish-e }; + $swish .= qq{ -u } if (-f $self->{'index'}.'/'.$s); + $swish .= qq{ -S prog -c } . $swish_config; - my @swish = qw(swish-e -u -S prog -c); - push @swish, $swish_config; + $self->_debug("creating slice $s using $swish"); ## Build the harness, open all pipes, and launch the subprocesses - $self->{'slice'}->{$s}->{'h'} = start \@swish, - \$self->{'slice'}->{$s}->{'in'}, - \$self->{'slice'}->{$s}->{'out'}, - \$self->{'slice'}->{$s}->{'err'}, - timeout( 90 ); # FIXME + open(my $fh, $swish) || croak "can't open $swish: $!"; - $self->{'slice'}->{$s}->{'out_len'} = 0; - $self->{'slice'}->{$s}->{'err_len'} = 0; + $self->{'slice'}->{$s}->{'h'} = $fh; $self->slice_output($s); @@ -463,23 +453,21 @@ my $s = $self->create_slice($path) || confess "create_slice returned null"; confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s})); - confess "no 'in' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'in'})); confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'})); $self->slice_output($s); use bytes; # as opposed to chars - $self->{'slice'}->{$s}->{'in'} .= - "Path-Name: $path\n". + my $fh = $self->{'slice'}->{$s}->{'h'} || confess "handle for slice $s undefined"; + print { $fh } "Path-Name: $path\n". "Content-Length: ".(length($xml)+1)."\n". "Update-Mode: Index\n". "Document-Type: XML\n\n$xml\n"; - # do I/O - $self->{'slice'}->{$s}->{'h'}->pump while length $self->{'slice'}->{$s}->{'in'} ; # wait for all input to go - $self->slice_output($s); + $self->_debug("dumping in slice $s: $path"); + $self->{'paths'}->{$path} = ADDED; return $s; @@ -493,6 +481,9 @@ Normally, you don't need to call it. +B which it isn't any more.> + =cut sub slice_output { @@ -501,19 +492,8 @@ my $s = shift || confess "slice_output needs slice"; confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s})); - confess "no 'in' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'in'})); - confess "no 'out' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'out'})); - if (length $self->{'slice'}->{$s}->{'out'} > $self->{'slice'}->{$s}->{'out_len'}) { - #print STDERR "swish-e OUT: ",$self->{'slice'}->{$s}->{'out'},"\n" if ($self->{'slice'}->{$s}->{'out'}); - $self->{'slice'}->{$s}->{'out_len'} = length $self->{'slice'}->{$s}->{'out'}; - return $s; - } elsif (length $self->{'slice'}->{$s}->{'err'} > $self->{'slice'}->{$s}->{'err_len'}) { - print STDERR "swish-e ERR: ",$self->{'slice'}->{$s}->{'err'},"\n" if ($self->{'slice'}->{$s}->{'err'}); - $self->{'slice'}->{$s}->{'err_len'} = length $self->{'slice'}->{$s}->{'err'}; - # this is fatal - return undef; - } + # FIXME return $s; } @@ -537,12 +517,11 @@ confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'})); # pump rest of content (if any) - $self->{'slice'}->{$s}->{'h'}->pump while length $self->{'slice'}->{$s}->{'in'}; + close $self->{'slice'}->{$s}->{'h'} || carp "can't close slice $s: $!"; $self->slice_output($s); - # clean up - $self->{'slice'}->{$s}->{'h'}->finish or confess "finish on slice $s returned $?: $! -- ",$self->{'slice'}->{$s}->{'err'}; + undef $self->{'slice'}->{$s}->{'h'}; delete($self->{'slice'}->{$s}) && return 1; return 0; @@ -557,7 +536,7 @@ This function is extracted from L<"add"> method so that you can L it. If your data set has a lot of repeatable data, and memory is not a problem, you -can add C option to L<"open">. +can add C option to L<"open_index">. =cut @@ -582,6 +561,12 @@ $xml .= qq{}; } +sub _debug { + my $self = shift; + print STDERR "## ",@_,"\n" if ($self->{'debug'}); + return; +} + 1; __END__