--- trunk/Split.pm 2004/08/08 10:53:04 3 +++ trunk/Split.pm 2004/08/08 19:22:56 4 @@ -12,6 +12,8 @@ use Carp; use Digest::MD5 qw(md5_hex); use Memoize; +use IPC::Run qw(start timeout pump finish); +use File::Which; use Data::Dumper; @@ -27,15 +29,15 @@ =head1 DESCRIPTION This is alternative interface for indexing data with swish-e. It's designed -to split indexes over multiple files to allow updates of records in index -by reindexing just changed parts. +to split indexes over multiple files (slices) to allow updates of records in index +by reindexing just changed parts (slice). Data is stored in index using intrface which is somewhat similar to L. This could make your migration (or supporting two index engines) easier. In the background, it will fork swish-e binaries (one for each index slice) -and produce UTF-8 encoded XML files. So, if your imput charset isn't +and produce UTF-8 encoded XML files for it. So, if your imput charset isn't C you will have to specify it. =head1 Methods used for indexing @@ -48,8 +50,13 @@ index => '/path/to/index', slice_name => \&slice_on_path, slices => 30, - merge => 1, - codepage => 'ISO-8859-2' + merge => 0, + codepage => 'ISO-8859-2', + swish_config => qq{ + PropertyNames from date + PropertyNamesDate date + }, + memoize_to_xml => 0, ); # split index on first component of path @@ -57,10 +64,43 @@ return shift split(/\//,$_[0]); } +Options to open are following: -C is maximum number of index slices. See L<"in_slice"> for +=over 5 + +=item C + +path to (existing) directory in which index slices will be created. + +=item C + +coderef to function which provide slicing from path. + +=item C + +maximum number of index slices. See L<"in_slice"> for more explanation. +=item C + +(planned) option to merge indexes into one at end. + +=item C + +data codepage (needed for conversion to UTF-8). +By default, it's C. + +=item C + +additional parametars which will be inserted into +C configuration file. See L. 
+ +=item C + +speed up repeatable data, see L<"to_xml">. + +=back + =cut my $iso2utf = Text::Iconv->new('ISO-8859-1','UTF-8'); @@ -70,8 +110,6 @@ my $self = {@_}; bless($self, $class); - print Dumper($self->{'slice_name'}); - croak "need slice_name coderef" unless ref $self->{'slice_name'}; croak "need slices" unless $self->{'slices'}; @@ -81,7 +119,9 @@ $iso2utf = Text::Iconv->new($self->{'codepage'},'UTF-8') if ($self->{'codepage'}); + # speedup memoize('in_slice'); + memoize('to_xml') if ($self->{'memoize_to_xml'}); $self ? return $self : return undef; @@ -104,6 +144,13 @@ my $swishpath = shift || return; my $data = shift || return; + my ($out,$err) = $self->put_slice($swishpath, $self->to_xml($data)); + + if ($err) { + carp "$swishpath: $err"; + return 0; + } + return 1; } @@ -124,21 +171,29 @@ } -=head2 close +=head2 finish -Close index file and finish indexing. +Finish indexing and close index file(s). - $i->close; + $i->finish; This is most time-consuming operation. When it's called, it will re-index all entries which haven't changed in all slices. +Returns number of slices updated. + =cut -sub close { +sub finish { my $self = shift; - return 1; + my $ret = 0; + + foreach my $s (keys %{$self->{'slice'}}) { + $ret += $self->close_slice($s); + } + + return $ret; } @@ -214,6 +269,8 @@ run, think about creating your own C function and distributing documents manually across slices. +Slice number must always be true value or various sanity checks will fail. + This function is Ced for performance reasons. =cut @@ -223,7 +280,6 @@ my $path = shift || confess "need path"; - print Dumper($self->{'slice_name'}); confess "need slice_name function" unless ref ($self->{'slice_name'}); if ($self->{'slices'}) { @@ -235,8 +291,9 @@ # FIXME how random is this? 
$slice = hex(substr($slice,0,8)); - print "slice_nr: $slice slices: ",$self->{'slices'},"\n"; - return ($slice % $self->{'slices'}); + $slice = ($slice % $self->{'slices'}) + 1; + print "hash: $slice / ",$self->{'slices'}," => $slice\n"; + return $slice; } else { return &{$self->{'split'}}($path); } @@ -260,6 +317,251 @@ } +=head2 make_config + +Create C configuration file for given slice. + + my $config_filename = $i->make_config('slice name'); + +It returns configuration filename. If no C was defined in +L<"open">, default swish-e configuration will be used. It will index all data for +searching, but none for properties. + +If you want to see what is allready defined for swish-e in configuration +take a look at source code for C. + +It uses C utility to comunicate with C. Path is provided +by C. Do Windows users have to change that to C +or something similar? + +=cut + +sub make_config { + my $self = shift; + + + my $index_file = $self->{'index'}."/"; + $index_file .= shift || confess "need slice name"; + + my ($tmp_fh, $swish_config_filename) = mkstemp("/tmp/swishXXXXX"); + + # find cat on filesystem + my $cat = which('cat'); + + print $tmp_fh <<"DEFAULT_SWISH_CONF"; +# swish-e config file + +IndexDir cat +#SwishProgParameters - + +# input file definition +DefaultContents XML* + +# indexed metatags +MetaNames xml swishdocpath + + +#XMLClassAttributes type +UndefinedMetaTags auto +UndefinedXMLAttributes auto + +IndexFile $index_file + +# Croatian ISO-8859-2 characters to unaccented equivalents +TranslateCharacters ¹©ðÐèÈæƾ® ssddcccczz + + +# disable output +ParserWarnLevel 0 +IndexReport 1 + +DEFAULT_SWISH_CONF + + # add user parametars (like stored properties) + print $tmp_fh $self->{'swish_config'} if ($self->{'swish_config'}); + + close($tmp_fh); + + return $swish_config_filename; +} + +=head2 create_slice + +On first run, starts C using L. On subsequent calls just return +it's handles using L. 
+ + my $s = create_slice('/path/to/document'); + +You shouldn't need to call C directly because it will be called +from L<"put_slice"> when needed. + +=cut + +sub create_slice { + my $self = shift; + + my $path = shift || confess "create_slice need path!"; + + my $s = $self->in_slice($path) || confess "in_slice returned null"; + + return $s if (exists($self->{'slice'}->{$s})); + + my $swish_config = $self->make_config($s); + + print STDERR "creating slice $s\n"; # FIXME + + my @swish = qw(swish-e -S prog -c); + push @swish, $swish_config; + + ## Build the harness, open all pipes, and launch the subprocesses + $self->{'slice'}->{$s}->{'h'} = start \@swish, + \$self->{'slice'}->{$s}->{'in'}, + \$self->{'slice'}->{$s}->{'out'}, + \$self->{'slice'}->{$s}->{'err'}, + timeout( 90 ); # FIXME + + $self->{'slice'}->{$s}->{'out_len'} = 0; + $self->{'slice'}->{$s}->{'err_len'} = 0; + + $self->slice_output($s); + + return $s; +} + +=head2 put_slice + +Pass XML data to swish and receive output and errors. + + my ($out,$err) = $i->put_slice('/swish/path', 'data'); + +=cut + +sub put_slice { + my $self = shift; + + my $path = shift || confess "need path"; + my $xml = shift || confess "need xml"; + + $xml = $iso2utf->convert($xml) || carp "XML conversion error in $xml"; + + my $s = $self->create_slice($path) || confess "create_slice returned null"; + + confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s})); + confess "no 'in' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'in'})); + confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'})); + + $self->slice_output($s); + + use bytes; # as opposed to chars + $self->{'slice'}->{$s}->{'in'} .= + "Path-Name: $path\n". + "Content-Length: ".(length($xml)+1)."\n". 
+ "Document-Type: XML\n\n$xml\n"; + + # do I/O + $self->{'slice'}->{$s}->{'h'}->pump while length $self->{'slice'}->{$s}->{'in'} ; # wait for all input to go + + $self->slice_output($s); + + return $s; +} + +=head2 slice_output + +Prints to STDERR output and errors from C. + + $i->slice_output($s); + +Normally, you don't need to call it. + +=cut + +sub slice_output { + my $self = shift; + + my $s = shift || confess "slice_output needs slice"; + + confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s})); + confess "no 'in' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'in'})); + confess "no 'out' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'out'})); + + if (length $self->{'slice'}->{$s}->{'out'} > $self->{'slice'}->{$s}->{'out_len'}) { + #print STDERR "swish-e OUT: ",$self->{'slice'}->{$s}->{'out'},"\n" if ($self->{'slice'}->{$s}->{'out'}); + $self->{'slice'}->{$s}->{'out_len'} = length $self->{'slice'}->{$s}->{'out'}; + return 1; + } elsif (length $self->{'slice'}->{$s}->{'err'} > $self->{'slice'}->{$s}->{'err_len'}) { + print STDERR "swish-e ERR: ",$self->{'slice'}->{$s}->{'err'},"\n" if ($self->{'slice'}->{$s}->{'err'}); + $self->{'slice'}->{$s}->{'err_len'} = length $self->{'slice'}->{$s}->{'err'}; + # this is fatal + return 0; + } + + return 1; +} + +=head2 close_slice + +Close slice (terminates swish-e process for that slice). + + $i->close_slice($s); + +Returns true if slice is closed, false otherwise. 
+ +=cut + +sub close_slice { + my $self = shift; + + my $s = shift || confess "close_slice needs slice"; + + confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s})); + confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'})); + + # pump rest of content (if any) + $self->{'slice'}->{$s}->{'h'}->pump while length $self->{'slice'}->{$s}->{'in'}; + + $self->slice_output($s); + + # clean up + $self->{'slice'}->{$s}->{'h'}->finish or confess "finish on slice $s returned: $?"; + + delete($self->{'slice'}->{$s}) && return 1; + return 0; +} + +=head2 to_xml + +Convert (binary safe, I hope) your data into XML for C. +Data will not yet be recoded to UTF-8. L<"put_slice"> will do that. + + my $xml = $i->to_xml({ foo => 'bar' }); + +This function is extracted from L<"add"> method so that you can L it. +If your data set has a lot of repeatable data, and memory is not a problem, you +can add C option to L<"open">. + +=cut + +my %escape = ('<'=>'<', '>'=>'>', '&'=>'&', '"'=>'"'); +my $escape_re = join '|' => keys %escape; + +sub to_xml { + my $self = shift; + + my $data = shift || return; + + my $xml = qq{}; + foreach my $tag (keys %$data) { + my $content = $data->{$tag}; + next if (! $content || $content eq ''); + # save [cr/]lf before conversion to XML +# $content =~ s/\n\r/##lf##/gs; +# $content =~ s/\n/##lf##/gs; + $content =~ s/($escape_re)/$escape{$1}/gs; + $xml .= "<$tag>"; + } + $xml .= qq{}; +} 1; __END__ @@ -282,9 +584,12 @@ =head2 EXPORT -None by default. +Nothing by default. +=head2 EXAMPLES +The test script for this module uses all parts of the API. It's also a nice example +of how to use C. =head1 SEE ALSO