--- trunk/Split.pm	2004/08/08 10:53:04	3
+++ trunk/Split.pm	2004/08/08 19:22:56	4
@@ -12,6 +12,8 @@
 use Carp;
 use Digest::MD5 qw(md5_hex);
 use Memoize;
+use IPC::Run qw(start timeout pump finish);
+use File::Which;
 
 use Data::Dumper;
 
@@ -27,15 +29,15 @@
 =head1 DESCRIPTION
 
 This is alternative interface for indexing data with swish-e. It's designed
-to split indexes over multiple files to allow updates of records in index
-by reindexing just changed parts.
+to split indexes over multiple files (slices) to allow updates of records in index
+by reindexing just changed parts (slice).
 
 Data is stored in index using intrface which is somewhat similar to
 L<Plucene::Simple>. This could make your migration (or supporting two index
 engines) easier.
 
 In the background, it will fork swish-e binaries (one for each index slice)
-and produce UTF-8 encoded XML files. So, if your imput charset isn't
+and produce UTF-8 encoded XML files for it. So, if your input charset isn't
 C<ISO-8859-1> you will have to specify it.
 
 =head1 Methods used for indexing
@@ -48,8 +50,13 @@
   	index => '/path/to/index',
 	slice_name => \&slice_on_path,
 	slices => 30,
-	merge => 1,
-	codepage => 'ISO-8859-2'
+	merge => 0,
+	codepage => 'ISO-8859-2',
+	swish_config => qq{
+		PropertyNames from date
+		PropertyNamesDate date
+        },
+	memoize_to_xml => 0,
   );
 
   # split index on first component of path
@@ -57,10 +64,43 @@
 	return shift split(/\//,$_[0]);
   }
 
+Options to C<open> are as follows:
 
-C<slices> is maximum number of index slices. See L<"in_slice"> for
+=over 5
+
+=item C<index>
+
+path to (existing) directory in which index slices will be created.
+
+=item C<slice_name>
+
+coderef to a function which provides the slice name for a given path.
+
+=item C<slices>
+
+maximum number of index slices. See L<"in_slice"> for
 more explanation.
 
+=item C<merge>
+
+(planned) option to merge indexes into one at end.
+
+=item C<codepage>
+
+data codepage (needed for conversion to UTF-8).
+By default, it's C<ISO-8859-1>.
+
+=item C<swish_config>
+
+additional parameters which will be inserted into the
+C<swish-e> configuration file. See L<swish-config>.
+
+=item C<memoize_to_xml>
+
+speed up repeatable data, see L<"to_xml">.
+
+=back
+
 =cut
 
 my $iso2utf = Text::Iconv->new('ISO-8859-1','UTF-8');
@@ -70,8 +110,6 @@
         my $self = {@_};
 	bless($self, $class);
 
-	print Dumper($self->{'slice_name'});
-
 	croak "need slice_name coderef" unless ref $self->{'slice_name'};
 	croak "need slices" unless $self->{'slices'};
 
@@ -81,7 +119,9 @@
 
 	$iso2utf = Text::Iconv->new($self->{'codepage'},'UTF-8') if ($self->{'codepage'});
 
+	# speedup
 	memoize('in_slice');
+	memoize('to_xml') if ($self->{'memoize_to_xml'});
 
 	$self ? return $self : return undef;
 
@@ -104,6 +144,13 @@
 	my $swishpath = shift || return;
 	my $data = shift || return;
 
+	my ($out,$err) = $self->put_slice($swishpath, $self->to_xml($data));
+
+	if ($err) {
+		carp "$swishpath: $err";
+		return 0;
+	}
+
 	return 1;
 }
 
@@ -124,21 +171,29 @@
 }
 
 
-=head2 close
+=head2 finish
 
-Close index file and finish indexing.
+Finish indexing and close index file(s).
 
-  $i->close;
+  $i->finish;
 
 This is most time-consuming operation. When it's called, it will re-index
 all entries which haven't changed in all slices.
 
+Returns number of slices updated.
+
 =cut
 
-sub close {
+sub finish {
 	my $self = shift;
 
-	return 1;
+	# tally of slices for which close_slice reported success
+	my $closed = 0;
+	for my $slice_nr (keys %{$self->{'slice'}}) {
+		$closed += $self->close_slice($slice_nr);
+	}
+
+	return $closed;
 }
 
 
@@ -214,6 +269,8 @@
 run, think about creating your own C<slice> function and distributing
 documents manually across slices.
 
+Slice number must always be a true value or various sanity checks will fail.
+
 This function is C<Memoize>ed for performance reasons.
 
 =cut
@@ -223,7 +280,6 @@
 
 	my $path = shift || confess "need path";
 
-	print Dumper($self->{'slice_name'});
 	confess "need slice_name function" unless ref ($self->{'slice_name'});
 
 	if ($self->{'slices'}) {
@@ -235,8 +291,9 @@
 		# FIXME how random is this?
 		$slice = hex(substr($slice,0,8));
 		
-		print "slice_nr: $slice slices: ",$self->{'slices'},"\n";
-		return ($slice % $self->{'slices'});
+		$slice = ($slice % $self->{'slices'}) + 1;
+		print "hash: $slice / ",$self->{'slices'}," => $slice\n";
+		return $slice;
 	} else {
 		return &{$self->{'split'}}($path);
 	}
@@ -260,6 +317,251 @@
 }
 
 
+=head2 make_config
+
+Create C<swish-e> configuration file for given slice.
+
+  my $config_filename = $i->make_config('slice name');
+
+It returns configuration filename. If no C<swish_config> was defined in
+L<"open">, default swish-e configuration will be used. It will index all data for
+searching, but none for properties.
+
+If you want to see what is already defined for swish-e in configuration
+take a look at source code for C<DEFAULT_SWISH_CONF>.
+
+It uses C<cat> utility to communicate with C<swish-e>. Path is provided
+by C<File::Which>. Do Windows users have to change that to C<COPY /B>
+or something similar?
+
+=cut
+
+sub make_config {
+	my $self = shift;
+
+	# the slice's index file lives in the index directory, named after the slice
+	my $index_file = $self->{'index'}."/";
+	$index_file .= shift || confess "need slice name";
+
+	# IndexDir names the program swish-e runs as its input source (it is
+	# started with -S prog); use the full path to cat found in PATH
+	my $cat = which('cat') || confess "can't find cat in PATH";
+
+	my ($tmp_fh, $swish_config_filename) = mkstemp("/tmp/swishXXXXX");
+	print $tmp_fh <<"DEFAULT_SWISH_CONF";
+# swish-e config file
+
+IndexDir $cat
+#SwishProgParameters -
+
+# input file definition
+DefaultContents XML*
+
+# indexed metatags
+MetaNames xml swishdocpath
+
+
+#XMLClassAttributes type
+UndefinedMetaTags auto
+UndefinedXMLAttributes auto
+
+IndexFile $index_file
+
+# Croatian ISO-8859-2 characters to unaccented equivalents
+TranslateCharacters 工靜菷翔小 ssddcccczz
+
+
+# disable output
+ParserWarnLevel 0
+IndexReport 1
+
+DEFAULT_SWISH_CONF
+
+	# add user parameters (like stored properties)
+	print $tmp_fh $self->{'swish_config'} if ($self->{'swish_config'});
+
+	close($tmp_fh) || confess "can't write $swish_config_filename: $!";
+
+	return $swish_config_filename;
+}
+
+=head2 create_slice
+
+On first run, starts C<swish-e> using L<IPC::Run>. On subsequent calls just return
+its handles using L<Memoize>.
+
+  my $s = create_slice('/path/to/document');
+
+You shouldn't need to call C<create_slice> directly because it will be called
+from L<"put_slice"> when needed.
+
+=cut
+
+sub create_slice {
+	my ($self, $path) = @_;
+	confess "create_slice need path!" unless $path;
+
+	# slice numbers are always true values, so a false one means trouble
+	my $nr = $self->in_slice($path);
+	confess "in_slice returned null" unless $nr;
+
+	# swish-e for this slice is already running, nothing more to do
+	return $nr if exists $self->{'slice'}->{$nr};
+
+	my @cmd = ('swish-e', '-S', 'prog', '-c', $self->make_config($nr));
+
+	print STDERR "creating slice $nr\n";	# FIXME
+
+	# Register the slice record first: its in/out/err scalars are handed
+	# to IPC::Run by reference when the swish-e process is started.
+	my $slice = $self->{'slice'}->{$nr} = {};
+	$slice->{'h'} = start \@cmd,
+		\$slice->{'in'}, \$slice->{'out'}, \$slice->{'err'},
+		timeout( 90 );	# FIXME
+
+	# nothing from swish-e has been accounted for yet
+	$slice->{'out_len'} = 0;
+	$slice->{'err_len'} = 0;
+
+	$self->slice_output($nr);
+
+	return $nr;
+}
+
+=head2 put_slice
+
+Pass XML data to swish and receive output and errors.
+
+  my ($out,$err) = $i->put_slice('/swish/path', '<xml>data</xml>');
+
+=cut
+
+sub put_slice {
+	my $self = shift;
+	my $path = shift || confess "need path";
+	my $xml = shift || confess "need xml";
+
+	# recode to UTF-8; on failure warn and skip this document (returns nothing)
+	my $utf8 = $iso2utf->convert($xml);
+	unless (defined $utf8) {
+		carp "XML conversion error in $xml";
+		return;
+	}
+
+	my $s = $self->create_slice($path) || confess "create_slice returned null";
+	my $slice = $self->{'slice'}->{$s} || confess "no slice $s";
+	confess "no 'in' in slice $s: ".Dumper($s) unless (exists($slice->{'in'}));
+	confess "no 'h' in slice $s: ".Dumper($s) unless (exists($slice->{'h'}));
+
+	$self->slice_output($s);
+
+	use bytes;      # Content-Length must count bytes, not chars
+	$slice->{'in'} .= "Path-Name: $path\n".
+		"Content-Length: ".(length($utf8)+1)."\n".	# +1 for the final newline
+		"Document-Type: XML\n\n$utf8\n";
+
+	$slice->{'h'}->pump while length $slice->{'in'};	# wait for all input to go
+	$self->slice_output($s);
+	# NOTE(review): the POD synopsis shows ($out,$err) but the slice is returned
+	return $s;
+}
+
+=head2 slice_output
+
+Prints to STDERR output and errors from C<swish-e>.
+
+  $i->slice_output($s);
+
+Normally, you don't need to call it.
+
+=cut
+
+sub slice_output {
+	my $self = shift;
+
+	my $s = shift || confess "slice_output needs slice";
+
+	confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
+	my $slice = $self->{'slice'}->{$s};
+	confess "no 'in' in slice $s: ".Dumper($s) unless (exists($slice->{'in'}));
+	confess "no 'out' in slice $s: ".Dumper($s) unless (exists($slice->{'out'}));
+
+	# stdout is only tracked, never printed; remember how much was seen
+	my $out_len = length($slice->{'out'}) || 0;
+	$slice->{'out_len'} = $out_len if ($out_len > $slice->{'out_len'});
+
+	# stderr is checked on its own so new stdout data can't hide an error
+	my $err_len = length($slice->{'err'}) || 0;
+	return 1 unless ($err_len > $slice->{'err_len'});
+
+	print STDERR "swish-e ERR: ",substr($slice->{'err'}, $slice->{'err_len'}),"\n";
+	$slice->{'err_len'} = $err_len;
+	return 0;	# new error output: report failure to the caller
+}
+
+=head2 close_slice
+
+Close slice (terminates swish-e process for that slice).
+
+  $i->close_slice($s);
+
+Returns true if slice is closed, false otherwise.
+
+=cut
+
+sub close_slice {
+	my ($self, $s) = @_;
+	confess "close_slice needs slice" unless $s;
+
+	confess "no slice $s" unless (exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
+	my $slice = $self->{'slice'}->{$s};
+	confess "no 'h' in slice $s: ".Dumper($s) unless (exists($slice->{'h'}));
+
+	# flush whatever input is still buffered before shutting swish-e down
+	while (length $slice->{'in'}) {
+		$slice->{'h'}->pump;
+	}
+	$self->slice_output($s);
+	# a false result from the harness's finish is treated as fatal here
+	$slice->{'h'}->finish or confess "finish on slice $s returned: $?";
+
+	# forget the slice; delete hands back the removed record
+	return delete($self->{'slice'}->{$s}) ? 1 : 0;
+}
+
+=head2 to_xml
+
+Convert (binary safe, I hope) your data into XML for C<swish-e>.
+Data will not yet be recoded to UTF-8. L<"put_slice"> will do that.
+
+  my $xml = $i->to_xml({ foo => 'bar' });
+
+This function is extracted from L<"add"> method so that you can L<Memoize> it.
+If your data set has a lot of repeatable data, and memory is not a problem, you
+can add C<memoize_to_xml> option to L<"open">.
+
+=cut
+
+# characters rewritten as XML entities by to_xml
+my %escape = ('<'=>'&lt;', '>'=>'&gt;', '&'=>'&amp;', '"'=>'&quot;');
+my $escape_re  = join '|' => keys %escape;
+
+sub to_xml {
+	my $self = shift;
+	my $data = shift || return;
+
+	my $xml = qq{<xml>};
+	# sorted, so the same data always yields the same XML
+	foreach my $tag (sort keys %$data) {
+		my $content = $data->{$tag};
+		# skip only missing/empty values; 0 is legitimate content
+		next unless (defined($content) && $content ne '');
+		# NOTE(review): entities are not decoded inside CDATA -- confirm both are wanted
+		$content =~ s/($escape_re)/$escape{$1}/gs;
+		$xml .= "<$tag><![CDATA[".$content."]]></$tag>";
+	}
+	return $xml.qq{</xml>};
+}
 
 1;
 __END__
@@ -282,9 +584,12 @@
 
 =head2 EXPORT
 
-None by default.
+Nothing by default.
 
+=head2 EXAMPLES
 
+Test script for this module uses all parts of API. It's also a nice example
+of how to use C<SWISH::Split>.
 
 =head1 SEE ALSO