--- trunk/Split.pm	2004/08/08 19:22:56	4
+++ trunk/Split.pm	2004/12/19 03:06:01	8
@@ -4,7 +4,7 @@
 use strict;
 use warnings;
 
-our $VERSION = '0.00';
+our $VERSION = '0.02';
 
 use SWISH::API;
 use Text::Iconv;
@@ -12,11 +12,15 @@
 use Carp;
 use Digest::MD5 qw(md5_hex);
 use Memoize;
-use IPC::Run qw(start timeout pump finish);
 use File::Which;
 
 use Data::Dumper;
 
+use constant {
+	ADDED => 1,
+	DELETED => 2,
+};
+
 =head1 NAME
 
 SWISH::Split - Perl interface to split index variant of Swish-e
@@ -37,16 +41,16 @@
 engines) easier.
 
 In the background, it will fork swish-e binaries (one for each index slice)
-and produce UTF-8 encoded XML files for it. So, if your imput charset isn't
+and produce UTF-8 encoded XML files for it. So, if your input charset isn't
 C<ISO-8859-1> you will have to specify it.
 
 =head1 Methods used for indexing
 
-=head2 open
+=head2 open_index
 
 Create new object for index.
 
-  my $i = SWISH::Split->open({
+  my $i = SWISH::Split->open_index({
   	index => '/path/to/index',
 	slice_name => \&slice_on_path,
 	slices => 30,
@@ -64,7 +68,7 @@
 	return shift split(/\//,$_[0]);
   }
 
-Options to open are following:
+Options to C<open_index> are as follows:
 
 =over 5
 
@@ -105,7 +109,7 @@
 
 my $iso2utf = Text::Iconv->new('ISO-8859-1','UTF-8');
 
-sub open {
+sub open_index {
         my $class = shift;
         my $self = {@_};
 	bless($self, $class);
@@ -144,14 +148,9 @@
 	my $swishpath = shift || return;
 	my $data = shift || return;
 
-	my ($out,$err) = $self->put_slice($swishpath, $self->to_xml($data));
-
-	if ($err) {
-		carp "$swishpath: $err";
-		return 0;
-	}
+	my $slice = $self->put_slice($swishpath, $self->to_xml($data));
 
-	return 1;
+	return $slice;
 }
 
 =head2 delete
@@ -167,29 +166,37 @@
 
 	my @paths = @_ || return;
 
+	foreach my $path (@paths) {
+		$self->{'paths'}->{$path} = DELETED; 
+	}
+
 	return 42;
 }
 
 
-=head2 finish
+=head2 done
 
 Finish indexing and close index file(s).
 
-  $i->finish;
+  $i->done;
 
 This is most time-consuming operation. When it's called, it will re-index
 all entries which haven't changed in all slices.
 
 Returns number of slices updated.
 
+This method should really be called close or finish, but both of those are
+already used.
+
 =cut
 
-sub finish {
+sub done {
 	my $self = shift;
 
 	my $ret = 0;
 
 	foreach my $s (keys %{$self->{'slice'}}) {
+		$self->_debug("closing slice $s");
 		$ret += $self->close_slice($s);
 	}
 
@@ -212,6 +219,11 @@
 
 sub swishpaths {
 	my $self = shift;
+	# Slice name is required; without it (or for an unknown slice) return nothing.
+	my $s = shift || return;
+	return if (! exists($self->{'slice'}->{$s}));
+	# NOTE(review): this lists the keys stored under the slice entry -- confirm that is where paths are kept.
+	return keys %{$self->{'slice'}->{$s}};
 }
 
 =head2 swishpaths_updated
@@ -262,7 +274,7 @@
 
   my $s = $i->in_slice('path/to/document/in/index');
 
+If there is a C<slices> parameter to L<"open_index"> it will use
+If there are C<slices> parametar to L<"open_index"> it will use
 MD5 hash to spread documents across slices. That will produce random
 distribution of your documents in slices, which might or might not be best
 for your data. If you have to re-index large number of slices on each
@@ -286,13 +298,13 @@
 		# first, pass path through slice_name function
 		my $slice = &{$self->{'slice_name'}}($path);
 		# then calculate MD5 hash
-		$slice = md5_hex($slice);
+		my $hash = md5_hex($slice);
 		# take first 8 chars to produce number
 		# FIXME how random is this?
-		$slice = hex(substr($slice,0,8));
+		$hash = hex(substr($hash,0,8));
 		
-		$slice = ($slice % $self->{'slices'}) + 1;
-		print "hash: $slice / ",$self->{'slices'}," => $slice\n";
+		$slice = ($hash % $self->{'slices'}) + 1;
+		$self->_debug("hash: $hash / ",$self->{'slices'}," => $slice");
 		return $slice;
 	} else {
 		return &{$self->{'split'}}($path);
@@ -313,7 +325,6 @@
 sub find_paths {
 	my $self = shift;
 
-	my $s = shift || return;
 }
 
 
@@ -324,15 +335,13 @@
   my $config_filename = $i->make_config('slice name');
 
 It returns configuration filename. If no C<swish_config> was defined in
-L<"open">, default swish-e configuration will be used. It will index all data for
+L<"open_index">, default swish-e configuration will be used. It will index all data for
 searching, but none for properties.
 
 If you want to see what is allready defined for swish-e in configuration
 take a look at source code for C<DEFAULT_SWISH_CONF>.
 
-It uses C<cat> utility to comunicate with C<swish-e>. Path is provided
-by C<File::Which>. Do Windows users have to change that to C<COPY /B>
-or something similar?
+It uses C<stdin> as C<IndexDir> to communicate with C<swish-e>.
 
 =cut
 
@@ -351,8 +360,7 @@
 	print $tmp_fh <<"DEFAULT_SWISH_CONF";
 # swish-e config file
 
-IndexDir cat
-#SwishProgParameters -
+IndexDir stdin
 
 # input file definition
 DefaultContents XML*
@@ -387,7 +395,7 @@
 
 =head2 create_slice
 
-On first run, starts C<swish-e> using L<IPC::Run>. On subsequent calls just return
+On first run, starts C<swish-e>. On subsequent calls just return
 it's handles using L<Memoize>.
 
   my $s = create_slice('/path/to/document');
@@ -408,20 +416,16 @@
 
 	my $swish_config = $self->make_config($s);
 
-	print STDERR "creating slice $s\n";	# FIXME
+	my $swish = qq{| swish-e };
+	$swish .= qq{ -u } if (-f $self->{'index'}.'/'.$s);	
+	$swish .= qq{ -S prog -c } . $swish_config;
 
-	my @swish = qw(swish-e -S prog -c);
-	push @swish, $swish_config;
+	$self->_debug("creating slice $s using $swish");
 
 	## Build the harness, open all pipes, and launch the subprocesses
-	$self->{'slice'}->{$s}->{'h'} = start \@swish,
-		\$self->{'slice'}->{$s}->{'in'},
-		\$self->{'slice'}->{$s}->{'out'},
-		\$self->{'slice'}->{$s}->{'err'},
-		timeout( 90 );	# FIXME
+	open(my $fh, $swish) || croak "can't open $swish: $!";
 
-	$self->{'slice'}->{$s}->{'out_len'} = 0;
-	$self->{'slice'}->{$s}->{'err_len'} = 0;
+	$self->{'slice'}->{$s}->{'h'} = $fh;
 
 	$self->slice_output($s);
 
@@ -430,9 +434,11 @@
 
 =head2 put_slice
 
-Pass XML data to swish and receive output and errors.
+Pass XML data to swish.
 
-  my ($out,$err) = $i->put_slice('/swish/path', '<xml>data</xml>');
+  my $slice = $i->put_slice('/swish/path', '<xml>data</xml>');
+
+Returns the slice in which the XML ended up.
 
 =cut
 
@@ -447,22 +453,23 @@
 	my $s = $self->create_slice($path) || confess "create_slice returned null";
 
 	confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
-	confess "no 'in' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'in'}));
 	confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));
 
 	$self->slice_output($s);
 
 	use bytes;      # as opposed to chars
-	$self->{'slice'}->{$s}->{'in'} .=
-		"Path-Name: $path\n".
+	my $fh = $self->{'slice'}->{$s}->{'h'} || confess "handle for slice $s undefined";
+	print { $fh } "Path-Name: $path\n".
 		"Content-Length: ".(length($xml)+1)."\n".
+		"Update-Mode: Index\n".
 		"Document-Type: XML\n\n$xml\n";
 
-	# do I/O
-	$self->{'slice'}->{$s}->{'h'}->pump while length $self->{'slice'}->{$s}->{'in'} ;  # wait for all input to go
-
 	$self->slice_output($s);
 
+	$self->_debug("dumping in slice $s: $path");
+
+	$self->{'paths'}->{$path} = ADDED; 
+
 	return $s;
 }
 
@@ -470,10 +477,13 @@
 
 Prints to STDERR output and errors from C<swish-e>.
 
-  $i->slice_output($s);
+  my $slice = $i->slice_output($s);
 
 Normally, you don't need to call it.
 
+B<This is a dummy placeholder function for very old code that assumes this
+module is using C<IPC::Run>, which it no longer does.>
+
 =cut
 
 sub slice_output {
@@ -482,24 +492,13 @@
 	my $s = shift || confess "slice_output needs slice";
 
 	confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
-	confess "no 'in' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'in'}));
-	confess "no 'out' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'out'}));
 
-	if (length $self->{'slice'}->{$s}->{'out'} > $self->{'slice'}->{$s}->{'out_len'}) {
-		#print STDERR "swish-e OUT: ",$self->{'slice'}->{$s}->{'out'},"\n" if ($self->{'slice'}->{$s}->{'out'});
-		$self->{'slice'}->{$s}->{'out_len'} = length $self->{'slice'}->{$s}->{'out'};
-		return 1;
-	} elsif (length $self->{'slice'}->{$s}->{'err'} > $self->{'slice'}->{$s}->{'err_len'}) {
-		print STDERR "swish-e ERR: ",$self->{'slice'}->{$s}->{'err'},"\n" if ($self->{'slice'}->{$s}->{'err'});
-		$self->{'slice'}->{$s}->{'err_len'} = length $self->{'slice'}->{$s}->{'err'};
-		# this is fatal
-		return 0;
-	}
+	# FIXME
 
-	return 1;
+	return $s;
 }
 
-=head2 close_slice {
+=head2 close_slice
 
 Close slice (terminates swish-e process for that slice).
 
@@ -518,12 +517,11 @@
 	confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));
 
 	# pump rest of content (if any)
-	$self->{'slice'}->{$s}->{'h'}->pump while length $self->{'slice'}->{$s}->{'in'};
+	close($self->{'slice'}->{$s}->{'h'}) or carp "can't close slice $s: $!";	# parens + 'or': the check must apply to close(), not to the handle
 
 	$self->slice_output($s);
 
-	# clean up
-	$self->{'slice'}->{$s}->{'h'}->finish or confess "finish on slice $s returned: $?";
+	undef $self->{'slice'}->{$s}->{'h'};
 	
 	delete($self->{'slice'}->{$s}) && return 1;
 	return 0;
@@ -538,7 +536,7 @@
 
 This function is extracted from L<"add"> method so that you can L<Memoize> it.
 If your data set has a lot of repeatable data, and memory is not a problem, you
-can add C<memoize_to_xml> option to L<"open">.
+can add C<memoize_to_xml> option to L<"open_index">.
 
 =cut
 
@@ -563,11 +561,17 @@
 	$xml .= qq{</xml>};
 }
 
+sub _debug {	# print a "## "-prefixed message to STDERR when the 'debug' option is set
+	my ($self, @message) = @_;
+	print STDERR "## ", @message, "\n" if ($self->{'debug'});
+	return;
+}
+
 1;
 __END__
 
 
-=head2 Searching
+=head1 Searching
 
 Searching is still conducted using L<SWISH::API>, but you have to glob
 index names.
@@ -582,11 +586,11 @@
 That would also benefit performance, but it increases indexing time
 because merged indexes must be re-created on each indexing run.
 
-=head2 EXPORT
+=head1 EXPORT
 
 Nothing by default.
 
-=head2 EXAMPLES
+=head1 EXAMPLES
 
 Test script for this module uses all parts of API. It's also nice example
 how to use C<SWISH::Split>.