# SWISH-Split/trunk/Split.pm

package SWISH::Split;

use 5.008;
use strict;
use warnings;

our $VERSION = '0.00';

use SWISH::API;
use Text::Iconv;
use File::Temp qw/ :mktemp /;
use Carp;
use Digest::MD5 qw(md5_hex);
use Memoize;
use IPC::Run qw(start timeout pump finish);
use File::Which;

use Data::Dumper;

=head1 NAME

SWISH::Split - Perl interface to split index variant of Swish-e

=head1 SYNOPSIS

  use SWISH::Split;


=head1 DESCRIPTION

This is alternative interface for indexing data with swish-e. It's designed
to split indexes over multiple files (slices) to allow updates of records in index
by reindexing just changed parts (slice).

Data is stored in the index using an interface which is somewhat similar to
L<Plucene::Simple>. This could make your migration (or supporting two index
engines) easier.

In the background, it will fork swish-e binaries (one for each index slice)
and produce UTF-8 encoded XML files for it. So, if your input charset isn't
C<ISO-8859-1> you will have to specify it.

=head1 Methods used for indexing

=head2 open

Create new object for index.

  my $i = SWISH::Split->open(
        index => '/path/to/index',
        slice_name => \&slice_on_path,
        slices => 30,
        merge => 0,
        codepage => 'ISO-8859-2',
        swish_config => qq{
                PropertyNames from date
                PropertyNamesDate date
        },
        memoize_to_xml => 0,
  );

  # split index on first component of path
  sub slice_on_path {
        return (split(/\//,$_[0]))[0];
  }

Options to open are following:

=over 5

=item C<index>

path to (existing) directory in which index slices will be created.

=item C<slice_name>

coderef to function which provide slicing from path.

=item C<slices>

maximum number of index slices. See L<"in_slice"> for
more explanation.

=item C<merge>

(planned) option to merge indexes into one at end.

=item C<codepage>

data codepage (needed for conversion to UTF-8).
By default, it's C<ISO-8859-1>.

=item C<swish_config>

additional parameters which will be inserted into
C<swish-e> configuration file. See L<swish-config>.

=item C<memoize_to_xml>

speed up repeatable data, see L<"to_xml">.

=back

=cut

my $iso2utf = Text::Iconv->new('ISO-8859-1','UTF-8');

sub open {
        my $class = shift;
        my $self = {@_};
        bless($self, $class);

        croak "need slice_name coderef" unless ref $self->{'slice_name'};
        croak "need slices" unless $self->{'slices'};

        croak "need index" unless $self->{'index'};
        croak "index '",$self->{'index'},"' doesn't exist" unless -e $self->{'index'};
        croak "index '",$self->{'index'},"' is not directory" unless -d $self->{'index'};

        $iso2utf = Text::Iconv->new($self->{'codepage'},'UTF-8') if ($self->{'codepage'});

        # speedup
        memoize('in_slice');
        memoize('to_xml') if ($self->{'memoize_to_xml'});

        $self ? return $self : return undef;

}

=head2 add

Add document to index.

  $i->add($swishpath, {
        headline => 'foobar result',
        property => 'data',
  })

=cut

sub add {
        my $self = shift;

        my $swishpath = shift || return;
        my $data = shift || return;

        my ($out,$err) = $self->put_slice($swishpath, $self->to_xml($data));

        if ($err) {
                carp "$swishpath: $err";
                return 0;
        }

        return 1;
}

=head2 delete

Delete documents from index.

  $i->delete(@swishpath);

=cut

sub delete {
        my $self = shift;

        my @paths = @_ || return;

        return 42;
}


=head2 finish

Finish indexing and close index file(s).

  $i->finish;

This is most time-consuming operation. When it's called, it will re-index
all entries which haven't changed in all slices.

Returns number of slices updated.

=cut

sub finish {
        my $self = shift;

        my $ret = 0;

        foreach my $s (keys %{$self->{'slice'}}) {
                $ret += $self->close_slice($s);
        }

        return $ret;
}


=head1 Reporting methods

These methods return statistics about your index.

=head2 swishpaths

Return array of C<swishpath>s in index.

  my @p = $i->swishpaths;

=cut

sub swishpaths {
        my $self = shift;
}

=head2 swishpaths_updated

Return array with updated C<swishpath>s.

  my @d = $i->swishpaths_updated;

=cut

sub swishpaths_updated {
        my $self = shift;
}


=head2 swishpaths_deleted

Return array with deleted C<swishpath>s.

  my $n = $i->swishpaths_deleted;

=cut

sub swishpaths_deleted {
        my $self = shift;
}


=head2 slices

Return array with all slice names.

  my @s = $i->slices;

=cut

sub slices {
        my $self = shift;
}

=head1 Helper methods

These methods are used internally, but they might be useful.

=head2 in_slice

Takes path and return slice in which this path belongs.

  my $s = $i->in_slice('path/to/document/in/index');

If there is a C<slices> parameter to L<"open"> it will use
MD5 hash to spread documents across slices. That will produce random
distribution of your documents in slices, which might or might not be best
for your data. If you have to re-index large number of slices on each
run, think about creating your own C<slice_name> function and distributing
documents manually across slices.

Slice number must always be true value or various sanity checks will fail.

This function is C<Memoize>ed for performance reasons.

=cut

sub in_slice {
        my $self = shift;

        my $path = shift || confess "need path";

        confess "need slice_name function" unless ref ($self->{'slice_name'});

        if ($self->{'slices'}) {
                # first, pass path through slice_name function
                my $slice = &{$self->{'slice_name'}}($path);
                # then calculate MD5 hash
                $slice = md5_hex($slice);
                # take first 8 chars to produce number
                # FIXME how random is this?
                $slice = hex(substr($slice,0,8));
                
                $slice = ($slice % $self->{'slices'}) + 1;
                print "hash: $slice / ",$self->{'slices'}," => $slice\n";
                return $slice;
        } else {
                return &{$self->{'split'}}($path);
        }
}

=head2 find_paths

Return array of C<swishpath>s for given C<swish-e> query.

  my @p = $i->find_paths("headline=test*");

Useful for combining with L<"delete"> to delete documents
which haven't changed for a while (so, expired).

=cut

sub find_paths {
        my $self = shift;

        my $s = shift || return;
}


=head2 make_config

Create C<swish-e> configuration file for given slice.

  my $config_filename = $i->make_config('slice name');

It returns configuration filename. If no C<swish_config> was defined in
L<"open">, default swish-e configuration will be used. It will index all data for
searching, but none for properties.

If you want to see what is already defined for swish-e in configuration
take a look at source code for C<DEFAULT_SWISH_CONF>.

It uses C<cat> utility to communicate with C<swish-e>. Path is looked up
with C<File::Which>. Do Windows users have to change that to C<COPY /B>
or something similar?

=cut

sub make_config {
        my $self = shift;


        my $index_file = $self->{'index'}."/";
        $index_file .= shift || confess "need slice name";

        my ($tmp_fh, $swish_config_filename) = mkstemp("/tmp/swishXXXXX");

        # find cat on filesystem
        my $cat = which('cat');

        print $tmp_fh <<"DEFAULT_SWISH_CONF";
# swish-e config file

IndexDir cat
#SwishProgParameters -

# input file definition
DefaultContents XML*

# indexed metatags
MetaNames xml swishdocpath


#XMLClassAttributes type
UndefinedMetaTags auto
UndefinedXMLAttributes auto

IndexFile $index_file

# Croatian ISO-8859-2 characters to unaccented equivalents
TranslateCharacters ¹©ðÐèÈæÆ¾® ssddcccczz


# disable output
ParserWarnLevel 0
IndexReport 1

DEFAULT_SWISH_CONF

        # add user parametars (like stored properties)
        print $tmp_fh $self->{'swish_config'} if ($self->{'swish_config'});

        close($tmp_fh);

        return $swish_config_filename;
}

=head2 create_slice

On first run for a slice, starts C<swish-e> using L<IPC::Run> and remembers
its handles in the object. On subsequent calls it just returns the slice name.

  my $s = $i->create_slice('/path/to/document');

You shouldn't need to call C<create_slice> directly because it will be called
from L<"put_slice"> when needed.

=cut

sub create_slice {
        my $self = shift;

        my $path = shift || confess "create_slice need path!";

        my $s = $self->in_slice($path) || confess "in_slice returned null";

        return $s if (exists($self->{'slice'}->{$s}));

        my $swish_config = $self->make_config($s);

        print STDERR "creating slice $s\n";     # FIXME

        my @swish = qw(swish-e -S prog -c);
        push @swish, $swish_config;

        ## Build the harness, open all pipes, and launch the subprocesses
        $self->{'slice'}->{$s}->{'h'} = start \@swish,
                \$self->{'slice'}->{$s}->{'in'},
                \$self->{'slice'}->{$s}->{'out'},
                \$self->{'slice'}->{$s}->{'err'},
                timeout( 90 );  # FIXME

        $self->{'slice'}->{$s}->{'out_len'} = 0;
        $self->{'slice'}->{$s}->{'err_len'} = 0;

        $self->slice_output($s);

        return $s;
}

=head2 put_slice

Pass XML data to swish and receive output and errors.

  my $slice = $i->put_slice('/swish/path', '<xml>data</xml>');

=cut

sub put_slice {
        my $self = shift;

        my $path = shift || confess "need path";
        my $xml = shift || confess "need xml";

        $xml = $iso2utf->convert($xml) || carp "XML conversion error in $xml";

        my $s = $self->create_slice($path) || confess "create_slice returned null";

        confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
        confess "no 'in' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'in'}));
        confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));

        $self->slice_output($s);

        use bytes;      # as opposed to chars
        $self->{'slice'}->{$s}->{'in'} .=
                "Path-Name: $path\n".
                "Content-Length: ".(length($xml)+1)."\n".
                "Document-Type: XML\n\n$xml\n";

        # do I/O
        $self->{'slice'}->{$s}->{'h'}->pump while length $self->{'slice'}->{$s}->{'in'} ;  # wait for all input to go

        $self->slice_output($s);

        return $s;
}

=head2 slice_output

Prints errors from C<swish-e> to STDERR.

  $i->slice_output($s);

Normally, you don't need to call it.

=cut

sub slice_output {
        my $self = shift;

        my $s = shift || confess "slice_output needs slice";

        confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
        confess "no 'in' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'in'}));
        confess "no 'out' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'out'}));

        if (length $self->{'slice'}->{$s}->{'out'} > $self->{'slice'}->{$s}->{'out_len'}) {
                #print STDERR "swish-e OUT: ",$self->{'slice'}->{$s}->{'out'},"\n" if ($self->{'slice'}->{$s}->{'out'});
                $self->{'slice'}->{$s}->{'out_len'} = length $self->{'slice'}->{$s}->{'out'};
                return 1;
        } elsif (length $self->{'slice'}->{$s}->{'err'} > $self->{'slice'}->{$s}->{'err_len'}) {
                print STDERR "swish-e ERR: ",$self->{'slice'}->{$s}->{'err'},"\n" if ($self->{'slice'}->{$s}->{'err'});
                $self->{'slice'}->{$s}->{'err_len'} = length $self->{'slice'}->{$s}->{'err'};
                # this is fatal
                return 0;
        }

        return 1;
}

=head2 close_slice

Close slice (terminates swish-e process for that slice).

  $i->close_slice($s);

Returns true if slice is closed, false otherwise.

=cut

sub close_slice {
        my $self = shift;

        my $s = shift || confess "close_slice needs slice";

        confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
        confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));

        # pump rest of content (if any)
        $self->{'slice'}->{$s}->{'h'}->pump while length $self->{'slice'}->{$s}->{'in'};

        $self->slice_output($s);

        # clean up
        $self->{'slice'}->{$s}->{'h'}->finish or confess "finish on slice $s returned: $?";
        
        delete($self->{'slice'}->{$s}) && return 1;
        return 0;
}

=head2 to_xml

Convert (binary safe, I hope) your data into XML for C<swish-e>.
Data will not yet be recoded to UTF-8. L<"put_slice"> will do that.

  my $xml = $i->to_xml({ foo => 'bar' });

This function is extracted from L<"add"> method so that you can L<Memoize> it.
If your data set has a lot of repeatable data, and memory is not a problem, you
can add C<memoize_to_xml> option to L<"open">.

=cut

my %escape = ('<'=>'&lt;', '>'=>'&gt;', '&'=>'&amp;', '"'=>'&quot;');
my $escape_re  = join '|' => keys %escape;

sub to_xml {
        my $self = shift;

        my $data = shift || return;

        my $xml = qq{<xml>};
        foreach my $tag (keys %$data) {
                my $content = $data->{$tag};
                next if (! $content || $content eq '');
                # save [cr/]lf before conversion to XML
#               $content =~ s/\n\r/##lf##/gs;
#               $content =~ s/\n/##lf##/gs;
                $content =~ s/($escape_re)/$escape{$1}/gs;
                $xml .= "<$tag><![CDATA[".$content."]]></$tag>";
        }
        $xml .= qq{</xml>};
}

1;
__END__


=head2 Searching

Searching is still conducted using L<SWISH::API>, but you have to glob
index names.

    use SWISH::API;

    my $swish = SWISH::API->new( glob('index.swish-e/*') );

You can also alternatively create merged index (using C<merge> option) and
not change your source code at all.

That would also benefit performance, but it increases indexing time
because merged indexes must be re-created on each indexing run.

=head2 EXPORT

Nothing by default.

=head2 EXAMPLES

Test script for this module uses all parts of API. It's also nice example
how to use C<SWISH::Split>.

=head1 SEE ALSO

L<SWISH::API>,
L<http://www.swish-e.org/>

=head1 AUTHOR

Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2004 by Dobrica Pavlinusic

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.4 or,
at your option, any later version of Perl 5 you may have available.


=cut
1	package SWISH::Split;
2
3	use 5.008;
4	use strict;
5	use warnings;
6
7	our $VERSION = '0.00';
8
9	use SWISH::API;
10	use Text::Iconv;
11	use File::Temp qw/ :mktemp /;
12	use Carp;
13	use Digest::MD5 qw(md5_hex);
14	use Memoize;
15	use IPC::Run qw(start timeout pump finish);
16	use File::Which;
17
18	use Data::Dumper;
19
20	=head1 NAME
21
22	SWISH::Split - Perl interface to split index variant of Swish-e
23
24	=head1 SYNOPSIS
25
26	use SWISH::Split;
27
28
29	=head1 DESCRIPTION
30
31	This is alternative interface for indexing data with swish-e. It's designed
32	to split indexes over multiple files (slices) to allow updates of records in index
33	by reindexing just changed parts (slice).
34
35	Data is stored in index using intrface which is somewhat similar to
36	L<Plucene::Simple>. This could make your migration (or supporting two index
37	engines) easier.
38
39	In the background, it will fork swish-e binaries (one for each index slice)
40	and produce UTF-8 encoded XML files for it. So, if your imput charset isn't
41	C<ISO-8859-1> you will have to specify it.
42
43	=head1 Methods used for indexing
44
45	=head2 open
46
47	Create new object for index.
48
49	my $i = SWISH::Split->open({
50	index => '/path/to/index',
51	slice_name => \&slice_on_path,
52	slices => 30,
53	merge => 0,
54	codepage => 'ISO-8859-2',
55	swish_config => qq{
56	PropertyNames from date
57	PropertyNamesDate date
58	},
59	memoize_to_xml => 0,
60	);
61
62	# split index on first component of path
63	sub slice_on_path {
64	return shift split(/\//,$_[0]);
65	}
66
67	Options to open are following:
68
69	=over 5
70
71	=item C<index>
72
73	path to (existing) directory in which index slices will be created.
74
75	=item C<slice_name>
76
77	coderef to function which provide slicing from path.
78
79	=item C<slices>
80
81	maximum number of index slices. See L<"in_slice"> for
82	more explanation.
83
84	=item C<merge>
85
86	(planned) option to merge indexes into one at end.
87
88	=item C<codepage>
89
90	data codepage (needed for conversion to UTF-8).
91	By default, it's C<ISO-8859-1>.
92
93	=item C<swish_config>
94
95	additional parametars which will be inserted into
96	C<swish-e> configuration file. See L<swish-config>.
97
98	=item C<memoize_to_xml>
99
100	speed up repeatable data, see L<"to_xml">.
101
102	=back
103
104	=cut
105
106	my $iso2utf = Text::Iconv->new('ISO-8859-1','UTF-8');
107
108	sub open {
109	my $class = shift;
110	my $self = {@_};
111	bless($self, $class);
112
113	croak "need slice_name coderef" unless ref $self->{'slice_name'};
114	croak "need slices" unless $self->{'slices'};
115
116	croak "need index" unless $self->{'index'};
117	croak "index '",$self->{'index'},"' doesn't exist" unless -e $self->{'index'};
118	croak "index '",$self->{'index'},"' is not directory" unless -d $self->{'index'};
119
120	$iso2utf = Text::Iconv->new($self->{'codepage'},'UTF-8') if ($self->{'codepage'});
121
122	# speedup
123	memoize('in_slice');
124	memoize('to_xml') if ($self->{'memoize_to_xml'});
125
126	$self ? return $self : return undef;
127
128	}
129
130	=head2 add
131
132	Add document to index.
133
134	$i->add($swishpath, {
135	headline => 'foobar result',
136	property => 'data',
137	})
138
139	=cut
140
141	sub add {
142	my $self = shift;
143
144	my $swishpath = shift \|\| return;
145	my $data = shift \|\| return;
146
147	my ($out,$err) = $self->put_slice($swishpath, $self->to_xml($data));
148
149	if ($err) {
150	carp "$swishpath: $err";
151	return 0;
152	}
153
154	return 1;
155	}
156
157	=head2 delete
158
159	Delete documents from index.
160
161	$i->delete(@swishpath);
162
163	=cut
164
165	sub delete {
166	my $self = shift;
167
168	my @paths = @_ \|\| return;
169
170	return 42;
171	}
172
173
174	=head2 finish
175
176	Finish indexing and close index file(s).
177
178	$i->finish;
179
180	This is most time-consuming operation. When it's called, it will re-index
181	all entries which haven't changed in all slices.
182
183	Returns number of slices updated.
184
185	=cut
186
187	sub finish {
188	my $self = shift;
189
190	my $ret = 0;
191
192	foreach my $s (keys %{$self->{'slice'}}) {
193	$ret += $self->close_slice($s);
194	}
195
196	return $ret;
197	}
198
199
200
201	=head1 Reporting methods
202
203	This methods return statistics about your index.
204
205	=head2 swishpaths
206
207	Return array of C<swishpath>s in index.
208
209	my @p = $i->swishpaths;
210
211	=cut
212
213	sub swishpaths {
214	my $self = shift;
215	}
216
217	=head2 swishpaths_updated
218
219	Return array with updated C<swishpath>s.
220
221	my @d = $i->swishpaths_updated;
222
223	=cut
224
225	sub swishpaths_updated {
226	my $self = shift;
227	}
228
229
230	=head2 swishpaths_deleted
231
232	Return array with deleted C<swishpath>s.
233
234	my $n = $i->swishpaths_deleted;
235
236	=cut
237
238	sub swishpaths_deleted {
239	my $self = shift;
240	}
241
242
243	=head2 slices
244
245	Return array with all slice names.
246
247	my @s = $i->slices;
248
249	=cut
250
251	sub slices {
252	my $self = shift;
253	}
254
255	=head1 Helper methods
256
257	This methods are used internally, but they might be useful.
258
259	=head2 in_slice
260
261	Takes path and return slice in which this path belongs.
262
263	my $s = $i->in_slice('path/to/document/in/index');
264
265	If there are C<slices> parametar to L<"open"> it will use
266	MD5 hash to spread documents across slices. That will produce random
267	distribution of your documents in slices, which might or might not be best
268	for your data. If you have to re-index large number of slices on each
269	run, think about creating your own C<slice> function and distributing
270	documents manually across slices.
271
272	Slice number must always be true value or various sanity checks will fail.
273
274	This function is C<Memoize>ed for performance reasons.
275
276	=cut
277
278	sub in_slice {
279	my $self = shift;
280
281	my $path = shift \|\| confess "need path";
282
283	confess "need slice_name function" unless ref ($self->{'slice_name'});
284
285	if ($self->{'slices'}) {
286	# first, pass path through slice_name function
287	my $slice = &{$self->{'slice_name'}}($path);
288	# then calculate MD5 hash
289	$slice = md5_hex($slice);
290	# take first 8 chars to produce number
291	# FIXME how random is this?
292	$slice = hex(substr($slice,0,8));
293
294	$slice = ($slice % $self->{'slices'}) + 1;
295	print "hash: $slice / ",$self->{'slices'}," => $slice\n";
296	return $slice;
297	} else {
298	return &{$self->{'split'}}($path);
299	}
300	}
301
302	=head2 find_paths
303
304	Return array of C<swishpath>s for given C<swish-e> query.
305
306	my @p = $i->find_paths("headline=test*");
307
308	Useful for combining with L<"delete_documents"> to delete documents
309	which hasn't changed a while (so, expired).
310
311	=cut
312
313	sub find_paths {
314	my $self = shift;
315
316	my $s = shift \|\| return;
317	}
318
319
320	=head2 make_config
321
322	Create C<swish-e> configuration file for given slice.
323
324	my $config_filename = $i->make_config('slice name');
325
326	It returns configuration filename. If no C<swish_config> was defined in
327	L<"open">, default swish-e configuration will be used. It will index all data for
328	searching, but none for properties.
329
330	If you want to see what is allready defined for swish-e in configuration
331	take a look at source code for C<DEFAULT_SWISH_CONF>.
332
333	It uses C<cat> utility to comunicate with C<swish-e>. Path is provided
334	by C<File::Which>. Do Windows users have to change that to C<COPY /B>
335	or something similar?
336
337	=cut
338
339	sub make_config {
340	my $self = shift;
341
342
343	my $index_file = $self->{'index'}."/";
344	$index_file .= shift \|\| confess "need slice name";
345
346	my ($tmp_fh, $swish_config_filename) = mkstemp("/tmp/swishXXXXX");
347
348	# find cat on filesystem
349	my $cat = which('cat');
350
351	print $tmp_fh <<"DEFAULT_SWISH_CONF";
352	# swish-e config file
353
354	IndexDir cat
355	#SwishProgParameters -
356
357	# input file definition
358	DefaultContents XML*
359
360	# indexed metatags
361	MetaNames xml swishdocpath
362
363
364	#XMLClassAttributes type
365	UndefinedMetaTags auto
366	UndefinedXMLAttributes auto
367
368	IndexFile $index_file
369
370	# Croatian ISO-8859-2 characters to unaccented equivalents
371	TranslateCharacters ¹©ðÐèÈæÆ¾® ssddcccczz
372
373
374	# disable output
375	ParserWarnLevel 0
376	IndexReport 1
377
378	DEFAULT_SWISH_CONF
379
380	# add user parametars (like stored properties)
381	print $tmp_fh $self->{'swish_config'} if ($self->{'swish_config'});
382
383	close($tmp_fh);
384
385	return $swish_config_filename;
386	}
387
388	=head2 create_slice
389
390	On first run, starts C<swish-e> using L<IPC::Run>. On subsequent calls just return
391	it's handles using L<Memoize>.
392
393	my $s = create_slice('/path/to/document');
394
395	You shouldn't need to call C<create_slice> directly because it will be called
396	from L<"put_slice"> when needed.
397
398	=cut
399
400	sub create_slice {
401	my $self = shift;
402
403	my $path = shift \|\| confess "create_slice need path!";
404
405	my $s = $self->in_slice($path) \|\| confess "in_slice returned null";
406
407	return $s if (exists($self->{'slice'}->{$s}));
408
409	my $swish_config = $self->make_config($s);
410
411	print STDERR "creating slice $s\n"; # FIXME
412
413	my @swish = qw(swish-e -S prog -c);
414	push @swish, $swish_config;
415
416	## Build the harness, open all pipes, and launch the subprocesses
417	$self->{'slice'}->{$s}->{'h'} = start \@swish,
418	\$self->{'slice'}->{$s}->{'in'},
419	\$self->{'slice'}->{$s}->{'out'},
420	\$self->{'slice'}->{$s}->{'err'},
421	timeout( 90 ); # FIXME
422
423	$self->{'slice'}->{$s}->{'out_len'} = 0;
424	$self->{'slice'}->{$s}->{'err_len'} = 0;
425
426	$self->slice_output($s);
427
428	return $s;
429	}
430
431	=head2 put_slice
432
433	Pass XML data to swish and receive output and errors.
434
435	my ($out,$err) = $i->put_slice('/swish/path', '<xml>data</xml>');
436
437	=cut
438
439	sub put_slice {
440	my $self = shift;
441
442	my $path = shift \|\| confess "need path";
443	my $xml = shift \|\| confess "need xml";
444
445	$xml = $iso2utf->convert($xml) \|\| carp "XML conversion error in $xml";
446
447	my $s = $self->create_slice($path) \|\| confess "create_slice returned null";
448
449	confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
450	confess "no 'in' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'in'}));
451	confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));
452
453	$self->slice_output($s);
454
455	use bytes; # as opposed to chars
456	$self->{'slice'}->{$s}->{'in'} .=
457	"Path-Name: $path\n".
458	"Content-Length: ".(length($xml)+1)."\n".
459	"Document-Type: XML\n\n$xml\n";
460
461	# do I/O
462	$self->{'slice'}->{$s}->{'h'}->pump while length $self->{'slice'}->{$s}->{'in'} ; # wait for all input to go
463
464	$self->slice_output($s);
465
466	return $s;
467	}
468
469	=head2 slice_output
470
471	Prints to STDERR output and errors from C<swish-e>.
472
473	$i->slice_output($s);
474
475	Normally, you don't need to call it.
476
477	=cut
478
479	sub slice_output {
480	my $self = shift;
481
482	my $s = shift \|\| confess "slice_output needs slice";
483
484	confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
485	confess "no 'in' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'in'}));
486	confess "no 'out' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'out'}));
487
488	if (length $self->{'slice'}->{$s}->{'out'} > $self->{'slice'}->{$s}->{'out_len'}) {
489	#print STDERR "swish-e OUT: ",$self->{'slice'}->{$s}->{'out'},"\n" if ($self->{'slice'}->{$s}->{'out'});
490	$self->{'slice'}->{$s}->{'out_len'} = length $self->{'slice'}->{$s}->{'out'};
491	return 1;
492	} elsif (length $self->{'slice'}->{$s}->{'err'} > $self->{'slice'}->{$s}->{'err_len'}) {
493	print STDERR "swish-e ERR: ",$self->{'slice'}->{$s}->{'err'},"\n" if ($self->{'slice'}->{$s}->{'err'});
494	$self->{'slice'}->{$s}->{'err_len'} = length $self->{'slice'}->{$s}->{'err'};
495	# this is fatal
496	return 0;
497	}
498
499	return 1;
500	}
501
502	=head2 close_slice {
503
504	Close slice (terminates swish-e process for that slice).
505
506	my $i->close_slice($s);
507
508	Returns true if slice is closed, false otherwise.
509
510	=cut
511
512	sub close_slice {
513	my $self = shift;
514
515	my $s = shift \|\| confess "close_slice needs slice";
516
517	confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
518	confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));
519
520	# pump rest of content (if any)
521	$self->{'slice'}->{$s}->{'h'}->pump while length $self->{'slice'}->{$s}->{'in'};
522
523	$self->slice_output($s);
524
525	# clean up
526	$self->{'slice'}->{$s}->{'h'}->finish or confess "finish on slice $s returned: $?";
527
528	delete($self->{'slice'}->{$s}) && return 1;
529	return 0;
530	}
531
532	=head2 to_xml
533
534	Convert (binary safe, I hope) your data into XML for C<swish-e>.
535	Data will not yet be recoded to UTF-8. L<"put_slice"> will do that.
536
537	my $xml = $i->to_xml({ foo => 'bar' });
538
539	This function is extracted from L<"add"> method so that you can L<Memoize> it.
540	If your data set has a lot of repeatable data, and memory is not a problem, you
541	can add C<memoize_to_xml> option to L<"open">.
542
543	=cut
544
545	my %escape = ('<'=>'<', '>'=>'>', '&'=>'&', '"'=>'"');
546	my $escape_re = join '\|' => keys %escape;
547
548	sub to_xml {
549	my $self = shift;
550
551	my $data = shift \|\| return;
552
553	my $xml = qq{<xml>};
554	foreach my $tag (keys %$data) {
555	my $content = $data->{$tag};
556	next if (! $content \|\| $content eq '');
557	# save [cr/]lf before conversion to XML
558	# $content =~ s/\n\r/##lf##/gs;
559	# $content =~ s/\n/##lf##/gs;
560	$content =~ s/($escape_re)/$escape{$1}/gs;
561	$xml .= "<$tag><![CDATA[".$content."]]></$tag>";
562	}
563	$xml .= qq{</xml>};
564	}
565
566	1;
567	__END__
568
569
570	=head2 Searching
571
572	Searching is still conducted using L<SWISH::API>, but you have to glob
573	index names.
574
575	use SWISH::API;
576
577	my $swish = SWISH::API->new( glob('index.swish-e/*') );
578
579	You can also alternativly create merged index (using C<merge> option) and
580	not change your source code at all.
581
582	That would also benefit performance, but it increases indexing time
583	because merged indexes must be re-created on each indexing run.
584
585	=head2 EXPORT
586
587	Nothing by default.
588
589	=head2 EXAMPLES
590
591	Test script for this module uses all parts of API. It's also nice example
592	how to use C<SWISH::Split>.
593
594	=head1 SEE ALSO
595
596	L<SWISH::API>,
597	L<http://www.swish-e.org/>
598
599	=head1 AUTHOR
600
601	Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>
602
603	=head1 COPYRIGHT AND LICENSE
604
605	Copyright (C) 2004 by Dobrica Pavlinusic
606
607	This library is free software; you can redistribute it and/or modify
608	it under the same terms as Perl itself, either Perl version 5.8.4 or,
609	at your option, any later version of Perl 5 you may have available.
610
611
612	=cut