SWISH-Split/trunk/Split.pm

package SWISH::Split;

use 5.008;
use strict;
use warnings;

our $VERSION = '0.01';

use SWISH::API;
use Text::Iconv;
use File::Temp qw/ :mktemp /;
use Carp;
use Digest::MD5 qw(md5_hex);
use Memoize;
use IPC::Run qw(start timeout pump finish);
use File::Which;

use Data::Dumper;

# Markers stored in $self->{'paths'}, keyed by swishpath:
# put_slice() records ADDED for a path it has passed to swish-e,
# delete() records DELETED for a path scheduled for removal.
use constant {
        ADDED => 1,
        DELETED => 2,
};

=head1 NAME

SWISH::Split - Perl interface to split index variant of Swish-e

=head1 SYNOPSIS

  use SWISH::Split;


=head1 DESCRIPTION

This is an alternative interface for indexing data with swish-e. It's designed
to split indexes over multiple files (slices) to allow updates of records in index
by reindexing just the changed parts (slices).

Data is stored in index using an interface which is somewhat similar to
L<Plucene::Simple>. This could make your migration (or supporting two index
engines) easier.

In the background, it will fork swish-e binaries (one for each index slice)
and produce UTF-8 encoded XML files for it. So, if your input charset isn't
C<ISO-8859-1> you will have to specify it.

=head1 Methods used for indexing

=head2 open

Create new object for index.

  my $i = SWISH::Split->open(
        index => '/path/to/index',
        slice_name => \&slice_on_path,
        slices => 30,
        merge => 0,
        codepage => 'ISO-8859-2',
        swish_config => qq{
                PropertyNames from date
                PropertyNamesDate date
        },
        memoize_to_xml => 0,
  );

  # split index on first component of path
  sub slice_on_path {
        return (split(/\//,$_[0]))[0];
  }

Options to open are following:

=over 5

=item C<index>

path to (existing) directory in which index slices will be created.

=item C<slice_name>

coderef to function which provides the slice name for a given path.

=item C<slices>

maximum number of index slices. See L<"in_slice"> for
more explanation.

=item C<merge>

(planned) option to merge indexes into one at end.

=item C<codepage>

data codepage (needed for conversion to UTF-8).
By default, it's C<ISO-8859-1>.

=item C<swish_config>

additional parameters which will be inserted into
C<swish-e> configuration file. See L<swish-config>.

=item C<memoize_to_xml>

speed up repeatable data, see L<"to_xml">.

=back

=cut

# Converter used by put_slice() to recode XML to UTF-8. It is a file-scoped
# lexical (shared by every object created from this module); open() replaces
# it when a 'codepage' option is supplied.
my $iso2utf = Text::Iconv->new('ISO-8859-1','UTF-8');

# Constructor. Validates the options, installs the codepage converter and
# enables memoization.
#
# Arguments: either a flat key/value list or a single hash reference holding
#   index, slice_name, slices and optionally codepage, swish_config,
#   memoize_to_xml.
# Returns: the new blessed object.
# Croaks when slice_name, slices or index is missing, or when index is not an
#   existing directory.
sub open {
        my $class = shift;

        # Accept open(key => value, ...) as well as open({ key => value, ... }).
        # The hash reference is copied so the caller's hash is never blessed.
        my $self = (@_ == 1 && ref($_[0]) eq 'HASH') ? { %{ $_[0] } } : { @_ };
        bless($self, $class);

        croak "need slice_name coderef" unless ref $self->{'slice_name'};
        croak "need slices" unless $self->{'slices'};

        croak "need index" unless $self->{'index'};
        croak "index '",$self->{'index'},"' doesn't exist" unless -e $self->{'index'};
        croak "index '",$self->{'index'},"' is not directory" unless -d $self->{'index'};

        # NOTE: the converter is shared module-wide, so the codepage of the
        # most recently opened object is the one put_slice() will use.
        $iso2utf = Text::Iconv->new($self->{'codepage'},'UTF-8') if ($self->{'codepage'});

        # speedup -- memoize() replaces the named sub with a caching wrapper,
        # so do it only once per process instead of stacking a new wrapper
        # on every open() call
        our %_memoized;
        memoize('in_slice') unless $_memoized{'in_slice'}++;
        if ($self->{'memoize_to_xml'}) {
                memoize('to_xml') unless $_memoized{'to_xml'}++;
        }

        return $self;
}

=head2 add

Add document to index.

  $i->add($swishpath, {
        headline => 'foobar result',
        property => 'data',
  })

=cut

# Convert one document to XML and hand it to the slice responsible for it.
#
# Arguments: $swishpath (document path in the index), $data (hash reference
#   of tag => content).
# Returns: whatever put_slice() returns (the slice the document went to);
#   returns nothing when either argument is missing or false.
sub add {
        my ($self, $swishpath, $data) = @_;

        # both arguments are mandatory; bail out quietly when one is absent
        return unless $swishpath;
        return unless $data;

        # error reporting for a failed put is not implemented here
        my $slice = $self->put_slice($swishpath, $self->to_xml($data));

        return $slice;
}

=head2 delete

Delete documents from index.

  $i->delete(@swishpath);

=cut

# Mark the given swishpaths as deleted in $self->{'paths'}.
#
# Arguments: list of swishpaths.
# Returns: 42 when at least one path was given, nothing otherwise.
sub delete {
        my $self = shift;

        # Copy the list before testing it: "@_ || return" would evaluate @_
        # in scalar context and store the argument COUNT instead of the paths.
        my @paths = @_;
        return unless @paths;

        foreach my $path (@paths) {
                $self->{'paths'}->{$path} = DELETED;
        }

        return 42;
}


=head2 done

Finish indexing and close index file(s).

  $i->done;

This is most time-consuming operation. When it's called, it will re-index
all entries which haven't changed in all slices.

Returns number of slices updated.

This method should really be called close or finish, but both of those are
already used.

=cut

# Close every slice currently registered in $self->{'slice'}.
#
# Returns: the sum of the close_slice() results, i.e. the number of slices
#   that were closed.
sub done {
        my $self = shift;

        my $closed = 0;

        for my $name (keys %{ $self->{'slice'} }) {
                print STDERR "closing slice $name\n";
                my $ok = $self->close_slice($name);
                $closed += $ok;
        }

        return $closed;
}


=head1 Reporting methods

These methods return statistics about your index.

=head2 swishpaths

Return array of C<swishpath>s in index.

  my @p = $i->swishpaths;

=cut

# Return the keys stored in the record of slice $s.
#
# Arguments: $s - slice name (required; a false value returns nothing).
# Returns: list of keys of $self->{'slice'}->{$s}, or nothing when that
#   slice is not registered.
#
# NOTE(review): the slice record built by create_slice() holds the process
# handle and I/O buffers, not swishpaths, so this probably does not yet return
# what the POD promises -- confirm the intended data source.
sub swishpaths {
        my $self = shift;

        my $s = shift || return;

        # look the slice up by the name passed in (not the literal key 's')
        # and avoid autovivifying $self->{'slice'} while doing so
        my $slices = $self->{'slice'} || return;
        return unless exists $slices->{$s};

        return keys %{ $slices->{$s} };
}

=head2 swishpaths_updated

Return array with updated C<swishpath>s.

  my @d = $i->swishpaths_updated;

=cut

# Return the swishpaths that put_slice() has marked ADDED in $self->{'paths'}.
#
# Returns: list of paths (their count in scalar context); empty when nothing
#   has been added. A path that was later passed to delete() is marked DELETED
#   and is therefore not reported here.
sub swishpaths_updated {
        my $self = shift;

        my $paths = $self->{'paths'} || {};

        return grep { ($paths->{$_} || 0) == ADDED } keys %$paths;
}


=head2 swishpaths_deleted

Return array with deleted C<swishpath>s.

  my $n = $i->swishpaths_deleted;

=cut

# Return the swishpaths that delete() has marked DELETED in $self->{'paths'}.
#
# Returns: list of paths (their count in scalar context); empty when nothing
#   has been deleted.
sub swishpaths_deleted {
        my $self = shift;

        my $paths = $self->{'paths'} || {};

        return grep { ($paths->{$_} || 0) == DELETED } keys %$paths;
}


=head2 slices

Return array with all slice names.

  my @s = $i->slices;

=cut

# Return the names of the slices currently registered in $self->{'slice'},
# i.e. those started by create_slice() and not yet removed by close_slice().
#
# Returns: sorted list of slice names (their count in scalar context).
sub slices {
        my $self = shift;

        # sort into an array first: sort() itself has no defined result in
        # scalar context
        my @names = sort keys %{ $self->{'slice'} || {} };

        return @names;
}

=head1 Helper methods

These methods are used internally, but they might be useful.

=head2 in_slice

Takes path and return slice in which this path belongs.

  my $s = $i->in_slice('path/to/document/in/index');

If the C<slices> parameter was given to L<"open">, it will use an
MD5 hash to spread documents across slices. That will produce random
distribution of your documents in slices, which might or might not be best
for your data. If you have to re-index a large number of slices on each
run, think about creating your own C<slice_name> function and distributing
documents manually across slices.

Slice number must always be true value or various sanity checks will fail.

This function is C<Memoize>ed for performance reasons.

=cut

# Map a path to the slice it belongs to.
#
# Arguments: $path - swishpath (required).
# Returns: when $self->{'slices'} is set, a number between 1 and that limit,
#   derived from the first 8 hex digits of the MD5 of the slice_name result;
#   otherwise the slice_name result itself.
# Confesses when the path or the slice_name coderef is missing.
sub in_slice {
        my $self = shift;

        my $path = shift || confess "need path";

        confess "need slice_name function" unless ref ($self->{'slice_name'});

        # first, pass path through slice_name function
        my $name = $self->{'slice_name'}->($path);

        # without a slice limit the name is used as-is (the only slicing
        # coderef this module validates is 'slice_name')
        return $name unless $self->{'slices'};

        # take first 8 chars of the MD5 hex digest to produce a number
        # FIXME how random is this?
        my $hash = hex(substr(md5_hex($name), 0, 8));

        # fold into 1..slices; the result must be a true value
        my $slice = ($hash % $self->{'slices'}) + 1;

        # diagnostics go to STDERR and only on request, so the caller's
        # STDOUT is never polluted
        print STDERR "hash: $hash / ",$self->{'slices'}," => $slice\n" if ($self->{'debug'});

        return $slice;
}

=head2 find_paths

Return array of C<swishpath>s for given C<swish-e> query.

  my @p = $i->find_paths("headline=test*");

Useful for combining with L<"delete"> to delete documents
which haven't changed for a while (so, expired).

=cut

sub find_paths {
        # Placeholder: the query described in the POD is not implemented yet.
        # The only statement takes the invocant off @_; no search is run and
        # the query argument is ignored.
        my $self = shift;

}


=head2 make_config

Create C<swish-e> configuration file for given slice.

  my $config_filename = $i->make_config('slice name');

It returns configuration filename. If no C<swish_config> was defined in
L<"open">, default swish-e configuration will be used. It will index all data for
searching, but none for properties.

If you want to see what is already defined for swish-e in configuration
take a look at source code for C<DEFAULT_SWISH_CONF>.

It uses C<stdin> as C<IndexDir> to communicate with C<swish-e>.

=cut

# Write a swish-e configuration file for one slice.
#
# Arguments: slice name (required); it is appended to $self->{'index'} to
#   form the IndexFile path.
# Returns: name of the temporary file holding the configuration.
# Confesses when the slice name is missing or the file cannot be written.
#
# NOTE(review): the file is created with mkstemp and is not removed here;
# no cleanup is visible in this module -- confirm who is expected to unlink it.
sub make_config {
        my $self = shift;

        my $index_file = $self->{'index'}."/";
        $index_file .= shift || confess "need slice name";

        my ($tmp_fh, $swish_config_filename) = mkstemp("/tmp/swishXXXXX");

        my $config = <<"DEFAULT_SWISH_CONF";
# swish-e config file

IndexDir stdin

# input file definition
DefaultContents XML*

# indexed metatags
MetaNames xml swishdocpath


#XMLClassAttributes type
UndefinedMetaTags auto
UndefinedXMLAttributes auto

IndexFile $index_file

# Croatian ISO-8859-2 characters to unaccented equivalents
TranslateCharacters ¹©ðÐèÈæÆ¾® ssddcccczz


# disable output
ParserWarnLevel 0
IndexReport 1

DEFAULT_SWISH_CONF

        # add user parameters (like stored properties)
        $config .= $self->{'swish_config'} if ($self->{'swish_config'});

        # a truncated configuration would silently change how the slice is
        # indexed, so write and close errors are fatal
        print $tmp_fh $config
                or confess "can't write $swish_config_filename: $!";
        close($tmp_fh)
                or confess "can't close $swish_config_filename: $!";

        return $swish_config_filename;
}

=head2 create_slice

On first run for a slice, starts C<swish-e> using L<IPC::Run>. On subsequent
calls it just returns the name of the already running slice, whose handles are
kept in the object.

  my $s = $i->create_slice('/path/to/document');

You shouldn't need to call C<create_slice> directly because it will be called
from L<"put_slice"> when needed.

=cut

# Make sure a swish-e process is running for the slice that owns $path.
#
# Arguments: $path - swishpath (required).
# Returns: the slice name. An already registered slice is returned as-is;
#   otherwise a configuration is written, swish-e is started through
#   IPC::Run and the slice record (h, in, out, err, out_len, err_len) is
#   stored in $self->{'slice'}.
# Confesses when the path is missing or in_slice() returns a false value;
#   exceptions raised while starting swish-e propagate to the caller.
sub create_slice {
        my $self = shift;

        my $path = shift || confess "create_slice need path!";

        my $s = $self->in_slice($path) || confess "in_slice returned null";

        return $s if (exists($self->{'slice'}->{$s}));

        my $swish_config = $self->make_config($s);

        print STDERR "creating slice $s\n";     # FIXME

        my @swish = qw(swish-e -u -S prog -c);
        push @swish, $swish_config;

        # Build the record locally with defined, empty buffers and register
        # it only after the process has started. A failed start then leaves
        # no half-built slice behind for later calls to mistake for a
        # running one.
        my %slice = (
                'in'      => '',
                'out'     => '',
                'err'     => '',
                'out_len' => 0,
                'err_len' => 0,
        );

        ## Build the harness, open all pipes, and launch the subprocesses
        $slice{'h'} = start \@swish,
                \$slice{'in'},
                \$slice{'out'},
                \$slice{'err'},
                timeout( 90 );  # FIXME

        $self->{'slice'}->{$s} = \%slice;

        $self->slice_output($s);

        return $s;
}

=head2 put_slice

Pass XML data to swish.

  my $slice = $i->put_slice('/swish/path', '<xml>data</xml>');

Returns slice in which XML ended up.

=cut

# Recode one XML document to UTF-8 and feed it to the swish-e process of the
# slice that owns $path.
#
# Arguments: $path - swishpath, $xml - document as produced by to_xml().
# Returns: the slice the document was written to; nothing (after a warning)
#   when the charset conversion fails, in which case the document is skipped.
# Confesses when an argument is missing or the slice record is incomplete.
sub put_slice {
        my $self = shift;

        my $path = shift || confess "need path";
        my $xml = shift || confess "need xml";

        # Convert into a separate variable: assigning "convert(...) || carp"
        # back to $xml would replace the document with carp's return value
        # and index that instead.
        my $utf8 = $iso2utf->convert($xml);
        unless (defined $utf8 && length $utf8) {
                carp "XML conversion error in $xml";
                return;
        }

        my $s = $self->create_slice($path) || confess "create_slice returned null";

        confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
        confess "no 'in' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'in'}));
        confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));

        $self->slice_output($s);

        # Content-Length must count bytes, not characters; +1 is for the
        # newline appended after the document.
        use bytes;      # as opposed to chars
        $self->{'slice'}->{$s}->{'in'} .=
                "Path-Name: $path\n".
                "Content-Length: ".(length($utf8)+1)."\n".
                "Update-Mode: Index\n".
                "Document-Type: XML\n\n$utf8\n";

        # do I/O
        $self->{'slice'}->{$s}->{'h'}->pump while length $self->{'slice'}->{$s}->{'in'} ;  # wait for all input to go

        $self->slice_output($s);

        $self->{'paths'}->{$path} = ADDED;

        return $s;
}

=head2 slice_output

Prints to STDERR output and errors from C<swish-e>.

  my $slice = $i->slice_output($s);

Normally, you don't need to call it.

=cut

# Check the output buffers of a slice and report new swish-e errors on STDERR.
#
# Arguments: $s - slice name (required).
# Returns: undef when new stderr content arrived without any new stdout
#   content; the slice name in every other case. This matches the return
#   values of the previous implementation.
# Side effects: updates out_len/err_len in the slice record and prints the
#   newly arrived part of the stderr buffer.
# Confesses when the slice or its 'in'/'out' buffers are not registered.
sub slice_output {
        my $self = shift;

        my $s = shift || confess "slice_output needs slice";

        confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
        confess "no 'in' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'in'}));
        confess "no 'out' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'out'}));

        my $slice = $self->{'slice'}->{$s};

        my $out_seen = $slice->{'out_len'} || 0;
        my $err_seen = $slice->{'err_len'} || 0;
        my $out_now = defined($slice->{'out'}) ? length($slice->{'out'}) : 0;
        my $err_now = defined($slice->{'err'}) ? length($slice->{'err'}) : 0;

        my $out_grew = ($out_now > $out_seen);
        my $err_grew = ($err_now > $err_seen);

        $slice->{'out_len'} = $out_now if ($out_grew);

        # Report stderr independently of stdout: an error that arrives
        # together with regular output must not be hidden. Only the part
        # not yet reported is printed.
        if ($err_grew) {
                print STDERR "swish-e ERR: ",substr($slice->{'err'}, $err_seen),"\n";
                $slice->{'err_len'} = $err_now;
        }

        # this is fatal
        return undef if ($err_grew && ! $out_grew);

        return $s;
}

=head2 close_slice

Close slice (terminates swish-e process for that slice).

  $i->close_slice($s);

Returns true if slice is closed, false otherwise.

=cut

# Flush pending input to a slice, finish its swish-e harness and forget it.
#
# Arguments: $s - slice name (required).
# Returns: 1 when the slice record was removed, 0 otherwise.
# Confesses when the slice or its harness is missing, or when finish()
#   returns a false value (the collected stderr is included in the message).
sub close_slice {
        my $self = shift;

        my $s = shift || confess "close_slice needs slice";

        confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
        confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));

        my $slice = $self->{'slice'}->{$s};

        # pump rest of content (if any)
        $slice->{'h'}->pump while length $slice->{'in'};

        $self->slice_output($s);

        # clean up
        $slice->{'h'}->finish or confess "finish on slice $s returned $?: $! -- ",$slice->{'err'};

        # Output collected while the harness was finishing would otherwise be
        # dropped on success, so check the buffers once more.
        $self->slice_output($s);

        delete($self->{'slice'}->{$s}) && return 1;
        return 0;
}

=head2 to_xml

Convert (binary safe, I hope) your data into XML for C<swish-e>.
Data will not yet be recoded to UTF-8. L<"put_slice"> will do that.

  my $xml = $i->to_xml({ foo => 'bar' });

This function is extracted from L<"add"> method so that you can L<Memoize> it.
If your data set has a lot of repeatable data, and memory is not a problem, you
can add C<memoize_to_xml> option to L<"open">.

=cut

# Characters replaced by entities before the content is embedded.
my %escape = ('<'=>'&lt;', '>'=>'&gt;', '&'=>'&amp;', '"'=>'&quot;');
my $escape_re  = join '|' => keys %escape;

# Serialise a hash of tag => content into the XML document fed to swish-e.
#
# Arguments: $data - hash reference (a false value returns nothing).
# Returns: string "<xml>...</xml>" with one element per tag, in sorted tag
#   order so the same data always yields the same document. Tags whose
#   content is undefined or the empty string are skipped; content "0" is kept.
#
# NOTE(review): content is entity-escaped AND wrapped in CDATA, so an XML
# parser will see the literal entity text (e.g. "&amp;") -- confirm that
# this is what consumers of the index expect. Tag names are used verbatim.
sub to_xml {
        my $self = shift;

        my $data = shift || return;

        my $xml = qq{<xml>};
        foreach my $tag (sort keys %$data) {
                my $content = $data->{$tag};
                # test definedness and emptiness explicitly: a plain truth
                # test would also drop the legitimate value "0"
                next unless (defined $content && $content ne '');
                # save [cr/]lf before conversion to XML
#               $content =~ s/\n\r/##lf##/gs;
#               $content =~ s/\n/##lf##/gs;
                $content =~ s/($escape_re)/$escape{$1}/gs;
                $xml .= "<$tag><![CDATA[".$content."]]></$tag>";
        }
        $xml .= qq{</xml>};

        return $xml;
}

1;
__END__


=head1 Searching

Searching is still conducted using L<SWISH::API>, but you have to glob
index names.

    use SWISH::API;

    my $swish = SWISH::API->new( glob('index.swish-e/*') );

You can also alternatively create a merged index (using C<merge> option) and
not change your source code at all.

That would also benefit performance, but it increases indexing time
because merged indexes must be re-created on each indexing run.

=head1 EXPORT

Nothing by default.

=head1 EXAMPLES

Test script for this module uses all parts of API. It's also nice example
how to use C<SWISH::Split>.

=head1 SEE ALSO

L<SWISH::API>,
L<http://www.swish-e.org/>

=head1 AUTHOR

Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2004 by Dobrica Pavlinusic

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.4 or,
at your option, any later version of Perl 5 you may have available.


=cut
1	package SWISH::Split;
2
3	use 5.008;
4	use strict;
5	use warnings;
6
7	our $VERSION = '0.01';
8
9	use SWISH::API;
10	use Text::Iconv;
11	use File::Temp qw/ :mktemp /;
12	use Carp;
13	use Digest::MD5 qw(md5_hex);
14	use Memoize;
15	use IPC::Run qw(start timeout pump finish);
16	use File::Which;
17
18	use Data::Dumper;
19
20	use constant {
21	ADDED => 1,
22	DELETED => 2,
23	};
24
25	=head1 NAME
26
27	SWISH::Split - Perl interface to split index variant of Swish-e
28
29	=head1 SYNOPSIS
30
31	use SWISH::Split;
32
33
34	=head1 DESCRIPTION
35
36	This is alternative interface for indexing data with swish-e. It's designed
37	to split indexes over multiple files (slices) to allow updates of records in index
38	by reindexing just changed parts (slice).
39
40	Data is stored in index using intrface which is somewhat similar to
41	L<Plucene::Simple>. This could make your migration (or supporting two index
42	engines) easier.
43
44	In the background, it will fork swish-e binaries (one for each index slice)
45	and produce UTF-8 encoded XML files for it. So, if your input charset isn't
46	C<ISO-8859-1> you will have to specify it.
47
48	=head1 Methods used for indexing
49
50	=head2 open
51
52	Create new object for index.
53
54	my $i = SWISH::Split->open({
55	index => '/path/to/index',
56	slice_name => \&slice_on_path,
57	slices => 30,
58	merge => 0,
59	codepage => 'ISO-8859-2',
60	swish_config => qq{
61	PropertyNames from date
62	PropertyNamesDate date
63	},
64	memoize_to_xml => 0,
65	);
66
67	# split index on first component of path
68	sub slice_on_path {
69	return shift split(/\//,$_[0]);
70	}
71
72	Options to open are following:
73
74	=over 5
75
76	=item C<index>
77
78	path to (existing) directory in which index slices will be created.
79
80	=item C<slice_name>
81
82	coderef to function which provide slicing from path.
83
84	=item C<slices>
85
86	maximum number of index slices. See L<"in_slice"> for
87	more explanation.
88
89	=item C<merge>
90
91	(planned) option to merge indexes into one at end.
92
93	=item C<codepage>
94
95	data codepage (needed for conversion to UTF-8).
96	By default, it's C<ISO-8859-1>.
97
98	=item C<swish_config>
99
100	additional parametars which will be inserted into
101	C<swish-e> configuration file. See L<swish-config>.
102
103	=item C<memoize_to_xml>
104
105	speed up repeatable data, see L<"to_xml">.
106
107	=back
108
109	=cut
110
111	my $iso2utf = Text::Iconv->new('ISO-8859-1','UTF-8');
112
113	sub open {
114	my $class = shift;
115	my $self = {@_};
116	bless($self, $class);
117
118	croak "need slice_name coderef" unless ref $self->{'slice_name'};
119	croak "need slices" unless $self->{'slices'};
120
121	croak "need index" unless $self->{'index'};
122	croak "index '",$self->{'index'},"' doesn't exist" unless -e $self->{'index'};
123	croak "index '",$self->{'index'},"' is not directory" unless -d $self->{'index'};
124
125	$iso2utf = Text::Iconv->new($self->{'codepage'},'UTF-8') if ($self->{'codepage'});
126
127	# speedup
128	memoize('in_slice');
129	memoize('to_xml') if ($self->{'memoize_to_xml'});
130
131	$self ? return $self : return undef;
132
133	}
134
135	=head2 add
136
137	Add document to index.
138
139	$i->add($swishpath, {
140	headline => 'foobar result',
141	property => 'data',
142	})
143
144	=cut
145
146	sub add {
147	my $self = shift;
148
149	my $swishpath = shift \|\| return;
150	my $data = shift \|\| return;
151
152	my $slice = $self->put_slice($swishpath, $self->to_xml($data));
153
154	# if ($err) {
155	# carp "$swishpath: $err";
156	# return undef;
157	# }
158
159	return $slice;
160	}
161
162	=head2 delete
163
164	Delete documents from index.
165
166	$i->delete(@swishpath);
167
168	=cut
169
170	sub delete {
171	my $self = shift;
172
173	my @paths = @_ \|\| return;
174
175	foreach my $path (@paths) {
176	$self->{'paths'}->{$path} = DELETED;
177	}
178
179	return 42;
180	}
181
182
183	=head2 done
184
185	Finish indexing and close index file(s).
186
187	$i->done;
188
189	This is most time-consuming operation. When it's called, it will re-index
190	all entries which haven't changed in all slices.
191
192	Returns number of slices updated.
193
194	This method should really be called close or finish, but both of those are
195	allready used.
196
197	=cut
198
199	sub done {
200	my $self = shift;
201
202	my $ret = 0;
203
204	foreach my $s (keys %{$self->{'slice'}}) {
205	print STDERR "closing slice $s\n";
206	$ret += $self->close_slice($s);
207	}
208
209	return $ret;
210	}
211
212
213
214	=head1 Reporting methods
215
216	This methods return statistics about your index.
217
218	=head2 swishpaths
219
220	Return array of C<swishpath>s in index.
221
222	my @p = $i->swishpaths;
223
224	=cut
225
226	sub swishpaths {
227	my $self = shift;
228
229	my $s = shift \|\| return;
230	return if (! exists($self->{'slice'}->{'s'}));
231
232	return keys %{$self->{'slice'}->{'s'}};
233	}
234
235	=head2 swishpaths_updated
236
237	Return array with updated C<swishpath>s.
238
239	my @d = $i->swishpaths_updated;
240
241	=cut
242
243	sub swishpaths_updated {
244	my $self = shift;
245	}
246
247
248	=head2 swishpaths_deleted
249
250	Return array with deleted C<swishpath>s.
251
252	my $n = $i->swishpaths_deleted;
253
254	=cut
255
256	sub swishpaths_deleted {
257	my $self = shift;
258	}
259
260
261	=head2 slices
262
263	Return array with all slice names.
264
265	my @s = $i->slices;
266
267	=cut
268
269	sub slices {
270	my $self = shift;
271	}
272
273	=head1 Helper methods
274
275	This methods are used internally, but they might be useful.
276
277	=head2 in_slice
278
279	Takes path and return slice in which this path belongs.
280
281	my $s = $i->in_slice('path/to/document/in/index');
282
283	If there are C<slices> parametar to L<"open"> it will use
284	MD5 hash to spread documents across slices. That will produce random
285	distribution of your documents in slices, which might or might not be best
286	for your data. If you have to re-index large number of slices on each
287	run, think about creating your own C<slice> function and distributing
288	documents manually across slices.
289
290	Slice number must always be true value or various sanity checks will fail.
291
292	This function is C<Memoize>ed for performance reasons.
293
294	=cut
295
296	sub in_slice {
297	my $self = shift;
298
299	my $path = shift \|\| confess "need path";
300
301	confess "need slice_name function" unless ref ($self->{'slice_name'});
302
303	if ($self->{'slices'}) {
304	# first, pass path through slice_name function
305	my $slice = &{$self->{'slice_name'}}($path);
306	# then calculate MD5 hash
307	$slice = md5_hex($slice);
308	# take first 8 chars to produce number
309	# FIXME how random is this?
310	$slice = hex(substr($slice,0,8));
311
312	$slice = ($slice % $self->{'slices'}) + 1;
313	print "hash: $slice / ",$self->{'slices'}," => $slice\n";
314	return $slice;
315	} else {
316	return &{$self->{'split'}}($path);
317	}
318	}
319
320	=head2 find_paths
321
322	Return array of C<swishpath>s for given C<swish-e> query.
323
324	my @p = $i->find_paths("headline=test*");
325
326	Useful for combining with L<"delete_documents"> to delete documents
327	which hasn't changed a while (so, expired).
328
329	=cut
330
331	sub find_paths {
332	my $self = shift;
333
334	}
335
336
337	=head2 make_config
338
339	Create C<swish-e> configuration file for given slice.
340
341	my $config_filename = $i->make_config('slice name');
342
343	It returns configuration filename. If no C<swish_config> was defined in
344	L<"open">, default swish-e configuration will be used. It will index all data for
345	searching, but none for properties.
346
347	If you want to see what is allready defined for swish-e in configuration
348	take a look at source code for C<DEFAULT_SWISH_CONF>.
349
350	It uses C<stdin> as C<IndexDir> to comunicate with C<swish-e>.
351
352	=cut
353
354	sub make_config {
355	my $self = shift;
356
357
358	my $index_file = $self->{'index'}."/";
359	$index_file .= shift \|\| confess "need slice name";
360
361	my ($tmp_fh, $swish_config_filename) = mkstemp("/tmp/swishXXXXX");
362
363	# find cat on filesystem
364	my $cat = which('cat');
365
366	print $tmp_fh <<"DEFAULT_SWISH_CONF";
367	# swish-e config file
368
369	IndexDir stdin
370
371	# input file definition
372	DefaultContents XML*
373
374	# indexed metatags
375	MetaNames xml swishdocpath
376
377
378	#XMLClassAttributes type
379	UndefinedMetaTags auto
380	UndefinedXMLAttributes auto
381
382	IndexFile $index_file
383
384	# Croatian ISO-8859-2 characters to unaccented equivalents
385	TranslateCharacters ¹©ðÐèÈæÆ¾® ssddcccczz
386
387
388	# disable output
389	ParserWarnLevel 0
390	IndexReport 1
391
392	DEFAULT_SWISH_CONF
393
394	# add user parametars (like stored properties)
395	print $tmp_fh $self->{'swish_config'} if ($self->{'swish_config'});
396
397	close($tmp_fh);
398
399	return $swish_config_filename;
400	}
401
402	=head2 create_slice
403
404	On first run, starts C<swish-e> using L<IPC::Run>. On subsequent calls just return
405	it's handles using L<Memoize>.
406
407	my $s = create_slice('/path/to/document');
408
409	You shouldn't need to call C<create_slice> directly because it will be called
410	from L<"put_slice"> when needed.
411
412	=cut
413
414	sub create_slice {
415	my $self = shift;
416
417	my $path = shift \|\| confess "create_slice need path!";
418
419	my $s = $self->in_slice($path) \|\| confess "in_slice returned null";
420
421	return $s if (exists($self->{'slice'}->{$s}));
422
423	my $swish_config = $self->make_config($s);
424
425	print STDERR "creating slice $s\n"; # FIXME
426
427	my @swish = qw(swish-e -u -S prog -c);
428	push @swish, $swish_config;
429
430	## Build the harness, open all pipes, and launch the subprocesses
431	$self->{'slice'}->{$s}->{'h'} = start \@swish,
432	\$self->{'slice'}->{$s}->{'in'},
433	\$self->{'slice'}->{$s}->{'out'},
434	\$self->{'slice'}->{$s}->{'err'},
435	timeout( 90 ); # FIXME
436
437	$self->{'slice'}->{$s}->{'out_len'} = 0;
438	$self->{'slice'}->{$s}->{'err_len'} = 0;
439
440	$self->slice_output($s);
441
442	return $s;
443	}
444
445	=head2 put_slice
446
447	Pass XML data to swish.
448
449	my $slice = $i->put_slice('/swish/path', '<xml>data</xml>');
450
451	Returns slice in which XML ended up.
452
453	=cut
454
455	sub put_slice {
456	my $self = shift;
457
458	my $path = shift \|\| confess "need path";
459	my $xml = shift \|\| confess "need xml";
460
461	$xml = $iso2utf->convert($xml) \|\| carp "XML conversion error in $xml";
462
463	my $s = $self->create_slice($path) \|\| confess "create_slice returned null";
464
465	confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
466	confess "no 'in' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'in'}));
467	confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));
468
469	$self->slice_output($s);
470
471	use bytes; # as opposed to chars
472	$self->{'slice'}->{$s}->{'in'} .=
473	"Path-Name: $path\n".
474	"Content-Length: ".(length($xml)+1)."\n".
475	"Update-Mode: Index\n".
476	"Document-Type: XML\n\n$xml\n";
477
478	# do I/O
479	$self->{'slice'}->{$s}->{'h'}->pump while length $self->{'slice'}->{$s}->{'in'} ; # wait for all input to go
480
481	$self->slice_output($s);
482
483	$self->{'paths'}->{$path} = ADDED;
484
485	return $s;
486	}
487
488	=head2 slice_output
489
490	Prints to STDERR output and errors from C<swish-e>.
491
492	my $slice = $i->slice_output($s);
493
494	Normally, you don't need to call it.
495
496	=cut
497
498	sub slice_output {
499	my $self = shift;
500
501	my $s = shift \|\| confess "slice_output needs slice";
502
503	confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
504	confess "no 'in' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'in'}));
505	confess "no 'out' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'out'}));
506
507	if (length $self->{'slice'}->{$s}->{'out'} > $self->{'slice'}->{$s}->{'out_len'}) {
508	#print STDERR "swish-e OUT: ",$self->{'slice'}->{$s}->{'out'},"\n" if ($self->{'slice'}->{$s}->{'out'});
509	$self->{'slice'}->{$s}->{'out_len'} = length $self->{'slice'}->{$s}->{'out'};
510	return $s;
511	} elsif (length $self->{'slice'}->{$s}->{'err'} > $self->{'slice'}->{$s}->{'err_len'}) {
512	print STDERR "swish-e ERR: ",$self->{'slice'}->{$s}->{'err'},"\n" if ($self->{'slice'}->{$s}->{'err'});
513	$self->{'slice'}->{$s}->{'err_len'} = length $self->{'slice'}->{$s}->{'err'};
514	# this is fatal
515	return undef;
516	}
517
518	return $s;
519	}
520
521	=head2 close_slice
522
523	Close slice (terminates swish-e process for that slice).
524
525	my $i->close_slice($s);
526
527	Returns true if slice is closed, false otherwise.
528
529	=cut
530
531	sub close_slice {
532	my $self = shift;
533
534	my $s = shift \|\| confess "close_slice needs slice";
535
536	confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
537	confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));
538
539	# pump rest of content (if any)
540	$self->{'slice'}->{$s}->{'h'}->pump while length $self->{'slice'}->{$s}->{'in'};
541
542	$self->slice_output($s);
543
544	# clean up
545	$self->{'slice'}->{$s}->{'h'}->finish or confess "finish on slice $s returned $?: $! -- ",$self->{'slice'}->{$s}->{'err'};
546
547	delete($self->{'slice'}->{$s}) && return 1;
548	return 0;
549	}
550
551	=head2 to_xml
552
553	Convert (binary safe, I hope) your data into XML for C<swish-e>.
554	Data will not yet be recoded to UTF-8. L<"put_slice"> will do that.
555
556	my $xml = $i->to_xml({ foo => 'bar' });
557
558	This function is extracted from L<"add"> method so that you can L<Memoize> it.
559	If your data set has a lot of repeatable data, and memory is not a problem, you
560	can add C<memoize_to_xml> option to L<"open">.
561
562	=cut
563
564	my %escape = ('<'=>'<', '>'=>'>', '&'=>'&', '"'=>'"');
565	my $escape_re = join '\|' => keys %escape;
566
567	sub to_xml {
568	my $self = shift;
569
570	my $data = shift \|\| return;
571
572	my $xml = qq{<xml>};
573	foreach my $tag (keys %$data) {
574	my $content = $data->{$tag};
575	next if (! $content \|\| $content eq '');
576	# save [cr/]lf before conversion to XML
577	# $content =~ s/\n\r/##lf##/gs;
578	# $content =~ s/\n/##lf##/gs;
579	$content =~ s/($escape_re)/$escape{$1}/gs;
580	$xml .= "<$tag><![CDATA[".$content."]]></$tag>";
581	}
582	$xml .= qq{</xml>};
583	}
584
585	1;
586	__END__
587
588
589	=head1 Searching
590
591	Searching is still conducted using L<SWISH::API>, but you have to glob
592	index names.
593
594	use SWISH::API;
595
596	my $swish = SWISH::API->new( glob('index.swish-e/*') );
597
598	You can also alternativly create merged index (using C<merge> option) and
599	not change your source code at all.
600
601	That would also benefit performance, but it increases indexing time
602	because merged indexes must be re-created on each indexing run.
603
604	=head1 EXPORT
605
606	Nothing by default.
607
608	=head1 EXAMPLES
609
610	Test script for this module uses all parts of API. It's also nice example
611	how to use C<SWISH::Split>.
612
613	=head1 SEE ALSO
614
615	L<SWISH::API>,
616	L<http://www.swish-e.org/>
617
618	=head1 AUTHOR
619
620	Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>
621
622	=head1 COPYRIGHT AND LICENSE
623
624	Copyright (C) 2004 by Dobrica Pavlinusic
625
626	This library is free software; you can redistribute it and/or modify
627	it under the same terms as Perl itself, either Perl version 5.8.4 or,
628	at your option, any later version of Perl 5 you may have available.
629
630
631	=cut