SWISH-Split/trunk/Split.pm

package SWISH::Split;

use 5.008;
use strict;
use warnings;

our $VERSION = '0.03';

use SWISH::API;
use Text::Iconv;
use File::Temp qw/ :mktemp /;
use Carp;
use Digest::MD5 qw(md5_hex);
use Memoize;
use File::Which;

use Data::Dumper;

use constant {
        ADDED => 1,
        DELETED => 2,
};

=head1 NAME

SWISH::Split - Perl interface to split index variant of Swish-e

=head1 SYNOPSIS

  use SWISH::Split;


=head1 DESCRIPTION

This is alternative interface for indexing data with swish-e. It's designed
to split indexes over multiple files (slices) to allow updates of records in index
by reindexing just changed parts (slice).

Data is stored in index using intrface which is somewhat similar to
L<Plucene::Simple>. This could make your migration (or supporting two index
engines) easier.

In the background, it will fork swish-e binaries (one for each index slice)
and produce UTF-8 encoded XML files for it. So, if your input charset isn't
C<ISO-8859-1> you will have to specify it.

=head1 Methods used for indexing

=head2 open_index

Create new object for index.

  my $i = SWISH::Split->open_index({
        index => '/path/to/index',
        slice_name => \&slice_on_path,
        slices => 30,
        merge => 0,
        codepage => 'ISO-8859-2',
        swish_config => qq{
                PropertyNames from date
                PropertyNamesDate date
        },
        memoize_to_xml => 0,
  );

  # split index on first component of path
  sub slice_on_path {
        return shift split(/\//,$_[0]);
  }

Options to C<open_index> are following:

=over 5

=item C<index>

path to (existing) directory in which index slices will be created.

=item C<slice_name>

coderef to function which provide slicing from path.

=item C<slices>

maximum number of index slices. See L<"in_slice"> for
more explanation.

=item C<merge>

(planned) option to merge indexes into one at end.

=item C<codepage>

data codepage (needed for conversion to UTF-8).
By default, it's C<ISO-8859-1>.

=item C<swish_config>

additional parametars which will be inserted into
C<swish-e> configuration file. See C<swish-config>.

=item C<memoize_to_xml>

speed up repeatable data, see L<"to_xml">.

=back

=cut

my $iso2utf = Text::Iconv->new('ISO-8859-1','UTF-8');

sub open_index {
        my $class = shift;
        my $self = {@_};
        bless($self, $class);

        croak "need slice_name coderef" unless ref $self->{'slice_name'};
        croak "need slices" unless $self->{'slices'};

        croak "need index" unless $self->{'index'};
        croak "index '",$self->{'index'},"' doesn't exist" unless -e $self->{'index'};
        croak "index '",$self->{'index'},"' is not directory" unless -d $self->{'index'};

        $iso2utf = Text::Iconv->new($self->{'codepage'},'UTF-8') if ($self->{'codepage'});

        # speedup
        memoize('in_slice');
        memoize('to_xml') if ($self->{'memoize_to_xml'});

        $self ? return $self : return undef;

}

=head2 add

Add document to index.

  $i->add($swishpath, {
        headline => 'foobar result',
        property => 'data',
  })

=cut

sub add {
        my $self = shift;

        my $swishpath = shift || return;
        my $data = shift || return;

        my $slice = $self->put_slice($swishpath, $self->to_xml($data));

        return $slice;
}

=head2 delete

Delete documents from index.

  $i->delete(@swishpath);

B<This function is not implemented.>

=cut

sub delete {
        my $self = shift;

        my @paths = @_ || return;

        foreach my $path (@paths) {
                $self->{'paths'}->{$path} = DELETED; 
        }

        die "delete is not yet implemented";

        return 42;
}


=head2 done

Finish indexing and close index file(s).

  $i->done;

This is most time-consuming operation. When it's called, it will re-index
all entries which haven't changed in all slices.

Returns number of slices updated.

This method should really be called close or finish, but both of those are
allready used.

=cut

sub done {
        my $self = shift;

        my $ret = 0;

        foreach my $s (keys %{$self->{'slice'}}) {
                $self->_debug("closing slice $s");
                $ret += $self->close_slice($s);
        }

        return $ret;
}


=head1 Reporting methods

This methods return statistics about your index.

=head2 swishpaths

Return array of C<swishpath>s in index.

  my @p = $i->swishpaths;

=cut

sub swishpaths {
        my $self = shift;

        my $s = shift || return;
        return if (! exists($self->{'slice'}->{'s'}));

        return keys %{$self->{'slice'}->{'s'}};
}

=head2 swishpaths_updated

Return array with updated C<swishpath>s.

  my @d = $i->swishpaths_updated;

=cut

sub swishpaths_updated {
        my $self = shift;
}


=head2 swishpaths_deleted

Return array with deleted C<swishpath>s.

  my $n = $i->swishpaths_deleted;

=cut

sub swishpaths_deleted {
        my $self = shift;
}


=head2 slices

Return array with all slice names.

  my @s = $i->slices;

=cut

sub slices {
        my $self = shift;
}

=head1 Helper methods

This methods are used internally, but they might be useful.

=head2 in_slice

Takes path and return slice in which this path belongs.

  my $s = $i->in_slice('path/to/document/in/index');

If there are C<slices> parametar to L<"open_index"> it will use
MD5 hash to spread documents across slices. That will produce random
distribution of your documents in slices, which might or might not be best
for your data. If you have to re-index large number of slices on each
run, think about creating your own C<slice> function and distributing
documents manually across slices.

Slice number must always be true value or various sanity checks will fail.

This function is C<Memoize>ed for performance reasons.

=cut

sub in_slice {
        my $self = shift;

        my $path = shift || confess "need path";

        confess "need slice_name function" unless ref ($self->{'slice_name'});

        if ($self->{'slices'}) {
                # first, pass path through slice_name function
                my $slice = &{$self->{'slice_name'}}($path);
                # then calculate MD5 hash
                my $hash = md5_hex($slice);
                # take first 8 chars to produce number
                # FIXME how random is this?
                $hash = hex(substr($hash,0,8));
                
                $slice = ($hash % $self->{'slices'}) + 1;
                $self->_debug("hash: $hash / ",$self->{'slices'}," => $slice");
                return $slice;
        } else {
                return &{$self->{'split'}}($path);
        }
}

=head2 find_paths

Return array of C<swishpath>s for given C<swish-e> query.

  my @p = $i->find_paths("headline=test*");

Useful for combining with L<"delete_documents"> to delete documents
which hasn't changed a while (so, expired).

=cut

sub find_paths {
        my $self = shift;

}


=head2 make_config

Create C<swish-e> configuration file for given slice.

  my $config_filename = $i->make_config('slice name');

It returns configuration filename. If no C<swish_config> was defined in
L<"open_index">, default swish-e configuration will be used. It will index all data for
searching, but none for properties.

If you want to see what is allready defined for swish-e in configuration
take a look at source code for C<DEFAULT_SWISH_CONF>.

It uses C<stdin> as C<IndexDir> to comunicate with C<swish-e>.

=cut

sub make_config {
        my $self = shift;


        my $index_file = $self->{'index'}."/";
        $index_file .= shift || confess "need slice name";

        my ($tmp_fh, $swish_config_filename) = mkstemp("/tmp/swishXXXXX");

        # find cat on filesystem
        my $cat = which('cat');

        print $tmp_fh <<"DEFAULT_SWISH_CONF";
# swish-e config file

IndexDir stdin

# input file definition
DefaultContents XML*

# indexed metatags
MetaNames xml swishdocpath


#XMLClassAttributes type
UndefinedMetaTags auto
UndefinedXMLAttributes auto

IndexFile $index_file

# Croatian ISO-8859-2 characters to unaccented equivalents
TranslateCharacters ¹©ðÐèÈæÆ¾® ssddcccczz


# disable output
ParserWarnLevel 0
IndexReport 1

DEFAULT_SWISH_CONF

        # add user parametars (like stored properties)
        print $tmp_fh $self->{'swish_config'} if ($self->{'swish_config'});

        close($tmp_fh);

        return $swish_config_filename;
}

=head2 create_slice

On first run, starts C<swish-e>. On subsequent calls just return
it's handles using C<Memoize>.

  my $s = create_slice('/path/to/document');

You shouldn't need to call C<create_slice> directly because it will be called
from L<"put_slice"> when needed.

=cut

sub create_slice {
        my $self = shift;

        my $path = shift || confess "create_slice need path!";

        my $s = $self->in_slice($path) || confess "in_slice returned null";

        return $s if (exists($self->{'slice'}->{$s}));

        my $swish_config = $self->make_config($s);

        my $swish = qq{| swish-e };
        if (-f $self->{'index'}.'/'.$s) {
                $swish .= qq{ -u };
                $self->{'slice'}->{$s}->{'update_mode'}++;
        }
        $swish .= qq{ -S prog -c } . $swish_config;

        $self->_debug("creating slice $s using $swish");

        ## Build the harness, open all pipes, and launch the subprocesses
        open(my $fh, $swish) || croak "can't open $swish: $!";

        $self->{'slice'}->{$s}->{'h'} = $fh;

        $self->slice_output($s);

        return $s;
}

=head2 put_slice

Pass XML data to swish.

  my $slice = $i->put_slice('/swish/path', '<xml>data</xml>');

Returns slice in which XML ended up.

=cut

sub put_slice {
        my $self = shift;

        my $path = shift || confess "need path";
        my $xml = shift || confess "need xml";

        $xml = $iso2utf->convert($xml) || carp "XML conversion error in $xml";

        my $s = $self->create_slice($path) || confess "create_slice returned null";

        confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
        confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));

        $self->slice_output($s);

        use bytes;      # as opposed to chars
        my $fh = $self->{'slice'}->{$s}->{'h'} || confess "handle for slice $s undefined";

        my $update_header = "Update-Mode: Index\n";
        $update_header = '' unless ($self->{'slice'}->{$s}->{'update_mode'});

        print { $fh } "Path-Name: $path\n".
                "Content-Length: ".(length($xml)+1)."\n" . $update_header . 
                "Document-Type: XML\n\n$xml\n";

        $self->slice_output($s);

        $self->_debug("dumping in slice $s: $path");

        $self->{'paths'}->{$path} = ADDED; 

        return $s;
}

=head2 slice_output

Prints to STDERR output and errors from C<swish-e>.

  my $slice = $i->slice_output($s);

Normally, you don't need to call it.

B<This is dummy placeholder function for very old code that assumes this
module is using C<IPC::Run> which it isn't any more.>

=cut

sub slice_output {
        my $self = shift;

        my $s = shift || confess "slice_output needs slice";

        confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));

        # FIXME

        return $s;
}

=head2 close_slice

Close slice (terminates swish-e process for that slice).

  my $i->close_slice($s);

Returns true if slice is closed, false otherwise.

=cut

sub close_slice {
        my $self = shift;

        my $s = shift || confess "close_slice needs slice";

        confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
        confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));

        # pump rest of content (if any)
        close $self->{'slice'}->{$s}->{'h'} || carp "can't close slice $s: $!";

        $self->slice_output($s);

        undef $self->{'slice'}->{$s}->{'h'};
        
        delete($self->{'slice'}->{$s}) && return 1;
        return 0;
}

=head2 to_xml

Convert (binary safe, I hope) your data into XML for C<swish-e>.
Data will not yet be recoded to UTF-8. L<"put_slice"> will do that.

  my $xml = $i->to_xml({ foo => 'bar' });

This function is extracted from L<"add"> method so that you can C<Memoize> it.
If your data set has a lot of repeatable data, and memory is not a problem, you
can add C<memoize_to_xml> option to L<"open_index">.

=cut

my %escape = ('<'=>'&lt;', '>'=>'&gt;', '&'=>'&amp;', '"'=>'&quot;');
my $escape_re  = join '|' => keys %escape;

sub to_xml {
        my $self = shift;

        my $data = shift || return;

        my $xml = qq{<xml>};
        foreach my $tag (keys %$data) {
                my $content = $data->{$tag};
                next if (! $content || $content eq '');
                # save [cr/]lf before conversion to XML
#               $content =~ s/\n\r/##lf##/gs;
#               $content =~ s/\n/##lf##/gs;
                $content =~ s/($escape_re)/$escape{$1}/gs;
                $xml .= "<$tag><![CDATA[".$content."]]></$tag>";
        }
        $xml .= qq{</xml>};
}

sub _debug {
        my $self = shift;
        print STDERR "## ",@_,"\n" if ($self->{'debug'});
        return;
}

1;
__END__


=head1 Searching

Searching is still conducted using L<SWISH::API>, but you have to glob
index names.

    use SWISH::API;

    my $swish = SWISH::API->new( glob('index.swish-e/*') );

You can also alternativly create merged index (using C<merge> option) and
not change your source code at all.

That would also benefit performance, but it increases indexing time
because merged indexes must be re-created on each indexing run.

=head1 EXPORT

Nothing by default.

=head1 EXAMPLES

Test script for this module uses all parts of API. It's also nice example
how to use C<SWISH::Split>.

=head1 SEE ALSO

L<SWISH::API>,
L<http://www.swish-e.org/>

=head1 AUTHOR

Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2004 by Dobrica Pavlinusic

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.4 or,
at your option, any later version of Perl 5 you may have available.


=cut
1	package SWISH::Split;
2
3	use 5.008;
4	use strict;
5	use warnings;
6
7	our $VERSION = '0.03';
8
9	use SWISH::API;
10	use Text::Iconv;
11	use File::Temp qw/ :mktemp /;
12	use Carp;
13	use Digest::MD5 qw(md5_hex);
14	use Memoize;
15	use File::Which;
16
17	use Data::Dumper;
18
19	use constant {
20	ADDED => 1,
21	DELETED => 2,
22	};
23
24	=head1 NAME
25
26	SWISH::Split - Perl interface to split index variant of Swish-e
27
28	=head1 SYNOPSIS
29
30	use SWISH::Split;
31
32
33	=head1 DESCRIPTION
34
35	This is alternative interface for indexing data with swish-e. It's designed
36	to split indexes over multiple files (slices) to allow updates of records in index
37	by reindexing just changed parts (slice).
38
39	Data is stored in index using intrface which is somewhat similar to
40	L<Plucene::Simple>. This could make your migration (or supporting two index
41	engines) easier.
42
43	In the background, it will fork swish-e binaries (one for each index slice)
44	and produce UTF-8 encoded XML files for it. So, if your input charset isn't
45	C<ISO-8859-1> you will have to specify it.
46
47	=head1 Methods used for indexing
48
49	=head2 open_index
50
51	Create new object for index.
52
53	my $i = SWISH::Split->open_index({
54	index => '/path/to/index',
55	slice_name => \&slice_on_path,
56	slices => 30,
57	merge => 0,
58	codepage => 'ISO-8859-2',
59	swish_config => qq{
60	PropertyNames from date
61	PropertyNamesDate date
62	},
63	memoize_to_xml => 0,
64	);
65
66	# split index on first component of path
67	sub slice_on_path {
68	return shift split(/\//,$_[0]);
69	}
70
71	Options to C<open_index> are following:
72
73	=over 5
74
75	=item C<index>
76
77	path to (existing) directory in which index slices will be created.
78
79	=item C<slice_name>
80
81	coderef to function which provide slicing from path.
82
83	=item C<slices>
84
85	maximum number of index slices. See L<"in_slice"> for
86	more explanation.
87
88	=item C<merge>
89
90	(planned) option to merge indexes into one at end.
91
92	=item C<codepage>
93
94	data codepage (needed for conversion to UTF-8).
95	By default, it's C<ISO-8859-1>.
96
97	=item C<swish_config>
98
99	additional parametars which will be inserted into
100	C<swish-e> configuration file. See C<swish-config>.
101
102	=item C<memoize_to_xml>
103
104	speed up repeatable data, see L<"to_xml">.
105
106	=back
107
108	=cut
109
110	my $iso2utf = Text::Iconv->new('ISO-8859-1','UTF-8');
111
112	sub open_index {
113	my $class = shift;
114	my $self = {@_};
115	bless($self, $class);
116
117	croak "need slice_name coderef" unless ref $self->{'slice_name'};
118	croak "need slices" unless $self->{'slices'};
119
120	croak "need index" unless $self->{'index'};
121	croak "index '",$self->{'index'},"' doesn't exist" unless -e $self->{'index'};
122	croak "index '",$self->{'index'},"' is not directory" unless -d $self->{'index'};
123
124	$iso2utf = Text::Iconv->new($self->{'codepage'},'UTF-8') if ($self->{'codepage'});
125
126	# speedup
127	memoize('in_slice');
128	memoize('to_xml') if ($self->{'memoize_to_xml'});
129
130	$self ? return $self : return undef;
131
132	}
133
134	=head2 add
135
136	Add document to index.
137
138	$i->add($swishpath, {
139	headline => 'foobar result',
140	property => 'data',
141	})
142
143	=cut
144
145	sub add {
146	my $self = shift;
147
148	my $swishpath = shift \|\| return;
149	my $data = shift \|\| return;
150
151	my $slice = $self->put_slice($swishpath, $self->to_xml($data));
152
153	return $slice;
154	}
155
156	=head2 delete
157
158	Delete documents from index.
159
160	$i->delete(@swishpath);
161
162	B<This function is not implemented.>
163
164	=cut
165
166	sub delete {
167	my $self = shift;
168
169	my @paths = @_ \|\| return;
170
171	foreach my $path (@paths) {
172	$self->{'paths'}->{$path} = DELETED;
173	}
174
175	die "delete is not yet implemented";
176
177	return 42;
178	}
179
180
181	=head2 done
182
183	Finish indexing and close index file(s).
184
185	$i->done;
186
187	This is most time-consuming operation. When it's called, it will re-index
188	all entries which haven't changed in all slices.
189
190	Returns number of slices updated.
191
192	This method should really be called close or finish, but both of those are
193	allready used.
194
195	=cut
196
197	sub done {
198	my $self = shift;
199
200	my $ret = 0;
201
202	foreach my $s (keys %{$self->{'slice'}}) {
203	$self->_debug("closing slice $s");
204	$ret += $self->close_slice($s);
205	}
206
207	return $ret;
208	}
209
210
211
212	=head1 Reporting methods
213
214	This methods return statistics about your index.
215
216	=head2 swishpaths
217
218	Return array of C<swishpath>s in index.
219
220	my @p = $i->swishpaths;
221
222	=cut
223
224	sub swishpaths {
225	my $self = shift;
226
227	my $s = shift \|\| return;
228	return if (! exists($self->{'slice'}->{'s'}));
229
230	return keys %{$self->{'slice'}->{'s'}};
231	}
232
233	=head2 swishpaths_updated
234
235	Return array with updated C<swishpath>s.
236
237	my @d = $i->swishpaths_updated;
238
239	=cut
240
241	sub swishpaths_updated {
242	my $self = shift;
243	}
244
245
246	=head2 swishpaths_deleted
247
248	Return array with deleted C<swishpath>s.
249
250	my $n = $i->swishpaths_deleted;
251
252	=cut
253
254	sub swishpaths_deleted {
255	my $self = shift;
256	}
257
258
259	=head2 slices
260
261	Return array with all slice names.
262
263	my @s = $i->slices;
264
265	=cut
266
267	sub slices {
268	my $self = shift;
269	}
270
271	=head1 Helper methods
272
273	This methods are used internally, but they might be useful.
274
275	=head2 in_slice
276
277	Takes path and return slice in which this path belongs.
278
279	my $s = $i->in_slice('path/to/document/in/index');
280
281	If there are C<slices> parametar to L<"open_index"> it will use
282	MD5 hash to spread documents across slices. That will produce random
283	distribution of your documents in slices, which might or might not be best
284	for your data. If you have to re-index large number of slices on each
285	run, think about creating your own C<slice> function and distributing
286	documents manually across slices.
287
288	Slice number must always be true value or various sanity checks will fail.
289
290	This function is C<Memoize>ed for performance reasons.
291
292	=cut
293
294	sub in_slice {
295	my $self = shift;
296
297	my $path = shift \|\| confess "need path";
298
299	confess "need slice_name function" unless ref ($self->{'slice_name'});
300
301	if ($self->{'slices'}) {
302	# first, pass path through slice_name function
303	my $slice = &{$self->{'slice_name'}}($path);
304	# then calculate MD5 hash
305	my $hash = md5_hex($slice);
306	# take first 8 chars to produce number
307	# FIXME how random is this?
308	$hash = hex(substr($hash,0,8));
309
310	$slice = ($hash % $self->{'slices'}) + 1;
311	$self->_debug("hash: $hash / ",$self->{'slices'}," => $slice");
312	return $slice;
313	} else {
314	return &{$self->{'split'}}($path);
315	}
316	}
317
318	=head2 find_paths
319
320	Return array of C<swishpath>s for given C<swish-e> query.
321
322	my @p = $i->find_paths("headline=test*");
323
324	Useful for combining with L<"delete_documents"> to delete documents
325	which hasn't changed a while (so, expired).
326
327	=cut
328
329	sub find_paths {
330	my $self = shift;
331
332	}
333
334
335	=head2 make_config
336
337	Create C<swish-e> configuration file for given slice.
338
339	my $config_filename = $i->make_config('slice name');
340
341	It returns configuration filename. If no C<swish_config> was defined in
342	L<"open_index">, default swish-e configuration will be used. It will index all data for
343	searching, but none for properties.
344
345	If you want to see what is allready defined for swish-e in configuration
346	take a look at source code for C<DEFAULT_SWISH_CONF>.
347
348	It uses C<stdin> as C<IndexDir> to comunicate with C<swish-e>.
349
350	=cut
351
352	sub make_config {
353	my $self = shift;
354
355
356	my $index_file = $self->{'index'}."/";
357	$index_file .= shift \|\| confess "need slice name";
358
359	my ($tmp_fh, $swish_config_filename) = mkstemp("/tmp/swishXXXXX");
360
361	# find cat on filesystem
362	my $cat = which('cat');
363
364	print $tmp_fh <<"DEFAULT_SWISH_CONF";
365	# swish-e config file
366
367	IndexDir stdin
368
369	# input file definition
370	DefaultContents XML*
371
372	# indexed metatags
373	MetaNames xml swishdocpath
374
375
376	#XMLClassAttributes type
377	UndefinedMetaTags auto
378	UndefinedXMLAttributes auto
379
380	IndexFile $index_file
381
382	# Croatian ISO-8859-2 characters to unaccented equivalents
383	TranslateCharacters ¹©ðÐèÈæÆ¾® ssddcccczz
384
385
386	# disable output
387	ParserWarnLevel 0
388	IndexReport 1
389
390	DEFAULT_SWISH_CONF
391
392	# add user parametars (like stored properties)
393	print $tmp_fh $self->{'swish_config'} if ($self->{'swish_config'});
394
395	close($tmp_fh);
396
397	return $swish_config_filename;
398	}
399
400	=head2 create_slice
401
402	On first run, starts C<swish-e>. On subsequent calls just return
403	it's handles using C<Memoize>.
404
405	my $s = create_slice('/path/to/document');
406
407	You shouldn't need to call C<create_slice> directly because it will be called
408	from L<"put_slice"> when needed.
409
410	=cut
411
412	sub create_slice {
413	my $self = shift;
414
415	my $path = shift \|\| confess "create_slice need path!";
416
417	my $s = $self->in_slice($path) \|\| confess "in_slice returned null";
418
419	return $s if (exists($self->{'slice'}->{$s}));
420
421	my $swish_config = $self->make_config($s);
422
423	my $swish = qq{\| swish-e };
424	if (-f $self->{'index'}.'/'.$s) {
425	$swish .= qq{ -u };
426	$self->{'slice'}->{$s}->{'update_mode'}++;
427	}
428	$swish .= qq{ -S prog -c } . $swish_config;
429
430	$self->_debug("creating slice $s using $swish");
431
432	## Build the harness, open all pipes, and launch the subprocesses
433	open(my $fh, $swish) \|\| croak "can't open $swish: $!";
434
435	$self->{'slice'}->{$s}->{'h'} = $fh;
436
437	$self->slice_output($s);
438
439	return $s;
440	}
441
442	=head2 put_slice
443
444	Pass XML data to swish.
445
446	my $slice = $i->put_slice('/swish/path', '<xml>data</xml>');
447
448	Returns slice in which XML ended up.
449
450	=cut
451
452	sub put_slice {
453	my $self = shift;
454
455	my $path = shift \|\| confess "need path";
456	my $xml = shift \|\| confess "need xml";
457
458	$xml = $iso2utf->convert($xml) \|\| carp "XML conversion error in $xml";
459
460	my $s = $self->create_slice($path) \|\| confess "create_slice returned null";
461
462	confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
463	confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));
464
465	$self->slice_output($s);
466
467	use bytes; # as opposed to chars
468	my $fh = $self->{'slice'}->{$s}->{'h'} \|\| confess "handle for slice $s undefined";
469
470	my $update_header = "Update-Mode: Index\n";
471	$update_header = '' unless ($self->{'slice'}->{$s}->{'update_mode'});
472
473	print { $fh } "Path-Name: $path\n".
474	"Content-Length: ".(length($xml)+1)."\n" . $update_header .
475	"Document-Type: XML\n\n$xml\n";
476
477	$self->slice_output($s);
478
479	$self->_debug("dumping in slice $s: $path");
480
481	$self->{'paths'}->{$path} = ADDED;
482
483	return $s;
484	}
485
486	=head2 slice_output
487
488	Prints to STDERR output and errors from C<swish-e>.
489
490	my $slice = $i->slice_output($s);
491
492	Normally, you don't need to call it.
493
494	B<This is dummy placeholder function for very old code that assumes this
495	module is using C<IPC::Run> which it isn't any more.>
496
497	=cut
498
499	sub slice_output {
500	my $self = shift;
501
502	my $s = shift \|\| confess "slice_output needs slice";
503
504	confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
505
506	# FIXME
507
508	return $s;
509	}
510
511	=head2 close_slice
512
513	Close slice (terminates swish-e process for that slice).
514
515	my $i->close_slice($s);
516
517	Returns true if slice is closed, false otherwise.
518
519	=cut
520
521	sub close_slice {
522	my $self = shift;
523
524	my $s = shift \|\| confess "close_slice needs slice";
525
526	confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
527	confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));
528
529	# pump rest of content (if any)
530	close $self->{'slice'}->{$s}->{'h'} \|\| carp "can't close slice $s: $!";
531
532	$self->slice_output($s);
533
534	undef $self->{'slice'}->{$s}->{'h'};
535
536	delete($self->{'slice'}->{$s}) && return 1;
537	return 0;
538	}
539
540	=head2 to_xml
541
542	Convert (binary safe, I hope) your data into XML for C<swish-e>.
543	Data will not yet be recoded to UTF-8. L<"put_slice"> will do that.
544
545	my $xml = $i->to_xml({ foo => 'bar' });
546
547	This function is extracted from L<"add"> method so that you can C<Memoize> it.
548	If your data set has a lot of repeatable data, and memory is not a problem, you
549	can add C<memoize_to_xml> option to L<"open_index">.
550
551	=cut
552
553	my %escape = ('<'=>'<', '>'=>'>', '&'=>'&', '"'=>'"');
554	my $escape_re = join '\|' => keys %escape;
555
556	sub to_xml {
557	my $self = shift;
558
559	my $data = shift \|\| return;
560
561	my $xml = qq{<xml>};
562	foreach my $tag (keys %$data) {
563	my $content = $data->{$tag};
564	next if (! $content \|\| $content eq '');
565	# save [cr/]lf before conversion to XML
566	# $content =~ s/\n\r/##lf##/gs;
567	# $content =~ s/\n/##lf##/gs;
568	$content =~ s/($escape_re)/$escape{$1}/gs;
569	$xml .= "<$tag><![CDATA[".$content."]]></$tag>";
570	}
571	$xml .= qq{</xml>};
572	}
573
574	sub _debug {
575	my $self = shift;
576	print STDERR "## ",@_,"\n" if ($self->{'debug'});
577	return;
578	}
579
580	1;
581	__END__
582
583
584	=head1 Searching
585
586	Searching is still conducted using L<SWISH::API>, but you have to glob
587	index names.
588
589	use SWISH::API;
590
591	my $swish = SWISH::API->new( glob('index.swish-e/*') );
592
593	You can also alternativly create merged index (using C<merge> option) and
594	not change your source code at all.
595
596	That would also benefit performance, but it increases indexing time
597	because merged indexes must be re-created on each indexing run.
598
599	=head1 EXPORT
600
601	Nothing by default.
602
603	=head1 EXAMPLES
604
605	Test script for this module uses all parts of API. It's also nice example
606	how to use C<SWISH::Split>.
607
608	=head1 SEE ALSO
609
610	L<SWISH::API>,
611	L<http://www.swish-e.org/>
612
613	=head1 AUTHOR
614
615	Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>
616
617	=head1 COPYRIGHT AND LICENSE
618
619	Copyright (C) 2004 by Dobrica Pavlinusic
620
621	This library is free software; you can redistribute it and/or modify
622	it under the same terms as Perl itself, either Perl version 5.8.4 or,
623	at your option, any later version of Perl 5 you may have available.
624
625
626	=cut