# SWISH-Split/trunk/Split.pm

package SWISH::Split;

use 5.008;
use strict;
use warnings;

our $VERSION = '0.03';

use SWISH::API;
use Text::Iconv;
use File::Temp qw/ :mktemp /;
use Carp;
use Digest::MD5 qw(md5_hex);
use Memoize;
use File::Which;

use Data::Dumper;

use constant {
        ADDED => 1,
        DELETED => 2,
};

=head1 NAME

SWISH::Split - Perl interface to split index variant of Swish-e

=head1 SYNOPSIS

  use SWISH::Split;


=head1 DESCRIPTION

This is an alternative interface for indexing data with swish-e. It's designed
to split indexes over multiple files (slices) to allow updates of records in index
by reindexing just the changed parts (slices).

Data is stored in the index using an interface which is somewhat similar to
L<Plucene::Simple>. This could make your migration (or supporting two index
engines) easier.

In the background, it will fork swish-e binaries (one for each index slice)
and produce UTF-8 encoded XML files for it. So, if your input charset isn't
C<ISO-8859-1> you will have to specify it.

=head1 Methods used for indexing

=head2 open_index

Create new object for index.

  my $i = SWISH::Split->open_index(
        index => '/path/to/index',
        slice_name => \&slice_on_path,
        slices => 30,
        merge => 0,
        codepage => 'ISO-8859-2',
        swish_config => qq{
                PropertyNames from date
                PropertyNamesDate date
        },
        memoize_to_xml => 0,
  );

  # split index on first component of path
  sub slice_on_path {
        return (split(/\//, $_[0]))[0];
  }

Options to C<open_index> are following:

=over 5

=item C<index>

path to (existing) directory in which index slices will be created.

=item C<slice_name>

coderef to function which provide slicing from path.

=item C<slices>

maximum number of index slices. See L<"in_slice"> for
more explanation.

=item C<merge>

(planned) option to merge indexes into one at end.

=item C<codepage>

data codepage (needed for conversion to UTF-8).
By default, it's C<ISO-8859-1>.

=item C<swish_config>

additional parameters which will be inserted into the
C<swish-e> configuration file. See C<swish-config>.

=item C<memoize_to_xml>

speed up repeatable data, see L<"to_xml">.

=back

=cut

my $iso2utf = Text::Iconv->new('ISO-8859-1','UTF-8');

# Constructor. Validates the options, blesses them into an object and sets up
# the shared helpers (charset converter and Memoize wrappers).
#
# Options can be given as a flat key/value list or as a single hash reference:
#   slice_name     - (required) reference to the function which maps a path
#                    to a slice name
#   slices         - (required) true value, maximum number of slices
#   index          - (required) path to an existing directory
#   codepage       - (optional) input charset; replaces the file-level
#                    $iso2utf converter, so it affects every object created
#                    in this process
#   memoize_to_xml - (optional) when true, to_xml is wrapped by Memoize
# All other keys are stored in the object untouched.
#
# Croaks when a required option is missing or the index directory is not
# usable. Returns the new object.

# functions which are already wrapped by Memoize (wrapping them again on every
# open_index call would just stack another cache on top of the previous one)
my %memoized;

# Memoize normalizer for to_xml. The default normalizer stringifies the
# arguments, which for a hash reference is only its memory address; a reused
# (or freed and re-allocated) hash would then return XML of some older
# document. Build the cache key from the hash content instead.
sub _to_xml_cache_key {
        my ($self, $data) = @_;

        # anything which isn't a plain hash keeps the default behaviour
        return defined $data ? "$data" : '' unless (ref($data) eq 'HASH');

        # length-prefixed fields keep the key unambiguous
        return join '', map {
                my $value = defined $data->{$_} ? $data->{$_} : '';
                length($_) . ':' . $_ . length($value) . ':' . $value;
        } sort keys %$data;
}

sub open_index {
        my $class = shift;

        # accept both open_index(key => value, ...) and open_index({ ... })
        my %args = (@_ == 1 && ref($_[0]) eq 'HASH') ? %{ $_[0] } : @_;
        my $self = bless({ %args }, $class);

        croak "need slice_name coderef" unless ref $self->{'slice_name'};
        croak "need slices" unless $self->{'slices'};

        croak "need index" unless $self->{'index'};
        croak "index '",$self->{'index'},"' doesn't exist" unless -e $self->{'index'};
        croak "index '",$self->{'index'},"' is not directory" unless -d $self->{'index'};

        $iso2utf = Text::Iconv->new($self->{'codepage'},'UTF-8') if ($self->{'codepage'});

        # speedup (each function is wrapped only once per process)
        memoize('in_slice') unless $memoized{'in_slice'}++;
        if ($self->{'memoize_to_xml'} && ! $memoized{'to_xml'}++) {
                memoize('to_xml', NORMALIZER => \&_to_xml_cache_key);
        }

        return $self;
}

=head2 add

Add document to index.

  $i->add($swishpath, {
        headline => 'foobar result',
        property => 'data',
  })

=cut

# Index one document: the data hash is turned into XML by to_xml and the
# result is handed to put_slice together with the path.
#
#   $swishpath - path of the document (must be a true value)
#   $data      - hash reference with the document data (must be a true value)
#
# Returns whatever put_slice returned, or nothing when an argument is missing.
sub add {
        my ($self, $swishpath, $data) = @_;

        return unless $swishpath;
        return unless $data;

        my $used_slice = $self->put_slice($swishpath, $self->to_xml($data));
        return $used_slice;
}

=head2 delete

Delete documents from index.

  $i->delete(@swishpath);

B<This function is not implemented.>

=cut

# Mark the given swishpaths as deleted.
#
#   @paths - swishpaths to delete
#
# Returns nothing when called without paths. Otherwise every path is recorded
# in $self->{'paths'} with the DELETED state and then the method dies, because
# the removal from the index itself is not implemented.
sub delete {
        my ($self, @paths) = @_;

        # "@_ || return" would evaluate @_ in scalar context and store the
        # number of arguments instead of the paths themselves
        return unless @paths;

        foreach my $path (@paths) {
                $self->{'paths'}->{$path} = DELETED;
        }

        die "delete is not yet implemented";
}


=head2 done

Finish indexing and close index file(s).

  $i->done;

This is most time-consuming operation. When it's called, it will re-index
all entries which haven't changed in all slices.

Returns number of slices updated.

This method should really be called close or finish, but both of those are
already used.

=cut

# Close every slice which is registered in $self->{'slice'}.
#
# Returns the sum of the values returned by close_slice for those slices.
sub done {
        my ($self) = @_;

        my $closed = 0;

        for my $name (keys %{ $self->{'slice'} }) {
                $self->_debug("closing slice $name");
                $closed += $self->close_slice($name);
        }

        return $closed;
}


=head1 Reporting methods

These methods return statistics about your index.

=head2 swishpaths

Return array of C<swishpath>s in index.

  my @p = $i->swishpaths;

=cut

# Return the keys stored under one slice in $self->{'slice'}.
#
#   $s - slice name (must be a true value)
#
# Returns an empty list when no slice name is given or the slice is not
# registered in the object.
#
# NOTE(review): create_slice in this file stores only bookkeeping entries
# ('h', 'update_mode') under a slice, so until paths are recorded per slice
# the result is probably not a list of real swishpaths -- confirm the intended
# data layout.
sub swishpaths {
        my $self = shift;

        my $s = shift || return;

        # look up the requested slice (the lookup used to test the literal
        # key 's'), without creating $self->{'slice'} as a side effect
        my $slice_of = $self->{'slice'} || return;
        return unless (exists($slice_of->{$s}));

        return keys %{ $slice_of->{$s} };
}

=head2 swishpaths_updated

Return array with updated C<swishpath>s.

  my @d = $i->swishpaths_updated;

=cut

# Return the swishpaths which are recorded in $self->{'paths'} with the ADDED
# state (put_slice records that state for every document it is given).
#
# In list context returns the paths (in no particular order), in scalar
# context their number.
sub swishpaths_updated {
        my $self = shift;

        my $state_of = $self->{'paths'} || {};

        return grep { ($state_of->{$_} || 0) == ADDED } keys %$state_of;
}


=head2 swishpaths_deleted

Return array with deleted C<swishpath>s.

  my $n = $i->swishpaths_deleted;

=cut

# Return the swishpaths which are recorded in $self->{'paths'} with the
# DELETED state (the delete method records that state).
#
# In list context returns the paths (in no particular order), in scalar
# context their number.
sub swishpaths_deleted {
        my $self = shift;

        my $state_of = $self->{'paths'} || {};

        return grep { ($state_of->{$_} || 0) == DELETED } keys %$state_of;
}


=head2 slices

Return array with all slice names.

  my @s = $i->slices;

=cut

# Return the names of the slices which are registered in $self->{'slice'},
# in no particular order.
#
# NOTE(review): create_slice adds a slice to that table and close_slice
# removes it again, so only slices which are open at the moment are reported
# -- confirm that this is what "all slice names" should mean.
sub slices {
        my $self = shift;

        return keys %{ $self->{'slice'} || {} };
}

=head1 Helper methods

These methods are used internally, but they might be useful.

=head2 in_slice

Takes path and return slice in which this path belongs.

  my $s = $i->in_slice('path/to/document/in/index');

If there is a C<slices> parameter to L<"open_index"> it will use
MD5 hash to spread documents across slices. That will produce random
distribution of your documents in slices, which might or might not be best
for your data. If you have to re-index large number of slices on each
run, think about creating your own C<slice> function and distributing
documents manually across slices.

Slice number must always be true value or various sanity checks will fail.

This function is C<Memoize>ed for performance reasons.

=cut

# Map a path to the slice it belongs to.
#
#   $path - path of the document (must be a true value)
#
# With a true $self->{'slices'} the value returned by the slice_name function
# is hashed with MD5 and the first 8 hex digits of the digest are reduced to a
# number between 1 and $self->{'slices'}. Without it the value returned by the
# slicing function is used as the slice directly.
#
# Confesses when the path or the slice_name function is missing.
sub in_slice {
        my $self = shift;

        my $path = shift || confess "need path";

        confess "need slice_name function" unless ref ($self->{'slice_name'});

        unless ($self->{'slices'}) {
                # This branch used to call $self->{'split'}, a key which is
                # neither validated here nor checked by open_index. Use
                # slice_name, but keep honouring 'split' if somebody did
                # supply it.
                my $splitter = ref($self->{'split'})
                        ? $self->{'split'}
                        : $self->{'slice_name'};
                return $splitter->($path);
        }

        # first, pass path through slice_name function
        my $slice = $self->{'slice_name'}->($path);
        # then calculate MD5 hash
        my $hash = md5_hex($slice);
        # take first 8 chars to produce number
        # FIXME how random is this?
        $hash = hex(substr($hash,0,8));

        # +1 keeps the slice number a true value
        $slice = ($hash % $self->{'slices'}) + 1;
        $self->_debug("hash: $hash / ",$self->{'slices'}," => $slice");
        return $slice;
}

=head2 find_paths

Return array of C<swishpath>s for given C<swish-e> query.

  my @p = $i->find_paths("headline=test*");

Useful for combining with L<"delete"> to delete documents
which haven't changed in a while (so, expired).

=cut

# Placeholder for searching swishpaths with a swish-e query.
#
# No search is performed: the sub only takes its invocant from the argument
# list and that invocant is the value which gets returned.
sub find_paths {
        my ($self) = @_;

        return $self;
}


=head2 make_config

Create C<swish-e> configuration file for given slice.

  my $config_filename = $i->make_config('slice name');

It returns configuration filename. If no C<swish_config> was defined in
L<"open_index">, default swish-e configuration will be used. It will index all data for
searching, but none for properties.

If you want to see what is already defined for swish-e in configuration
take a look at source code for C<DEFAULT_SWISH_CONF>.

It uses C<stdin> as C<IndexDir> to communicate with C<swish-e>.

=cut

# Write a swish-e configuration file for one slice.
#
#   slice name - appended to $self->{'index'} to form the IndexFile path
#
# The file is created with mkstemp under /tmp and holds the default
# configuration below followed by $self->{'swish_config'} (if any).
#
# Returns the name of the configuration file. Nothing in this sub removes the
# file again, so cleaning it up is left to the caller.
# Confesses when the slice name is missing or the file can't be written.
sub make_config {
        my $self = shift;

        my $index_file = $self->{'index'}."/";
        $index_file .= shift || confess "need slice name";

        my ($tmp_fh, $swish_config_filename) = mkstemp("/tmp/swishXXXXX");

        my $config = <<"DEFAULT_SWISH_CONF";
# swish-e config file

IndexDir stdin

# input file definition
DefaultContents XML*

# indexed metatags
MetaNames xml swishdocpath


#XMLClassAttributes type
UndefinedMetaTags auto
UndefinedXMLAttributes auto

IndexFile $index_file

# Croatian ISO-8859-2 characters to unaccented equivalents
TranslateCharacters ¹©ðÐèÈæÆ¾® ssddcccczz


# disable output
ParserWarnLevel 0
IndexReport 1

DEFAULT_SWISH_CONF

        # add user parameters (like stored properties)
        $config .= $self->{'swish_config'} if ($self->{'swish_config'});

        # a truncated configuration would only show up later as a confusing
        # swish-e failure, so check both the write and the close
        print { $tmp_fh } $config
                or confess "can't write swish-e config $swish_config_filename: $!";
        close($tmp_fh)
                or confess "can't close swish-e config $swish_config_filename: $!";

        return $swish_config_filename;
}

=head2 create_slice

On first run for a slice, starts C<swish-e>. On subsequent calls for the same
slice it just returns the slice name, because the already opened handle is
kept in the object.

  my $s = $i->create_slice('/path/to/document');

You shouldn't need to call C<create_slice> directly because it will be called
from L<"put_slice"> when needed.

=cut

# Make sure that swish-e is running for the slice to which a path belongs.
#
#   $path - path of the document (must be a true value)
#
# The slice is looked up with in_slice. When it is already registered in
# $self->{'slice'} nothing else is done. Otherwise a configuration file is
# written with make_config and swish-e is started with its standard input
# connected to a pipe; the handle is stored under 'h'. When the index file of
# the slice already exists swish-e is started with -u and 'update_mode' is set
# for the slice.
#
# Returns the slice. Confesses on a missing path or slice, croaks when swish-e
# can't be started.
sub create_slice {
        my $self = shift;

        my $path = shift || confess "create_slice need path!";

        my $s = $self->in_slice($path) || confess "in_slice returned null";

        return $s if (exists($self->{'slice'}->{$s}));

        my $swish_config = $self->make_config($s);

        my $index_file = $self->{'index'}.'/'.$s;
        my $update_mode = (-f $index_file) ? 1 : 0;

        # the command is kept as a list, so the pipe is opened without a shell
        my @swish = ('swish-e');
        push @swish, '-u' if ($update_mode);
        push @swish, '-S', 'prog', '-c', $swish_config;

        $self->_debug("creating slice $s using | @swish");

        open(my $fh, '|-', @swish) or croak "can't open | @swish: $!";

        # register the slice only now: a slice without a handle (left behind
        # by a failed open) would be taken for an open one on the next call
        $self->{'slice'}->{$s} = { 'h' => $fh };
        $self->{'slice'}->{$s}->{'update_mode'} = 1 if ($update_mode);

        $self->slice_output($s);

        return $s;
}

=head2 put_slice

Pass XML data to swish.

  my $slice = $i->put_slice('/swish/path', '<xml>data</xml>');

Returns slice in which XML ended up.

=cut

# Send one XML document to the swish-e process of its slice.
#
#   $path - swishpath of the document (must be a true value)
#   $xml  - document as XML (must be a true value), in the input codepage
#
# The XML is converted with the file-level $iso2utf converter, the slice is
# opened with create_slice if needed, and the document is written to the
# handle of the slice preceded by Path-Name, Content-Length, (in update mode)
# Update-Mode and Document-Type headers. The path is then recorded in
# $self->{'paths'} with the ADDED state.
#
# Returns the slice, or nothing (after a warning) when the conversion fails.
# Confesses on missing arguments or an unusable slice.
sub put_slice {
        my $self = shift;

        my $path = shift || confess "need path";
        my $xml = shift || confess "need xml";

        # "$xml = convert(...) || carp ..." stored the return value of carp
        # as the document when the conversion failed; skip the document instead
        my $converted = $iso2utf->convert($xml);
        unless (defined($converted)) {
                carp "XML conversion error in $xml";
                return;
        }
        $xml = $converted;

        my $s = $self->create_slice($path) || confess "create_slice returned null";

        confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
        confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));

        $self->slice_output($s);

        my $fh = $self->{'slice'}->{$s}->{'h'} || confess "handle for slice $s undefined";

        my $update_header = "Update-Mode: Index\n";
        $update_header = '' unless ($self->{'slice'}->{$s}->{'update_mode'});

        # Content-Length is counted in bytes (as opposed to chars); +1 is for
        # the newline which is written after the document
        my $content_length = do { use bytes; length($xml) } + 1;

        print { $fh } "Path-Name: $path\n".
                "Content-Length: ".$content_length."\n" . $update_header .
                "Document-Type: XML\n\n$xml\n"
                or carp "can't write to slice $s: $!";

        $self->slice_output($s);

        $self->_debug("dumping in slice $s: $path");

        $self->{'paths'}->{$path} = ADDED;

        return $s;
}

=head2 slice_output

Prints to STDERR output and errors from C<swish-e>.

  my $slice = $i->slice_output($s);

Normally, you don't need to call it.

B<This is dummy placeholder function for very old code that assumes this
module is using C<IPC::Run> which it isn't any more.>

=cut

# Placeholder kept for old callers: it only checks its argument.
#
#   $s - slice name (must be a true value)
#
# Returns the slice name. Confesses when the name is missing or the slice is
# not registered in $self->{'slice'}.
sub slice_output {
        my ($self, $s) = @_;

        confess "slice_output needs slice" unless $s;

        if (! exists($self->{'slice'}) || ! exists($self->{'slice'}->{$s})) {
                confess "no slice $s";
        }

        # FIXME

        return $s;
}

=head2 close_slice

Close slice (terminates swish-e process for that slice).

  $i->close_slice($s);

Returns true if slice is closed, false otherwise.

=cut

# Close the pipe to swish-e for one slice and forget the slice.
#
#   $s - slice name (must be a true value)
#
# Returns 1 when the slice was removed from $self->{'slice'}, 0 otherwise.
# A failing close (which for a pipe includes a non-zero exit status of the
# program) is reported with a warning only.
# Confesses when the slice name is missing, the slice is not registered or it
# has no handle.
sub close_slice {
        my $self = shift;

        my $s = shift || confess "close_slice needs slice";

        confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
        confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));

        # pump rest of content (if any)
        # "close HANDLE || carp ..." parses as close(HANDLE || carp ...), so a
        # failure was never reported; test the result of close itself
        my $fh = $self->{'slice'}->{$s}->{'h'};
        if (defined($fh) && ! close($fh)) {
                # $! is 0 when the only problem is the exit status in $?
                carp $!
                        ? "can't close slice $s: $!"
                        : "can't close slice $s: swish-e exit status " . ($? >> 8);
        }

        $self->slice_output($s);

        undef $self->{'slice'}->{$s}->{'h'};

        return delete($self->{'slice'}->{$s}) ? 1 : 0;
}

=head2 to_xml

Convert (binary safe, I hope) your data into XML for C<swish-e>.
Data will not yet be recoded to UTF-8. L<"put_slice"> will do that.

  my $xml = $i->to_xml({ foo => 'bar' });

This function is extracted from L<"add"> method so that you can C<Memoize> it.
If your data set has a lot of repeatable data, and memory is not a problem, you
can add C<memoize_to_xml> option to L<"open_index">.

=cut

# characters which are replaced by entities in the content, and the regular
# expression which matches any of them
my %escape = ('<'=>'&lt;', '>'=>'&gt;', '&'=>'&amp;', '"'=>'&quot;');
my $escape_re  = join '|' => keys %escape;

# Turn a hash reference into the XML document which is sent to swish-e.
#
#   $data - hash reference, tag name => content
#
# Every tag with defined, non-empty content becomes
# <tag><![CDATA[content]]></tag> inside one <xml> element; <, >, & and " in
# the content are replaced by entities first. Tags are written in sorted
# order, so the same data always gives the same XML.
#
# Returns the XML string, or nothing when $data is false.
sub to_xml {
        my $self = shift;

        my $data = shift || return;

        my $xml = qq{<xml>};
        foreach my $tag (sort keys %$data) {
                my $content = $data->{$tag};
                # skip only missing and empty content; a plain truth test
                # would also drop the value "0"
                next unless (defined($content) && length($content));
                $content =~ s/($escape_re)/$escape{$1}/gs;
                $xml .= "<$tag><![CDATA[".$content."]]></$tag>";
        }
        $xml .= qq{</xml>};

        return $xml;
}

# Print a debug message to STDERR, prefixed with "## " and followed by a
# newline, but only when $self->{'debug'} is true.
#
# Always returns nothing.
sub _debug {
        my ($self, @message) = @_;

        if ($self->{'debug'}) {
                print STDERR "## ",@message,"\n";
        }

        return;
}

1;
__END__


=head1 Searching

Searching is still conducted using L<SWISH::API>, but you have to glob
index names.

    use SWISH::API;

    my $swish = SWISH::API->new( glob('index.swish-e/*') );

You can also alternatively create a merged index (using C<merge> option) and
not change your source code at all.

That would also benefit performance, but it increases indexing time
because merged indexes must be re-created on each indexing run.

=head1 EXPORT

Nothing by default.

=head1 EXAMPLES

Test script for this module uses all parts of API. It's also nice example
how to use C<SWISH::Split>.

=head1 SEE ALSO

L<SWISH::API>,
L<http://www.swish-e.org/>

=head1 AUTHOR

Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2004 by Dobrica Pavlinusic

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.4 or,
at your option, any later version of Perl 5 you may have available.


=cut
1	dpavlin	1	package SWISH::Split;
2
3			use 5.008;
4			use strict;
5			use warnings;
6
7	dpavlin	9	our $VERSION = '0.03';
8	dpavlin	1
9			use SWISH::API;
10			use Text::Iconv;
11			use File::Temp qw/ :mktemp /;
12			use Carp;
13			use Digest::MD5 qw(md5_hex);
14			use Memoize;
15	dpavlin	4	use File::Which;
16	dpavlin	1
17			use Data::Dumper;
18
19	dpavlin	5	use constant {
20			ADDED => 1,
21			DELETED => 2,
22			};
23
24	dpavlin	1	=head1 NAME
25
26			SWISH::Split - Perl interface to split index variant of Swish-e
27
28			=head1 SYNOPSIS
29
30			use SWISH::Split;
31
32
33			=head1 DESCRIPTION
34
35			This is alternative interface for indexing data with swish-e. It's designed
36	dpavlin	4	to split indexes over multiple files (slices) to allow updates of records in index
37			by reindexing just changed parts (slice).
38	dpavlin	1
39			Data is stored in index using intrface which is somewhat similar to
40			L<Plucene::Simple>. This could make your migration (or supporting two index
41			engines) easier.
42
43			In the background, it will fork swish-e binaries (one for each index slice)
44	dpavlin	6	and produce UTF-8 encoded XML files for it. So, if your input charset isn't
45	dpavlin	1	C<ISO-8859-1> you will have to specify it.
46
47			=head1 Methods used for indexing
48
49	dpavlin	8	=head2 open_index
50	dpavlin	1
51			Create new object for index.
52
53	dpavlin	8	my $i = SWISH::Split->open_index({
54	dpavlin	1	index => '/path/to/index',
55			slice_name => \&slice_on_path,
56			slices => 30,
57	dpavlin	4	merge => 0,
58			codepage => 'ISO-8859-2',
59			swish_config => qq{
60			PropertyNames from date
61			PropertyNamesDate date
62			},
63			memoize_to_xml => 0,
64	dpavlin	1	);
65
66			# split index on first component of path
67			sub slice_on_path {
68			return shift split(/\//,$_[0]);
69			}
70
71	dpavlin	8	Options to C<open_index> are following:
72	dpavlin	1
73	dpavlin	4	=over 5
74
75			=item C<index>
76
77			path to (existing) directory in which index slices will be created.
78
79			=item C<slice_name>
80
81			coderef to function which provide slicing from path.
82
83			=item C<slices>
84
85			maximum number of index slices. See L<"in_slice"> for
86	dpavlin	1	more explanation.
87
88	dpavlin	4	=item C<merge>
89
90			(planned) option to merge indexes into one at end.
91
92			=item C<codepage>
93
94			data codepage (needed for conversion to UTF-8).
95			By default, it's C<ISO-8859-1>.
96
97			=item C<swish_config>
98
99			additional parametars which will be inserted into
100	dpavlin	11	C<swish-e> configuration file. See C<swish-config>.
101	dpavlin	4
102			=item C<memoize_to_xml>
103
104			speed up repeatable data, see L<"to_xml">.
105
106			=back
107
108	dpavlin	1	=cut
109
110			my $iso2utf = Text::Iconv->new('ISO-8859-1','UTF-8');
111
112	dpavlin	8	sub open_index {
113	dpavlin	1	my $class = shift;
114			my $self = {@_};
115			bless($self, $class);
116
117			croak "need slice_name coderef" unless ref $self->{'slice_name'};
118			croak "need slices" unless $self->{'slices'};
119
120			croak "need index" unless $self->{'index'};
121			croak "index '",$self->{'index'},"' doesn't exist" unless -e $self->{'index'};
122			croak "index '",$self->{'index'},"' is not directory" unless -d $self->{'index'};
123
124			$iso2utf = Text::Iconv->new($self->{'codepage'},'UTF-8') if ($self->{'codepage'});
125
126	dpavlin	4	# speedup
127	dpavlin	1	memoize('in_slice');
128	dpavlin	4	memoize('to_xml') if ($self->{'memoize_to_xml'});
129	dpavlin	1
130			$self ? return $self : return undef;
131
132			}
133
134			=head2 add
135
136			Add document to index.
137
138			$i->add($swishpath, {
139			headline => 'foobar result',
140			property => 'data',
141			})
142
143			=cut
144
145			sub add {
146			my $self = shift;
147	dpavlin	3
148			my $swishpath = shift \|\| return;
149			my $data = shift \|\| return;
150
151	dpavlin	7	my $slice = $self->put_slice($swishpath, $self->to_xml($data));
152	dpavlin	4
153	dpavlin	7	return $slice;
154	dpavlin	1	}
155
156			=head2 delete
157
158	dpavlin	3	Delete documents from index.
159	dpavlin	1
160	dpavlin	3	$i->delete(@swishpath);
161	dpavlin	1
162	dpavlin	13	B<This function is not implemented.>
163
164	dpavlin	1	=cut
165
166			sub delete {
167			my $self = shift;
168	dpavlin	3
169			my @paths = @_ \|\| return;
170
171	dpavlin	5	foreach my $path (@paths) {
172			$self->{'paths'}->{$path} = DELETED;
173			}
174
175	dpavlin	13	die "delete is not yet implemented";
176
177	dpavlin	3	return 42;
178	dpavlin	1	}
179
180
181	dpavlin	5	=head2 done
182	dpavlin	1
183	dpavlin	4	Finish indexing and close index file(s).
184	dpavlin	1
185	dpavlin	5	$i->done;
186	dpavlin	1
187			This is most time-consuming operation. When it's called, it will re-index
188			all entries which haven't changed in all slices.
189
190	dpavlin	4	Returns number of slices updated.
191
192	dpavlin	5	This method should really be called close or finish, but both of those are
193			allready used.
194
195	dpavlin	1	=cut
196
197	dpavlin	5	sub done {
198	dpavlin	1	my $self = shift;
199	dpavlin	3
200	dpavlin	4	my $ret = 0;
201
202			foreach my $s (keys %{$self->{'slice'}}) {
203	dpavlin	8	$self->_debug("closing slice $s");
204	dpavlin	4	$ret += $self->close_slice($s);
205			}
206
207			return $ret;
208	dpavlin	1	}
209
210
211
212			=head1 Reporting methods
213
214			This methods return statistics about your index.
215
216			=head2 swishpaths
217
218			Return array of C<swishpath>s in index.
219
220			my @p = $i->swishpaths;
221
222			=cut
223
224			sub swishpaths {
225			my $self = shift;
226	dpavlin	5
227			my $s = shift \|\| return;
228			return if (! exists($self->{'slice'}->{'s'}));
229
230			return keys %{$self->{'slice'}->{'s'}};
231	dpavlin	1	}
232
233			=head2 swishpaths_updated
234
235			Return array with updated C<swishpath>s.
236
237			my @d = $i->swishpaths_updated;
238
239			=cut
240
241			sub swishpaths_updated {
242			my $self = shift;
243			}
244
245
246			=head2 swishpaths_deleted
247
248			Return array with deleted C<swishpath>s.
249
250			my $n = $i->swishpaths_deleted;
251
252			=cut
253
254			sub swishpaths_deleted {
255			my $self = shift;
256			}
257
258
259			=head2 slices
260
261			Return array with all slice names.
262
263			my @s = $i->slices;
264
265			=cut
266
267			sub slices {
268			my $self = shift;
269			}
270
271			=head1 Helper methods
272
273			This methods are used internally, but they might be useful.
274
275			=head2 in_slice
276
277			Takes path and return slice in which this path belongs.
278
279			my $s = $i->in_slice('path/to/document/in/index');
280
281	dpavlin	8	If there are C<slices> parametar to L<"open_index"> it will use
282	dpavlin	1	MD5 hash to spread documents across slices. That will produce random
283			distribution of your documents in slices, which might or might not be best
284			for your data. If you have to re-index large number of slices on each
285			run, think about creating your own C<slice> function and distributing
286			documents manually across slices.
287
288	dpavlin	4	Slice number must always be true value or various sanity checks will fail.
289
290	dpavlin	1	This function is C<Memoize>ed for performance reasons.
291
292			=cut
293
294			sub in_slice {
295			my $self = shift;
296
297			my $path = shift \|\| confess "need path";
298
299			confess "need slice_name function" unless ref ($self->{'slice_name'});
300
301			if ($self->{'slices'}) {
302			# first, pass path through slice_name function
303			my $slice = &{$self->{'slice_name'}}($path);
304			# then calculate MD5 hash
305	dpavlin	8	my $hash = md5_hex($slice);
306	dpavlin	1	# take first 8 chars to produce number
307			# FIXME how random is this?
308	dpavlin	8	$hash = hex(substr($hash,0,8));
309	dpavlin	1
310	dpavlin	8	$slice = ($hash % $self->{'slices'}) + 1;
311			$self->_debug("hash: $hash / ",$self->{'slices'}," => $slice");
312	dpavlin	4	return $slice;
313	dpavlin	1	} else {
314			return &{$self->{'split'}}($path);
315			}
316			}
317
318	dpavlin	3	=head2 find_paths
319	dpavlin	1
320	dpavlin	3	Return array of C<swishpath>s for given C<swish-e> query.
321	dpavlin	1
322	dpavlin	3	my @p = $i->find_paths("headline=test*");
323
324			Useful for combining with L<"delete_documents"> to delete documents
325			which hasn't changed a while (so, expired).
326
327			=cut
328
329			sub find_paths {
330			my $self = shift;
331
332			}
333
334
335	dpavlin	4	=head2 make_config
336	dpavlin	3
337	dpavlin	4	Create C<swish-e> configuration file for given slice.
338
339			my $config_filename = $i->make_config('slice name');
340
341			It returns configuration filename. If no C<swish_config> was defined in
342	dpavlin	8	L<"open_index">, default swish-e configuration will be used. It will index all data for
343	dpavlin	4	searching, but none for properties.
344
345			If you want to see what is allready defined for swish-e in configuration
346			take a look at source code for C<DEFAULT_SWISH_CONF>.
347
348	dpavlin	7	It uses C<stdin> as C<IndexDir> to comunicate with C<swish-e>.
349	dpavlin	4
350			=cut
351
352			sub make_config {
353			my $self = shift;
354
355
356			my $index_file = $self->{'index'}."/";
357			$index_file .= shift \|\| confess "need slice name";
358
359			my ($tmp_fh, $swish_config_filename) = mkstemp("/tmp/swishXXXXX");
360
361			# find cat on filesystem
362			my $cat = which('cat');
363
364			print $tmp_fh <<"DEFAULT_SWISH_CONF";
365			# swish-e config file
366
367	dpavlin	7	IndexDir stdin
368	dpavlin	4
369			# input file definition
370			DefaultContents XML*
371
372			# indexed metatags
373			MetaNames xml swishdocpath
374
375
376			#XMLClassAttributes type
377			UndefinedMetaTags auto
378			UndefinedXMLAttributes auto
379
380			IndexFile $index_file
381
382			# Croatian ISO-8859-2 characters to unaccented equivalents
383			TranslateCharacters ¹©ðÐèÈæÆ¾® ssddcccczz
384
385
386			# disable output
387			ParserWarnLevel 0
388			IndexReport 1
389
390			DEFAULT_SWISH_CONF
391
392			# add user parametars (like stored properties)
393			print $tmp_fh $self->{'swish_config'} if ($self->{'swish_config'});
394
395			close($tmp_fh);
396
397			return $swish_config_filename;
398			}
399
400			=head2 create_slice
401
402	dpavlin	8	On first run, starts C<swish-e>. On subsequent calls just return
403	dpavlin	11	it's handles using C<Memoize>.
404	dpavlin	4
405			my $s = create_slice('/path/to/document');
406
407			You shouldn't need to call C<create_slice> directly because it will be called
408			from L<"put_slice"> when needed.
409
410			=cut
411
412			sub create_slice {
413			my $self = shift;
414
415			my $path = shift \|\| confess "create_slice need path!";
416
417			my $s = $self->in_slice($path) \|\| confess "in_slice returned null";
418
419			return $s if (exists($self->{'slice'}->{$s}));
420
421			my $swish_config = $self->make_config($s);
422
423	dpavlin	8	my $swish = qq{\| swish-e };
424	dpavlin	9	if (-f $self->{'index'}.'/'.$s) {
425			$swish .= qq{ -u };
426			$self->{'slice'}->{$s}->{'update_mode'}++;
427			}
428	dpavlin	8	$swish .= qq{ -S prog -c } . $swish_config;
429	dpavlin	4
430	dpavlin	8	$self->_debug("creating slice $s using $swish");
431	dpavlin	4
432			## Build the harness, open all pipes, and launch the subprocesses
433	dpavlin	8	open(my $fh, $swish) \|\| croak "can't open $swish: $!";
434	dpavlin	4
435	dpavlin	8	$self->{'slice'}->{$s}->{'h'} = $fh;
436	dpavlin	4
437			$self->slice_output($s);
438
439			return $s;
440			}
441
442			=head2 put_slice
443
444	dpavlin	7	Pass XML data to swish.
445	dpavlin	4
446	dpavlin	7	my $slice = $i->put_slice('/swish/path', '<xml>data</xml>');
447	dpavlin	4
448	dpavlin	7	Returns slice in which XML ended up.
449
450	dpavlin	4	=cut
451
452			sub put_slice {
453			my $self = shift;
454
455			my $path = shift \|\| confess "need path";
456			my $xml = shift \|\| confess "need xml";
457
458			$xml = $iso2utf->convert($xml) \|\| carp "XML conversion error in $xml";
459
460			my $s = $self->create_slice($path) \|\| confess "create_slice returned null";
461
462			confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
463			confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));
464
465			$self->slice_output($s);
466
467			use bytes; # as opposed to chars
468	dpavlin	8	my $fh = $self->{'slice'}->{$s}->{'h'} \|\| confess "handle for slice $s undefined";
469	dpavlin	9
470			my $update_header = "Update-Mode: Index\n";
471			$update_header = '' unless ($self->{'slice'}->{$s}->{'update_mode'});
472
473	dpavlin	8	print { $fh } "Path-Name: $path\n".
474	dpavlin	9	"Content-Length: ".(length($xml)+1)."\n" . $update_header .
475	dpavlin	4	"Document-Type: XML\n\n$xml\n";
476
477			$self->slice_output($s);
478
479	dpavlin	8	$self->_debug("dumping in slice $s: $path");
480
481	dpavlin	5	$self->{'paths'}->{$path} = ADDED;
482
483	dpavlin	4	return $s;
484			}
485
486			=head2 slice_output
487
488			Prints to STDERR output and errors from C<swish-e>.
489
490	dpavlin	7	my $slice = $i->slice_output($s);
491	dpavlin	4
492			Normally, you don't need to call it.
493
494	dpavlin	8	B<This is dummy placeholder function for very old code that assumes this
495			module is using C<IPC::Run> which it isn't any more.>
496
497	dpavlin	4	=cut
498
499			sub slice_output {
500			my $self = shift;
501
502			my $s = shift \|\| confess "slice_output needs slice";
503
504			confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
505
506	dpavlin	8	# FIXME
507	dpavlin	4
508	dpavlin	7	return $s;
509	dpavlin	4	}
510
511	dpavlin	5	=head2 close_slice
512	dpavlin	4
513			Close slice (terminates swish-e process for that slice).
514
515			my $i->close_slice($s);
516
517			Returns true if slice is closed, false otherwise.
518
519			=cut
520
521			sub close_slice {
522			my $self = shift;
523
524			my $s = shift \|\| confess "close_slice needs slice";
525
526			confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
527			confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));
528
529			# pump rest of content (if any)
530	dpavlin	8	close $self->{'slice'}->{$s}->{'h'} \|\| carp "can't close slice $s: $!";
531	dpavlin	4
532			$self->slice_output($s);
533
534	dpavlin	8	undef $self->{'slice'}->{$s}->{'h'};
535	dpavlin	4
536			delete($self->{'slice'}->{$s}) && return 1;
537			return 0;
538			}
539
540			=head2 to_xml
541
542			Convert (binary safe, I hope) your data into XML for C<swish-e>.
543			Data will not yet be recoded to UTF-8. L<"put_slice"> will do that.
544
545			my $xml = $i->to_xml({ foo => 'bar' });
546
547	dpavlin	11	This function is extracted from L<"add"> method so that you can C<Memoize> it.
548	dpavlin	4	If your data set has a lot of repeatable data, and memory is not a problem, you
549	dpavlin	8	can add C<memoize_to_xml> option to L<"open_index">.
550	dpavlin	4
551			=cut
552
553			my %escape = ('<'=>'<', '>'=>'>', '&'=>'&', '"'=>'"');
554			my $escape_re = join '\|' => keys %escape;
555
556			sub to_xml {
557			my $self = shift;
558
559			my $data = shift \|\| return;
560
561			my $xml = qq{<xml>};
562			foreach my $tag (keys %$data) {
563			my $content = $data->{$tag};
564			next if (! $content \|\| $content eq '');
565			# save [cr/]lf before conversion to XML
566			# $content =~ s/\n\r/##lf##/gs;
567			# $content =~ s/\n/##lf##/gs;
568			$content =~ s/($escape_re)/$escape{$1}/gs;
569			$xml .= "<$tag><![CDATA[".$content."]]></$tag>";
570			}
571			$xml .= qq{</xml>};
572			}
573
574	dpavlin	8	sub _debug {
575			my $self = shift;
576			print STDERR "## ",@_,"\n" if ($self->{'debug'});
577			return;
578			}
579
580	dpavlin	1	1;
581			__END__
582
583
584	dpavlin	5	=head1 Searching
585	dpavlin	1
586			Searching is still conducted using L<SWISH::API>, but you have to glob
587			index names.
588
589			use SWISH::API;
590
591			my $swish = SWISH::API->new( glob('index.swish-e/*') );
592
593			You can also alternativly create merged index (using C<merge> option) and
594			not change your source code at all.
595
596			That would also benefit performance, but it increases indexing time
597			because merged indexes must be re-created on each indexing run.
598
599	dpavlin	5	=head1 EXPORT
600	dpavlin	1
601	dpavlin	4	Nothing by default.
602	dpavlin	1
603	dpavlin	5	=head1 EXAMPLES
604	dpavlin	1
605	dpavlin	4	Test script for this module uses all parts of API. It's also nice example
606			how to use C<SWISH::Split>.
607	dpavlin	1
608			=head1 SEE ALSO
609
610			L<SWISH::API>,
611			L<http://www.swish-e.org/>
612
613			=head1 AUTHOR
614
615			Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>
616
617			=head1 COPYRIGHT AND LICENSE
618
619			Copyright (C) 2004 by Dobrica Pavlinusic
620
621			This library is free software; you can redistribute it and/or modify
622			it under the same terms as Perl itself, either Perl version 5.8.4 or,
623			at your option, any later version of Perl 5 you may have available.
624
625
626			=cut