/[SWISH-Split]/trunk/Split.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/Split.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 4 - (hide annotations)
Sun Aug 8 19:22:56 2004 UTC (19 years, 7 months ago) by dpavlin
File size: 13382 byte(s)
first version which passes 51 test. It still doesn't update documents, just
insert.

1 dpavlin 1 package SWISH::Split;
2    
3     use 5.008;
4     use strict;
5     use warnings;
6    
7     our $VERSION = '0.00';
8    
9     use SWISH::API;
10     use Text::Iconv;
11     use File::Temp qw/ :mktemp /;
12     use Carp;
13     use Digest::MD5 qw(md5_hex);
14     use Memoize;
15 dpavlin 4 use IPC::Run qw(start timeout pump finish);
16     use File::Which;
17 dpavlin 1
18     use Data::Dumper;
19    
20     =head1 NAME
21    
22     SWISH::Split - Perl interface to split index variant of Swish-e
23    
24     =head1 SYNOPSIS
25    
26     use SWISH::Split;
27    
28    
29     =head1 DESCRIPTION
30    
31     This is alternative interface for indexing data with swish-e. It's designed
32 dpavlin 4 to split indexes over multiple files (slices) to allow updates of records in index
33     by reindexing just changed parts (slice).
34 dpavlin 1
35     Data is stored in index using intrface which is somewhat similar to
36     L<Plucene::Simple>. This could make your migration (or supporting two index
37     engines) easier.
38    
39     In the background, it will fork swish-e binaries (one for each index slice)
40 dpavlin 4 and produce UTF-8 encoded XML files for it. So, if your imput charset isn't
41 dpavlin 1 C<ISO-8859-1> you will have to specify it.
42    
43     =head1 Methods used for indexing
44    
45     =head2 open
46    
47     Create new object for index.
48    
49     my $i = SWISH::Split->open({
50     index => '/path/to/index',
51     slice_name => \&slice_on_path,
52     slices => 30,
53 dpavlin 4 merge => 0,
54     codepage => 'ISO-8859-2',
55     swish_config => qq{
56     PropertyNames from date
57     PropertyNamesDate date
58     },
59     memoize_to_xml => 0,
60 dpavlin 1 );
61    
62     # split index on first component of path
63     sub slice_on_path {
64     return shift split(/\//,$_[0]);
65     }
66    
67 dpavlin 4 Options to open are following:
68 dpavlin 1
69 dpavlin 4 =over 5
70    
71     =item C<index>
72    
73     path to (existing) directory in which index slices will be created.
74    
75     =item C<slice_name>
76    
77     coderef to function which provide slicing from path.
78    
79     =item C<slices>
80    
81     maximum number of index slices. See L<"in_slice"> for
82 dpavlin 1 more explanation.
83    
84 dpavlin 4 =item C<merge>
85    
86     (planned) option to merge indexes into one at end.
87    
88     =item C<codepage>
89    
90     data codepage (needed for conversion to UTF-8).
91     By default, it's C<ISO-8859-1>.
92    
93     =item C<swish_config>
94    
95     additional parametars which will be inserted into
96     C<swish-e> configuration file. See L<swish-config>.
97    
98     =item C<memoize_to_xml>
99    
100     speed up repeatable data, see L<"to_xml">.
101    
102     =back
103    
104 dpavlin 1 =cut
105    
106     my $iso2utf = Text::Iconv->new('ISO-8859-1','UTF-8');
107    
108     sub open {
109     my $class = shift;
110     my $self = {@_};
111     bless($self, $class);
112    
113     croak "need slice_name coderef" unless ref $self->{'slice_name'};
114     croak "need slices" unless $self->{'slices'};
115    
116     croak "need index" unless $self->{'index'};
117     croak "index '",$self->{'index'},"' doesn't exist" unless -e $self->{'index'};
118     croak "index '",$self->{'index'},"' is not directory" unless -d $self->{'index'};
119    
120     $iso2utf = Text::Iconv->new($self->{'codepage'},'UTF-8') if ($self->{'codepage'});
121    
122 dpavlin 4 # speedup
123 dpavlin 1 memoize('in_slice');
124 dpavlin 4 memoize('to_xml') if ($self->{'memoize_to_xml'});
125 dpavlin 1
126     $self ? return $self : return undef;
127    
128     }
129    
130     =head2 add
131    
132     Add document to index.
133    
134     $i->add($swishpath, {
135     headline => 'foobar result',
136     property => 'data',
137     })
138    
139     =cut
140    
141     sub add {
142     my $self = shift;
143 dpavlin 3
144     my $swishpath = shift || return;
145     my $data = shift || return;
146    
147 dpavlin 4 my ($out,$err) = $self->put_slice($swishpath, $self->to_xml($data));
148    
149     if ($err) {
150     carp "$swishpath: $err";
151     return 0;
152     }
153    
154 dpavlin 3 return 1;
155 dpavlin 1 }
156    
157     =head2 delete
158    
159 dpavlin 3 Delete documents from index.
160 dpavlin 1
161 dpavlin 3 $i->delete(@swishpath);
162 dpavlin 1
163     =cut
164    
165     sub delete {
166     my $self = shift;
167 dpavlin 3
168     my @paths = @_ || return;
169    
170     return 42;
171 dpavlin 1 }
172    
173    
174 dpavlin 4 =head2 finish
175 dpavlin 1
176 dpavlin 4 Finish indexing and close index file(s).
177 dpavlin 1
178 dpavlin 4 $i->finish;
179 dpavlin 1
180     This is most time-consuming operation. When it's called, it will re-index
181     all entries which haven't changed in all slices.
182    
183 dpavlin 4 Returns number of slices updated.
184    
185 dpavlin 1 =cut
186    
187 dpavlin 4 sub finish {
188 dpavlin 1 my $self = shift;
189 dpavlin 3
190 dpavlin 4 my $ret = 0;
191    
192     foreach my $s (keys %{$self->{'slice'}}) {
193     $ret += $self->close_slice($s);
194     }
195    
196     return $ret;
197 dpavlin 1 }
198    
199    
200    
201     =head1 Reporting methods
202    
203     This methods return statistics about your index.
204    
205     =head2 swishpaths
206    
207     Return array of C<swishpath>s in index.
208    
209     my @p = $i->swishpaths;
210    
211     =cut
212    
213     sub swishpaths {
214     my $self = shift;
215     }
216    
217     =head2 swishpaths_updated
218    
219     Return array with updated C<swishpath>s.
220    
221     my @d = $i->swishpaths_updated;
222    
223     =cut
224    
225     sub swishpaths_updated {
226     my $self = shift;
227     }
228    
229    
230     =head2 swishpaths_deleted
231    
232     Return array with deleted C<swishpath>s.
233    
234     my $n = $i->swishpaths_deleted;
235    
236     =cut
237    
238     sub swishpaths_deleted {
239     my $self = shift;
240     }
241    
242    
243     =head2 slices
244    
245     Return array with all slice names.
246    
247     my @s = $i->slices;
248    
249     =cut
250    
251     sub slices {
252     my $self = shift;
253     }
254    
255     =head1 Helper methods
256    
257     This methods are used internally, but they might be useful.
258    
259     =head2 in_slice
260    
261     Takes path and return slice in which this path belongs.
262    
263     my $s = $i->in_slice('path/to/document/in/index');
264    
265     If there are C<slices> parametar to L<"open"> it will use
266     MD5 hash to spread documents across slices. That will produce random
267     distribution of your documents in slices, which might or might not be best
268     for your data. If you have to re-index large number of slices on each
269     run, think about creating your own C<slice> function and distributing
270     documents manually across slices.
271    
272 dpavlin 4 Slice number must always be true value or various sanity checks will fail.
273    
274 dpavlin 1 This function is C<Memoize>ed for performance reasons.
275    
276     =cut
277    
278     sub in_slice {
279     my $self = shift;
280    
281     my $path = shift || confess "need path";
282    
283     confess "need slice_name function" unless ref ($self->{'slice_name'});
284    
285     if ($self->{'slices'}) {
286     # first, pass path through slice_name function
287     my $slice = &{$self->{'slice_name'}}($path);
288     # then calculate MD5 hash
289     $slice = md5_hex($slice);
290     # take first 8 chars to produce number
291     # FIXME how random is this?
292     $slice = hex(substr($slice,0,8));
293    
294 dpavlin 4 $slice = ($slice % $self->{'slices'}) + 1;
295     print "hash: $slice / ",$self->{'slices'}," => $slice\n";
296     return $slice;
297 dpavlin 1 } else {
298     return &{$self->{'split'}}($path);
299     }
300     }
301    
302 dpavlin 3 =head2 find_paths
303 dpavlin 1
304 dpavlin 3 Return array of C<swishpath>s for given C<swish-e> query.
305 dpavlin 1
306 dpavlin 3 my @p = $i->find_paths("headline=test*");
307    
308     Useful for combining with L<"delete_documents"> to delete documents
309     which hasn't changed a while (so, expired).
310    
311     =cut
312    
313     sub find_paths {
314     my $self = shift;
315    
316     my $s = shift || return;
317     }
318    
319    
320 dpavlin 4 =head2 make_config
321 dpavlin 3
322 dpavlin 4 Create C<swish-e> configuration file for given slice.
323    
324     my $config_filename = $i->make_config('slice name');
325    
326     It returns configuration filename. If no C<swish_config> was defined in
327     L<"open">, default swish-e configuration will be used. It will index all data for
328     searching, but none for properties.
329    
330     If you want to see what is allready defined for swish-e in configuration
331     take a look at source code for C<DEFAULT_SWISH_CONF>.
332    
333     It uses C<cat> utility to comunicate with C<swish-e>. Path is provided
334     by C<File::Which>. Do Windows users have to change that to C<COPY /B>
335     or something similar?
336    
337     =cut
338    
339     sub make_config {
340     my $self = shift;
341    
342    
343     my $index_file = $self->{'index'}."/";
344     $index_file .= shift || confess "need slice name";
345    
346     my ($tmp_fh, $swish_config_filename) = mkstemp("/tmp/swishXXXXX");
347    
348     # find cat on filesystem
349     my $cat = which('cat');
350    
351     print $tmp_fh <<"DEFAULT_SWISH_CONF";
352     # swish-e config file
353    
354     IndexDir cat
355     #SwishProgParameters -
356    
357     # input file definition
358     DefaultContents XML*
359    
360     # indexed metatags
361     MetaNames xml swishdocpath
362    
363    
364     #XMLClassAttributes type
365     UndefinedMetaTags auto
366     UndefinedXMLAttributes auto
367    
368     IndexFile $index_file
369    
370     # Croatian ISO-8859-2 characters to unaccented equivalents
371     TranslateCharacters ¹©ðÐèÈæƾ® ssddcccczz
372    
373    
374     # disable output
375     ParserWarnLevel 0
376     IndexReport 1
377    
378     DEFAULT_SWISH_CONF
379    
380     # add user parametars (like stored properties)
381     print $tmp_fh $self->{'swish_config'} if ($self->{'swish_config'});
382    
383     close($tmp_fh);
384    
385     return $swish_config_filename;
386     }
387    
388     =head2 create_slice
389    
390     On first run, starts C<swish-e> using L<IPC::Run>. On subsequent calls just return
391     it's handles using L<Memoize>.
392    
393     my $s = create_slice('/path/to/document');
394    
395     You shouldn't need to call C<create_slice> directly because it will be called
396     from L<"put_slice"> when needed.
397    
398     =cut
399    
400     sub create_slice {
401     my $self = shift;
402    
403     my $path = shift || confess "create_slice need path!";
404    
405     my $s = $self->in_slice($path) || confess "in_slice returned null";
406    
407     return $s if (exists($self->{'slice'}->{$s}));
408    
409     my $swish_config = $self->make_config($s);
410    
411     print STDERR "creating slice $s\n"; # FIXME
412    
413     my @swish = qw(swish-e -S prog -c);
414     push @swish, $swish_config;
415    
416     ## Build the harness, open all pipes, and launch the subprocesses
417     $self->{'slice'}->{$s}->{'h'} = start \@swish,
418     \$self->{'slice'}->{$s}->{'in'},
419     \$self->{'slice'}->{$s}->{'out'},
420     \$self->{'slice'}->{$s}->{'err'},
421     timeout( 90 ); # FIXME
422    
423     $self->{'slice'}->{$s}->{'out_len'} = 0;
424     $self->{'slice'}->{$s}->{'err_len'} = 0;
425    
426     $self->slice_output($s);
427    
428     return $s;
429     }
430    
431     =head2 put_slice
432    
433     Pass XML data to swish and receive output and errors.
434    
435     my ($out,$err) = $i->put_slice('/swish/path', '<xml>data</xml>');
436    
437     =cut
438    
439     sub put_slice {
440     my $self = shift;
441    
442     my $path = shift || confess "need path";
443     my $xml = shift || confess "need xml";
444    
445     $xml = $iso2utf->convert($xml) || carp "XML conversion error in $xml";
446    
447     my $s = $self->create_slice($path) || confess "create_slice returned null";
448    
449     confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
450     confess "no 'in' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'in'}));
451     confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));
452    
453     $self->slice_output($s);
454    
455     use bytes; # as opposed to chars
456     $self->{'slice'}->{$s}->{'in'} .=
457     "Path-Name: $path\n".
458     "Content-Length: ".(length($xml)+1)."\n".
459     "Document-Type: XML\n\n$xml\n";
460    
461     # do I/O
462     $self->{'slice'}->{$s}->{'h'}->pump while length $self->{'slice'}->{$s}->{'in'} ; # wait for all input to go
463    
464     $self->slice_output($s);
465    
466     return $s;
467     }
468    
469     =head2 slice_output
470    
471     Prints to STDERR output and errors from C<swish-e>.
472    
473     $i->slice_output($s);
474    
475     Normally, you don't need to call it.
476    
477     =cut
478    
479     sub slice_output {
480     my $self = shift;
481    
482     my $s = shift || confess "slice_output needs slice";
483    
484     confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
485     confess "no 'in' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'in'}));
486     confess "no 'out' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'out'}));
487    
488     if (length $self->{'slice'}->{$s}->{'out'} > $self->{'slice'}->{$s}->{'out_len'}) {
489     #print STDERR "swish-e OUT: ",$self->{'slice'}->{$s}->{'out'},"\n" if ($self->{'slice'}->{$s}->{'out'});
490     $self->{'slice'}->{$s}->{'out_len'} = length $self->{'slice'}->{$s}->{'out'};
491     return 1;
492     } elsif (length $self->{'slice'}->{$s}->{'err'} > $self->{'slice'}->{$s}->{'err_len'}) {
493     print STDERR "swish-e ERR: ",$self->{'slice'}->{$s}->{'err'},"\n" if ($self->{'slice'}->{$s}->{'err'});
494     $self->{'slice'}->{$s}->{'err_len'} = length $self->{'slice'}->{$s}->{'err'};
495     # this is fatal
496     return 0;
497     }
498    
499     return 1;
500     }
501    
502     =head2 close_slice {
503    
504     Close slice (terminates swish-e process for that slice).
505    
506     my $i->close_slice($s);
507    
508     Returns true if slice is closed, false otherwise.
509    
510     =cut
511    
512     sub close_slice {
513     my $self = shift;
514    
515     my $s = shift || confess "close_slice needs slice";
516    
517     confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
518     confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));
519    
520     # pump rest of content (if any)
521     $self->{'slice'}->{$s}->{'h'}->pump while length $self->{'slice'}->{$s}->{'in'};
522    
523     $self->slice_output($s);
524    
525     # clean up
526     $self->{'slice'}->{$s}->{'h'}->finish or confess "finish on slice $s returned: $?";
527    
528     delete($self->{'slice'}->{$s}) && return 1;
529     return 0;
530     }
531    
532     =head2 to_xml
533    
534     Convert (binary safe, I hope) your data into XML for C<swish-e>.
535     Data will not yet be recoded to UTF-8. L<"put_slice"> will do that.
536    
537     my $xml = $i->to_xml({ foo => 'bar' });
538    
539     This function is extracted from L<"add"> method so that you can L<Memoize> it.
540     If your data set has a lot of repeatable data, and memory is not a problem, you
541     can add C<memoize_to_xml> option to L<"open">.
542    
543     =cut
544    
545     my %escape = ('<'=>'&lt;', '>'=>'&gt;', '&'=>'&amp;', '"'=>'&quot;');
546     my $escape_re = join '|' => keys %escape;
547    
548     sub to_xml {
549     my $self = shift;
550    
551     my $data = shift || return;
552    
553     my $xml = qq{<xml>};
554     foreach my $tag (keys %$data) {
555     my $content = $data->{$tag};
556     next if (! $content || $content eq '');
557     # save [cr/]lf before conversion to XML
558     # $content =~ s/\n\r/##lf##/gs;
559     # $content =~ s/\n/##lf##/gs;
560     $content =~ s/($escape_re)/$escape{$1}/gs;
561     $xml .= "<$tag><![CDATA[".$content."]]></$tag>";
562     }
563     $xml .= qq{</xml>};
564     }
565    
566 dpavlin 1 1;
567     __END__
568    
569    
570     =head2 Searching
571    
572     Searching is still conducted using L<SWISH::API>, but you have to glob
573     index names.
574    
575     use SWISH::API;
576    
577     my $swish = SWISH::API->new( glob('index.swish-e/*') );
578    
579     You can also alternativly create merged index (using C<merge> option) and
580     not change your source code at all.
581    
582     That would also benefit performance, but it increases indexing time
583     because merged indexes must be re-created on each indexing run.
584    
585     =head2 EXPORT
586    
587 dpavlin 4 Nothing by default.
588 dpavlin 1
589 dpavlin 4 =head2 EXAMPLES
590 dpavlin 1
591 dpavlin 4 Test script for this module uses all parts of API. It's also nice example
592     how to use C<SWISH::Split>.
593 dpavlin 1
594     =head1 SEE ALSO
595    
596     L<SWISH::API>,
597     L<http://www.swish-e.org/>
598    
599     =head1 AUTHOR
600    
601     Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>
602    
603     =head1 COPYRIGHT AND LICENSE
604    
605     Copyright (C) 2004 by Dobrica Pavlinusic
606    
607     This library is free software; you can redistribute it and/or modify
608     it under the same terms as Perl itself, either Perl version 5.8.4 or,
609     at your option, any later version of Perl 5 you may have available.
610    
611    
612     =cut

  ViewVC Help
Powered by ViewVC 1.1.26