/[SWISH-Split]/trunk/Split.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/Split.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 11 - (hide annotations)
Fri Apr 29 22:50:16 2005 UTC (18 years, 11 months ago) by dpavlin
File size: 12940 byte(s)
some cleanups

1 dpavlin 1 package SWISH::Split;
2    
3     use 5.008;
4     use strict;
5     use warnings;
6    
7 dpavlin 9 our $VERSION = '0.03';
8 dpavlin 1
9     use SWISH::API;
10     use Text::Iconv;
11     use File::Temp qw/ :mktemp /;
12     use Carp;
13     use Digest::MD5 qw(md5_hex);
14     use Memoize;
15 dpavlin 4 use File::Which;
16 dpavlin 1
17     use Data::Dumper;
18    
19 dpavlin 5 use constant {
20     ADDED => 1,
21     DELETED => 2,
22     };
23    
24 dpavlin 1 =head1 NAME
25    
26     SWISH::Split - Perl interface to split index variant of Swish-e
27    
28     =head1 SYNOPSIS
29    
30     use SWISH::Split;
31    
32    
33     =head1 DESCRIPTION
34    
35     This is alternative interface for indexing data with swish-e. It's designed
36 dpavlin 4 to split indexes over multiple files (slices) to allow updates of records in index
37     by reindexing just changed parts (slice).
38 dpavlin 1
39     Data is stored in index using intrface which is somewhat similar to
40     L<Plucene::Simple>. This could make your migration (or supporting two index
41     engines) easier.
42    
43     In the background, it will fork swish-e binaries (one for each index slice)
44 dpavlin 6 and produce UTF-8 encoded XML files for it. So, if your input charset isn't
45 dpavlin 1 C<ISO-8859-1> you will have to specify it.
46    
47     =head1 Methods used for indexing
48    
49 dpavlin 8 =head2 open_index
50 dpavlin 1
51     Create new object for index.
52    
53 dpavlin 8 my $i = SWISH::Split->open_index({
54 dpavlin 1 index => '/path/to/index',
55     slice_name => \&slice_on_path,
56     slices => 30,
57 dpavlin 4 merge => 0,
58     codepage => 'ISO-8859-2',
59     swish_config => qq{
60     PropertyNames from date
61     PropertyNamesDate date
62     },
63     memoize_to_xml => 0,
64 dpavlin 1 );
65    
66     # split index on first component of path
67     sub slice_on_path {
68     return shift split(/\//,$_[0]);
69     }
70    
71 dpavlin 8 Options to C<open_index> are following:
72 dpavlin 1
73 dpavlin 4 =over 5
74    
75     =item C<index>
76    
77     path to (existing) directory in which index slices will be created.
78    
79     =item C<slice_name>
80    
81     coderef to function which provide slicing from path.
82    
83     =item C<slices>
84    
85     maximum number of index slices. See L<"in_slice"> for
86 dpavlin 1 more explanation.
87    
88 dpavlin 4 =item C<merge>
89    
90     (planned) option to merge indexes into one at end.
91    
92     =item C<codepage>
93    
94     data codepage (needed for conversion to UTF-8).
95     By default, it's C<ISO-8859-1>.
96    
97     =item C<swish_config>
98    
99     additional parametars which will be inserted into
100 dpavlin 11 C<swish-e> configuration file. See C<swish-config>.
101 dpavlin 4
102     =item C<memoize_to_xml>
103    
104     speed up repeatable data, see L<"to_xml">.
105    
106     =back
107    
108 dpavlin 1 =cut
109    
110     my $iso2utf = Text::Iconv->new('ISO-8859-1','UTF-8');
111    
112 dpavlin 8 sub open_index {
113 dpavlin 1 my $class = shift;
114     my $self = {@_};
115     bless($self, $class);
116    
117     croak "need slice_name coderef" unless ref $self->{'slice_name'};
118     croak "need slices" unless $self->{'slices'};
119    
120     croak "need index" unless $self->{'index'};
121     croak "index '",$self->{'index'},"' doesn't exist" unless -e $self->{'index'};
122     croak "index '",$self->{'index'},"' is not directory" unless -d $self->{'index'};
123    
124     $iso2utf = Text::Iconv->new($self->{'codepage'},'UTF-8') if ($self->{'codepage'});
125    
126 dpavlin 4 # speedup
127 dpavlin 1 memoize('in_slice');
128 dpavlin 4 memoize('to_xml') if ($self->{'memoize_to_xml'});
129 dpavlin 1
130     $self ? return $self : return undef;
131    
132     }
133    
134     =head2 add
135    
136     Add document to index.
137    
138     $i->add($swishpath, {
139     headline => 'foobar result',
140     property => 'data',
141     })
142    
143     =cut
144    
145     sub add {
146     my $self = shift;
147 dpavlin 3
148     my $swishpath = shift || return;
149     my $data = shift || return;
150    
151 dpavlin 7 my $slice = $self->put_slice($swishpath, $self->to_xml($data));
152 dpavlin 4
153 dpavlin 7 return $slice;
154 dpavlin 1 }
155    
156     =head2 delete
157    
158 dpavlin 3 Delete documents from index.
159 dpavlin 1
160 dpavlin 3 $i->delete(@swishpath);
161 dpavlin 1
162     =cut
163    
164     sub delete {
165     my $self = shift;
166 dpavlin 3
167     my @paths = @_ || return;
168    
169 dpavlin 5 foreach my $path (@paths) {
170     $self->{'paths'}->{$path} = DELETED;
171     }
172    
173 dpavlin 3 return 42;
174 dpavlin 1 }
175    
176    
177 dpavlin 5 =head2 done
178 dpavlin 1
179 dpavlin 4 Finish indexing and close index file(s).
180 dpavlin 1
181 dpavlin 5 $i->done;
182 dpavlin 1
183     This is most time-consuming operation. When it's called, it will re-index
184     all entries which haven't changed in all slices.
185    
186 dpavlin 4 Returns number of slices updated.
187    
188 dpavlin 5 This method should really be called close or finish, but both of those are
189     allready used.
190    
191 dpavlin 1 =cut
192    
193 dpavlin 5 sub done {
194 dpavlin 1 my $self = shift;
195 dpavlin 3
196 dpavlin 4 my $ret = 0;
197    
198     foreach my $s (keys %{$self->{'slice'}}) {
199 dpavlin 8 $self->_debug("closing slice $s");
200 dpavlin 4 $ret += $self->close_slice($s);
201     }
202    
203     return $ret;
204 dpavlin 1 }
205    
206    
207    
208     =head1 Reporting methods
209    
210     This methods return statistics about your index.
211    
212     =head2 swishpaths
213    
214     Return array of C<swishpath>s in index.
215    
216     my @p = $i->swishpaths;
217    
218     =cut
219    
220     sub swishpaths {
221     my $self = shift;
222 dpavlin 5
223     my $s = shift || return;
224     return if (! exists($self->{'slice'}->{'s'}));
225    
226     return keys %{$self->{'slice'}->{'s'}};
227 dpavlin 1 }
228    
229     =head2 swishpaths_updated
230    
231     Return array with updated C<swishpath>s.
232    
233     my @d = $i->swishpaths_updated;
234    
235     =cut
236    
237     sub swishpaths_updated {
238     my $self = shift;
239     }
240    
241    
242     =head2 swishpaths_deleted
243    
244     Return array with deleted C<swishpath>s.
245    
246     my $n = $i->swishpaths_deleted;
247    
248     =cut
249    
250     sub swishpaths_deleted {
251     my $self = shift;
252     }
253    
254    
255     =head2 slices
256    
257     Return array with all slice names.
258    
259     my @s = $i->slices;
260    
261     =cut
262    
263     sub slices {
264     my $self = shift;
265     }
266    
267     =head1 Helper methods
268    
269     This methods are used internally, but they might be useful.
270    
271     =head2 in_slice
272    
273     Takes path and return slice in which this path belongs.
274    
275     my $s = $i->in_slice('path/to/document/in/index');
276    
277 dpavlin 8 If there are C<slices> parametar to L<"open_index"> it will use
278 dpavlin 1 MD5 hash to spread documents across slices. That will produce random
279     distribution of your documents in slices, which might or might not be best
280     for your data. If you have to re-index large number of slices on each
281     run, think about creating your own C<slice> function and distributing
282     documents manually across slices.
283    
284 dpavlin 4 Slice number must always be true value or various sanity checks will fail.
285    
286 dpavlin 1 This function is C<Memoize>ed for performance reasons.
287    
288     =cut
289    
290     sub in_slice {
291     my $self = shift;
292    
293     my $path = shift || confess "need path";
294    
295     confess "need slice_name function" unless ref ($self->{'slice_name'});
296    
297     if ($self->{'slices'}) {
298     # first, pass path through slice_name function
299     my $slice = &{$self->{'slice_name'}}($path);
300     # then calculate MD5 hash
301 dpavlin 8 my $hash = md5_hex($slice);
302 dpavlin 1 # take first 8 chars to produce number
303     # FIXME how random is this?
304 dpavlin 8 $hash = hex(substr($hash,0,8));
305 dpavlin 1
306 dpavlin 8 $slice = ($hash % $self->{'slices'}) + 1;
307     $self->_debug("hash: $hash / ",$self->{'slices'}," => $slice");
308 dpavlin 4 return $slice;
309 dpavlin 1 } else {
310     return &{$self->{'split'}}($path);
311     }
312     }
313    
314 dpavlin 3 =head2 find_paths
315 dpavlin 1
316 dpavlin 3 Return array of C<swishpath>s for given C<swish-e> query.
317 dpavlin 1
318 dpavlin 3 my @p = $i->find_paths("headline=test*");
319    
320     Useful for combining with L<"delete_documents"> to delete documents
321     which hasn't changed a while (so, expired).
322    
323     =cut
324    
325     sub find_paths {
326     my $self = shift;
327    
328     }
329    
330    
331 dpavlin 4 =head2 make_config
332 dpavlin 3
333 dpavlin 4 Create C<swish-e> configuration file for given slice.
334    
335     my $config_filename = $i->make_config('slice name');
336    
337     It returns configuration filename. If no C<swish_config> was defined in
338 dpavlin 8 L<"open_index">, default swish-e configuration will be used. It will index all data for
339 dpavlin 4 searching, but none for properties.
340    
341     If you want to see what is allready defined for swish-e in configuration
342     take a look at source code for C<DEFAULT_SWISH_CONF>.
343    
344 dpavlin 7 It uses C<stdin> as C<IndexDir> to comunicate with C<swish-e>.
345 dpavlin 4
346     =cut
347    
348     sub make_config {
349     my $self = shift;
350    
351    
352     my $index_file = $self->{'index'}."/";
353     $index_file .= shift || confess "need slice name";
354    
355     my ($tmp_fh, $swish_config_filename) = mkstemp("/tmp/swishXXXXX");
356    
357     # find cat on filesystem
358     my $cat = which('cat');
359    
360     print $tmp_fh <<"DEFAULT_SWISH_CONF";
361     # swish-e config file
362    
363 dpavlin 7 IndexDir stdin
364 dpavlin 4
365     # input file definition
366     DefaultContents XML*
367    
368     # indexed metatags
369     MetaNames xml swishdocpath
370    
371    
372     #XMLClassAttributes type
373     UndefinedMetaTags auto
374     UndefinedXMLAttributes auto
375    
376     IndexFile $index_file
377    
378     # Croatian ISO-8859-2 characters to unaccented equivalents
379     TranslateCharacters ¹©ðÐèÈæƾ® ssddcccczz
380    
381    
382     # disable output
383     ParserWarnLevel 0
384     IndexReport 1
385    
386     DEFAULT_SWISH_CONF
387    
388     # add user parametars (like stored properties)
389     print $tmp_fh $self->{'swish_config'} if ($self->{'swish_config'});
390    
391     close($tmp_fh);
392    
393     return $swish_config_filename;
394     }
395    
396     =head2 create_slice
397    
398 dpavlin 8 On first run, starts C<swish-e>. On subsequent calls just return
399 dpavlin 11 it's handles using C<Memoize>.
400 dpavlin 4
401     my $s = create_slice('/path/to/document');
402    
403     You shouldn't need to call C<create_slice> directly because it will be called
404     from L<"put_slice"> when needed.
405    
406     =cut
407    
408     sub create_slice {
409     my $self = shift;
410    
411     my $path = shift || confess "create_slice need path!";
412    
413     my $s = $self->in_slice($path) || confess "in_slice returned null";
414    
415     return $s if (exists($self->{'slice'}->{$s}));
416    
417     my $swish_config = $self->make_config($s);
418    
419 dpavlin 8 my $swish = qq{| swish-e };
420 dpavlin 9 if (-f $self->{'index'}.'/'.$s) {
421     $swish .= qq{ -u };
422     $self->{'slice'}->{$s}->{'update_mode'}++;
423     }
424 dpavlin 8 $swish .= qq{ -S prog -c } . $swish_config;
425 dpavlin 4
426 dpavlin 8 $self->_debug("creating slice $s using $swish");
427 dpavlin 4
428     ## Build the harness, open all pipes, and launch the subprocesses
429 dpavlin 8 open(my $fh, $swish) || croak "can't open $swish: $!";
430 dpavlin 4
431 dpavlin 8 $self->{'slice'}->{$s}->{'h'} = $fh;
432 dpavlin 4
433     $self->slice_output($s);
434    
435     return $s;
436     }
437    
438     =head2 put_slice
439    
440 dpavlin 7 Pass XML data to swish.
441 dpavlin 4
442 dpavlin 7 my $slice = $i->put_slice('/swish/path', '<xml>data</xml>');
443 dpavlin 4
444 dpavlin 7 Returns slice in which XML ended up.
445    
446 dpavlin 4 =cut
447    
448     sub put_slice {
449     my $self = shift;
450    
451     my $path = shift || confess "need path";
452     my $xml = shift || confess "need xml";
453    
454     $xml = $iso2utf->convert($xml) || carp "XML conversion error in $xml";
455    
456     my $s = $self->create_slice($path) || confess "create_slice returned null";
457    
458     confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
459     confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));
460    
461     $self->slice_output($s);
462    
463     use bytes; # as opposed to chars
464 dpavlin 8 my $fh = $self->{'slice'}->{$s}->{'h'} || confess "handle for slice $s undefined";
465 dpavlin 9
466     my $update_header = "Update-Mode: Index\n";
467     $update_header = '' unless ($self->{'slice'}->{$s}->{'update_mode'});
468    
469 dpavlin 8 print { $fh } "Path-Name: $path\n".
470 dpavlin 9 "Content-Length: ".(length($xml)+1)."\n" . $update_header .
471 dpavlin 4 "Document-Type: XML\n\n$xml\n";
472    
473     $self->slice_output($s);
474    
475 dpavlin 8 $self->_debug("dumping in slice $s: $path");
476    
477 dpavlin 5 $self->{'paths'}->{$path} = ADDED;
478    
479 dpavlin 4 return $s;
480     }
481    
482     =head2 slice_output
483    
484     Prints to STDERR output and errors from C<swish-e>.
485    
486 dpavlin 7 my $slice = $i->slice_output($s);
487 dpavlin 4
488     Normally, you don't need to call it.
489    
490 dpavlin 8 B<This is dummy placeholder function for very old code that assumes this
491     module is using C<IPC::Run> which it isn't any more.>
492    
493 dpavlin 4 =cut
494    
495     sub slice_output {
496     my $self = shift;
497    
498     my $s = shift || confess "slice_output needs slice";
499    
500     confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
501    
502 dpavlin 8 # FIXME
503 dpavlin 4
504 dpavlin 7 return $s;
505 dpavlin 4 }
506    
507 dpavlin 5 =head2 close_slice
508 dpavlin 4
509     Close slice (terminates swish-e process for that slice).
510    
511     my $i->close_slice($s);
512    
513     Returns true if slice is closed, false otherwise.
514    
515     =cut
516    
517     sub close_slice {
518     my $self = shift;
519    
520     my $s = shift || confess "close_slice needs slice";
521    
522     confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
523     confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));
524    
525     # pump rest of content (if any)
526 dpavlin 8 close $self->{'slice'}->{$s}->{'h'} || carp "can't close slice $s: $!";
527 dpavlin 4
528     $self->slice_output($s);
529    
530 dpavlin 8 undef $self->{'slice'}->{$s}->{'h'};
531 dpavlin 4
532     delete($self->{'slice'}->{$s}) && return 1;
533     return 0;
534     }
535    
536     =head2 to_xml
537    
538     Convert (binary safe, I hope) your data into XML for C<swish-e>.
539     Data will not yet be recoded to UTF-8. L<"put_slice"> will do that.
540    
541     my $xml = $i->to_xml({ foo => 'bar' });
542    
543 dpavlin 11 This function is extracted from L<"add"> method so that you can C<Memoize> it.
544 dpavlin 4 If your data set has a lot of repeatable data, and memory is not a problem, you
545 dpavlin 8 can add C<memoize_to_xml> option to L<"open_index">.
546 dpavlin 4
547     =cut
548    
549     my %escape = ('<'=>'&lt;', '>'=>'&gt;', '&'=>'&amp;', '"'=>'&quot;');
550     my $escape_re = join '|' => keys %escape;
551    
552     sub to_xml {
553     my $self = shift;
554    
555     my $data = shift || return;
556    
557     my $xml = qq{<xml>};
558     foreach my $tag (keys %$data) {
559     my $content = $data->{$tag};
560     next if (! $content || $content eq '');
561     # save [cr/]lf before conversion to XML
562     # $content =~ s/\n\r/##lf##/gs;
563     # $content =~ s/\n/##lf##/gs;
564     $content =~ s/($escape_re)/$escape{$1}/gs;
565     $xml .= "<$tag><![CDATA[".$content."]]></$tag>";
566     }
567     $xml .= qq{</xml>};
568     }
569    
570 dpavlin 8 sub _debug {
571     my $self = shift;
572     print STDERR "## ",@_,"\n" if ($self->{'debug'});
573     return;
574     }
575    
576 dpavlin 1 1;
577     __END__
578    
579    
580 dpavlin 5 =head1 Searching
581 dpavlin 1
582     Searching is still conducted using L<SWISH::API>, but you have to glob
583     index names.
584    
585     use SWISH::API;
586    
587     my $swish = SWISH::API->new( glob('index.swish-e/*') );
588    
589     You can also alternativly create merged index (using C<merge> option) and
590     not change your source code at all.
591    
592     That would also benefit performance, but it increases indexing time
593     because merged indexes must be re-created on each indexing run.
594    
595 dpavlin 5 =head1 EXPORT
596 dpavlin 1
597 dpavlin 4 Nothing by default.
598 dpavlin 1
599 dpavlin 5 =head1 EXAMPLES
600 dpavlin 1
601 dpavlin 4 Test script for this module uses all parts of API. It's also nice example
602     how to use C<SWISH::Split>.
603 dpavlin 1
604     =head1 SEE ALSO
605    
606     L<SWISH::API>,
607     L<http://www.swish-e.org/>
608    
609     =head1 AUTHOR
610    
611     Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>
612    
613     =head1 COPYRIGHT AND LICENSE
614    
615     Copyright (C) 2004 by Dobrica Pavlinusic
616    
617     This library is free software; you can redistribute it and/or modify
618     it under the same terms as Perl itself, either Perl version 5.8.4 or,
619     at your option, any later version of Perl 5 you may have available.
620    
621    
622     =cut

  ViewVC Help
Powered by ViewVC 1.1.26