/[SWISH-Split]/trunk/Split.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/Split.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 7 - (hide annotations)
Fri Dec 17 18:32:34 2004 UTC (19 years, 4 months ago) by dpavlin
File size: 13727 byte(s)
a lot of changes:
- better testing framework
- changed put_slice API (to actually confirm with documentation)
- use swish-e stdin instead of external cat utility
- added tags target

1 dpavlin 1 package SWISH::Split;
2    
3     use 5.008;
4     use strict;
5     use warnings;
6    
7 dpavlin 7 our $VERSION = '0.01';
8 dpavlin 1
9     use SWISH::API;
10     use Text::Iconv;
11     use File::Temp qw/ :mktemp /;
12     use Carp;
13     use Digest::MD5 qw(md5_hex);
14     use Memoize;
15 dpavlin 4 use IPC::Run qw(start timeout pump finish);
16     use File::Which;
17 dpavlin 1
18     use Data::Dumper;
19    
20 dpavlin 5 use constant {
21     ADDED => 1,
22     DELETED => 2,
23     };
24    
25 dpavlin 1 =head1 NAME
26    
27     SWISH::Split - Perl interface to split index variant of Swish-e
28    
29     =head1 SYNOPSIS
30    
31     use SWISH::Split;
32    
33    
34     =head1 DESCRIPTION
35    
36     This is alternative interface for indexing data with swish-e. It's designed
37 dpavlin 4 to split indexes over multiple files (slices) to allow updates of records in index
38     by reindexing just changed parts (slice).
39 dpavlin 1
40     Data is stored in index using intrface which is somewhat similar to
41     L<Plucene::Simple>. This could make your migration (or supporting two index
42     engines) easier.
43    
44     In the background, it will fork swish-e binaries (one for each index slice)
45 dpavlin 6 and produce UTF-8 encoded XML files for it. So, if your input charset isn't
46 dpavlin 1 C<ISO-8859-1> you will have to specify it.
47    
48     =head1 Methods used for indexing
49    
50     =head2 open
51    
52     Create new object for index.
53    
54     my $i = SWISH::Split->open({
55     index => '/path/to/index',
56     slice_name => \&slice_on_path,
57     slices => 30,
58 dpavlin 4 merge => 0,
59     codepage => 'ISO-8859-2',
60     swish_config => qq{
61     PropertyNames from date
62     PropertyNamesDate date
63     },
64     memoize_to_xml => 0,
65 dpavlin 1 );
66    
67     # split index on first component of path
68     sub slice_on_path {
69     return shift split(/\//,$_[0]);
70     }
71    
72 dpavlin 4 Options to open are following:
73 dpavlin 1
74 dpavlin 4 =over 5
75    
76     =item C<index>
77    
78     path to (existing) directory in which index slices will be created.
79    
80     =item C<slice_name>
81    
82     coderef to function which provide slicing from path.
83    
84     =item C<slices>
85    
86     maximum number of index slices. See L<"in_slice"> for
87 dpavlin 1 more explanation.
88    
89 dpavlin 4 =item C<merge>
90    
91     (planned) option to merge indexes into one at end.
92    
93     =item C<codepage>
94    
95     data codepage (needed for conversion to UTF-8).
96     By default, it's C<ISO-8859-1>.
97    
98     =item C<swish_config>
99    
100     additional parametars which will be inserted into
101     C<swish-e> configuration file. See L<swish-config>.
102    
103     =item C<memoize_to_xml>
104    
105     speed up repeatable data, see L<"to_xml">.
106    
107     =back
108    
109 dpavlin 1 =cut
110    
111     my $iso2utf = Text::Iconv->new('ISO-8859-1','UTF-8');
112    
113     sub open {
114     my $class = shift;
115     my $self = {@_};
116     bless($self, $class);
117    
118     croak "need slice_name coderef" unless ref $self->{'slice_name'};
119     croak "need slices" unless $self->{'slices'};
120    
121     croak "need index" unless $self->{'index'};
122     croak "index '",$self->{'index'},"' doesn't exist" unless -e $self->{'index'};
123     croak "index '",$self->{'index'},"' is not directory" unless -d $self->{'index'};
124    
125     $iso2utf = Text::Iconv->new($self->{'codepage'},'UTF-8') if ($self->{'codepage'});
126    
127 dpavlin 4 # speedup
128 dpavlin 1 memoize('in_slice');
129 dpavlin 4 memoize('to_xml') if ($self->{'memoize_to_xml'});
130 dpavlin 1
131     $self ? return $self : return undef;
132    
133     }
134    
135     =head2 add
136    
137     Add document to index.
138    
139     $i->add($swishpath, {
140     headline => 'foobar result',
141     property => 'data',
142     })
143    
144     =cut
145    
146     sub add {
147     my $self = shift;
148 dpavlin 3
149     my $swishpath = shift || return;
150     my $data = shift || return;
151    
152 dpavlin 7 my $slice = $self->put_slice($swishpath, $self->to_xml($data));
153 dpavlin 4
154 dpavlin 7 # if ($err) {
155     # carp "$swishpath: $err";
156     # return undef;
157     # }
158 dpavlin 4
159 dpavlin 7 return $slice;
160 dpavlin 1 }
161    
162     =head2 delete
163    
164 dpavlin 3 Delete documents from index.
165 dpavlin 1
166 dpavlin 3 $i->delete(@swishpath);
167 dpavlin 1
168     =cut
169    
170     sub delete {
171     my $self = shift;
172 dpavlin 3
173     my @paths = @_ || return;
174    
175 dpavlin 5 foreach my $path (@paths) {
176     $self->{'paths'}->{$path} = DELETED;
177     }
178    
179 dpavlin 3 return 42;
180 dpavlin 1 }
181    
182    
183 dpavlin 5 =head2 done
184 dpavlin 1
185 dpavlin 4 Finish indexing and close index file(s).
186 dpavlin 1
187 dpavlin 5 $i->done;
188 dpavlin 1
189     This is most time-consuming operation. When it's called, it will re-index
190     all entries which haven't changed in all slices.
191    
192 dpavlin 4 Returns number of slices updated.
193    
194 dpavlin 5 This method should really be called close or finish, but both of those are
195     allready used.
196    
197 dpavlin 1 =cut
198    
199 dpavlin 5 sub done {
200 dpavlin 1 my $self = shift;
201 dpavlin 3
202 dpavlin 4 my $ret = 0;
203    
204     foreach my $s (keys %{$self->{'slice'}}) {
205 dpavlin 5 print STDERR "closing slice $s\n";
206 dpavlin 4 $ret += $self->close_slice($s);
207     }
208    
209     return $ret;
210 dpavlin 1 }
211    
212    
213    
214     =head1 Reporting methods
215    
216     This methods return statistics about your index.
217    
218     =head2 swishpaths
219    
220     Return array of C<swishpath>s in index.
221    
222     my @p = $i->swishpaths;
223    
224     =cut
225    
226     sub swishpaths {
227     my $self = shift;
228 dpavlin 5
229     my $s = shift || return;
230     return if (! exists($self->{'slice'}->{'s'}));
231    
232     return keys %{$self->{'slice'}->{'s'}};
233 dpavlin 1 }
234    
235     =head2 swishpaths_updated
236    
237     Return array with updated C<swishpath>s.
238    
239     my @d = $i->swishpaths_updated;
240    
241     =cut
242    
243     sub swishpaths_updated {
244     my $self = shift;
245     }
246    
247    
248     =head2 swishpaths_deleted
249    
250     Return array with deleted C<swishpath>s.
251    
252     my $n = $i->swishpaths_deleted;
253    
254     =cut
255    
256     sub swishpaths_deleted {
257     my $self = shift;
258     }
259    
260    
261     =head2 slices
262    
263     Return array with all slice names.
264    
265     my @s = $i->slices;
266    
267     =cut
268    
269     sub slices {
270     my $self = shift;
271     }
272    
273     =head1 Helper methods
274    
275     This methods are used internally, but they might be useful.
276    
277     =head2 in_slice
278    
279     Takes path and return slice in which this path belongs.
280    
281     my $s = $i->in_slice('path/to/document/in/index');
282    
283     If there are C<slices> parametar to L<"open"> it will use
284     MD5 hash to spread documents across slices. That will produce random
285     distribution of your documents in slices, which might or might not be best
286     for your data. If you have to re-index large number of slices on each
287     run, think about creating your own C<slice> function and distributing
288     documents manually across slices.
289    
290 dpavlin 4 Slice number must always be true value or various sanity checks will fail.
291    
292 dpavlin 1 This function is C<Memoize>ed for performance reasons.
293    
294     =cut
295    
296     sub in_slice {
297     my $self = shift;
298    
299     my $path = shift || confess "need path";
300    
301     confess "need slice_name function" unless ref ($self->{'slice_name'});
302    
303     if ($self->{'slices'}) {
304     # first, pass path through slice_name function
305     my $slice = &{$self->{'slice_name'}}($path);
306     # then calculate MD5 hash
307     $slice = md5_hex($slice);
308     # take first 8 chars to produce number
309     # FIXME how random is this?
310     $slice = hex(substr($slice,0,8));
311    
312 dpavlin 4 $slice = ($slice % $self->{'slices'}) + 1;
313     print "hash: $slice / ",$self->{'slices'}," => $slice\n";
314     return $slice;
315 dpavlin 1 } else {
316     return &{$self->{'split'}}($path);
317     }
318     }
319    
320 dpavlin 3 =head2 find_paths
321 dpavlin 1
322 dpavlin 3 Return array of C<swishpath>s for given C<swish-e> query.
323 dpavlin 1
324 dpavlin 3 my @p = $i->find_paths("headline=test*");
325    
326     Useful for combining with L<"delete_documents"> to delete documents
327     which hasn't changed a while (so, expired).
328    
329     =cut
330    
331     sub find_paths {
332     my $self = shift;
333    
334     }
335    
336    
337 dpavlin 4 =head2 make_config
338 dpavlin 3
339 dpavlin 4 Create C<swish-e> configuration file for given slice.
340    
341     my $config_filename = $i->make_config('slice name');
342    
343     It returns configuration filename. If no C<swish_config> was defined in
344     L<"open">, default swish-e configuration will be used. It will index all data for
345     searching, but none for properties.
346    
347     If you want to see what is allready defined for swish-e in configuration
348     take a look at source code for C<DEFAULT_SWISH_CONF>.
349    
350 dpavlin 7 It uses C<stdin> as C<IndexDir> to comunicate with C<swish-e>.
351 dpavlin 4
352     =cut
353    
354     sub make_config {
355     my $self = shift;
356    
357    
358     my $index_file = $self->{'index'}."/";
359     $index_file .= shift || confess "need slice name";
360    
361     my ($tmp_fh, $swish_config_filename) = mkstemp("/tmp/swishXXXXX");
362    
363     # find cat on filesystem
364     my $cat = which('cat');
365    
366     print $tmp_fh <<"DEFAULT_SWISH_CONF";
367     # swish-e config file
368    
369 dpavlin 7 IndexDir stdin
370 dpavlin 4
371     # input file definition
372     DefaultContents XML*
373    
374     # indexed metatags
375     MetaNames xml swishdocpath
376    
377    
378     #XMLClassAttributes type
379     UndefinedMetaTags auto
380     UndefinedXMLAttributes auto
381    
382     IndexFile $index_file
383    
384     # Croatian ISO-8859-2 characters to unaccented equivalents
385     TranslateCharacters ¹©ðÐèÈæƾ® ssddcccczz
386    
387    
388     # disable output
389     ParserWarnLevel 0
390     IndexReport 1
391    
392     DEFAULT_SWISH_CONF
393    
394     # add user parametars (like stored properties)
395     print $tmp_fh $self->{'swish_config'} if ($self->{'swish_config'});
396    
397     close($tmp_fh);
398    
399     return $swish_config_filename;
400     }
401    
402     =head2 create_slice
403    
404     On first run, starts C<swish-e> using L<IPC::Run>. On subsequent calls just return
405     it's handles using L<Memoize>.
406    
407     my $s = create_slice('/path/to/document');
408    
409     You shouldn't need to call C<create_slice> directly because it will be called
410     from L<"put_slice"> when needed.
411    
412     =cut
413    
414     sub create_slice {
415     my $self = shift;
416    
417     my $path = shift || confess "create_slice need path!";
418    
419     my $s = $self->in_slice($path) || confess "in_slice returned null";
420    
421     return $s if (exists($self->{'slice'}->{$s}));
422    
423     my $swish_config = $self->make_config($s);
424    
425     print STDERR "creating slice $s\n"; # FIXME
426    
427 dpavlin 7 my @swish = qw(swish-e -u -S prog -c);
428 dpavlin 4 push @swish, $swish_config;
429    
430     ## Build the harness, open all pipes, and launch the subprocesses
431     $self->{'slice'}->{$s}->{'h'} = start \@swish,
432     \$self->{'slice'}->{$s}->{'in'},
433     \$self->{'slice'}->{$s}->{'out'},
434     \$self->{'slice'}->{$s}->{'err'},
435     timeout( 90 ); # FIXME
436    
437     $self->{'slice'}->{$s}->{'out_len'} = 0;
438     $self->{'slice'}->{$s}->{'err_len'} = 0;
439    
440     $self->slice_output($s);
441    
442     return $s;
443     }
444    
445     =head2 put_slice
446    
447 dpavlin 7 Pass XML data to swish.
448 dpavlin 4
449 dpavlin 7 my $slice = $i->put_slice('/swish/path', '<xml>data</xml>');
450 dpavlin 4
451 dpavlin 7 Returns slice in which XML ended up.
452    
453 dpavlin 4 =cut
454    
455     sub put_slice {
456     my $self = shift;
457    
458     my $path = shift || confess "need path";
459     my $xml = shift || confess "need xml";
460    
461     $xml = $iso2utf->convert($xml) || carp "XML conversion error in $xml";
462    
463     my $s = $self->create_slice($path) || confess "create_slice returned null";
464    
465     confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
466     confess "no 'in' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'in'}));
467     confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));
468    
469     $self->slice_output($s);
470    
471     use bytes; # as opposed to chars
472     $self->{'slice'}->{$s}->{'in'} .=
473     "Path-Name: $path\n".
474     "Content-Length: ".(length($xml)+1)."\n".
475 dpavlin 7 "Update-Mode: Index\n".
476 dpavlin 4 "Document-Type: XML\n\n$xml\n";
477    
478     # do I/O
479     $self->{'slice'}->{$s}->{'h'}->pump while length $self->{'slice'}->{$s}->{'in'} ; # wait for all input to go
480    
481     $self->slice_output($s);
482    
483 dpavlin 5 $self->{'paths'}->{$path} = ADDED;
484    
485 dpavlin 4 return $s;
486     }
487    
488     =head2 slice_output
489    
490     Prints to STDERR output and errors from C<swish-e>.
491    
492 dpavlin 7 my $slice = $i->slice_output($s);
493 dpavlin 4
494     Normally, you don't need to call it.
495    
496     =cut
497    
498     sub slice_output {
499     my $self = shift;
500    
501     my $s = shift || confess "slice_output needs slice";
502    
503     confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
504     confess "no 'in' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'in'}));
505     confess "no 'out' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'out'}));
506    
507     if (length $self->{'slice'}->{$s}->{'out'} > $self->{'slice'}->{$s}->{'out_len'}) {
508     #print STDERR "swish-e OUT: ",$self->{'slice'}->{$s}->{'out'},"\n" if ($self->{'slice'}->{$s}->{'out'});
509     $self->{'slice'}->{$s}->{'out_len'} = length $self->{'slice'}->{$s}->{'out'};
510 dpavlin 7 return $s;
511 dpavlin 4 } elsif (length $self->{'slice'}->{$s}->{'err'} > $self->{'slice'}->{$s}->{'err_len'}) {
512     print STDERR "swish-e ERR: ",$self->{'slice'}->{$s}->{'err'},"\n" if ($self->{'slice'}->{$s}->{'err'});
513     $self->{'slice'}->{$s}->{'err_len'} = length $self->{'slice'}->{$s}->{'err'};
514     # this is fatal
515 dpavlin 7 return undef;
516 dpavlin 4 }
517    
518 dpavlin 7 return $s;
519 dpavlin 4 }
520    
521 dpavlin 5 =head2 close_slice
522 dpavlin 4
523     Close slice (terminates swish-e process for that slice).
524    
525     my $i->close_slice($s);
526    
527     Returns true if slice is closed, false otherwise.
528    
529     =cut
530    
531     sub close_slice {
532     my $self = shift;
533    
534     my $s = shift || confess "close_slice needs slice";
535    
536     confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
537     confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));
538    
539     # pump rest of content (if any)
540     $self->{'slice'}->{$s}->{'h'}->pump while length $self->{'slice'}->{$s}->{'in'};
541    
542     $self->slice_output($s);
543    
544     # clean up
545 dpavlin 7 $self->{'slice'}->{$s}->{'h'}->finish or confess "finish on slice $s returned $?: $! -- ",$self->{'slice'}->{$s}->{'err'};
546 dpavlin 4
547     delete($self->{'slice'}->{$s}) && return 1;
548     return 0;
549     }
550    
551     =head2 to_xml
552    
553     Convert (binary safe, I hope) your data into XML for C<swish-e>.
554     Data will not yet be recoded to UTF-8. L<"put_slice"> will do that.
555    
556     my $xml = $i->to_xml({ foo => 'bar' });
557    
558     This function is extracted from L<"add"> method so that you can L<Memoize> it.
559     If your data set has a lot of repeatable data, and memory is not a problem, you
560     can add C<memoize_to_xml> option to L<"open">.
561    
562     =cut
563    
564     my %escape = ('<'=>'&lt;', '>'=>'&gt;', '&'=>'&amp;', '"'=>'&quot;');
565     my $escape_re = join '|' => keys %escape;
566    
567     sub to_xml {
568     my $self = shift;
569    
570     my $data = shift || return;
571    
572     my $xml = qq{<xml>};
573     foreach my $tag (keys %$data) {
574     my $content = $data->{$tag};
575     next if (! $content || $content eq '');
576     # save [cr/]lf before conversion to XML
577     # $content =~ s/\n\r/##lf##/gs;
578     # $content =~ s/\n/##lf##/gs;
579     $content =~ s/($escape_re)/$escape{$1}/gs;
580     $xml .= "<$tag><![CDATA[".$content."]]></$tag>";
581     }
582     $xml .= qq{</xml>};
583     }
584    
585 dpavlin 1 1;
586     __END__
587    
588    
589 dpavlin 5 =head1 Searching
590 dpavlin 1
591     Searching is still conducted using L<SWISH::API>, but you have to glob
592     index names.
593    
594     use SWISH::API;
595    
596     my $swish = SWISH::API->new( glob('index.swish-e/*') );
597    
598     You can also alternativly create merged index (using C<merge> option) and
599     not change your source code at all.
600    
601     That would also benefit performance, but it increases indexing time
602     because merged indexes must be re-created on each indexing run.
603    
604 dpavlin 5 =head1 EXPORT
605 dpavlin 1
606 dpavlin 4 Nothing by default.
607 dpavlin 1
608 dpavlin 5 =head1 EXAMPLES
609 dpavlin 1
610 dpavlin 4 Test script for this module uses all parts of API. It's also nice example
611     how to use C<SWISH::Split>.
612 dpavlin 1
613     =head1 SEE ALSO
614    
615     L<SWISH::API>,
616     L<http://www.swish-e.org/>
617    
618     =head1 AUTHOR
619    
620     Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>
621    
622     =head1 COPYRIGHT AND LICENSE
623    
624     Copyright (C) 2004 by Dobrica Pavlinusic
625    
626     This library is free software; you can redistribute it and/or modify
627     it under the same terms as Perl itself, either Perl version 5.8.4 or,
628     at your option, any later version of Perl 5 you may have available.
629    
630    
631     =cut

  ViewVC Help
Powered by ViewVC 1.1.26