/[SWISH-Split]/trunk/Split.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/Split.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 5 - (hide annotations)
Wed Aug 11 14:28:40 2004 UTC (19 years, 7 months ago) by dpavlin
File size: 13752 byte(s)
smaller improvements

1 dpavlin 1 package SWISH::Split;
2    
3     use 5.008;
4     use strict;
5     use warnings;
6    
7     our $VERSION = '0.00';
8    
9     use SWISH::API;
10     use Text::Iconv;
11     use File::Temp qw/ :mktemp /;
12     use Carp;
13     use Digest::MD5 qw(md5_hex);
14     use Memoize;
15 dpavlin 4 use IPC::Run qw(start timeout pump finish);
16     use File::Which;
17 dpavlin 1
18     use Data::Dumper;
19    
20 dpavlin 5 use constant {
21     ADDED => 1,
22     DELETED => 2,
23     };
24    
25 dpavlin 1 =head1 NAME
26    
27     SWISH::Split - Perl interface to split index variant of Swish-e
28    
29     =head1 SYNOPSIS
30    
31     use SWISH::Split;
32    
33    
34     =head1 DESCRIPTION
35    
36     This is alternative interface for indexing data with swish-e. It's designed
37 dpavlin 4 to split indexes over multiple files (slices) to allow updates of records in index
38     by reindexing just changed parts (slice).
39 dpavlin 1
40     Data is stored in index using intrface which is somewhat similar to
41     L<Plucene::Simple>. This could make your migration (or supporting two index
42     engines) easier.
43    
44     In the background, it will fork swish-e binaries (one for each index slice)
45 dpavlin 4 and produce UTF-8 encoded XML files for it. So, if your imput charset isn't
46 dpavlin 1 C<ISO-8859-1> you will have to specify it.
47    
48     =head1 Methods used for indexing
49    
50     =head2 open
51    
52     Create new object for index.
53    
54     my $i = SWISH::Split->open({
55     index => '/path/to/index',
56     slice_name => \&slice_on_path,
57     slices => 30,
58 dpavlin 4 merge => 0,
59     codepage => 'ISO-8859-2',
60     swish_config => qq{
61     PropertyNames from date
62     PropertyNamesDate date
63     },
64     memoize_to_xml => 0,
65 dpavlin 1 );
66    
67     # split index on first component of path
68     sub slice_on_path {
69     return shift split(/\//,$_[0]);
70     }
71    
72 dpavlin 4 Options to open are following:
73 dpavlin 1
74 dpavlin 4 =over 5
75    
76     =item C<index>
77    
78     path to (existing) directory in which index slices will be created.
79    
80     =item C<slice_name>
81    
82     coderef to function which provide slicing from path.
83    
84     =item C<slices>
85    
86     maximum number of index slices. See L<"in_slice"> for
87 dpavlin 1 more explanation.
88    
89 dpavlin 4 =item C<merge>
90    
91     (planned) option to merge indexes into one at end.
92    
93     =item C<codepage>
94    
95     data codepage (needed for conversion to UTF-8).
96     By default, it's C<ISO-8859-1>.
97    
98     =item C<swish_config>
99    
100     additional parametars which will be inserted into
101     C<swish-e> configuration file. See L<swish-config>.
102    
103     =item C<memoize_to_xml>
104    
105     speed up repeatable data, see L<"to_xml">.
106    
107     =back
108    
109 dpavlin 1 =cut
110    
111     my $iso2utf = Text::Iconv->new('ISO-8859-1','UTF-8');
112    
113     sub open {
114     my $class = shift;
115     my $self = {@_};
116     bless($self, $class);
117    
118     croak "need slice_name coderef" unless ref $self->{'slice_name'};
119     croak "need slices" unless $self->{'slices'};
120    
121     croak "need index" unless $self->{'index'};
122     croak "index '",$self->{'index'},"' doesn't exist" unless -e $self->{'index'};
123     croak "index '",$self->{'index'},"' is not directory" unless -d $self->{'index'};
124    
125     $iso2utf = Text::Iconv->new($self->{'codepage'},'UTF-8') if ($self->{'codepage'});
126    
127 dpavlin 4 # speedup
128 dpavlin 1 memoize('in_slice');
129 dpavlin 4 memoize('to_xml') if ($self->{'memoize_to_xml'});
130 dpavlin 1
131     $self ? return $self : return undef;
132    
133     }
134    
135     =head2 add
136    
137     Add document to index.
138    
139     $i->add($swishpath, {
140     headline => 'foobar result',
141     property => 'data',
142     })
143    
144     =cut
145    
146     sub add {
147     my $self = shift;
148 dpavlin 3
149     my $swishpath = shift || return;
150     my $data = shift || return;
151    
152 dpavlin 4 my ($out,$err) = $self->put_slice($swishpath, $self->to_xml($data));
153    
154     if ($err) {
155     carp "$swishpath: $err";
156     return 0;
157     }
158    
159 dpavlin 3 return 1;
160 dpavlin 1 }
161    
162     =head2 delete
163    
164 dpavlin 3 Delete documents from index.
165 dpavlin 1
166 dpavlin 3 $i->delete(@swishpath);
167 dpavlin 1
168     =cut
169    
170     sub delete {
171     my $self = shift;
172 dpavlin 3
173     my @paths = @_ || return;
174    
175 dpavlin 5 foreach my $path (@paths) {
176     $self->{'paths'}->{$path} = DELETED;
177     }
178    
179 dpavlin 3 return 42;
180 dpavlin 1 }
181    
182    
183 dpavlin 5 =head2 done
184 dpavlin 1
185 dpavlin 4 Finish indexing and close index file(s).
186 dpavlin 1
187 dpavlin 5 $i->done;
188 dpavlin 1
189     This is most time-consuming operation. When it's called, it will re-index
190     all entries which haven't changed in all slices.
191    
192 dpavlin 4 Returns number of slices updated.
193    
194 dpavlin 5 This method should really be called close or finish, but both of those are
195     allready used.
196    
197 dpavlin 1 =cut
198    
199 dpavlin 5 sub done {
200 dpavlin 1 my $self = shift;
201 dpavlin 3
202 dpavlin 4 my $ret = 0;
203    
204     foreach my $s (keys %{$self->{'slice'}}) {
205 dpavlin 5 print STDERR "closing slice $s\n";
206 dpavlin 4 $ret += $self->close_slice($s);
207     }
208    
209     return $ret;
210 dpavlin 1 }
211    
212    
213    
214     =head1 Reporting methods
215    
216     This methods return statistics about your index.
217    
218     =head2 swishpaths
219    
220     Return array of C<swishpath>s in index.
221    
222     my @p = $i->swishpaths;
223    
224     =cut
225    
226     sub swishpaths {
227     my $self = shift;
228 dpavlin 5
229     my $s = shift || return;
230     return if (! exists($self->{'slice'}->{'s'}));
231    
232     return keys %{$self->{'slice'}->{'s'}};
233 dpavlin 1 }
234    
235     =head2 swishpaths_updated
236    
237     Return array with updated C<swishpath>s.
238    
239     my @d = $i->swishpaths_updated;
240    
241     =cut
242    
243     sub swishpaths_updated {
244     my $self = shift;
245     }
246    
247    
248     =head2 swishpaths_deleted
249    
250     Return array with deleted C<swishpath>s.
251    
252     my $n = $i->swishpaths_deleted;
253    
254     =cut
255    
256     sub swishpaths_deleted {
257     my $self = shift;
258     }
259    
260    
261     =head2 slices
262    
263     Return array with all slice names.
264    
265     my @s = $i->slices;
266    
267     =cut
268    
269     sub slices {
270     my $self = shift;
271     }
272    
273     =head1 Helper methods
274    
275     This methods are used internally, but they might be useful.
276    
277     =head2 in_slice
278    
279     Takes path and return slice in which this path belongs.
280    
281     my $s = $i->in_slice('path/to/document/in/index');
282    
283     If there are C<slices> parametar to L<"open"> it will use
284     MD5 hash to spread documents across slices. That will produce random
285     distribution of your documents in slices, which might or might not be best
286     for your data. If you have to re-index large number of slices on each
287     run, think about creating your own C<slice> function and distributing
288     documents manually across slices.
289    
290 dpavlin 4 Slice number must always be true value or various sanity checks will fail.
291    
292 dpavlin 1 This function is C<Memoize>ed for performance reasons.
293    
294     =cut
295    
296     sub in_slice {
297     my $self = shift;
298    
299     my $path = shift || confess "need path";
300    
301     confess "need slice_name function" unless ref ($self->{'slice_name'});
302    
303     if ($self->{'slices'}) {
304     # first, pass path through slice_name function
305     my $slice = &{$self->{'slice_name'}}($path);
306     # then calculate MD5 hash
307     $slice = md5_hex($slice);
308     # take first 8 chars to produce number
309     # FIXME how random is this?
310     $slice = hex(substr($slice,0,8));
311    
312 dpavlin 4 $slice = ($slice % $self->{'slices'}) + 1;
313     print "hash: $slice / ",$self->{'slices'}," => $slice\n";
314     return $slice;
315 dpavlin 1 } else {
316     return &{$self->{'split'}}($path);
317     }
318     }
319    
320 dpavlin 3 =head2 find_paths
321 dpavlin 1
322 dpavlin 3 Return array of C<swishpath>s for given C<swish-e> query.
323 dpavlin 1
324 dpavlin 3 my @p = $i->find_paths("headline=test*");
325    
326     Useful for combining with L<"delete_documents"> to delete documents
327     which hasn't changed a while (so, expired).
328    
329     =cut
330    
331     sub find_paths {
332     my $self = shift;
333    
334     }
335    
336    
337 dpavlin 4 =head2 make_config
338 dpavlin 3
339 dpavlin 4 Create C<swish-e> configuration file for given slice.
340    
341     my $config_filename = $i->make_config('slice name');
342    
343     It returns configuration filename. If no C<swish_config> was defined in
344     L<"open">, default swish-e configuration will be used. It will index all data for
345     searching, but none for properties.
346    
347     If you want to see what is allready defined for swish-e in configuration
348     take a look at source code for C<DEFAULT_SWISH_CONF>.
349    
350     It uses C<cat> utility to comunicate with C<swish-e>. Path is provided
351     by C<File::Which>. Do Windows users have to change that to C<COPY /B>
352     or something similar?
353    
354     =cut
355    
356     sub make_config {
357     my $self = shift;
358    
359    
360     my $index_file = $self->{'index'}."/";
361     $index_file .= shift || confess "need slice name";
362    
363     my ($tmp_fh, $swish_config_filename) = mkstemp("/tmp/swishXXXXX");
364    
365     # find cat on filesystem
366     my $cat = which('cat');
367    
368     print $tmp_fh <<"DEFAULT_SWISH_CONF";
369     # swish-e config file
370    
371     IndexDir cat
372     #SwishProgParameters -
373    
374     # input file definition
375     DefaultContents XML*
376    
377     # indexed metatags
378     MetaNames xml swishdocpath
379    
380    
381     #XMLClassAttributes type
382     UndefinedMetaTags auto
383     UndefinedXMLAttributes auto
384    
385     IndexFile $index_file
386    
387     # Croatian ISO-8859-2 characters to unaccented equivalents
388     TranslateCharacters ¹©ðÐèÈæƾ® ssddcccczz
389    
390    
391     # disable output
392     ParserWarnLevel 0
393     IndexReport 1
394    
395     DEFAULT_SWISH_CONF
396    
397     # add user parametars (like stored properties)
398     print $tmp_fh $self->{'swish_config'} if ($self->{'swish_config'});
399    
400     close($tmp_fh);
401    
402     return $swish_config_filename;
403     }
404    
405     =head2 create_slice
406    
407     On first run, starts C<swish-e> using L<IPC::Run>. On subsequent calls just return
408     it's handles using L<Memoize>.
409    
410     my $s = create_slice('/path/to/document');
411    
412     You shouldn't need to call C<create_slice> directly because it will be called
413     from L<"put_slice"> when needed.
414    
415     =cut
416    
417     sub create_slice {
418     my $self = shift;
419    
420     my $path = shift || confess "create_slice need path!";
421    
422     my $s = $self->in_slice($path) || confess "in_slice returned null";
423    
424     return $s if (exists($self->{'slice'}->{$s}));
425    
426     my $swish_config = $self->make_config($s);
427    
428     print STDERR "creating slice $s\n"; # FIXME
429    
430     my @swish = qw(swish-e -S prog -c);
431     push @swish, $swish_config;
432    
433     ## Build the harness, open all pipes, and launch the subprocesses
434     $self->{'slice'}->{$s}->{'h'} = start \@swish,
435     \$self->{'slice'}->{$s}->{'in'},
436     \$self->{'slice'}->{$s}->{'out'},
437     \$self->{'slice'}->{$s}->{'err'},
438     timeout( 90 ); # FIXME
439    
440     $self->{'slice'}->{$s}->{'out_len'} = 0;
441     $self->{'slice'}->{$s}->{'err_len'} = 0;
442    
443     $self->slice_output($s);
444    
445     return $s;
446     }
447    
448     =head2 put_slice
449    
450     Pass XML data to swish and receive output and errors.
451    
452     my ($out,$err) = $i->put_slice('/swish/path', '<xml>data</xml>');
453    
454     =cut
455    
456     sub put_slice {
457     my $self = shift;
458    
459     my $path = shift || confess "need path";
460     my $xml = shift || confess "need xml";
461    
462     $xml = $iso2utf->convert($xml) || carp "XML conversion error in $xml";
463    
464     my $s = $self->create_slice($path) || confess "create_slice returned null";
465    
466     confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
467     confess "no 'in' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'in'}));
468     confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));
469    
470     $self->slice_output($s);
471    
472     use bytes; # as opposed to chars
473     $self->{'slice'}->{$s}->{'in'} .=
474     "Path-Name: $path\n".
475     "Content-Length: ".(length($xml)+1)."\n".
476     "Document-Type: XML\n\n$xml\n";
477    
478     # do I/O
479     $self->{'slice'}->{$s}->{'h'}->pump while length $self->{'slice'}->{$s}->{'in'} ; # wait for all input to go
480    
481     $self->slice_output($s);
482    
483 dpavlin 5
484     $self->{'paths'}->{$path} = ADDED;
485    
486 dpavlin 4 return $s;
487     }
488    
489     =head2 slice_output
490    
491     Prints to STDERR output and errors from C<swish-e>.
492    
493     $i->slice_output($s);
494    
495     Normally, you don't need to call it.
496    
497     =cut
498    
499     sub slice_output {
500     my $self = shift;
501    
502     my $s = shift || confess "slice_output needs slice";
503    
504     confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
505     confess "no 'in' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'in'}));
506     confess "no 'out' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'out'}));
507    
508     if (length $self->{'slice'}->{$s}->{'out'} > $self->{'slice'}->{$s}->{'out_len'}) {
509     #print STDERR "swish-e OUT: ",$self->{'slice'}->{$s}->{'out'},"\n" if ($self->{'slice'}->{$s}->{'out'});
510     $self->{'slice'}->{$s}->{'out_len'} = length $self->{'slice'}->{$s}->{'out'};
511     return 1;
512     } elsif (length $self->{'slice'}->{$s}->{'err'} > $self->{'slice'}->{$s}->{'err_len'}) {
513     print STDERR "swish-e ERR: ",$self->{'slice'}->{$s}->{'err'},"\n" if ($self->{'slice'}->{$s}->{'err'});
514     $self->{'slice'}->{$s}->{'err_len'} = length $self->{'slice'}->{$s}->{'err'};
515     # this is fatal
516     return 0;
517     }
518    
519     return 1;
520     }
521    
522 dpavlin 5 =head2 close_slice
523 dpavlin 4
524     Close slice (terminates swish-e process for that slice).
525    
526     my $i->close_slice($s);
527    
528     Returns true if slice is closed, false otherwise.
529    
530     =cut
531    
532     sub close_slice {
533     my $self = shift;
534    
535     my $s = shift || confess "close_slice needs slice";
536    
537     confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
538     confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));
539    
540     # pump rest of content (if any)
541     $self->{'slice'}->{$s}->{'h'}->pump while length $self->{'slice'}->{$s}->{'in'};
542    
543     $self->slice_output($s);
544    
545     # clean up
546     $self->{'slice'}->{$s}->{'h'}->finish or confess "finish on slice $s returned: $?";
547    
548     delete($self->{'slice'}->{$s}) && return 1;
549     return 0;
550     }
551    
552     =head2 to_xml
553    
554     Convert (binary safe, I hope) your data into XML for C<swish-e>.
555     Data will not yet be recoded to UTF-8. L<"put_slice"> will do that.
556    
557     my $xml = $i->to_xml({ foo => 'bar' });
558    
559     This function is extracted from L<"add"> method so that you can L<Memoize> it.
560     If your data set has a lot of repeatable data, and memory is not a problem, you
561     can add C<memoize_to_xml> option to L<"open">.
562    
563     =cut
564    
565     my %escape = ('<'=>'&lt;', '>'=>'&gt;', '&'=>'&amp;', '"'=>'&quot;');
566     my $escape_re = join '|' => keys %escape;
567    
568     sub to_xml {
569     my $self = shift;
570    
571     my $data = shift || return;
572    
573     my $xml = qq{<xml>};
574     foreach my $tag (keys %$data) {
575     my $content = $data->{$tag};
576     next if (! $content || $content eq '');
577     # save [cr/]lf before conversion to XML
578     # $content =~ s/\n\r/##lf##/gs;
579     # $content =~ s/\n/##lf##/gs;
580     $content =~ s/($escape_re)/$escape{$1}/gs;
581     $xml .= "<$tag><![CDATA[".$content."]]></$tag>";
582     }
583     $xml .= qq{</xml>};
584     }
585    
586 dpavlin 1 1;
587     __END__
588    
589    
590 dpavlin 5 =head1 Searching
591 dpavlin 1
592     Searching is still conducted using L<SWISH::API>, but you have to glob
593     index names.
594    
595     use SWISH::API;
596    
597     my $swish = SWISH::API->new( glob('index.swish-e/*') );
598    
599     You can also alternativly create merged index (using C<merge> option) and
600     not change your source code at all.
601    
602     That would also benefit performance, but it increases indexing time
603     because merged indexes must be re-created on each indexing run.
604    
605 dpavlin 5 =head1 EXPORT
606 dpavlin 1
607 dpavlin 4 Nothing by default.
608 dpavlin 1
609 dpavlin 5 =head1 EXAMPLES
610 dpavlin 1
611 dpavlin 4 Test script for this module uses all parts of API. It's also nice example
612     how to use C<SWISH::Split>.
613 dpavlin 1
614     =head1 SEE ALSO
615    
616     L<SWISH::API>,
617     L<http://www.swish-e.org/>
618    
619     =head1 AUTHOR
620    
621     Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>
622    
623     =head1 COPYRIGHT AND LICENSE
624    
625     Copyright (C) 2004 by Dobrica Pavlinusic
626    
627     This library is free software; you can redistribute it and/or modify
628     it under the same terms as Perl itself, either Perl version 5.8.4 or,
629     at your option, any later version of Perl 5 you may have available.
630    
631    
632     =cut

  ViewVC Help
Powered by ViewVC 1.1.26