/[SWISH-Split]/trunk/Split.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/Split.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 8 - (hide annotations)
Sun Dec 19 03:06:01 2004 UTC (17 years, 6 months ago) by dpavlin
File size: 12778 byte(s)
new api:
- renamed open to open_index
- removed dependency on IPC::Run
- tests which all pass

1 dpavlin 1 package SWISH::Split;
2    
3     use 5.008;
4     use strict;
5     use warnings;
6    
7 dpavlin 8 our $VERSION = '0.02';
8 dpavlin 1
9     use SWISH::API;
10     use Text::Iconv;
11     use File::Temp qw/ :mktemp /;
12     use Carp;
13     use Digest::MD5 qw(md5_hex);
14     use Memoize;
15 dpavlin 4 use File::Which;
16 dpavlin 1
17     use Data::Dumper;
18    
19 dpavlin 5 use constant {
20     ADDED => 1,
21     DELETED => 2,
22     };
23    
24 dpavlin 1 =head1 NAME
25    
26     SWISH::Split - Perl interface to split index variant of Swish-e
27    
28     =head1 SYNOPSIS
29    
30     use SWISH::Split;
31    
32    
33     =head1 DESCRIPTION
34    
35     This is alternative interface for indexing data with swish-e. It's designed
36 dpavlin 4 to split indexes over multiple files (slices) to allow updates of records in index
37     by reindexing just changed parts (slice).
38 dpavlin 1
39     Data is stored in index using intrface which is somewhat similar to
40     L<Plucene::Simple>. This could make your migration (or supporting two index
41     engines) easier.
42    
43     In the background, it will fork swish-e binaries (one for each index slice)
44 dpavlin 6 and produce UTF-8 encoded XML files for it. So, if your input charset isn't
45 dpavlin 1 C<ISO-8859-1> you will have to specify it.
46    
47     =head1 Methods used for indexing
48    
49 dpavlin 8 =head2 open_index
50 dpavlin 1
51     Create new object for index.
52    
53 dpavlin 8 my $i = SWISH::Split->open_index({
54 dpavlin 1 index => '/path/to/index',
55     slice_name => \&slice_on_path,
56     slices => 30,
57 dpavlin 4 merge => 0,
58     codepage => 'ISO-8859-2',
59     swish_config => qq{
60     PropertyNames from date
61     PropertyNamesDate date
62     },
63     memoize_to_xml => 0,
64 dpavlin 1 );
65    
66     # split index on first component of path
67     sub slice_on_path {
68     return shift split(/\//,$_[0]);
69     }
70    
71 dpavlin 8 Options to C<open_index> are following:
72 dpavlin 1
73 dpavlin 4 =over 5
74    
75     =item C<index>
76    
77     path to (existing) directory in which index slices will be created.
78    
79     =item C<slice_name>
80    
81     coderef to function which provide slicing from path.
82    
83     =item C<slices>
84    
85     maximum number of index slices. See L<"in_slice"> for
86 dpavlin 1 more explanation.
87    
88 dpavlin 4 =item C<merge>
89    
90     (planned) option to merge indexes into one at end.
91    
92     =item C<codepage>
93    
94     data codepage (needed for conversion to UTF-8).
95     By default, it's C<ISO-8859-1>.
96    
97     =item C<swish_config>
98    
99     additional parametars which will be inserted into
100     C<swish-e> configuration file. See L<swish-config>.
101    
102     =item C<memoize_to_xml>
103    
104     speed up repeatable data, see L<"to_xml">.
105    
106     =back
107    
108 dpavlin 1 =cut
109    
110     my $iso2utf = Text::Iconv->new('ISO-8859-1','UTF-8');
111    
112 dpavlin 8 sub open_index {
113 dpavlin 1 my $class = shift;
114     my $self = {@_};
115     bless($self, $class);
116    
117     croak "need slice_name coderef" unless ref $self->{'slice_name'};
118     croak "need slices" unless $self->{'slices'};
119    
120     croak "need index" unless $self->{'index'};
121     croak "index '",$self->{'index'},"' doesn't exist" unless -e $self->{'index'};
122     croak "index '",$self->{'index'},"' is not directory" unless -d $self->{'index'};
123    
124     $iso2utf = Text::Iconv->new($self->{'codepage'},'UTF-8') if ($self->{'codepage'});
125    
126 dpavlin 4 # speedup
127 dpavlin 1 memoize('in_slice');
128 dpavlin 4 memoize('to_xml') if ($self->{'memoize_to_xml'});
129 dpavlin 1
130     $self ? return $self : return undef;
131    
132     }
133    
134     =head2 add
135    
136     Add document to index.
137    
138     $i->add($swishpath, {
139     headline => 'foobar result',
140     property => 'data',
141     })
142    
143     =cut
144    
145     sub add {
146     my $self = shift;
147 dpavlin 3
148     my $swishpath = shift || return;
149     my $data = shift || return;
150    
151 dpavlin 7 my $slice = $self->put_slice($swishpath, $self->to_xml($data));
152 dpavlin 4
153 dpavlin 7 return $slice;
154 dpavlin 1 }
155    
156     =head2 delete
157    
158 dpavlin 3 Delete documents from index.
159 dpavlin 1
160 dpavlin 3 $i->delete(@swishpath);
161 dpavlin 1
162     =cut
163    
164     sub delete {
165     my $self = shift;
166 dpavlin 3
167     my @paths = @_ || return;
168    
169 dpavlin 5 foreach my $path (@paths) {
170     $self->{'paths'}->{$path} = DELETED;
171     }
172    
173 dpavlin 3 return 42;
174 dpavlin 1 }
175    
176    
177 dpavlin 5 =head2 done
178 dpavlin 1
179 dpavlin 4 Finish indexing and close index file(s).
180 dpavlin 1
181 dpavlin 5 $i->done;
182 dpavlin 1
183     This is most time-consuming operation. When it's called, it will re-index
184     all entries which haven't changed in all slices.
185    
186 dpavlin 4 Returns number of slices updated.
187    
188 dpavlin 5 This method should really be called close or finish, but both of those are
189     allready used.
190    
191 dpavlin 1 =cut
192    
193 dpavlin 5 sub done {
194 dpavlin 1 my $self = shift;
195 dpavlin 3
196 dpavlin 4 my $ret = 0;
197    
198     foreach my $s (keys %{$self->{'slice'}}) {
199 dpavlin 8 $self->_debug("closing slice $s");
200 dpavlin 4 $ret += $self->close_slice($s);
201     }
202    
203     return $ret;
204 dpavlin 1 }
205    
206    
207    
208     =head1 Reporting methods
209    
210     This methods return statistics about your index.
211    
212     =head2 swishpaths
213    
214     Return array of C<swishpath>s in index.
215    
216     my @p = $i->swishpaths;
217    
218     =cut
219    
220     sub swishpaths {
221     my $self = shift;
222 dpavlin 5
223     my $s = shift || return;
224     return if (! exists($self->{'slice'}->{'s'}));
225    
226     return keys %{$self->{'slice'}->{'s'}};
227 dpavlin 1 }
228    
229     =head2 swishpaths_updated
230    
231     Return array with updated C<swishpath>s.
232    
233     my @d = $i->swishpaths_updated;
234    
235     =cut
236    
237     sub swishpaths_updated {
238     my $self = shift;
239     }
240    
241    
242     =head2 swishpaths_deleted
243    
244     Return array with deleted C<swishpath>s.
245    
246     my $n = $i->swishpaths_deleted;
247    
248     =cut
249    
250     sub swishpaths_deleted {
251     my $self = shift;
252     }
253    
254    
255     =head2 slices
256    
257     Return array with all slice names.
258    
259     my @s = $i->slices;
260    
261     =cut
262    
263     sub slices {
264     my $self = shift;
265     }
266    
267     =head1 Helper methods
268    
269     This methods are used internally, but they might be useful.
270    
271     =head2 in_slice
272    
273     Takes path and return slice in which this path belongs.
274    
275     my $s = $i->in_slice('path/to/document/in/index');
276    
277 dpavlin 8 If there are C<slices> parametar to L<"open_index"> it will use
278 dpavlin 1 MD5 hash to spread documents across slices. That will produce random
279     distribution of your documents in slices, which might or might not be best
280     for your data. If you have to re-index large number of slices on each
281     run, think about creating your own C<slice> function and distributing
282     documents manually across slices.
283    
284 dpavlin 4 Slice number must always be true value or various sanity checks will fail.
285    
286 dpavlin 1 This function is C<Memoize>ed for performance reasons.
287    
288     =cut
289    
290     sub in_slice {
291     my $self = shift;
292    
293     my $path = shift || confess "need path";
294    
295     confess "need slice_name function" unless ref ($self->{'slice_name'});
296    
297     if ($self->{'slices'}) {
298     # first, pass path through slice_name function
299     my $slice = &{$self->{'slice_name'}}($path);
300     # then calculate MD5 hash
301 dpavlin 8 my $hash = md5_hex($slice);
302 dpavlin 1 # take first 8 chars to produce number
303     # FIXME how random is this?
304 dpavlin 8 $hash = hex(substr($hash,0,8));
305 dpavlin 1
306 dpavlin 8 $slice = ($hash % $self->{'slices'}) + 1;
307     $self->_debug("hash: $hash / ",$self->{'slices'}," => $slice");
308 dpavlin 4 return $slice;
309 dpavlin 1 } else {
310     return &{$self->{'split'}}($path);
311     }
312     }
313    
314 dpavlin 3 =head2 find_paths
315 dpavlin 1
316 dpavlin 3 Return array of C<swishpath>s for given C<swish-e> query.
317 dpavlin 1
318 dpavlin 3 my @p = $i->find_paths("headline=test*");
319    
320     Useful for combining with L<"delete_documents"> to delete documents
321     which hasn't changed a while (so, expired).
322    
323     =cut
324    
325     sub find_paths {
326     my $self = shift;
327    
328     }
329    
330    
331 dpavlin 4 =head2 make_config
332 dpavlin 3
333 dpavlin 4 Create C<swish-e> configuration file for given slice.
334    
335     my $config_filename = $i->make_config('slice name');
336    
337     It returns configuration filename. If no C<swish_config> was defined in
338 dpavlin 8 L<"open_index">, default swish-e configuration will be used. It will index all data for
339 dpavlin 4 searching, but none for properties.
340    
341     If you want to see what is allready defined for swish-e in configuration
342     take a look at source code for C<DEFAULT_SWISH_CONF>.
343    
344 dpavlin 7 It uses C<stdin> as C<IndexDir> to comunicate with C<swish-e>.
345 dpavlin 4
346     =cut
347    
348     sub make_config {
349     my $self = shift;
350    
351    
352     my $index_file = $self->{'index'}."/";
353     $index_file .= shift || confess "need slice name";
354    
355     my ($tmp_fh, $swish_config_filename) = mkstemp("/tmp/swishXXXXX");
356    
357     # find cat on filesystem
358     my $cat = which('cat');
359    
360     print $tmp_fh <<"DEFAULT_SWISH_CONF";
361     # swish-e config file
362    
363 dpavlin 7 IndexDir stdin
364 dpavlin 4
365     # input file definition
366     DefaultContents XML*
367    
368     # indexed metatags
369     MetaNames xml swishdocpath
370    
371    
372     #XMLClassAttributes type
373     UndefinedMetaTags auto
374     UndefinedXMLAttributes auto
375    
376     IndexFile $index_file
377    
378     # Croatian ISO-8859-2 characters to unaccented equivalents
379     TranslateCharacters ¹©ðÐèÈæƾ® ssddcccczz
380    
381    
382     # disable output
383     ParserWarnLevel 0
384     IndexReport 1
385    
386     DEFAULT_SWISH_CONF
387    
388     # add user parametars (like stored properties)
389     print $tmp_fh $self->{'swish_config'} if ($self->{'swish_config'});
390    
391     close($tmp_fh);
392    
393     return $swish_config_filename;
394     }
395    
396     =head2 create_slice
397    
398 dpavlin 8 On first run, starts C<swish-e>. On subsequent calls just return
399 dpavlin 4 it's handles using L<Memoize>.
400    
401     my $s = create_slice('/path/to/document');
402    
403     You shouldn't need to call C<create_slice> directly because it will be called
404     from L<"put_slice"> when needed.
405    
406     =cut
407    
408     sub create_slice {
409     my $self = shift;
410    
411     my $path = shift || confess "create_slice need path!";
412    
413     my $s = $self->in_slice($path) || confess "in_slice returned null";
414    
415     return $s if (exists($self->{'slice'}->{$s}));
416    
417     my $swish_config = $self->make_config($s);
418    
419 dpavlin 8 my $swish = qq{| swish-e };
420     $swish .= qq{ -u } if (-f $self->{'index'}.'/'.$s);
421     $swish .= qq{ -S prog -c } . $swish_config;
422 dpavlin 4
423 dpavlin 8 $self->_debug("creating slice $s using $swish");
424 dpavlin 4
425     ## Build the harness, open all pipes, and launch the subprocesses
426 dpavlin 8 open(my $fh, $swish) || croak "can't open $swish: $!";
427 dpavlin 4
428 dpavlin 8 $self->{'slice'}->{$s}->{'h'} = $fh;
429 dpavlin 4
430     $self->slice_output($s);
431    
432     return $s;
433     }
434    
435     =head2 put_slice
436    
437 dpavlin 7 Pass XML data to swish.
438 dpavlin 4
439 dpavlin 7 my $slice = $i->put_slice('/swish/path', '<xml>data</xml>');
440 dpavlin 4
441 dpavlin 7 Returns slice in which XML ended up.
442    
443 dpavlin 4 =cut
444    
445     sub put_slice {
446     my $self = shift;
447    
448     my $path = shift || confess "need path";
449     my $xml = shift || confess "need xml";
450    
451     $xml = $iso2utf->convert($xml) || carp "XML conversion error in $xml";
452    
453     my $s = $self->create_slice($path) || confess "create_slice returned null";
454    
455     confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
456     confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));
457    
458     $self->slice_output($s);
459    
460     use bytes; # as opposed to chars
461 dpavlin 8 my $fh = $self->{'slice'}->{$s}->{'h'} || confess "handle for slice $s undefined";
462     print { $fh } "Path-Name: $path\n".
463 dpavlin 4 "Content-Length: ".(length($xml)+1)."\n".
464 dpavlin 7 "Update-Mode: Index\n".
465 dpavlin 4 "Document-Type: XML\n\n$xml\n";
466    
467     $self->slice_output($s);
468    
469 dpavlin 8 $self->_debug("dumping in slice $s: $path");
470    
471 dpavlin 5 $self->{'paths'}->{$path} = ADDED;
472    
473 dpavlin 4 return $s;
474     }
475    
476     =head2 slice_output
477    
478     Prints to STDERR output and errors from C<swish-e>.
479    
480 dpavlin 7 my $slice = $i->slice_output($s);
481 dpavlin 4
482     Normally, you don't need to call it.
483    
484 dpavlin 8 B<This is dummy placeholder function for very old code that assumes this
485     module is using C<IPC::Run> which it isn't any more.>
486    
487 dpavlin 4 =cut
488    
489     sub slice_output {
490     my $self = shift;
491    
492     my $s = shift || confess "slice_output needs slice";
493    
494     confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
495    
496 dpavlin 8 # FIXME
497 dpavlin 4
498 dpavlin 7 return $s;
499 dpavlin 4 }
500    
501 dpavlin 5 =head2 close_slice
502 dpavlin 4
503     Close slice (terminates swish-e process for that slice).
504    
505     my $i->close_slice($s);
506    
507     Returns true if slice is closed, false otherwise.
508    
509     =cut
510    
511     sub close_slice {
512     my $self = shift;
513    
514     my $s = shift || confess "close_slice needs slice";
515    
516     confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
517     confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));
518    
519     # pump rest of content (if any)
520 dpavlin 8 close $self->{'slice'}->{$s}->{'h'} || carp "can't close slice $s: $!";
521 dpavlin 4
522     $self->slice_output($s);
523    
524 dpavlin 8 undef $self->{'slice'}->{$s}->{'h'};
525 dpavlin 4
526     delete($self->{'slice'}->{$s}) && return 1;
527     return 0;
528     }
529    
530     =head2 to_xml
531    
532     Convert (binary safe, I hope) your data into XML for C<swish-e>.
533     Data will not yet be recoded to UTF-8. L<"put_slice"> will do that.
534    
535     my $xml = $i->to_xml({ foo => 'bar' });
536    
537     This function is extracted from L<"add"> method so that you can L<Memoize> it.
538     If your data set has a lot of repeatable data, and memory is not a problem, you
539 dpavlin 8 can add C<memoize_to_xml> option to L<"open_index">.
540 dpavlin 4
541     =cut
542    
543     my %escape = ('<'=>'&lt;', '>'=>'&gt;', '&'=>'&amp;', '"'=>'&quot;');
544     my $escape_re = join '|' => keys %escape;
545    
546     sub to_xml {
547     my $self = shift;
548    
549     my $data = shift || return;
550    
551     my $xml = qq{<xml>};
552     foreach my $tag (keys %$data) {
553     my $content = $data->{$tag};
554     next if (! $content || $content eq '');
555     # save [cr/]lf before conversion to XML
556     # $content =~ s/\n\r/##lf##/gs;
557     # $content =~ s/\n/##lf##/gs;
558     $content =~ s/($escape_re)/$escape{$1}/gs;
559     $xml .= "<$tag><![CDATA[".$content."]]></$tag>";
560     }
561     $xml .= qq{</xml>};
562     }
563    
564 dpavlin 8 sub _debug {
565     my $self = shift;
566     print STDERR "## ",@_,"\n" if ($self->{'debug'});
567     return;
568     }
569    
570 dpavlin 1 1;
571     __END__
572    
573    
574 dpavlin 5 =head1 Searching
575 dpavlin 1
576     Searching is still conducted using L<SWISH::API>, but you have to glob
577     index names.
578    
579     use SWISH::API;
580    
581     my $swish = SWISH::API->new( glob('index.swish-e/*') );
582    
583     You can also alternativly create merged index (using C<merge> option) and
584     not change your source code at all.
585    
586     That would also benefit performance, but it increases indexing time
587     because merged indexes must be re-created on each indexing run.
588    
589 dpavlin 5 =head1 EXPORT
590 dpavlin 1
591 dpavlin 4 Nothing by default.
592 dpavlin 1
593 dpavlin 5 =head1 EXAMPLES
594 dpavlin 1
595 dpavlin 4 Test script for this module uses all parts of API. It's also nice example
596     how to use C<SWISH::Split>.
597 dpavlin 1
598     =head1 SEE ALSO
599    
600     L<SWISH::API>,
601     L<http://www.swish-e.org/>
602    
603     =head1 AUTHOR
604    
605     Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>
606    
607     =head1 COPYRIGHT AND LICENSE
608    
609     Copyright (C) 2004 by Dobrica Pavlinusic
610    
611     This library is free software; you can redistribute it and/or modify
612     it under the same terms as Perl itself, either Perl version 5.8.4 or,
613     at your option, any later version of Perl 5 you may have available.
614    
615    
616     =cut

  ViewVC Help
Powered by ViewVC 1.1.26