/[SWISH-Split]/trunk/Split.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/Split.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 13 - (hide annotations)
Fri Apr 29 23:25:02 2005 UTC (18 years, 11 months ago) by dpavlin
File size: 13017 byte(s)
added warning about unimplemented delete

1 dpavlin 1 package SWISH::Split;
2    
3     use 5.008;
4     use strict;
5     use warnings;
6    
7 dpavlin 9 our $VERSION = '0.03';
8 dpavlin 1
9     use SWISH::API;
10     use Text::Iconv;
11     use File::Temp qw/ :mktemp /;
12     use Carp;
13     use Digest::MD5 qw(md5_hex);
14     use Memoize;
15 dpavlin 4 use File::Which;
16 dpavlin 1
17     use Data::Dumper;
18    
19 dpavlin 5 use constant {
20     ADDED => 1,
21     DELETED => 2,
22     };
23    
24 dpavlin 1 =head1 NAME
25    
26     SWISH::Split - Perl interface to split index variant of Swish-e
27    
28     =head1 SYNOPSIS
29    
30     use SWISH::Split;
31    
32    
33     =head1 DESCRIPTION
34    
35     This is alternative interface for indexing data with swish-e. It's designed
36 dpavlin 4 to split indexes over multiple files (slices) to allow updates of records in index
37     by reindexing just changed parts (slice).
38 dpavlin 1
39     Data is stored in index using intrface which is somewhat similar to
40     L<Plucene::Simple>. This could make your migration (or supporting two index
41     engines) easier.
42    
43     In the background, it will fork swish-e binaries (one for each index slice)
44 dpavlin 6 and produce UTF-8 encoded XML files for it. So, if your input charset isn't
45 dpavlin 1 C<ISO-8859-1> you will have to specify it.
46    
47     =head1 Methods used for indexing
48    
49 dpavlin 8 =head2 open_index
50 dpavlin 1
51     Create new object for index.
52    
53 dpavlin 8 my $i = SWISH::Split->open_index({
54 dpavlin 1 index => '/path/to/index',
55     slice_name => \&slice_on_path,
56     slices => 30,
57 dpavlin 4 merge => 0,
58     codepage => 'ISO-8859-2',
59     swish_config => qq{
60     PropertyNames from date
61     PropertyNamesDate date
62     },
63     memoize_to_xml => 0,
64 dpavlin 1 );
65    
66     # split index on first component of path
67     sub slice_on_path {
68     return shift split(/\//,$_[0]);
69     }
70    
71 dpavlin 8 Options to C<open_index> are following:
72 dpavlin 1
73 dpavlin 4 =over 5
74    
75     =item C<index>
76    
77     path to (existing) directory in which index slices will be created.
78    
79     =item C<slice_name>
80    
81     coderef to function which provide slicing from path.
82    
83     =item C<slices>
84    
85     maximum number of index slices. See L<"in_slice"> for
86 dpavlin 1 more explanation.
87    
88 dpavlin 4 =item C<merge>
89    
90     (planned) option to merge indexes into one at end.
91    
92     =item C<codepage>
93    
94     data codepage (needed for conversion to UTF-8).
95     By default, it's C<ISO-8859-1>.
96    
97     =item C<swish_config>
98    
99     additional parametars which will be inserted into
100 dpavlin 11 C<swish-e> configuration file. See C<swish-config>.
101 dpavlin 4
102     =item C<memoize_to_xml>
103    
104     speed up repeatable data, see L<"to_xml">.
105    
106     =back
107    
108 dpavlin 1 =cut
109    
110     my $iso2utf = Text::Iconv->new('ISO-8859-1','UTF-8');
111    
112 dpavlin 8 sub open_index {
113 dpavlin 1 my $class = shift;
114     my $self = {@_};
115     bless($self, $class);
116    
117     croak "need slice_name coderef" unless ref $self->{'slice_name'};
118     croak "need slices" unless $self->{'slices'};
119    
120     croak "need index" unless $self->{'index'};
121     croak "index '",$self->{'index'},"' doesn't exist" unless -e $self->{'index'};
122     croak "index '",$self->{'index'},"' is not directory" unless -d $self->{'index'};
123    
124     $iso2utf = Text::Iconv->new($self->{'codepage'},'UTF-8') if ($self->{'codepage'});
125    
126 dpavlin 4 # speedup
127 dpavlin 1 memoize('in_slice');
128 dpavlin 4 memoize('to_xml') if ($self->{'memoize_to_xml'});
129 dpavlin 1
130     $self ? return $self : return undef;
131    
132     }
133    
134     =head2 add
135    
136     Add document to index.
137    
138     $i->add($swishpath, {
139     headline => 'foobar result',
140     property => 'data',
141     })
142    
143     =cut
144    
145     sub add {
146     my $self = shift;
147 dpavlin 3
148     my $swishpath = shift || return;
149     my $data = shift || return;
150    
151 dpavlin 7 my $slice = $self->put_slice($swishpath, $self->to_xml($data));
152 dpavlin 4
153 dpavlin 7 return $slice;
154 dpavlin 1 }
155    
156     =head2 delete
157    
158 dpavlin 3 Delete documents from index.
159 dpavlin 1
160 dpavlin 3 $i->delete(@swishpath);
161 dpavlin 1
162 dpavlin 13 B<This function is not implemented.>
163    
164 dpavlin 1 =cut
165    
166     sub delete {
167     my $self = shift;
168 dpavlin 3
169     my @paths = @_ || return;
170    
171 dpavlin 5 foreach my $path (@paths) {
172     $self->{'paths'}->{$path} = DELETED;
173     }
174    
175 dpavlin 13 die "delete is not yet implemented";
176    
177 dpavlin 3 return 42;
178 dpavlin 1 }
179    
180    
181 dpavlin 5 =head2 done
182 dpavlin 1
183 dpavlin 4 Finish indexing and close index file(s).
184 dpavlin 1
185 dpavlin 5 $i->done;
186 dpavlin 1
187     This is most time-consuming operation. When it's called, it will re-index
188     all entries which haven't changed in all slices.
189    
190 dpavlin 4 Returns number of slices updated.
191    
192 dpavlin 5 This method should really be called close or finish, but both of those are
193     allready used.
194    
195 dpavlin 1 =cut
196    
197 dpavlin 5 sub done {
198 dpavlin 1 my $self = shift;
199 dpavlin 3
200 dpavlin 4 my $ret = 0;
201    
202     foreach my $s (keys %{$self->{'slice'}}) {
203 dpavlin 8 $self->_debug("closing slice $s");
204 dpavlin 4 $ret += $self->close_slice($s);
205     }
206    
207     return $ret;
208 dpavlin 1 }
209    
210    
211    
212     =head1 Reporting methods
213    
214     This methods return statistics about your index.
215    
216     =head2 swishpaths
217    
218     Return array of C<swishpath>s in index.
219    
220     my @p = $i->swishpaths;
221    
222     =cut
223    
224     sub swishpaths {
225     my $self = shift;
226 dpavlin 5
227     my $s = shift || return;
228     return if (! exists($self->{'slice'}->{'s'}));
229    
230     return keys %{$self->{'slice'}->{'s'}};
231 dpavlin 1 }
232    
233     =head2 swishpaths_updated
234    
235     Return array with updated C<swishpath>s.
236    
237     my @d = $i->swishpaths_updated;
238    
239     =cut
240    
241     sub swishpaths_updated {
242     my $self = shift;
243     }
244    
245    
246     =head2 swishpaths_deleted
247    
248     Return array with deleted C<swishpath>s.
249    
250     my $n = $i->swishpaths_deleted;
251    
252     =cut
253    
254     sub swishpaths_deleted {
255     my $self = shift;
256     }
257    
258    
259     =head2 slices
260    
261     Return array with all slice names.
262    
263     my @s = $i->slices;
264    
265     =cut
266    
267     sub slices {
268     my $self = shift;
269     }
270    
271     =head1 Helper methods
272    
273     This methods are used internally, but they might be useful.
274    
275     =head2 in_slice
276    
277     Takes path and return slice in which this path belongs.
278    
279     my $s = $i->in_slice('path/to/document/in/index');
280    
281 dpavlin 8 If there are C<slices> parametar to L<"open_index"> it will use
282 dpavlin 1 MD5 hash to spread documents across slices. That will produce random
283     distribution of your documents in slices, which might or might not be best
284     for your data. If you have to re-index large number of slices on each
285     run, think about creating your own C<slice> function and distributing
286     documents manually across slices.
287    
288 dpavlin 4 Slice number must always be true value or various sanity checks will fail.
289    
290 dpavlin 1 This function is C<Memoize>ed for performance reasons.
291    
292     =cut
293    
294     sub in_slice {
295     my $self = shift;
296    
297     my $path = shift || confess "need path";
298    
299     confess "need slice_name function" unless ref ($self->{'slice_name'});
300    
301     if ($self->{'slices'}) {
302     # first, pass path through slice_name function
303     my $slice = &{$self->{'slice_name'}}($path);
304     # then calculate MD5 hash
305 dpavlin 8 my $hash = md5_hex($slice);
306 dpavlin 1 # take first 8 chars to produce number
307     # FIXME how random is this?
308 dpavlin 8 $hash = hex(substr($hash,0,8));
309 dpavlin 1
310 dpavlin 8 $slice = ($hash % $self->{'slices'}) + 1;
311     $self->_debug("hash: $hash / ",$self->{'slices'}," => $slice");
312 dpavlin 4 return $slice;
313 dpavlin 1 } else {
314     return &{$self->{'split'}}($path);
315     }
316     }
317    
318 dpavlin 3 =head2 find_paths
319 dpavlin 1
320 dpavlin 3 Return array of C<swishpath>s for given C<swish-e> query.
321 dpavlin 1
322 dpavlin 3 my @p = $i->find_paths("headline=test*");
323    
324     Useful for combining with L<"delete_documents"> to delete documents
325     which hasn't changed a while (so, expired).
326    
327     =cut
328    
329     sub find_paths {
330     my $self = shift;
331    
332     }
333    
334    
335 dpavlin 4 =head2 make_config
336 dpavlin 3
337 dpavlin 4 Create C<swish-e> configuration file for given slice.
338    
339     my $config_filename = $i->make_config('slice name');
340    
341     It returns configuration filename. If no C<swish_config> was defined in
342 dpavlin 8 L<"open_index">, default swish-e configuration will be used. It will index all data for
343 dpavlin 4 searching, but none for properties.
344    
345     If you want to see what is allready defined for swish-e in configuration
346     take a look at source code for C<DEFAULT_SWISH_CONF>.
347    
348 dpavlin 7 It uses C<stdin> as C<IndexDir> to comunicate with C<swish-e>.
349 dpavlin 4
350     =cut
351    
352     sub make_config {
353     my $self = shift;
354    
355    
356     my $index_file = $self->{'index'}."/";
357     $index_file .= shift || confess "need slice name";
358    
359     my ($tmp_fh, $swish_config_filename) = mkstemp("/tmp/swishXXXXX");
360    
361     # find cat on filesystem
362     my $cat = which('cat');
363    
364     print $tmp_fh <<"DEFAULT_SWISH_CONF";
365     # swish-e config file
366    
367 dpavlin 7 IndexDir stdin
368 dpavlin 4
369     # input file definition
370     DefaultContents XML*
371    
372     # indexed metatags
373     MetaNames xml swishdocpath
374    
375    
376     #XMLClassAttributes type
377     UndefinedMetaTags auto
378     UndefinedXMLAttributes auto
379    
380     IndexFile $index_file
381    
382     # Croatian ISO-8859-2 characters to unaccented equivalents
383     TranslateCharacters ¹©ðÐèÈæƾ® ssddcccczz
384    
385    
386     # disable output
387     ParserWarnLevel 0
388     IndexReport 1
389    
390     DEFAULT_SWISH_CONF
391    
392     # add user parametars (like stored properties)
393     print $tmp_fh $self->{'swish_config'} if ($self->{'swish_config'});
394    
395     close($tmp_fh);
396    
397     return $swish_config_filename;
398     }
399    
400     =head2 create_slice
401    
402 dpavlin 8 On first run, starts C<swish-e>. On subsequent calls just return
403 dpavlin 11 it's handles using C<Memoize>.
404 dpavlin 4
405     my $s = create_slice('/path/to/document');
406    
407     You shouldn't need to call C<create_slice> directly because it will be called
408     from L<"put_slice"> when needed.
409    
410     =cut
411    
412     sub create_slice {
413     my $self = shift;
414    
415     my $path = shift || confess "create_slice need path!";
416    
417     my $s = $self->in_slice($path) || confess "in_slice returned null";
418    
419     return $s if (exists($self->{'slice'}->{$s}));
420    
421     my $swish_config = $self->make_config($s);
422    
423 dpavlin 8 my $swish = qq{| swish-e };
424 dpavlin 9 if (-f $self->{'index'}.'/'.$s) {
425     $swish .= qq{ -u };
426     $self->{'slice'}->{$s}->{'update_mode'}++;
427     }
428 dpavlin 8 $swish .= qq{ -S prog -c } . $swish_config;
429 dpavlin 4
430 dpavlin 8 $self->_debug("creating slice $s using $swish");
431 dpavlin 4
432     ## Build the harness, open all pipes, and launch the subprocesses
433 dpavlin 8 open(my $fh, $swish) || croak "can't open $swish: $!";
434 dpavlin 4
435 dpavlin 8 $self->{'slice'}->{$s}->{'h'} = $fh;
436 dpavlin 4
437     $self->slice_output($s);
438    
439     return $s;
440     }
441    
442     =head2 put_slice
443    
444 dpavlin 7 Pass XML data to swish.
445 dpavlin 4
446 dpavlin 7 my $slice = $i->put_slice('/swish/path', '<xml>data</xml>');
447 dpavlin 4
448 dpavlin 7 Returns slice in which XML ended up.
449    
450 dpavlin 4 =cut
451    
452     sub put_slice {
453     my $self = shift;
454    
455     my $path = shift || confess "need path";
456     my $xml = shift || confess "need xml";
457    
458     $xml = $iso2utf->convert($xml) || carp "XML conversion error in $xml";
459    
460     my $s = $self->create_slice($path) || confess "create_slice returned null";
461    
462     confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
463     confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));
464    
465     $self->slice_output($s);
466    
467     use bytes; # as opposed to chars
468 dpavlin 8 my $fh = $self->{'slice'}->{$s}->{'h'} || confess "handle for slice $s undefined";
469 dpavlin 9
470     my $update_header = "Update-Mode: Index\n";
471     $update_header = '' unless ($self->{'slice'}->{$s}->{'update_mode'});
472    
473 dpavlin 8 print { $fh } "Path-Name: $path\n".
474 dpavlin 9 "Content-Length: ".(length($xml)+1)."\n" . $update_header .
475 dpavlin 4 "Document-Type: XML\n\n$xml\n";
476    
477     $self->slice_output($s);
478    
479 dpavlin 8 $self->_debug("dumping in slice $s: $path");
480    
481 dpavlin 5 $self->{'paths'}->{$path} = ADDED;
482    
483 dpavlin 4 return $s;
484     }
485    
486     =head2 slice_output
487    
488     Prints to STDERR output and errors from C<swish-e>.
489    
490 dpavlin 7 my $slice = $i->slice_output($s);
491 dpavlin 4
492     Normally, you don't need to call it.
493    
494 dpavlin 8 B<This is dummy placeholder function for very old code that assumes this
495     module is using C<IPC::Run> which it isn't any more.>
496    
497 dpavlin 4 =cut
498    
499     sub slice_output {
500     my $self = shift;
501    
502     my $s = shift || confess "slice_output needs slice";
503    
504     confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
505    
506 dpavlin 8 # FIXME
507 dpavlin 4
508 dpavlin 7 return $s;
509 dpavlin 4 }
510    
511 dpavlin 5 =head2 close_slice
512 dpavlin 4
513     Close slice (terminates swish-e process for that slice).
514    
515     my $i->close_slice($s);
516    
517     Returns true if slice is closed, false otherwise.
518    
519     =cut
520    
521     sub close_slice {
522     my $self = shift;
523    
524     my $s = shift || confess "close_slice needs slice";
525    
526     confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
527     confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));
528    
529     # pump rest of content (if any)
530 dpavlin 8 close $self->{'slice'}->{$s}->{'h'} || carp "can't close slice $s: $!";
531 dpavlin 4
532     $self->slice_output($s);
533    
534 dpavlin 8 undef $self->{'slice'}->{$s}->{'h'};
535 dpavlin 4
536     delete($self->{'slice'}->{$s}) && return 1;
537     return 0;
538     }
539    
540     =head2 to_xml
541    
542     Convert (binary safe, I hope) your data into XML for C<swish-e>.
543     Data will not yet be recoded to UTF-8. L<"put_slice"> will do that.
544    
545     my $xml = $i->to_xml({ foo => 'bar' });
546    
547 dpavlin 11 This function is extracted from L<"add"> method so that you can C<Memoize> it.
548 dpavlin 4 If your data set has a lot of repeatable data, and memory is not a problem, you
549 dpavlin 8 can add C<memoize_to_xml> option to L<"open_index">.
550 dpavlin 4
551     =cut
552    
553     my %escape = ('<'=>'&lt;', '>'=>'&gt;', '&'=>'&amp;', '"'=>'&quot;');
554     my $escape_re = join '|' => keys %escape;
555    
556     sub to_xml {
557     my $self = shift;
558    
559     my $data = shift || return;
560    
561     my $xml = qq{<xml>};
562     foreach my $tag (keys %$data) {
563     my $content = $data->{$tag};
564     next if (! $content || $content eq '');
565     # save [cr/]lf before conversion to XML
566     # $content =~ s/\n\r/##lf##/gs;
567     # $content =~ s/\n/##lf##/gs;
568     $content =~ s/($escape_re)/$escape{$1}/gs;
569     $xml .= "<$tag><![CDATA[".$content."]]></$tag>";
570     }
571     $xml .= qq{</xml>};
572     }
573    
574 dpavlin 8 sub _debug {
575     my $self = shift;
576     print STDERR "## ",@_,"\n" if ($self->{'debug'});
577     return;
578     }
579    
580 dpavlin 1 1;
581     __END__
582    
583    
584 dpavlin 5 =head1 Searching
585 dpavlin 1
586     Searching is still conducted using L<SWISH::API>, but you have to glob
587     index names.
588    
589     use SWISH::API;
590    
591     my $swish = SWISH::API->new( glob('index.swish-e/*') );
592    
593     You can also alternativly create merged index (using C<merge> option) and
594     not change your source code at all.
595    
596     That would also benefit performance, but it increases indexing time
597     because merged indexes must be re-created on each indexing run.
598    
599 dpavlin 5 =head1 EXPORT
600 dpavlin 1
601 dpavlin 4 Nothing by default.
602 dpavlin 1
603 dpavlin 5 =head1 EXAMPLES
604 dpavlin 1
605 dpavlin 4 Test script for this module uses all parts of API. It's also nice example
606     how to use C<SWISH::Split>.
607 dpavlin 1
608     =head1 SEE ALSO
609    
610     L<SWISH::API>,
611     L<http://www.swish-e.org/>
612    
613     =head1 AUTHOR
614    
615     Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>
616    
617     =head1 COPYRIGHT AND LICENSE
618    
619     Copyright (C) 2004 by Dobrica Pavlinusic
620    
621     This library is free software; you can redistribute it and/or modify
622     it under the same terms as Perl itself, either Perl version 5.8.4 or,
623     at your option, any later version of Perl 5 you may have available.
624    
625    
626     =cut

  ViewVC Help
Powered by ViewVC 1.1.26