/[SWISH-Split]/trunk/Split.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/Split.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1 by dpavlin, Sun Aug 8 10:09:55 2004 UTC revision 13 by dpavlin, Fri Apr 29 23:25:02 2005 UTC
# Line 4  use 5.008; Line 4  use 5.008;
4  use strict;  use strict;
5  use warnings;  use warnings;
6    
7  our $VERSION = '0.00';  our $VERSION = '0.03';
8    
9  use SWISH::API;  use SWISH::API;
10  use Text::Iconv;  use Text::Iconv;
# Line 12  use File::Temp qw/ :mktemp /; Line 12  use File::Temp qw/ :mktemp /;
12  use Carp;  use Carp;
13  use Digest::MD5 qw(md5_hex);  use Digest::MD5 qw(md5_hex);
14  use Memoize;  use Memoize;
15    use File::Which;
16    
17  use Data::Dumper;  use Data::Dumper;
18    
19    use constant {
20            ADDED => 1,
21            DELETED => 2,
22    };
23    
24  =head1 NAME  =head1 NAME
25    
26  SWISH::Split - Perl interface to split index variant of Swish-e  SWISH::Split - Perl interface to split index variant of Swish-e
# Line 27  SWISH::Split - Perl interface to split i Line 33  SWISH::Split - Perl interface to split i
33  =head1 DESCRIPTION  =head1 DESCRIPTION
34    
35  This is alternative interface for indexing data with swish-e. It's designed  This is alternative interface for indexing data with swish-e. It's designed
36  to split indexes over multiple files to allow updates of records in index  to split indexes over multiple files (slices) to allow updates of records in index
37  by reindexing just changed parts.  by reindexing just changed parts (slice).
38    
39  Data is stored in index using intrface which is somewhat similar to  Data is stored in index using intrface which is somewhat similar to
40  L<Plucene::Simple>. This could make your migration (or supporting two index  L<Plucene::Simple>. This could make your migration (or supporting two index
41  engines) easier.  engines) easier.
42    
43  In the background, it will fork swish-e binaries (one for each index slice)  In the background, it will fork swish-e binaries (one for each index slice)
44  and produce UTF-8 encoded XML files. So, if your imput charset isn't  and produce UTF-8 encoded XML files for it. So, if your input charset isn't
45  C<ISO-8859-1> you will have to specify it.  C<ISO-8859-1> you will have to specify it.
46    
47  =head1 Methods used for indexing  =head1 Methods used for indexing
48    
49  =head2 open  =head2 open_index
50    
51  Create new object for index.  Create new object for index.
52    
53    my $i = SWISH::Split->open({    my $i = SWISH::Split->open_index({
54          index => '/path/to/index',          index => '/path/to/index',
55          slice_name => \&slice_on_path,          slice_name => \&slice_on_path,
56          slices => 30,          slices => 30,
57          merge => 1,          merge => 0,
58          codepage => 'ISO-8859-2'          codepage => 'ISO-8859-2',
59            swish_config => qq{
60                    PropertyNames from date
61                    PropertyNamesDate date
62            },
63            memoize_to_xml => 0,
64    );    );
65    
66    # split index on first component of path    # split index on first component of path
# Line 57  Create new object for index. Line 68  Create new object for index.
68          return shift split(/\//,$_[0]);          return shift split(/\//,$_[0]);
69    }    }
70    
71    Options to C<open_index> are following:
72    
73    =over 5
74    
75    =item C<index>
76    
77    path to (existing) directory in which index slices will be created.
78    
79    =item C<slice_name>
80    
81  C<slices> is maximum number of index slices. See L<"in_slice"> for  coderef to function which provide slicing from path.
82    
83    =item C<slices>
84    
85    maximum number of index slices. See L<"in_slice"> for
86  more explanation.  more explanation.
87    
88    =item C<merge>
89    
90    (planned) option to merge indexes into one at end.
91    
92    =item C<codepage>
93    
94    data codepage (needed for conversion to UTF-8).
95    By default, it's C<ISO-8859-1>.
96    
97    =item C<swish_config>
98    
99    additional parametars which will be inserted into
100    C<swish-e> configuration file. See C<swish-config>.
101    
102    =item C<memoize_to_xml>
103    
104    speed up repeatable data, see L<"to_xml">.
105    
106    =back
107    
108  =cut  =cut
109    
110  my $iso2utf = Text::Iconv->new('ISO-8859-1','UTF-8');  my $iso2utf = Text::Iconv->new('ISO-8859-1','UTF-8');
111    
112  sub open {  sub open_index {
113          my $class = shift;          my $class = shift;
114          my $self = {@_};          my $self = {@_};
115          bless($self, $class);          bless($self, $class);
116    
         print Dumper($self->{'slice_name'});  
   
117          croak "need slice_name coderef" unless ref $self->{'slice_name'};          croak "need slice_name coderef" unless ref $self->{'slice_name'};
118          croak "need slices" unless $self->{'slices'};          croak "need slices" unless $self->{'slices'};
119    
# Line 81  sub open { Line 123  sub open {
123    
124          $iso2utf = Text::Iconv->new($self->{'codepage'},'UTF-8') if ($self->{'codepage'});          $iso2utf = Text::Iconv->new($self->{'codepage'},'UTF-8') if ($self->{'codepage'});
125    
126            # speedup
127          memoize('in_slice');          memoize('in_slice');
128            memoize('to_xml') if ($self->{'memoize_to_xml'});
129    
130          $self ? return $self : return undef;          $self ? return $self : return undef;
131    
# Line 100  Add document to index. Line 144  Add document to index.
144    
145  sub add {  sub add {
146          my $self = shift;          my $self = shift;
147    
148            my $swishpath = shift || return;
149            my $data = shift || return;
150    
151            my $slice = $self->put_slice($swishpath, $self->to_xml($data));
152    
153            return $slice;
154  }  }
155    
156  =head2 delete  =head2 delete
157    
158  Delete document from index.  Delete documents from index.
159    
160    $i->delete($swishpath);    $i->delete(@swishpath);
161    
162    B<This function is not implemented.>
163    
164  =cut  =cut
165    
166  sub delete {  sub delete {
167          my $self = shift;          my $self = shift;
168    
169            my @paths = @_ || return;
170    
171            foreach my $path (@paths) {
172                    $self->{'paths'}->{$path} = DELETED;
173            }
174    
175            die "delete is not yet implemented";
176    
177            return 42;
178  }  }
179    
180    
181  =head2 close  =head2 done
182    
183  Close index file and finish indexing.  Finish indexing and close index file(s).
184    
185    $i->close;    $i->done;
186    
187  This is most time-consuming operation. When it's called, it will re-index  This is most time-consuming operation. When it's called, it will re-index
188  all entries which haven't changed in all slices.  all entries which haven't changed in all slices.
189    
190    Returns number of slices updated.
191    
192    This method should really be called close or finish, but both of those are
193    allready used.
194    
195  =cut  =cut
196    
197  sub close {  sub done {
198          my $self = shift;          my $self = shift;
199    
200            my $ret = 0;
201    
202            foreach my $s (keys %{$self->{'slice'}}) {
203                    $self->_debug("closing slice $s");
204                    $ret += $self->close_slice($s);
205            }
206    
207            return $ret;
208  }  }
209    
210    
# Line 146  Return array of C<swishpath>s in index. Line 223  Return array of C<swishpath>s in index.
223    
224  sub swishpaths {  sub swishpaths {
225          my $self = shift;          my $self = shift;
226    
227            my $s = shift || return;
228            return if (! exists($self->{'slice'}->{'s'}));
229    
230            return keys %{$self->{'slice'}->{'s'}};
231  }  }
232    
233  =head2 swishpaths_updated  =head2 swishpaths_updated
# Line 196  Takes path and return slice in which thi Line 278  Takes path and return slice in which thi
278    
279    my $s = $i->in_slice('path/to/document/in/index');    my $s = $i->in_slice('path/to/document/in/index');
280    
281  If there are C<slices> parametar to L<"open"> it will use  If there are C<slices> parametar to L<"open_index"> it will use
282  MD5 hash to spread documents across slices. That will produce random  MD5 hash to spread documents across slices. That will produce random
283  distribution of your documents in slices, which might or might not be best  distribution of your documents in slices, which might or might not be best
284  for your data. If you have to re-index large number of slices on each  for your data. If you have to re-index large number of slices on each
285  run, think about creating your own C<slice> function and distributing  run, think about creating your own C<slice> function and distributing
286  documents manually across slices.  documents manually across slices.
287    
288    Slice number must always be true value or various sanity checks will fail.
289    
290  This function is C<Memoize>ed for performance reasons.  This function is C<Memoize>ed for performance reasons.
291    
292  =cut  =cut
# Line 212  sub in_slice { Line 296  sub in_slice {
296    
297          my $path = shift || confess "need path";          my $path = shift || confess "need path";
298    
         print Dumper($self->{'slice_name'});  
299          confess "need slice_name function" unless ref ($self->{'slice_name'});          confess "need slice_name function" unless ref ($self->{'slice_name'});
300    
301          if ($self->{'slices'}) {          if ($self->{'slices'}) {
302                  # first, pass path through slice_name function                  # first, pass path through slice_name function
303                  my $slice = &{$self->{'slice_name'}}($path);                  my $slice = &{$self->{'slice_name'}}($path);
304                  # then calculate MD5 hash                  # then calculate MD5 hash
305                  $slice = md5_hex($slice);                  my $hash = md5_hex($slice);
306                  # take first 8 chars to produce number                  # take first 8 chars to produce number
307                  # FIXME how random is this?                  # FIXME how random is this?
308                  $slice = hex(substr($slice,0,8));                  $hash = hex(substr($hash,0,8));
309                                    
310                  print "slice_nr: $slice slices: ",$self->{'slices'},"\n";                  $slice = ($hash % $self->{'slices'}) + 1;
311                  return ($slice % $self->{'slices'});                  $self->_debug("hash: $hash / ",$self->{'slices'}," => $slice");
312                    return $slice;
313          } else {          } else {
314                  return &{$self->{'split'}}($path);                  return &{$self->{'split'}}($path);
315          }          }
316  }  }
317    
318    =head2 find_paths
319    
320    Return array of C<swishpath>s for given C<swish-e> query.
321    
322      my @p = $i->find_paths("headline=test*");
323    
324    Useful for combining with L<"delete_documents"> to delete documents
325    which hasn't changed a while (so, expired).
326    
327    =cut
328    
329    sub find_paths {
330            my $self = shift;
331    
332    }
333    
334    
335    =head2 make_config
336    
337    Create C<swish-e> configuration file for given slice.
338    
339      my $config_filename = $i->make_config('slice name');
340    
341    It returns configuration filename. If no C<swish_config> was defined in
342    L<"open_index">, default swish-e configuration will be used. It will index all data for
343    searching, but none for properties.
344    
345    If you want to see what is allready defined for swish-e in configuration
346    take a look at source code for C<DEFAULT_SWISH_CONF>.
347    
348    It uses C<stdin> as C<IndexDir> to comunicate with C<swish-e>.
349    
350    =cut
351    
352    sub make_config {
353            my $self = shift;
354    
355    
356            my $index_file = $self->{'index'}."/";
357            $index_file .= shift || confess "need slice name";
358    
359            my ($tmp_fh, $swish_config_filename) = mkstemp("/tmp/swishXXXXX");
360    
361            # find cat on filesystem
362            my $cat = which('cat');
363    
364            print $tmp_fh <<"DEFAULT_SWISH_CONF";
365    # swish-e config file
366    
367    IndexDir stdin
368    
369    # input file definition
370    DefaultContents XML*
371    
372    # indexed metatags
373    MetaNames xml swishdocpath
374    
375    
376    #XMLClassAttributes type
377    UndefinedMetaTags auto
378    UndefinedXMLAttributes auto
379    
380    IndexFile $index_file
381    
382    # Croatian ISO-8859-2 characters to unaccented equivalents
383    TranslateCharacters ¹©ðÐèÈæƾ® ssddcccczz
384    
385    
386    # disable output
387    ParserWarnLevel 0
388    IndexReport 1
389    
390    DEFAULT_SWISH_CONF
391    
392            # add user parametars (like stored properties)
393            print $tmp_fh $self->{'swish_config'} if ($self->{'swish_config'});
394    
395            close($tmp_fh);
396    
397            return $swish_config_filename;
398    }
399    
400    =head2 create_slice
401    
402    On first run, starts C<swish-e>. On subsequent calls just return
403    it's handles using C<Memoize>.
404    
405      my $s = create_slice('/path/to/document');
406    
407    You shouldn't need to call C<create_slice> directly because it will be called
408    from L<"put_slice"> when needed.
409    
410    =cut
411    
412    sub create_slice {
413            my $self = shift;
414    
415            my $path = shift || confess "create_slice need path!";
416    
417            my $s = $self->in_slice($path) || confess "in_slice returned null";
418    
419            return $s if (exists($self->{'slice'}->{$s}));
420    
421            my $swish_config = $self->make_config($s);
422    
423            my $swish = qq{| swish-e };
424            if (-f $self->{'index'}.'/'.$s) {
425                    $swish .= qq{ -u };
426                    $self->{'slice'}->{$s}->{'update_mode'}++;
427            }
428            $swish .= qq{ -S prog -c } . $swish_config;
429    
430            $self->_debug("creating slice $s using $swish");
431    
432            ## Build the harness, open all pipes, and launch the subprocesses
433            open(my $fh, $swish) || croak "can't open $swish: $!";
434    
435            $self->{'slice'}->{$s}->{'h'} = $fh;
436    
437            $self->slice_output($s);
438    
439            return $s;
440    }
441    
442    =head2 put_slice
443    
444    Pass XML data to swish.
445    
446      my $slice = $i->put_slice('/swish/path', '<xml>data</xml>');
447    
448    Returns slice in which XML ended up.
449    
450    =cut
451    
452    sub put_slice {
453            my $self = shift;
454    
455            my $path = shift || confess "need path";
456            my $xml = shift || confess "need xml";
457    
458            $xml = $iso2utf->convert($xml) || carp "XML conversion error in $xml";
459    
460            my $s = $self->create_slice($path) || confess "create_slice returned null";
461    
462            confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
463            confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));
464    
465            $self->slice_output($s);
466    
467            use bytes;      # as opposed to chars
468            my $fh = $self->{'slice'}->{$s}->{'h'} || confess "handle for slice $s undefined";
469    
470            my $update_header = "Update-Mode: Index\n";
471            $update_header = '' unless ($self->{'slice'}->{$s}->{'update_mode'});
472    
473            print { $fh } "Path-Name: $path\n".
474                    "Content-Length: ".(length($xml)+1)."\n" . $update_header .
475                    "Document-Type: XML\n\n$xml\n";
476    
477            $self->slice_output($s);
478    
479            $self->_debug("dumping in slice $s: $path");
480    
481            $self->{'paths'}->{$path} = ADDED;
482    
483            return $s;
484    }
485    
486    =head2 slice_output
487    
488    Prints to STDERR output and errors from C<swish-e>.
489    
490      my $slice = $i->slice_output($s);
491    
492    Normally, you don't need to call it.
493    
494    B<This is dummy placeholder function for very old code that assumes this
495    module is using C<IPC::Run> which it isn't any more.>
496    
497    =cut
498    
499    sub slice_output {
500            my $self = shift;
501    
502            my $s = shift || confess "slice_output needs slice";
503    
504            confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
505    
506            # FIXME
507    
508            return $s;
509    }
510    
511    =head2 close_slice
512    
513    Close slice (terminates swish-e process for that slice).
514    
515      my $i->close_slice($s);
516    
517    Returns true if slice is closed, false otherwise.
518    
519    =cut
520    
521    sub close_slice {
522            my $self = shift;
523    
524            my $s = shift || confess "close_slice needs slice";
525    
526            confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
527            confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));
528    
529            # pump rest of content (if any)
530            close $self->{'slice'}->{$s}->{'h'} || carp "can't close slice $s: $!";
531    
532            $self->slice_output($s);
533    
534            undef $self->{'slice'}->{$s}->{'h'};
535            
536            delete($self->{'slice'}->{$s}) && return 1;
537            return 0;
538    }
539    
540    =head2 to_xml
541    
542    Convert (binary safe, I hope) your data into XML for C<swish-e>.
543    Data will not yet be recoded to UTF-8. L<"put_slice"> will do that.
544    
545      my $xml = $i->to_xml({ foo => 'bar' });
546    
547    This function is extracted from L<"add"> method so that you can C<Memoize> it.
548    If your data set has a lot of repeatable data, and memory is not a problem, you
549    can add C<memoize_to_xml> option to L<"open_index">.
550    
551    =cut
552    
553    my %escape = ('<'=>'&lt;', '>'=>'&gt;', '&'=>'&amp;', '"'=>'&quot;');
554    my $escape_re  = join '|' => keys %escape;
555    
556    sub to_xml {
557            my $self = shift;
558    
559            my $data = shift || return;
560    
561            my $xml = qq{<xml>};
562            foreach my $tag (keys %$data) {
563                    my $content = $data->{$tag};
564                    next if (! $content || $content eq '');
565                    # save [cr/]lf before conversion to XML
566    #               $content =~ s/\n\r/##lf##/gs;
567    #               $content =~ s/\n/##lf##/gs;
568                    $content =~ s/($escape_re)/$escape{$1}/gs;
569                    $xml .= "<$tag><![CDATA[".$content."]]></$tag>";
570            }
571            $xml .= qq{</xml>};
572    }
573    
574    sub _debug {
575            my $self = shift;
576            print STDERR "## ",@_,"\n" if ($self->{'debug'});
577            return;
578    }
579    
580  1;  1;
581  __END__  __END__
582    
583    
584  =head2 Searching  =head1 Searching
585    
586  Searching is still conducted using L<SWISH::API>, but you have to glob  Searching is still conducted using L<SWISH::API>, but you have to glob
587  index names.  index names.
# Line 252  not change your source code at all. Line 596  not change your source code at all.
596  That would also benefit performance, but it increases indexing time  That would also benefit performance, but it increases indexing time
597  because merged indexes must be re-created on each indexing run.  because merged indexes must be re-created on each indexing run.
598    
599  =head2 EXPORT  =head1 EXPORT
600    
601  None by default.  Nothing by default.
602    
603    =head1 EXAMPLES
604    
605    Test script for this module uses all parts of API. It's also nice example
606    how to use C<SWISH::Split>.
607    
608  =head1 SEE ALSO  =head1 SEE ALSO
609    

Legend:
Removed from v.1  
changed lines
  Added in v.13

  ViewVC Help
Powered by ViewVC 1.1.26