/[SWISH-Split]/trunk/Split.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/Split.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 3 by dpavlin, Sun Aug 8 10:53:04 2004 UTC revision 7 by dpavlin, Fri Dec 17 18:32:34 2004 UTC
# Line 4  use 5.008; Line 4  use 5.008;
4  use strict;  use strict;
5  use warnings;  use warnings;
6    
7  our $VERSION = '0.00';  our $VERSION = '0.01';
8    
9  use SWISH::API;  use SWISH::API;
10  use Text::Iconv;  use Text::Iconv;
# Line 12  use File::Temp qw/ :mktemp /; Line 12  use File::Temp qw/ :mktemp /;
12  use Carp;  use Carp;
13  use Digest::MD5 qw(md5_hex);  use Digest::MD5 qw(md5_hex);
14  use Memoize;  use Memoize;
15    use IPC::Run qw(start timeout pump finish);
16    use File::Which;
17    
18  use Data::Dumper;  use Data::Dumper;
19    
20    use constant {
21            ADDED => 1,
22            DELETED => 2,
23    };
24    
25  =head1 NAME  =head1 NAME
26    
27  SWISH::Split - Perl interface to split index variant of Swish-e  SWISH::Split - Perl interface to split index variant of Swish-e
# Line 27  SWISH::Split - Perl interface to split i Line 34  SWISH::Split - Perl interface to split i
34  =head1 DESCRIPTION  =head1 DESCRIPTION
35    
36  This is alternative interface for indexing data with swish-e. It's designed  This is alternative interface for indexing data with swish-e. It's designed
37  to split indexes over multiple files to allow updates of records in index  to split indexes over multiple files (slices) to allow updates of records in index
38  by reindexing just changed parts.  by reindexing just changed parts (slice).
39    
40  Data is stored in index using intrface which is somewhat similar to  Data is stored in index using intrface which is somewhat similar to
41  L<Plucene::Simple>. This could make your migration (or supporting two index  L<Plucene::Simple>. This could make your migration (or supporting two index
42  engines) easier.  engines) easier.
43    
44  In the background, it will fork swish-e binaries (one for each index slice)  In the background, it will fork swish-e binaries (one for each index slice)
45  and produce UTF-8 encoded XML files. So, if your imput charset isn't  and produce UTF-8 encoded XML files for it. So, if your input charset isn't
46  C<ISO-8859-1> you will have to specify it.  C<ISO-8859-1> you will have to specify it.
47    
48  =head1 Methods used for indexing  =head1 Methods used for indexing
# Line 48  Create new object for index. Line 55  Create new object for index.
55          index => '/path/to/index',          index => '/path/to/index',
56          slice_name => \&slice_on_path,          slice_name => \&slice_on_path,
57          slices => 30,          slices => 30,
58          merge => 1,          merge => 0,
59          codepage => 'ISO-8859-2'          codepage => 'ISO-8859-2',
60            swish_config => qq{
61                    PropertyNames from date
62                    PropertyNamesDate date
63            },
64            memoize_to_xml => 0,
65    );    );
66    
67    # split index on first component of path    # split index on first component of path
# Line 57  Create new object for index. Line 69  Create new object for index.
69          return shift split(/\//,$_[0]);          return shift split(/\//,$_[0]);
70    }    }
71    
72    Options to open are following:
73    
74    =over 5
75    
76    =item C<index>
77    
78    path to (existing) directory in which index slices will be created.
79    
80    =item C<slice_name>
81    
82  C<slices> is maximum number of index slices. See L<"in_slice"> for  coderef to function which provide slicing from path.
83    
84    =item C<slices>
85    
86    maximum number of index slices. See L<"in_slice"> for
87  more explanation.  more explanation.
88    
89    =item C<merge>
90    
91    (planned) option to merge indexes into one at end.
92    
93    =item C<codepage>
94    
95    data codepage (needed for conversion to UTF-8).
96    By default, it's C<ISO-8859-1>.
97    
98    =item C<swish_config>
99    
100    additional parametars which will be inserted into
101    C<swish-e> configuration file. See L<swish-config>.
102    
103    =item C<memoize_to_xml>
104    
105    speed up repeatable data, see L<"to_xml">.
106    
107    =back
108    
109  =cut  =cut
110    
111  my $iso2utf = Text::Iconv->new('ISO-8859-1','UTF-8');  my $iso2utf = Text::Iconv->new('ISO-8859-1','UTF-8');
# Line 70  sub open { Line 115  sub open {
115          my $self = {@_};          my $self = {@_};
116          bless($self, $class);          bless($self, $class);
117    
         print Dumper($self->{'slice_name'});  
   
118          croak "need slice_name coderef" unless ref $self->{'slice_name'};          croak "need slice_name coderef" unless ref $self->{'slice_name'};
119          croak "need slices" unless $self->{'slices'};          croak "need slices" unless $self->{'slices'};
120    
# Line 81  sub open { Line 124  sub open {
124    
125          $iso2utf = Text::Iconv->new($self->{'codepage'},'UTF-8') if ($self->{'codepage'});          $iso2utf = Text::Iconv->new($self->{'codepage'},'UTF-8') if ($self->{'codepage'});
126    
127            # speedup
128          memoize('in_slice');          memoize('in_slice');
129            memoize('to_xml') if ($self->{'memoize_to_xml'});
130    
131          $self ? return $self : return undef;          $self ? return $self : return undef;
132    
# Line 104  sub add { Line 149  sub add {
149          my $swishpath = shift || return;          my $swishpath = shift || return;
150          my $data = shift || return;          my $data = shift || return;
151    
152          return 1;          my $slice = $self->put_slice($swishpath, $self->to_xml($data));
153    
154    #       if ($err) {
155    #               carp "$swishpath: $err";
156    #               return undef;
157    #       }
158    
159            return $slice;
160  }  }
161    
162  =head2 delete  =head2 delete
# Line 120  sub delete { Line 172  sub delete {
172    
173          my @paths = @_ || return;          my @paths = @_ || return;
174    
175            foreach my $path (@paths) {
176                    $self->{'paths'}->{$path} = DELETED;
177            }
178    
179          return 42;          return 42;
180  }  }
181    
182    
183  =head2 close  =head2 done
184    
185  Close index file and finish indexing.  Finish indexing and close index file(s).
186    
187    $i->close;    $i->done;
188    
189  This is most time-consuming operation. When it's called, it will re-index  This is most time-consuming operation. When it's called, it will re-index
190  all entries which haven't changed in all slices.  all entries which haven't changed in all slices.
191    
192    Returns number of slices updated.
193    
194    This method should really be called close or finish, but both of those are
195    allready used.
196    
197  =cut  =cut
198    
199  sub close {  sub done {
200          my $self = shift;          my $self = shift;
201    
202          return 1;          my $ret = 0;
203    
204            foreach my $s (keys %{$self->{'slice'}}) {
205                    print STDERR "closing slice $s\n";
206                    $ret += $self->close_slice($s);
207            }
208    
209            return $ret;
210  }  }
211    
212    
# Line 157  Return array of C<swishpath>s in index. Line 225  Return array of C<swishpath>s in index.
225    
226  sub swishpaths {  sub swishpaths {
227          my $self = shift;          my $self = shift;
228    
229            my $s = shift || return;
230            return if (! exists($self->{'slice'}->{'s'}));
231    
232            return keys %{$self->{'slice'}->{'s'}};
233  }  }
234    
235  =head2 swishpaths_updated  =head2 swishpaths_updated
# Line 214  for your data. If you have to re-index l Line 287  for your data. If you have to re-index l
287  run, think about creating your own C<slice> function and distributing  run, think about creating your own C<slice> function and distributing
288  documents manually across slices.  documents manually across slices.
289    
290    Slice number must always be true value or various sanity checks will fail.
291    
292  This function is C<Memoize>ed for performance reasons.  This function is C<Memoize>ed for performance reasons.
293    
294  =cut  =cut
# Line 223  sub in_slice { Line 298  sub in_slice {
298    
299          my $path = shift || confess "need path";          my $path = shift || confess "need path";
300    
         print Dumper($self->{'slice_name'});  
301          confess "need slice_name function" unless ref ($self->{'slice_name'});          confess "need slice_name function" unless ref ($self->{'slice_name'});
302    
303          if ($self->{'slices'}) {          if ($self->{'slices'}) {
# Line 235  sub in_slice { Line 309  sub in_slice {
309                  # FIXME how random is this?                  # FIXME how random is this?
310                  $slice = hex(substr($slice,0,8));                  $slice = hex(substr($slice,0,8));
311                                    
312                  print "slice_nr: $slice slices: ",$self->{'slices'},"\n";                  $slice = ($slice % $self->{'slices'}) + 1;
313                  return ($slice % $self->{'slices'});                  print "hash: $slice / ",$self->{'slices'}," => $slice\n";
314                    return $slice;
315          } else {          } else {
316                  return &{$self->{'split'}}($path);                  return &{$self->{'split'}}($path);
317          }          }
# Line 256  which hasn't changed a while (so, expire Line 331  which hasn't changed a while (so, expire
331  sub find_paths {  sub find_paths {
332          my $self = shift;          my $self = shift;
333    
         my $s = shift || return;  
334  }  }
335    
336    
337    =head2 make_config
338    
339    Create C<swish-e> configuration file for given slice.
340    
341      my $config_filename = $i->make_config('slice name');
342    
343    It returns configuration filename. If no C<swish_config> was defined in
344    L<"open">, default swish-e configuration will be used. It will index all data for
345    searching, but none for properties.
346    
347    If you want to see what is allready defined for swish-e in configuration
348    take a look at source code for C<DEFAULT_SWISH_CONF>.
349    
350    It uses C<stdin> as C<IndexDir> to comunicate with C<swish-e>.
351    
352    =cut
353    
354    sub make_config {
355            my $self = shift;
356    
357    
358            my $index_file = $self->{'index'}."/";
359            $index_file .= shift || confess "need slice name";
360    
361            my ($tmp_fh, $swish_config_filename) = mkstemp("/tmp/swishXXXXX");
362    
363            # find cat on filesystem
364            my $cat = which('cat');
365    
366            print $tmp_fh <<"DEFAULT_SWISH_CONF";
367    # swish-e config file
368    
369    IndexDir stdin
370    
371    # input file definition
372    DefaultContents XML*
373    
374    # indexed metatags
375    MetaNames xml swishdocpath
376    
377    
378    #XMLClassAttributes type
379    UndefinedMetaTags auto
380    UndefinedXMLAttributes auto
381    
382    IndexFile $index_file
383    
384    # Croatian ISO-8859-2 characters to unaccented equivalents
385    TranslateCharacters ¹©ðÐèÈæƾ® ssddcccczz
386    
387    
388    # disable output
389    ParserWarnLevel 0
390    IndexReport 1
391    
392    DEFAULT_SWISH_CONF
393    
394            # add user parametars (like stored properties)
395            print $tmp_fh $self->{'swish_config'} if ($self->{'swish_config'});
396    
397            close($tmp_fh);
398    
399            return $swish_config_filename;
400    }
401    
402    =head2 create_slice
403    
404    On first run, starts C<swish-e> using L<IPC::Run>. On subsequent calls just return
405    it's handles using L<Memoize>.
406    
407      my $s = create_slice('/path/to/document');
408    
409    You shouldn't need to call C<create_slice> directly because it will be called
410    from L<"put_slice"> when needed.
411    
412    =cut
413    
414    sub create_slice {
415            my $self = shift;
416    
417            my $path = shift || confess "create_slice need path!";
418    
419            my $s = $self->in_slice($path) || confess "in_slice returned null";
420    
421            return $s if (exists($self->{'slice'}->{$s}));
422    
423            my $swish_config = $self->make_config($s);
424    
425            print STDERR "creating slice $s\n";     # FIXME
426    
427            my @swish = qw(swish-e -u -S prog -c);
428            push @swish, $swish_config;
429    
430            ## Build the harness, open all pipes, and launch the subprocesses
431            $self->{'slice'}->{$s}->{'h'} = start \@swish,
432                    \$self->{'slice'}->{$s}->{'in'},
433                    \$self->{'slice'}->{$s}->{'out'},
434                    \$self->{'slice'}->{$s}->{'err'},
435                    timeout( 90 );  # FIXME
436    
437            $self->{'slice'}->{$s}->{'out_len'} = 0;
438            $self->{'slice'}->{$s}->{'err_len'} = 0;
439    
440            $self->slice_output($s);
441    
442            return $s;
443    }
444    
445    =head2 put_slice
446    
447    Pass XML data to swish.
448    
449      my $slice = $i->put_slice('/swish/path', '<xml>data</xml>');
450    
451    Returns slice in which XML ended up.
452    
453    =cut
454    
455    sub put_slice {
456            my $self = shift;
457    
458            my $path = shift || confess "need path";
459            my $xml = shift || confess "need xml";
460    
461            $xml = $iso2utf->convert($xml) || carp "XML conversion error in $xml";
462    
463            my $s = $self->create_slice($path) || confess "create_slice returned null";
464    
465            confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
466            confess "no 'in' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'in'}));
467            confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));
468    
469            $self->slice_output($s);
470    
471            use bytes;      # as opposed to chars
472            $self->{'slice'}->{$s}->{'in'} .=
473                    "Path-Name: $path\n".
474                    "Content-Length: ".(length($xml)+1)."\n".
475                    "Update-Mode: Index\n".
476                    "Document-Type: XML\n\n$xml\n";
477    
478            # do I/O
479            $self->{'slice'}->{$s}->{'h'}->pump while length $self->{'slice'}->{$s}->{'in'} ;  # wait for all input to go
480    
481            $self->slice_output($s);
482    
483            $self->{'paths'}->{$path} = ADDED;
484    
485            return $s;
486    }
487    
488    =head2 slice_output
489    
490    Prints to STDERR output and errors from C<swish-e>.
491    
492      my $slice = $i->slice_output($s);
493    
494    Normally, you don't need to call it.
495    
496    =cut
497    
498    sub slice_output {
499            my $self = shift;
500    
501            my $s = shift || confess "slice_output needs slice";
502    
503            confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
504            confess "no 'in' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'in'}));
505            confess "no 'out' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'out'}));
506    
507            if (length $self->{'slice'}->{$s}->{'out'} > $self->{'slice'}->{$s}->{'out_len'}) {
508                    #print STDERR "swish-e OUT: ",$self->{'slice'}->{$s}->{'out'},"\n" if ($self->{'slice'}->{$s}->{'out'});
509                    $self->{'slice'}->{$s}->{'out_len'} = length $self->{'slice'}->{$s}->{'out'};
510                    return $s;
511            } elsif (length $self->{'slice'}->{$s}->{'err'} > $self->{'slice'}->{$s}->{'err_len'}) {
512                    print STDERR "swish-e ERR: ",$self->{'slice'}->{$s}->{'err'},"\n" if ($self->{'slice'}->{$s}->{'err'});
513                    $self->{'slice'}->{$s}->{'err_len'} = length $self->{'slice'}->{$s}->{'err'};
514                    # this is fatal
515                    return undef;
516            }
517    
518            return $s;
519    }
520    
521    =head2 close_slice
522    
523    Close slice (terminates swish-e process for that slice).
524    
525      my $i->close_slice($s);
526    
527    Returns true if slice is closed, false otherwise.
528    
529    =cut
530    
531    sub close_slice {
532            my $self = shift;
533    
534            my $s = shift || confess "close_slice needs slice";
535    
536            confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
537            confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));
538    
539            # pump rest of content (if any)
540            $self->{'slice'}->{$s}->{'h'}->pump while length $self->{'slice'}->{$s}->{'in'};
541    
542            $self->slice_output($s);
543    
544            # clean up
545            $self->{'slice'}->{$s}->{'h'}->finish or confess "finish on slice $s returned $?: $! -- ",$self->{'slice'}->{$s}->{'err'};
546            
547            delete($self->{'slice'}->{$s}) && return 1;
548            return 0;
549    }
550    
551    =head2 to_xml
552    
553    Convert (binary safe, I hope) your data into XML for C<swish-e>.
554    Data will not yet be recoded to UTF-8. L<"put_slice"> will do that.
555    
556      my $xml = $i->to_xml({ foo => 'bar' });
557    
558    This function is extracted from L<"add"> method so that you can L<Memoize> it.
559    If your data set has a lot of repeatable data, and memory is not a problem, you
560    can add C<memoize_to_xml> option to L<"open">.
561    
562    =cut
563    
564    my %escape = ('<'=>'&lt;', '>'=>'&gt;', '&'=>'&amp;', '"'=>'&quot;');
565    my $escape_re  = join '|' => keys %escape;
566    
567    sub to_xml {
568            my $self = shift;
569    
570            my $data = shift || return;
571    
572            my $xml = qq{<xml>};
573            foreach my $tag (keys %$data) {
574                    my $content = $data->{$tag};
575                    next if (! $content || $content eq '');
576                    # save [cr/]lf before conversion to XML
577    #               $content =~ s/\n\r/##lf##/gs;
578    #               $content =~ s/\n/##lf##/gs;
579                    $content =~ s/($escape_re)/$escape{$1}/gs;
580                    $xml .= "<$tag><![CDATA[".$content."]]></$tag>";
581            }
582            $xml .= qq{</xml>};
583    }
584    
585  1;  1;
586  __END__  __END__
587    
588    
589  =head2 Searching  =head1 Searching
590    
591  Searching is still conducted using L<SWISH::API>, but you have to glob  Searching is still conducted using L<SWISH::API>, but you have to glob
592  index names.  index names.
# Line 280  not change your source code at all. Line 601  not change your source code at all.
601  That would also benefit performance, but it increases indexing time  That would also benefit performance, but it increases indexing time
602  because merged indexes must be re-created on each indexing run.  because merged indexes must be re-created on each indexing run.
603    
604  =head2 EXPORT  =head1 EXPORT
605    
606  None by default.  Nothing by default.
607    
608    =head1 EXAMPLES
609    
610    Test script for this module uses all parts of API. It's also nice example
611    how to use C<SWISH::Split>.
612    
613  =head1 SEE ALSO  =head1 SEE ALSO
614    

Legend:
Removed from v.3  
changed lines
  Added in v.7

  ViewVC Help
Powered by ViewVC 1.1.26