/[SWISH-Split]/trunk/Split.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/Split.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1 by dpavlin, Sun Aug 8 10:09:55 2004 UTC revision 6 by dpavlin, Wed Dec 8 20:35:49 2004 UTC
# Line 12  use File::Temp qw/ :mktemp /; Line 12  use File::Temp qw/ :mktemp /;
12  use Carp;  use Carp;
13  use Digest::MD5 qw(md5_hex);  use Digest::MD5 qw(md5_hex);
14  use Memoize;  use Memoize;
15    use IPC::Run qw(start timeout pump finish);
16    use File::Which;
17    
18  use Data::Dumper;  use Data::Dumper;
19    
20    use constant {
21            ADDED => 1,
22            DELETED => 2,
23    };
24    
25  =head1 NAME  =head1 NAME
26    
27  SWISH::Split - Perl interface to split index variant of Swish-e  SWISH::Split - Perl interface to split index variant of Swish-e
# Line 27  SWISH::Split - Perl interface to split i Line 34  SWISH::Split - Perl interface to split i
34  =head1 DESCRIPTION  =head1 DESCRIPTION
35    
36  This is alternative interface for indexing data with swish-e. It's designed  This is alternative interface for indexing data with swish-e. It's designed
37  to split indexes over multiple files to allow updates of records in index  to split indexes over multiple files (slices) to allow updates of records in index
38  by reindexing just changed parts.  by reindexing just changed parts (slice).
39    
40  Data is stored in index using intrface which is somewhat similar to  Data is stored in index using intrface which is somewhat similar to
41  L<Plucene::Simple>. This could make your migration (or supporting two index  L<Plucene::Simple>. This could make your migration (or supporting two index
42  engines) easier.  engines) easier.
43    
44  In the background, it will fork swish-e binaries (one for each index slice)  In the background, it will fork swish-e binaries (one for each index slice)
45  and produce UTF-8 encoded XML files. So, if your imput charset isn't  and produce UTF-8 encoded XML files for it. So, if your input charset isn't
46  C<ISO-8859-1> you will have to specify it.  C<ISO-8859-1> you will have to specify it.
47    
48  =head1 Methods used for indexing  =head1 Methods used for indexing
# Line 48  Create new object for index. Line 55  Create new object for index.
55          index => '/path/to/index',          index => '/path/to/index',
56          slice_name => \&slice_on_path,          slice_name => \&slice_on_path,
57          slices => 30,          slices => 30,
58          merge => 1,          merge => 0,
59          codepage => 'ISO-8859-2'          codepage => 'ISO-8859-2',
60            swish_config => qq{
61                    PropertyNames from date
62                    PropertyNamesDate date
63            },
64            memoize_to_xml => 0,
65    );    );
66    
67    # split index on first component of path    # split index on first component of path
# Line 57  Create new object for index. Line 69  Create new object for index.
69          return shift split(/\//,$_[0]);          return shift split(/\//,$_[0]);
70    }    }
71    
72    Options to open are following:
73    
74    =over 5
75    
76    =item C<index>
77    
78    path to (existing) directory in which index slices will be created.
79    
80    =item C<slice_name>
81    
82  C<slices> is maximum number of index slices. See L<"in_slice"> for  coderef to function which provide slicing from path.
83    
84    =item C<slices>
85    
86    maximum number of index slices. See L<"in_slice"> for
87  more explanation.  more explanation.
88    
89    =item C<merge>
90    
91    (planned) option to merge indexes into one at end.
92    
93    =item C<codepage>
94    
95    data codepage (needed for conversion to UTF-8).
96    By default, it's C<ISO-8859-1>.
97    
98    =item C<swish_config>
99    
100    additional parametars which will be inserted into
101    C<swish-e> configuration file. See L<swish-config>.
102    
103    =item C<memoize_to_xml>
104    
105    speed up repeatable data, see L<"to_xml">.
106    
107    =back
108    
109  =cut  =cut
110    
111  my $iso2utf = Text::Iconv->new('ISO-8859-1','UTF-8');  my $iso2utf = Text::Iconv->new('ISO-8859-1','UTF-8');
# Line 70  sub open { Line 115  sub open {
115          my $self = {@_};          my $self = {@_};
116          bless($self, $class);          bless($self, $class);
117    
         print Dumper($self->{'slice_name'});  
   
118          croak "need slice_name coderef" unless ref $self->{'slice_name'};          croak "need slice_name coderef" unless ref $self->{'slice_name'};
119          croak "need slices" unless $self->{'slices'};          croak "need slices" unless $self->{'slices'};
120    
# Line 81  sub open { Line 124  sub open {
124    
125          $iso2utf = Text::Iconv->new($self->{'codepage'},'UTF-8') if ($self->{'codepage'});          $iso2utf = Text::Iconv->new($self->{'codepage'},'UTF-8') if ($self->{'codepage'});
126    
127            # speedup
128          memoize('in_slice');          memoize('in_slice');
129            memoize('to_xml') if ($self->{'memoize_to_xml'});
130    
131          $self ? return $self : return undef;          $self ? return $self : return undef;
132    
# Line 100  Add document to index. Line 145  Add document to index.
145    
146  sub add {  sub add {
147          my $self = shift;          my $self = shift;
148    
149            my $swishpath = shift || return;
150            my $data = shift || return;
151    
152            my ($out,$err) = $self->put_slice($swishpath, $self->to_xml($data));
153    
154            if ($err) {
155                    carp "$swishpath: $err";
156                    return 0;
157            }
158    
159            return 1;
160  }  }
161    
162  =head2 delete  =head2 delete
163    
164  Delete document from index.  Delete documents from index.
165    
166    $i->delete($swishpath);    $i->delete(@swishpath);
167    
168  =cut  =cut
169    
170  sub delete {  sub delete {
171          my $self = shift;          my $self = shift;
172    
173            my @paths = @_ || return;
174    
175            foreach my $path (@paths) {
176                    $self->{'paths'}->{$path} = DELETED;
177            }
178    
179            return 42;
180  }  }
181    
182    
183  =head2 close  =head2 done
184    
185  Close index file and finish indexing.  Finish indexing and close index file(s).
186    
187    $i->close;    $i->done;
188    
189  This is most time-consuming operation. When it's called, it will re-index  This is most time-consuming operation. When it's called, it will re-index
190  all entries which haven't changed in all slices.  all entries which haven't changed in all slices.
191    
192    Returns number of slices updated.
193    
194    This method should really be called close or finish, but both of those are
195    allready used.
196    
197  =cut  =cut
198    
199  sub close {  sub done {
200          my $self = shift;          my $self = shift;
201    
202            my $ret = 0;
203    
204            foreach my $s (keys %{$self->{'slice'}}) {
205                    print STDERR "closing slice $s\n";
206                    $ret += $self->close_slice($s);
207            }
208    
209            return $ret;
210  }  }
211    
212    
# Line 146  Return array of C<swishpath>s in index. Line 225  Return array of C<swishpath>s in index.
225    
226  sub swishpaths {  sub swishpaths {
227          my $self = shift;          my $self = shift;
228    
229            my $s = shift || return;
230            return if (! exists($self->{'slice'}->{'s'}));
231    
232            return keys %{$self->{'slice'}->{'s'}};
233  }  }
234    
235  =head2 swishpaths_updated  =head2 swishpaths_updated
# Line 203  for your data. If you have to re-index l Line 287  for your data. If you have to re-index l
287  run, think about creating your own C<slice> function and distributing  run, think about creating your own C<slice> function and distributing
288  documents manually across slices.  documents manually across slices.
289    
290    Slice number must always be true value or various sanity checks will fail.
291    
292  This function is C<Memoize>ed for performance reasons.  This function is C<Memoize>ed for performance reasons.
293    
294  =cut  =cut
# Line 212  sub in_slice { Line 298  sub in_slice {
298    
299          my $path = shift || confess "need path";          my $path = shift || confess "need path";
300    
         print Dumper($self->{'slice_name'});  
301          confess "need slice_name function" unless ref ($self->{'slice_name'});          confess "need slice_name function" unless ref ($self->{'slice_name'});
302    
303          if ($self->{'slices'}) {          if ($self->{'slices'}) {
# Line 224  sub in_slice { Line 309  sub in_slice {
309                  # FIXME how random is this?                  # FIXME how random is this?
310                  $slice = hex(substr($slice,0,8));                  $slice = hex(substr($slice,0,8));
311                                    
312                  print "slice_nr: $slice slices: ",$self->{'slices'},"\n";                  $slice = ($slice % $self->{'slices'}) + 1;
313                  return ($slice % $self->{'slices'});                  print "hash: $slice / ",$self->{'slices'}," => $slice\n";
314                    return $slice;
315          } else {          } else {
316                  return &{$self->{'split'}}($path);                  return &{$self->{'split'}}($path);
317          }          }
318  }  }
319    
320    =head2 find_paths
321    
322    Return array of C<swishpath>s for given C<swish-e> query.
323    
324      my @p = $i->find_paths("headline=test*");
325    
326    Useful for combining with L<"delete_documents"> to delete documents
327    which hasn't changed a while (so, expired).
328    
329    =cut
330    
331    sub find_paths {
332            my $self = shift;
333    
334    }
335    
336    
337    =head2 make_config
338    
339    Create C<swish-e> configuration file for given slice.
340    
341      my $config_filename = $i->make_config('slice name');
342    
343    It returns configuration filename. If no C<swish_config> was defined in
344    L<"open">, default swish-e configuration will be used. It will index all data for
345    searching, but none for properties.
346    
347    If you want to see what is allready defined for swish-e in configuration
348    take a look at source code for C<DEFAULT_SWISH_CONF>.
349    
350    It uses C<cat> utility to comunicate with C<swish-e>. Path is provided
351    by C<File::Which>. Do Windows users have to change that to C<COPY /B>
352    or something similar?
353    
354    =cut
355    
356    sub make_config {
357            my $self = shift;
358    
359    
360            my $index_file = $self->{'index'}."/";
361            $index_file .= shift || confess "need slice name";
362    
363            my ($tmp_fh, $swish_config_filename) = mkstemp("/tmp/swishXXXXX");
364    
365            # find cat on filesystem
366            my $cat = which('cat');
367    
368            print $tmp_fh <<"DEFAULT_SWISH_CONF";
369    # swish-e config file
370    
371    IndexDir cat
372    #SwishProgParameters -
373    
374    # input file definition
375    DefaultContents XML*
376    
377    # indexed metatags
378    MetaNames xml swishdocpath
379    
380    
381    #XMLClassAttributes type
382    UndefinedMetaTags auto
383    UndefinedXMLAttributes auto
384    
385    IndexFile $index_file
386    
387    # Croatian ISO-8859-2 characters to unaccented equivalents
388    TranslateCharacters ¹©ðÐèÈæƾ® ssddcccczz
389    
390    
391    # disable output
392    ParserWarnLevel 0
393    IndexReport 1
394    
395    DEFAULT_SWISH_CONF
396    
397            # add user parametars (like stored properties)
398            print $tmp_fh $self->{'swish_config'} if ($self->{'swish_config'});
399    
400            close($tmp_fh);
401    
402            return $swish_config_filename;
403    }
404    
405    =head2 create_slice
406    
407    On first run, starts C<swish-e> using L<IPC::Run>. On subsequent calls just return
408    it's handles using L<Memoize>.
409    
410      my $s = create_slice('/path/to/document');
411    
412    You shouldn't need to call C<create_slice> directly because it will be called
413    from L<"put_slice"> when needed.
414    
415    =cut
416    
417    sub create_slice {
418            my $self = shift;
419    
420            my $path = shift || confess "create_slice need path!";
421    
422            my $s = $self->in_slice($path) || confess "in_slice returned null";
423    
424            return $s if (exists($self->{'slice'}->{$s}));
425    
426            my $swish_config = $self->make_config($s);
427    
428            print STDERR "creating slice $s\n";     # FIXME
429    
430            my @swish = qw(swish-e -S prog -c);
431            push @swish, $swish_config;
432    
433            ## Build the harness, open all pipes, and launch the subprocesses
434            $self->{'slice'}->{$s}->{'h'} = start \@swish,
435                    \$self->{'slice'}->{$s}->{'in'},
436                    \$self->{'slice'}->{$s}->{'out'},
437                    \$self->{'slice'}->{$s}->{'err'},
438                    timeout( 90 );  # FIXME
439    
440            $self->{'slice'}->{$s}->{'out_len'} = 0;
441            $self->{'slice'}->{$s}->{'err_len'} = 0;
442    
443            $self->slice_output($s);
444    
445            return $s;
446    }
447    
448    =head2 put_slice
449    
450    Pass XML data to swish and receive output and errors.
451    
452      my ($out,$err) = $i->put_slice('/swish/path', '<xml>data</xml>');
453    
454    =cut
455    
456    sub put_slice {
457            my $self = shift;
458    
459            my $path = shift || confess "need path";
460            my $xml = shift || confess "need xml";
461    
462            $xml = $iso2utf->convert($xml) || carp "XML conversion error in $xml";
463    
464            my $s = $self->create_slice($path) || confess "create_slice returned null";
465    
466            confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
467            confess "no 'in' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'in'}));
468            confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));
469    
470            $self->slice_output($s);
471    
472            use bytes;      # as opposed to chars
473            $self->{'slice'}->{$s}->{'in'} .=
474                    "Path-Name: $path\n".
475                    "Content-Length: ".(length($xml)+1)."\n".
476                    "Document-Type: XML\n\n$xml\n";
477    
478            # do I/O
479            $self->{'slice'}->{$s}->{'h'}->pump while length $self->{'slice'}->{$s}->{'in'} ;  # wait for all input to go
480    
481            $self->slice_output($s);
482    
483    
484            $self->{'paths'}->{$path} = ADDED;
485    
486            return $s;
487    }
488    
489    =head2 slice_output
490    
491    Prints to STDERR output and errors from C<swish-e>.
492    
493      $i->slice_output($s);
494    
495    Normally, you don't need to call it.
496    
497    =cut
498    
499    sub slice_output {
500            my $self = shift;
501    
502            my $s = shift || confess "slice_output needs slice";
503    
504            confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
505            confess "no 'in' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'in'}));
506            confess "no 'out' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'out'}));
507    
508            if (length $self->{'slice'}->{$s}->{'out'} > $self->{'slice'}->{$s}->{'out_len'}) {
509                    #print STDERR "swish-e OUT: ",$self->{'slice'}->{$s}->{'out'},"\n" if ($self->{'slice'}->{$s}->{'out'});
510                    $self->{'slice'}->{$s}->{'out_len'} = length $self->{'slice'}->{$s}->{'out'};
511                    return 1;
512            } elsif (length $self->{'slice'}->{$s}->{'err'} > $self->{'slice'}->{$s}->{'err_len'}) {
513                    print STDERR "swish-e ERR: ",$self->{'slice'}->{$s}->{'err'},"\n" if ($self->{'slice'}->{$s}->{'err'});
514                    $self->{'slice'}->{$s}->{'err_len'} = length $self->{'slice'}->{$s}->{'err'};
515                    # this is fatal
516                    return 0;
517            }
518    
519            return 1;
520    }
521    
522    =head2 close_slice
523    
524    Close slice (terminates swish-e process for that slice).
525    
526      my $i->close_slice($s);
527    
528    Returns true if slice is closed, false otherwise.
529    
530    =cut
531    
532    sub close_slice {
533            my $self = shift;
534    
535            my $s = shift || confess "close_slice needs slice";
536    
537            confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
538            confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));
539    
540            # pump rest of content (if any)
541            $self->{'slice'}->{$s}->{'h'}->pump while length $self->{'slice'}->{$s}->{'in'};
542    
543            $self->slice_output($s);
544    
545            # clean up
546            $self->{'slice'}->{$s}->{'h'}->finish or confess "finish on slice $s returned: $?";
547            
548            delete($self->{'slice'}->{$s}) && return 1;
549            return 0;
550    }
551    
552    =head2 to_xml
553    
554    Convert (binary safe, I hope) your data into XML for C<swish-e>.
555    Data will not yet be recoded to UTF-8. L<"put_slice"> will do that.
556    
557      my $xml = $i->to_xml({ foo => 'bar' });
558    
559    This function is extracted from L<"add"> method so that you can L<Memoize> it.
560    If your data set has a lot of repeatable data, and memory is not a problem, you
561    can add C<memoize_to_xml> option to L<"open">.
562    
563    =cut
564    
565    my %escape = ('<'=>'&lt;', '>'=>'&gt;', '&'=>'&amp;', '"'=>'&quot;');
566    my $escape_re  = join '|' => keys %escape;
567    
568    sub to_xml {
569            my $self = shift;
570    
571            my $data = shift || return;
572    
573            my $xml = qq{<xml>};
574            foreach my $tag (keys %$data) {
575                    my $content = $data->{$tag};
576                    next if (! $content || $content eq '');
577                    # save [cr/]lf before conversion to XML
578    #               $content =~ s/\n\r/##lf##/gs;
579    #               $content =~ s/\n/##lf##/gs;
580                    $content =~ s/($escape_re)/$escape{$1}/gs;
581                    $xml .= "<$tag><![CDATA[".$content."]]></$tag>";
582            }
583            $xml .= qq{</xml>};
584    }
585    
586  1;  1;
587  __END__  __END__
588    
589    
590  =head2 Searching  =head1 Searching
591    
592  Searching is still conducted using L<SWISH::API>, but you have to glob  Searching is still conducted using L<SWISH::API>, but you have to glob
593  index names.  index names.
# Line 252  not change your source code at all. Line 602  not change your source code at all.
602  That would also benefit performance, but it increases indexing time  That would also benefit performance, but it increases indexing time
603  because merged indexes must be re-created on each indexing run.  because merged indexes must be re-created on each indexing run.
604    
605  =head2 EXPORT  =head1 EXPORT
606    
607  None by default.  Nothing by default.
608    
609    =head1 EXAMPLES
610    
611    Test script for this module uses all parts of API. It's also nice example
612    how to use C<SWISH::Split>.
613    
614  =head1 SEE ALSO  =head1 SEE ALSO
615    

Legend:
Removed from v.1  
changed lines
  Added in v.6

  ViewVC Help
Powered by ViewVC 1.1.26