/[SWISH-Split]/trunk/Split.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/Split.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 4 by dpavlin, Sun Aug 8 19:22:56 2004 UTC revision 8 by dpavlin, Sun Dec 19 03:06:01 2004 UTC
# Line 4  use 5.008; Line 4  use 5.008;
4  use strict;  use strict;
5  use warnings;  use warnings;
6    
7  our $VERSION = '0.00';  our $VERSION = '0.02';
8    
9  use SWISH::API;  use SWISH::API;
10  use Text::Iconv;  use Text::Iconv;
# Line 12  use File::Temp qw/ :mktemp /; Line 12  use File::Temp qw/ :mktemp /;
12  use Carp;  use Carp;
13  use Digest::MD5 qw(md5_hex);  use Digest::MD5 qw(md5_hex);
14  use Memoize;  use Memoize;
 use IPC::Run qw(start timeout pump finish);  
15  use File::Which;  use File::Which;
16    
17  use Data::Dumper;  use Data::Dumper;
18    
19    use constant {
20            ADDED => 1,
21            DELETED => 2,
22    };
23    
24  =head1 NAME  =head1 NAME
25    
26  SWISH::Split - Perl interface to split index variant of Swish-e  SWISH::Split - Perl interface to split index variant of Swish-e
# Line 37  L<Plucene::Simple>. This could make your Line 41  L<Plucene::Simple>. This could make your
41  engines) easier.  engines) easier.
42    
43  In the background, it will fork swish-e binaries (one for each index slice)  In the background, it will fork swish-e binaries (one for each index slice)
44  and produce UTF-8 encoded XML files for it. So, if your imput charset isn't  and produce UTF-8 encoded XML files for it. So, if your input charset isn't
45  C<ISO-8859-1> you will have to specify it.  C<ISO-8859-1> you will have to specify it.
46    
47  =head1 Methods used for indexing  =head1 Methods used for indexing
48    
49  =head2 open  =head2 open_index
50    
51  Create new object for index.  Create new object for index.
52    
53    my $i = SWISH::Split->open({    my $i = SWISH::Split->open_index({
54          index => '/path/to/index',          index => '/path/to/index',
55          slice_name => \&slice_on_path,          slice_name => \&slice_on_path,
56          slices => 30,          slices => 30,
# Line 64  Create new object for index. Line 68  Create new object for index.
68          return shift split(/\//,$_[0]);          return shift split(/\//,$_[0]);
69    }    }
70    
71  Options to open are following:  Options to C<open_index> are following:
72    
73  =over 5  =over 5
74    
# Line 105  speed up repeatable data, see L<"to_xml" Line 109  speed up repeatable data, see L<"to_xml"
109    
110  my $iso2utf = Text::Iconv->new('ISO-8859-1','UTF-8');  my $iso2utf = Text::Iconv->new('ISO-8859-1','UTF-8');
111    
112  sub open {  sub open_index {
113          my $class = shift;          my $class = shift;
114          my $self = {@_};          my $self = {@_};
115          bless($self, $class);          bless($self, $class);
# Line 144  sub add { Line 148  sub add {
148          my $swishpath = shift || return;          my $swishpath = shift || return;
149          my $data = shift || return;          my $data = shift || return;
150    
151          my ($out,$err) = $self->put_slice($swishpath, $self->to_xml($data));          my $slice = $self->put_slice($swishpath, $self->to_xml($data));
   
         if ($err) {  
                 carp "$swishpath: $err";  
                 return 0;  
         }  
152    
153          return 1;          return $slice;
154  }  }
155    
156  =head2 delete  =head2 delete
# Line 167  sub delete { Line 166  sub delete {
166    
167          my @paths = @_ || return;          my @paths = @_ || return;
168    
169            foreach my $path (@paths) {
170                    $self->{'paths'}->{$path} = DELETED;
171            }
172    
173          return 42;          return 42;
174  }  }
175    
176    
177  =head2 finish  =head2 done
178    
179  Finish indexing and close index file(s).  Finish indexing and close index file(s).
180    
181    $i->finish;    $i->done;
182    
183  This is most time-consuming operation. When it's called, it will re-index  This is most time-consuming operation. When it's called, it will re-index
184  all entries which haven't changed in all slices.  all entries which haven't changed in all slices.
185    
186  Returns number of slices updated.  Returns number of slices updated.
187    
188    This method should really be called close or finish, but both of those are
189    allready used.
190    
191  =cut  =cut
192    
193  sub finish {  sub done {
194          my $self = shift;          my $self = shift;
195    
196          my $ret = 0;          my $ret = 0;
197    
198          foreach my $s (keys %{$self->{'slice'}}) {          foreach my $s (keys %{$self->{'slice'}}) {
199                    $self->_debug("closing slice $s");
200                  $ret += $self->close_slice($s);                  $ret += $self->close_slice($s);
201          }          }
202    
# Line 212  Return array of C<swishpath>s in index. Line 219  Return array of C<swishpath>s in index.
219    
220  sub swishpaths {  sub swishpaths {
221          my $self = shift;          my $self = shift;
222    
223            my $s = shift || return;
224            return if (! exists($self->{'slice'}->{'s'}));
225    
226            return keys %{$self->{'slice'}->{'s'}};
227  }  }
228    
229  =head2 swishpaths_updated  =head2 swishpaths_updated
# Line 262  Takes path and return slice in which thi Line 274  Takes path and return slice in which thi
274    
275    my $s = $i->in_slice('path/to/document/in/index');    my $s = $i->in_slice('path/to/document/in/index');
276    
277  If there are C<slices> parametar to L<"open"> it will use  If there are C<slices> parametar to L<"open_index"> it will use
278  MD5 hash to spread documents across slices. That will produce random  MD5 hash to spread documents across slices. That will produce random
279  distribution of your documents in slices, which might or might not be best  distribution of your documents in slices, which might or might not be best
280  for your data. If you have to re-index large number of slices on each  for your data. If you have to re-index large number of slices on each
# Line 286  sub in_slice { Line 298  sub in_slice {
298                  # first, pass path through slice_name function                  # first, pass path through slice_name function
299                  my $slice = &{$self->{'slice_name'}}($path);                  my $slice = &{$self->{'slice_name'}}($path);
300                  # then calculate MD5 hash                  # then calculate MD5 hash
301                  $slice = md5_hex($slice);                  my $hash = md5_hex($slice);
302                  # take first 8 chars to produce number                  # take first 8 chars to produce number
303                  # FIXME how random is this?                  # FIXME how random is this?
304                  $slice = hex(substr($slice,0,8));                  $hash = hex(substr($hash,0,8));
305                                    
306                  $slice = ($slice % $self->{'slices'}) + 1;                  $slice = ($hash % $self->{'slices'}) + 1;
307                  print "hash: $slice / ",$self->{'slices'}," => $slice\n";                  $self->_debug("hash: $hash / ",$self->{'slices'}," => $slice");
308                  return $slice;                  return $slice;
309          } else {          } else {
310                  return &{$self->{'split'}}($path);                  return &{$self->{'split'}}($path);
# Line 313  which hasn't changed a while (so, expire Line 325  which hasn't changed a while (so, expire
325  sub find_paths {  sub find_paths {
326          my $self = shift;          my $self = shift;
327    
         my $s = shift || return;  
328  }  }
329    
330    
# Line 324  Create C<swish-e> configuration file for Line 335  Create C<swish-e> configuration file for
335    my $config_filename = $i->make_config('slice name');    my $config_filename = $i->make_config('slice name');
336    
337  It returns configuration filename. If no C<swish_config> was defined in  It returns configuration filename. If no C<swish_config> was defined in
338  L<"open">, default swish-e configuration will be used. It will index all data for  L<"open_index">, default swish-e configuration will be used. It will index all data for
339  searching, but none for properties.  searching, but none for properties.
340    
341  If you want to see what is allready defined for swish-e in configuration  If you want to see what is allready defined for swish-e in configuration
342  take a look at source code for C<DEFAULT_SWISH_CONF>.  take a look at source code for C<DEFAULT_SWISH_CONF>.
343    
344  It uses C<cat> utility to comunicate with C<swish-e>. Path is provided  It uses C<stdin> as C<IndexDir> to comunicate with C<swish-e>.
 by C<File::Which>. Do Windows users have to change that to C<COPY /B>  
 or something similar?  
345    
346  =cut  =cut
347    
# Line 351  sub make_config { Line 360  sub make_config {
360          print $tmp_fh <<"DEFAULT_SWISH_CONF";          print $tmp_fh <<"DEFAULT_SWISH_CONF";
361  # swish-e config file  # swish-e config file
362    
363  IndexDir cat  IndexDir stdin
 #SwishProgParameters -  
364    
365  # input file definition  # input file definition
366  DefaultContents XML*  DefaultContents XML*
# Line 387  DEFAULT_SWISH_CONF Line 395  DEFAULT_SWISH_CONF
395    
396  =head2 create_slice  =head2 create_slice
397    
398  On first run, starts C<swish-e> using L<IPC::Run>. On subsequent calls just return  On first run, starts C<swish-e>. On subsequent calls just return
399  it's handles using L<Memoize>.  it's handles using L<Memoize>.
400    
401    my $s = create_slice('/path/to/document');    my $s = create_slice('/path/to/document');
# Line 408  sub create_slice { Line 416  sub create_slice {
416    
417          my $swish_config = $self->make_config($s);          my $swish_config = $self->make_config($s);
418    
419          print STDERR "creating slice $s\n";     # FIXME          my $swish = qq{| swish-e };
420            $swish .= qq{ -u } if (-f $self->{'index'}.'/'.$s);    
421            $swish .= qq{ -S prog -c } . $swish_config;
422    
423          my @swish = qw(swish-e -S prog -c);          $self->_debug("creating slice $s using $swish");
         push @swish, $swish_config;  
424    
425          ## Build the harness, open all pipes, and launch the subprocesses          ## Build the harness, open all pipes, and launch the subprocesses
426          $self->{'slice'}->{$s}->{'h'} = start \@swish,          open(my $fh, $swish) || croak "can't open $swish: $!";
                 \$self->{'slice'}->{$s}->{'in'},  
                 \$self->{'slice'}->{$s}->{'out'},  
                 \$self->{'slice'}->{$s}->{'err'},  
                 timeout( 90 );  # FIXME  
427    
428          $self->{'slice'}->{$s}->{'out_len'} = 0;          $self->{'slice'}->{$s}->{'h'} = $fh;
         $self->{'slice'}->{$s}->{'err_len'} = 0;  
429    
430          $self->slice_output($s);          $self->slice_output($s);
431    
# Line 430  sub create_slice { Line 434  sub create_slice {
434    
435  =head2 put_slice  =head2 put_slice
436    
437  Pass XML data to swish and receive output and errors.  Pass XML data to swish.
438    
439    my ($out,$err) = $i->put_slice('/swish/path', '<xml>data</xml>');    my $slice = $i->put_slice('/swish/path', '<xml>data</xml>');
440    
441    Returns slice in which XML ended up.
442    
443  =cut  =cut
444    
# Line 447  sub put_slice { Line 453  sub put_slice {
453          my $s = $self->create_slice($path) || confess "create_slice returned null";          my $s = $self->create_slice($path) || confess "create_slice returned null";
454    
455          confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));          confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
         confess "no 'in' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'in'}));  
456          confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));          confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));
457    
458          $self->slice_output($s);          $self->slice_output($s);
459    
460          use bytes;      # as opposed to chars          use bytes;      # as opposed to chars
461          $self->{'slice'}->{$s}->{'in'} .=          my $fh = $self->{'slice'}->{$s}->{'h'} || confess "handle for slice $s undefined";
462                  "Path-Name: $path\n".          print { $fh } "Path-Name: $path\n".
463                  "Content-Length: ".(length($xml)+1)."\n".                  "Content-Length: ".(length($xml)+1)."\n".
464                    "Update-Mode: Index\n".
465                  "Document-Type: XML\n\n$xml\n";                  "Document-Type: XML\n\n$xml\n";
466    
         # do I/O  
         $self->{'slice'}->{$s}->{'h'}->pump while length $self->{'slice'}->{$s}->{'in'} ;  # wait for all input to go  
   
467          $self->slice_output($s);          $self->slice_output($s);
468    
469            $self->_debug("dumping in slice $s: $path");
470    
471            $self->{'paths'}->{$path} = ADDED;
472    
473          return $s;          return $s;
474  }  }
475    
# Line 470  sub put_slice { Line 477  sub put_slice {
477    
478  Prints to STDERR output and errors from C<swish-e>.  Prints to STDERR output and errors from C<swish-e>.
479    
480    $i->slice_output($s);    my $slice = $i->slice_output($s);
481    
482  Normally, you don't need to call it.  Normally, you don't need to call it.
483    
484    B<This is dummy placeholder function for very old code that assumes this
485    module is using C<IPC::Run> which it isn't any more.>
486    
487  =cut  =cut
488    
489  sub slice_output {  sub slice_output {
# Line 482  sub slice_output { Line 492  sub slice_output {
492          my $s = shift || confess "slice_output needs slice";          my $s = shift || confess "slice_output needs slice";
493    
494          confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));          confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
         confess "no 'in' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'in'}));  
         confess "no 'out' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'out'}));  
495    
496          if (length $self->{'slice'}->{$s}->{'out'} > $self->{'slice'}->{$s}->{'out_len'}) {          # FIXME
                 #print STDERR "swish-e OUT: ",$self->{'slice'}->{$s}->{'out'},"\n" if ($self->{'slice'}->{$s}->{'out'});  
                 $self->{'slice'}->{$s}->{'out_len'} = length $self->{'slice'}->{$s}->{'out'};  
                 return 1;  
         } elsif (length $self->{'slice'}->{$s}->{'err'} > $self->{'slice'}->{$s}->{'err_len'}) {  
                 print STDERR "swish-e ERR: ",$self->{'slice'}->{$s}->{'err'},"\n" if ($self->{'slice'}->{$s}->{'err'});  
                 $self->{'slice'}->{$s}->{'err_len'} = length $self->{'slice'}->{$s}->{'err'};  
                 # this is fatal  
                 return 0;  
         }  
497    
498          return 1;          return $s;
499  }  }
500    
501  =head2 close_slice {  =head2 close_slice
502    
503  Close slice (terminates swish-e process for that slice).  Close slice (terminates swish-e process for that slice).
504    
# Line 518  sub close_slice { Line 517  sub close_slice {
517          confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));          confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));
518    
519          # pump rest of content (if any)          # pump rest of content (if any)
520          $self->{'slice'}->{$s}->{'h'}->pump while length $self->{'slice'}->{$s}->{'in'};          close $self->{'slice'}->{$s}->{'h'} || carp "can't close slice $s: $!";
521    
522          $self->slice_output($s);          $self->slice_output($s);
523    
524          # clean up          undef $self->{'slice'}->{$s}->{'h'};
         $self->{'slice'}->{$s}->{'h'}->finish or confess "finish on slice $s returned: $?";  
525                    
526          delete($self->{'slice'}->{$s}) && return 1;          delete($self->{'slice'}->{$s}) && return 1;
527          return 0;          return 0;
# Line 538  Data will not yet be recoded to UTF-8. L Line 536  Data will not yet be recoded to UTF-8. L
536    
537  This function is extracted from L<"add"> method so that you can L<Memoize> it.  This function is extracted from L<"add"> method so that you can L<Memoize> it.
538  If your data set has a lot of repeatable data, and memory is not a problem, you  If your data set has a lot of repeatable data, and memory is not a problem, you
539  can add C<memoize_to_xml> option to L<"open">.  can add C<memoize_to_xml> option to L<"open_index">.
540    
541  =cut  =cut
542    
# Line 563  sub to_xml { Line 561  sub to_xml {
561          $xml .= qq{</xml>};          $xml .= qq{</xml>};
562  }  }
563    
564    sub _debug {
565            my $self = shift;
566            print STDERR "## ",@_,"\n" if ($self->{'debug'});
567            return;
568    }
569    
570  1;  1;
571  __END__  __END__
572    
573    
574  =head2 Searching  =head1 Searching
575    
576  Searching is still conducted using L<SWISH::API>, but you have to glob  Searching is still conducted using L<SWISH::API>, but you have to glob
577  index names.  index names.
# Line 582  not change your source code at all. Line 586  not change your source code at all.
586  That would also benefit performance, but it increases indexing time  That would also benefit performance, but it increases indexing time
587  because merged indexes must be re-created on each indexing run.  because merged indexes must be re-created on each indexing run.
588    
589  =head2 EXPORT  =head1 EXPORT
590    
591  Nothing by default.  Nothing by default.
592    
593  =head2 EXAMPLES  =head1 EXAMPLES
594    
595  Test script for this module uses all parts of API. It's also nice example  Test script for this module uses all parts of API. It's also nice example
596  how to use C<SWISH::Split>.  how to use C<SWISH::Split>.

Legend:
Removed from v.4  
changed lines
  Added in v.8

  ViewVC Help
Powered by ViewVC 1.1.26