/[SWISH-Split]/trunk/Split.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/Split.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 4 by dpavlin, Sun Aug 8 19:22:56 2004 UTC revision 13 by dpavlin, Fri Apr 29 23:25:02 2005 UTC
# Line 4  use 5.008; Line 4  use 5.008;
4  use strict;  use strict;
5  use warnings;  use warnings;
6    
7  our $VERSION = '0.00';  our $VERSION = '0.03';
8    
9  use SWISH::API;  use SWISH::API;
10  use Text::Iconv;  use Text::Iconv;
# Line 12  use File::Temp qw/ :mktemp /; Line 12  use File::Temp qw/ :mktemp /;
12  use Carp;  use Carp;
13  use Digest::MD5 qw(md5_hex);  use Digest::MD5 qw(md5_hex);
14  use Memoize;  use Memoize;
 use IPC::Run qw(start timeout pump finish);  
15  use File::Which;  use File::Which;
16    
17  use Data::Dumper;  use Data::Dumper;
18    
19    use constant {
20            ADDED => 1,
21            DELETED => 2,
22    };
23    
24  =head1 NAME  =head1 NAME
25    
26  SWISH::Split - Perl interface to split index variant of Swish-e  SWISH::Split - Perl interface to split index variant of Swish-e
# Line 37  L<Plucene::Simple>. This could make your Line 41  L<Plucene::Simple>. This could make your
41  engines) easier.  engines) easier.
42    
43  In the background, it will fork swish-e binaries (one for each index slice)  In the background, it will fork swish-e binaries (one for each index slice)
44  and produce UTF-8 encoded XML files for it. So, if your imput charset isn't  and produce UTF-8 encoded XML files for it. So, if your input charset isn't
45  C<ISO-8859-1> you will have to specify it.  C<ISO-8859-1> you will have to specify it.
46    
47  =head1 Methods used for indexing  =head1 Methods used for indexing
48    
49  =head2 open  =head2 open_index
50    
51  Create new object for index.  Create new object for index.
52    
53    my $i = SWISH::Split->open({    my $i = SWISH::Split->open_index({
54          index => '/path/to/index',          index => '/path/to/index',
55          slice_name => \&slice_on_path,          slice_name => \&slice_on_path,
56          slices => 30,          slices => 30,
# Line 64  Create new object for index. Line 68  Create new object for index.
68          return shift split(/\//,$_[0]);          return shift split(/\//,$_[0]);
69    }    }
70    
71  Options to open are following:  Options to C<open_index> are following:
72    
73  =over 5  =over 5
74    
# Line 93  By default, it's C<ISO-8859-1>. Line 97  By default, it's C<ISO-8859-1>.
97  =item C<swish_config>  =item C<swish_config>
98    
99  additional parametars which will be inserted into  additional parametars which will be inserted into
100  C<swish-e> configuration file. See L<swish-config>.  C<swish-e> configuration file. See C<swish-config>.
101    
102  =item C<memoize_to_xml>  =item C<memoize_to_xml>
103    
# Line 105  speed up repeatable data, see L<"to_xml" Line 109  speed up repeatable data, see L<"to_xml"
109    
110  my $iso2utf = Text::Iconv->new('ISO-8859-1','UTF-8');  my $iso2utf = Text::Iconv->new('ISO-8859-1','UTF-8');
111    
112  sub open {  sub open_index {
113          my $class = shift;          my $class = shift;
114          my $self = {@_};          my $self = {@_};
115          bless($self, $class);          bless($self, $class);
# Line 144  sub add { Line 148  sub add {
148          my $swishpath = shift || return;          my $swishpath = shift || return;
149          my $data = shift || return;          my $data = shift || return;
150    
151          my ($out,$err) = $self->put_slice($swishpath, $self->to_xml($data));          my $slice = $self->put_slice($swishpath, $self->to_xml($data));
   
         if ($err) {  
                 carp "$swishpath: $err";  
                 return 0;  
         }  
152    
153          return 1;          return $slice;
154  }  }
155    
156  =head2 delete  =head2 delete
# Line 160  Delete documents from index. Line 159  Delete documents from index.
159    
160    $i->delete(@swishpath);    $i->delete(@swishpath);
161    
162    B<This function is not implemented.>
163    
164  =cut  =cut
165    
166  sub delete {  sub delete {
# Line 167  sub delete { Line 168  sub delete {
168    
169          my @paths = @_ || return;          my @paths = @_ || return;
170    
171            foreach my $path (@paths) {
172                    $self->{'paths'}->{$path} = DELETED;
173            }
174    
175            die "delete is not yet implemented";
176    
177          return 42;          return 42;
178  }  }
179    
180    
181  =head2 finish  =head2 done
182    
183  Finish indexing and close index file(s).  Finish indexing and close index file(s).
184    
185    $i->finish;    $i->done;
186    
187  This is most time-consuming operation. When it's called, it will re-index  This is most time-consuming operation. When it's called, it will re-index
188  all entries which haven't changed in all slices.  all entries which haven't changed in all slices.
189    
190  Returns number of slices updated.  Returns number of slices updated.
191    
192    This method should really be called close or finish, but both of those are
193    allready used.
194    
195  =cut  =cut
196    
197  sub finish {  sub done {
198          my $self = shift;          my $self = shift;
199    
200          my $ret = 0;          my $ret = 0;
201    
202          foreach my $s (keys %{$self->{'slice'}}) {          foreach my $s (keys %{$self->{'slice'}}) {
203                    $self->_debug("closing slice $s");
204                  $ret += $self->close_slice($s);                  $ret += $self->close_slice($s);
205          }          }
206    
# Line 212  Return array of C<swishpath>s in index. Line 223  Return array of C<swishpath>s in index.
223    
224  sub swishpaths {  sub swishpaths {
225          my $self = shift;          my $self = shift;
226    
227            my $s = shift || return;
228            return if (! exists($self->{'slice'}->{'s'}));
229    
230            return keys %{$self->{'slice'}->{'s'}};
231  }  }
232    
233  =head2 swishpaths_updated  =head2 swishpaths_updated
# Line 262  Takes path and return slice in which thi Line 278  Takes path and return slice in which thi
278    
279    my $s = $i->in_slice('path/to/document/in/index');    my $s = $i->in_slice('path/to/document/in/index');
280    
281  If there are C<slices> parametar to L<"open"> it will use  If there are C<slices> parametar to L<"open_index"> it will use
282  MD5 hash to spread documents across slices. That will produce random  MD5 hash to spread documents across slices. That will produce random
283  distribution of your documents in slices, which might or might not be best  distribution of your documents in slices, which might or might not be best
284  for your data. If you have to re-index large number of slices on each  for your data. If you have to re-index large number of slices on each
# Line 286  sub in_slice { Line 302  sub in_slice {
302                  # first, pass path through slice_name function                  # first, pass path through slice_name function
303                  my $slice = &{$self->{'slice_name'}}($path);                  my $slice = &{$self->{'slice_name'}}($path);
304                  # then calculate MD5 hash                  # then calculate MD5 hash
305                  $slice = md5_hex($slice);                  my $hash = md5_hex($slice);
306                  # take first 8 chars to produce number                  # take first 8 chars to produce number
307                  # FIXME how random is this?                  # FIXME how random is this?
308                  $slice = hex(substr($slice,0,8));                  $hash = hex(substr($hash,0,8));
309                                    
310                  $slice = ($slice % $self->{'slices'}) + 1;                  $slice = ($hash % $self->{'slices'}) + 1;
311                  print "hash: $slice / ",$self->{'slices'}," => $slice\n";                  $self->_debug("hash: $hash / ",$self->{'slices'}," => $slice");
312                  return $slice;                  return $slice;
313          } else {          } else {
314                  return &{$self->{'split'}}($path);                  return &{$self->{'split'}}($path);
# Line 313  which hasn't changed a while (so, expire Line 329  which hasn't changed a while (so, expire
329  sub find_paths {  sub find_paths {
330          my $self = shift;          my $self = shift;
331    
         my $s = shift || return;  
332  }  }
333    
334    
# Line 324  Create C<swish-e> configuration file for Line 339  Create C<swish-e> configuration file for
339    my $config_filename = $i->make_config('slice name');    my $config_filename = $i->make_config('slice name');
340    
341  It returns configuration filename. If no C<swish_config> was defined in  It returns configuration filename. If no C<swish_config> was defined in
342  L<"open">, default swish-e configuration will be used. It will index all data for  L<"open_index">, default swish-e configuration will be used. It will index all data for
343  searching, but none for properties.  searching, but none for properties.
344    
345  If you want to see what is allready defined for swish-e in configuration  If you want to see what is allready defined for swish-e in configuration
346  take a look at source code for C<DEFAULT_SWISH_CONF>.  take a look at source code for C<DEFAULT_SWISH_CONF>.
347    
348  It uses C<cat> utility to comunicate with C<swish-e>. Path is provided  It uses C<stdin> as C<IndexDir> to comunicate with C<swish-e>.
 by C<File::Which>. Do Windows users have to change that to C<COPY /B>  
 or something similar?  
349    
350  =cut  =cut
351    
# Line 351  sub make_config { Line 364  sub make_config {
364          print $tmp_fh <<"DEFAULT_SWISH_CONF";          print $tmp_fh <<"DEFAULT_SWISH_CONF";
365  # swish-e config file  # swish-e config file
366    
367  IndexDir cat  IndexDir stdin
 #SwishProgParameters -  
368    
369  # input file definition  # input file definition
370  DefaultContents XML*  DefaultContents XML*
# Line 387  DEFAULT_SWISH_CONF Line 399  DEFAULT_SWISH_CONF
399    
400  =head2 create_slice  =head2 create_slice
401    
402  On first run, starts C<swish-e> using L<IPC::Run>. On subsequent calls just return  On first run, starts C<swish-e>. On subsequent calls just return
403  it's handles using L<Memoize>.  it's handles using C<Memoize>.
404    
405    my $s = create_slice('/path/to/document');    my $s = create_slice('/path/to/document');
406    
# Line 408  sub create_slice { Line 420  sub create_slice {
420    
421          my $swish_config = $self->make_config($s);          my $swish_config = $self->make_config($s);
422    
423          print STDERR "creating slice $s\n";     # FIXME          my $swish = qq{| swish-e };
424            if (-f $self->{'index'}.'/'.$s) {
425                    $swish .= qq{ -u };
426                    $self->{'slice'}->{$s}->{'update_mode'}++;
427            }
428            $swish .= qq{ -S prog -c } . $swish_config;
429    
430          my @swish = qw(swish-e -S prog -c);          $self->_debug("creating slice $s using $swish");
         push @swish, $swish_config;  
431    
432          ## Build the harness, open all pipes, and launch the subprocesses          ## Build the harness, open all pipes, and launch the subprocesses
433          $self->{'slice'}->{$s}->{'h'} = start \@swish,          open(my $fh, $swish) || croak "can't open $swish: $!";
                 \$self->{'slice'}->{$s}->{'in'},  
                 \$self->{'slice'}->{$s}->{'out'},  
                 \$self->{'slice'}->{$s}->{'err'},  
                 timeout( 90 );  # FIXME  
434    
435          $self->{'slice'}->{$s}->{'out_len'} = 0;          $self->{'slice'}->{$s}->{'h'} = $fh;
         $self->{'slice'}->{$s}->{'err_len'} = 0;  
436    
437          $self->slice_output($s);          $self->slice_output($s);
438    
# Line 430  sub create_slice { Line 441  sub create_slice {
441    
442  =head2 put_slice  =head2 put_slice
443    
444  Pass XML data to swish and receive output and errors.  Pass XML data to swish.
445    
446    my ($out,$err) = $i->put_slice('/swish/path', '<xml>data</xml>');    my $slice = $i->put_slice('/swish/path', '<xml>data</xml>');
447    
448    Returns slice in which XML ended up.
449    
450  =cut  =cut
451    
# Line 447  sub put_slice { Line 460  sub put_slice {
460          my $s = $self->create_slice($path) || confess "create_slice returned null";          my $s = $self->create_slice($path) || confess "create_slice returned null";
461    
462          confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));          confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
         confess "no 'in' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'in'}));  
463          confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));          confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));
464    
465          $self->slice_output($s);          $self->slice_output($s);
466    
467          use bytes;      # as opposed to chars          use bytes;      # as opposed to chars
468          $self->{'slice'}->{$s}->{'in'} .=          my $fh = $self->{'slice'}->{$s}->{'h'} || confess "handle for slice $s undefined";
469                  "Path-Name: $path\n".  
470                  "Content-Length: ".(length($xml)+1)."\n".          my $update_header = "Update-Mode: Index\n";
471                  "Document-Type: XML\n\n$xml\n";          $update_header = '' unless ($self->{'slice'}->{$s}->{'update_mode'});
472    
473          # do I/O          print { $fh } "Path-Name: $path\n".
474          $self->{'slice'}->{$s}->{'h'}->pump while length $self->{'slice'}->{$s}->{'in'} ;  # wait for all input to go                  "Content-Length: ".(length($xml)+1)."\n" . $update_header .
475                    "Document-Type: XML\n\n$xml\n";
476    
477          $self->slice_output($s);          $self->slice_output($s);
478    
479            $self->_debug("dumping in slice $s: $path");
480    
481            $self->{'paths'}->{$path} = ADDED;
482    
483          return $s;          return $s;
484  }  }
485    
# Line 470  sub put_slice { Line 487  sub put_slice {
487    
488  Prints to STDERR output and errors from C<swish-e>.  Prints to STDERR output and errors from C<swish-e>.
489    
490    $i->slice_output($s);    my $slice = $i->slice_output($s);
491    
492  Normally, you don't need to call it.  Normally, you don't need to call it.
493    
494    B<This is dummy placeholder function for very old code that assumes this
495    module is using C<IPC::Run> which it isn't any more.>
496    
497  =cut  =cut
498    
499  sub slice_output {  sub slice_output {
# Line 482  sub slice_output { Line 502  sub slice_output {
502          my $s = shift || confess "slice_output needs slice";          my $s = shift || confess "slice_output needs slice";
503    
504          confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));          confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
         confess "no 'in' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'in'}));  
         confess "no 'out' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'out'}));  
505    
506          if (length $self->{'slice'}->{$s}->{'out'} > $self->{'slice'}->{$s}->{'out_len'}) {          # FIXME
                 #print STDERR "swish-e OUT: ",$self->{'slice'}->{$s}->{'out'},"\n" if ($self->{'slice'}->{$s}->{'out'});  
                 $self->{'slice'}->{$s}->{'out_len'} = length $self->{'slice'}->{$s}->{'out'};  
                 return 1;  
         } elsif (length $self->{'slice'}->{$s}->{'err'} > $self->{'slice'}->{$s}->{'err_len'}) {  
                 print STDERR "swish-e ERR: ",$self->{'slice'}->{$s}->{'err'},"\n" if ($self->{'slice'}->{$s}->{'err'});  
                 $self->{'slice'}->{$s}->{'err_len'} = length $self->{'slice'}->{$s}->{'err'};  
                 # this is fatal  
                 return 0;  
         }  
507    
508          return 1;          return $s;
509  }  }
510    
511  =head2 close_slice {  =head2 close_slice
512    
513  Close slice (terminates swish-e process for that slice).  Close slice (terminates swish-e process for that slice).
514    
# Line 518  sub close_slice { Line 527  sub close_slice {
527          confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));          confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));
528    
529          # pump rest of content (if any)          # pump rest of content (if any)
530          $self->{'slice'}->{$s}->{'h'}->pump while length $self->{'slice'}->{$s}->{'in'};          close $self->{'slice'}->{$s}->{'h'} || carp "can't close slice $s: $!";
531    
532          $self->slice_output($s);          $self->slice_output($s);
533    
534          # clean up          undef $self->{'slice'}->{$s}->{'h'};
         $self->{'slice'}->{$s}->{'h'}->finish or confess "finish on slice $s returned: $?";  
535                    
536          delete($self->{'slice'}->{$s}) && return 1;          delete($self->{'slice'}->{$s}) && return 1;
537          return 0;          return 0;
# Line 536  Data will not yet be recoded to UTF-8. L Line 544  Data will not yet be recoded to UTF-8. L
544    
545    my $xml = $i->to_xml({ foo => 'bar' });    my $xml = $i->to_xml({ foo => 'bar' });
546    
547  This function is extracted from L<"add"> method so that you can L<Memoize> it.  This function is extracted from L<"add"> method so that you can C<Memoize> it.
548  If your data set has a lot of repeatable data, and memory is not a problem, you  If your data set has a lot of repeatable data, and memory is not a problem, you
549  can add C<memoize_to_xml> option to L<"open">.  can add C<memoize_to_xml> option to L<"open_index">.
550    
551  =cut  =cut
552    
# Line 563  sub to_xml { Line 571  sub to_xml {
571          $xml .= qq{</xml>};          $xml .= qq{</xml>};
572  }  }
573    
574    sub _debug {
575            my $self = shift;
576            print STDERR "## ",@_,"\n" if ($self->{'debug'});
577            return;
578    }
579    
580  1;  1;
581  __END__  __END__
582    
583    
584  =head2 Searching  =head1 Searching
585    
586  Searching is still conducted using L<SWISH::API>, but you have to glob  Searching is still conducted using L<SWISH::API>, but you have to glob
587  index names.  index names.
# Line 582  not change your source code at all. Line 596  not change your source code at all.
596  That would also benefit performance, but it increases indexing time  That would also benefit performance, but it increases indexing time
597  because merged indexes must be re-created on each indexing run.  because merged indexes must be re-created on each indexing run.
598    
599  =head2 EXPORT  =head1 EXPORT
600    
601  Nothing by default.  Nothing by default.
602    
603  =head2 EXAMPLES  =head1 EXAMPLES
604    
605  Test script for this module uses all parts of API. It's also nice example  Test script for this module uses all parts of API. It's also nice example
606  how to use C<SWISH::Split>.  how to use C<SWISH::Split>.

Legend:
Removed from v.4  
changed lines
  Added in v.13

  ViewVC Help
Powered by ViewVC 1.1.26