/[SWISH-Split]/trunk/Split.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/Split.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 7 by dpavlin, Fri Dec 17 18:32:34 2004 UTC revision 9 by dpavlin, Fri Apr 29 22:35:21 2005 UTC
# Line 4  use 5.008; Line 4  use 5.008;
4  use strict;  use strict;
5  use warnings;  use warnings;
6    
7  our $VERSION = '0.01';  our $VERSION = '0.03';
8    
9  use SWISH::API;  use SWISH::API;
10  use Text::Iconv;  use Text::Iconv;
# Line 12  use File::Temp qw/ :mktemp /; Line 12  use File::Temp qw/ :mktemp /;
12  use Carp;  use Carp;
13  use Digest::MD5 qw(md5_hex);  use Digest::MD5 qw(md5_hex);
14  use Memoize;  use Memoize;
 use IPC::Run qw(start timeout pump finish);  
15  use File::Which;  use File::Which;
16    
17  use Data::Dumper;  use Data::Dumper;
# Line 47  C<ISO-8859-1> you will have to specify i Line 46  C<ISO-8859-1> you will have to specify i
46    
47  =head1 Methods used for indexing  =head1 Methods used for indexing
48    
49  =head2 open  =head2 open_index
50    
51  Create new object for index.  Create new object for index.
52    
53    my $i = SWISH::Split->open({    my $i = SWISH::Split->open_index({
54          index => '/path/to/index',          index => '/path/to/index',
55          slice_name => \&slice_on_path,          slice_name => \&slice_on_path,
56          slices => 30,          slices => 30,
# Line 69  Create new object for index. Line 68  Create new object for index.
68          return shift split(/\//,$_[0]);          return shift split(/\//,$_[0]);
69    }    }
70    
71  Options to open are following:  Options to C<open_index> are following:
72    
73  =over 5  =over 5
74    
# Line 110  speed up repeatable data, see L<"to_xml" Line 109  speed up repeatable data, see L<"to_xml"
109    
110  my $iso2utf = Text::Iconv->new('ISO-8859-1','UTF-8');  my $iso2utf = Text::Iconv->new('ISO-8859-1','UTF-8');
111    
112  sub open {  sub open_index {
113          my $class = shift;          my $class = shift;
114          my $self = {@_};          my $self = {@_};
115          bless($self, $class);          bless($self, $class);
# Line 151  sub add { Line 150  sub add {
150    
151          my $slice = $self->put_slice($swishpath, $self->to_xml($data));          my $slice = $self->put_slice($swishpath, $self->to_xml($data));
152    
 #       if ($err) {  
 #               carp "$swishpath: $err";  
 #               return undef;  
 #       }  
   
153          return $slice;          return $slice;
154  }  }
155    
# Line 202  sub done { Line 196  sub done {
196          my $ret = 0;          my $ret = 0;
197    
198          foreach my $s (keys %{$self->{'slice'}}) {          foreach my $s (keys %{$self->{'slice'}}) {
199                  print STDERR "closing slice $s\n";                  $self->_debug("closing slice $s");
200                  $ret += $self->close_slice($s);                  $ret += $self->close_slice($s);
201          }          }
202    
# Line 280  Takes path and return slice in which thi Line 274  Takes path and return slice in which thi
274    
275    my $s = $i->in_slice('path/to/document/in/index');    my $s = $i->in_slice('path/to/document/in/index');
276    
277  If there are C<slices> parametar to L<"open"> it will use  If there are C<slices> parametar to L<"open_index"> it will use
278  MD5 hash to spread documents across slices. That will produce random  MD5 hash to spread documents across slices. That will produce random
279  distribution of your documents in slices, which might or might not be best  distribution of your documents in slices, which might or might not be best
280  for your data. If you have to re-index large number of slices on each  for your data. If you have to re-index large number of slices on each
# Line 304  sub in_slice { Line 298  sub in_slice {
298                  # first, pass path through slice_name function                  # first, pass path through slice_name function
299                  my $slice = &{$self->{'slice_name'}}($path);                  my $slice = &{$self->{'slice_name'}}($path);
300                  # then calculate MD5 hash                  # then calculate MD5 hash
301                  $slice = md5_hex($slice);                  my $hash = md5_hex($slice);
302                  # take first 8 chars to produce number                  # take first 8 chars to produce number
303                  # FIXME how random is this?                  # FIXME how random is this?
304                  $slice = hex(substr($slice,0,8));                  $hash = hex(substr($hash,0,8));
305                                    
306                  $slice = ($slice % $self->{'slices'}) + 1;                  $slice = ($hash % $self->{'slices'}) + 1;
307                  print "hash: $slice / ",$self->{'slices'}," => $slice\n";                  $self->_debug("hash: $hash / ",$self->{'slices'}," => $slice");
308                  return $slice;                  return $slice;
309          } else {          } else {
310                  return &{$self->{'split'}}($path);                  return &{$self->{'split'}}($path);
# Line 341  Create C<swish-e> configuration file for Line 335  Create C<swish-e> configuration file for
335    my $config_filename = $i->make_config('slice name');    my $config_filename = $i->make_config('slice name');
336    
337  It returns configuration filename. If no C<swish_config> was defined in  It returns configuration filename. If no C<swish_config> was defined in
338  L<"open">, default swish-e configuration will be used. It will index all data for  L<"open_index">, default swish-e configuration will be used. It will index all data for
339  searching, but none for properties.  searching, but none for properties.
340    
341  If you want to see what is allready defined for swish-e in configuration  If you want to see what is allready defined for swish-e in configuration
# Line 401  DEFAULT_SWISH_CONF Line 395  DEFAULT_SWISH_CONF
395    
396  =head2 create_slice  =head2 create_slice
397    
398  On first run, starts C<swish-e> using L<IPC::Run>. On subsequent calls just return  On first run, starts C<swish-e>. On subsequent calls just return
399  it's handles using L<Memoize>.  it's handles using L<Memoize>.
400    
401    my $s = create_slice('/path/to/document');    my $s = create_slice('/path/to/document');
# Line 422  sub create_slice { Line 416  sub create_slice {
416    
417          my $swish_config = $self->make_config($s);          my $swish_config = $self->make_config($s);
418    
419          print STDERR "creating slice $s\n";     # FIXME          my $swish = qq{| swish-e };
420            if (-f $self->{'index'}.'/'.$s) {
421                    $swish .= qq{ -u };
422                    $self->{'slice'}->{$s}->{'update_mode'}++;
423            }
424            $swish .= qq{ -S prog -c } . $swish_config;
425    
426          my @swish = qw(swish-e -u -S prog -c);          $self->_debug("creating slice $s using $swish");
         push @swish, $swish_config;  
427    
428          ## Build the harness, open all pipes, and launch the subprocesses          ## Build the harness, open all pipes, and launch the subprocesses
429          $self->{'slice'}->{$s}->{'h'} = start \@swish,          open(my $fh, $swish) || croak "can't open $swish: $!";
                 \$self->{'slice'}->{$s}->{'in'},  
                 \$self->{'slice'}->{$s}->{'out'},  
                 \$self->{'slice'}->{$s}->{'err'},  
                 timeout( 90 );  # FIXME  
430    
431          $self->{'slice'}->{$s}->{'out_len'} = 0;          $self->{'slice'}->{$s}->{'h'} = $fh;
         $self->{'slice'}->{$s}->{'err_len'} = 0;  
432    
433          $self->slice_output($s);          $self->slice_output($s);
434    
# Line 463  sub put_slice { Line 456  sub put_slice {
456          my $s = $self->create_slice($path) || confess "create_slice returned null";          my $s = $self->create_slice($path) || confess "create_slice returned null";
457    
458          confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));          confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
         confess "no 'in' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'in'}));  
459          confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));          confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));
460    
461          $self->slice_output($s);          $self->slice_output($s);
462    
463          use bytes;      # as opposed to chars          use bytes;      # as opposed to chars
464          $self->{'slice'}->{$s}->{'in'} .=          my $fh = $self->{'slice'}->{$s}->{'h'} || confess "handle for slice $s undefined";
                 "Path-Name: $path\n".  
                 "Content-Length: ".(length($xml)+1)."\n".  
                 "Update-Mode: Index\n".  
                 "Document-Type: XML\n\n$xml\n";  
465    
466          # do I/O          my $update_header = "Update-Mode: Index\n";
467          $self->{'slice'}->{$s}->{'h'}->pump while length $self->{'slice'}->{$s}->{'in'} ;  # wait for all input to go          $update_header = '' unless ($self->{'slice'}->{$s}->{'update_mode'});
468    
469            print { $fh } "Path-Name: $path\n".
470                    "Content-Length: ".(length($xml)+1)."\n" . $update_header .
471                    "Document-Type: XML\n\n$xml\n";
472    
473          $self->slice_output($s);          $self->slice_output($s);
474    
475            $self->_debug("dumping in slice $s: $path");
476    
477          $self->{'paths'}->{$path} = ADDED;          $self->{'paths'}->{$path} = ADDED;
478    
479          return $s;          return $s;
# Line 493  Prints to STDERR output and errors from Line 487  Prints to STDERR output and errors from
487    
488  Normally, you don't need to call it.  Normally, you don't need to call it.
489    
490    B<This is dummy placeholder function for very old code that assumes this
491    module is using C<IPC::Run> which it isn't any more.>
492    
493  =cut  =cut
494    
495  sub slice_output {  sub slice_output {
# Line 501  sub slice_output { Line 498  sub slice_output {
498          my $s = shift || confess "slice_output needs slice";          my $s = shift || confess "slice_output needs slice";
499    
500          confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));          confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
         confess "no 'in' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'in'}));  
         confess "no 'out' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'out'}));  
501    
502          if (length $self->{'slice'}->{$s}->{'out'} > $self->{'slice'}->{$s}->{'out_len'}) {          # FIXME
                 #print STDERR "swish-e OUT: ",$self->{'slice'}->{$s}->{'out'},"\n" if ($self->{'slice'}->{$s}->{'out'});  
                 $self->{'slice'}->{$s}->{'out_len'} = length $self->{'slice'}->{$s}->{'out'};  
                 return $s;  
         } elsif (length $self->{'slice'}->{$s}->{'err'} > $self->{'slice'}->{$s}->{'err_len'}) {  
                 print STDERR "swish-e ERR: ",$self->{'slice'}->{$s}->{'err'},"\n" if ($self->{'slice'}->{$s}->{'err'});  
                 $self->{'slice'}->{$s}->{'err_len'} = length $self->{'slice'}->{$s}->{'err'};  
                 # this is fatal  
                 return undef;  
         }  
503    
504          return $s;          return $s;
505  }  }
# Line 537  sub close_slice { Line 523  sub close_slice {
523          confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));          confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));
524    
525          # pump rest of content (if any)          # pump rest of content (if any)
526          $self->{'slice'}->{$s}->{'h'}->pump while length $self->{'slice'}->{$s}->{'in'};          close $self->{'slice'}->{$s}->{'h'} || carp "can't close slice $s: $!";
527    
528          $self->slice_output($s);          $self->slice_output($s);
529    
530          # clean up          undef $self->{'slice'}->{$s}->{'h'};
         $self->{'slice'}->{$s}->{'h'}->finish or confess "finish on slice $s returned $?: $! -- ",$self->{'slice'}->{$s}->{'err'};  
531                    
532          delete($self->{'slice'}->{$s}) && return 1;          delete($self->{'slice'}->{$s}) && return 1;
533          return 0;          return 0;
# Line 557  Data will not yet be recoded to UTF-8. L Line 542  Data will not yet be recoded to UTF-8. L
542    
543  This function is extracted from L<"add"> method so that you can L<Memoize> it.  This function is extracted from L<"add"> method so that you can L<Memoize> it.
544  If your data set has a lot of repeatable data, and memory is not a problem, you  If your data set has a lot of repeatable data, and memory is not a problem, you
545  can add C<memoize_to_xml> option to L<"open">.  can add C<memoize_to_xml> option to L<"open_index">.
546    
547  =cut  =cut
548    
# Line 582  sub to_xml { Line 567  sub to_xml {
567          $xml .= qq{</xml>};          $xml .= qq{</xml>};
568  }  }
569    
570    sub _debug {
571            my $self = shift;
572            print STDERR "## ",@_,"\n" if ($self->{'debug'});
573            return;
574    }
575    
576  1;  1;
577  __END__  __END__
578    

Legend:
Removed from v.7  
changed lines
  Added in v.9

  ViewVC Help
Powered by ViewVC 1.1.26