/[SWISH-Split]/trunk/Split.pm
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/Split.pm

Parent Directory | Revision Log | View Patch

revision 4 by dpavlin, Sun Aug 8 19:22:56 2004 UTC revision 7 by dpavlin, Fri Dec 17 18:32:34 2004 UTC
# Line 4  use 5.008; Line 4  use 5.008;
4  use strict;  use strict;
5  use warnings;  use warnings;
6    
7  our $VERSION = '0.00';  our $VERSION = '0.01';
8    
9  use SWISH::API;  use SWISH::API;
10  use Text::Iconv;  use Text::Iconv;
# Line 17  use File::Which; Line 17  use File::Which;
17    
18  use Data::Dumper;  use Data::Dumper;
19    
20    use constant {
21            ADDED => 1,
22            DELETED => 2,
23    };
24    
25  =head1 NAME  =head1 NAME
26    
27  SWISH::Split - Perl interface to split index variant of Swish-e  SWISH::Split - Perl interface to split index variant of Swish-e
# Line 37  L<Plucene::Simple>. This could make your Line 42  L<Plucene::Simple>. This could make your
42  engines) easier.  engines) easier.
43    
44  In the background, it will fork swish-e binaries (one for each index slice)  In the background, it will fork swish-e binaries (one for each index slice)
45  and produce UTF-8 encoded XML files for it. So, if your imput charset isn't  and produce UTF-8 encoded XML files for it. So, if your input charset isn't
46  C<ISO-8859-1> you will have to specify it.  C<ISO-8859-1> you will have to specify it.
47    
48  =head1 Methods used for indexing  =head1 Methods used for indexing
# Line 144  sub add { Line 149  sub add {
149          my $swishpath = shift || return;          my $swishpath = shift || return;
150          my $data = shift || return;          my $data = shift || return;
151    
152          my ($out,$err) = $self->put_slice($swishpath, $self->to_xml($data));          my $slice = $self->put_slice($swishpath, $self->to_xml($data));
153    
154          if ($err) {  #       if ($err) {
155                  carp "$swishpath: $err";  #               carp "$swishpath: $err";
156                  return 0;  #               return undef;
157          }  #       }
158    
159          return 1;          return $slice;
160  }  }
161    
162  =head2 delete  =head2 delete
# Line 167  sub delete { Line 172  sub delete {
172    
173          my @paths = @_ || return;          my @paths = @_ || return;
174    
175            foreach my $path (@paths) {
176                    $self->{'paths'}->{$path} = DELETED;
177            }
178    
179          return 42;          return 42;
180  }  }
181    
182    
183  =head2 finish  =head2 done
184    
185  Finish indexing and close index file(s).  Finish indexing and close index file(s).
186    
187    $i->finish;    $i->done;
188    
189  This is the most time-consuming operation. When it's called, it will re-index  This is the most time-consuming operation. When it's called, it will re-index
190  all entries which haven't changed in all slices.  all entries which haven't changed in all slices.
191    
192  Returns number of slices updated.  Returns number of slices updated.
193    
194    This method should really be called close or finish, but both of those are
195    already used.
196    
197  =cut  =cut
198    
199  sub finish {  sub done {
200          my $self = shift;          my $self = shift;
201    
202          my $ret = 0;          my $ret = 0;
203    
204          foreach my $s (keys %{$self->{'slice'}}) {          foreach my $s (keys %{$self->{'slice'}}) {
205                    print STDERR "closing slice $s\n";
206                  $ret += $self->close_slice($s);                  $ret += $self->close_slice($s);
207          }          }
208    
# Line 212  Return array of C<swishpath>s in index. Line 225  Return array of C<swishpath>s in index.
225    
226  sub swishpaths {  sub swishpaths {
227          my $self = shift;          my $self = shift;
228    
229            my $s = shift || return;
230            return if (! exists($self->{'slice'}->{'s'}));
231    
232            return keys %{$self->{'slice'}->{'s'}};
233  }  }
234    
235  =head2 swishpaths_updated  =head2 swishpaths_updated
# Line 313  which hasn't changed a while (so, expire Line 331  which hasn't changed a while (so, expire
331  sub find_paths {  sub find_paths {
332          my $self = shift;          my $self = shift;
333    
         my $s = shift || return;  
334  }  }
335    
336    
# Line 330  searching, but none for properties. Line 347  searching, but none for properties.
347  If you want to see what is already defined for swish-e in configuration  If you want to see what is already defined for swish-e in configuration
348  take a look at source code for C<DEFAULT_SWISH_CONF>.  take a look at source code for C<DEFAULT_SWISH_CONF>.
349    
350  It uses C<cat> utility to communicate with C<swish-e>. Path is provided  It uses C<stdin> as C<IndexDir> to communicate with C<swish-e>.
 by C<File::Which>. Do Windows users have to change that to C<COPY /B>  
 or something similar?  
351    
352  =cut  =cut
353    
# Line 351  sub make_config { Line 366  sub make_config {
366          print $tmp_fh <<"DEFAULT_SWISH_CONF";          print $tmp_fh <<"DEFAULT_SWISH_CONF";
367  # swish-e config file  # swish-e config file
368    
369  IndexDir cat  IndexDir stdin
 #SwishProgParameters -  
370    
371  # input file definition  # input file definition
372  DefaultContents XML*  DefaultContents XML*
# Line 410  sub create_slice { Line 424  sub create_slice {
424    
425          print STDERR "creating slice $s\n";     # FIXME          print STDERR "creating slice $s\n";     # FIXME
426    
427          my @swish = qw(swish-e -S prog -c);          my @swish = qw(swish-e -u -S prog -c);
428          push @swish, $swish_config;          push @swish, $swish_config;
429    
430          ## Build the harness, open all pipes, and launch the subprocesses          ## Build the harness, open all pipes, and launch the subprocesses
# Line 430  sub create_slice { Line 444  sub create_slice {
444    
445  =head2 put_slice  =head2 put_slice
446    
447  Pass XML data to swish and receive output and errors.  Pass XML data to swish.
448    
449    my ($out,$err) = $i->put_slice('/swish/path', '<xml>data</xml>');    my $slice = $i->put_slice('/swish/path', '<xml>data</xml>');
450    
451    Returns slice in which XML ended up.
452    
453  =cut  =cut
454    
# Line 456  sub put_slice { Line 472  sub put_slice {
472          $self->{'slice'}->{$s}->{'in'} .=          $self->{'slice'}->{$s}->{'in'} .=
473                  "Path-Name: $path\n".                  "Path-Name: $path\n".
474                  "Content-Length: ".(length($xml)+1)."\n".                  "Content-Length: ".(length($xml)+1)."\n".
475                    "Update-Mode: Index\n".
476                  "Document-Type: XML\n\n$xml\n";                  "Document-Type: XML\n\n$xml\n";
477    
478          # do I/O          # do I/O
# Line 463  sub put_slice { Line 480  sub put_slice {
480    
481          $self->slice_output($s);          $self->slice_output($s);
482    
483            $self->{'paths'}->{$path} = ADDED;
484    
485          return $s;          return $s;
486  }  }
487    
# Line 470  sub put_slice { Line 489  sub put_slice {
489    
490  Prints to STDERR output and errors from C<swish-e>.  Prints to STDERR output and errors from C<swish-e>.
491    
492    $i->slice_output($s);    my $slice = $i->slice_output($s);
493    
494  Normally, you don't need to call it.  Normally, you don't need to call it.
495    
# Line 488  sub slice_output { Line 507  sub slice_output {
507          if (length $self->{'slice'}->{$s}->{'out'} > $self->{'slice'}->{$s}->{'out_len'}) {          if (length $self->{'slice'}->{$s}->{'out'} > $self->{'slice'}->{$s}->{'out_len'}) {
508                  #print STDERR "swish-e OUT: ",$self->{'slice'}->{$s}->{'out'},"\n" if ($self->{'slice'}->{$s}->{'out'});                  #print STDERR "swish-e OUT: ",$self->{'slice'}->{$s}->{'out'},"\n" if ($self->{'slice'}->{$s}->{'out'});
509                  $self->{'slice'}->{$s}->{'out_len'} = length $self->{'slice'}->{$s}->{'out'};                  $self->{'slice'}->{$s}->{'out_len'} = length $self->{'slice'}->{$s}->{'out'};
510                  return 1;                  return $s;
511          } elsif (length $self->{'slice'}->{$s}->{'err'} > $self->{'slice'}->{$s}->{'err_len'}) {          } elsif (length $self->{'slice'}->{$s}->{'err'} > $self->{'slice'}->{$s}->{'err_len'}) {
512                  print STDERR "swish-e ERR: ",$self->{'slice'}->{$s}->{'err'},"\n" if ($self->{'slice'}->{$s}->{'err'});                  print STDERR "swish-e ERR: ",$self->{'slice'}->{$s}->{'err'},"\n" if ($self->{'slice'}->{$s}->{'err'});
513                  $self->{'slice'}->{$s}->{'err_len'} = length $self->{'slice'}->{$s}->{'err'};                  $self->{'slice'}->{$s}->{'err_len'} = length $self->{'slice'}->{$s}->{'err'};
514                  # this is fatal                  # this is fatal
515                  return 0;                  return undef;
516          }          }
517    
518          return 1;          return $s;
519  }  }
520    
521  =head2 close_slice {  =head2 close_slice
522    
523  Close slice (terminates swish-e process for that slice).  Close slice (terminates swish-e process for that slice).
524    
# Line 523  sub close_slice { Line 542  sub close_slice {
542          $self->slice_output($s);          $self->slice_output($s);
543    
544          # clean up          # clean up
545          $self->{'slice'}->{$s}->{'h'}->finish or confess "finish on slice $s returned: $?";          $self->{'slice'}->{$s}->{'h'}->finish or confess "finish on slice $s returned $?: $! -- ",$self->{'slice'}->{$s}->{'err'};
546                    
547          delete($self->{'slice'}->{$s}) && return 1;          delete($self->{'slice'}->{$s}) && return 1;
548          return 0;          return 0;
# Line 567  sub to_xml { Line 586  sub to_xml {
586  __END__  __END__
587    
588    
589  =head2 Searching  =head1 Searching
590    
591  Searching is still conducted using L<SWISH::API>, but you have to glob  Searching is still conducted using L<SWISH::API>, but you have to glob
592  index names.  index names.
# Line 582  not change your source code at all. Line 601  not change your source code at all.
601  That would also benefit performance, but it increases indexing time  That would also benefit performance, but it increases indexing time
602  because merged indexes must be re-created on each indexing run.  because merged indexes must be re-created on each indexing run.
603    
604  =head2 EXPORT  =head1 EXPORT
605    
606  Nothing by default.  Nothing by default.
607    
608  =head2 EXAMPLES  =head1 EXAMPLES
609    
610  Test script for this module uses all parts of API. It's also nice example  Test script for this module uses all parts of API. It's also nice example
611  how to use C<SWISH::Split>.  how to use C<SWISH::Split>.

Legend:
Removed from v.4  
changed lines
  Added in v.7

  ViewVC Help
Powered by ViewVC 1.1.26