/[SWISH-Split]/trunk/Split.pm

This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!

Diff of /trunk/Split.pm

Parent Directory | Revision Log | View Patch Patch

-revision 3 by dpavlin,
Sun Aug  8 10:53:04 2004 UTC
+revision 9 by dpavlin,
Fri Apr 29 22:35:21 2005 UTC
 Line 4 
 use 5.008;
  use strict;
  use warnings;
- our $VERSION = '0.00';
+ our $VERSION = '0.03';
  use SWISH::API;
  use Text::Iconv;
 Line 12 
 use File::Temp qw/ :mktemp /;
  use Carp;
  use Digest::MD5 qw(md5_hex);
  use Memoize;
+ use File::Which;
  use Data::Dumper;
+ use constant {
+         ADDED => 1,
+         DELETED => 2,
+ };
  =head1 NAME
  SWISH::Split - Perl interface to split index variant of Swish-e
-Line 27 
 SWISH::Split - Perl interface to split i
+Line 33 
 SWISH::Split - Perl interface to split i
  =head1 DESCRIPTION
  This is alternative interface for indexing data with swish-e. It's designed
- to split indexes over multiple files to allow updates of records in index
+ to split indexes over multiple files (slices) to allow updates of records in index
- by reindexing just changed parts.
+ by reindexing just changed parts (slice).
  Data is stored in index using intrface which is somewhat similar to
  L<Plucene::Simple>. This could make your migration (or supporting two index
  engines) easier.
  In the background, it will fork swish-e binaries (one for each index slice)
- and produce UTF-8 encoded XML files. So, if your imput charset isn't
+ and produce UTF-8 encoded XML files for it. So, if your input charset isn't
  C<ISO-8859-1> you will have to specify it.
  =head1 Methods used for indexing
- =head2 open
+ =head2 open_index
  Create new object for index.
-   my $i = SWISH::Split->open({
+   my $i = SWISH::Split->open_index({
          index => '/path/to/index',
          slice_name => \&slice_on_path,
          slices => 30,
-         merge => 1,
+         merge => 0,
-         codepage => 'ISO-8859-2'
+         codepage => 'ISO-8859-2',
+         swish_config => qq{
+                 PropertyNames from date
+                 PropertyNamesDate date
+         },
+         memoize_to_xml => 0,
    );
    # split index on first component of path
-Line 57 
 Create new object for index.
+Line 68 
 Create new object for index.
          return shift split(/\//,$_[0]);
    }
+ Options to C<open_index> are following:
+ =over 5
+ =item C<index>
+ path to (existing) directory in which index slices will be created.
+ =item C<slice_name>
- C<slices> is maximum number of index slices. See L<"in_slice"> for
+ coderef to function which provide slicing from path.
+ =item C<slices>
+ maximum number of index slices. See L<"in_slice"> for
  more explanation.
+ =item C<merge>
+ (planned) option to merge indexes into one at end.
+ =item C<codepage>
+ data codepage (needed for conversion to UTF-8).
+ By default, it's C<ISO-8859-1>.
+ =item C<swish_config>
+ additional parametars which will be inserted into
+ C<swish-e> configuration file. See L<swish-config>.
+ =item C<memoize_to_xml>
+ speed up repeatable data, see L<"to_xml">.
+ =back
  =cut
  my $iso2utf = Text::Iconv->new('ISO-8859-1','UTF-8');
- sub open {
+ sub open_index {
          my $class = shift;
          my $self = {@_};
          bless($self, $class);
-         print Dumper($self->{'slice_name'});
          croak "need slice_name coderef" unless ref $self->{'slice_name'};
          croak "need slices" unless $self->{'slices'};
-Line 81 
 sub open {
+Line 123 
 sub open {
          $iso2utf = Text::Iconv->new($self->{'codepage'},'UTF-8') if ($self->{'codepage'});
+         # speedup
          memoize('in_slice');
+         memoize('to_xml') if ($self->{'memoize_to_xml'});
          $self ? return $self : return undef;
-Line 104 
 sub add {
+Line 148 
 sub add {
          my $swishpath = shift || return;
          my $data = shift || return;
-         return 1;
+         my $slice = $self->put_slice($swishpath, $self->to_xml($data));
+         return $slice;
  }
  =head2 delete
-Line 120 
 sub delete {
+Line 166 
 sub delete {
          my @paths = @_ || return;
+         foreach my $path (@paths) {
+                 $self->{'paths'}->{$path} = DELETED;
+         }
          return 42;
  }
- =head2 close
+ =head2 done
- Close index file and finish indexing.
+ Finish indexing and close index file(s).
-   $i->close;
+   $i->done;
  This is most time-consuming operation. When it's called, it will re-index
  all entries which haven't changed in all slices.
+ Returns number of slices updated.
+ This method should really be called close or finish, but both of those are
+ allready used.
  =cut
- sub close {
+ sub done {
          my $self = shift;
-         return 1;
+         my $ret = 0;
+         foreach my $s (keys %{$self->{'slice'}}) {
+                 $self->_debug("closing slice $s");
+                 $ret += $self->close_slice($s);
+         }
+         return $ret;
  }
-Line 157 
 Return array of C<swishpath>s in index.
+Line 219 
 Return array of C<swishpath>s in index.
  sub swishpaths {
          my $self = shift;
+         my $s = shift || return;
+         return if (! exists($self->{'slice'}->{'s'}));
+         return keys %{$self->{'slice'}->{'s'}};
  }
  =head2 swishpaths_updated
-Line 207 
 Takes path and return slice in which thi
+Line 274 
 Takes path and return slice in which thi
    my $s = $i->in_slice('path/to/document/in/index');
- If there are C<slices> parametar to L<"open"> it will use
+ If there are C<slices> parametar to L<"open_index"> it will use
  MD5 hash to spread documents across slices. That will produce random
  distribution of your documents in slices, which might or might not be best
  for your data. If you have to re-index large number of slices on each
  run, think about creating your own C<slice> function and distributing
  documents manually across slices.
+ Slice number must always be true value or various sanity checks will fail.
  This function is C<Memoize>ed for performance reasons.
  =cut
-Line 223 
 sub in_slice {
+Line 292 
 sub in_slice {
          my $path = shift || confess "need path";
-         print Dumper($self->{'slice_name'});
          confess "need slice_name function" unless ref ($self->{'slice_name'});
          if ($self->{'slices'}) {
                  # first, pass path through slice_name function
                  my $slice = &{$self->{'slice_name'}}($path);
                  # then calculate MD5 hash
-                 $slice = md5_hex($slice);
+                 my $hash = md5_hex($slice);
                  # take first 8 chars to produce number
                  # FIXME how random is this?
-                 $slice = hex(substr($slice,0,8));
+                 $hash = hex(substr($hash,0,8));
-                 print "slice_nr: $slice slices: ",$self->{'slices'},"\n";
+                 $slice = ($hash % $self->{'slices'}) + 1;
-                 return ($slice % $self->{'slices'});
+                 $self->_debug("hash: $hash / ",$self->{'slices'}," => $slice");
+                 return $slice;
          } else {
                  return &{$self->{'split'}}($path);
          }
-Line 256 
 which hasn't changed a while (so, expire
+Line 325 
 which hasn't changed a while (so, expire
  sub find_paths {
          my $self = shift;
-         my $s = shift || return;
  }
+ =head2 make_config
+ Create C<swish-e> configuration file for given slice.
+   my $config_filename = $i->make_config('slice name');
+ It returns configuration filename. If no C<swish_config> was defined in
+ L<"open_index">, default swish-e configuration will be used. It will index all data for
+ searching, but none for properties.
+ If you want to see what is allready defined for swish-e in configuration
+ take a look at source code for C<DEFAULT_SWISH_CONF>.
+ It uses C<stdin> as C<IndexDir> to comunicate with C<swish-e>.
+ =cut
+ sub make_config {
+         my $self = shift;
+         my $index_file = $self->{'index'}."/";
+         $index_file .= shift || confess "need slice name";
+         my ($tmp_fh, $swish_config_filename) = mkstemp("/tmp/swishXXXXX");
+         # find cat on filesystem
+         my $cat = which('cat');
+         print $tmp_fh <<"DEFAULT_SWISH_CONF";
+ # swish-e config file
+ IndexDir stdin
+ # input file definition
+ DefaultContents XML*
+ # indexed metatags
+ MetaNames xml swishdocpath
+ #XMLClassAttributes type
+ UndefinedMetaTags auto
+ UndefinedXMLAttributes auto
+ IndexFile $index_file
+ # Croatian ISO-8859-2 characters to unaccented equivalents
+ TranslateCharacters �������ƾ� ssddcccczz
+ # disable output
+ ParserWarnLevel 0
+ IndexReport 1
+ DEFAULT_SWISH_CONF
+         # add user parametars (like stored properties)
+         print $tmp_fh $self->{'swish_config'} if ($self->{'swish_config'});
+         close($tmp_fh);
+         return $swish_config_filename;
+ }
+ =head2 create_slice
+ On first run, starts C<swish-e>. On subsequent calls just return
+ it's handles using L<Memoize>.
+   my $s = create_slice('/path/to/document');
+ You shouldn't need to call C<create_slice> directly because it will be called
+ from L<"put_slice"> when needed.
+ =cut
+ sub create_slice {
+         my $self = shift;
+         my $path = shift || confess "create_slice need path!";
+         my $s = $self->in_slice($path) || confess "in_slice returned null";
+         return $s if (exists($self->{'slice'}->{$s}));
+         my $swish_config = $self->make_config($s);
+         my $swish = qq{| swish-e };
+         if (-f $self->{'index'}.'/'.$s) {
+                 $swish .= qq{ -u };
+                 $self->{'slice'}->{$s}->{'update_mode'}++;
+         }
+         $swish .= qq{ -S prog -c } . $swish_config;
+         $self->_debug("creating slice $s using $swish");
+         ## Build the harness, open all pipes, and launch the subprocesses
+         open(my $fh, $swish) || croak "can't open $swish: $!";
+         $self->{'slice'}->{$s}->{'h'} = $fh;
+         $self->slice_output($s);
+         return $s;
+ }
+ =head2 put_slice
+ Pass XML data to swish.
+   my $slice = $i->put_slice('/swish/path', '<xml>data</xml>');
+ Returns slice in which XML ended up.
+ =cut
+ sub put_slice {
+         my $self = shift;
+         my $path = shift || confess "need path";
+         my $xml = shift || confess "need xml";
+         $xml = $iso2utf->convert($xml) || carp "XML conversion error in $xml";
+         my $s = $self->create_slice($path) || confess "create_slice returned null";
+         confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
+         confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));
+         $self->slice_output($s);
+         use bytes;      # as opposed to chars
+         my $fh = $self->{'slice'}->{$s}->{'h'} || confess "handle for slice $s undefined";
+         my $update_header = "Update-Mode: Index\n";
+         $update_header = '' unless ($self->{'slice'}->{$s}->{'update_mode'});
+         print { $fh } "Path-Name: $path\n".
+                 "Content-Length: ".(length($xml)+1)."\n" . $update_header .
+                 "Document-Type: XML\n\n$xml\n";
+         $self->slice_output($s);
+         $self->_debug("dumping in slice $s: $path");
+         $self->{'paths'}->{$path} = ADDED;
+         return $s;
+ }
+ =head2 slice_output
+ Prints to STDERR output and errors from C<swish-e>.
+   my $slice = $i->slice_output($s);
+ Normally, you don't need to call it.
+ B<This is dummy placeholder function for very old code that assumes this
+ module is using C<IPC::Run> which it isn't any more.>
+ =cut
+ sub slice_output {
+         my $self = shift;
+         my $s = shift || confess "slice_output needs slice";
+         confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
+         # FIXME
+         return $s;
+ }
+ =head2 close_slice
+ Close slice (terminates swish-e process for that slice).
+   my $i->close_slice($s);
+ Returns true if slice is closed, false otherwise.
+ =cut
+ sub close_slice {
+         my $self = shift;
+         my $s = shift || confess "close_slice needs slice";
+         confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
+         confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));
+         # pump rest of content (if any)
+         close $self->{'slice'}->{$s}->{'h'} || carp "can't close slice $s: $!";
+         $self->slice_output($s);
+         undef $self->{'slice'}->{$s}->{'h'};
+         delete($self->{'slice'}->{$s}) && return 1;
+         return 0;
+ }
+ =head2 to_xml
+ Convert (binary safe, I hope) your data into XML for C<swish-e>.
+ Data will not yet be recoded to UTF-8. L<"put_slice"> will do that.
+   my $xml = $i->to_xml({ foo => 'bar' });
+ This function is extracted from L<"add"> method so that you can L<Memoize> it.
+ If your data set has a lot of repeatable data, and memory is not a problem, you
+ can add C<memoize_to_xml> option to L<"open_index">.
+ =cut
+ my %escape = ('<'=>'&lt;', '>'=>'&gt;', '&'=>'&amp;', '"'=>'&quot;');
+ my $escape_re  = join '|' => keys %escape;
+ sub to_xml {
+         my $self = shift;
+         my $data = shift || return;
+         my $xml = qq{<xml>};
+         foreach my $tag (keys %$data) {
+                 my $content = $data->{$tag};
+                 next if (! $content || $content eq '');
+                 # save [cr/]lf before conversion to XML
+ #               $content =~ s/\n\r/##lf##/gs;
+ #               $content =~ s/\n/##lf##/gs;
+                 $content =~ s/($escape_re)/$escape{$1}/gs;
+                 $xml .= "<$tag><![CDATA[".$content."]]></$tag>";
+         }
+         $xml .= qq{</xml>};
+ }
+ sub _debug {
+         my $self = shift;
+         print STDERR "## ",@_,"\n" if ($self->{'debug'});
+         return;
+ }
 ;
  __END__
- =head2 Searching
+ =head1 Searching
  Searching is still conducted using L<SWISH::API>, but you have to glob
  index names.
-Line 280 
 not change your source code at all.
+Line 592 
 not change your source code at all.
  That would also benefit performance, but it increases indexing time
  because merged indexes must be re-created on each indexing run.
- =head2 EXPORT
+ =head1 EXPORT
- None by default.
+ Nothing by default.
+ =head1 EXAMPLES
+ Test script for this module uses all parts of API. It's also nice example
+ how to use C<SWISH::Split>.
  =head1 SEE ALSO

 Legend:



Removed from v.3
 


changed lines


 
Added in v.9
 Legend:



Removed from v.3
 


changed lines


 
Added in v.9
-Removed from v.3
+Added in v.9

	ViewVC Help
Powered by ViewVC 1.1.26