/[SWISH-Split]/trunk/Split.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/Split.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 3 by dpavlin, Sun Aug 8 10:53:04 2004 UTC revision 8 by dpavlin, Sun Dec 19 03:06:01 2004 UTC
# Line 4  use 5.008; Line 4  use 5.008;
4  use strict;  use strict;
5  use warnings;  use warnings;
6    
7  our $VERSION = '0.00';  our $VERSION = '0.02';
8    
9  use SWISH::API;  use SWISH::API;
10  use Text::Iconv;  use Text::Iconv;
# Line 12  use File::Temp qw/ :mktemp /; Line 12  use File::Temp qw/ :mktemp /;
12  use Carp;  use Carp;
13  use Digest::MD5 qw(md5_hex);  use Digest::MD5 qw(md5_hex);
14  use Memoize;  use Memoize;
15    use File::Which;
16    
17  use Data::Dumper;  use Data::Dumper;
18    
19    use constant {
20            ADDED => 1,
21            DELETED => 2,
22    };
23    
24  =head1 NAME  =head1 NAME
25    
26  SWISH::Split - Perl interface to split index variant of Swish-e  SWISH::Split - Perl interface to split index variant of Swish-e
# Line 27  SWISH::Split - Perl interface to split i Line 33  SWISH::Split - Perl interface to split i
33  =head1 DESCRIPTION  =head1 DESCRIPTION
34    
35  This is alternative interface for indexing data with swish-e. It's designed  This is alternative interface for indexing data with swish-e. It's designed
36  to split indexes over multiple files to allow updates of records in index  to split indexes over multiple files (slices) to allow updates of records in index
37  by reindexing just changed parts.  by reindexing just changed parts (slice).
38    
39  Data is stored in index using intrface which is somewhat similar to  Data is stored in index using intrface which is somewhat similar to
40  L<Plucene::Simple>. This could make your migration (or supporting two index  L<Plucene::Simple>. This could make your migration (or supporting two index
41  engines) easier.  engines) easier.
42    
43  In the background, it will fork swish-e binaries (one for each index slice)  In the background, it will fork swish-e binaries (one for each index slice)
44  and produce UTF-8 encoded XML files. So, if your imput charset isn't  and produce UTF-8 encoded XML files for it. So, if your input charset isn't
45  C<ISO-8859-1> you will have to specify it.  C<ISO-8859-1> you will have to specify it.
46    
47  =head1 Methods used for indexing  =head1 Methods used for indexing
48    
49  =head2 open  =head2 open_index
50    
51  Create new object for index.  Create new object for index.
52    
53    my $i = SWISH::Split->open({    my $i = SWISH::Split->open_index({
54          index => '/path/to/index',          index => '/path/to/index',
55          slice_name => \&slice_on_path,          slice_name => \&slice_on_path,
56          slices => 30,          slices => 30,
57          merge => 1,          merge => 0,
58          codepage => 'ISO-8859-2'          codepage => 'ISO-8859-2',
59            swish_config => qq{
60                    PropertyNames from date
61                    PropertyNamesDate date
62            },
63            memoize_to_xml => 0,
64    );    );
65    
66    # split index on first component of path    # split index on first component of path
# Line 57  Create new object for index. Line 68  Create new object for index.
68          return shift split(/\//,$_[0]);          return shift split(/\//,$_[0]);
69    }    }
70    
71    Options to C<open_index> are following:
72    
73    =over 5
74    
75    =item C<index>
76    
77    path to (existing) directory in which index slices will be created.
78    
79    =item C<slice_name>
80    
81  C<slices> is maximum number of index slices. See L<"in_slice"> for  coderef to function which provide slicing from path.
82    
83    =item C<slices>
84    
85    maximum number of index slices. See L<"in_slice"> for
86  more explanation.  more explanation.
87    
88    =item C<merge>
89    
90    (planned) option to merge indexes into one at end.
91    
92    =item C<codepage>
93    
94    data codepage (needed for conversion to UTF-8).
95    By default, it's C<ISO-8859-1>.
96    
97    =item C<swish_config>
98    
99    additional parametars which will be inserted into
100    C<swish-e> configuration file. See L<swish-config>.
101    
102    =item C<memoize_to_xml>
103    
104    speed up repeatable data, see L<"to_xml">.
105    
106    =back
107    
108  =cut  =cut
109    
110  my $iso2utf = Text::Iconv->new('ISO-8859-1','UTF-8');  my $iso2utf = Text::Iconv->new('ISO-8859-1','UTF-8');
111    
112  sub open {  sub open_index {
113          my $class = shift;          my $class = shift;
114          my $self = {@_};          my $self = {@_};
115          bless($self, $class);          bless($self, $class);
116    
         print Dumper($self->{'slice_name'});  
   
117          croak "need slice_name coderef" unless ref $self->{'slice_name'};          croak "need slice_name coderef" unless ref $self->{'slice_name'};
118          croak "need slices" unless $self->{'slices'};          croak "need slices" unless $self->{'slices'};
119    
# Line 81  sub open { Line 123  sub open {
123    
124          $iso2utf = Text::Iconv->new($self->{'codepage'},'UTF-8') if ($self->{'codepage'});          $iso2utf = Text::Iconv->new($self->{'codepage'},'UTF-8') if ($self->{'codepage'});
125    
126            # speedup
127          memoize('in_slice');          memoize('in_slice');
128            memoize('to_xml') if ($self->{'memoize_to_xml'});
129    
130          $self ? return $self : return undef;          $self ? return $self : return undef;
131    
# Line 104  sub add { Line 148  sub add {
148          my $swishpath = shift || return;          my $swishpath = shift || return;
149          my $data = shift || return;          my $data = shift || return;
150    
151          return 1;          my $slice = $self->put_slice($swishpath, $self->to_xml($data));
152    
153            return $slice;
154  }  }
155    
156  =head2 delete  =head2 delete
# Line 120  sub delete { Line 166  sub delete {
166    
167          my @paths = @_ || return;          my @paths = @_ || return;
168    
169            foreach my $path (@paths) {
170                    $self->{'paths'}->{$path} = DELETED;
171            }
172    
173          return 42;          return 42;
174  }  }
175    
176    
177  =head2 close  =head2 done
178    
179  Close index file and finish indexing.  Finish indexing and close index file(s).
180    
181    $i->close;    $i->done;
182    
183  This is most time-consuming operation. When it's called, it will re-index  This is most time-consuming operation. When it's called, it will re-index
184  all entries which haven't changed in all slices.  all entries which haven't changed in all slices.
185    
186    Returns number of slices updated.
187    
188    This method should really be called close or finish, but both of those are
189    allready used.
190    
191  =cut  =cut
192    
193  sub close {  sub done {
194          my $self = shift;          my $self = shift;
195    
196          return 1;          my $ret = 0;
197    
198            foreach my $s (keys %{$self->{'slice'}}) {
199                    $self->_debug("closing slice $s");
200                    $ret += $self->close_slice($s);
201            }
202    
203            return $ret;
204  }  }
205    
206    
# Line 157  Return array of C<swishpath>s in index. Line 219  Return array of C<swishpath>s in index.
219    
220  sub swishpaths {  sub swishpaths {
221          my $self = shift;          my $self = shift;
222    
223            my $s = shift || return;
224            return if (! exists($self->{'slice'}->{'s'}));
225    
226            return keys %{$self->{'slice'}->{'s'}};
227  }  }
228    
229  =head2 swishpaths_updated  =head2 swishpaths_updated
# Line 207  Takes path and return slice in which thi Line 274  Takes path and return slice in which thi
274    
275    my $s = $i->in_slice('path/to/document/in/index');    my $s = $i->in_slice('path/to/document/in/index');
276    
277  If there are C<slices> parametar to L<"open"> it will use  If there are C<slices> parametar to L<"open_index"> it will use
278  MD5 hash to spread documents across slices. That will produce random  MD5 hash to spread documents across slices. That will produce random
279  distribution of your documents in slices, which might or might not be best  distribution of your documents in slices, which might or might not be best
280  for your data. If you have to re-index large number of slices on each  for your data. If you have to re-index large number of slices on each
281  run, think about creating your own C<slice> function and distributing  run, think about creating your own C<slice> function and distributing
282  documents manually across slices.  documents manually across slices.
283    
284    Slice number must always be true value or various sanity checks will fail.
285    
286  This function is C<Memoize>ed for performance reasons.  This function is C<Memoize>ed for performance reasons.
287    
288  =cut  =cut
# Line 223  sub in_slice { Line 292  sub in_slice {
292    
293          my $path = shift || confess "need path";          my $path = shift || confess "need path";
294    
         print Dumper($self->{'slice_name'});  
295          confess "need slice_name function" unless ref ($self->{'slice_name'});          confess "need slice_name function" unless ref ($self->{'slice_name'});
296    
297          if ($self->{'slices'}) {          if ($self->{'slices'}) {
298                  # first, pass path through slice_name function                  # first, pass path through slice_name function
299                  my $slice = &{$self->{'slice_name'}}($path);                  my $slice = &{$self->{'slice_name'}}($path);
300                  # then calculate MD5 hash                  # then calculate MD5 hash
301                  $slice = md5_hex($slice);                  my $hash = md5_hex($slice);
302                  # take first 8 chars to produce number                  # take first 8 chars to produce number
303                  # FIXME how random is this?                  # FIXME how random is this?
304                  $slice = hex(substr($slice,0,8));                  $hash = hex(substr($hash,0,8));
305                                    
306                  print "slice_nr: $slice slices: ",$self->{'slices'},"\n";                  $slice = ($hash % $self->{'slices'}) + 1;
307                  return ($slice % $self->{'slices'});                  $self->_debug("hash: $hash / ",$self->{'slices'}," => $slice");
308                    return $slice;
309          } else {          } else {
310                  return &{$self->{'split'}}($path);                  return &{$self->{'split'}}($path);
311          }          }
# Line 256  which hasn't changed a while (so, expire Line 325  which hasn't changed a while (so, expire
325  sub find_paths {  sub find_paths {
326          my $self = shift;          my $self = shift;
327    
         my $s = shift || return;  
328  }  }
329    
330    
331    =head2 make_config
332    
333    Create C<swish-e> configuration file for given slice.
334    
335      my $config_filename = $i->make_config('slice name');
336    
337    It returns configuration filename. If no C<swish_config> was defined in
338    L<"open_index">, default swish-e configuration will be used. It will index all data for
339    searching, but none for properties.
340    
341    If you want to see what is allready defined for swish-e in configuration
342    take a look at source code for C<DEFAULT_SWISH_CONF>.
343    
344    It uses C<stdin> as C<IndexDir> to comunicate with C<swish-e>.
345    
346    =cut
347    
348    sub make_config {
349            my $self = shift;
350    
351    
352            my $index_file = $self->{'index'}."/";
353            $index_file .= shift || confess "need slice name";
354    
355            my ($tmp_fh, $swish_config_filename) = mkstemp("/tmp/swishXXXXX");
356    
357            # find cat on filesystem
358            my $cat = which('cat');
359    
360            print $tmp_fh <<"DEFAULT_SWISH_CONF";
361    # swish-e config file
362    
363    IndexDir stdin
364    
365    # input file definition
366    DefaultContents XML*
367    
368    # indexed metatags
369    MetaNames xml swishdocpath
370    
371    
372    #XMLClassAttributes type
373    UndefinedMetaTags auto
374    UndefinedXMLAttributes auto
375    
376    IndexFile $index_file
377    
378    # Croatian ISO-8859-2 characters to unaccented equivalents
379    TranslateCharacters ¹©ðÐèÈæƾ® ssddcccczz
380    
381    
382    # disable output
383    ParserWarnLevel 0
384    IndexReport 1
385    
386    DEFAULT_SWISH_CONF
387    
388            # add user parametars (like stored properties)
389            print $tmp_fh $self->{'swish_config'} if ($self->{'swish_config'});
390    
391            close($tmp_fh);
392    
393            return $swish_config_filename;
394    }
395    
396    =head2 create_slice
397    
398    On first run, starts C<swish-e>. On subsequent calls just return
399    it's handles using L<Memoize>.
400    
401      my $s = create_slice('/path/to/document');
402    
403    You shouldn't need to call C<create_slice> directly because it will be called
404    from L<"put_slice"> when needed.
405    
406    =cut
407    
408    sub create_slice {
409            my $self = shift;
410    
411            my $path = shift || confess "create_slice need path!";
412    
413            my $s = $self->in_slice($path) || confess "in_slice returned null";
414    
415            return $s if (exists($self->{'slice'}->{$s}));
416    
417            my $swish_config = $self->make_config($s);
418    
419            my $swish = qq{| swish-e };
420            $swish .= qq{ -u } if (-f $self->{'index'}.'/'.$s);    
421            $swish .= qq{ -S prog -c } . $swish_config;
422    
423            $self->_debug("creating slice $s using $swish");
424    
425            ## Build the harness, open all pipes, and launch the subprocesses
426            open(my $fh, $swish) || croak "can't open $swish: $!";
427    
428            $self->{'slice'}->{$s}->{'h'} = $fh;
429    
430            $self->slice_output($s);
431    
432            return $s;
433    }
434    
435    =head2 put_slice
436    
437    Pass XML data to swish.
438    
439      my $slice = $i->put_slice('/swish/path', '<xml>data</xml>');
440    
441    Returns slice in which XML ended up.
442    
443    =cut
444    
445    sub put_slice {
446            my $self = shift;
447    
448            my $path = shift || confess "need path";
449            my $xml = shift || confess "need xml";
450    
451            $xml = $iso2utf->convert($xml) || carp "XML conversion error in $xml";
452    
453            my $s = $self->create_slice($path) || confess "create_slice returned null";
454    
455            confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
456            confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));
457    
458            $self->slice_output($s);
459    
460            use bytes;      # as opposed to chars
461            my $fh = $self->{'slice'}->{$s}->{'h'} || confess "handle for slice $s undefined";
462            print { $fh } "Path-Name: $path\n".
463                    "Content-Length: ".(length($xml)+1)."\n".
464                    "Update-Mode: Index\n".
465                    "Document-Type: XML\n\n$xml\n";
466    
467            $self->slice_output($s);
468    
469            $self->_debug("dumping in slice $s: $path");
470    
471            $self->{'paths'}->{$path} = ADDED;
472    
473            return $s;
474    }
475    
476    =head2 slice_output
477    
478    Prints to STDERR output and errors from C<swish-e>.
479    
480      my $slice = $i->slice_output($s);
481    
482    Normally, you don't need to call it.
483    
484    B<This is dummy placeholder function for very old code that assumes this
485    module is using C<IPC::Run> which it isn't any more.>
486    
487    =cut
488    
489    sub slice_output {
490            my $self = shift;
491    
492            my $s = shift || confess "slice_output needs slice";
493    
494            confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
495    
496            # FIXME
497    
498            return $s;
499    }
500    
501    =head2 close_slice
502    
503    Close slice (terminates swish-e process for that slice).
504    
505      my $i->close_slice($s);
506    
507    Returns true if slice is closed, false otherwise.
508    
509    =cut
510    
511    sub close_slice {
512            my $self = shift;
513    
514            my $s = shift || confess "close_slice needs slice";
515    
516            confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
517            confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));
518    
519            # pump rest of content (if any)
520            close $self->{'slice'}->{$s}->{'h'} || carp "can't close slice $s: $!";
521    
522            $self->slice_output($s);
523    
524            undef $self->{'slice'}->{$s}->{'h'};
525            
526            delete($self->{'slice'}->{$s}) && return 1;
527            return 0;
528    }
529    
530    =head2 to_xml
531    
532    Convert (binary safe, I hope) your data into XML for C<swish-e>.
533    Data will not yet be recoded to UTF-8. L<"put_slice"> will do that.
534    
535      my $xml = $i->to_xml({ foo => 'bar' });
536    
537    This function is extracted from L<"add"> method so that you can L<Memoize> it.
538    If your data set has a lot of repeatable data, and memory is not a problem, you
539    can add C<memoize_to_xml> option to L<"open_index">.
540    
541    =cut
542    
543    my %escape = ('<'=>'&lt;', '>'=>'&gt;', '&'=>'&amp;', '"'=>'&quot;');
544    my $escape_re  = join '|' => keys %escape;
545    
546    sub to_xml {
547            my $self = shift;
548    
549            my $data = shift || return;
550    
551            my $xml = qq{<xml>};
552            foreach my $tag (keys %$data) {
553                    my $content = $data->{$tag};
554                    next if (! $content || $content eq '');
555                    # save [cr/]lf before conversion to XML
556    #               $content =~ s/\n\r/##lf##/gs;
557    #               $content =~ s/\n/##lf##/gs;
558                    $content =~ s/($escape_re)/$escape{$1}/gs;
559                    $xml .= "<$tag><![CDATA[".$content."]]></$tag>";
560            }
561            $xml .= qq{</xml>};
562    }
563    
564    sub _debug {
565            my $self = shift;
566            print STDERR "## ",@_,"\n" if ($self->{'debug'});
567            return;
568    }
569    
570  1;  1;
571  __END__  __END__
572    
573    
574  =head2 Searching  =head1 Searching
575    
576  Searching is still conducted using L<SWISH::API>, but you have to glob  Searching is still conducted using L<SWISH::API>, but you have to glob
577  index names.  index names.
# Line 280  not change your source code at all. Line 586  not change your source code at all.
586  That would also benefit performance, but it increases indexing time  That would also benefit performance, but it increases indexing time
587  because merged indexes must be re-created on each indexing run.  because merged indexes must be re-created on each indexing run.
588    
589  =head2 EXPORT  =head1 EXPORT
590    
591  None by default.  Nothing by default.
592    
593    =head1 EXAMPLES
594    
595    Test script for this module uses all parts of API. It's also nice example
596    how to use C<SWISH::Split>.
597    
598  =head1 SEE ALSO  =head1 SEE ALSO
599    

Legend:
Removed from v.3  
changed lines
  Added in v.8

  ViewVC Help
Powered by ViewVC 1.1.26