/[SWISH-Split]/trunk/Split.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/Split.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 3 by dpavlin, Sun Aug 8 10:53:04 2004 UTC revision 9 by dpavlin, Fri Apr 29 22:35:21 2005 UTC
# Line 4  use 5.008; Line 4  use 5.008;
4  use strict;  use strict;
5  use warnings;  use warnings;
6    
7  our $VERSION = '0.00';  our $VERSION = '0.03';
8    
9  use SWISH::API;  use SWISH::API;
10  use Text::Iconv;  use Text::Iconv;
# Line 12  use File::Temp qw/ :mktemp /; Line 12  use File::Temp qw/ :mktemp /;
12  use Carp;  use Carp;
13  use Digest::MD5 qw(md5_hex);  use Digest::MD5 qw(md5_hex);
14  use Memoize;  use Memoize;
15    use File::Which;
16    
17  use Data::Dumper;  use Data::Dumper;
18    
19    use constant {
20            ADDED => 1,
21            DELETED => 2,
22    };
23    
24  =head1 NAME  =head1 NAME
25    
26  SWISH::Split - Perl interface to split index variant of Swish-e  SWISH::Split - Perl interface to split index variant of Swish-e
# Line 27  SWISH::Split - Perl interface to split i Line 33  SWISH::Split - Perl interface to split i
33  =head1 DESCRIPTION  =head1 DESCRIPTION
34    
35  This is alternative interface for indexing data with swish-e. It's designed  This is alternative interface for indexing data with swish-e. It's designed
36  to split indexes over multiple files to allow updates of records in index  to split indexes over multiple files (slices) to allow updates of records in index
37  by reindexing just changed parts.  by reindexing just changed parts (slice).
38    
39  Data is stored in index using intrface which is somewhat similar to  Data is stored in index using intrface which is somewhat similar to
40  L<Plucene::Simple>. This could make your migration (or supporting two index  L<Plucene::Simple>. This could make your migration (or supporting two index
41  engines) easier.  engines) easier.
42    
43  In the background, it will fork swish-e binaries (one for each index slice)  In the background, it will fork swish-e binaries (one for each index slice)
44  and produce UTF-8 encoded XML files. So, if your imput charset isn't  and produce UTF-8 encoded XML files for it. So, if your input charset isn't
45  C<ISO-8859-1> you will have to specify it.  C<ISO-8859-1> you will have to specify it.
46    
47  =head1 Methods used for indexing  =head1 Methods used for indexing
48    
49  =head2 open  =head2 open_index
50    
51  Create new object for index.  Create new object for index.
52    
53    my $i = SWISH::Split->open({    my $i = SWISH::Split->open_index({
54          index => '/path/to/index',          index => '/path/to/index',
55          slice_name => \&slice_on_path,          slice_name => \&slice_on_path,
56          slices => 30,          slices => 30,
57          merge => 1,          merge => 0,
58          codepage => 'ISO-8859-2'          codepage => 'ISO-8859-2',
59            swish_config => qq{
60                    PropertyNames from date
61                    PropertyNamesDate date
62            },
63            memoize_to_xml => 0,
64    );    );
65    
66    # split index on first component of path    # split index on first component of path
# Line 57  Create new object for index. Line 68  Create new object for index.
68          return shift split(/\//,$_[0]);          return shift split(/\//,$_[0]);
69    }    }
70    
71    Options to C<open_index> are following:
72    
73    =over 5
74    
75    =item C<index>
76    
77    path to (existing) directory in which index slices will be created.
78    
79    =item C<slice_name>
80    
81  C<slices> is maximum number of index slices. See L<"in_slice"> for  coderef to function which provide slicing from path.
82    
83    =item C<slices>
84    
85    maximum number of index slices. See L<"in_slice"> for
86  more explanation.  more explanation.
87    
88    =item C<merge>
89    
90    (planned) option to merge indexes into one at end.
91    
92    =item C<codepage>
93    
94    data codepage (needed for conversion to UTF-8).
95    By default, it's C<ISO-8859-1>.
96    
97    =item C<swish_config>
98    
99    additional parametars which will be inserted into
100    C<swish-e> configuration file. See L<swish-config>.
101    
102    =item C<memoize_to_xml>
103    
104    speed up repeatable data, see L<"to_xml">.
105    
106    =back
107    
108  =cut  =cut
109    
110  my $iso2utf = Text::Iconv->new('ISO-8859-1','UTF-8');  my $iso2utf = Text::Iconv->new('ISO-8859-1','UTF-8');
111    
112  sub open {  sub open_index {
113          my $class = shift;          my $class = shift;
114          my $self = {@_};          my $self = {@_};
115          bless($self, $class);          bless($self, $class);
116    
         print Dumper($self->{'slice_name'});  
   
117          croak "need slice_name coderef" unless ref $self->{'slice_name'};          croak "need slice_name coderef" unless ref $self->{'slice_name'};
118          croak "need slices" unless $self->{'slices'};          croak "need slices" unless $self->{'slices'};
119    
# Line 81  sub open { Line 123  sub open {
123    
124          $iso2utf = Text::Iconv->new($self->{'codepage'},'UTF-8') if ($self->{'codepage'});          $iso2utf = Text::Iconv->new($self->{'codepage'},'UTF-8') if ($self->{'codepage'});
125    
126            # speedup
127          memoize('in_slice');          memoize('in_slice');
128            memoize('to_xml') if ($self->{'memoize_to_xml'});
129    
130          $self ? return $self : return undef;          $self ? return $self : return undef;
131    
# Line 104  sub add { Line 148  sub add {
148          my $swishpath = shift || return;          my $swishpath = shift || return;
149          my $data = shift || return;          my $data = shift || return;
150    
151          return 1;          my $slice = $self->put_slice($swishpath, $self->to_xml($data));
152    
153            return $slice;
154  }  }
155    
156  =head2 delete  =head2 delete
# Line 120  sub delete { Line 166  sub delete {
166    
167          my @paths = @_ || return;          my @paths = @_ || return;
168    
169            foreach my $path (@paths) {
170                    $self->{'paths'}->{$path} = DELETED;
171            }
172    
173          return 42;          return 42;
174  }  }
175    
176    
177  =head2 close  =head2 done
178    
179  Close index file and finish indexing.  Finish indexing and close index file(s).
180    
181    $i->close;    $i->done;
182    
183  This is most time-consuming operation. When it's called, it will re-index  This is most time-consuming operation. When it's called, it will re-index
184  all entries which haven't changed in all slices.  all entries which haven't changed in all slices.
185    
186    Returns number of slices updated.
187    
188    This method should really be called close or finish, but both of those are
189    allready used.
190    
191  =cut  =cut
192    
193  sub close {  sub done {
194          my $self = shift;          my $self = shift;
195    
196          return 1;          my $ret = 0;
197    
198            foreach my $s (keys %{$self->{'slice'}}) {
199                    $self->_debug("closing slice $s");
200                    $ret += $self->close_slice($s);
201            }
202    
203            return $ret;
204  }  }
205    
206    
# Line 157  Return array of C<swishpath>s in index. Line 219  Return array of C<swishpath>s in index.
219    
220  sub swishpaths {  sub swishpaths {
221          my $self = shift;          my $self = shift;
222    
223            my $s = shift || return;
224            return if (! exists($self->{'slice'}->{'s'}));
225    
226            return keys %{$self->{'slice'}->{'s'}};
227  }  }
228    
229  =head2 swishpaths_updated  =head2 swishpaths_updated
# Line 207  Takes path and return slice in which thi Line 274  Takes path and return slice in which thi
274    
275    my $s = $i->in_slice('path/to/document/in/index');    my $s = $i->in_slice('path/to/document/in/index');
276    
277  If there are C<slices> parametar to L<"open"> it will use  If there are C<slices> parametar to L<"open_index"> it will use
278  MD5 hash to spread documents across slices. That will produce random  MD5 hash to spread documents across slices. That will produce random
279  distribution of your documents in slices, which might or might not be best  distribution of your documents in slices, which might or might not be best
280  for your data. If you have to re-index large number of slices on each  for your data. If you have to re-index large number of slices on each
281  run, think about creating your own C<slice> function and distributing  run, think about creating your own C<slice> function and distributing
282  documents manually across slices.  documents manually across slices.
283    
284    Slice number must always be true value or various sanity checks will fail.
285    
286  This function is C<Memoize>ed for performance reasons.  This function is C<Memoize>ed for performance reasons.
287    
288  =cut  =cut
# Line 223  sub in_slice { Line 292  sub in_slice {
292    
293          my $path = shift || confess "need path";          my $path = shift || confess "need path";
294    
         print Dumper($self->{'slice_name'});  
295          confess "need slice_name function" unless ref ($self->{'slice_name'});          confess "need slice_name function" unless ref ($self->{'slice_name'});
296    
297          if ($self->{'slices'}) {          if ($self->{'slices'}) {
298                  # first, pass path through slice_name function                  # first, pass path through slice_name function
299                  my $slice = &{$self->{'slice_name'}}($path);                  my $slice = &{$self->{'slice_name'}}($path);
300                  # then calculate MD5 hash                  # then calculate MD5 hash
301                  $slice = md5_hex($slice);                  my $hash = md5_hex($slice);
302                  # take first 8 chars to produce number                  # take first 8 chars to produce number
303                  # FIXME how random is this?                  # FIXME how random is this?
304                  $slice = hex(substr($slice,0,8));                  $hash = hex(substr($hash,0,8));
305                                    
306                  print "slice_nr: $slice slices: ",$self->{'slices'},"\n";                  $slice = ($hash % $self->{'slices'}) + 1;
307                  return ($slice % $self->{'slices'});                  $self->_debug("hash: $hash / ",$self->{'slices'}," => $slice");
308                    return $slice;
309          } else {          } else {
310                  return &{$self->{'split'}}($path);                  return &{$self->{'split'}}($path);
311          }          }
# Line 256  which hasn't changed a while (so, expire Line 325  which hasn't changed a while (so, expire
325  sub find_paths {  sub find_paths {
326          my $self = shift;          my $self = shift;
327    
         my $s = shift || return;  
328  }  }
329    
330    
331    =head2 make_config
332    
333    Create C<swish-e> configuration file for given slice.
334    
335      my $config_filename = $i->make_config('slice name');
336    
337    It returns configuration filename. If no C<swish_config> was defined in
338    L<"open_index">, default swish-e configuration will be used. It will index all data for
339    searching, but none for properties.
340    
341    If you want to see what is allready defined for swish-e in configuration
342    take a look at source code for C<DEFAULT_SWISH_CONF>.
343    
344    It uses C<stdin> as C<IndexDir> to comunicate with C<swish-e>.
345    
346    =cut
347    
348    sub make_config {
349            my $self = shift;
350    
351    
352            my $index_file = $self->{'index'}."/";
353            $index_file .= shift || confess "need slice name";
354    
355            my ($tmp_fh, $swish_config_filename) = mkstemp("/tmp/swishXXXXX");
356    
357            # find cat on filesystem
358            my $cat = which('cat');
359    
360            print $tmp_fh <<"DEFAULT_SWISH_CONF";
361    # swish-e config file
362    
363    IndexDir stdin
364    
365    # input file definition
366    DefaultContents XML*
367    
368    # indexed metatags
369    MetaNames xml swishdocpath
370    
371    
372    #XMLClassAttributes type
373    UndefinedMetaTags auto
374    UndefinedXMLAttributes auto
375    
376    IndexFile $index_file
377    
378    # Croatian ISO-8859-2 characters to unaccented equivalents
379    TranslateCharacters ¹©ðÐèÈæƾ® ssddcccczz
380    
381    
382    # disable output
383    ParserWarnLevel 0
384    IndexReport 1
385    
386    DEFAULT_SWISH_CONF
387    
388            # add user parametars (like stored properties)
389            print $tmp_fh $self->{'swish_config'} if ($self->{'swish_config'});
390    
391            close($tmp_fh);
392    
393            return $swish_config_filename;
394    }
395    
396    =head2 create_slice
397    
398    On first run, starts C<swish-e>. On subsequent calls just return
399    it's handles using L<Memoize>.
400    
401      my $s = create_slice('/path/to/document');
402    
403    You shouldn't need to call C<create_slice> directly because it will be called
404    from L<"put_slice"> when needed.
405    
406    =cut
407    
408    sub create_slice {
409            my $self = shift;
410    
411            my $path = shift || confess "create_slice need path!";
412    
413            my $s = $self->in_slice($path) || confess "in_slice returned null";
414    
415            return $s if (exists($self->{'slice'}->{$s}));
416    
417            my $swish_config = $self->make_config($s);
418    
419            my $swish = qq{| swish-e };
420            if (-f $self->{'index'}.'/'.$s) {
421                    $swish .= qq{ -u };
422                    $self->{'slice'}->{$s}->{'update_mode'}++;
423            }
424            $swish .= qq{ -S prog -c } . $swish_config;
425    
426            $self->_debug("creating slice $s using $swish");
427    
428            ## Build the harness, open all pipes, and launch the subprocesses
429            open(my $fh, $swish) || croak "can't open $swish: $!";
430    
431            $self->{'slice'}->{$s}->{'h'} = $fh;
432    
433            $self->slice_output($s);
434    
435            return $s;
436    }
437    
438    =head2 put_slice
439    
440    Pass XML data to swish.
441    
442      my $slice = $i->put_slice('/swish/path', '<xml>data</xml>');
443    
444    Returns slice in which XML ended up.
445    
446    =cut
447    
448    sub put_slice {
449            my $self = shift;
450    
451            my $path = shift || confess "need path";
452            my $xml = shift || confess "need xml";
453    
454            $xml = $iso2utf->convert($xml) || carp "XML conversion error in $xml";
455    
456            my $s = $self->create_slice($path) || confess "create_slice returned null";
457    
458            confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
459            confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));
460    
461            $self->slice_output($s);
462    
463            use bytes;      # as opposed to chars
464            my $fh = $self->{'slice'}->{$s}->{'h'} || confess "handle for slice $s undefined";
465    
466            my $update_header = "Update-Mode: Index\n";
467            $update_header = '' unless ($self->{'slice'}->{$s}->{'update_mode'});
468    
469            print { $fh } "Path-Name: $path\n".
470                    "Content-Length: ".(length($xml)+1)."\n" . $update_header .
471                    "Document-Type: XML\n\n$xml\n";
472    
473            $self->slice_output($s);
474    
475            $self->_debug("dumping in slice $s: $path");
476    
477            $self->{'paths'}->{$path} = ADDED;
478    
479            return $s;
480    }
481    
482    =head2 slice_output
483    
484    Prints to STDERR output and errors from C<swish-e>.
485    
486      my $slice = $i->slice_output($s);
487    
488    Normally, you don't need to call it.
489    
490    B<This is dummy placeholder function for very old code that assumes this
491    module is using C<IPC::Run> which it isn't any more.>
492    
493    =cut
494    
495    sub slice_output {
496            my $self = shift;
497    
498            my $s = shift || confess "slice_output needs slice";
499    
500            confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
501    
502            # FIXME
503    
504            return $s;
505    }
506    
507    =head2 close_slice
508    
509    Close slice (terminates swish-e process for that slice).
510    
511      my $i->close_slice($s);
512    
513    Returns true if slice is closed, false otherwise.
514    
515    =cut
516    
517    sub close_slice {
518            my $self = shift;
519    
520            my $s = shift || confess "close_slice needs slice";
521    
522            confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
523            confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));
524    
525            # pump rest of content (if any)
526            close $self->{'slice'}->{$s}->{'h'} || carp "can't close slice $s: $!";
527    
528            $self->slice_output($s);
529    
530            undef $self->{'slice'}->{$s}->{'h'};
531            
532            delete($self->{'slice'}->{$s}) && return 1;
533            return 0;
534    }
535    
536    =head2 to_xml
537    
538    Convert (binary safe, I hope) your data into XML for C<swish-e>.
539    Data will not yet be recoded to UTF-8. L<"put_slice"> will do that.
540    
541      my $xml = $i->to_xml({ foo => 'bar' });
542    
543    This function is extracted from L<"add"> method so that you can L<Memoize> it.
544    If your data set has a lot of repeatable data, and memory is not a problem, you
545    can add C<memoize_to_xml> option to L<"open_index">.
546    
547    =cut
548    
549    my %escape = ('<'=>'&lt;', '>'=>'&gt;', '&'=>'&amp;', '"'=>'&quot;');
550    my $escape_re  = join '|' => keys %escape;
551    
552    sub to_xml {
553            my $self = shift;
554    
555            my $data = shift || return;
556    
557            my $xml = qq{<xml>};
558            foreach my $tag (keys %$data) {
559                    my $content = $data->{$tag};
560                    next if (! $content || $content eq '');
561                    # save [cr/]lf before conversion to XML
562    #               $content =~ s/\n\r/##lf##/gs;
563    #               $content =~ s/\n/##lf##/gs;
564                    $content =~ s/($escape_re)/$escape{$1}/gs;
565                    $xml .= "<$tag><![CDATA[".$content."]]></$tag>";
566            }
567            $xml .= qq{</xml>};
568    }
569    
570    sub _debug {
571            my $self = shift;
572            print STDERR "## ",@_,"\n" if ($self->{'debug'});
573            return;
574    }
575    
576  1;  1;
577  __END__  __END__
578    
579    
580  =head2 Searching  =head1 Searching
581    
582  Searching is still conducted using L<SWISH::API>, but you have to glob  Searching is still conducted using L<SWISH::API>, but you have to glob
583  index names.  index names.
# Line 280  not change your source code at all. Line 592  not change your source code at all.
592  That would also benefit performance, but it increases indexing time  That would also benefit performance, but it increases indexing time
593  because merged indexes must be re-created on each indexing run.  because merged indexes must be re-created on each indexing run.
594    
595  =head2 EXPORT  =head1 EXPORT
596    
597  None by default.  Nothing by default.
598    
599    =head1 EXAMPLES
600    
601    Test script for this module uses all parts of API. It's also nice example
602    how to use C<SWISH::Split>.
603    
604  =head1 SEE ALSO  =head1 SEE ALSO
605    

Legend:
Removed from v.3  
changed lines
  Added in v.9

  ViewVC Help
Powered by ViewVC 1.1.26