/[SWISH-Split]/trunk/Split.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/Split.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 3 by dpavlin, Sun Aug 8 10:53:04 2004 UTC revision 4 by dpavlin, Sun Aug 8 19:22:56 2004 UTC
# Line 12  use File::Temp qw/ :mktemp /; Line 12  use File::Temp qw/ :mktemp /;
12  use Carp;  use Carp;
13  use Digest::MD5 qw(md5_hex);  use Digest::MD5 qw(md5_hex);
14  use Memoize;  use Memoize;
15    use IPC::Run qw(start timeout pump finish);
16    use File::Which;
17    
18  use Data::Dumper;  use Data::Dumper;
19    
# Line 27  SWISH::Split - Perl interface to split i Line 29  SWISH::Split - Perl interface to split i
29  =head1 DESCRIPTION  =head1 DESCRIPTION
30    
31  This is alternative interface for indexing data with swish-e. It's designed  This is alternative interface for indexing data with swish-e. It's designed
32  to split indexes over multiple files to allow updates of records in index  to split indexes over multiple files (slices) to allow updates of records in index
33  by reindexing just changed parts.  by reindexing just changed parts (slice).
34    
35  Data is stored in index using intrface which is somewhat similar to  Data is stored in index using intrface which is somewhat similar to
36  L<Plucene::Simple>. This could make your migration (or supporting two index  L<Plucene::Simple>. This could make your migration (or supporting two index
37  engines) easier.  engines) easier.
38    
39  In the background, it will fork swish-e binaries (one for each index slice)  In the background, it will fork swish-e binaries (one for each index slice)
40  and produce UTF-8 encoded XML files. So, if your imput charset isn't  and produce UTF-8 encoded XML files for it. So, if your imput charset isn't
41  C<ISO-8859-1> you will have to specify it.  C<ISO-8859-1> you will have to specify it.
42    
43  =head1 Methods used for indexing  =head1 Methods used for indexing
# Line 48  Create new object for index. Line 50  Create new object for index.
50          index => '/path/to/index',          index => '/path/to/index',
51          slice_name => \&slice_on_path,          slice_name => \&slice_on_path,
52          slices => 30,          slices => 30,
53          merge => 1,          merge => 0,
54          codepage => 'ISO-8859-2'          codepage => 'ISO-8859-2',
55            swish_config => qq{
56                    PropertyNames from date
57                    PropertyNamesDate date
58            },
59            memoize_to_xml => 0,
60    );    );
61    
62    # split index on first component of path    # split index on first component of path
# Line 57  Create new object for index. Line 64  Create new object for index.
64          return shift split(/\//,$_[0]);          return shift split(/\//,$_[0]);
65    }    }
66    
67    Options to open are following:
68    
69  C<slices> is maximum number of index slices. See L<"in_slice"> for  =over 5
70    
71    =item C<index>
72    
73    path to (existing) directory in which index slices will be created.
74    
75    =item C<slice_name>
76    
77    coderef to function which provide slicing from path.
78    
79    =item C<slices>
80    
81    maximum number of index slices. See L<"in_slice"> for
82  more explanation.  more explanation.
83    
84    =item C<merge>
85    
86    (planned) option to merge indexes into one at end.
87    
88    =item C<codepage>
89    
90    data codepage (needed for conversion to UTF-8).
91    By default, it's C<ISO-8859-1>.
92    
93    =item C<swish_config>
94    
95    additional parametars which will be inserted into
96    C<swish-e> configuration file. See L<swish-config>.
97    
98    =item C<memoize_to_xml>
99    
100    speed up repeatable data, see L<"to_xml">.
101    
102    =back
103    
104  =cut  =cut
105    
106  my $iso2utf = Text::Iconv->new('ISO-8859-1','UTF-8');  my $iso2utf = Text::Iconv->new('ISO-8859-1','UTF-8');
# Line 70  sub open { Line 110  sub open {
110          my $self = {@_};          my $self = {@_};
111          bless($self, $class);          bless($self, $class);
112    
         print Dumper($self->{'slice_name'});  
   
113          croak "need slice_name coderef" unless ref $self->{'slice_name'};          croak "need slice_name coderef" unless ref $self->{'slice_name'};
114          croak "need slices" unless $self->{'slices'};          croak "need slices" unless $self->{'slices'};
115    
# Line 81  sub open { Line 119  sub open {
119    
120          $iso2utf = Text::Iconv->new($self->{'codepage'},'UTF-8') if ($self->{'codepage'});          $iso2utf = Text::Iconv->new($self->{'codepage'},'UTF-8') if ($self->{'codepage'});
121    
122            # speedup
123          memoize('in_slice');          memoize('in_slice');
124            memoize('to_xml') if ($self->{'memoize_to_xml'});
125    
126          $self ? return $self : return undef;          $self ? return $self : return undef;
127    
# Line 104  sub add { Line 144  sub add {
144          my $swishpath = shift || return;          my $swishpath = shift || return;
145          my $data = shift || return;          my $data = shift || return;
146    
147            my ($out,$err) = $self->put_slice($swishpath, $self->to_xml($data));
148    
149            if ($err) {
150                    carp "$swishpath: $err";
151                    return 0;
152            }
153    
154          return 1;          return 1;
155  }  }
156    
# Line 124  sub delete { Line 171  sub delete {
171  }  }
172    
173    
174  =head2 close  =head2 finish
175    
176  Close index file and finish indexing.  Finish indexing and close index file(s).
177    
178    $i->close;    $i->finish;
179    
180  This is most time-consuming operation. When it's called, it will re-index  This is most time-consuming operation. When it's called, it will re-index
181  all entries which haven't changed in all slices.  all entries which haven't changed in all slices.
182    
183    Returns number of slices updated.
184    
185  =cut  =cut
186    
187  sub close {  sub finish {
188          my $self = shift;          my $self = shift;
189    
190          return 1;          my $ret = 0;
191    
192            foreach my $s (keys %{$self->{'slice'}}) {
193                    $ret += $self->close_slice($s);
194            }
195    
196            return $ret;
197  }  }
198    
199    
# Line 214  for your data. If you have to re-index l Line 269  for your data. If you have to re-index l
269  run, think about creating your own C<slice> function and distributing  run, think about creating your own C<slice> function and distributing
270  documents manually across slices.  documents manually across slices.
271    
272    Slice number must always be true value or various sanity checks will fail.
273    
274  This function is C<Memoize>ed for performance reasons.  This function is C<Memoize>ed for performance reasons.
275    
276  =cut  =cut
# Line 223  sub in_slice { Line 280  sub in_slice {
280    
281          my $path = shift || confess "need path";          my $path = shift || confess "need path";
282    
         print Dumper($self->{'slice_name'});  
283          confess "need slice_name function" unless ref ($self->{'slice_name'});          confess "need slice_name function" unless ref ($self->{'slice_name'});
284    
285          if ($self->{'slices'}) {          if ($self->{'slices'}) {
# Line 235  sub in_slice { Line 291  sub in_slice {
291                  # FIXME how random is this?                  # FIXME how random is this?
292                  $slice = hex(substr($slice,0,8));                  $slice = hex(substr($slice,0,8));
293                                    
294                  print "slice_nr: $slice slices: ",$self->{'slices'},"\n";                  $slice = ($slice % $self->{'slices'}) + 1;
295                  return ($slice % $self->{'slices'});                  print "hash: $slice / ",$self->{'slices'}," => $slice\n";
296                    return $slice;
297          } else {          } else {
298                  return &{$self->{'split'}}($path);                  return &{$self->{'split'}}($path);
299          }          }
# Line 260  sub find_paths { Line 317  sub find_paths {
317  }  }
318    
319    
320    =head2 make_config
321    
322    Create C<swish-e> configuration file for given slice.
323    
324      my $config_filename = $i->make_config('slice name');
325    
326    It returns configuration filename. If no C<swish_config> was defined in
327    L<"open">, default swish-e configuration will be used. It will index all data for
328    searching, but none for properties.
329    
330    If you want to see what is allready defined for swish-e in configuration
331    take a look at source code for C<DEFAULT_SWISH_CONF>.
332    
333    It uses C<cat> utility to comunicate with C<swish-e>. Path is provided
334    by C<File::Which>. Do Windows users have to change that to C<COPY /B>
335    or something similar?
336    
337    =cut
338    
339    sub make_config {
340            my $self = shift;
341    
342    
343            my $index_file = $self->{'index'}."/";
344            $index_file .= shift || confess "need slice name";
345    
346            my ($tmp_fh, $swish_config_filename) = mkstemp("/tmp/swishXXXXX");
347    
348            # find cat on filesystem
349            my $cat = which('cat');
350    
351            print $tmp_fh <<"DEFAULT_SWISH_CONF";
352    # swish-e config file
353    
354    IndexDir cat
355    #SwishProgParameters -
356    
357    # input file definition
358    DefaultContents XML*
359    
360    # indexed metatags
361    MetaNames xml swishdocpath
362    
363    
364    #XMLClassAttributes type
365    UndefinedMetaTags auto
366    UndefinedXMLAttributes auto
367    
368    IndexFile $index_file
369    
370    # Croatian ISO-8859-2 characters to unaccented equivalents
371    TranslateCharacters ƾ ssddcccczz
372    
373    
374    # disable output
375    ParserWarnLevel 0
376    IndexReport 1
377    
378    DEFAULT_SWISH_CONF
379    
380            # add user parametars (like stored properties)
381            print $tmp_fh $self->{'swish_config'} if ($self->{'swish_config'});
382    
383            close($tmp_fh);
384    
385            return $swish_config_filename;
386    }
387    
388    =head2 create_slice
389    
390    On first run, starts C<swish-e> using L<IPC::Run>. On subsequent calls just return
391    it's handles using L<Memoize>.
392    
393      my $s = create_slice('/path/to/document');
394    
395    You shouldn't need to call C<create_slice> directly because it will be called
396    from L<"put_slice"> when needed.
397    
398    =cut
399    
400    sub create_slice {
401            my $self = shift;
402    
403            my $path = shift || confess "create_slice need path!";
404    
405            my $s = $self->in_slice($path) || confess "in_slice returned null";
406    
407            return $s if (exists($self->{'slice'}->{$s}));
408    
409            my $swish_config = $self->make_config($s);
410    
411            print STDERR "creating slice $s\n";     # FIXME
412    
413            my @swish = qw(swish-e -S prog -c);
414            push @swish, $swish_config;
415    
416            ## Build the harness, open all pipes, and launch the subprocesses
417            $self->{'slice'}->{$s}->{'h'} = start \@swish,
418                    \$self->{'slice'}->{$s}->{'in'},
419                    \$self->{'slice'}->{$s}->{'out'},
420                    \$self->{'slice'}->{$s}->{'err'},
421                    timeout( 90 );  # FIXME
422    
423            $self->{'slice'}->{$s}->{'out_len'} = 0;
424            $self->{'slice'}->{$s}->{'err_len'} = 0;
425    
426            $self->slice_output($s);
427    
428            return $s;
429    }
430    
431    =head2 put_slice
432    
433    Pass XML data to swish and receive output and errors.
434    
435      my ($out,$err) = $i->put_slice('/swish/path', '<xml>data</xml>');
436    
437    =cut
438    
439    sub put_slice {
440            my $self = shift;
441    
442            my $path = shift || confess "need path";
443            my $xml = shift || confess "need xml";
444    
445            $xml = $iso2utf->convert($xml) || carp "XML conversion error in $xml";
446    
447            my $s = $self->create_slice($path) || confess "create_slice returned null";
448    
449            confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
450            confess "no 'in' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'in'}));
451            confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));
452    
453            $self->slice_output($s);
454    
455            use bytes;      # as opposed to chars
456            $self->{'slice'}->{$s}->{'in'} .=
457                    "Path-Name: $path\n".
458                    "Content-Length: ".(length($xml)+1)."\n".
459                    "Document-Type: XML\n\n$xml\n";
460    
461            # do I/O
462            $self->{'slice'}->{$s}->{'h'}->pump while length $self->{'slice'}->{$s}->{'in'} ;  # wait for all input to go
463    
464            $self->slice_output($s);
465    
466            return $s;
467    }
468    
469    =head2 slice_output
470    
471    Prints to STDERR output and errors from C<swish-e>.
472    
473      $i->slice_output($s);
474    
475    Normally, you don't need to call it.
476    
477    =cut
478    
479    sub slice_output {
480            my $self = shift;
481    
482            my $s = shift || confess "slice_output needs slice";
483    
484            confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
485            confess "no 'in' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'in'}));
486            confess "no 'out' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'out'}));
487    
488            if (length $self->{'slice'}->{$s}->{'out'} > $self->{'slice'}->{$s}->{'out_len'}) {
489                    #print STDERR "swish-e OUT: ",$self->{'slice'}->{$s}->{'out'},"\n" if ($self->{'slice'}->{$s}->{'out'});
490                    $self->{'slice'}->{$s}->{'out_len'} = length $self->{'slice'}->{$s}->{'out'};
491                    return 1;
492            } elsif (length $self->{'slice'}->{$s}->{'err'} > $self->{'slice'}->{$s}->{'err_len'}) {
493                    print STDERR "swish-e ERR: ",$self->{'slice'}->{$s}->{'err'},"\n" if ($self->{'slice'}->{$s}->{'err'});
494                    $self->{'slice'}->{$s}->{'err_len'} = length $self->{'slice'}->{$s}->{'err'};
495                    # this is fatal
496                    return 0;
497            }
498    
499            return 1;
500    }
501    
502    =head2 close_slice {
503    
504    Close slice (terminates swish-e process for that slice).
505    
506      my $i->close_slice($s);
507    
508    Returns true if slice is closed, false otherwise.
509    
510    =cut
511    
512    sub close_slice {
513            my $self = shift;
514    
515            my $s = shift || confess "close_slice needs slice";
516    
517            confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
518            confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));
519    
520            # pump rest of content (if any)
521            $self->{'slice'}->{$s}->{'h'}->pump while length $self->{'slice'}->{$s}->{'in'};
522    
523            $self->slice_output($s);
524    
525            # clean up
526            $self->{'slice'}->{$s}->{'h'}->finish or confess "finish on slice $s returned: $?";
527            
528            delete($self->{'slice'}->{$s}) && return 1;
529            return 0;
530    }
531    
532    =head2 to_xml
533    
534    Convert (binary safe, I hope) your data into XML for C<swish-e>.
535    Data will not yet be recoded to UTF-8. L<"put_slice"> will do that.
536    
537      my $xml = $i->to_xml({ foo => 'bar' });
538    
539    This function is extracted from L<"add"> method so that you can L<Memoize> it.
540    If your data set has a lot of repeatable data, and memory is not a problem, you
541    can add C<memoize_to_xml> option to L<"open">.
542    
543    =cut
544    
545    my %escape = ('<'=>'&lt;', '>'=>'&gt;', '&'=>'&amp;', '"'=>'&quot;');
546    my $escape_re  = join '|' => keys %escape;
547    
548    sub to_xml {
549            my $self = shift;
550    
551            my $data = shift || return;
552    
553            my $xml = qq{<xml>};
554            foreach my $tag (keys %$data) {
555                    my $content = $data->{$tag};
556                    next if (! $content || $content eq '');
557                    # save [cr/]lf before conversion to XML
558    #               $content =~ s/\n\r/##lf##/gs;
559    #               $content =~ s/\n/##lf##/gs;
560                    $content =~ s/($escape_re)/$escape{$1}/gs;
561                    $xml .= "<$tag><![CDATA[".$content."]]></$tag>";
562            }
563            $xml .= qq{</xml>};
564    }
565    
566  1;  1;
567  __END__  __END__
# Line 282  because merged indexes must be re-create Line 584  because merged indexes must be re-create
584    
585  =head2 EXPORT  =head2 EXPORT
586    
587  None by default.  Nothing by default.
588    
589    =head2 EXAMPLES
590    
591    Test script for this module uses all parts of API. It's also nice example
592    how to use C<SWISH::Split>.
593    
594  =head1 SEE ALSO  =head1 SEE ALSO
595    

Legend:
Removed from v.3  
changed lines
  Added in v.4

  ViewVC Help
Powered by ViewVC 1.1.26