/[SWISH-Split]/trunk/Split.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/Split.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 6 - (show annotations)
Wed Dec 8 20:35:49 2004 UTC (19 years, 3 months ago) by dpavlin
File size: 13752 byte(s)
better distribution packaging and html target

1 package SWISH::Split;
2
3 use 5.008;
4 use strict;
5 use warnings;
6
7 our $VERSION = '0.00';
8
9 use SWISH::API;
10 use Text::Iconv;
11 use File::Temp qw/ :mktemp /;
12 use Carp;
13 use Digest::MD5 qw(md5_hex);
14 use Memoize;
15 use IPC::Run qw(start timeout pump finish);
16 use File::Which;
17
18 use Data::Dumper;
19
20 use constant {
21 ADDED => 1,
22 DELETED => 2,
23 };
24
25 =head1 NAME
26
27 SWISH::Split - Perl interface to split index variant of Swish-e
28
29 =head1 SYNOPSIS
30
31 use SWISH::Split;
32
33
34 =head1 DESCRIPTION
35
36 This is alternative interface for indexing data with swish-e. It's designed
37 to split indexes over multiple files (slices) to allow updates of records in index
38 by reindexing just changed parts (slice).
39
40 Data is stored in index using intrface which is somewhat similar to
41 L<Plucene::Simple>. This could make your migration (or supporting two index
42 engines) easier.
43
44 In the background, it will fork swish-e binaries (one for each index slice)
45 and produce UTF-8 encoded XML files for it. So, if your input charset isn't
46 C<ISO-8859-1> you will have to specify it.
47
48 =head1 Methods used for indexing
49
50 =head2 open
51
52 Create new object for index.
53
54 my $i = SWISH::Split->open({
55 index => '/path/to/index',
56 slice_name => \&slice_on_path,
57 slices => 30,
58 merge => 0,
59 codepage => 'ISO-8859-2',
60 swish_config => qq{
61 PropertyNames from date
62 PropertyNamesDate date
63 },
64 memoize_to_xml => 0,
65 );
66
67 # split index on first component of path
68 sub slice_on_path {
69 return shift split(/\//,$_[0]);
70 }
71
72 Options to open are following:
73
74 =over 5
75
76 =item C<index>
77
78 path to (existing) directory in which index slices will be created.
79
80 =item C<slice_name>
81
82 coderef to function which provide slicing from path.
83
84 =item C<slices>
85
86 maximum number of index slices. See L<"in_slice"> for
87 more explanation.
88
89 =item C<merge>
90
91 (planned) option to merge indexes into one at end.
92
93 =item C<codepage>
94
95 data codepage (needed for conversion to UTF-8).
96 By default, it's C<ISO-8859-1>.
97
98 =item C<swish_config>
99
100 additional parametars which will be inserted into
101 C<swish-e> configuration file. See L<swish-config>.
102
103 =item C<memoize_to_xml>
104
105 speed up repeatable data, see L<"to_xml">.
106
107 =back
108
109 =cut
110
111 my $iso2utf = Text::Iconv->new('ISO-8859-1','UTF-8');
112
113 sub open {
114 my $class = shift;
115 my $self = {@_};
116 bless($self, $class);
117
118 croak "need slice_name coderef" unless ref $self->{'slice_name'};
119 croak "need slices" unless $self->{'slices'};
120
121 croak "need index" unless $self->{'index'};
122 croak "index '",$self->{'index'},"' doesn't exist" unless -e $self->{'index'};
123 croak "index '",$self->{'index'},"' is not directory" unless -d $self->{'index'};
124
125 $iso2utf = Text::Iconv->new($self->{'codepage'},'UTF-8') if ($self->{'codepage'});
126
127 # speedup
128 memoize('in_slice');
129 memoize('to_xml') if ($self->{'memoize_to_xml'});
130
131 $self ? return $self : return undef;
132
133 }
134
135 =head2 add
136
137 Add document to index.
138
139 $i->add($swishpath, {
140 headline => 'foobar result',
141 property => 'data',
142 })
143
144 =cut
145
146 sub add {
147 my $self = shift;
148
149 my $swishpath = shift || return;
150 my $data = shift || return;
151
152 my ($out,$err) = $self->put_slice($swishpath, $self->to_xml($data));
153
154 if ($err) {
155 carp "$swishpath: $err";
156 return 0;
157 }
158
159 return 1;
160 }
161
162 =head2 delete
163
164 Delete documents from index.
165
166 $i->delete(@swishpath);
167
168 =cut
169
170 sub delete {
171 my $self = shift;
172
173 my @paths = @_ || return;
174
175 foreach my $path (@paths) {
176 $self->{'paths'}->{$path} = DELETED;
177 }
178
179 return 42;
180 }
181
182
183 =head2 done
184
185 Finish indexing and close index file(s).
186
187 $i->done;
188
189 This is most time-consuming operation. When it's called, it will re-index
190 all entries which haven't changed in all slices.
191
192 Returns number of slices updated.
193
194 This method should really be called close or finish, but both of those are
195 allready used.
196
197 =cut
198
199 sub done {
200 my $self = shift;
201
202 my $ret = 0;
203
204 foreach my $s (keys %{$self->{'slice'}}) {
205 print STDERR "closing slice $s\n";
206 $ret += $self->close_slice($s);
207 }
208
209 return $ret;
210 }
211
212
213
214 =head1 Reporting methods
215
216 This methods return statistics about your index.
217
218 =head2 swishpaths
219
220 Return array of C<swishpath>s in index.
221
222 my @p = $i->swishpaths;
223
224 =cut
225
226 sub swishpaths {
227 my $self = shift;
228
229 my $s = shift || return;
230 return if (! exists($self->{'slice'}->{'s'}));
231
232 return keys %{$self->{'slice'}->{'s'}};
233 }
234
235 =head2 swishpaths_updated
236
237 Return array with updated C<swishpath>s.
238
239 my @d = $i->swishpaths_updated;
240
241 =cut
242
243 sub swishpaths_updated {
244 my $self = shift;
245 }
246
247
248 =head2 swishpaths_deleted
249
250 Return array with deleted C<swishpath>s.
251
252 my $n = $i->swishpaths_deleted;
253
254 =cut
255
256 sub swishpaths_deleted {
257 my $self = shift;
258 }
259
260
261 =head2 slices
262
263 Return array with all slice names.
264
265 my @s = $i->slices;
266
267 =cut
268
269 sub slices {
270 my $self = shift;
271 }
272
273 =head1 Helper methods
274
275 This methods are used internally, but they might be useful.
276
277 =head2 in_slice
278
279 Takes path and return slice in which this path belongs.
280
281 my $s = $i->in_slice('path/to/document/in/index');
282
283 If there are C<slices> parametar to L<"open"> it will use
284 MD5 hash to spread documents across slices. That will produce random
285 distribution of your documents in slices, which might or might not be best
286 for your data. If you have to re-index large number of slices on each
287 run, think about creating your own C<slice> function and distributing
288 documents manually across slices.
289
290 Slice number must always be true value or various sanity checks will fail.
291
292 This function is C<Memoize>ed for performance reasons.
293
294 =cut
295
296 sub in_slice {
297 my $self = shift;
298
299 my $path = shift || confess "need path";
300
301 confess "need slice_name function" unless ref ($self->{'slice_name'});
302
303 if ($self->{'slices'}) {
304 # first, pass path through slice_name function
305 my $slice = &{$self->{'slice_name'}}($path);
306 # then calculate MD5 hash
307 $slice = md5_hex($slice);
308 # take first 8 chars to produce number
309 # FIXME how random is this?
310 $slice = hex(substr($slice,0,8));
311
312 $slice = ($slice % $self->{'slices'}) + 1;
313 print "hash: $slice / ",$self->{'slices'}," => $slice\n";
314 return $slice;
315 } else {
316 return &{$self->{'split'}}($path);
317 }
318 }
319
320 =head2 find_paths
321
322 Return array of C<swishpath>s for given C<swish-e> query.
323
324 my @p = $i->find_paths("headline=test*");
325
326 Useful for combining with L<"delete_documents"> to delete documents
327 which hasn't changed a while (so, expired).
328
329 =cut
330
331 sub find_paths {
332 my $self = shift;
333
334 }
335
336
337 =head2 make_config
338
339 Create C<swish-e> configuration file for given slice.
340
341 my $config_filename = $i->make_config('slice name');
342
343 It returns configuration filename. If no C<swish_config> was defined in
344 L<"open">, default swish-e configuration will be used. It will index all data for
345 searching, but none for properties.
346
347 If you want to see what is allready defined for swish-e in configuration
348 take a look at source code for C<DEFAULT_SWISH_CONF>.
349
350 It uses C<cat> utility to comunicate with C<swish-e>. Path is provided
351 by C<File::Which>. Do Windows users have to change that to C<COPY /B>
352 or something similar?
353
354 =cut
355
356 sub make_config {
357 my $self = shift;
358
359
360 my $index_file = $self->{'index'}."/";
361 $index_file .= shift || confess "need slice name";
362
363 my ($tmp_fh, $swish_config_filename) = mkstemp("/tmp/swishXXXXX");
364
365 # find cat on filesystem
366 my $cat = which('cat');
367
368 print $tmp_fh <<"DEFAULT_SWISH_CONF";
369 # swish-e config file
370
371 IndexDir cat
372 #SwishProgParameters -
373
374 # input file definition
375 DefaultContents XML*
376
377 # indexed metatags
378 MetaNames xml swishdocpath
379
380
381 #XMLClassAttributes type
382 UndefinedMetaTags auto
383 UndefinedXMLAttributes auto
384
385 IndexFile $index_file
386
387 # Croatian ISO-8859-2 characters to unaccented equivalents
388 TranslateCharacters ¹©ðÐèÈæƾ® ssddcccczz
389
390
391 # disable output
392 ParserWarnLevel 0
393 IndexReport 1
394
395 DEFAULT_SWISH_CONF
396
397 # add user parametars (like stored properties)
398 print $tmp_fh $self->{'swish_config'} if ($self->{'swish_config'});
399
400 close($tmp_fh);
401
402 return $swish_config_filename;
403 }
404
405 =head2 create_slice
406
407 On first run, starts C<swish-e> using L<IPC::Run>. On subsequent calls just return
408 it's handles using L<Memoize>.
409
410 my $s = create_slice('/path/to/document');
411
412 You shouldn't need to call C<create_slice> directly because it will be called
413 from L<"put_slice"> when needed.
414
415 =cut
416
417 sub create_slice {
418 my $self = shift;
419
420 my $path = shift || confess "create_slice need path!";
421
422 my $s = $self->in_slice($path) || confess "in_slice returned null";
423
424 return $s if (exists($self->{'slice'}->{$s}));
425
426 my $swish_config = $self->make_config($s);
427
428 print STDERR "creating slice $s\n"; # FIXME
429
430 my @swish = qw(swish-e -S prog -c);
431 push @swish, $swish_config;
432
433 ## Build the harness, open all pipes, and launch the subprocesses
434 $self->{'slice'}->{$s}->{'h'} = start \@swish,
435 \$self->{'slice'}->{$s}->{'in'},
436 \$self->{'slice'}->{$s}->{'out'},
437 \$self->{'slice'}->{$s}->{'err'},
438 timeout( 90 ); # FIXME
439
440 $self->{'slice'}->{$s}->{'out_len'} = 0;
441 $self->{'slice'}->{$s}->{'err_len'} = 0;
442
443 $self->slice_output($s);
444
445 return $s;
446 }
447
448 =head2 put_slice
449
450 Pass XML data to swish and receive output and errors.
451
452 my ($out,$err) = $i->put_slice('/swish/path', '<xml>data</xml>');
453
454 =cut
455
456 sub put_slice {
457 my $self = shift;
458
459 my $path = shift || confess "need path";
460 my $xml = shift || confess "need xml";
461
462 $xml = $iso2utf->convert($xml) || carp "XML conversion error in $xml";
463
464 my $s = $self->create_slice($path) || confess "create_slice returned null";
465
466 confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
467 confess "no 'in' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'in'}));
468 confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));
469
470 $self->slice_output($s);
471
472 use bytes; # as opposed to chars
473 $self->{'slice'}->{$s}->{'in'} .=
474 "Path-Name: $path\n".
475 "Content-Length: ".(length($xml)+1)."\n".
476 "Document-Type: XML\n\n$xml\n";
477
478 # do I/O
479 $self->{'slice'}->{$s}->{'h'}->pump while length $self->{'slice'}->{$s}->{'in'} ; # wait for all input to go
480
481 $self->slice_output($s);
482
483
484 $self->{'paths'}->{$path} = ADDED;
485
486 return $s;
487 }
488
489 =head2 slice_output
490
491 Prints to STDERR output and errors from C<swish-e>.
492
493 $i->slice_output($s);
494
495 Normally, you don't need to call it.
496
497 =cut
498
499 sub slice_output {
500 my $self = shift;
501
502 my $s = shift || confess "slice_output needs slice";
503
504 confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
505 confess "no 'in' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'in'}));
506 confess "no 'out' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'out'}));
507
508 if (length $self->{'slice'}->{$s}->{'out'} > $self->{'slice'}->{$s}->{'out_len'}) {
509 #print STDERR "swish-e OUT: ",$self->{'slice'}->{$s}->{'out'},"\n" if ($self->{'slice'}->{$s}->{'out'});
510 $self->{'slice'}->{$s}->{'out_len'} = length $self->{'slice'}->{$s}->{'out'};
511 return 1;
512 } elsif (length $self->{'slice'}->{$s}->{'err'} > $self->{'slice'}->{$s}->{'err_len'}) {
513 print STDERR "swish-e ERR: ",$self->{'slice'}->{$s}->{'err'},"\n" if ($self->{'slice'}->{$s}->{'err'});
514 $self->{'slice'}->{$s}->{'err_len'} = length $self->{'slice'}->{$s}->{'err'};
515 # this is fatal
516 return 0;
517 }
518
519 return 1;
520 }
521
522 =head2 close_slice
523
524 Close slice (terminates swish-e process for that slice).
525
526 my $i->close_slice($s);
527
528 Returns true if slice is closed, false otherwise.
529
530 =cut
531
532 sub close_slice {
533 my $self = shift;
534
535 my $s = shift || confess "close_slice needs slice";
536
537 confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
538 confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));
539
540 # pump rest of content (if any)
541 $self->{'slice'}->{$s}->{'h'}->pump while length $self->{'slice'}->{$s}->{'in'};
542
543 $self->slice_output($s);
544
545 # clean up
546 $self->{'slice'}->{$s}->{'h'}->finish or confess "finish on slice $s returned: $?";
547
548 delete($self->{'slice'}->{$s}) && return 1;
549 return 0;
550 }
551
552 =head2 to_xml
553
554 Convert (binary safe, I hope) your data into XML for C<swish-e>.
555 Data will not yet be recoded to UTF-8. L<"put_slice"> will do that.
556
557 my $xml = $i->to_xml({ foo => 'bar' });
558
559 This function is extracted from L<"add"> method so that you can L<Memoize> it.
560 If your data set has a lot of repeatable data, and memory is not a problem, you
561 can add C<memoize_to_xml> option to L<"open">.
562
563 =cut
564
565 my %escape = ('<'=>'&lt;', '>'=>'&gt;', '&'=>'&amp;', '"'=>'&quot;');
566 my $escape_re = join '|' => keys %escape;
567
568 sub to_xml {
569 my $self = shift;
570
571 my $data = shift || return;
572
573 my $xml = qq{<xml>};
574 foreach my $tag (keys %$data) {
575 my $content = $data->{$tag};
576 next if (! $content || $content eq '');
577 # save [cr/]lf before conversion to XML
578 # $content =~ s/\n\r/##lf##/gs;
579 # $content =~ s/\n/##lf##/gs;
580 $content =~ s/($escape_re)/$escape{$1}/gs;
581 $xml .= "<$tag><![CDATA[".$content."]]></$tag>";
582 }
583 $xml .= qq{</xml>};
584 }
585
586 1;
587 __END__
588
589
590 =head1 Searching
591
592 Searching is still conducted using L<SWISH::API>, but you have to glob
593 index names.
594
595 use SWISH::API;
596
597 my $swish = SWISH::API->new( glob('index.swish-e/*') );
598
599 You can also alternativly create merged index (using C<merge> option) and
600 not change your source code at all.
601
602 That would also benefit performance, but it increases indexing time
603 because merged indexes must be re-created on each indexing run.
604
605 =head1 EXPORT
606
607 Nothing by default.
608
609 =head1 EXAMPLES
610
611 Test script for this module uses all parts of API. It's also nice example
612 how to use C<SWISH::Split>.
613
614 =head1 SEE ALSO
615
616 L<SWISH::API>,
617 L<http://www.swish-e.org/>
618
619 =head1 AUTHOR
620
621 Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>
622
623 =head1 COPYRIGHT AND LICENSE
624
625 Copyright (C) 2004 by Dobrica Pavlinusic
626
627 This library is free software; you can redistribute it and/or modify
628 it under the same terms as Perl itself, either Perl version 5.8.4 or,
629 at your option, any later version of Perl 5 you may have available.
630
631
632 =cut

  ViewVC Help
Powered by ViewVC 1.1.26