/[SWISH-Split]/trunk/Split.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/Split.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 7 - (show annotations)
Fri Dec 17 18:32:34 2004 UTC (19 years, 4 months ago) by dpavlin
File size: 13727 byte(s)
a lot of changes:
- better testing framework
- changed put_slice API (to actually confirm with documentation)
- use swish-e stdin instead of external cat utility
- added tags target

1 package SWISH::Split;
2
3 use 5.008;
4 use strict;
5 use warnings;
6
7 our $VERSION = '0.01';
8
9 use SWISH::API;
10 use Text::Iconv;
11 use File::Temp qw/ :mktemp /;
12 use Carp;
13 use Digest::MD5 qw(md5_hex);
14 use Memoize;
15 use IPC::Run qw(start timeout pump finish);
16 use File::Which;
17
18 use Data::Dumper;
19
20 use constant {
21 ADDED => 1,
22 DELETED => 2,
23 };
24
25 =head1 NAME
26
27 SWISH::Split - Perl interface to split index variant of Swish-e
28
29 =head1 SYNOPSIS
30
31 use SWISH::Split;
32
33
34 =head1 DESCRIPTION
35
36 This is alternative interface for indexing data with swish-e. It's designed
37 to split indexes over multiple files (slices) to allow updates of records in index
38 by reindexing just changed parts (slice).
39
40 Data is stored in index using intrface which is somewhat similar to
41 L<Plucene::Simple>. This could make your migration (or supporting two index
42 engines) easier.
43
44 In the background, it will fork swish-e binaries (one for each index slice)
45 and produce UTF-8 encoded XML files for it. So, if your input charset isn't
46 C<ISO-8859-1> you will have to specify it.
47
48 =head1 Methods used for indexing
49
50 =head2 open
51
52 Create new object for index.
53
54 my $i = SWISH::Split->open({
55 index => '/path/to/index',
56 slice_name => \&slice_on_path,
57 slices => 30,
58 merge => 0,
59 codepage => 'ISO-8859-2',
60 swish_config => qq{
61 PropertyNames from date
62 PropertyNamesDate date
63 },
64 memoize_to_xml => 0,
65 );
66
67 # split index on first component of path
68 sub slice_on_path {
69 return shift split(/\//,$_[0]);
70 }
71
72 Options to open are following:
73
74 =over 5
75
76 =item C<index>
77
78 path to (existing) directory in which index slices will be created.
79
80 =item C<slice_name>
81
82 coderef to function which provide slicing from path.
83
84 =item C<slices>
85
86 maximum number of index slices. See L<"in_slice"> for
87 more explanation.
88
89 =item C<merge>
90
91 (planned) option to merge indexes into one at end.
92
93 =item C<codepage>
94
95 data codepage (needed for conversion to UTF-8).
96 By default, it's C<ISO-8859-1>.
97
98 =item C<swish_config>
99
100 additional parametars which will be inserted into
101 C<swish-e> configuration file. See L<swish-config>.
102
103 =item C<memoize_to_xml>
104
105 speed up repeatable data, see L<"to_xml">.
106
107 =back
108
109 =cut
110
111 my $iso2utf = Text::Iconv->new('ISO-8859-1','UTF-8');
112
113 sub open {
114 my $class = shift;
115 my $self = {@_};
116 bless($self, $class);
117
118 croak "need slice_name coderef" unless ref $self->{'slice_name'};
119 croak "need slices" unless $self->{'slices'};
120
121 croak "need index" unless $self->{'index'};
122 croak "index '",$self->{'index'},"' doesn't exist" unless -e $self->{'index'};
123 croak "index '",$self->{'index'},"' is not directory" unless -d $self->{'index'};
124
125 $iso2utf = Text::Iconv->new($self->{'codepage'},'UTF-8') if ($self->{'codepage'});
126
127 # speedup
128 memoize('in_slice');
129 memoize('to_xml') if ($self->{'memoize_to_xml'});
130
131 $self ? return $self : return undef;
132
133 }
134
135 =head2 add
136
137 Add document to index.
138
139 $i->add($swishpath, {
140 headline => 'foobar result',
141 property => 'data',
142 })
143
144 =cut
145
146 sub add {
147 my $self = shift;
148
149 my $swishpath = shift || return;
150 my $data = shift || return;
151
152 my $slice = $self->put_slice($swishpath, $self->to_xml($data));
153
154 # if ($err) {
155 # carp "$swishpath: $err";
156 # return undef;
157 # }
158
159 return $slice;
160 }
161
162 =head2 delete
163
164 Delete documents from index.
165
166 $i->delete(@swishpath);
167
168 =cut
169
170 sub delete {
171 my $self = shift;
172
173 my @paths = @_ || return;
174
175 foreach my $path (@paths) {
176 $self->{'paths'}->{$path} = DELETED;
177 }
178
179 return 42;
180 }
181
182
183 =head2 done
184
185 Finish indexing and close index file(s).
186
187 $i->done;
188
189 This is most time-consuming operation. When it's called, it will re-index
190 all entries which haven't changed in all slices.
191
192 Returns number of slices updated.
193
194 This method should really be called close or finish, but both of those are
195 allready used.
196
197 =cut
198
199 sub done {
200 my $self = shift;
201
202 my $ret = 0;
203
204 foreach my $s (keys %{$self->{'slice'}}) {
205 print STDERR "closing slice $s\n";
206 $ret += $self->close_slice($s);
207 }
208
209 return $ret;
210 }
211
212
213
214 =head1 Reporting methods
215
216 This methods return statistics about your index.
217
218 =head2 swishpaths
219
220 Return array of C<swishpath>s in index.
221
222 my @p = $i->swishpaths;
223
224 =cut
225
226 sub swishpaths {
227 my $self = shift;
228
229 my $s = shift || return;
230 return if (! exists($self->{'slice'}->{'s'}));
231
232 return keys %{$self->{'slice'}->{'s'}};
233 }
234
235 =head2 swishpaths_updated
236
237 Return array with updated C<swishpath>s.
238
239 my @d = $i->swishpaths_updated;
240
241 =cut
242
243 sub swishpaths_updated {
244 my $self = shift;
245 }
246
247
248 =head2 swishpaths_deleted
249
250 Return array with deleted C<swishpath>s.
251
252 my $n = $i->swishpaths_deleted;
253
254 =cut
255
256 sub swishpaths_deleted {
257 my $self = shift;
258 }
259
260
261 =head2 slices
262
263 Return array with all slice names.
264
265 my @s = $i->slices;
266
267 =cut
268
269 sub slices {
270 my $self = shift;
271 }
272
273 =head1 Helper methods
274
275 This methods are used internally, but they might be useful.
276
277 =head2 in_slice
278
279 Takes path and return slice in which this path belongs.
280
281 my $s = $i->in_slice('path/to/document/in/index');
282
283 If there are C<slices> parametar to L<"open"> it will use
284 MD5 hash to spread documents across slices. That will produce random
285 distribution of your documents in slices, which might or might not be best
286 for your data. If you have to re-index large number of slices on each
287 run, think about creating your own C<slice> function and distributing
288 documents manually across slices.
289
290 Slice number must always be true value or various sanity checks will fail.
291
292 This function is C<Memoize>ed for performance reasons.
293
294 =cut
295
296 sub in_slice {
297 my $self = shift;
298
299 my $path = shift || confess "need path";
300
301 confess "need slice_name function" unless ref ($self->{'slice_name'});
302
303 if ($self->{'slices'}) {
304 # first, pass path through slice_name function
305 my $slice = &{$self->{'slice_name'}}($path);
306 # then calculate MD5 hash
307 $slice = md5_hex($slice);
308 # take first 8 chars to produce number
309 # FIXME how random is this?
310 $slice = hex(substr($slice,0,8));
311
312 $slice = ($slice % $self->{'slices'}) + 1;
313 print "hash: $slice / ",$self->{'slices'}," => $slice\n";
314 return $slice;
315 } else {
316 return &{$self->{'split'}}($path);
317 }
318 }
319
320 =head2 find_paths
321
322 Return array of C<swishpath>s for given C<swish-e> query.
323
324 my @p = $i->find_paths("headline=test*");
325
326 Useful for combining with L<"delete_documents"> to delete documents
327 which hasn't changed a while (so, expired).
328
329 =cut
330
331 sub find_paths {
332 my $self = shift;
333
334 }
335
336
337 =head2 make_config
338
339 Create C<swish-e> configuration file for given slice.
340
341 my $config_filename = $i->make_config('slice name');
342
343 It returns configuration filename. If no C<swish_config> was defined in
344 L<"open">, default swish-e configuration will be used. It will index all data for
345 searching, but none for properties.
346
347 If you want to see what is allready defined for swish-e in configuration
348 take a look at source code for C<DEFAULT_SWISH_CONF>.
349
350 It uses C<stdin> as C<IndexDir> to comunicate with C<swish-e>.
351
352 =cut
353
354 sub make_config {
355 my $self = shift;
356
357
358 my $index_file = $self->{'index'}."/";
359 $index_file .= shift || confess "need slice name";
360
361 my ($tmp_fh, $swish_config_filename) = mkstemp("/tmp/swishXXXXX");
362
363 # find cat on filesystem
364 my $cat = which('cat');
365
366 print $tmp_fh <<"DEFAULT_SWISH_CONF";
367 # swish-e config file
368
369 IndexDir stdin
370
371 # input file definition
372 DefaultContents XML*
373
374 # indexed metatags
375 MetaNames xml swishdocpath
376
377
378 #XMLClassAttributes type
379 UndefinedMetaTags auto
380 UndefinedXMLAttributes auto
381
382 IndexFile $index_file
383
384 # Croatian ISO-8859-2 characters to unaccented equivalents
385 TranslateCharacters ¹©ðÐèÈæƾ® ssddcccczz
386
387
388 # disable output
389 ParserWarnLevel 0
390 IndexReport 1
391
392 DEFAULT_SWISH_CONF
393
394 # add user parametars (like stored properties)
395 print $tmp_fh $self->{'swish_config'} if ($self->{'swish_config'});
396
397 close($tmp_fh);
398
399 return $swish_config_filename;
400 }
401
402 =head2 create_slice
403
404 On first run, starts C<swish-e> using L<IPC::Run>. On subsequent calls just return
405 it's handles using L<Memoize>.
406
407 my $s = create_slice('/path/to/document');
408
409 You shouldn't need to call C<create_slice> directly because it will be called
410 from L<"put_slice"> when needed.
411
412 =cut
413
414 sub create_slice {
415 my $self = shift;
416
417 my $path = shift || confess "create_slice need path!";
418
419 my $s = $self->in_slice($path) || confess "in_slice returned null";
420
421 return $s if (exists($self->{'slice'}->{$s}));
422
423 my $swish_config = $self->make_config($s);
424
425 print STDERR "creating slice $s\n"; # FIXME
426
427 my @swish = qw(swish-e -u -S prog -c);
428 push @swish, $swish_config;
429
430 ## Build the harness, open all pipes, and launch the subprocesses
431 $self->{'slice'}->{$s}->{'h'} = start \@swish,
432 \$self->{'slice'}->{$s}->{'in'},
433 \$self->{'slice'}->{$s}->{'out'},
434 \$self->{'slice'}->{$s}->{'err'},
435 timeout( 90 ); # FIXME
436
437 $self->{'slice'}->{$s}->{'out_len'} = 0;
438 $self->{'slice'}->{$s}->{'err_len'} = 0;
439
440 $self->slice_output($s);
441
442 return $s;
443 }
444
445 =head2 put_slice
446
447 Pass XML data to swish.
448
449 my $slice = $i->put_slice('/swish/path', '<xml>data</xml>');
450
451 Returns slice in which XML ended up.
452
453 =cut
454
455 sub put_slice {
456 my $self = shift;
457
458 my $path = shift || confess "need path";
459 my $xml = shift || confess "need xml";
460
461 $xml = $iso2utf->convert($xml) || carp "XML conversion error in $xml";
462
463 my $s = $self->create_slice($path) || confess "create_slice returned null";
464
465 confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
466 confess "no 'in' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'in'}));
467 confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));
468
469 $self->slice_output($s);
470
471 use bytes; # as opposed to chars
472 $self->{'slice'}->{$s}->{'in'} .=
473 "Path-Name: $path\n".
474 "Content-Length: ".(length($xml)+1)."\n".
475 "Update-Mode: Index\n".
476 "Document-Type: XML\n\n$xml\n";
477
478 # do I/O
479 $self->{'slice'}->{$s}->{'h'}->pump while length $self->{'slice'}->{$s}->{'in'} ; # wait for all input to go
480
481 $self->slice_output($s);
482
483 $self->{'paths'}->{$path} = ADDED;
484
485 return $s;
486 }
487
488 =head2 slice_output
489
490 Prints to STDERR output and errors from C<swish-e>.
491
492 my $slice = $i->slice_output($s);
493
494 Normally, you don't need to call it.
495
496 =cut
497
498 sub slice_output {
499 my $self = shift;
500
501 my $s = shift || confess "slice_output needs slice";
502
503 confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
504 confess "no 'in' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'in'}));
505 confess "no 'out' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'out'}));
506
507 if (length $self->{'slice'}->{$s}->{'out'} > $self->{'slice'}->{$s}->{'out_len'}) {
508 #print STDERR "swish-e OUT: ",$self->{'slice'}->{$s}->{'out'},"\n" if ($self->{'slice'}->{$s}->{'out'});
509 $self->{'slice'}->{$s}->{'out_len'} = length $self->{'slice'}->{$s}->{'out'};
510 return $s;
511 } elsif (length $self->{'slice'}->{$s}->{'err'} > $self->{'slice'}->{$s}->{'err_len'}) {
512 print STDERR "swish-e ERR: ",$self->{'slice'}->{$s}->{'err'},"\n" if ($self->{'slice'}->{$s}->{'err'});
513 $self->{'slice'}->{$s}->{'err_len'} = length $self->{'slice'}->{$s}->{'err'};
514 # this is fatal
515 return undef;
516 }
517
518 return $s;
519 }
520
521 =head2 close_slice
522
523 Close slice (terminates swish-e process for that slice).
524
525 my $i->close_slice($s);
526
527 Returns true if slice is closed, false otherwise.
528
529 =cut
530
531 sub close_slice {
532 my $self = shift;
533
534 my $s = shift || confess "close_slice needs slice";
535
536 confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
537 confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));
538
539 # pump rest of content (if any)
540 $self->{'slice'}->{$s}->{'h'}->pump while length $self->{'slice'}->{$s}->{'in'};
541
542 $self->slice_output($s);
543
544 # clean up
545 $self->{'slice'}->{$s}->{'h'}->finish or confess "finish on slice $s returned $?: $! -- ",$self->{'slice'}->{$s}->{'err'};
546
547 delete($self->{'slice'}->{$s}) && return 1;
548 return 0;
549 }
550
551 =head2 to_xml
552
553 Convert (binary safe, I hope) your data into XML for C<swish-e>.
554 Data will not yet be recoded to UTF-8. L<"put_slice"> will do that.
555
556 my $xml = $i->to_xml({ foo => 'bar' });
557
558 This function is extracted from L<"add"> method so that you can L<Memoize> it.
559 If your data set has a lot of repeatable data, and memory is not a problem, you
560 can add C<memoize_to_xml> option to L<"open">.
561
562 =cut
563
564 my %escape = ('<'=>'&lt;', '>'=>'&gt;', '&'=>'&amp;', '"'=>'&quot;');
565 my $escape_re = join '|' => keys %escape;
566
567 sub to_xml {
568 my $self = shift;
569
570 my $data = shift || return;
571
572 my $xml = qq{<xml>};
573 foreach my $tag (keys %$data) {
574 my $content = $data->{$tag};
575 next if (! $content || $content eq '');
576 # save [cr/]lf before conversion to XML
577 # $content =~ s/\n\r/##lf##/gs;
578 # $content =~ s/\n/##lf##/gs;
579 $content =~ s/($escape_re)/$escape{$1}/gs;
580 $xml .= "<$tag><![CDATA[".$content."]]></$tag>";
581 }
582 $xml .= qq{</xml>};
583 }
584
585 1;
586 __END__
587
588
589 =head1 Searching
590
591 Searching is still conducted using L<SWISH::API>, but you have to glob
592 index names.
593
594 use SWISH::API;
595
596 my $swish = SWISH::API->new( glob('index.swish-e/*') );
597
598 You can also alternativly create merged index (using C<merge> option) and
599 not change your source code at all.
600
601 That would also benefit performance, but it increases indexing time
602 because merged indexes must be re-created on each indexing run.
603
604 =head1 EXPORT
605
606 Nothing by default.
607
608 =head1 EXAMPLES
609
610 Test script for this module uses all parts of API. It's also nice example
611 how to use C<SWISH::Split>.
612
613 =head1 SEE ALSO
614
615 L<SWISH::API>,
616 L<http://www.swish-e.org/>
617
618 =head1 AUTHOR
619
620 Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>
621
622 =head1 COPYRIGHT AND LICENSE
623
624 Copyright (C) 2004 by Dobrica Pavlinusic
625
626 This library is free software; you can redistribute it and/or modify
627 it under the same terms as Perl itself, either Perl version 5.8.4 or,
628 at your option, any later version of Perl 5 you may have available.
629
630
631 =cut

  ViewVC Help
Powered by ViewVC 1.1.26