/[SWISH-Split]/trunk/Split.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/Split.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 4 - (show annotations)
Sun Aug 8 19:22:56 2004 UTC (19 years, 7 months ago) by dpavlin
File size: 13382 byte(s)
first version which passes 51 test. It still doesn't update documents, just
insert.

1 package SWISH::Split;
2
3 use 5.008;
4 use strict;
5 use warnings;
6
7 our $VERSION = '0.00';
8
9 use SWISH::API;
10 use Text::Iconv;
11 use File::Temp qw/ :mktemp /;
12 use Carp;
13 use Digest::MD5 qw(md5_hex);
14 use Memoize;
15 use IPC::Run qw(start timeout pump finish);
16 use File::Which;
17
18 use Data::Dumper;
19
20 =head1 NAME
21
22 SWISH::Split - Perl interface to split index variant of Swish-e
23
24 =head1 SYNOPSIS
25
26 use SWISH::Split;
27
28
29 =head1 DESCRIPTION
30
31 This is alternative interface for indexing data with swish-e. It's designed
32 to split indexes over multiple files (slices) to allow updates of records in index
33 by reindexing just changed parts (slice).
34
35 Data is stored in index using intrface which is somewhat similar to
36 L<Plucene::Simple>. This could make your migration (or supporting two index
37 engines) easier.
38
39 In the background, it will fork swish-e binaries (one for each index slice)
40 and produce UTF-8 encoded XML files for it. So, if your imput charset isn't
41 C<ISO-8859-1> you will have to specify it.
42
43 =head1 Methods used for indexing
44
45 =head2 open
46
47 Create new object for index.
48
49 my $i = SWISH::Split->open({
50 index => '/path/to/index',
51 slice_name => \&slice_on_path,
52 slices => 30,
53 merge => 0,
54 codepage => 'ISO-8859-2',
55 swish_config => qq{
56 PropertyNames from date
57 PropertyNamesDate date
58 },
59 memoize_to_xml => 0,
60 );
61
62 # split index on first component of path
63 sub slice_on_path {
64 return shift split(/\//,$_[0]);
65 }
66
67 Options to open are following:
68
69 =over 5
70
71 =item C<index>
72
73 path to (existing) directory in which index slices will be created.
74
75 =item C<slice_name>
76
77 coderef to function which provide slicing from path.
78
79 =item C<slices>
80
81 maximum number of index slices. See L<"in_slice"> for
82 more explanation.
83
84 =item C<merge>
85
86 (planned) option to merge indexes into one at end.
87
88 =item C<codepage>
89
90 data codepage (needed for conversion to UTF-8).
91 By default, it's C<ISO-8859-1>.
92
93 =item C<swish_config>
94
95 additional parametars which will be inserted into
96 C<swish-e> configuration file. See L<swish-config>.
97
98 =item C<memoize_to_xml>
99
100 speed up repeatable data, see L<"to_xml">.
101
102 =back
103
104 =cut
105
106 my $iso2utf = Text::Iconv->new('ISO-8859-1','UTF-8');
107
108 sub open {
109 my $class = shift;
110 my $self = {@_};
111 bless($self, $class);
112
113 croak "need slice_name coderef" unless ref $self->{'slice_name'};
114 croak "need slices" unless $self->{'slices'};
115
116 croak "need index" unless $self->{'index'};
117 croak "index '",$self->{'index'},"' doesn't exist" unless -e $self->{'index'};
118 croak "index '",$self->{'index'},"' is not directory" unless -d $self->{'index'};
119
120 $iso2utf = Text::Iconv->new($self->{'codepage'},'UTF-8') if ($self->{'codepage'});
121
122 # speedup
123 memoize('in_slice');
124 memoize('to_xml') if ($self->{'memoize_to_xml'});
125
126 $self ? return $self : return undef;
127
128 }
129
130 =head2 add
131
132 Add document to index.
133
134 $i->add($swishpath, {
135 headline => 'foobar result',
136 property => 'data',
137 })
138
139 =cut
140
141 sub add {
142 my $self = shift;
143
144 my $swishpath = shift || return;
145 my $data = shift || return;
146
147 my ($out,$err) = $self->put_slice($swishpath, $self->to_xml($data));
148
149 if ($err) {
150 carp "$swishpath: $err";
151 return 0;
152 }
153
154 return 1;
155 }
156
157 =head2 delete
158
159 Delete documents from index.
160
161 $i->delete(@swishpath);
162
163 =cut
164
165 sub delete {
166 my $self = shift;
167
168 my @paths = @_ || return;
169
170 return 42;
171 }
172
173
174 =head2 finish
175
176 Finish indexing and close index file(s).
177
178 $i->finish;
179
180 This is most time-consuming operation. When it's called, it will re-index
181 all entries which haven't changed in all slices.
182
183 Returns number of slices updated.
184
185 =cut
186
187 sub finish {
188 my $self = shift;
189
190 my $ret = 0;
191
192 foreach my $s (keys %{$self->{'slice'}}) {
193 $ret += $self->close_slice($s);
194 }
195
196 return $ret;
197 }
198
199
200
201 =head1 Reporting methods
202
203 This methods return statistics about your index.
204
205 =head2 swishpaths
206
207 Return array of C<swishpath>s in index.
208
209 my @p = $i->swishpaths;
210
211 =cut
212
213 sub swishpaths {
214 my $self = shift;
215 }
216
217 =head2 swishpaths_updated
218
219 Return array with updated C<swishpath>s.
220
221 my @d = $i->swishpaths_updated;
222
223 =cut
224
225 sub swishpaths_updated {
226 my $self = shift;
227 }
228
229
230 =head2 swishpaths_deleted
231
232 Return array with deleted C<swishpath>s.
233
234 my $n = $i->swishpaths_deleted;
235
236 =cut
237
238 sub swishpaths_deleted {
239 my $self = shift;
240 }
241
242
243 =head2 slices
244
245 Return array with all slice names.
246
247 my @s = $i->slices;
248
249 =cut
250
251 sub slices {
252 my $self = shift;
253 }
254
255 =head1 Helper methods
256
257 This methods are used internally, but they might be useful.
258
259 =head2 in_slice
260
261 Takes path and return slice in which this path belongs.
262
263 my $s = $i->in_slice('path/to/document/in/index');
264
265 If there are C<slices> parametar to L<"open"> it will use
266 MD5 hash to spread documents across slices. That will produce random
267 distribution of your documents in slices, which might or might not be best
268 for your data. If you have to re-index large number of slices on each
269 run, think about creating your own C<slice> function and distributing
270 documents manually across slices.
271
272 Slice number must always be true value or various sanity checks will fail.
273
274 This function is C<Memoize>ed for performance reasons.
275
276 =cut
277
278 sub in_slice {
279 my $self = shift;
280
281 my $path = shift || confess "need path";
282
283 confess "need slice_name function" unless ref ($self->{'slice_name'});
284
285 if ($self->{'slices'}) {
286 # first, pass path through slice_name function
287 my $slice = &{$self->{'slice_name'}}($path);
288 # then calculate MD5 hash
289 $slice = md5_hex($slice);
290 # take first 8 chars to produce number
291 # FIXME how random is this?
292 $slice = hex(substr($slice,0,8));
293
294 $slice = ($slice % $self->{'slices'}) + 1;
295 print "hash: $slice / ",$self->{'slices'}," => $slice\n";
296 return $slice;
297 } else {
298 return &{$self->{'split'}}($path);
299 }
300 }
301
302 =head2 find_paths
303
304 Return array of C<swishpath>s for given C<swish-e> query.
305
306 my @p = $i->find_paths("headline=test*");
307
308 Useful for combining with L<"delete_documents"> to delete documents
309 which hasn't changed a while (so, expired).
310
311 =cut
312
313 sub find_paths {
314 my $self = shift;
315
316 my $s = shift || return;
317 }
318
319
320 =head2 make_config
321
322 Create C<swish-e> configuration file for given slice.
323
324 my $config_filename = $i->make_config('slice name');
325
326 It returns configuration filename. If no C<swish_config> was defined in
327 L<"open">, default swish-e configuration will be used. It will index all data for
328 searching, but none for properties.
329
330 If you want to see what is allready defined for swish-e in configuration
331 take a look at source code for C<DEFAULT_SWISH_CONF>.
332
333 It uses C<cat> utility to comunicate with C<swish-e>. Path is provided
334 by C<File::Which>. Do Windows users have to change that to C<COPY /B>
335 or something similar?
336
337 =cut
338
339 sub make_config {
340 my $self = shift;
341
342
343 my $index_file = $self->{'index'}."/";
344 $index_file .= shift || confess "need slice name";
345
346 my ($tmp_fh, $swish_config_filename) = mkstemp("/tmp/swishXXXXX");
347
348 # find cat on filesystem
349 my $cat = which('cat');
350
351 print $tmp_fh <<"DEFAULT_SWISH_CONF";
352 # swish-e config file
353
354 IndexDir cat
355 #SwishProgParameters -
356
357 # input file definition
358 DefaultContents XML*
359
360 # indexed metatags
361 MetaNames xml swishdocpath
362
363
364 #XMLClassAttributes type
365 UndefinedMetaTags auto
366 UndefinedXMLAttributes auto
367
368 IndexFile $index_file
369
370 # Croatian ISO-8859-2 characters to unaccented equivalents
371 TranslateCharacters ¹©ðÐèÈæƾ® ssddcccczz
372
373
374 # disable output
375 ParserWarnLevel 0
376 IndexReport 1
377
378 DEFAULT_SWISH_CONF
379
380 # add user parametars (like stored properties)
381 print $tmp_fh $self->{'swish_config'} if ($self->{'swish_config'});
382
383 close($tmp_fh);
384
385 return $swish_config_filename;
386 }
387
388 =head2 create_slice
389
390 On first run, starts C<swish-e> using L<IPC::Run>. On subsequent calls just return
391 it's handles using L<Memoize>.
392
393 my $s = create_slice('/path/to/document');
394
395 You shouldn't need to call C<create_slice> directly because it will be called
396 from L<"put_slice"> when needed.
397
398 =cut
399
400 sub create_slice {
401 my $self = shift;
402
403 my $path = shift || confess "create_slice need path!";
404
405 my $s = $self->in_slice($path) || confess "in_slice returned null";
406
407 return $s if (exists($self->{'slice'}->{$s}));
408
409 my $swish_config = $self->make_config($s);
410
411 print STDERR "creating slice $s\n"; # FIXME
412
413 my @swish = qw(swish-e -S prog -c);
414 push @swish, $swish_config;
415
416 ## Build the harness, open all pipes, and launch the subprocesses
417 $self->{'slice'}->{$s}->{'h'} = start \@swish,
418 \$self->{'slice'}->{$s}->{'in'},
419 \$self->{'slice'}->{$s}->{'out'},
420 \$self->{'slice'}->{$s}->{'err'},
421 timeout( 90 ); # FIXME
422
423 $self->{'slice'}->{$s}->{'out_len'} = 0;
424 $self->{'slice'}->{$s}->{'err_len'} = 0;
425
426 $self->slice_output($s);
427
428 return $s;
429 }
430
431 =head2 put_slice
432
433 Pass XML data to swish and receive output and errors.
434
435 my ($out,$err) = $i->put_slice('/swish/path', '<xml>data</xml>');
436
437 =cut
438
439 sub put_slice {
440 my $self = shift;
441
442 my $path = shift || confess "need path";
443 my $xml = shift || confess "need xml";
444
445 $xml = $iso2utf->convert($xml) || carp "XML conversion error in $xml";
446
447 my $s = $self->create_slice($path) || confess "create_slice returned null";
448
449 confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
450 confess "no 'in' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'in'}));
451 confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));
452
453 $self->slice_output($s);
454
455 use bytes; # as opposed to chars
456 $self->{'slice'}->{$s}->{'in'} .=
457 "Path-Name: $path\n".
458 "Content-Length: ".(length($xml)+1)."\n".
459 "Document-Type: XML\n\n$xml\n";
460
461 # do I/O
462 $self->{'slice'}->{$s}->{'h'}->pump while length $self->{'slice'}->{$s}->{'in'} ; # wait for all input to go
463
464 $self->slice_output($s);
465
466 return $s;
467 }
468
469 =head2 slice_output
470
471 Prints to STDERR output and errors from C<swish-e>.
472
473 $i->slice_output($s);
474
475 Normally, you don't need to call it.
476
477 =cut
478
479 sub slice_output {
480 my $self = shift;
481
482 my $s = shift || confess "slice_output needs slice";
483
484 confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
485 confess "no 'in' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'in'}));
486 confess "no 'out' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'out'}));
487
488 if (length $self->{'slice'}->{$s}->{'out'} > $self->{'slice'}->{$s}->{'out_len'}) {
489 #print STDERR "swish-e OUT: ",$self->{'slice'}->{$s}->{'out'},"\n" if ($self->{'slice'}->{$s}->{'out'});
490 $self->{'slice'}->{$s}->{'out_len'} = length $self->{'slice'}->{$s}->{'out'};
491 return 1;
492 } elsif (length $self->{'slice'}->{$s}->{'err'} > $self->{'slice'}->{$s}->{'err_len'}) {
493 print STDERR "swish-e ERR: ",$self->{'slice'}->{$s}->{'err'},"\n" if ($self->{'slice'}->{$s}->{'err'});
494 $self->{'slice'}->{$s}->{'err_len'} = length $self->{'slice'}->{$s}->{'err'};
495 # this is fatal
496 return 0;
497 }
498
499 return 1;
500 }
501
502 =head2 close_slice {
503
504 Close slice (terminates swish-e process for that slice).
505
506 my $i->close_slice($s);
507
508 Returns true if slice is closed, false otherwise.
509
510 =cut
511
512 sub close_slice {
513 my $self = shift;
514
515 my $s = shift || confess "close_slice needs slice";
516
517 confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
518 confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));
519
520 # pump rest of content (if any)
521 $self->{'slice'}->{$s}->{'h'}->pump while length $self->{'slice'}->{$s}->{'in'};
522
523 $self->slice_output($s);
524
525 # clean up
526 $self->{'slice'}->{$s}->{'h'}->finish or confess "finish on slice $s returned: $?";
527
528 delete($self->{'slice'}->{$s}) && return 1;
529 return 0;
530 }
531
532 =head2 to_xml
533
534 Convert (binary safe, I hope) your data into XML for C<swish-e>.
535 Data will not yet be recoded to UTF-8. L<"put_slice"> will do that.
536
537 my $xml = $i->to_xml({ foo => 'bar' });
538
539 This function is extracted from L<"add"> method so that you can L<Memoize> it.
540 If your data set has a lot of repeatable data, and memory is not a problem, you
541 can add C<memoize_to_xml> option to L<"open">.
542
543 =cut
544
545 my %escape = ('<'=>'&lt;', '>'=>'&gt;', '&'=>'&amp;', '"'=>'&quot;');
546 my $escape_re = join '|' => keys %escape;
547
548 sub to_xml {
549 my $self = shift;
550
551 my $data = shift || return;
552
553 my $xml = qq{<xml>};
554 foreach my $tag (keys %$data) {
555 my $content = $data->{$tag};
556 next if (! $content || $content eq '');
557 # save [cr/]lf before conversion to XML
558 # $content =~ s/\n\r/##lf##/gs;
559 # $content =~ s/\n/##lf##/gs;
560 $content =~ s/($escape_re)/$escape{$1}/gs;
561 $xml .= "<$tag><![CDATA[".$content."]]></$tag>";
562 }
563 $xml .= qq{</xml>};
564 }
565
566 1;
567 __END__
568
569
570 =head2 Searching
571
572 Searching is still conducted using L<SWISH::API>, but you have to glob
573 index names.
574
575 use SWISH::API;
576
577 my $swish = SWISH::API->new( glob('index.swish-e/*') );
578
579 You can also alternativly create merged index (using C<merge> option) and
580 not change your source code at all.
581
582 That would also benefit performance, but it increases indexing time
583 because merged indexes must be re-created on each indexing run.
584
585 =head2 EXPORT
586
587 Nothing by default.
588
589 =head2 EXAMPLES
590
591 Test script for this module uses all parts of API. It's also nice example
592 how to use C<SWISH::Split>.
593
594 =head1 SEE ALSO
595
596 L<SWISH::API>,
597 L<http://www.swish-e.org/>
598
599 =head1 AUTHOR
600
601 Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>
602
603 =head1 COPYRIGHT AND LICENSE
604
605 Copyright (C) 2004 by Dobrica Pavlinusic
606
607 This library is free software; you can redistribute it and/or modify
608 it under the same terms as Perl itself, either Perl version 5.8.4 or,
609 at your option, any later version of Perl 5 you may have available.
610
611
612 =cut

  ViewVC Help
Powered by ViewVC 1.1.26