/[SWISH-Split]/trunk/Split.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/Split.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 8 - (show annotations)
Sun Dec 19 03:06:01 2004 UTC (19 years, 3 months ago) by dpavlin
File size: 12778 byte(s)
new api:
- renamed open to open_index
- removed dependency on IPC::Run
- tests which all pass

1 package SWISH::Split;
2
3 use 5.008;
4 use strict;
5 use warnings;
6
7 our $VERSION = '0.02';
8
9 use SWISH::API;
10 use Text::Iconv;
11 use File::Temp qw/ :mktemp /;
12 use Carp;
13 use Digest::MD5 qw(md5_hex);
14 use Memoize;
15 use File::Which;
16
17 use Data::Dumper;
18
19 use constant {
20 ADDED => 1,
21 DELETED => 2,
22 };
23
24 =head1 NAME
25
26 SWISH::Split - Perl interface to split index variant of Swish-e
27
28 =head1 SYNOPSIS
29
30 use SWISH::Split;
31
32
33 =head1 DESCRIPTION
34
35 This is alternative interface for indexing data with swish-e. It's designed
36 to split indexes over multiple files (slices) to allow updates of records in index
37 by reindexing just changed parts (slice).
38
39 Data is stored in index using intrface which is somewhat similar to
40 L<Plucene::Simple>. This could make your migration (or supporting two index
41 engines) easier.
42
43 In the background, it will fork swish-e binaries (one for each index slice)
44 and produce UTF-8 encoded XML files for it. So, if your input charset isn't
45 C<ISO-8859-1> you will have to specify it.
46
47 =head1 Methods used for indexing
48
49 =head2 open_index
50
51 Create new object for index.
52
53 my $i = SWISH::Split->open_index({
54 index => '/path/to/index',
55 slice_name => \&slice_on_path,
56 slices => 30,
57 merge => 0,
58 codepage => 'ISO-8859-2',
59 swish_config => qq{
60 PropertyNames from date
61 PropertyNamesDate date
62 },
63 memoize_to_xml => 0,
64 );
65
66 # split index on first component of path
67 sub slice_on_path {
68 return shift split(/\//,$_[0]);
69 }
70
71 Options to C<open_index> are following:
72
73 =over 5
74
75 =item C<index>
76
77 path to (existing) directory in which index slices will be created.
78
79 =item C<slice_name>
80
81 coderef to function which provide slicing from path.
82
83 =item C<slices>
84
85 maximum number of index slices. See L<"in_slice"> for
86 more explanation.
87
88 =item C<merge>
89
90 (planned) option to merge indexes into one at end.
91
92 =item C<codepage>
93
94 data codepage (needed for conversion to UTF-8).
95 By default, it's C<ISO-8859-1>.
96
97 =item C<swish_config>
98
99 additional parametars which will be inserted into
100 C<swish-e> configuration file. See L<swish-config>.
101
102 =item C<memoize_to_xml>
103
104 speed up repeatable data, see L<"to_xml">.
105
106 =back
107
108 =cut
109
110 my $iso2utf = Text::Iconv->new('ISO-8859-1','UTF-8');
111
112 sub open_index {
113 my $class = shift;
114 my $self = {@_};
115 bless($self, $class);
116
117 croak "need slice_name coderef" unless ref $self->{'slice_name'};
118 croak "need slices" unless $self->{'slices'};
119
120 croak "need index" unless $self->{'index'};
121 croak "index '",$self->{'index'},"' doesn't exist" unless -e $self->{'index'};
122 croak "index '",$self->{'index'},"' is not directory" unless -d $self->{'index'};
123
124 $iso2utf = Text::Iconv->new($self->{'codepage'},'UTF-8') if ($self->{'codepage'});
125
126 # speedup
127 memoize('in_slice');
128 memoize('to_xml') if ($self->{'memoize_to_xml'});
129
130 $self ? return $self : return undef;
131
132 }
133
134 =head2 add
135
136 Add document to index.
137
138 $i->add($swishpath, {
139 headline => 'foobar result',
140 property => 'data',
141 })
142
143 =cut
144
145 sub add {
146 my $self = shift;
147
148 my $swishpath = shift || return;
149 my $data = shift || return;
150
151 my $slice = $self->put_slice($swishpath, $self->to_xml($data));
152
153 return $slice;
154 }
155
156 =head2 delete
157
158 Delete documents from index.
159
160 $i->delete(@swishpath);
161
162 =cut
163
164 sub delete {
165 my $self = shift;
166
167 my @paths = @_ || return;
168
169 foreach my $path (@paths) {
170 $self->{'paths'}->{$path} = DELETED;
171 }
172
173 return 42;
174 }
175
176
177 =head2 done
178
179 Finish indexing and close index file(s).
180
181 $i->done;
182
183 This is most time-consuming operation. When it's called, it will re-index
184 all entries which haven't changed in all slices.
185
186 Returns number of slices updated.
187
188 This method should really be called close or finish, but both of those are
189 allready used.
190
191 =cut
192
193 sub done {
194 my $self = shift;
195
196 my $ret = 0;
197
198 foreach my $s (keys %{$self->{'slice'}}) {
199 $self->_debug("closing slice $s");
200 $ret += $self->close_slice($s);
201 }
202
203 return $ret;
204 }
205
206
207
208 =head1 Reporting methods
209
210 This methods return statistics about your index.
211
212 =head2 swishpaths
213
214 Return array of C<swishpath>s in index.
215
216 my @p = $i->swishpaths;
217
218 =cut
219
220 sub swishpaths {
221 my $self = shift;
222
223 my $s = shift || return;
224 return if (! exists($self->{'slice'}->{'s'}));
225
226 return keys %{$self->{'slice'}->{'s'}};
227 }
228
229 =head2 swishpaths_updated
230
231 Return array with updated C<swishpath>s.
232
233 my @d = $i->swishpaths_updated;
234
235 =cut
236
237 sub swishpaths_updated {
238 my $self = shift;
239 }
240
241
242 =head2 swishpaths_deleted
243
244 Return array with deleted C<swishpath>s.
245
246 my $n = $i->swishpaths_deleted;
247
248 =cut
249
250 sub swishpaths_deleted {
251 my $self = shift;
252 }
253
254
255 =head2 slices
256
257 Return array with all slice names.
258
259 my @s = $i->slices;
260
261 =cut
262
263 sub slices {
264 my $self = shift;
265 }
266
267 =head1 Helper methods
268
269 This methods are used internally, but they might be useful.
270
271 =head2 in_slice
272
273 Takes path and return slice in which this path belongs.
274
275 my $s = $i->in_slice('path/to/document/in/index');
276
277 If there are C<slices> parametar to L<"open_index"> it will use
278 MD5 hash to spread documents across slices. That will produce random
279 distribution of your documents in slices, which might or might not be best
280 for your data. If you have to re-index large number of slices on each
281 run, think about creating your own C<slice> function and distributing
282 documents manually across slices.
283
284 Slice number must always be true value or various sanity checks will fail.
285
286 This function is C<Memoize>ed for performance reasons.
287
288 =cut
289
290 sub in_slice {
291 my $self = shift;
292
293 my $path = shift || confess "need path";
294
295 confess "need slice_name function" unless ref ($self->{'slice_name'});
296
297 if ($self->{'slices'}) {
298 # first, pass path through slice_name function
299 my $slice = &{$self->{'slice_name'}}($path);
300 # then calculate MD5 hash
301 my $hash = md5_hex($slice);
302 # take first 8 chars to produce number
303 # FIXME how random is this?
304 $hash = hex(substr($hash,0,8));
305
306 $slice = ($hash % $self->{'slices'}) + 1;
307 $self->_debug("hash: $hash / ",$self->{'slices'}," => $slice");
308 return $slice;
309 } else {
310 return &{$self->{'split'}}($path);
311 }
312 }
313
314 =head2 find_paths
315
316 Return array of C<swishpath>s for given C<swish-e> query.
317
318 my @p = $i->find_paths("headline=test*");
319
320 Useful for combining with L<"delete_documents"> to delete documents
321 which hasn't changed a while (so, expired).
322
323 =cut
324
325 sub find_paths {
326 my $self = shift;
327
328 }
329
330
331 =head2 make_config
332
333 Create C<swish-e> configuration file for given slice.
334
335 my $config_filename = $i->make_config('slice name');
336
337 It returns configuration filename. If no C<swish_config> was defined in
338 L<"open_index">, default swish-e configuration will be used. It will index all data for
339 searching, but none for properties.
340
341 If you want to see what is allready defined for swish-e in configuration
342 take a look at source code for C<DEFAULT_SWISH_CONF>.
343
344 It uses C<stdin> as C<IndexDir> to comunicate with C<swish-e>.
345
346 =cut
347
348 sub make_config {
349 my $self = shift;
350
351
352 my $index_file = $self->{'index'}."/";
353 $index_file .= shift || confess "need slice name";
354
355 my ($tmp_fh, $swish_config_filename) = mkstemp("/tmp/swishXXXXX");
356
357 # find cat on filesystem
358 my $cat = which('cat');
359
360 print $tmp_fh <<"DEFAULT_SWISH_CONF";
361 # swish-e config file
362
363 IndexDir stdin
364
365 # input file definition
366 DefaultContents XML*
367
368 # indexed metatags
369 MetaNames xml swishdocpath
370
371
372 #XMLClassAttributes type
373 UndefinedMetaTags auto
374 UndefinedXMLAttributes auto
375
376 IndexFile $index_file
377
378 # Croatian ISO-8859-2 characters to unaccented equivalents
379 TranslateCharacters ¹©ðÐèÈæƾ® ssddcccczz
380
381
382 # disable output
383 ParserWarnLevel 0
384 IndexReport 1
385
386 DEFAULT_SWISH_CONF
387
388 # add user parametars (like stored properties)
389 print $tmp_fh $self->{'swish_config'} if ($self->{'swish_config'});
390
391 close($tmp_fh);
392
393 return $swish_config_filename;
394 }
395
396 =head2 create_slice
397
398 On first run, starts C<swish-e>. On subsequent calls just return
399 it's handles using L<Memoize>.
400
401 my $s = create_slice('/path/to/document');
402
403 You shouldn't need to call C<create_slice> directly because it will be called
404 from L<"put_slice"> when needed.
405
406 =cut
407
408 sub create_slice {
409 my $self = shift;
410
411 my $path = shift || confess "create_slice need path!";
412
413 my $s = $self->in_slice($path) || confess "in_slice returned null";
414
415 return $s if (exists($self->{'slice'}->{$s}));
416
417 my $swish_config = $self->make_config($s);
418
419 my $swish = qq{| swish-e };
420 $swish .= qq{ -u } if (-f $self->{'index'}.'/'.$s);
421 $swish .= qq{ -S prog -c } . $swish_config;
422
423 $self->_debug("creating slice $s using $swish");
424
425 ## Build the harness, open all pipes, and launch the subprocesses
426 open(my $fh, $swish) || croak "can't open $swish: $!";
427
428 $self->{'slice'}->{$s}->{'h'} = $fh;
429
430 $self->slice_output($s);
431
432 return $s;
433 }
434
435 =head2 put_slice
436
437 Pass XML data to swish.
438
439 my $slice = $i->put_slice('/swish/path', '<xml>data</xml>');
440
441 Returns slice in which XML ended up.
442
443 =cut
444
445 sub put_slice {
446 my $self = shift;
447
448 my $path = shift || confess "need path";
449 my $xml = shift || confess "need xml";
450
451 $xml = $iso2utf->convert($xml) || carp "XML conversion error in $xml";
452
453 my $s = $self->create_slice($path) || confess "create_slice returned null";
454
455 confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
456 confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));
457
458 $self->slice_output($s);
459
460 use bytes; # as opposed to chars
461 my $fh = $self->{'slice'}->{$s}->{'h'} || confess "handle for slice $s undefined";
462 print { $fh } "Path-Name: $path\n".
463 "Content-Length: ".(length($xml)+1)."\n".
464 "Update-Mode: Index\n".
465 "Document-Type: XML\n\n$xml\n";
466
467 $self->slice_output($s);
468
469 $self->_debug("dumping in slice $s: $path");
470
471 $self->{'paths'}->{$path} = ADDED;
472
473 return $s;
474 }
475
476 =head2 slice_output
477
478 Prints to STDERR output and errors from C<swish-e>.
479
480 my $slice = $i->slice_output($s);
481
482 Normally, you don't need to call it.
483
484 B<This is dummy placeholder function for very old code that assumes this
485 module is using C<IPC::Run> which it isn't any more.>
486
487 =cut
488
489 sub slice_output {
490 my $self = shift;
491
492 my $s = shift || confess "slice_output needs slice";
493
494 confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
495
496 # FIXME
497
498 return $s;
499 }
500
501 =head2 close_slice
502
503 Close slice (terminates swish-e process for that slice).
504
505 my $i->close_slice($s);
506
507 Returns true if slice is closed, false otherwise.
508
509 =cut
510
511 sub close_slice {
512 my $self = shift;
513
514 my $s = shift || confess "close_slice needs slice";
515
516 confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
517 confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));
518
519 # pump rest of content (if any)
520 close $self->{'slice'}->{$s}->{'h'} || carp "can't close slice $s: $!";
521
522 $self->slice_output($s);
523
524 undef $self->{'slice'}->{$s}->{'h'};
525
526 delete($self->{'slice'}->{$s}) && return 1;
527 return 0;
528 }
529
530 =head2 to_xml
531
532 Convert (binary safe, I hope) your data into XML for C<swish-e>.
533 Data will not yet be recoded to UTF-8. L<"put_slice"> will do that.
534
535 my $xml = $i->to_xml({ foo => 'bar' });
536
537 This function is extracted from L<"add"> method so that you can L<Memoize> it.
538 If your data set has a lot of repeatable data, and memory is not a problem, you
539 can add C<memoize_to_xml> option to L<"open_index">.
540
541 =cut
542
543 my %escape = ('<'=>'&lt;', '>'=>'&gt;', '&'=>'&amp;', '"'=>'&quot;');
544 my $escape_re = join '|' => keys %escape;
545
546 sub to_xml {
547 my $self = shift;
548
549 my $data = shift || return;
550
551 my $xml = qq{<xml>};
552 foreach my $tag (keys %$data) {
553 my $content = $data->{$tag};
554 next if (! $content || $content eq '');
555 # save [cr/]lf before conversion to XML
556 # $content =~ s/\n\r/##lf##/gs;
557 # $content =~ s/\n/##lf##/gs;
558 $content =~ s/($escape_re)/$escape{$1}/gs;
559 $xml .= "<$tag><![CDATA[".$content."]]></$tag>";
560 }
561 $xml .= qq{</xml>};
562 }
563
564 sub _debug {
565 my $self = shift;
566 print STDERR "## ",@_,"\n" if ($self->{'debug'});
567 return;
568 }
569
570 1;
571 __END__
572
573
574 =head1 Searching
575
576 Searching is still conducted using L<SWISH::API>, but you have to glob
577 index names.
578
579 use SWISH::API;
580
581 my $swish = SWISH::API->new( glob('index.swish-e/*') );
582
583 You can also alternativly create merged index (using C<merge> option) and
584 not change your source code at all.
585
586 That would also benefit performance, but it increases indexing time
587 because merged indexes must be re-created on each indexing run.
588
589 =head1 EXPORT
590
591 Nothing by default.
592
593 =head1 EXAMPLES
594
595 Test script for this module uses all parts of API. It's also nice example
596 how to use C<SWISH::Split>.
597
598 =head1 SEE ALSO
599
600 L<SWISH::API>,
601 L<http://www.swish-e.org/>
602
603 =head1 AUTHOR
604
605 Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>
606
607 =head1 COPYRIGHT AND LICENSE
608
609 Copyright (C) 2004 by Dobrica Pavlinusic
610
611 This library is free software; you can redistribute it and/or modify
612 it under the same terms as Perl itself, either Perl version 5.8.4 or,
613 at your option, any later version of Perl 5 you may have available.
614
615
616 =cut

  ViewVC Help
Powered by ViewVC 1.1.26