/[SWISH-Split]/trunk/Split.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/Split.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 13 - (show annotations)
Fri Apr 29 23:25:02 2005 UTC (18 years, 11 months ago) by dpavlin
File size: 13017 byte(s)
added warning about unimplemented delete

1 package SWISH::Split;
2
3 use 5.008;
4 use strict;
5 use warnings;
6
7 our $VERSION = '0.03';
8
9 use SWISH::API;
10 use Text::Iconv;
11 use File::Temp qw/ :mktemp /;
12 use Carp;
13 use Digest::MD5 qw(md5_hex);
14 use Memoize;
15 use File::Which;
16
17 use Data::Dumper;
18
19 use constant {
20 ADDED => 1,
21 DELETED => 2,
22 };
23
24 =head1 NAME
25
26 SWISH::Split - Perl interface to split index variant of Swish-e
27
28 =head1 SYNOPSIS
29
30 use SWISH::Split;
31
32
33 =head1 DESCRIPTION
34
35 This is alternative interface for indexing data with swish-e. It's designed
36 to split indexes over multiple files (slices) to allow updates of records in index
37 by reindexing just changed parts (slice).
38
39 Data is stored in index using intrface which is somewhat similar to
40 L<Plucene::Simple>. This could make your migration (or supporting two index
41 engines) easier.
42
43 In the background, it will fork swish-e binaries (one for each index slice)
44 and produce UTF-8 encoded XML files for it. So, if your input charset isn't
45 C<ISO-8859-1> you will have to specify it.
46
47 =head1 Methods used for indexing
48
49 =head2 open_index
50
51 Create new object for index.
52
53 my $i = SWISH::Split->open_index({
54 index => '/path/to/index',
55 slice_name => \&slice_on_path,
56 slices => 30,
57 merge => 0,
58 codepage => 'ISO-8859-2',
59 swish_config => qq{
60 PropertyNames from date
61 PropertyNamesDate date
62 },
63 memoize_to_xml => 0,
64 );
65
66 # split index on first component of path
67 sub slice_on_path {
68 return shift split(/\//,$_[0]);
69 }
70
71 Options to C<open_index> are following:
72
73 =over 5
74
75 =item C<index>
76
77 path to (existing) directory in which index slices will be created.
78
79 =item C<slice_name>
80
81 coderef to function which provide slicing from path.
82
83 =item C<slices>
84
85 maximum number of index slices. See L<"in_slice"> for
86 more explanation.
87
88 =item C<merge>
89
90 (planned) option to merge indexes into one at end.
91
92 =item C<codepage>
93
94 data codepage (needed for conversion to UTF-8).
95 By default, it's C<ISO-8859-1>.
96
97 =item C<swish_config>
98
99 additional parametars which will be inserted into
100 C<swish-e> configuration file. See C<swish-config>.
101
102 =item C<memoize_to_xml>
103
104 speed up repeatable data, see L<"to_xml">.
105
106 =back
107
108 =cut
109
110 my $iso2utf = Text::Iconv->new('ISO-8859-1','UTF-8');
111
112 sub open_index {
113 my $class = shift;
114 my $self = {@_};
115 bless($self, $class);
116
117 croak "need slice_name coderef" unless ref $self->{'slice_name'};
118 croak "need slices" unless $self->{'slices'};
119
120 croak "need index" unless $self->{'index'};
121 croak "index '",$self->{'index'},"' doesn't exist" unless -e $self->{'index'};
122 croak "index '",$self->{'index'},"' is not directory" unless -d $self->{'index'};
123
124 $iso2utf = Text::Iconv->new($self->{'codepage'},'UTF-8') if ($self->{'codepage'});
125
126 # speedup
127 memoize('in_slice');
128 memoize('to_xml') if ($self->{'memoize_to_xml'});
129
130 $self ? return $self : return undef;
131
132 }
133
134 =head2 add
135
136 Add document to index.
137
138 $i->add($swishpath, {
139 headline => 'foobar result',
140 property => 'data',
141 })
142
143 =cut
144
145 sub add {
146 my $self = shift;
147
148 my $swishpath = shift || return;
149 my $data = shift || return;
150
151 my $slice = $self->put_slice($swishpath, $self->to_xml($data));
152
153 return $slice;
154 }
155
156 =head2 delete
157
158 Delete documents from index.
159
160 $i->delete(@swishpath);
161
162 B<This function is not implemented.>
163
164 =cut
165
166 sub delete {
167 my $self = shift;
168
169 my @paths = @_ || return;
170
171 foreach my $path (@paths) {
172 $self->{'paths'}->{$path} = DELETED;
173 }
174
175 die "delete is not yet implemented";
176
177 return 42;
178 }
179
180
181 =head2 done
182
183 Finish indexing and close index file(s).
184
185 $i->done;
186
187 This is most time-consuming operation. When it's called, it will re-index
188 all entries which haven't changed in all slices.
189
190 Returns number of slices updated.
191
192 This method should really be called close or finish, but both of those are
193 allready used.
194
195 =cut
196
197 sub done {
198 my $self = shift;
199
200 my $ret = 0;
201
202 foreach my $s (keys %{$self->{'slice'}}) {
203 $self->_debug("closing slice $s");
204 $ret += $self->close_slice($s);
205 }
206
207 return $ret;
208 }
209
210
211
212 =head1 Reporting methods
213
214 This methods return statistics about your index.
215
216 =head2 swishpaths
217
218 Return array of C<swishpath>s in index.
219
220 my @p = $i->swishpaths;
221
222 =cut
223
224 sub swishpaths {
225 my $self = shift;
226
227 my $s = shift || return;
228 return if (! exists($self->{'slice'}->{'s'}));
229
230 return keys %{$self->{'slice'}->{'s'}};
231 }
232
233 =head2 swishpaths_updated
234
235 Return array with updated C<swishpath>s.
236
237 my @d = $i->swishpaths_updated;
238
239 =cut
240
241 sub swishpaths_updated {
242 my $self = shift;
243 }
244
245
246 =head2 swishpaths_deleted
247
248 Return array with deleted C<swishpath>s.
249
250 my $n = $i->swishpaths_deleted;
251
252 =cut
253
254 sub swishpaths_deleted {
255 my $self = shift;
256 }
257
258
259 =head2 slices
260
261 Return array with all slice names.
262
263 my @s = $i->slices;
264
265 =cut
266
267 sub slices {
268 my $self = shift;
269 }
270
271 =head1 Helper methods
272
273 This methods are used internally, but they might be useful.
274
275 =head2 in_slice
276
277 Takes path and return slice in which this path belongs.
278
279 my $s = $i->in_slice('path/to/document/in/index');
280
281 If there are C<slices> parametar to L<"open_index"> it will use
282 MD5 hash to spread documents across slices. That will produce random
283 distribution of your documents in slices, which might or might not be best
284 for your data. If you have to re-index large number of slices on each
285 run, think about creating your own C<slice> function and distributing
286 documents manually across slices.
287
288 Slice number must always be true value or various sanity checks will fail.
289
290 This function is C<Memoize>ed for performance reasons.
291
292 =cut
293
294 sub in_slice {
295 my $self = shift;
296
297 my $path = shift || confess "need path";
298
299 confess "need slice_name function" unless ref ($self->{'slice_name'});
300
301 if ($self->{'slices'}) {
302 # first, pass path through slice_name function
303 my $slice = &{$self->{'slice_name'}}($path);
304 # then calculate MD5 hash
305 my $hash = md5_hex($slice);
306 # take first 8 chars to produce number
307 # FIXME how random is this?
308 $hash = hex(substr($hash,0,8));
309
310 $slice = ($hash % $self->{'slices'}) + 1;
311 $self->_debug("hash: $hash / ",$self->{'slices'}," => $slice");
312 return $slice;
313 } else {
314 return &{$self->{'split'}}($path);
315 }
316 }
317
318 =head2 find_paths
319
320 Return array of C<swishpath>s for given C<swish-e> query.
321
322 my @p = $i->find_paths("headline=test*");
323
324 Useful for combining with L<"delete_documents"> to delete documents
325 which hasn't changed a while (so, expired).
326
327 =cut
328
329 sub find_paths {
330 my $self = shift;
331
332 }
333
334
335 =head2 make_config
336
337 Create C<swish-e> configuration file for given slice.
338
339 my $config_filename = $i->make_config('slice name');
340
341 It returns configuration filename. If no C<swish_config> was defined in
342 L<"open_index">, default swish-e configuration will be used. It will index all data for
343 searching, but none for properties.
344
345 If you want to see what is allready defined for swish-e in configuration
346 take a look at source code for C<DEFAULT_SWISH_CONF>.
347
348 It uses C<stdin> as C<IndexDir> to comunicate with C<swish-e>.
349
350 =cut
351
352 sub make_config {
353 my $self = shift;
354
355
356 my $index_file = $self->{'index'}."/";
357 $index_file .= shift || confess "need slice name";
358
359 my ($tmp_fh, $swish_config_filename) = mkstemp("/tmp/swishXXXXX");
360
361 # find cat on filesystem
362 my $cat = which('cat');
363
364 print $tmp_fh <<"DEFAULT_SWISH_CONF";
365 # swish-e config file
366
367 IndexDir stdin
368
369 # input file definition
370 DefaultContents XML*
371
372 # indexed metatags
373 MetaNames xml swishdocpath
374
375
376 #XMLClassAttributes type
377 UndefinedMetaTags auto
378 UndefinedXMLAttributes auto
379
380 IndexFile $index_file
381
382 # Croatian ISO-8859-2 characters to unaccented equivalents
383 TranslateCharacters ¹©ðÐèÈæƾ® ssddcccczz
384
385
386 # disable output
387 ParserWarnLevel 0
388 IndexReport 1
389
390 DEFAULT_SWISH_CONF
391
392 # add user parametars (like stored properties)
393 print $tmp_fh $self->{'swish_config'} if ($self->{'swish_config'});
394
395 close($tmp_fh);
396
397 return $swish_config_filename;
398 }
399
400 =head2 create_slice
401
402 On first run, starts C<swish-e>. On subsequent calls just return
403 it's handles using C<Memoize>.
404
405 my $s = create_slice('/path/to/document');
406
407 You shouldn't need to call C<create_slice> directly because it will be called
408 from L<"put_slice"> when needed.
409
410 =cut
411
412 sub create_slice {
413 my $self = shift;
414
415 my $path = shift || confess "create_slice need path!";
416
417 my $s = $self->in_slice($path) || confess "in_slice returned null";
418
419 return $s if (exists($self->{'slice'}->{$s}));
420
421 my $swish_config = $self->make_config($s);
422
423 my $swish = qq{| swish-e };
424 if (-f $self->{'index'}.'/'.$s) {
425 $swish .= qq{ -u };
426 $self->{'slice'}->{$s}->{'update_mode'}++;
427 }
428 $swish .= qq{ -S prog -c } . $swish_config;
429
430 $self->_debug("creating slice $s using $swish");
431
432 ## Build the harness, open all pipes, and launch the subprocesses
433 open(my $fh, $swish) || croak "can't open $swish: $!";
434
435 $self->{'slice'}->{$s}->{'h'} = $fh;
436
437 $self->slice_output($s);
438
439 return $s;
440 }
441
442 =head2 put_slice
443
444 Pass XML data to swish.
445
446 my $slice = $i->put_slice('/swish/path', '<xml>data</xml>');
447
448 Returns slice in which XML ended up.
449
450 =cut
451
452 sub put_slice {
453 my $self = shift;
454
455 my $path = shift || confess "need path";
456 my $xml = shift || confess "need xml";
457
458 $xml = $iso2utf->convert($xml) || carp "XML conversion error in $xml";
459
460 my $s = $self->create_slice($path) || confess "create_slice returned null";
461
462 confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
463 confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));
464
465 $self->slice_output($s);
466
467 use bytes; # as opposed to chars
468 my $fh = $self->{'slice'}->{$s}->{'h'} || confess "handle for slice $s undefined";
469
470 my $update_header = "Update-Mode: Index\n";
471 $update_header = '' unless ($self->{'slice'}->{$s}->{'update_mode'});
472
473 print { $fh } "Path-Name: $path\n".
474 "Content-Length: ".(length($xml)+1)."\n" . $update_header .
475 "Document-Type: XML\n\n$xml\n";
476
477 $self->slice_output($s);
478
479 $self->_debug("dumping in slice $s: $path");
480
481 $self->{'paths'}->{$path} = ADDED;
482
483 return $s;
484 }
485
486 =head2 slice_output
487
488 Prints to STDERR output and errors from C<swish-e>.
489
490 my $slice = $i->slice_output($s);
491
492 Normally, you don't need to call it.
493
494 B<This is dummy placeholder function for very old code that assumes this
495 module is using C<IPC::Run> which it isn't any more.>
496
497 =cut
498
499 sub slice_output {
500 my $self = shift;
501
502 my $s = shift || confess "slice_output needs slice";
503
504 confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
505
506 # FIXME
507
508 return $s;
509 }
510
511 =head2 close_slice
512
513 Close slice (terminates swish-e process for that slice).
514
515 my $i->close_slice($s);
516
517 Returns true if slice is closed, false otherwise.
518
519 =cut
520
521 sub close_slice {
522 my $self = shift;
523
524 my $s = shift || confess "close_slice needs slice";
525
526 confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
527 confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));
528
529 # pump rest of content (if any)
530 close $self->{'slice'}->{$s}->{'h'} || carp "can't close slice $s: $!";
531
532 $self->slice_output($s);
533
534 undef $self->{'slice'}->{$s}->{'h'};
535
536 delete($self->{'slice'}->{$s}) && return 1;
537 return 0;
538 }
539
540 =head2 to_xml
541
542 Convert (binary safe, I hope) your data into XML for C<swish-e>.
543 Data will not yet be recoded to UTF-8. L<"put_slice"> will do that.
544
545 my $xml = $i->to_xml({ foo => 'bar' });
546
547 This function is extracted from L<"add"> method so that you can C<Memoize> it.
548 If your data set has a lot of repeatable data, and memory is not a problem, you
549 can add C<memoize_to_xml> option to L<"open_index">.
550
551 =cut
552
553 my %escape = ('<'=>'&lt;', '>'=>'&gt;', '&'=>'&amp;', '"'=>'&quot;');
554 my $escape_re = join '|' => keys %escape;
555
556 sub to_xml {
557 my $self = shift;
558
559 my $data = shift || return;
560
561 my $xml = qq{<xml>};
562 foreach my $tag (keys %$data) {
563 my $content = $data->{$tag};
564 next if (! $content || $content eq '');
565 # save [cr/]lf before conversion to XML
566 # $content =~ s/\n\r/##lf##/gs;
567 # $content =~ s/\n/##lf##/gs;
568 $content =~ s/($escape_re)/$escape{$1}/gs;
569 $xml .= "<$tag><![CDATA[".$content."]]></$tag>";
570 }
571 $xml .= qq{</xml>};
572 }
573
574 sub _debug {
575 my $self = shift;
576 print STDERR "## ",@_,"\n" if ($self->{'debug'});
577 return;
578 }
579
580 1;
581 __END__
582
583
584 =head1 Searching
585
586 Searching is still conducted using L<SWISH::API>, but you have to glob
587 index names.
588
589 use SWISH::API;
590
591 my $swish = SWISH::API->new( glob('index.swish-e/*') );
592
593 You can also alternativly create merged index (using C<merge> option) and
594 not change your source code at all.
595
596 That would also benefit performance, but it increases indexing time
597 because merged indexes must be re-created on each indexing run.
598
599 =head1 EXPORT
600
601 Nothing by default.
602
603 =head1 EXAMPLES
604
605 Test script for this module uses all parts of API. It's also nice example
606 how to use C<SWISH::Split>.
607
608 =head1 SEE ALSO
609
610 L<SWISH::API>,
611 L<http://www.swish-e.org/>
612
613 =head1 AUTHOR
614
615 Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>
616
617 =head1 COPYRIGHT AND LICENSE
618
619 Copyright (C) 2004 by Dobrica Pavlinusic
620
621 This library is free software; you can redistribute it and/or modify
622 it under the same terms as Perl itself, either Perl version 5.8.4 or,
623 at your option, any later version of Perl 5 you may have available.
624
625
626 =cut

  ViewVC Help
Powered by ViewVC 1.1.26